java lucene tokenize icu4j

Tokenizar una oración en tailandés con ICUTokenizer en Java



lucene icu4j (1)

Finalmente descubrí cómo usar ICU4J en un programa Java:

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * Demo program: runs Lucene's {@code ICUTokenizer} over a fixed sample string
 * containing Thai, English and Lao text and prints every token to standard output.
 */
public class icuEstes {

    /**
     * Tokenizes the sample string and prints, for each token, the tokenizer's own
     * string representation followed by the term text.
     *
     * @param args ignored
     * @throws IOException if the tokenizer fails while reading the input
     */
    public static void main(String[] args) throws IOException {
        Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ");

        // try-with-resources guarantees close() even when tokenization throws.
        try (ICUTokenizer icut = new ICUTokenizer()) {
            icut.setReader(reader);
            // Look the term attribute up once instead of on every token.
            CharTermAttribute term = icut.addAttribute(CharTermAttribute.class);

            icut.reset();
            while (icut.incrementToken()) {
                System.out.println(icut.toString());
                System.out.println(term);
            }
            // Finishes the reset / incrementToken / end / close sequence of a TokenStream.
            icut.end();
        }
    }
}

Estoy probando el siguiente código para obtener todos los tokens de una oración en tailandés, pero lanza una excepción. ¿Alguien puede indicarme cómo tokenizar texto en tailandés en Java?

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * Demo program: tokenizes a fixed Thai sentence with {@code ICUTokenizer}, passes the
 * tokens through {@code ICUNormalizer2Filter} and prints each resulting term.
 */
public class Tokenizer {

    /**
     * Prints every normalized token of the sample sentence, each followed by
     * {@code "-----"}, one per line.
     *
     * @param args ignored
     * @throws IOException if the token stream fails while reading the input
     */
    public static void main(String[] args) throws IOException {
        // NOTE(review): the Reader-taking constructor is only available in older Lucene
        // releases; newer ones use the no-arg constructor plus setReader(...). Confirm
        // against the Lucene version on the classpath.
        ICUTokenizer tokenizer = new ICUTokenizer(new StringReader("การที่ได้ต้องแสดงว่างานดี"));

        // The filter is itself the TokenStream to consume, so no wrapper object is needed.
        // Closing the filter is expected to close the wrapped tokenizer as well.
        try (TokenStream ts = new ICUNormalizer2Filter(tokenizer)) {
            CharTermAttribute cattr = ts.addAttribute(CharTermAttribute.class);

            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(cattr.toString() + "-----");
            }
            // Finishes the reset / incrementToken / end / close sequence of a TokenStream.
            ts.end();
        }
    }
}

La excepción es la siguiente

Exception in thread "main" java.lang.ExceptionInInitializerError
    at org.apache.lucene.analysis.icu.segmentation.ICUTokenizer.<init>(ICUTokenizer.java:72)
    at com.tokenizer.tt.main(tt.java:22)
Caused by: java.lang.RuntimeException: java.io.IOException: ICU data file error: Not an ICU data file
    at org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig.readBreakIterator(DefaultICUTokenizerConfig.java:128)
    at org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig.<clinit>(DefaultICUTokenizerConfig.java:66)
    ... 2 more
Caused by: java.io.IOException: ICU data file error: Not an ICU data file
    at com.ibm.icu.impl.ICUBinary.readHeader(ICUBinary.java:577)
    at com.ibm.icu.text.RBBIDataWrapper.get(RBBIDataWrapper.java:173)
    at com.ibm.icu.text.RuleBasedBreakIterator.getInstanceFromCompiledRules(RuleBasedBreakIterator.java:71)
    at org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig.readBreakIterator(DefaultICUTokenizerConfig.java:123)
    ... 3 more