package io.github.ihongs.dh.search.stoker; import java.io.IOException; import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.custom.CustomAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; /** * 分析器测试 * 已经逐步针对中文特点构建自己的分析器 * @author Hongs */ public class DemoTest { public static void main(String[] args) throws IOException { Analyzer az = CustomAnalyzer.builder() //.withTokenizer("Standard") .withTokenizer("Name") .addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20") //.addTokenFilter("ICUTransform", "id", "Han-Latin;NFD;[[:NonspacingMark:][:Space:]] Remove") //.addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20") .build(); StringReader sr = new StringReader(args[0]); TokenStream ts = az.tokenStream ("" , sr); OffsetAttribute oa = ts.addAttribute (OffsetAttribute.class); CharTermAttribute ta = ts.addAttribute (CharTermAttribute.class); try { ts.reset(); // Resets this stream to the beginning. (Required) while (ts.incrementToken()) { System.out.println(ta.toString() + "|" + ta.length() + "[" + oa.startOffset() + "," + oa.endOffset() + "]"); } ts.end( ); // Perform end-of-stream operations, e.g. set the final offset. } finally { ts.close(); // Release resources associated with this stream. } } }