package com.hankcs.lucene;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.TraditionalChineseTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

/**
 * Lucene {@link TokenizerFactory} that builds {@link HanLPTokenizer} instances backed by a HanLP
 * {@link Segment}.
 *
 * <p>All options are read once, in the constructor, from the argument map handed to the factory.
 * Besides populating the per-factory fields below, the constructor also assigns static fields on
 * {@code HanLP.Config}, so those settings are not scoped to a single factory instance.
 */
public class HanLPTokenizerFactory extends TokenizerFactory {

    // Segmenter switches; each mirrors the config key of the same name.
    private boolean enableIndexMode;
    private boolean enablePorterStemming;
    private boolean enableNumberQuantifierRecognize;
    private boolean enableCustomDictionary;
    private boolean enableTranslatedNameRecognize;
    private boolean enableJapaneseNameRecognize;
    private boolean enableOrganizationRecognize;
    private boolean enablePlaceRecognize;
    private boolean enableNameRecognize;
    private boolean enableTraditionalChineseMode;

    // Stop words loaded from "stopWordDictionaryPath"; stays null when that key is not configured.
    private Set<String> stopWordDictionary;

    /**
     * Initializes the factory from its configuration.
     *
     * <p>Recognized keys and the defaults used when a key is absent:
     * <ul>
     *   <li>{@code enableIndexMode} - {@code true}</li>
     *   <li>{@code enablePorterStemming} - {@code false}</li>
     *   <li>{@code enableNumberQuantifierRecognize} - {@code false}</li>
     *   <li>{@code enableCustomDictionary} - {@code true}</li>
     *   <li>{@code enableTranslatedNameRecognize} - {@code false}</li>
     *   <li>{@code enableJapaneseNameRecognize} - {@code false}</li>
     *   <li>{@code enableOrganizationRecognize} - {@code false}</li>
     *   <li>{@code enableNameRecognize} - {@code false}</li>
     *   <li>{@code enablePlaceRecognize} - {@code false}</li>
     *   <li>{@code enableTraditionalChineseMode} - {@code false}</li>
     *   <li>{@code enableNormalization} - the current value of {@code HanLP.Config.Normalization}</li>
     *   <li>{@code customDictionaryPath} - when present, replaces
     *       {@code HanLP.Config.CustomDictionaryPath}</li>
     *   <li>{@code stopWordDictionaryPath} - when present, its lines become the stop word set</li>
     *   <li>{@code enableDebug} - {@code false}; when {@code true},
     *       {@code HanLP.Config.enableDebug()} is invoked</li>
     * </ul>
     *
     * @param args map holding the configuration entries taken from the XML configuration
     */
    public HanLPTokenizerFactory(Map<String, String> args) {
        super(args);

        this.enableIndexMode = getBoolean(args, "enableIndexMode", true);
        this.enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
        this.enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
        this.enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
        this.enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
        this.enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
        this.enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
        this.enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
        this.enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
        this.enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);

        // Static HanLP setting: the current value doubles as the default, so an absent key
        // leaves it unchanged.
        HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);

        Set<String> dictionaryPaths = getSet(args, "customDictionaryPath");
        if (dictionaryPaths != null) {
            HanLP.Config.CustomDictionaryPath = dictionaryPaths.toArray(new String[0]);
        }

        String stopWordsPath = get(args, "stopWordDictionaryPath");
        if (stopWordsPath != null) {
            Set<String> stopWords = new TreeSet<>();
            stopWords.addAll(IOUtil.readLineListWithLessMemory(stopWordsPath));
            this.stopWordDictionary = stopWords;
        }

        boolean debugRequested = getBoolean(args, "enableDebug", false);
        if (debugRequested) {
            HanLP.Config.enableDebug();
        }
    }

    /**
     * Creates a tokenizer around a freshly configured HanLP segmenter.
     *
     * <p>The segmenter always has offsets enabled; every other switch comes from the factory
     * configuration. In traditional Chinese mode the configured segmenter is wrapped so that
     * segmentation goes through {@link TraditionalChineseTokenizer}.
     *
     * @param factory attribute factory supplied by Lucene; this implementation does not use it
     * @return a new {@link HanLPTokenizer} built from the segmenter, the stop word set
     *         (possibly {@code null}) and the Porter stemming flag
     */
    @Override
    public Tokenizer create(AttributeFactory factory) {
        Segment configured = HanLP.newSegment()
                .enableOffset(true)
                .enableIndexMode(enableIndexMode)
                .enableNameRecognize(enableNameRecognize)
                .enableNumberQuantifierRecognize(enableNumberQuantifierRecognize)
                .enableCustomDictionary(enableCustomDictionary)
                .enableTranslatedNameRecognize(enableTranslatedNameRecognize)
                .enableJapaneseNameRecognize(enableJapaneseNameRecognize)
                .enableOrganizationRecognize(enableOrganizationRecognize)
                .enablePlaceRecognize(enablePlaceRecognize);

        Segment effective = enableTraditionalChineseMode
                ? wrapForTraditionalChinese(configured)
                : configured;

        return new HanLPTokenizer(effective, stopWordDictionary, enablePorterStemming);
    }

    /**
     * Installs {@code base} as the segmenter used by {@link TraditionalChineseTokenizer} and
     * returns a segmenter that delegates each sentence to that tokenizer.
     *
     * <p>Index mode is switched off on {@code base} before it is installed, regardless of the
     * {@code enableIndexMode} setting.
     *
     * <p>NOTE(review): {@code TraditionalChineseTokenizer.SEGMENT} is a static field that is
     * overwritten on every call, so factories with different settings presumably affect one
     * another - confirm whether that matters for the deployment.
     *
     * @param base the configured segmenter to hand to {@link TraditionalChineseTokenizer}
     * @return a segmenter whose sentence segmentation calls {@link TraditionalChineseTokenizer}
     */
    private Segment wrapForTraditionalChinese(Segment base) {
        base.enableIndexMode(false);
        TraditionalChineseTokenizer.SEGMENT = base;
        return new Segment() {
            @Override
            protected List<Term> segSentence(char[] sentence) {
                return TraditionalChineseTokenizer.segment(new String(sentence));
            }
        };
    }
}