package eu.modernmt.decoder.neural.memory.lucene.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;

import java.io.Reader;

/**
 * Analyzer that splits text on whitespace, passes the tokens through
 * {@link PunctuationFilter} and, optionally, groups them into fixed-size
 * shingles (word n-grams) via {@link ShingleFilter}.
 *
 * <p>Shingling is applied only when the configured size is at least
 * {@value #MIN_SHINGLE_SIZE}; for smaller values the filtered tokens are
 * emitted unchanged.
 */
public class ContentAnalyzer extends Analyzer {

    /**
     * Smallest size accepted by {@link ShingleFilter}: its constructor rejects
     * min/max shingle sizes below 2 with an {@link IllegalArgumentException}.
     */
    private static final int MIN_SHINGLE_SIZE = 2;

    private final int shingleSize;
    private final boolean outputUnigrams;

    /**
     * Creates an analyzer using the global reuse strategy inherited from {@link Analyzer}.
     *
     * @param shingleSize    exact number of tokens per shingle (used as both the minimum and
     *                       the maximum shingle size); values below {@value #MIN_SHINGLE_SIZE}
     *                       disable shingling altogether
     * @param outputUnigrams forwarded to {@link ShingleFilter#setOutputUnigrams(boolean)};
     *                       ignored when shingling is disabled
     */
    public ContentAnalyzer(int shingleSize, boolean outputUnigrams) {
        super(GLOBAL_REUSE_STRATEGY);
        this.shingleSize = shingleSize;
        this.outputUnigrams = outputUnigrams;
    }

    /**
     * Builds the analysis chain: whitespace tokenizer, then {@link PunctuationFilter},
     * then (when enabled) a {@link ShingleFilter}.
     *
     * @param fieldName name of the field being analyzed; not used by this analyzer
     * @param reader    source of the text to tokenize
     * @return the tokenizer together with the last filter of the chain
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new WhitespaceTokenizer(reader);

        // NOTE(review): PunctuationFilter is declared elsewhere in this package;
        // presumably it handles punctuation tokens - confirm against its source.
        TokenStream filter = new PunctuationFilter(tokenizer);

        // The previous guard (shingleSize > 0) let a size of 1 reach the ShingleFilter
        // constructor, which throws for sizes below 2. A 1-token shingle is just the
        // unigram stream, so in that case the filtered tokens are returned as they are.
        if (shingleSize >= MIN_SHINGLE_SIZE) {
            ShingleFilter shingleFilter = new ShingleFilter(filter, shingleSize, shingleSize);
            shingleFilter.setOutputUnigrams(outputUnigrams);
            filter = shingleFilter;
        }

        return new TokenStreamComponents(tokenizer, filter);
    }
}