package eu.modernmt.processing.tokenizer.lucene.analyzers; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; import org.apache.lucene.analysis.ar.ArabicStemFilter; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import java.io.Reader; /** * Created by davide on 12/11/15. */ public class ArabicAnalyzer extends Analyzer { /** * Creates * {@link TokenStreamComponents} * used to tokenize all the text in the provided {@link Reader}. * * @return {@link TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link StopFilter}, * {@link ArabicNormalizationFilter}, {@link SetKeywordMarkerFilter} * if a stem exclusion set is provided and {@link ArabicStemFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(reader); TokenStream result = new ArabicNormalizationFilter(source); return new TokenStreamComponents(source, result); } }