org.apache.lucene.analysis.core.LowerCaseFilter Java Examples

The following examples show how to use org.apache.lucene.analysis.core.LowerCaseFilter. You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example #1
Source File: TestProtectedTermFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testBasic() throws IOException {

    // Four tokens with explicit position increments and start/end offsets.
    Token[] input = {
        new Token("Alice", 1, 0, 5),
        new Token("Bob", 1, 6, 9),
        new Token("Clara", 1, 10, 15),
        new Token("David", 1, 16, 21)
    };
    CannedTokenStream source = new CannedTokenStream(input);

    // Terms in this set (matched case-insensitively) bypass the wrapped filter.
    CharArraySet protectedTerms = new CharArraySet(5, true);
    protectedTerms.add("bob");

    // "Bob" is protected and keeps its case; everything else is lower-cased.
    TokenStream ts = new ProtectedTermFilter(protectedTerms, source, LowerCaseFilter::new);
    assertTokenStreamContents(ts, new String[]{ "alice", "Bob", "clara", "david" });

  }
 
Example #2
Source File: LowerCaseTokenFilterFactory.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Override
public TokenStream create(TokenStream tokenStream) {
    // No language configured: use the default Unicode lower-case filter.
    if (lang == null) {
        return new LowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("greek")) {
        // Greek has special final-sigma handling.
        return new GreekLowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("irish")) {
        return new IrishLowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("turkish")) {
        // Turkish distinguishes dotted/dotless i.
        return new TurkishLowerCaseFilter(tokenStream);
    } else {
        // Fixed grammar in the error message ("not support" -> "not supported").
        throw new IllegalArgumentException("language [" + lang + "] not supported for lower case");
    }
}
 
Example #3
Source File: TestBrazilianAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testWithKeywordAttribute() throws IOException {
  // Terms in this set are flagged as keywords and skipped by the stemmer.
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("Brasília");

  Tokenizer tokenizer = new LetterTokenizer();
  tokenizer.setReader(new StringReader("Brasília Brasilia"));

  // lower-case -> mark keywords -> stem; the keyword survives unstemmed.
  TokenStream lowercased = new LowerCaseFilter(tokenizer);
  TokenStream marked = new SetKeywordMarkerFilter(lowercased, keywords);
  BrazilianStemFilter filter = new BrazilianStemFilter(marked);

  assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
 
Example #4
Source File: PatternAnalyzerImpl.java    From database with GNU General Public License v2.0 5 votes vote down vote up
@Override
protected TokenStreamComponents createComponents(final String field) {
	// Tokenize on the configured pattern (-1 selects the default grouping),
	// then lower-case every token.
	final Tokenizer source = new PatternTokenizer(pattern, -1);
	return new TokenStreamComponents(source, new LowerCaseFilter(source));
}
 
Example #5
Source File: SynonymLoader.java    From elasticsearch-analysis-synonym with Apache License 2.0 5 votes vote down vote up
protected static Analyzer getAnalyzer(final boolean ignoreCase) {
    // Keyword tokenizer (whole input as one token), optionally lower-cased
    // when synonym matching should be case-insensitive.
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer source = new KeywordTokenizer();
            if (ignoreCase) {
                return new TokenStreamComponents(source, new LowerCaseFilter(source));
            }
            return new TokenStreamComponents(source, source);
        }
    };
}
 
Example #6
Source File: AnnotationAnalyzer.java    From elasticsearch-analysis-annotation with Apache License 2.0 5 votes vote down vote up
@Override
protected TokenStreamComponents createComponents(String fieldName,
		Reader reader) {
	// Split on whitespace, lower-case, then process inline annotations.
	Tokenizer source = new WhitespaceTokenizer(version, reader);
	TokenStream chain = new InlineAnnotationFilter(new LowerCaseFilter(version, source));
	return new TokenStreamComponents(source, chain);
}
 
Example #7
Source File: NGramAnalyzer.java    From onedev with MIT License 4 votes vote down vote up
@Override
protected TokenStreamComponents createComponents(String fieldName) {
	// Emit lower-cased n-grams of length minGram..maxGram.
	Tokenizer source = new NGramTokenizer(minGram, maxGram);
	return new TokenStreamComponents(source, new LowerCaseFilter(source));
}
 
Example #8
Source File: DynamicSynonymTokenFilterFactory.java    From elasticsearch-analysis-dynamic-synonym with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a dynamic-synonym token filter factory: reads the filter settings,
 * resolves the tokenizer used to parse synonym rules, loads the synonym map
 * from a local or remote file, and schedules a periodic reload.
 *
 * @throws IOException if the synonym file cannot be read
 * @throws IllegalArgumentException if {@code synonyms_path} is missing or the
 *         configured tokenizer cannot be found
 */
public DynamicSynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
                                        String name, Settings settings) throws IOException {

    // Load configuration.
    super(indexSettings, name, settings);
    this.indexName = indexSettings.getIndex().getName();
    this.interval = settings.getAsInt("interval", 60);                 // reload period in seconds
    this.ignoreCase = settings.getAsBoolean("ignore_case", false);
    this.expand = settings.getAsBoolean("expand", true);
    this.format = settings.get("format", "");
    this.location = settings.get("synonyms_path");

    logger.info("indexName:{} synonyms_path:{} interval:{} ignore_case:{} expand:{} format:{}",
            indexName, location, interval, ignoreCase, expand, format);

    // Validate required settings.
    if (this.location == null) {
        throw new IllegalArgumentException(
                "dynamic synonym requires `synonyms_path` to be configured");
    }

    // Resolve the tokenizer used to parse synonym rule text (default: whitespace).
    String tokenizerName = settings.get("tokenizer", "whitespace");
    AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
            analysisRegistry.getTokenizerProvider(tokenizerName, indexSettings);
    if (tokenizerFactoryFactory == null) {
        throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
    }
    final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.get(indexSettings, env, tokenizerName,
            AnalysisRegistry.getSettingsFromIndexSettings(indexSettings, AnalysisRegistry.INDEX_ANALYSIS_TOKENIZER + "." + tokenizerName));
    // Analyzer applied to the synonym rules themselves; lower-cases them when
    // ignore_case is set so rules match case-insensitively.
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = tokenizerFactory == null ? new WhitespaceTokenizer() : tokenizerFactory.create();
            TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
            return new TokenStreamComponents(tokenizer, stream);
        }
    };


    // Choose the synonym reload strategy from the location prefix:
    // http:// -> remote fetch, anything else -> local file.
    SynonymFile synonymFile;
    if (location.startsWith("http://")) {
        synonymFile = new RemoteSynonymFile(env, analyzer, expand, format, location);
    } else {
        synonymFile = new LocalSynonymFile(env, analyzer, expand, format, location);
    }
    synonymMap = synonymFile.reloadSynonymMap();

    // Register a per-index monitor that reloads the synonym file every
    // `interval` seconds.
    scheduledFutures.putIfAbsent(this.indexName, new CopyOnWriteArrayList<ScheduledFuture>());
    scheduledFutures.get(this.indexName)
            .add(monitorPool.scheduleAtFixedRate(new Monitor(synonymFile), interval, interval, TimeUnit.SECONDS));
}
 
Example #9
Source File: SynonymTokenFilterFactory.java    From elasticsearch-analysis-synonym with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a synonym token filter factory: resolves the tokenizer used to parse
 * synonym rules, builds the rule-parsing analyzer, and loads the synonym map
 * via {@code SynonymLoader}.
 *
 * @throws IOException if synonym resources cannot be read
 * @throws IllegalArgumentException if the configured tokenizer is unknown, or
 *         neither {@code synonyms} nor {@code synonyms_path} is configured
 */
public SynonymTokenFilterFactory(final IndexSettings indexSettings, final Environment environment, final String name, final Settings settings,
        final AnalysisRegistry analysisRegistry) throws IOException {
    super(indexSettings, name, settings);

    this.ignoreCase = settings.getAsBoolean("ignore_case", false);
    final boolean expand = settings.getAsBoolean("expand", true);

    // Tokenizer used to parse the synonym rule text (default: whitespace).
    final String tokenizerName = settings.get("tokenizer", "whitespace");

    // analysisRegistry may be null (e.g. in some test contexts); fall back to
    // the default tokenizer in that case.
    AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = null;
    if (analysisRegistry != null) {
        tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(tokenizerName, indexSettings);
        if (tokenizerFactoryFactory == null) {
            throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
        }
    }

    final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory == null ? null
            : tokenizerFactoryFactory.get(indexSettings, environment, tokenizerName, AnalysisRegistry
                    .getSettingsFromIndexSettings(indexSettings, AnalysisRegistry.INDEX_ANALYSIS_TOKENIZER + "." + tokenizerName));

    // Analyzer applied to the synonym rules; lower-cases them when ignore_case
    // is set so rules match case-insensitively.
    final Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = tokenizerFactory == null ? new WhitespaceTokenizer() : tokenizerFactory.create();
            final TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
            return new TokenStreamComponents(tokenizer, stream);
        }
    };

    synonymLoader = new SynonymLoader(environment, settings, expand, analyzer);
    if (synonymLoader.getSynonymMap() == null) {
        // An empty map is tolerated (warn) when a synonym source was at least
        // configured; it is an error when no source was given at all.
        if (settings.getAsList("synonyms", null) != null) {
            logger.warn("synonyms values are empty.");
        } else if (settings.get("synonyms_path") != null) {
            logger.warn("synonyms_path[{}] is empty.", settings.get("synonyms_path"));
        } else {
            throw new IllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
        }
    }
}