org.apache.lucene.analysis.core.StopFilterFactory Java Examples

The following examples show how to use org.apache.lucene.analysis.core.StopFilterFactory. They are taken from open source projects; the source file and license are noted above each example.
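
Before the project examples, here is a minimal, self-contained sketch of the factory's basic lifecycle: it is constructed with an argument map (which it consumes), resolves its word file through a ResourceLoader, and then wraps a TokenStream. This is an illustrative sketch, not code from any of the projects below; the class name StopFilterFactorySketch and the stopwords.txt resource path are placeholders, and the package locations assume a Lucene 8.x classpath.

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;

public class StopFilterFactorySketch {
  public static void main(String[] args) throws Exception {
    // Same argument names used by the project examples below.
    Map<String, String> config = new HashMap<>();
    config.put("ignoreCase", "true");
    config.put("words", "stopwords.txt"); // placeholder classpath resource
    config.put("format", "wordset");

    // The factory consumes the map; unknown keys cause an IllegalArgumentException.
    StopFilterFactory factory = new StopFilterFactory(config);

    // StopFilterFactory is ResourceLoaderAware: the word file is loaded here.
    factory.inform(new ClasspathResourceLoader(StopFilterFactorySketch.class));

    CharArraySet stopWords = factory.getStopWords();
    System.out.println("Loaded " + stopWords.size() + " stop words");

    // Wrap a tokenizer with the configured stop filter.
    TokenStream filtered = factory.create(new WhitespaceTokenizer());
    filtered.close();
  }
}

The same configuration keys (ignoreCase, words, format) appear throughout the examples, whether passed as varargs strings, as a Map, or as class-based builder arguments.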
Example #1
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0
public void testStopWordsFromClasspath() throws Exception {
  CustomAnalyzer a = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter("stop",
          "ignoreCase", "true",
          "words", "org/apache/lucene/analysis/custom/teststop.txt",
          "format", "wordset")
      .build();
  
  assertSame(WhitespaceTokenizerFactory.class, a.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), a.getCharFilterFactories());
  List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
  assertEquals(1, tokenFilters.size());
  assertSame(StopFilterFactory.class, tokenFilters.get(0).getClass());
  assertEquals(0, a.getPositionIncrementGap("dummy"));
  assertEquals(1, a.getOffsetGap("dummy"));
  assertSame(Version.LATEST, a.getVersion());

  assertAnalyzesTo(a, "foo Foo Bar", new String[0]);
  a.close();
}
 
Example #2
Source File: AnalyzerFactory.java    From airsonic-advanced with GNU General Public License v3.0
private Builder createDefaultAnalyzerBuilder() throws IOException {
    Builder builder = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class)
            .addTokenFilter(CJKWidthFilterFactory.class)
            .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
            .addTokenFilter(LowerCaseFilterFactory.class)
            .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS)
            .addTokenFilter(EnglishPossessiveFilterFactory.class);
    addTokenFilterForUnderscoreRemovalAroundToken(builder);
    return builder;
}
 
Example #3
Source File: AnalyzerFactory.java    From airsonic-advanced with GNU General Public License v3.0
private Builder createArtistAnalyzerBuilder() throws IOException {
    Builder builder = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class)
            .addTokenFilter(CJKWidthFilterFactory.class)
            .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
            .addTokenFilter(LowerCaseFilterFactory.class)
            .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS_ARTIST)
            .addTokenFilter(EnglishPossessiveFilterFactory.class);
    addTokenFilterForUnderscoreRemovalAroundToken(builder);
    return builder;
}
 
Example #4
Source File: AnalyzerFactory.java    From airsonic with GNU General Public License v3.0
private Builder createDefaultAnalyzerBuilder() throws IOException {
    Builder builder = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class)
            .addTokenFilter(CJKWidthFilterFactory.class)
            .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
            .addTokenFilter(LowerCaseFilterFactory.class)
            .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS)
            .addTokenFilter(EnglishPossessiveFilterFactory.class);
    addTokenFilterForUnderscoreRemovalAroundToken(builder);
    return builder;
}
 
Example #5
Source File: AnalyzerFactory.java    From airsonic with GNU General Public License v3.0
private Builder createArtistAnalyzerBuilder() throws IOException {
    Builder builder = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class)
            .addTokenFilter(CJKWidthFilterFactory.class)
            .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
            .addTokenFilter(LowerCaseFilterFactory.class)
            .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS_ARTIST)
            .addTokenFilter(EnglishPossessiveFilterFactory.class);
    addTokenFilterForUnderscoreRemovalAroundToken(builder);
    return builder;
}
 
Example #6
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0
public void testStopWordsFromClasspathWithMap() throws Exception {
  Map<String,String> stopConfig1 = new HashMap<>();
  stopConfig1.put("ignoreCase", "true");
  stopConfig1.put("words", "org/apache/lucene/analysis/custom/teststop.txt");
  stopConfig1.put("format", "wordset");
  
  Map<String,String> stopConfig2 = new HashMap<>(stopConfig1);
  Map<String,String> stopConfigImmutable = Collections.unmodifiableMap(new HashMap<>(stopConfig1));

  CustomAnalyzer a = CustomAnalyzer.builder()
      .withTokenizer("whitespace")
      .addTokenFilter("stop", stopConfig1)
      .build();
  assertTrue(stopConfig1.isEmpty());
  assertAnalyzesTo(a, "foo Foo Bar", new String[0]);
  
  a = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter(StopFilterFactory.class, stopConfig2)
      .build();
  assertTrue(stopConfig2.isEmpty());
  assertAnalyzesTo(a, "foo Foo Bar", new String[0]);
  
  // try with unmodifiableMap, should fail
  expectThrows(UnsupportedOperationException.class, () -> {
    CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter("stop", stopConfigImmutable)
        .build();
  });
  a.close();
}
 
Example #7
Source File: SolrStopwordsCarrot2LexicalDataFactory.java    From lucene-solr with Apache License 2.0
/**
 * Obtains stop words for a field from the associated
 * {@link StopFilterFactory}, if any.
 */
private List<CharArraySet> getSolrStopWordsForField(String fieldName) {
  // No need to synchronize here, Carrot2 ensures that instances
  // of this class are not used by multiple threads at a time.
  synchronized (solrStopWords) {
    if (!solrStopWords.containsKey(fieldName)) {
      solrStopWords.put(fieldName, new ArrayList<>());

      IndexSchema schema = core.getLatestSchema();
      final Analyzer fieldAnalyzer = schema.getFieldType(fieldName).getIndexAnalyzer();
      if (fieldAnalyzer instanceof TokenizerChain) {
        final TokenFilterFactory[] filterFactories = 
            ((TokenizerChain) fieldAnalyzer).getTokenFilterFactories();
        for (TokenFilterFactory factory : filterFactories) {
          if (factory instanceof StopFilterFactory) {
            // StopFilterFactory holds the stop words in a CharArraySet
            CharArraySet stopWords = ((StopFilterFactory) factory).getStopWords();
            solrStopWords.get(fieldName).add(stopWords);
          }

          if (factory instanceof CommonGramsFilterFactory) {
            CharArraySet commonWords = ((CommonGramsFilterFactory) factory).getCommonWords();
            solrStopWords.get(fieldName).add(commonWords);
          }
        }
      }
    }
    return solrStopWords.get(fieldName);
  }
}
 
Example #8
Source File: TaggerRequestHandler.java    From lucene-solr with Apache License 2.0
private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) {
  FieldType fieldType = req.getSchema().getFieldType(field);
  Analyzer analyzer = fieldType.getIndexAnalyzer(); // index analyzer
  if (analyzer instanceof TokenizerChain) {
    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories();
    for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
      if (tokenFilterFactory instanceof StopFilterFactory)
        return true;
    }
  }
  return false;
}
 
Example #9
Source File: TaggerRequestHandler.java    From SolrTextTagger with Apache License 2.0
private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) {
  FieldType fieldType = req.getSchema().getFieldType(field);
  Analyzer analyzer = fieldType.getIndexAnalyzer(); // index analyzer
  if (analyzer instanceof TokenizerChain) {
    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories();
    for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
      if (tokenFilterFactory instanceof StopFilterFactory)
        return true;
    }
  }
  return false;
}