Java Code Examples for org.apache.lucene.analysis.StopFilter

The following examples show how to use org.apache.lucene.analysis.StopFilter. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: lucene-solr   Source File: TestFreeTextSuggester.java    License: Apache License 2.0 6 votes vote down vote up
public void testEndingHole() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field) {
        Tokenizer tokenizer = new MockTokenizer();
        CharArraySet stopSet = StopFilter.makeStopSet("of");
        return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
      }
    };

  Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(
      new Input("wizard of oz", 50)
  );
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("wizard _ oz/1.00",
               toString(sug.lookup("wizard of", 10)));

  // Falls back to unigram model, with backoff 0.4 times
  // prop 0.5:
  assertEquals("oz/0.20",
               toString(sug.lookup("wizard o", 10)));
  a.close();
}
 
Example 2
Source Project: lucene-solr   Source File: TestFreeTextSuggester.java    License: Apache License 2.0 6 votes vote down vote up
public void testTwoEndingHoles() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field) {
        Tokenizer tokenizer = new MockTokenizer();
        CharArraySet stopSet = StopFilter.makeStopSet("of");
        return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
      }
    };

  Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(
      new Input("wizard of of oz", 50)
  );
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("",
               toString(sug.lookup("wizard of of", 10)));
  a.close();
}
 
Example 3
Source Project: lucene-solr   Source File: TestSuggestStopFilter.java    License: Apache License 2.0 6 votes vote down vote up
public void testEndNotStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "to"},
                            new int[] {0, 3},
                            new int[] {2, 5},
                            null,
                            new int[] {1, 1},
                            null,
                            5,
                            new boolean[] {false, true},
                            true);
}
 
Example 4
Source Project: lucene-solr   Source File: TestSuggestStopFilter.java    License: Apache License 2.0 6 votes vote down vote up
public void testEndIsStopWord() throws Exception {
                            
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            6,
                            new boolean[] {false},
                            true);
}
 
Example 5
Source Project: lucene-solr   Source File: TestSuggestStopFilter.java    License: Apache License 2.0 6 votes vote down vote up
public void testMidStopWord() throws Exception {
                            
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "school"},
                            new int[] {0, 6},
                            new int[] {2, 12},
                            null,
                            new int[] {1, 2},
                            null,
                            12,
                            new boolean[] {false, false},
                            true);
}
 
Example 6
Source Project: lucene-solr   Source File: TestSuggestStopFilter.java    License: Apache License 2.0 6 votes vote down vote up
public void testMultipleStopWords() throws Exception {
                            
  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to a the school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] { "go", "school" },
                            new int[] {0, 12},
                            new int[] {2, 18},
                            null,
                            new int[] {1, 4},
                            null,
                            18,
                            new boolean[] {false, false},
                            true);
}
 
Example 7
Source Project: lucene-solr   Source File: TestSuggestStopFilter.java    License: Apache License 2.0 6 votes vote down vote up
public void testMultipleStopWordsEnd() throws Exception {
                            
  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to a the"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] { "go", "the"},
                            new int[] {0, 8},
                            new int[] {2, 11},
                            null,
                            new int[] {1, 3},
                            null,
                            11,
                            new boolean[] {false, true},
                            true);
}
 
Example 8
Source Project: lucene-solr   Source File: TestSuggestStopFilter.java    License: Apache License 2.0 6 votes vote down vote up
public void testMultipleStopWordsEnd2() throws Exception {
                            
  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to a the "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] { "go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            12,
                            new boolean[] {false},
                            true);
}
 
Example 9
Source Project: lucene-solr   Source File: TestConcatenateGraphFilter.java    License: Apache License 2.0 6 votes vote down vote up
@Test
  public void testWithStopword() throws Exception {
    for (boolean preservePosInc : new boolean[]{true, false}) {
      Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
      String input = "a mykeyword a keyword"; //LUCENE-8344 add "a"
      tokenStream.setReader(new StringReader(input));
      TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
      ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
      CharsRefBuilder builder = new CharsRefBuilder();
      if (preservePosInc) {
        builder.append(SEP_LABEL);
      }
      builder.append("mykeyword");
      builder.append(SEP_LABEL);
      if (preservePosInc) {
        builder.append(SEP_LABEL);
      }
      builder.append("keyword");
//      if (preservePosInc) { LUCENE-8344 uncomment
//        builder.append(SEP_LABEL);
//      }
      assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()});
    }
  }
 
Example 10
Source Project: subsonic   Source File: SearchService.java    License: GNU General Public License v3.0 6 votes vote down vote up
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
 
Example 11
Source Project: lucene-solr   Source File: QueryAutoStopWordAnalyzer.java    License: Apache License 2.0 5 votes vote down vote up
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
  Set<String> stopWords = stopWordsPerField.get(fieldName);
  if (stopWords == null) {
    return components;
  }
  StopFilter stopFilter = new StopFilter(components.getTokenStream(), 
      new CharArraySet(stopWords, false));
  return new TokenStreamComponents(components.getSource(), stopFilter);
}
 
Example 12
Source Project: lucene-solr   Source File: TestConcatenateGraphFilter.java    License: Apache License 2.0 5 votes vote down vote up
@Test
public void testSeparatorWithStopWords() throws IOException {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  String input = "A B C D E F J H";
  tokenStream.setReader(new StringReader(input));
  TokenStream tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("A", "D", "E", "J"));
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenFilter, '-', false, 100);

  assertTokenStreamContents(stream, new String[] {"B-C-F-H"}, null, null, new int[] { 1 });
}
 
Example 13
Source Project: lucene-solr   Source File: TestConcatenateGraphFilter.java    License: Apache License 2.0 5 votes vote down vote up
@Test
public void testSeparatorWithStopWordsAndPreservePositionIncrements() throws IOException {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  String input = "A B C D E F J H";
  tokenStream.setReader(new StringReader(input));
  TokenStream tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("A", "D", "E", "J"));
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenFilter, '-', true, 100);

  assertTokenStreamContents(stream, new String[] {"-B-C---F--H"}, null, null, new int[] { 1 });
}
 
Example 14
@Before
public void up() {
    analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            TokenStream result = new CJKBigramFilter(source);
            return new TokenStreamComponents(source, new StopFilter(result, CharArraySet.EMPTY_SET));
        }
    };
}
 
Example 15
Source Project: crate   Source File: StopTokenFilterFactory.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public TokenStream create(TokenStream tokenStream) {
    if (removeTrailing) {
        return new StopFilter(tokenStream, stopWords);
    } else {
        return new SuggestStopFilter(tokenStream, stopWords);
    }
}
 
Example 16
Source Project: crate   Source File: CommonAnalysisPlugin.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton(
        "common_grams",
        false,
        input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false, input ->
            new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload", false, input ->
            new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton(
        "edge_ngram",
        false,
        input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilterFactory.SIDE_FRONT, EdgeNGramTokenFilterFactory.SIDE_BACK, EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL)));
    filters.add(PreConfiguredTokenFilter.singleton(
        "elision",
        true,
        input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES))
    );
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
        new LengthFilter(input, 0, Integer.MAX_VALUE)));  // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton(
        "limit",
        false,
        input -> new LimitTokenCountFilter(
            input,
            LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
            LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)
        )
    );
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
    filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /**
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
    filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
            new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS
                  | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                  | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                  | WordDelimiterFilter.SPLIT_ON_NUMERICS
                  | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input ->
            new WordDelimiterGraphFilter(input,
                    WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                  | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                  | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                  | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                  | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}
 
Example 17
@Override public TokenStream create(TokenStream tokenStream) {
    return new StopFilter(tokenStream, VietnameseAnalyzer.getDefaultStopSet());
}
 
Example 18
Source Project: lucene-solr   Source File: StopAnalyzer.java    License: Apache License 2.0 2 votes vote down vote up
/**
 * Creates
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 * 
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link LetterTokenizer} filtered with
 *         {@link StopFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = new LetterTokenizer();
  return new TokenStreamComponents(source, new StopFilter(new LowerCaseFilter(source), stopwords));
}