Java Code Examples for org.apache.lucene.analysis.TokenFilter

The following examples show how to use org.apache.lucene.analysis.TokenFilter. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: lucene-solr   Source File: NGramTokenFilterTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Checks offsets when an upstream filter (ASCIIFoldingFilter) changes token
 * lengths before n-gramming: every bigram of "mosfellsbær" keeps the start/end
 * offsets (0, 11) of the whole original token, with only the first gram
 * carrying a position increment.
 */
public void testInvalidOffsets() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Fold accents first, then emit fixed-size 2-grams.
      TokenFilter chain = new NGramTokenFilter(new ASCIIFoldingFilter(source), 2, 2, false);
      return new TokenStreamComponents(source, chain);
    }
  };
  assertAnalyzesTo(a, "mosfellsbær",
      new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
      new int[]    {    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
      new int[]    {   11,   11,   11,   11,   11,   11,   11,   11,   11,   11,   11 },
      new int[]    {     1,   0,    0,    0,    0,    0,    0,    0,    0,    0,    0  });
  a.close();
}
 
Example 2
Source Project: lucene-solr   Source File: TestConcatenateGraphFilter.java    License: Apache License 2.0 6 votes vote down vote up
// Verifies ConcatenateGraphFilter output around a removed stopword ("a"),
// both with and without preserving position increments (see LUCENE-8344).
@Test
  public void testWithStopword() throws Exception {
    for (boolean preservePosInc : new boolean[]{true, false}) {
      Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
      String input = "a mykeyword a keyword"; //LUCENE-8344 add "a"
      tokenStream.setReader(new StringReader(input));
      // StopFilter removes both "a" tokens before concatenation.
      TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
      ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
      // Build the single expected concatenated token by hand.
      // NOTE(review): the extra SEP_LABELs appended when preservePosInc is true
      // presumably mark the holes left by the removed stopwords — confirm
      // against ConcatenateGraphFilter's contract.
      CharsRefBuilder builder = new CharsRefBuilder();
      if (preservePosInc) {
        builder.append(SEP_LABEL);
      }
      builder.append("mykeyword");
      builder.append(SEP_LABEL);
      if (preservePosInc) {
        builder.append(SEP_LABEL);
      }
      builder.append("keyword");
//      if (preservePosInc) { LUCENE-8344 uncomment
//        builder.append(SEP_LABEL);
//      }
      assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()});
    }
  }
 
Example 3
/**
 * Wraps {@code input} in a filter that, as soon as the upstream stream
 * produces a token, reflectively constructs and throws a new instance of the
 * configured {@code exceptionClass} — used to simulate analysis failures.
 *
 * @param input upstream token stream to wrap
 * @return a TokenFilter that throws while tokens remain, then returns false
 */
@Override
public TokenStream create(TokenStream input) {
  return new TokenFilter(input) {
    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        try {
          throw exceptionClass.getConstructor().newInstance();
        } catch (ReflectiveOperationException e) {
          // ReflectiveOperationException is the common supertype of the four
          // exceptions getConstructor()/newInstance() can throw
          // (IllegalAccess, Instantiation, InvocationTarget, NoSuchMethod).
          throw new RuntimeException(e);
        }
      }
      return false;
    }
  };
}
 
Example 4
Source Project: lucene-solr   Source File: TestElision.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * French elision: with articles {"l", "M"}, "l'embrouille" yields "embrouille"
 * and "M'enfin" yields "enfin", while "O'brian" is left intact because "O" is
 * not in the article set.
 */
public void testElision() throws Exception {
  final String text = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
  Tokenizer source = new StandardTokenizer(newAttributeFactory());
  source.setReader(new StringReader(text));
  CharArraySet articles = new CharArraySet(asSet("l", "M"), false);
  List<String> terms = filter(new ElisionFilter(source, articles));
  assertEquals("embrouille", terms.get(4));
  assertEquals("O'brian", terms.get(6));
  assertEquals("enfin", terms.get(7));
}
 
Example 5
Source Project: lucene-solr   Source File: CommonGramsFilterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * CommonGramsFilter built with the lowercase {@code commonWords} set: each
 * common word is paired with its neighbors into "_"-joined bigrams while the
 * original unigrams are kept.
 */
public void testCaseSensitive() throws Exception {
  final String text = "How The s a brown s cow d like A B thing?";
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(text));
  TokenFilter grams = new CommonGramsFilter(tokenizer, commonWords);
  assertTokenStreamContents(grams, new String[] {"How", "The", "The_s", "s",
      "s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", "cow",
      "cow_d", "d", "d_like", "like", "A", "B", "thing?"});
}
 
Example 6
Source Project: lucene-solr   Source File: CommonGramsFilterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * CommonGramsQueryFilter when the final word is a stopword: "dog the"
 * collapses to the single bigram "dog_the".
 */
public void testLastWordisStopWord() throws Exception {
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("dog the"));
  TokenFilter queryFilter = new CommonGramsQueryFilter(new CommonGramsFilter(tokenizer, commonWords));
  assertTokenStreamContents(queryFilter, new String[] { "dog_the" });
}
 
Example 7
Source Project: lucene-solr   Source File: CommonGramsFilterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * CommonGramsQueryFilter when the first word is a stopword: "the dog"
 * collapses to the single bigram "the_dog".
 */
public void testFirstWordisStopWord() throws Exception {
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("the dog"));
  TokenFilter queryFilter = new CommonGramsQueryFilter(new CommonGramsFilter(tokenizer, commonWords));
  assertTokenStreamContents(queryFilter, new String[] { "the_dog" });
}
 
Example 8
Source Project: lucene-solr   Source File: CommonGramsFilterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * CommonGramsQueryFilter for a query consisting of a single stopword: with no
 * neighbor to pair with, "the" passes through unchanged.
 */
public void testOneWordQueryStopWord() throws Exception {
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("the"));
  TokenFilter queryFilter = new CommonGramsQueryFilter(new CommonGramsFilter(tokenizer, commonWords));
  assertTokenStreamContents(queryFilter, new String[] { "the" });
}
 
Example 9
Source Project: lucene-solr   Source File: CommonGramsFilterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * CommonGramsQueryFilter for a single non-stopword query: "monster" passes
 * through unchanged.
 */
public void testOneWordQuery() throws Exception {
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("monster"));
  TokenFilter queryFilter = new CommonGramsQueryFilter(new CommonGramsFilter(tokenizer, commonWords));
  assertTokenStreamContents(queryFilter, new String[] { "monster" });
}
 
Example 10
Source Project: lucene-solr   Source File: CommonGramsFilterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Test CommonGramsQueryFilter when first and last words are stopwords: "the
 * of" collapses to the single bigram "the_of".
 *
 * <p>Renamed from {@code TestFirstAndLastStopWord}: test frameworks discover
 * methods by the lowercase "test" name prefix, so the capitalized original
 * name meant this test was silently never executed.
 */
public void testFirstAndLastStopWord() throws Exception {
  final String input = "the of";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the_of" });
}
 
Example 11
/**
 * Builds analysis components: the field "distinctiveFieldName" gets a filter
 * whose {@code incrementToken} always throws {@code BadNews}; every other
 * field gets the bare tokenizer.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer source = new MockTokenizer();
  if (!fieldName.equals("distinctiveFieldName")) {
    return new TokenStreamComponents(source);
  }
  TokenFilter alwaysThrows = new TokenFilter(source) {
    @Override
    public boolean incrementToken() throws IOException {
      throw new BadNews("Something is icky.");
    }
  };
  return new TokenStreamComponents(source, alwaysThrows);
}
 
Example 12
Source Project: hmftools   Source File: TreatmentCurator.java    License: GNU General Public License v3.0 5 votes vote down vote up
/**
 * Builds an Analyzer that whitespace-tokenizes the field text, applies the
 * default token filter plus spell checking, and then concatenates the tokens
 * back into one space-separated token.
 */
@NotNull
private static Analyzer spellcheckAnalyzer(@NotNull SpellChecker spellChecker) {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(@NotNull String field) {
            Tokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader(field));
            TokenFilter spellChecked = new SpellCheckerTokenFilter(defaultTokenFilter(tokenizer), spellChecker);
            return new TokenStreamComponents(tokenizer, new ConcatenatingFilter(spellChecked, ' '));
        }
    };
}
 
Example 13
Source Project: hmftools   Source File: TreatmentCurator.java    License: GNU General Public License v3.0 5 votes vote down vote up
/**
 * Builds an Analyzer that whitespace-tokenizes the field text, applies the
 * default token filter, and concatenates the tokens back into one
 * space-separated token (no spell checking).
 */
@NotNull
private static Analyzer concatenatingAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(@NotNull String field) {
            Tokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader(field));
            return new TokenStreamComponents(tokenizer, new ConcatenatingFilter(defaultTokenFilter(tokenizer), ' '));
        }
    };
}
 
Example 14
/**
 * Builds an Analyzer that runs ICU tokenization followed by ICU normalization
 * with the "nfkc_cf" (NFKC + case folding) normalizer in COMPOSE mode.
 */
private static Analyzer createAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Normalizer2 nfkcCaseFold = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
            Tokenizer source = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            return new TokenStreamComponents(source, new IcuNormalizerFilter(source, nfkcCaseFold));
        }
    };
}
 
Example 15
/**
 * Wraps {@code tokenStream} in a PatternCaptureGroupTokenFilter using the
 * factory's configured patterns and preserve-original flag.
 */
@Override
public TokenFilter create(TokenStream tokenStream) {
    TokenFilter captureGroups = new PatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns);
    return captureGroups;
}
 
Example 16
Source Project: lucene-solr   Source File: TestMoreLikeThis.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Checks that MoreLikeThis honors custom term frequencies supplied by the
 * analyzer ("term|freq" delimited syntax): "lucene|10" must receive a larger
 * boost than "lucene|1".
 *
 * <p>Renamed from {@code testCustomFrequecy} to fix the spelling; the "test"
 * prefix is preserved so test discovery is unaffected.
 */
public void testCustomFrequency() throws IOException {
  // define an analyzer with delimited term frequency, e.g. "foo|2 bar|3"
  Analyzer analyzer = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false, 100);
      MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
      return new TokenStreamComponents(tokenizer, addCustomTokenFilter(filt));
    }

    TokenStream addCustomTokenFilter(TokenStream input) {
      return new TokenFilter(input) {
        final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        final TermFrequencyAttribute tfAtt = addAttribute(TermFrequencyAttribute.class);

        @Override
        public boolean incrementToken() throws IOException {
          if (input.incrementToken()) {
            final char[] buffer = termAtt.buffer();
            final int length = termAtt.length();
            for (int i = 0; i < length; i++) {
              if (buffer[i] == '|') {
                // Split "term|freq": truncate the term at the delimiter and
                // parse the remainder as the term frequency.
                termAtt.setLength(i);
                i++;
                tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i, length - i));
                return true;
              }
            }
            return true; // no delimiter: pass the token through unchanged
          }
          return false;
        }
      };
    }
  };

  mlt.setAnalyzer(analyzer);
  mlt.setFieldNames(new String[] {"text"});
  mlt.setBoost(true);

  final double boost10 = luceneClauseBoost("lucene|10 release|1");
  final double boost1 = luceneClauseBoost("lucene|1 release|1");

  // mlt should use the custom frequencies provided by the analyzer so "lucene|10" should be boosted more than "lucene|1"
  assertTrue(String.format(Locale.ROOT, "%s should be greater than %s", boost10, boost1), boost10 > boost1);
}

/**
 * Runs MoreLikeThis on {@code likeText} and sums the boosts of all clauses
 * whose term text is "lucene". Extracted to remove a duplicated pipeline.
 */
private double luceneClauseBoost(String likeText) throws IOException {
  return ((BooleanQuery) mlt.like("text", new StringReader(likeText)))
      .clauses()
      .stream()
      .map(BooleanClause::getQuery)
      .map(BoostQuery.class::cast)
      .filter(x -> ((TermQuery) x.getQuery()).getTerm().text().equals("lucene"))
      .mapToDouble(BoostQuery::getBoost)
      .sum();
}
 
Example 17
Source Project: lucene-solr   Source File: TestTermAutomatonQuery.java    License: Apache License 2.0 4 votes vote down vote up
/** Creates a RandomSynonymFilter over {@code in}; all stream plumbing is delegated to the TokenFilter superclass. */
public RandomSynonymFilter(TokenFilter in) {
  super(in);
}
 
Example 18
Source Project: lucene-solr   Source File: NGramFilterFactory.java    License: Apache License 2.0 4 votes vote down vote up
/** Builds an NGramTokenFilter over {@code input} from the factory's configured gram sizes. */
@Override
public TokenFilter create(TokenStream input) {
  NGramTokenFilter ngrams = new NGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
  return ngrams;
}
 
Example 19
Source Project: lucene-solr   Source File: EdgeNGramFilterFactory.java    License: Apache License 2.0 4 votes vote down vote up
/** Builds an EdgeNGramTokenFilter over {@code input} from the factory's configured gram sizes. */
@Override
public TokenFilter create(TokenStream input) {
  EdgeNGramTokenFilter edgeNgrams = new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
  return edgeNgrams;
}
 
Example 20
/** Builds a HyphenationCompoundWordTokenFilter over {@code input} from the factory's configured hyphenator, dictionary and size limits. */
@Override
public TokenFilter create(TokenStream input) {
  return new HyphenationCompoundWordTokenFilter(
      input, hyphenator, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
 
Example 21
Source Project: lucene-solr   Source File: ClassicFilterFactory.java    License: Apache License 2.0 4 votes vote down vote up
/** Wraps {@code input} in a ClassicFilter. */
@Override
public TokenFilter create(TokenStream input) {
  ClassicFilter classic = new ClassicFilter(input);
  return classic;
}
 
Example 22
Source Project: lucene-solr   Source File: KStemFilterFactory.java    License: Apache License 2.0 4 votes vote down vote up
/** Wraps {@code input} in a KStemFilter. */
@Override
public TokenFilter create(TokenStream input) {
  KStemFilter kstem = new KStemFilter(input);
  return kstem;
}
 
Example 23
Source Project: lucene-solr   Source File: CommonGramsFilterFactory.java    License: Apache License 2.0 4 votes vote down vote up
/** Builds a CommonGramsFilter over {@code input} using the factory's common-word set. */
@Override
public TokenFilter create(TokenStream input) {
  return new CommonGramsFilter(input, commonWords);
}
 
Example 24
Source Project: lucene-solr   Source File: CommonGramsQueryFilterFactory.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
 */
@Override
public TokenFilter create(TokenStream input) {
  // super.create builds the CommonGramsFilter this query filter wraps.
  return new CommonGramsQueryFilter((CommonGramsFilter) super.create(input));
}
 
Example 25
Source Project: lucene-solr   Source File: WordDelimiterFilterFactory.java    License: Apache License 2.0 4 votes vote down vote up
/** Builds a WordDelimiterFilter over {@code input}, falling back to the default delimiter table when none is configured. */
@Override
public TokenFilter create(TokenStream input) {
  if (typeTable == null) {
    return new WordDelimiterFilter(input, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, protectedWords);
  }
  return new WordDelimiterFilter(input, typeTable, flags, protectedWords);
}
 
Example 26
/** Builds a WordDelimiterGraphFilter over {@code input}, falling back to the default delimiter table when none is configured. */
@Override
public TokenFilter create(TokenStream input) {
  if (typeTable == null) {
    return new WordDelimiterGraphFilter(input, adjustOffsets, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, protectedWords);
  }
  return new WordDelimiterGraphFilter(input, adjustOffsets, typeTable, flags, protectedWords);
}
 
Example 27
Source Project: lucene-solr   Source File: TestIndicNormalizer.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Whitespace-tokenizes {@code input}, applies IndicNormalizationFilter, and
 * asserts the stream yields exactly the single token {@code output}.
 * (Also removes a stray ";;" empty statement from the original.)
 */
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(input));
  TokenFilter tf = new IndicNormalizationFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
 
Example 28
Source Project: lucene-solr   Source File: TestHindiStemmer.java    License: Apache License 2.0 4 votes vote down vote up
/** Whitespace-tokenizes {@code input}, applies HindiStemFilter, and expects the single token {@code output}. */
private void check(String input, String output) throws IOException {
  TokenFilter stemmed = new HindiStemFilter(whitespaceMockTokenizer(input));
  assertTokenStreamContents(stemmed, new String[] { output });
}
 
Example 29
Source Project: lucene-solr   Source File: TestHindiNormalizer.java    License: Apache License 2.0 4 votes vote down vote up
/** Whitespace-tokenizes {@code input}, applies HindiNormalizationFilter, and expects the single token {@code output}. */
private void check(String input, String output) throws IOException {
  TokenFilter normalized = new HindiNormalizationFilter(whitespaceMockTokenizer(input));
  assertTokenStreamContents(normalized, new String[] { output });
}
 
Example 30
Source Project: lucene-solr   Source File: TestBengaliStemmer.java    License: Apache License 2.0 4 votes vote down vote up
/** Whitespace-tokenizes {@code input}, applies BengaliStemFilter, and expects the single token {@code output}. */
private void check(String input, String output) throws IOException {
  TokenFilter stemmed = new BengaliStemFilter(whitespaceMockTokenizer(input));
  assertTokenStreamContents(stemmed, new String[] { output });
}