org.apache.lucene.analysis.TokenFilter Java Examples

The following examples show how to use org.apache.lucene.analysis.TokenFilter. They are drawn from open-source projects; the header above each example names the source file, the project it comes from, and its license.
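
Before the examples, here is a minimal sketch of the TokenFilter contract they all rely on: a filter wraps another TokenStream, and its incrementToken() advances the wrapped stream, edits the shared attributes in place, and reports whether a token was produced. The class below is purely didactic (Lucene already ships a production LowerCaseFilter) and naively ignores supplementary characters:

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/** Didactic sketch: lower-cases each token from the wrapped stream. */
public final class SimpleLowerCaseFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public SimpleLowerCaseFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false; // wrapped stream is exhausted
    }
    final char[] buffer = termAtt.buffer();
    final int length = termAtt.length();
    for (int i = 0; i < length; i++) {
      buffer[i] = Character.toLowerCase(buffer[i]); // naive: ignores surrogate pairs
    }
    return true;
  }
}
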
Example #1
Source File: NGramTokenFilterTest.java    From lucene-solr with Apache License 2.0
public void testInvalidOffsets() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
      filters = new NGramTokenFilter(filters, 2, 2, false);
      return new TokenStreamComponents(tokenizer, filters);
    }
  };
  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
      new int[]    {    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
      new int[]    {   11,   11,   11,   11,   11,   11,   11,   11,   11,   11,   11 },
      new int[]    {    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 });
  analyzer.close();
}
 
Example #2
Source File: TestConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0
@Test
public void testWithStopword() throws Exception {
  for (boolean preservePosInc : new boolean[]{true, false}) {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "a mykeyword a keyword"; //LUCENE-8344 add "a"
    tokenStream.setReader(new StringReader(input));
    TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
    ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
    CharsRefBuilder builder = new CharsRefBuilder();
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("mykeyword");
    builder.append(SEP_LABEL);
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("keyword");
//    if (preservePosInc) { LUCENE-8344 uncomment
//      builder.append(SEP_LABEL);
//    }
    assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()});
  }
}
 
Example #3
Source File: ThrowingMockTokenFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenStream create(TokenStream input) {
  return new TokenFilter(input) {
    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        try {
          throw exceptionClass.getConstructor().newInstance();
        } catch (IllegalAccessException | InstantiationException | InvocationTargetException | NoSuchMethodException iae) {
          throw new RuntimeException(iae);
        }
      }
      return false;
    }
  };
}
 
Example #4
Source File: TestElision.java    From lucene-solr with Apache License 2.0
public void testElision() throws Exception {
  String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
  Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(test));
  CharArraySet articles = new CharArraySet(asSet("l", "M"), false);
  TokenFilter filter = new ElisionFilter(tokenizer, articles);
  List<String> tas = filter(filter);
  assertEquals("embrouille", tas.get(4));
  assertEquals("O'brian", tas.get(6));
  assertEquals("enfin", tas.get(7));
}
 
Example #5
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0
/**
 * Test that CommonGramsFilter works correctly in case-sensitive mode
 */
public void testCaseSensitive() throws Exception {
  final String input = "How The s a brown s cow d like A B thing?";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  TokenFilter cgf = new CommonGramsFilter(wt, commonWords);
  assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
      "s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", "cow",
      "cow_d", "d", "d_like", "like", "A", "B", "thing?"});
}
 
Example #6
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter in the case that the last word is a stopword
 */
public void testLastWordisStopWord() throws Exception {
  final String input = "dog the";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "dog_the" });
}
 
Example #7
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter in the case that the first word is a stopword
 */
public void testFirstWordisStopWord() throws Exception {
  final String input = "the dog";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the_dog" });
}
 
Example #8
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter in the case of a single (stop)word query
 */
public void testOneWordQueryStopWord() throws Exception {
  final String input = "the";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the" });
}
 
Example #9
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter in the case of a single word query
 */
public void testOneWordQuery() throws Exception {
  final String input = "monster";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "monster" });
}
 
Example #10
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter when first and last words are stopwords.
 */
public void testFirstAndLastStopWord() throws Exception {
  final String input = "the of";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the_of" });
}
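
Read together, Examples #6 through #10 imply how a complete chain is assembled outside a test. Below is a minimal sketch, not taken from the test suite; commonWords stands in for a CharArraySet of common words, as in the tests:

Analyzer indexAnalyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    TokenFilter sink = new CommonGramsFilter(source, commonWords);
    return new TokenStreamComponents(source, sink);
  }
};

At query time the same chain would additionally be wrapped in CommonGramsQueryFilter, which collapses a common word and its neighbor into a single bigram token, as the assertions above demonstrate.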
 
Example #11
Source File: TestDocInverterPerFieldErrorInfo.java    From lucene-solr with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = new MockTokenizer();
  if (fieldName.equals("distinctiveFieldName")) {
    TokenFilter tosser = new TokenFilter(tokenizer) {
      @Override
      public boolean incrementToken() throws IOException {
        throw new BadNews("Something is icky.");
      }
    };
    return new TokenStreamComponents(tokenizer, tosser);
  } else {
    return new TokenStreamComponents(tokenizer);
  }
}
 
Example #12
Source File: TreatmentCurator.java    From hmftools with GNU General Public License v3.0
@NotNull
private static Analyzer spellcheckAnalyzer(@NotNull SpellChecker spellChecker) {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(@NotNull String field) {
            Tokenizer source = new WhitespaceTokenizer();
            source.setReader(new StringReader(field));
            SpellCheckerTokenFilter spellCheckFilter = new SpellCheckerTokenFilter(defaultTokenFilter(source), spellChecker);
            TokenFilter concatenatingFilter = new ConcatenatingFilter(spellCheckFilter, ' ');
            return new TokenStreamComponents(source, concatenatingFilter);
        }
    };
}
 
Example #13
Source File: TreatmentCurator.java    From hmftools with GNU General Public License v3.0
@NotNull
private static Analyzer concatenatingAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(@NotNull String field) {
            Tokenizer source = new WhitespaceTokenizer();
            source.setReader(new StringReader(field));
            TokenFilter concatenatingFilter = new ConcatenatingFilter(defaultTokenFilter(source), ' ');
            return new TokenStreamComponents(source, concatenatingFilter);
        }
    };
}
 
Example #14
Source File: SegmentationIcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private static Analyzer createAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            TokenFilter filter = new IcuNormalizerFilter(tokenizer,
                    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
}
 
Example #15
Source File: PatternCaptureGroupTokenFilterFactory.java    From Elasticsearch with Apache License 2.0
@Override
public TokenFilter create(TokenStream tokenStream) {
    return new PatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns);
}
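
This factory and Examples #18 through #26 below all follow the same pattern: a TokenFilterFactory subclass reads its configuration from an args map in its constructor and wraps the incoming stream in create(). A hypothetical minimal factory, sketched with invented names (SimpleLowerCaseFilterFactory is not part of any project above; note that the package of TokenFilterFactory varies across Lucene versions):

import java.util.Map;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory; // org.apache.lucene.analysis in Lucene 9.x

public class SimpleLowerCaseFilterFactory extends TokenFilterFactory {
  public SimpleLowerCaseFilterFactory(Map<String, String> args) {
    super(args); // the superclass consumes shared parameters such as luceneMatchVersion
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new LowerCaseFilter(input);
  }
}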
 
Example #16
Source File: TestMoreLikeThis.java    From lucene-solr with Apache License 2.0
public void testCustomFrequency() throws IOException {
  // define an analyzer with delimited term frequency, e.g. "foo|2 bar|3"
  Analyzer analyzer = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false, 100);
      MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
      return new TokenStreamComponents(tokenizer, addCustomTokenFilter(filt));
    }

    TokenStream addCustomTokenFilter(TokenStream input) {
      return new TokenFilter(input) {
        final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        final TermFrequencyAttribute tfAtt = addAttribute(TermFrequencyAttribute.class);

        @Override
        public boolean incrementToken() throws IOException {
          if (input.incrementToken()) {
            final char[] buffer = termAtt.buffer();
            final int length = termAtt.length();
            for (int i = 0; i < length; i++) {
              if (buffer[i] == '|') {
                termAtt.setLength(i);
                i++;
                tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i, length - i));
                return true;
              }
            }
            return true;
          }
          return false;
        }
      };
    }
  };

  mlt.setAnalyzer(analyzer);
  mlt.setFieldNames(new String[] {"text"});
  mlt.setBoost(true);

  final double boost10 = ((BooleanQuery) mlt.like("text", new StringReader("lucene|10 release|1")))
      .clauses()
      .stream()
      .map(BooleanClause::getQuery)
      .map(BoostQuery.class::cast)
      .filter(x -> ((TermQuery) x.getQuery()).getTerm().text().equals("lucene"))
      .mapToDouble(BoostQuery::getBoost)
      .sum();

  final double boost1 = ((BooleanQuery) mlt.like("text", new StringReader("lucene|1 release|1")))
      .clauses()
      .stream()
      .map(BooleanClause::getQuery)
      .map(BoostQuery.class::cast)
      .filter(x -> ((TermQuery) x.getQuery()).getTerm().text().equals("lucene"))
      .mapToDouble(BoostQuery::getBoost)
      .sum();

  // mlt should use the custom frequencies provided by the analyzer so "lucene|10" should be boosted more than "lucene|1"
  assertTrue(String.format(Locale.ROOT, "%s should be greater than %s", boost10, boost1), boost10 > boost1);
}
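
The anonymous filter above hand-parses "term|freq" pairs. Lucene's miscellaneous module ships a ready-made equivalent, DelimitedTermFrequencyTokenFilter (whose default delimiter is '|', to the best of my knowledge), so outside a test the custom filter could plausibly be replaced by a chain like this sketch:

Tokenizer source = new WhitespaceTokenizer();
source.setReader(new StringReader("lucene|10 release|1"));
TokenStream sink = new DelimitedTermFrequencyTokenFilter(source); // strips "|10" and records it as the term frequency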
 
Example #17
Source File: TestTermAutomatonQuery.java    From lucene-solr with Apache License 2.0
public RandomSynonymFilter(TokenFilter in) {
  super(in);
}
 
Example #18
Source File: NGramFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new NGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
}
 
Example #19
Source File: EdgeNGramFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
}
 
Example #20
Source File: HyphenationCompoundWordTokenFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new HyphenationCompoundWordTokenFilter(input, hyphenator, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
 
Example #21
Source File: ClassicFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new ClassicFilter(input);
}
 
Example #22
Source File: KStemFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new KStemFilter(input);
}
 
Example #23
Source File: CommonGramsFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new CommonGramsFilter(input, commonWords);
}
 
Example #24
Source File: CommonGramsQueryFilterFactory.java    From lucene-solr with Apache License 2.0
/**
 * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
 */
@Override
public TokenFilter create(TokenStream input) {
  CommonGramsFilter commonGrams = (CommonGramsFilter) super.create(input);
  return new CommonGramsQueryFilter(commonGrams);
}
 
Example #25
Source File: WordDelimiterFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
                                 flags, protectedWords);
}
 
Example #26
Source File: WordDelimiterGraphFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new WordDelimiterGraphFilter(input, adjustOffsets, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
                                      flags, protectedWords);
}
 
Example #27
Source File: TestIndicNormalizer.java    From lucene-solr with Apache License 2.0
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(input));
  TokenFilter tf = new IndicNormalizationFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
 
Example #28
Source File: TestHindiStemmer.java    From lucene-solr with Apache License 2.0
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter tf = new HindiStemFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
 
Example #29
Source File: TestHindiNormalizer.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter tf = new HindiNormalizationFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
 
Example #30
Source File: TestBengaliStemmer.java    From lucene-solr with Apache License 2.0
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter tf = new BengaliStemFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}