org.apache.lucene.analysis.TokenFilter Java Examples

The following examples show how to use org.apache.lucene.analysis.TokenFilter. They are drawn from open-source projects; the original source file, project, and license are noted above each example.
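
As a quick orientation before the project examples: a TokenFilter wraps an upstream TokenStream, rewrites the shared attributes in incrementToken(), and returns false once the upstream is exhausted. The following minimal sketch is illustrative only (it is not taken from any of the projects below) and assumes a recent Lucene where WhitespaceTokenizer has a no-argument constructor:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/** Illustrative filter that upper-cases every token it sees. */
public final class UpperCaseDemoFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public UpperCaseDemoFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false; // upstream stream is exhausted
    }
    char[] buffer = termAtt.buffer();
    for (int i = 0; i < termAtt.length(); i++) {
      buffer[i] = Character.toUpperCase(buffer[i]); // mutate the shared attribute in place
    }
    return true;
  }

  public static void main(String[] args) throws IOException {
    Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("hello token filters"));
    try (TokenStream stream = new UpperCaseDemoFilter(source)) {
      CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
      stream.reset();                  // required before consuming
      while (stream.incrementToken()) {
        System.out.println(term);      // HELLO, TOKEN, FILTERS
      }
      stream.end();                    // required after consuming
    }
  }
}
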
Example #1
Source File: ThrowingMockTokenFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenStream create(TokenStream input) {
  return new TokenFilter(input) {
    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        // Throw a fresh instance of the configured exception type to simulate a failing component.
        try {
          throw exceptionClass.getConstructor().newInstance();
        } catch (IllegalAccessException | InstantiationException | InvocationTargetException | NoSuchMethodException e) {
          throw new RuntimeException(e);
        }
      }
      return false;
    }
  };
}
 
Example #2
Source File: TestConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0
@Test
public void testWithStopword() throws Exception {
  for (boolean preservePosInc : new boolean[]{true, false}) {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "a mykeyword a keyword"; // LUCENE-8344: add "a"
    tokenStream.setReader(new StringReader(input));
    TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
    ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
    CharsRefBuilder builder = new CharsRefBuilder();
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("mykeyword");
    builder.append(SEP_LABEL);
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("keyword");
    // if (preservePosInc) { // LUCENE-8344: uncomment once fixed
    //   builder.append(SEP_LABEL);
    // }
    assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()});
  }
}
 
Example #3
Source File: NGramTokenFilterTest.java    From lucene-solr with Apache License 2.0
public void testInvalidOffsets() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
      filters = new NGramTokenFilter(filters, 2, 2, false);
      return new TokenStreamComponents(tokenizer, filters);
    }
  };
  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
      new int[]    {    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
      new int[]    {   11,   11,   11,   11,   11,   11,   11,   11,   11,   11,   11 },
      new int[]    {    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 });
  analyzer.close();
}
 
Example #4
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0
/**
 * Test that CommonGramsFilter works correctly in case-sensitive mode
 */
public void testCaseSensitive() throws Exception {
  final String input = "How The s a brown s cow d like A B thing?";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  TokenFilter cgf = new CommonGramsFilter(wt, commonWords);
  assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
      "s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", "cow",
      "cow_d", "d", "d_like", "like", "A", "B", "thing?"});
}
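
The CommonGrams tests in this listing reference a commonWords field that is initialized elsewhere in the test class. A plausible reconstruction consistent with the expected outputs above (hedged; the exact initializer is not part of these snippets):

CharArraySet commonWords =
    new CharArraySet(Arrays.asList("s", "a", "b", "c", "d", "the", "of"), false);

The trailing false makes matching case-sensitive, which is why "A" and "B" pass through unpaired above while the lowercase stopwords form bigrams with their neighbors.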
 
Example #5
Source File: TestElision.java    From lucene-solr with Apache License 2.0
public void testElision() throws Exception {
  String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
  Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(test));
  CharArraySet articles = new CharArraySet(asSet("l", "M"), false);
  TokenFilter filter = new ElisionFilter(tokenizer, articles);
  List<String> tas = filter(filter);
  assertEquals("embrouille", tas.get(4));
  assertEquals("O'brian", tas.get(6));
  assertEquals("enfin", tas.get(7));
}
 
Example #6
Source File: SegmentationIcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private static Analyzer createAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, true));
            TokenFilter filter = new IcuNormalizerFilter(tokenizer,
                    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
}
 
Example #7
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter in the case that the last word is a stopword
 */
public void testLastWordisStopWord() throws Exception {
  final String input = "dog the";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "dog_the" });
}
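
CommonGramsQueryFilter, used in this and the following query-oriented tests, emits the generated bigrams and only those single tokens that did not form a bigram; that is why "dog the" collapses to the single token "dog_the" here, while a lone stopword query like "the" (Example #12) passes through unchanged.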
 
Example #8
Source File: TreatmentCurator.java    From hmftools with GNU General Public License v3.0
@NotNull
private static Analyzer concatenatingAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(@NotNull String field) {
            Tokenizer source = new WhitespaceTokenizer();
            source.setReader(new StringReader(field));
            TokenFilter concatenatingFilter = new ConcatenatingFilter(defaultTokenFilter(source), ' ');
            return new TokenStreamComponents(source, concatenatingFilter);
        }
    };
}
 
Example #9
Source File: TreatmentCurator.java    From hmftools with GNU General Public License v3.0
@NotNull
private static Analyzer spellcheckAnalyzer(@NotNull SpellChecker spellChecker) {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(@NotNull String field) {
            Tokenizer source = new WhitespaceTokenizer();
            source.setReader(new StringReader(field));
            SpellCheckerTokenFilter spellCheckFilter = new SpellCheckerTokenFilter(defaultTokenFilter(source), spellChecker);
            TokenFilter concatenatingFilter = new ConcatenatingFilter(spellCheckFilter, ' ');
            return new TokenStreamComponents(source, concatenatingFilter);
        }
    };
}
 
Example #10
Source File: TestDocInverterPerFieldErrorInfo.java    From lucene-solr with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = new MockTokenizer();
  if (fieldName.equals("distinctiveFieldName")) {
    TokenFilter tosser = new TokenFilter(tokenizer) {
      @Override
      public boolean incrementToken() throws IOException {
        throw new BadNews("Something is icky.");
      }
    };
    return new TokenStreamComponents(tokenizer, tosser);
  } else {
    return new TokenStreamComponents(tokenizer);
  }
}
 
Example #11
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter in the case that the first word is a stopword
 */
public void testFirstWordisStopWord() throws Exception {
  final String input = "the dog";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the_dog" });
}
 
Example #12
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter in the case of a single (stop)word query
 */
public void testOneWordQueryStopWord() throws Exception {
  final String input = "the";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the" });
}
 
Example #13
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter when first and last words are stopwords.
 */
public void testFirstAndLastStopWord() throws Exception {
  final String input = "the of";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the_of" });
}
 
Example #14
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter in the case of a single word query
 */
public void testOneWordQuery() throws Exception {
  final String input = "monster";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "monster" });
}
 
Example #15
Source File: TestRandomChains.java    From lucene-solr with Apache License 2.0
@BeforeClass
public static void beforeClass() throws Exception {
  List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
  tokenizers = new ArrayList<>();
  tokenfilters = new ArrayList<>();
  charfilters = new ArrayList<>();
  for (final Class<?> c : analysisClasses) {
    final int modifiers = c.getModifiers();
    if (
      // don't waste time with abstract classes or deprecated known-buggy ones
      Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
      || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
      || c.isAnnotationPresent(Deprecated.class)
      || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c))
    ) {
      continue;
    }
    
    for (final Constructor<?> ctor : c.getConstructors()) {
      // don't test synthetic or deprecated ctors, they likely have known bugs:
      if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class) || brokenConstructors.get(ctor) == ALWAYS) {
        continue;
      }
      // conditional filters are tested elsewhere
      if (ConditionalTokenFilter.class.isAssignableFrom(c)) {
        continue;
      }
      if (Tokenizer.class.isAssignableFrom(c)) {
        assertTrue(ctor.toGenericString() + " has unsupported parameter types",
          allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
        tokenizers.add(castConstructor(Tokenizer.class, ctor));
      } else if (TokenFilter.class.isAssignableFrom(c)) {
        assertTrue(ctor.toGenericString() + " has unsupported parameter types",
          allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
        tokenfilters.add(castConstructor(TokenFilter.class, ctor));
      } else if (CharFilter.class.isAssignableFrom(c)) {
        assertTrue(ctor.toGenericString() + " has unsupported parameter types",
          allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
        charfilters.add(castConstructor(CharFilter.class, ctor));
      } else {
        fail("Cannot get here");
      }
    }
  }
  
  final Comparator<Constructor<?>> ctorComp = (arg0, arg1) -> arg0.toGenericString().compareTo(arg1.toGenericString());
  Collections.sort(tokenizers, ctorComp);
  Collections.sort(tokenfilters, ctorComp);
  Collections.sort(charfilters, ctorComp);
  if (VERBOSE) {
    System.out.println("tokenizers = " + tokenizers);
    System.out.println("tokenfilters = " + tokenfilters);
    System.out.println("charfilters = " + charfilters);
  }
}
 
Example #16
Source File: PatternCaptureGroupTokenFilterFactory.java    From crate with Apache License 2.0
@Override
public TokenFilter create(TokenStream tokenStream) {
    return new PatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns);
}
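
For context, PatternCaptureGroupTokenFilter emits one token per capture-group match, optionally preserving the original token. A minimal usage sketch (illustrative; the pattern and input are invented for this example):

Tokenizer source = new WhitespaceTokenizer();
source.setReader(new StringReader("fooBarBaz"));
TokenStream stream = new PatternCaptureGroupTokenFilter(
    source, true, Pattern.compile("([A-Z][a-z]+)")); // true = preserveOriginal
// Expected tokens: "fooBarBaz", "Bar", "Baz"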
 
Example #17
Source File: TestRandomChains.java    From lucene-solr with Apache License 2.0
private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
  TokenFilterSpec spec = new TokenFilterSpec();
  spec.stream = tokenizer;
  StringBuilder descr = new StringBuilder();
  int numFilters = random.nextInt(5);
  for (int i = 0; i < numFilters; i++) {

    // Insert ValidatingTF after each stage so we can
    // catch problems right after the TF that "caused"
    // them:
    spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);

    while (true) {
      final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
      if (random.nextBoolean() && avoidConditionals.contains(ctor.getDeclaringClass()) == false) {
        long seed = random.nextLong();
        spec.stream = new ConditionalTokenFilter(spec.stream, in -> {
          final Object args[] = newFilterArgs(random, in, ctor.getParameterTypes());
          if (broken(ctor, args)) {
            return in;
          }
          TokenStream ts = createComponent(ctor, args, descr, true);
          if (ts == null) {
            return in;
          }
          return ts;
        }) {
          Random random = new Random(seed);

          @Override
          public void reset() throws IOException {
            super.reset();
            random = new Random(seed);
          }

          @Override
          protected boolean shouldFilter() throws IOException {
            return random.nextBoolean();
          }
        };
        break;
      }
      else {
        final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
        if (broken(ctor, args)) {
          continue;
        }
        final TokenFilter flt = createComponent(ctor, args, descr, false);
        if (flt != null) {
          spec.stream = flt;
          break;
        }
      }
    }
  }

  // Insert ValidatingTF after each stage so we can
  // catch problems right after the TF that "caused"
  // them:
  spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");

  spec.toString = descr.toString();
  return spec;
}
 
Example #18
Source File: TreatmentCurator.java    From hmftools with GNU General Public License v3.0
@NotNull
private static TokenFilter defaultTokenFilter(@NotNull Tokenizer source) {
    TokenFilter filteredSource = new LowerCaseFilter(source);
    return new WordDelimiterGraphFilter(filteredSource, SPLIT_ON_NUMERICS | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, null);
}
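
The flags here are constants defined on WordDelimiterGraphFilter (presumably statically imported in the original file): SPLIT_ON_NUMERICS splits tokens at letter/digit boundaries, while GENERATE_WORD_PARTS and GENERATE_NUMBER_PARTS emit the resulting alphabetic and numeric sub-tokens. As a hypothetical illustration, the lower-cased token "ab123" would come out as "ab" and "123".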
 
Example #19
Source File: TextFieldMapper.java    From crate with Apache License 2.0
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
    TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), minChars, maxChars, false);
    return new TokenStreamComponents(components.getSource(), filter);
}
 
Example #20
Source File: PatternCaptureGroupTokenFilterFactory.java    From Elasticsearch with Apache License 2.0
@Override
public TokenFilter create(TokenStream tokenStream) {
    return new PatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns);
}
 
Example #21
Source File: TestBengaliNormalizer.java    From lucene-solr with Apache License 2.0
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter tf = new BengaliNormalizationFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
 
Example #22
Source File: TestBengaliStemmer.java    From lucene-solr with Apache License 2.0
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter tf = new BengaliStemFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
 
Example #23
Source File: TestHindiNormalizer.java    From lucene-solr with Apache License 2.0
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter tf = new HindiNormalizationFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
 
Example #24
Source File: TestHindiStemmer.java    From lucene-solr with Apache License 2.0
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter tf = new HindiStemFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
 
Example #25
Source File: TestIndicNormalizer.java    From lucene-solr with Apache License 2.0
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(input));
  TokenFilter tf = new IndicNormalizationFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
 
Example #26
Source File: WordDelimiterGraphFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new WordDelimiterGraphFilter(input, adjustOffsets, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
                                      flags, protectedWords);
}
 
Example #27
Source File: WordDelimiterFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
                                 flags, protectedWords);
}
 
Example #28
Source File: CommonGramsQueryFilterFactory.java    From lucene-solr with Apache License 2.0
/**
 * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
 */
@Override
public TokenFilter create(TokenStream input) {
  CommonGramsFilter commonGrams = (CommonGramsFilter) super.create(input);
  return new CommonGramsQueryFilter(commonGrams);
}
 
Example #29
Source File: CommonGramsFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new CommonGramsFilter(input, commonWords);
}
 
Example #30
Source File: KStemFilterFactory.java    From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new KStemFilter(input);
}
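
For completeness, a usage sketch for the filter this factory builds (illustrative; KStem's exact output depends on its built-in lexicon):

Tokenizer source = new WhitespaceTokenizer();
source.setReader(new StringReader("driving dogs"));
TokenStream stream = new KStemFilter(source);
// Expected tokens: roughly "drive" and "dog"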