Java Code Examples for org.apache.lucene.analysis.core.WhitespaceTokenizer

The following examples show how to use org.apache.lucene.analysis.core.WhitespaceTokenizer. They are extracted from open source projects; where available, the source project, source file, and license are noted above each example.
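
Before the individual examples, here is a minimal sketch of the consume lifecycle that nearly all of them follow (setReader, reset, incrementToken loop, end, close). The class name and input string are illustrative, not taken from any of the projects below.

import java.io.StringReader;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WhitespaceTokenizerDemo {
  public static void main(String[] args) throws Exception {
    // try-with-resources closes the tokenizer; TokenStream implements Closeable.
    try (WhitespaceTokenizer tokenizer = new WhitespaceTokenizer()) {
      tokenizer.setReader(new StringReader("split on whitespace only"));
      CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        System.out.println(term.toString()); // prints each whitespace-delimited token
      }
      tokenizer.end(); // records final offset state before close
    }
  }
}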
Example 1
Source Project: lucene-solr   Source File: CommonGramsFilterTest.java    License: Apache License 2.0
public void testReset() throws Exception {
  final String input = "How the s a brown s cow d like A B thing?";
  WhitespaceTokenizer wt = new WhitespaceTokenizer();
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  
  CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
  cgf.reset();
  assertTrue(cgf.incrementToken());
  assertEquals("How", term.toString());
  assertTrue(cgf.incrementToken());
  assertEquals("How_the", term.toString());
  assertTrue(cgf.incrementToken());
  assertEquals("the", term.toString());
  assertTrue(cgf.incrementToken());
  assertEquals("the_s", term.toString());
  cgf.close();
  
  wt.setReader(new StringReader(input));
  cgf.reset();
  assertTrue(cgf.incrementToken());
  assertEquals("How", term.toString());
}
 
Example 2
Source Project: lucene-solr   Source File: CommonGramsFilterTest.java    License: Apache License 2.0
public void testQueryReset() throws Exception {
  final String input = "How the s a brown s cow d like A B thing?";
  WhitespaceTokenizer wt = new WhitespaceTokenizer();
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
  
  CharTermAttribute term = wt.addAttribute(CharTermAttribute.class);
  nsf.reset();
  assertTrue(nsf.incrementToken());
  assertEquals("How_the", term.toString());
  assertTrue(nsf.incrementToken());
  assertEquals("the_s", term.toString());
  nsf.close();
  
  wt.setReader(new StringReader(input));
  nsf.reset();
  assertTrue(nsf.incrementToken());
  assertEquals("How_the", term.toString());
}
 
Example 3
Source Project: lucene-solr   Source File: ShingleFilterTest.java    License: Apache License 2.0
public void testReset() throws Exception {
  Tokenizer wsTokenizer = new WhitespaceTokenizer();
  wsTokenizer.setReader(new StringReader("please divide this sentence"));
  TokenStream filter = new ShingleFilter(wsTokenizer, 2);
  assertTokenStreamContents(filter,
    new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
    new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
    new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
    new int[]{1,0,1,0,1,0,1}
  );
  wsTokenizer.setReader(new StringReader("please divide this sentence"));
  assertTokenStreamContents(filter,
    new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
    new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
    new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
    new int[]{1,0,1,0,1,0,1}
  );
}
 
Example 4
Source Project: lucene-solr   Source File: XmlInterpolationTest.java    License: Apache License 2.0
private String[] analyzeReturnTokens(String docText) {
  List<String> result = new ArrayList<>();

  Reader filter = new HTMLStripCharFilter(new StringReader(docText),
          Collections.singleton("unescaped"));
  WhitespaceTokenizer ts = new WhitespaceTokenizer();
  final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
  try {
    ts.setReader(filter);
    ts.reset();
    while (ts.incrementToken()) {
      result.add(termAttribute.toString());
    }
    ts.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ts);
  }
  return result.toArray(new String[result.size()]);
}
 
Example 5
private int countTokensInText(String text) {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    int tokens = 0;
    try {
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            ++tokens;
        }
    } catch (Exception e) {
        LOGGER.error("Error while tokenizing text. Returning.", e);
    } finally {
        IOUtils.closeQuietly(tokenizer);
    }
    return tokens;
}
 
Example 6
public void testOverlappingAtBeginning() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final String input = "new york city is great";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("new_york_city", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("is", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("great", term.toString());
}
 
Example 7
public void testOverlappingAtEnd() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final String input = "the great city of new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("the", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("great", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("city_of_new_york", term.toString());
}
 
Example 8
public void testIncompletePhrase() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "big apple", "new york city", "property tax", "three word phrase"), false);

    final String input = "some new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("some", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("new", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("york", term.toString());
}
 
Example 9
Source Project: oodt   Source File: CASAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // The original source kept a block of commented-out legacy wiring here
    // (StandardFilter, StopFilter, StandardTokenizer); the live code simply
    // wraps a WhitespaceTokenizer with no filters.
    return new TokenStreamComponents(new WhitespaceTokenizer());
}
 
Example 10
Source Project: SolrTextTagger   Source File: XmlInterpolationTest.java    License: Apache License 2.0
(Code identical to Example 4 above.)
 
Example 11
Source Project: SolrTextTagger   Source File: ConcatenateFilterTest.java    License: Apache License 2.0
public void testTypical() throws IOException {
  String NYC = "new york city";
  WhitespaceTokenizer stream = new WhitespaceTokenizer();
  stream.setReader(new StringReader(NYC));
  ConcatenateFilter filter = new ConcatenateFilter(stream);
  try {
    assertTokenStreamContents(filter, new String[]{NYC},
        new int[]{0}, new int[]{NYC.length()}, new String[]{"shingle"},
        new int[]{1}, null, NYC.length(), true);
  } catch (AssertionError e) {
    // assertTokenStreamContents tries to test if tokenStream.end() was implemented correctly.
    // Its manner of checking this is imperfect and incompatible with
    // ConcatenateFilter. Specifically it modifies a special attribute *after* incrementToken(),
    // which is weird. To the best of my ability, end() appears to be implemented correctly.
    if (!e.getMessage().equals("super.end()/clearAttributes() was not called correctly in end()"))
      throw e;
  }
}
 
Example 12
Source Project: lucene-solr   Source File: EdgeNGramTokenFilterTest.java    License: Apache License 2.0
public void testReset() throws Exception {
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("abcde"));
  EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3, false);
  assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
  tokenizer.setReader(new StringReader("abcde"));
  assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{5,5,5});
}
 
Example 13
Source Project: lucene-solr   Source File: NGramTokenFilterTest.java    License: Apache License 2.0
public void testReset() throws Exception {
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("abcde"));
  NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1, false);
  assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
  tokenizer.setReader(new StringReader("abcde"));
  assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5}, new int[]{1,0,0,0,0});
}
 
Example 14
Source Project: lucene-solr   Source File: XmlInterpolationTest.java    License: Apache License 2.0
private int[] analyzeTagOne(String docText, String start, String end) {
  int[] result = {-1, -1};

  Reader filter = new HTMLStripCharFilter(new StringReader(docText));

  WhitespaceTokenizer ts = new WhitespaceTokenizer();
  final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
  try {
    ts.setReader(filter);
    ts.reset();
    while (ts.incrementToken()) {
      final String termString = termAttribute.toString();
      if (termString.equals(start))
        result[0] = offsetAttribute.startOffset();
      if (termString.equals(end)) {
        result[1] = offsetAttribute.endOffset();
        return result;
      }
    }
    ts.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ts);
  }
  return result;
}
 
Example 15
Source Project: hmftools   Source File: TreatmentCurator.java    License: GNU General Public License v3.0
@NotNull
private static Analyzer createShingleAnalyzer(int maxShingles) {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(@NotNull String field) {
            Tokenizer source = new WhitespaceTokenizer();
            source.setReader(new StringReader(field));
            ShingleFilter shingleFilter = new ShingleFilter(defaultTokenFilter(source), maxShingles);
            shingleFilter.setOutputUnigrams(true);
            return new TokenStreamComponents(source, shingleFilter);
        }
    };
}
 
Example 16
Source Project: hmftools   Source File: TreatmentCurator.java    License: GNU General Public License v3.0
@NotNull
private static Analyzer spellcheckAnalyzer(@NotNull SpellChecker spellChecker) {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(@NotNull String field) {
            Tokenizer source = new WhitespaceTokenizer();
            source.setReader(new StringReader(field));
            SpellCheckerTokenFilter spellCheckFilter = new SpellCheckerTokenFilter(defaultTokenFilter(source), spellChecker);
            TokenFilter concatenatingFilter = new ConcatenatingFilter(spellCheckFilter, ' ');
            return new TokenStreamComponents(source, concatenatingFilter);
        }
    };
}
 
Example 17
Source Project: hmftools   Source File: TreatmentCurator.java    License: GNU General Public License v3.0
@NotNull
private static Analyzer wordDelimiterAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(@NotNull String field) {
            Tokenizer source = new WhitespaceTokenizer();
            source.setReader(new StringReader(field));
            return new TokenStreamComponents(source, defaultTokenFilter(source));
        }
    };
}
 
Example 18
Source Project: hmftools   Source File: TreatmentCurator.java    License: GNU General Public License v3.0
@NotNull
private static Analyzer concatenatingAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(@NotNull String field) {
            Tokenizer source = new WhitespaceTokenizer();
            source.setReader(new StringReader(field));
            TokenFilter concatenatingFilter = new ConcatenatingFilter(defaultTokenFilter(source), ' ');
            return new TokenStreamComponents(source, concatenatingFilter);
        }
    };
}
 
Example 19
Source Project: modernmt   Source File: ContentAnalyzer.java    License: Apache License 2.0
// Pre-5.0 Lucene API: createComponents receives the Reader, which the
// WhitespaceTokenizer constructor takes directly.
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
    TokenStream filter;
    filter = new PunctuationFilter(tokenizer);

    if (shingleSize > 0) {
        ShingleFilter shingleFilter = new ShingleFilter(filter, shingleSize, shingleSize);
        shingleFilter.setOutputUnigrams(outputUnigrams);

        filter = shingleFilter;
    }

    return new TokenStreamComponents(tokenizer, filter);
}
 
Example 20
Source Project: SciGraph   Source File: EntityAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = new WhitespaceTokenizer();
  TokenStream result =
      new PatternReplaceFilter(tokenizer,
          Pattern.compile("^([\\.!\\?,:;\"'\\(\\)]*)(.*?)([\\.!\\?,:;\"'\\(\\)]*)$"), "$2", true);
  result = new PatternReplaceFilter(result, Pattern.compile("'s"), "s", true);
  return new TokenStreamComponents(tokenizer, result);
}
 
Example 21
public void testOverlappingAtBeginningEmitSingle() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final String input = "new york city is great";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, true);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("new", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("york", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("new_york", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("new_york_city", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("city", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("is", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("great", term.toString());
}
 
Example 22
public void testOverlappingAtEndEmitSingle() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final String input = "the great city of new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, true);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("the", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("great", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("city", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("of", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("new", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("york", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("city_of_new_york", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("new_york", term.toString());
}
 
Example 23
// Pre-5.0 Lucene API: tokenizers and filters still take a Version argument,
// and createComponents receives the Reader directly.
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new WhitespaceTokenizer(version, reader);
    TokenStream filter = new LowerCaseFilter(version, source);
    filter = new InlineAnnotationFilter(filter);

    return new TokenStreamComponents(source, filter);
}
 
Example 24
Source Project: SolrTextTagger   Source File: XmlInterpolationTest.java    License: Apache License 2.0
(Code identical to Example 14 above.)
 
Example 25
Source Project: crate   Source File: SimplePhoneticAnalysisTests.java    License: Apache License 2.0
public void testPhoneticTokenFilterBeiderMorseNoLanguage() throws IOException {
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilter");
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("ABADIAS"));
    String[] expected = new String[] { "abYdias", "abYdios", "abadia", "abadiaS", "abadias", "abadio", "abadioS", "abadios", "abodia",
            "abodiaS", "abodias", "abodio", "abodioS", "abodios", "avadias", "avadios", "avodias", "avodios", "obadia", "obadiaS",
            "obadias", "obadio", "obadioS", "obadios", "obodia", "obodiaS", "obodias", "obodioS" };
    BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
 
Example 26
Source Project: crate   Source File: SimplePhoneticAnalysisTests.java    License: Apache License 2.0
public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException {
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilterfrench");
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("Rimbault"));
    String[] expected = new String[] { "rimbD", "rimbDlt", "rimba", "rimbalt", "rimbo", "rimbolt", "rimbu", "rimbult", "rmbD", "rmbDlt",
            "rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
    BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
 
Example 27
Source Project: crate   Source File: SimplePhoneticAnalysisTests.java    License: Apache License 2.0
public void testPhoneticTokenFilterDaitchMotokoff() throws IOException {
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("daitch_mokotoff");
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("chauptman"));
    String[] expected = new String[] { "473660", "573660" };
    assertThat(filterFactory.create(tokenizer), instanceOf(DaitchMokotoffSoundexFilter.class));
    BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
 
Example 28
Source Project: crate   Source File: CommonAnalysisPlugin.java    License: Apache License 2.0
@Override
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
    List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
    tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
        () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new, () -> new TokenFilterFactory() {
        @Override
        public String name() {
            return "lowercase";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new LowerCaseFilter(tokenStream);
        }
    }));

    // Temporary shim for aliases. TODO deprecate after they are moved
    tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));

    return tokenizers;
}
 
Example 29
Source Project: Elasticsearch   Source File: WhitespaceTokenizerFactory.java    License: Apache License 2.0
@Override
public Tokenizer create() {
    return new WhitespaceTokenizer();
}
 
Example 30
static void assertAlgorithm(boolean inject, String input, String[] expected) throws Exception {
  Tokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader(input));
  DaitchMokotoffSoundexFilter filter = new DaitchMokotoffSoundexFilter(tokenizer, inject);
  assertTokenStreamContents(filter, expected);
}