org.apache.lucene.analysis.standard.StandardTokenizer Java Examples

The following examples show how to use org.apache.lucene.analysis.standard.StandardTokenizer, drawn from a number of open-source projects. Each example notes its original project and source file.
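As a quick orientation before the examples: in post-4.x Lucene the tokenizer is constructed without a Reader, the input is attached with setReader(), and the stream must be reset() before it is consumed. A minimal sketch of that life cycle (the sample text is arbitrary):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StandardTokenizerDemo {
    public static void main(String[] args) throws IOException {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("Lucene is a search library"));
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();                  // mandatory before the first incrementToken()
        while (tokenizer.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        tokenizer.end();                    // finalizes offset state
        tokenizer.close();
    }
}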
Example #1
Source File: TestMorfologikAnalyzer.java    From lucene-solr with Apache License 2.0
/** */
public final void testKeywordAttrTokens() throws IOException {
  Analyzer a = new MorfologikAnalyzer() {
    @Override
    protected TokenStreamComponents createComponents(String field) {
      final CharArraySet keywords = new CharArraySet(1, false);
      keywords.add("liście");

      final Tokenizer src = new StandardTokenizer();
      TokenStream result = new SetKeywordMarkerFilter(src, keywords);
      result = new MorfologikFilter(result); 

      return new TokenStreamComponents(src, result);
    }
  };

  assertAnalyzesTo(
    a,
    "liście danych",
    new String[] { "liście", "dany", "dana", "dane", "dać" },
    new int[] { 0, 7, 7, 7, 7 },
    new int[] { 6, 13, 13, 13, 13 },
    new int[] { 1, 1, 0, 0, 0 });
  a.close();
}
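The three int arrays given to assertAnalyzesTo are the expected start offsets, end offsets, and position increments. The keyword-marked "liście" passes through the MorfologikFilter unanalyzed, while "danych" expands to four lemmas sharing offsets 7–13; the last three carry a position increment of 0, stacking them at the same position.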
 
Example #2
Source File: QueryParserImpl.java    From AdSearch_Endpoints with Apache License 2.0
@Override
public List<String> parseQuery(String queryStr) {
    // Tokenize queryStr and drop English stop words (no stemming is applied here).
    List<String> tokens = new ArrayList<String>();
    AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
    Tokenizer tokenizer = new StandardTokenizer(factory);
    tokenizer.setReader(new StringReader(queryStr));
    CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    TokenStream tokenStream = new StopFilter(tokenizer, stopWords);
    // The filter shares its AttributeSource with the tokenizer, so this
    // attribute reflects the terms the filtered stream emits.
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokens.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();   // also closes the underlying tokenizer
    } catch (IOException e) {
        e.printStackTrace();
    }
    return tokens;
}
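The comment above mentions stemming, but the chain built here stops at stop-word removal. A sketch of how a stemmer could be appended, assuming English input and Lucene's PorterStemFilter (this extension is illustrative, not from the original project):

import org.apache.lucene.analysis.en.PorterStemFilter;

// Hypothetical extension of the chain above: stem the tokens that survive the StopFilter.
TokenStream tokenStream = new PorterStemFilter(new StopFilter(tokenizer, stopWords));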
 
Example #3
Source File: TestCJKBigramFilter.java    From lucene-solr with Apache License 2.0
public void testHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer t = new StandardTokenizer();
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の",  "学生", "が",  "試験", "に",  "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>", 
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  a.close();
}
 
Example #4
Source File: SearchService.java    From subsonic with GNU General Public License v3.0
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
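This example uses the Lucene 3.x reuse API: reusableTokenStream, setPreviousTokenStream, and Tokenizer.reset(Reader) were all removed in Lucene 4. A rough sketch of the same chain against the current API, where the Analyzer caches components internally (this assumes STOP_WORDS_SET is a CharArraySet; StandardFilter is omitted because it later became a no-op and was removed):

Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        StandardTokenizer source = new StandardTokenizer();
        source.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
        TokenStream result = new LowerCaseFilter(source);
        result = new StopFilter(result, STOP_WORDS_SET);
        result = new ASCIIFoldingFilter(result);
        return new TokenStreamComponents(source, result);
    }
};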
 
Example #5
Source File: TestCJKBigramFilter.java    From lucene-solr with Apache License 2.0
public void testUnigramsAndBigramsHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer t = new StandardTokenizer();
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の",  "学", "学生", "生", "が",  "試", "試験", "験", "に",  "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", 
                     "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", 
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
      new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
  a.close();
}
 
Example #6
Source File: TestTypeTokenFilter.java    From lucene-solr with Apache License 2.0
/**
 * Tests the position increments applied by TypeTokenFilter, with and without
 * position increments enabled.
 */
public void testStopPositons() throws IOException {
  StringBuilder sb = new StringBuilder();
  for (int i = 10; i < 20; i++) {
    if (i % 3 != 0) {
      sb.append(i).append(" ");
    } else {
      String w = English.intToEnglish(i).trim();
      sb.append(w).append(" ");
    }
  }
  log(sb.toString());
  String[] stopTypes = new String[]{"<NUM>"};
  Set<String> stopSet = asSet(stopTypes);

  // with increments
  StringReader reader = new StringReader(sb.toString());
  final StandardTokenizer input = new StandardTokenizer();
  input.setReader(reader);
  TypeTokenFilter typeTokenFilter = new TypeTokenFilter(input, stopSet);
  testPositons(typeTokenFilter);

}
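The testPositons helper referenced here is not part of the snippet. A hypothetical reconstruction of what such a helper might do — the logging-and-iteration shape is an assumption, not the original Lucene code:

// Hypothetical helper: walk the filtered stream and log each term with its
// position increment (removed <NUM> tokens should surface as increments > 1).
private void testPositons(TypeTokenFilter stream) throws IOException {
  CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.getAttribute(PositionIncrementAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    log("token: " + termAtt + ", posIncrement: " + posIncAtt.getPositionIncrement());
  }
  stream.end();
  stream.close();
}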
 
Example #7
Source File: StandardPreProcessorIterator.java    From Indra with MIT License
StandardPreProcessorIterator(CorpusMetadata metadata, String text) {
    this.metadata = Objects.requireNonNull(metadata);
    this.tokenizer = new StandardTokenizer();
    this.tokenStream = createStream(metadata, tokenizer);
    setTransformers();
    initialize(Objects.requireNonNull(text));
}
 
Example #8
Source File: NGramAnalyzer.java    From spacewalk with GNU General Public License v2.0
/**
 * @param fieldName ignored param
 * @param reader contains data to parse
 * @return TokenStream of ngrams
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new NGramTokenFilter(
            new LowerCaseFilter(
                new StandardFilter(
                    new StandardTokenizer(reader))), min_ngram, max_ngram);
}
 
Example #9
Source File: LuceneTokenizer.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Tokenizes the given string with a StandardTokenizer.
 *
 * @param string the text to tokenize
 * @return ArrayList of the tokens of the string (case is preserved)
 * @throws IOException
 */
public static ArrayList<String> tokenize(String string) throws IOException {
	ArrayList<String> retList = new ArrayList<String>();
	StringReader reader = new StringReader(string);
	StandardTokenizer tokenizer = new StandardTokenizer();
	tokenizer.setReader(reader);
	CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
	tokenizer.reset();
	while (tokenizer.incrementToken()) {
		retList.add(termAtt.toString());
	}
	tokenizer.end();
	tokenizer.close();
	reader.close();
	return retList;
}
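With the stream properly reset, tokenize("Hello, World!") would return ["Hello", "World"]: StandardTokenizer splits on punctuation and whitespace but leaves case untouched, so any lowercasing has to be done by a downstream filter such as LowerCaseFilter.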
 
Example #10
Source File: SearchService.java    From subsonic with GNU General Public License v3.0
private String analyzeQuery(String query) throws IOException {
    StringBuilder result = new StringBuilder();
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(new StandardTokenizer(LUCENE_VERSION, new StringReader(query)));
    TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
    while (filter.incrementToken()) {
        result.append(termAttribute.term()).append("* ");
    }
    return result.toString();
}
 
Example #11
Source File: TestTypeTokenFilter.java    From lucene-solr with Apache License 2.0
public void testTypeFilterWhitelist() throws IOException {
  StringReader reader = new StringReader("121 is palindrome, while 123 is not");
  Set<String> stopTypes = Collections.singleton("<NUM>");
  final StandardTokenizer input = new StandardTokenizer(newAttributeFactory());
  input.setReader(reader);
  TokenStream stream = new TypeTokenFilter(input, stopTypes, true);
  assertTokenStreamContents(stream, new String[]{"121", "123"});
}
 
Example #12
Source File: TestTypeTokenFilter.java    From lucene-solr with Apache License 2.0
public void testTypeFilter() throws IOException {
  StringReader reader = new StringReader("121 is palindrome, while 123 is not");
  Set<String> stopTypes = asSet("<NUM>");
  final StandardTokenizer input = new StandardTokenizer(newAttributeFactory());
  input.setReader(reader);
  TokenStream stream = new TypeTokenFilter(input, stopTypes);
  assertTokenStreamContents(stream, new String[]{"is", "palindrome", "while", "is", "not"});
}
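Read together with Example #11, this shows the two modes of TypeTokenFilter: the default blacklist mode above removes tokens whose type is in the set, while passing true for useWhiteList (as in Example #11) inverts the logic and keeps only those tokens.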
 
Example #13
Source File: TestAnalyzers.java    From lucene-solr with Apache License 2.0
@SuppressWarnings("unused")
public void _testStandardConstants() {
  int x = StandardTokenizer.ALPHANUM;
  x = StandardTokenizer.NUM;
  x = StandardTokenizer.SOUTHEAST_ASIAN;
  x = StandardTokenizer.IDEOGRAPHIC;
  x = StandardTokenizer.HIRAGANA;
  x = StandardTokenizer.KATAKANA;
  x = StandardTokenizer.HANGUL;
  String[] y = StandardTokenizer.TOKEN_TYPES;
}
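Each of these constants is an index into StandardTokenizer.TOKEN_TYPES, whose strings are what TypeAttribute reports at analysis time; a small sketch of the correspondence:

// The constants index into TOKEN_TYPES; TypeAttribute reports the string form.
String alphanumType = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM]; // "<ALPHANUM>"
String numType      = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];      // "<NUM>"
// These strings are exactly what filters like TypeTokenFilter match against.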
 
Example #14
Source File: TestConditionalTokenFilter.java    From lucene-solr with Apache License 2.0
public void testFilteringWithReadahead() throws IOException {

    CharArraySet protectedTerms = new CharArraySet(2, true);
    protectedTerms.add("two");
    protectedTerms.add("two three");

    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new StandardTokenizer();
        TokenStream sink = new ShingleFilter(source, 3);
        sink = new ProtectedTermFilter(protectedTerms, sink, in -> new TypeTokenFilter(in, Collections.singleton("ALL"), true));
        return new TokenStreamComponents(source, sink);
      }
    };

    String input = "one two three four";

    try (TokenStream ts = analyzer.tokenStream("", input)) {
      assertTokenStreamContents(ts, new String[]{
          "two", "two three"
      }, new int[]{
           4,     4
      }, new int[]{
           7,     13
      }, new int[]{
           2,     0
      }, new int[]{
           1,     2
      }, 18);
    }

}
 
Example #15
Source File: TestElision.java    From lucene-solr with Apache License 2.0
public void testElision() throws Exception {
  String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
  Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(test));
  CharArraySet articles = new CharArraySet(asSet("l", "M"), false);
  TokenFilter filter = new ElisionFilter(tokenizer, articles);
  List<String> tas = filter(filter);
  assertEquals("embrouille", tas.get(4));
  assertEquals("O'brian", tas.get(6));
  assertEquals("enfin", tas.get(7));
}
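Note which tokens survive: "l'embrouille" loses its article because "l" is in the article set and "M'enfin" likewise loses "M", but "O'brian" is untouched since "O" is not listed. The set is built with ignoreCase=false, so matching is case-sensitive.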
 
Example #16
Source File: TestCJKBigramFilter.java    From lucene-solr with Apache License 2.0
public void testAllScripts() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer t = new StandardTokenizer();
      return new TokenStreamComponents(t, 
          new CJKBigramFilter(t, 0xff, false));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
  a.close();
}
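The 0xff flag value sets every flag bit, which amounts to bigramming all four supported scripts with no unigram output (the final false). The same intent written with the named constants, as a sketch:

// Equivalent in effect to the 0xff mask: bigram Han, Hiragana, Katakana and Hangul.
int flags = CJKBigramFilter.HAN | CJKBigramFilter.HIRAGANA
          | CJKBigramFilter.KATAKANA | CJKBigramFilter.HANGUL;
TokenStream bigrams = new CJKBigramFilter(t, flags, false);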
 
Example #17
Source File: TestCJKAnalyzer.java    From lucene-solr with Apache License 2.0
@Override
public boolean incrementToken() throws IOException {
  if (input.incrementToken()) {
    typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]);
    return true;
  } else {
    return false;
  }
}
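This override is shown without its enclosing class; in the test it belongs to a TokenFilter that rewrites every token's type. A hypothetical minimal host class (the class name and constructor are assumptions):

// Hypothetical enclosing filter: forces every token's type to <IDEOGRAPHIC>.
final class ForceIdeographicFilter extends TokenFilter {
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  ForceIdeographicFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]);
      return true;
    }
    return false;
  }
}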
 
Example #18
Source File: NGramAnalyzer.java    From uyuni with GNU General Public License v2.0
/**
 * @param fieldName ignored param
 * @param reader contains data to parse
 * @return TokenStream of ngrams
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new NGramTokenFilter(
            new LowerCaseFilter(
                new StandardFilter(
                    new StandardTokenizer(reader))), min_ngram, max_ngram);
}
 
Example #19
Source File: TestTeeSinkTokenFilter.java    From lucene-solr with Apache License 2.0
private StandardTokenizer standardTokenizer(StringBuilder builder) {
  StandardTokenizer tokenizer = new StandardTokenizer();
  tokenizer.setReader(new StringReader(builder.toString()));
  return tokenizer;
}
 
Example #20
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0
private Tokenizer getTokenizerImpl( String input ) throws IOException {
  StandardTokenizer sttk = new StandardTokenizer( );
  sttk.setReader( new StringReader( input ) );
  return sttk;
}
 
Example #21
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0
private Tokenizer getTokenizerImpl( String input ) throws IOException {
  StandardTokenizer sttk = new StandardTokenizer( new StringReader( input ) );
  return sttk;
}
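Examples #20 and #21 are the same helper from what appear to be two branches of the project targeting different Lucene versions: the first uses the post-4.x pattern of attaching the input with setReader, the second the older constructor that took the Reader directly.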
 
Example #22
Source File: StandardTokenizerFactory.java    From crate with Apache License 2.0
@Override
public Tokenizer create() {
    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setMaxTokenLength(maxTokenLength);
    return tokenizer;
}
 
Example #23
Source File: ArabicAnalyzer.java    From modernmt with Apache License 2.0
/**
 * Creates
 * {@link TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link TokenStreamComponents}
 * built from a {@link StandardTokenizer} filtered with
 * {@link ArabicNormalizationFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName,
                                                 Reader reader) {
    final Tokenizer source = new StandardTokenizer(reader);
    TokenStream result = new ArabicNormalizationFilter(source);
    return new TokenStreamComponents(source, result);
}
 
Example #24
Source File: MorfologikAnalyzer.java    From lucene-solr with Apache License 2.0
/**
 * Creates a
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 * 
 * @param field ignored field name
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link MorfologikFilter}.
 */
@Override
protected TokenStreamComponents createComponents(final String field) {
  final Tokenizer src = new StandardTokenizer();
  
  return new TokenStreamComponents(
      src, 
      new MorfologikFilter(src, dictionary));
}