Java Code Examples for org.apache.lucene.analysis.standard.StandardTokenizer

The following examples show how to use org.apache.lucene.analysis.standard.StandardTokenizer. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and you can go to the original project or source file by following the link above each example. You may also check out the related API usage in the sidebar.
Example 1
Source Project: lucene-solr   Source File: TestMorfologikAnalyzer.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that a term marked via {@link SetKeywordMarkerFilter} ("liście")
 * is passed through unchanged, while other terms ("danych") are expanded
 * by the Morfologik stemmer.
 */
public final void testKeywordAttrTokens() throws IOException {
  Analyzer a = new MorfologikAnalyzer() {
    @Override
    protected TokenStreamComponents createComponents(String field) {
      // Protect "liście" from stemming by flagging it as a keyword.
      final CharArraySet protectedWords = new CharArraySet(1, false);
      protectedWords.add("liście");

      final Tokenizer source = new StandardTokenizer();
      TokenStream stream = new SetKeywordMarkerFilter(source, protectedWords);
      stream = new MorfologikFilter(stream);

      return new TokenStreamComponents(source, stream);
    }
  };

  // Expected: the keyword survives as-is; "danych" yields several lemmas
  // stacked at the same position (increments 1,1,0,0,0).
  assertAnalyzesTo(
      a,
      "liście danych",
      new String[] { "liście", "dany", "dana", "dane", "dać" },
      new int[] { 0, 7, 7, 7, 7 },
      new int[] { 6, 13, 13, 13, 13 },
      new int[] { 1, 1, 0, 0, 0 });
  a.close();
}
 
Example 2
Source Project: lucene-solr   Source File: TestCJKBigramFilter.java    License: Apache License 2.0 6 votes vote down vote up
// Verifies CJKBigramFilter configured with only the HAN flag: adjacent Han
// characters are paired into bigrams (<DOUBLE>), hiragana characters remain
// single tokens, and an unpaired Han character is emitted alone (<SINGLE>).
// Input is Japanese: "多くの学生が試験に落ちた。" (informally, "many students
// failed the exam").
// NOTE(review): the expected-types array below has 11 entries for 10 expected
// tokens — the trailing "<SINGLE>" appears unused; confirm how
// assertAnalyzesTo treats extra array elements before trimming it.
public void testHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer t = new StandardTokenizer();
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
    }
  };
  // Arguments: terms, start offsets, end offsets, types, position increments,
  // position lengths.
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の",  "学生", "が",  "試験", "に",  "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>", 
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  a.close();
}
 
Example 3
Source Project: lucene-solr   Source File: TestCJKBigramFilter.java    License: Apache License 2.0 6 votes vote down vote up
// Same as testHanOnly but with outputUnigrams=true (third constructor arg):
// each Han bigram (<DOUBLE>) is emitted in addition to its constituent
// unigrams (<SINGLE>), stacked via position increment 0 and position length 2.
// NOTE(review): the expected-types array has 15 entries for 14 expected
// tokens — the trailing "<SINGLE>" appears unused; verify against
// assertAnalyzesTo before trimming.
public void testUnigramsAndBigramsHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer t = new StandardTokenizer();
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
    }
  };
  // Arguments: terms, start offsets, end offsets, types, position increments,
  // position lengths.
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の",  "学", "学生", "生", "が",  "試", "試験", "験", "に",  "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", 
                     "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", 
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
      new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
  a.close();
}
 
Example 4
Source Project: lucene-solr   Source File: TestTypeTokenFilter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Test position increments applied by TypeTokenFilter when tokens of type
 * {@code <NUM>} are removed from the stream.
 */
public void testStopPositons() throws IOException {
  // Build input mixing digit tokens ("10 11 ...") with spelled-out numbers
  // (every i divisible by 3), so the tokenizer emits both numeric and
  // alphabetic token types.
  StringBuilder sb = new StringBuilder();
  for (int i = 10; i < 20; i++) {
    if (i % 3 != 0) {
      sb.append(i).append(" ");
    } else {
      String w = English.intToEnglish(i).trim();
      sb.append(w).append(" ");
    }
  }
  log(sb.toString());
  // Java-style array declaration (was C-style "String stopTypes[]").
  String[] stopTypes = new String[]{"<NUM>"};
  Set<String> stopSet = asSet(stopTypes);

  // With position increments enabled (the filter's default): gaps left by
  // removed <NUM> tokens are checked by testPositons().
  StringReader reader = new StringReader(sb.toString());
  final StandardTokenizer input = new StandardTokenizer();
  input.setReader(reader);
  TypeTokenFilter typeTokenFilter = new TypeTokenFilter(input, stopSet);
  testPositons(typeTokenFilter);

}
 
Example 5
Source Project: subsonic   Source File: SearchService.java    License: GNU General Public License v3.0 6 votes vote down vote up
// Legacy (pre-4.0) Lucene analyzer pattern: cache the tokenizer/filter chain
// per thread via get/setPreviousTokenStream and reuse it by pointing the
// cached tokenizer at a new reader, instead of rebuilding the chain per call.
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    // Holder pairing the raw tokenizer (needed for reset(reader)) with the
    // fully wrapped stream that callers actually consume.
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        // First use on this thread: build the chain
        // tokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter
        // -> ASCIIFoldingFilter, and cache it.
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        // Subsequent use: reattach the cached tokenizer to the new reader.
        streams.tokenStream.reset(reader);
    }
    // Re-applied on every call, not just at construction time.
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
 
Example 6
Source Project: AdSearch_Endpoints   Source File: QueryParserImpl.java    License: Apache License 2.0 6 votes vote down vote up
@Override
  public List<String> parseQuery(String queryStr) {
    // Tokenizes queryStr with StandardTokenizer and removes English stop
    // words; returns the remaining terms in order.
    List<String> tokens = new ArrayList<String>();
    AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
    Tokenizer tokenizer = new StandardTokenizer(factory);
    tokenizer.setReader(new StringReader(queryStr));
    CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    // try-with-resources closes the stream (and the wrapped tokenizer) even
    // when incrementToken() throws; the original leaked both on error.
    try (TokenStream tokenStream = new StopFilter(tokenizer, stopWords)) {
      // Read the term attribute from the end of the chain rather than from
      // the tokenizer; both share one AttributeSource, but this states the
      // intent clearly.
      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        tokens.add(charTermAttribute.toString());
      }
      tokenStream.end();
    } catch (IOException e) {
      // Preserved from the original; a logger would be preferable if one is
      // available in this class.
      e.printStackTrace();
    }
    return tokens;
  }
 
Example 7
Source Project: uyuni   Source File: NGramAnalyzer.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * Builds the analysis chain: tokenize, standard-filter, lowercase, then
 * split into n-grams of the configured sizes.
 *
 * @param fieldName ignored param
 * @param reader contains data to parse
 * @return TokenStream of ngrams
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream chain = new StandardTokenizer(reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    return new NGramTokenFilter(chain, min_ngram, max_ngram);
}
 
Example 8
Source Project: Indra   Source File: StandardPreProcessorIterator.java    License: MIT License 5 votes vote down vote up
// Builds the tokenizer and filter chain for the given corpus metadata, then
// primes the iterator with the supplied text. Both arguments must be non-null.
StandardPreProcessorIterator(CorpusMetadata metadata, String text) {
    this.metadata = Objects.requireNonNull(metadata);
    // (removed a stray empty statement ";" left on its own line here)
    this.tokenizer = new StandardTokenizer();
    this.tokenStream = createStream(metadata, tokenizer);
    setTransformers();
    initialize(Objects.requireNonNull(text));
}
 
Example 9
Source Project: lucene-solr   Source File: TestCJKAnalyzer.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public boolean incrementToken() throws IOException {
  // Pass tokens through unchanged except for relabeling their type as
  // IDEOGRAPHIC.
  if (!input.incrementToken()) {
    return false;
  }
  typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]);
  return true;
}
 
Example 10
Source Project: lucene-solr   Source File: TestCJKBigramFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testAllScripts() throws Exception {
  // Flag mask 0xff turns on bigramming for every script; the third argument
  // (false) disables unigram output, so only bigrams are emitted.
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new StandardTokenizer();
      TokenStream bigrams = new CJKBigramFilter(tokenizer, 0xff, false);
      return new TokenStreamComponents(tokenizer, bigrams);
    }
  };
  assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた。",
      new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
  analyzer.close();
}
 
Example 11
Source Project: lucene-solr   Source File: TestElision.java    License: Apache License 2.0 5 votes vote down vote up
public void testElision() throws Exception {
  // French/English elision: leading articles "l'" and "M'" are stripped,
  // while "O'brian" is untouched because "O" is not in the article set.
  String sample = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
  Tokenizer source = new StandardTokenizer(newAttributeFactory());
  source.setReader(new StringReader(sample));
  CharArraySet elidedArticles = new CharArraySet(asSet("l", "M"), false);
  TokenFilter elision = new ElisionFilter(source, elidedArticles);
  List<String> terms = filter(elision);
  assertEquals("embrouille", terms.get(4));
  assertEquals("O'brian", terms.get(6));
  assertEquals("enfin", terms.get(7));
}
 
Example 12
Source Project: lucene-solr   Source File: TestConditionalTokenFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testFilteringWithReadahead() throws IOException {
  // Terms (including the two-word shingle "two three") that must survive
  // the ProtectedTermFilter below.
  CharArraySet protectedTerms = new CharArraySet(2, true);
  protectedTerms.add("two");
  protectedTerms.add("two three");

  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      // Shingles of up to 3 words, then drop everything except protected
      // terms (the inner TypeTokenFilter removes all types when not
      // protected).
      Tokenizer source = new StandardTokenizer();
      TokenStream sink = new ShingleFilter(source, 3);
      sink = new ProtectedTermFilter(protectedTerms, sink, in -> new TypeTokenFilter(in, Collections.singleton("ALL"), true));
      return new TokenStreamComponents(source, sink);
    }
  };

  String input = "one two three four";

  try (TokenStream ts = analyzer.tokenStream("", input)) {
    // terms, start offsets, end offsets, position increments,
    // position lengths, final offset
    assertTokenStreamContents(ts,
        new String[] { "two", "two three" },
        new int[] { 4, 4 },
        new int[] { 7, 13 },
        new int[] { 2, 0 },
        new int[] { 1, 2 },
        18);
  }
}
 
Example 13
Source Project: lucene-solr   Source File: TestAnalyzers.java    License: Apache License 2.0 5 votes vote down vote up
@SuppressWarnings("unused")
public void _testStandardConstants() {
  // Pure compile-time check: referencing each public token-type constant
  // ensures none has been removed or renamed. Never executed as a test.
  int[] typeIds = {
      StandardTokenizer.ALPHANUM,
      StandardTokenizer.NUM,
      StandardTokenizer.SOUTHEAST_ASIAN,
      StandardTokenizer.IDEOGRAPHIC,
      StandardTokenizer.HIRAGANA,
      StandardTokenizer.KATAKANA,
      StandardTokenizer.HANGUL,
  };
  String[] typeLabels = StandardTokenizer.TOKEN_TYPES;
}
 
Example 14
Source Project: lucene-solr   Source File: TestTypeTokenFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testTypeFilter() throws IOException {
  // Blacklist mode: every <NUM> token is dropped, leaving only the words.
  final StandardTokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader("121 is palindrome, while 123 is not"));
  Set<String> numericTypes = asSet("<NUM>");
  TokenStream filtered = new TypeTokenFilter(tokenizer, numericTypes);
  assertTokenStreamContents(filtered, new String[]{"is", "palindrome", "while", "is", "not"});
}
 
Example 15
Source Project: lucene-solr   Source File: TestTypeTokenFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testTypeFilterWhitelist() throws IOException {
  // Whitelist mode (third argument true): keep ONLY <NUM> tokens.
  final StandardTokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader("121 is palindrome, while 123 is not"));
  Set<String> numericTypes = Collections.singleton("<NUM>");
  TokenStream filtered = new TypeTokenFilter(tokenizer, numericTypes, true);
  assertTokenStreamContents(filtered, new String[]{"121", "123"});
}
 
Example 16
Source Project: subsonic   Source File: SearchService.java    License: GNU General Public License v3.0 5 votes vote down vote up
// Expands each term of the user query into a prefix-query fragment: terms
// are ASCII-folded and each is suffixed with "* " (e.g. "déjà" -> "deja* ").
private String analyzeQuery(String query) throws IOException {
    StringBuilder result = new StringBuilder();
    // Legacy (3.x) Lucene API: the tokenizer takes the reader in its
    // constructor, and TermAttribute predates CharTermAttribute.
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(new StandardTokenizer(LUCENE_VERSION, new StringReader(query)));
    TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
    while (filter.incrementToken()) {
        result.append(termAttribute.term()).append("* ");
    }
    // NOTE(review): the stream is never end()ed or close()d — presumably
    // harmless for an in-memory StringReader, but worth confirming.
    return result.toString();
}
 
Example 17
/**
 * Tokenizes {@code string} using Lucene's StandardTokenizer.
 *
 * <p>Note: StandardTokenizer does not lowercase; the original javadoc's
 * "converted to lowercase" claim did not match the code and was removed.
 *
 * @param string text to tokenize
 * @return arrayList of the tokens of string, in order
 * @throws IOException if reading the input fails
 */
public static ArrayList<String> tokenize(String string) throws IOException{
	ArrayList<String> retList = new ArrayList<String>();
	StandardTokenizer tokenizer = new StandardTokenizer();
	// Bug fix: the original created a StringReader but never attached it via
	// setReader() and never called reset(), so incrementToken() could not run.
	tokenizer.setReader(new StringReader(string));
	// Bug fix: read the token text through CharTermAttribute; Token.class is
	// not an attribute registered on a bare StandardTokenizer.
	org.apache.lucene.analysis.tokenattributes.CharTermAttribute termAtt =
			tokenizer.addAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
	tokenizer.reset();
	while(tokenizer.incrementToken()){
		retList.add(termAtt.toString());
	}
	// Complete the stream contract before closing.
	tokenizer.end();
	tokenizer.close();
	return retList;
}
 
Example 18
Source Project: spacewalk   Source File: NGramAnalyzer.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * @param fieldName ignored param
 * @param reader contains data to parse
 * @return TokenStream of ngrams
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    // Pipeline, innermost first: StandardTokenizer -> StandardFilter
    // -> LowerCaseFilter -> NGramTokenFilter(min_ngram, max_ngram).
    StandardTokenizer tokenized = new StandardTokenizer(reader);
    StandardFilter standardized = new StandardFilter(tokenized);
    LowerCaseFilter lowercased = new LowerCaseFilter(standardized);
    return new NGramTokenFilter(lowercased, min_ngram, max_ngram);
}
 
Example 19
Source Project: lucene-solr   Source File: TestTeeSinkTokenFilter.java    License: Apache License 2.0 4 votes vote down vote up
// Creates a StandardTokenizer reading the builder's current contents.
private StandardTokenizer standardTokenizer(StringBuilder builder) {
  String text = builder.toString();
  StandardTokenizer result = new StandardTokenizer();
  result.setReader(new StringReader(text));
  return result;
}
 
Example 20
// Returns a fresh StandardTokenizer fed from the supplied string.
private Tokenizer getTokenizerImpl( String input ) throws IOException {
  Tokenizer tokenizer = new StandardTokenizer();
  tokenizer.setReader( new StringReader( input ) );
  return tokenizer;
}
 
Example 21
// Older Lucene API variant: the reader is passed straight to the constructor.
private Tokenizer getTokenizerImpl( String input ) throws IOException {
  return new StandardTokenizer( new StringReader( input ) );
}
 
Example 22
Source Project: crate   Source File: StandardTokenizerFactory.java    License: Apache License 2.0 4 votes vote down vote up
@Override
public Tokenizer create() {
    // A new tokenizer per call, capped at the factory's configured maximum
    // token length.
    final StandardTokenizer result = new StandardTokenizer();
    result.setMaxTokenLength(maxTokenLength);
    return result;
}
 
Example 23
Source Project: lucene-solr   Source File: MorfologikAnalyzer.java    License: Apache License 2.0 3 votes vote down vote up
/**
 * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * that tokenize all text in the provided {@link Reader} with a
 * {@link StandardTokenizer} and run the result through a
 * {@link MorfologikFilter} backed by this analyzer's dictionary.
 *
 * @param field ignored field name
 * @return the assembled token stream components
 */
@Override
protected TokenStreamComponents createComponents(final String field) {
  final Tokenizer source = new StandardTokenizer();
  final TokenStream stemmed = new MorfologikFilter(source, dictionary);
  return new TokenStreamComponents(source, stemmed);
}
 
Example 24
Source Project: modernmt   Source File: ArabicAnalyzer.java    License: Apache License 2.0 3 votes vote down vote up
/**
 * Creates
 * {@link TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link TokenStreamComponents}
 * built from a {@link StandardTokenizer} filtered with
 * {@link ArabicNormalizationFilter}.
 *
 * NOTE(review): the original javadoc also listed {@code LowerCaseFilter},
 * {@code StopFilter}, {@code SetKeywordMarkerFilter} and
 * {@code ArabicStemFilter}, but this body applies none of them; the doc now
 * describes only what the visible code does.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName,
                                                 Reader reader) {
    final Tokenizer source = new StandardTokenizer(reader);
    TokenStream result = new ArabicNormalizationFilter(source);
    return new TokenStreamComponents(source, result);
}