Java Code Examples for org.apache.lucene.analysis.TokenStream#addAttribute()

The following examples show how to use org.apache.lucene.analysis.TokenStream#addAttribute(). They are drawn from open-source projects; the source file, project, and license are noted above each example.
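Before working through the examples, here is a minimal, self-contained sketch of the consume loop most of them follow. It is an illustration, not code from any of the projects below; the analyzer choice and the field name "body" are placeholders. Attributes are requested with addAttribute() before reset(), read inside the incrementToken() loop, and the stream is ended and closed afterwards.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AddAttributeDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("body", "Hello token streams")) {
      // addAttribute() returns the stream's single shared instance of the
      // requested attribute, creating it first if it does not exist yet.
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
      ts.reset();                  // mandatory before the first incrementToken()
      while (ts.incrementToken()) {
        System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
      }
      ts.end();                    // records end-of-stream state (final offset and position increment)
    }                              // try-with-resources calls close()
  }
}

Because addAttribute() always returns that shared instance, calling it repeatedly with the same class is harmless, which is why both producers (filters) and consumers of a stream can use it, as the examples below do.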
Example 1
Source File: Tagger.java    From lucene-solr with Apache License 2.0
public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
              TagClusterReducer tagClusterReducer, boolean skipAltTokens,
              boolean ignoreStopWords) throws IOException {
  this.terms = terms;
  this.liveDocs = liveDocs;
  this.tokenStream = tokenStream;
  this.skipAltTokens = skipAltTokens;
  this.ignoreStopWords = ignoreStopWords;
  byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
  posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
  taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
  tokenStream.reset();

  this.tagClusterReducer = tagClusterReducer;
}
 
Example 2
Source File: TestCompoundWordTokenFilter.java    From lucene-solr with Apache License 2.0
public void testRetainMockAttribute() throws Exception {
  CharArraySet dict = makeDictionary("abc", "d", "efg");
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("abcdefg"));
  TokenStream stream = new MockRetainAttributeFilter(tokenizer);
  stream = new DictionaryCompoundWordTokenFilter(
      stream, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
  MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    assertTrue("Custom attribute value was lost", retAtt.getRetain());
  }

}
 
Example 3
Source File: TransportExtendedAnalyzeAction.java    From elasticsearch-extended-analyze with Apache License 2.0
private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream, Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset) throws IOException {
    List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> tokens = new ArrayList<>();
    stream.reset();

    // the attributes through which each token's output is read
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            lastPosition = lastPosition + increment;
        }

        tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
            lastOffset + offset.endOffset(), type.type(), extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    return tokens;

}
 
Example 4
Source File: Tokenizers.java    From ache with Apache License 2.0
public List<String> tokenize(String cleanText) {
    try {
        TokenStream ts = analyzer.tokenStream("cleanText", cleanText);
        CharTermAttribute cattr = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        List<String> tokens = new ArrayList<String>();
        while (ts.incrementToken()) {
            String token = cattr.toString();
            tokens.add(token);
        }
        ts.end();   // per the TokenStream contract, call end() before close()
        ts.close();
        return tokens;
    } catch (IOException e) {
        throw new RuntimeException(
                "Shigle tokenization failed for string: " + cleanText, e);
    }
}
 
Example 5
Source File: AnalysisRequestHandlerBase.java    From lucene-solr with Apache License 2.0
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    tokenStream.end(); // TODO should we capture?
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }

  return tokens;
}
 
Example 6
Source File: EdgeNGramTokenFilterTest.java    From lucene-solr with Apache License 2.0
public void testEndPositionIncrement() throws IOException {
  TokenStream source = whitespaceMockTokenizer("seventeen one two three four");
  TokenStream input = new EdgeNGramTokenFilter(source, 8, 8, false);
  PositionIncrementAttribute posIncAtt = input.addAttribute(PositionIncrementAttribute.class);
  input.reset();
  while (input.incrementToken()) {} // drain the stream so that end() can record the final position increment
  input.end();
  assertEquals(4, posIncAtt.getPositionIncrement());
}
 
Example 7
Source File: TestKoreanNumberFilter.java    From lucene-solr with Apache License 2.0
public void analyze(Analyzer analyzer, Reader reader, Writer writer) throws IOException {
  TokenStream stream = analyzer.tokenStream("dummy", reader);
  stream.reset();

  CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

  while (stream.incrementToken()) {
    writer.write(termAttr.toString());
    writer.write("\n");
  }

  stream.end();    // per the TokenStream contract
  stream.close();
  reader.close();
  writer.close();
}
 
Example 8
Source File: TestJapaneseNumberFilter.java    From lucene-solr with Apache License 2.0
public void analyze(Analyzer analyzer, Reader reader, Writer writer) throws IOException {
  TokenStream stream = analyzer.tokenStream("dummy", reader);
  stream.reset();

  CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

  while (stream.incrementToken()) {
    writer.write(termAttr.toString());
    writer.write("\n");
  }

  stream.end();    // per the TokenStream contract
  stream.close();
  reader.close();
  writer.close();
}
 
Example 9
Source File: NGramTokenFilterTest.java    From lucene-solr with Apache License 2.0
public void testEndPositionIncrement() throws IOException {
  TokenStream source = whitespaceMockTokenizer("seventeen one two three four");
  TokenStream input = new NGramTokenFilter(source, 8, 8, false);
  PositionIncrementAttribute posIncAtt = input.addAttribute(PositionIncrementAttribute.class);
  input.reset();
  while (input.incrementToken()) {} // drain the stream so that end() can record the final position increment
  input.end();
  assertEquals(4, posIncAtt.getPositionIncrement());
}
 
Example 10
Source File: HighlighterTest.java    From lucene-solr with Apache License 2.0
public SynonymTokenizer(TokenStream realStream, Map<String, String> synonyms) {
  this.realStream = realStream;
  this.synonyms = synonyms;
  realTermAtt = realStream.addAttribute(CharTermAttribute.class);
  realPosIncrAtt = realStream.addAttribute(PositionIncrementAttribute.class);
  realOffsetAtt = realStream.addAttribute(OffsetAttribute.class);

  termAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
}
 
Example 11
Source File: TokenStreamOffsetStrategy.java    From lucene-solr with Apache License 2.0
TokenStreamOffsetsEnum(TokenStream ts, CharArrayMatcher[] matchers) throws IOException {
  this.stream = ts;
  this.matchers = matchers;
  matchDescriptions = new BytesRef[matchers.length];
  charTermAtt = ts.addAttribute(CharTermAttribute.class);
  offsetAtt = ts.addAttribute(OffsetAttribute.class);
  ts.reset();
}
 
Example 12
Source File: TestTeeSinkTokenFilter.java    From lucene-solr with Apache License 2.0
public void testGeneral() throws IOException {
  final TeeSinkTokenFilter source = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer1.toString()));
  final TokenStream sink = source.newSinkTokenStream();
  
  source.addAttribute(CheckClearAttributesAttribute.class);
  sink.addAttribute(CheckClearAttributesAttribute.class);
  
  assertTokenStreamContents(source, tokens1);
  assertTokenStreamContents(sink, tokens1);
}
 
Example 13
Source File: KuromojiUDF.java    From incubator-hivemall with Apache License 2.0
private static void analyzeTokens(@Nonnull final TokenStream stream,
        @Nonnull final List<Text> tokenResult, @Nonnull final List<Text> posResult)
        throws IOException {
    // obtain the attribute handles once; they are updated in place on each incrementToken()
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    PartOfSpeechAttribute posAttr = stream.addAttribute(PartOfSpeechAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAttr.toString();
        tokenResult.add(new Text(term));
        String pos = posAttr.getPartOfSpeech();
        posResult.add(new Text(pos));
    }
}
 
Example 14
Source File: SimpleNaiveBayesDocumentClassifier.java    From lucene-solr with Apache License 2.0
/**
 * Returns a token array from the {@link org.apache.lucene.analysis.TokenStream} in input
 *
 * @param tokenizedText the tokenized content of a field
 * @return a {@code String} array of the resulting tokens
 * @throws java.io.IOException If tokenization fails because there is a low-level I/O error
 */
protected String[] getTokenArray(TokenStream tokenizedText) throws IOException {
  Collection<String> tokens = new LinkedList<>();
  CharTermAttribute charTermAttribute = tokenizedText.addAttribute(CharTermAttribute.class);
  tokenizedText.reset();
  while (tokenizedText.incrementToken()) {
    tokens.add(charTermAttribute.toString());
  }
  tokenizedText.end();
  tokenizedText.close();
  return tokens.toArray(new String[0]);
}
 
Example 15
Source File: VietnameseAnalysisTokenTest.java    From elasticsearch-analysis-vietnamese with Apache License 2.0
private void inputToken(String inputText, String[] expectArray) throws IOException {
    TestAnalysis analysis = VietnameseAnalysisTest.createTestAnalysis();
    NamedAnalyzer analyzer = analysis.indexAnalyzers.get("vi_analyzer");
    assertNotNull(analyzer);

    TokenStream ts = analyzer.analyzer().tokenStream("test", inputText);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    for (String expected : expectArray) {
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term.toString(), equalTo(expected));
    }
    assertThat(ts.incrementToken(), equalTo(false));
}
 
Example 16
Source File: LuceneAnalyzerIntegrationTest.java    From tutorials with MIT License
public List<String> analyze(String text, Analyzer analyzer) throws IOException {
    List<String> result = new ArrayList<String>();
    TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
    CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        result.add(attr.toString());
    }
    tokenStream.end();   // per the TokenStream contract, end() and close() after consuming
    tokenStream.close();
    return result;
}
 
Example 17
Source File: DelimitedBoostTokenFilterTest.java    From lucene-solr with Apache License 2.0
void assertTermEquals(String expected, TokenStream stream, float expectedBoost) throws Exception {
  CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
  assertTrue(stream.incrementToken());
  assertEquals(expected, termAtt.toString());
  float actualBoost = boostAtt.getBoost();
  assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost);
}
 
Example 18
Source File: EdgeNGramTokenFilterTest.java    From lucene-solr with Apache License 2.0
public void testSupplementaryCharacters() throws IOException {
  for (int i = 0; i < 20; i++) {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;

    TokenStream tk = new KeywordTokenizer();
    ((Tokenizer)tk).setReader(new StringReader(s));
    tk = new EdgeNGramTokenFilter(tk, minGram, maxGram, preserveOriginal);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();

    if (codePointCount < minGram && preserveOriginal) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      assertEquals(s, termAtt.toString());
    }

    for (int j = minGram; j <= Math.min(codePointCount, maxGram); j++) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      final int end = Character.offsetByCodePoints(s, 0, j);
      assertEquals(s.substring(0, end), termAtt.toString());
    }

    if (codePointCount > maxGram && preserveOriginal) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      assertEquals(s, termAtt.toString());
    }

    assertFalse(tk.incrementToken());
    tk.close();
  }
}
 
Example 19
Source File: SearchSuggester.java    From webdsl with Apache License 2.0
@SuppressWarnings("deprecation")
public static ArrayList<String> findAutoCompletionsForField(Class<?> entityClass, String baseDir,
        String suggestedField, int maxSuggestionCount, Analyzer analyzer, String toSuggestOn) {

    if (toSuggestOn == null || toSuggestOn.isEmpty())
        return new ArrayList<String>();

    AutoCompleter autoCompleter = null;
    String indexPath = baseDir + suggestedField;
    try {
        autoCompleter = getAutoCompleter(indexPath);

        TokenStream tokenStream = analyzer.tokenStream(suggestedField, new StringReader(
                toSuggestOn));
        CharTermAttributeImpl ta = (CharTermAttributeImpl) tokenStream
                .addAttribute(CharTermAttribute.class);

        boolean dontstop = tokenStream.incrementToken();
        StringBuilder prefixSb = new StringBuilder(toSuggestOn.length() + 16);
        String word = "";

        while (dontstop) { // eat up all tokens; the last word becomes the completion seed
            word = ta.term();
            dontstop = tokenStream.incrementToken();
            if (dontstop)
                prefixSb.append(word).append(" ");
        }

        String prefix = prefixSb.toString();

        String[] suggestions = autoCompleter.suggestSimilar(word, maxSuggestionCount);

        ArrayList<String> allSuggestions = new ArrayList<String>();

        if (suggestions == null || suggestions.length == 0) {
            return allSuggestions;
        }

        for (int i = 0; i < suggestions.length; i++) {
            allSuggestions.add(prefix + suggestions[i]);
        }
        return allSuggestions;

    } catch (Exception e) {
        org.webdsl.logging.Logger.error("EXCEPTION", e);
        // if something goes wrong, close and remove the current AutoCompleter instance so it gets renewed
        try {
            if (autoCompleter != null) {
                autoCompleter.close();
            }
        } catch (IOException e2) {
            org.webdsl.logging.Logger.error("EXCEPTION", e2);
        }
        autoCompleterMap.remove(indexPath);
    }
    return new ArrayList<String>();
}
 
Example 20
Source File: Analysis.java    From Elasticsearch with Apache License 2.0
/**
 * Check whether the provided token stream is able to provide character
 * terms.
 * <p>Although most analyzers generate character terms (CharTermAttribute),
 * some token streams only contain binary terms (BinaryTermAttribute,
 * CharTermAttribute being a special type of BinaryTermAttribute), such as
 * {@link NumericTokenStream}, and are unsuitable for highlighting and
 * more-like-this queries, which expect character terms.</p>
 */
public static boolean isCharacterTokenStream(TokenStream tokenStream) {
    try {
        tokenStream.addAttribute(CharTermAttribute.class);
        return true;
    } catch (IllegalArgumentException e) {
        return false;
    }
}
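 
A hypothetical call site illustrating both outcomes (the field name, text, and value are placeholders, not from the Elasticsearch source; NumericTokenStream is the binary-only stream the javadoc above mentions, and its attribute factory rejects CharTermAttribute):

// Illustrative sketch, not taken from the Elasticsearch source:
TokenStream chars = new StandardAnalyzer().tokenStream("field", "some text");
Analysis.isCharacterTokenStream(chars);    // true: the stream supports CharTermAttribute

TokenStream numeric = new NumericTokenStream().setIntValue(42);
Analysis.isCharacterTokenStream(numeric);  // false: addAttribute(CharTermAttribute.class) throws IllegalArgumentException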