Java Code Examples for org.apache.lucene.analysis.Tokenizer#addAttribute()

The following examples show how to use org.apache.lucene.analysis.Tokenizer#addAttribute(). Each snippet comes from an open-source project; the source file and license are noted above it. addAttribute() returns the stream's shared instance of the requested attribute class, creating and registering one if the stream does not already have it, and the tokenizer refills that instance on every call to incrementToken().
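All of the examples follow the same consumption contract, so as a reference point here is a minimal, self-contained sketch of it (the class name AddAttributeSketch and the sample text are illustrative, and it assumes a Lucene version recent enough for WhitespaceTokenizer's no-argument constructor and setReader()): register attributes with addAttribute(), then reset(), loop on incrementToken(), and finish with end() and close().

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AddAttributeSketch {
  public static void main(String[] args) throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("hello attribute world"));

    // addAttribute() must be called before reset(); it returns the stream's
    // single shared instance of the requested attribute
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);

    tokenizer.reset();                 // required before consuming
    while (tokenizer.incrementToken()) {
      // the same attribute instances are refilled for each token
      System.out.println(termAtt + " [" + offsetAtt.startOffset()
          + "-" + offsetAtt.endOffset() + "]");
    }
    tokenizer.end();                   // records the end-of-input offset
    tokenizer.close();                 // releases the underlying reader
  }
}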
Example 1
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0
private ArrayList<char[]> tokenize( String input ) throws IOException {

    Log.debug( "tokenize '" + input + "'" );
    ArrayList<char[]> tokens = new ArrayList<char[]>( );
    Tokenizer tk = getTokenizerImpl( input );

    CharTermAttribute term = tk.addAttribute( CharTermAttribute.class );
    tk.reset( );
    while (tk.incrementToken( ) ) {
      // the attribute's buffer is reused on every call, so copy the term out
      int bufLen = term.length();
      char[] copy = new char[ bufLen ];
      System.arraycopy( term.buffer( ), 0, copy, 0, bufLen );
      tokens.add( copy );
    }
    // finish and release the stream per the TokenStream contract
    tk.end( );
    tk.close( );

    return tokens;
  }
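The explicit copy out of term.buffer() matters: addAttribute() hands back a single reused instance, so the buffer's contents are overwritten on the next incrementToken() call, and storing the buffer itself rather than a copy would corrupt the collected tokens.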
 
Example 2
Source File: TestAnsj.java    From ansj4solr with Apache License 2.0
public static void main(String[] args) throws IOException {
	List<Term> parse = ToAnalysis.parse("天天向上,媒体打打。《回家真好》");
	System.out.println(parse);

	Tokenizer tokenizer = new AnsjTokenizer(new StringReader("天天向上,媒体打打。《回家真好》"), 0, true);
	CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
	OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
	PositionIncrementAttribute positionIncrementAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);

	tokenizer.reset(); // the stream must be reset before the first incrementToken()
	while (tokenizer.incrementToken()) {
		System.out.print(termAtt.toString());
		System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
		System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
	}
	tokenizer.end();
	tokenizer.close();
}
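Each addAttribute() call here registers a separate view (term text, offsets, position increments) onto the same underlying token, and the tokenizer updates all of them together on each incrementToken().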
 
Example 3
Source File: HighlighterTest.java    From lucene-solr with Apache License 2.0
@Override
public TokenStreamComponents createComponents(String arg0) {
  Tokenizer stream = new MockTokenizer(MockTokenizer.SIMPLE, true);
  // the return values are ignored: these calls only ensure the attributes
  // are registered on the stream before the filter chain consumes it
  stream.addAttribute(CharTermAttribute.class);
  stream.addAttribute(PositionIncrementAttribute.class);
  stream.addAttribute(OffsetAttribute.class);
  return new TokenStreamComponents(stream, new SynonymTokenizer(stream, synonyms));
}
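Because addAttribute() is idempotent, any filter or consumer that later asks the stream for one of these attribute classes receives the very instances registered here rather than new ones.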
 
Example 4
Source File: MeCabKoStandardTokenizerTest.java    From mecab-ko-lucene-analyzer with Apache License 2.0
private String tokenizerToString(Tokenizer tokenizer) throws Exception {
  // no casts are needed: addAttribute() is generic in the attribute class
  OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute posIncrAtt =
      tokenizer.addAttribute(PositionIncrementAttribute.class);
  PositionLengthAttribute posLengthAtt =
      tokenizer.addAttribute(PositionLengthAttribute.class);
  CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
  TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
  SemanticClassAttribute semanticClass =
      tokenizer.addAttribute(SemanticClassAttribute.class);
  PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class);

  // the tokenizer is assumed to have been reset() by the caller
  StringBuilder result = new StringBuilder();
  while (tokenizer.incrementToken()) {
    result.append(new String(term.buffer(), 0, term.length())).append(":");
    result.append(type.type()).append(":");
    result.append(pos.partOfSpeech()).append(":");
    result.append(semanticClass.semanticClass()).append(":");
    result.append(posIncrAtt.getPositionIncrement()).append(":");
    result.append(posLengthAtt.getPositionLength()).append(":");
    result.append(extOffset.startOffset()).append(":");
    result.append(extOffset.endOffset());
    result.append(",");
  }
  tokenizer.end();
  return result.toString();
}
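SemanticClassAttribute and PartOfSpeechAttribute are custom attributes defined by mecab-ko-lucene-analyzer itself; addAttribute() works with any Attribute interface, instantiating it through the stream's AttributeFactory (the default factory looks up an implementation class named after the interface with an Impl suffix).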
 
Example 5
Source File: NGramTokenizerTest.java    From lucene-solr with Apache License 2.0
static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
  // convert the string to code points
  final int[] codePoints = toCodePoints(s);
  final int[] offsets = new int[codePoints.length + 1];
  for (int i = 0; i < codePoints.length; ++i) {
    offsets[i+1] = offsets[i] + Character.charCount(codePoints[i]);
  }
  final Tokenizer grams = new NGramTokenizer(minGram, maxGram, edgesOnly) {
    @Override
    protected boolean isTokenChar(int chr) {
      return nonTokenChars.indexOf(chr) < 0;
    }
  };
  grams.setReader(new StringReader(s));
  final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
  final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
  final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
  grams.reset();
  for (int start = 0; start < codePoints.length; ++start) {
    nextGram:
    for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
      if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
        // not on an edge
        continue nextGram;
      }
      for (int j = start; j < end; ++j) {
        if (!isTokenChar(nonTokenChars, codePoints[j])) {
          continue nextGram;
        }
      }
      assertTrue(grams.incrementToken());
      assertArrayEquals(ArrayUtil.copyOfSubArray(codePoints, start, end), toCodePoints(termAtt));
      assertEquals(1, posIncAtt.getPositionIncrement());
      assertEquals(1, posLenAtt.getPositionLength());
      assertEquals(offsets[start], offsetAtt.startOffset());
      assertEquals(offsets[end], offsetAtt.endOffset());
    }
  }
  assertFalse(grams.incrementToken());
  grams.end();
  assertEquals(s.length(), offsetAtt.startOffset());
  assertEquals(s.length(), offsetAtt.endOffset());
}
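The final two assertions check the end() contract: after end() returns, the OffsetAttribute holds the final offset, i.e. the total length of the input read, which here is s.length() for both start and end.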