Java Code Examples for org.apache.lucene.analysis.TokenStream#getAttribute()

The following examples show how to use org.apache.lucene.analysis.TokenStream#getAttribute(). They are drawn from open-source projects; the source file, project, and license are noted above each example.
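As a minimal sketch of the pattern these examples share (assuming Lucene's StandardAnalyzer is available; the class name, field name "body", and sample text below are placeholders): getAttribute() returns an attribute instance only if the analysis chain has already registered it, and throws IllegalArgumentException otherwise, whereas addAttribute() registers the attribute on demand. This is why several examples below use addAttribute() for attributes a given chain may not produce, such as BoostAttribute or PayloadAttribute.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class GetAttributeSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("body", "Hello token streams")) {
            // The tokenizer already registered these attributes, so getAttribute() is safe here;
            // it would throw IllegalArgumentException for an attribute the chain never added.
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
            stream.reset(); // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term + " [" + offset.startOffset() + ".." + offset.endOffset() + "]");
            }
            stream.end(); // finalize offsets after the last token
        }
    }
}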
Example 1
Source File: VietnameseAnalysisTest.java    From elasticsearch-analysis-vietnamese with Apache License 2.0
public void testTokenOffset() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    NamedAnalyzer analyzer = analysis.indexAnalyzers.get("vi_analyzer");
    assertNotNull(analyzer);

    TokenStream ts = analyzer.analyzer().tokenStream("test", "Phụ tùng xe Mazda bán tải dưới 7 chỗ: ống dẫn gió tới két làm mát khí nạp- cao su lưu hóa, mới 100%, phục vụ BHBD. Ms:1D0013246A");
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
    ts.reset();
    String[] expected = new String[]{"phụ tùng", "xe", "mazda", "bán", "tải", "7", "chỗ", "ống", "dẫn", "gió", "tới", "két", "làm", "mát", "khí", "nạp", "cao su", "lưu hóa", "mới", "100%", "phục vụ", "bhbd", "ms", "1", "d0", "013246", "a"};
    int[] expectedOffset = new int[]{0, 9, 12, 18, 22, 31, 33, 38, 42, 46, 50, 54, 58, 62, 66, 70, 75, 82, 91, 95, 101, 109, 115, 118, 119, 121, 127};

    for (int i = 0; i < expected.length; i++) {
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term.toString(), equalTo(expected[i]));
        assertTrue(offset.startOffset() == expectedOffset[i]);
    }
    assertThat(ts.incrementToken(), equalTo(false));
}
 
Example 2
Source File: TestNGramFilters.java    From lucene-solr with Apache License 2.0
/**
 * Test EdgeNGramFilterFactory on tokens with payloads
 */
public void testEdgeNGramFilterPayload() throws Exception {
  Reader reader = new StringReader("test|0.1");
  TokenStream stream = whitespaceMockTokenizer(reader);
  stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
  stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream);

  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    BytesRef payData = payAttr.getPayload();
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData.bytes);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example 3
Source File: PinyinAnalysisTest.java    From elasticsearch-analysis-lc-pinyin with Artistic License 2.0
@Test
public void testSearch() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "重qing");

    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    tokenStream.reset();
    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals(charTermAttribute.toString(), "重");
    Assert.assertEquals(offsetAttribute.startOffset(), 0);
    Assert.assertEquals(offsetAttribute.endOffset(), 1);
    Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);

    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals(charTermAttribute.toString(), "qing");
    Assert.assertEquals(offsetAttribute.startOffset(), 1);
    Assert.assertEquals(offsetAttribute.endOffset(), 5);
    Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);

    tokenStream.close();
}
 
Example 4
Source File: LuceneToken.java    From jstarcraft-nlp with Apache License 2.0
public LuceneToken(TokenStream stream) {
    this.stream = stream;
    this.term = stream.getAttribute(CharTermAttribute.class);
    this.offset = stream.getAttribute(OffsetAttribute.class);
    try {
        this.flag = this.stream.incrementToken();
        if (!flag) {
            this.stream.close();
        }
    } catch (Exception exception) {
        try {
            this.stream.close();
        } catch (Exception throwable) {
            // Ignore failures while closing; the original exception is wrapped and rethrown below.
        }
        throw new RuntimeException(exception);
    }
}
 
Example 5
Source File: LuceneUtils.java    From modernmt with Apache License 2.0
public static Set<String> analyze(Analyzer analyzer, String content) throws IOException {
    HashSet<String> terms = new HashSet<>();

    TokenStream stream = null;

    try {
        stream = analyzer.tokenStream("none", content);
        stream.reset();

        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }

        stream.close();
    } finally {
        IOUtils.closeQuietly(stream);
    }

    return terms;
}
 
Example 6
Source File: TestDelimitedPayloadTokenFilterFactory.java    From lucene-solr with Apache License 2.0
public void testDelim() throws Exception {
  Reader reader = new StringReader("the*0.1 quick*0.1 red*0.1");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
  stream = tokenFilterFactory("DelimitedPayload",
      "encoder", "float",
      "delimiter", "*").create(stream);
  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    byte[] payData = payAttr.getPayload().bytes;
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example 7
Source File: QueryBuilder.java    From lucene-solr with Apache License 2.0
/** 
 * Creates complex boolean query from the cached tokenstream contents 
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
  BooleanQuery.Builder q = newBooleanQuery();
  List<TermAndBoost> currentQuery = new ArrayList<>();
  
  // The term and position-increment attributes are always present on an analysis chain,
  // so getAttribute() is safe; BoostAttribute may be absent, so addAttribute() registers
  // it (with its default boost) when needed.
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);

  stream.reset();
  while (stream.incrementToken()) {
    if (posIncrAtt.getPositionIncrement() != 0) {
      add(q, currentQuery, operator);
      currentQuery.clear();
    }
    currentQuery.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
  }
  add(q, currentQuery, operator);
  
  return q.build();
}
 
Example 8
Source File: ChineseWordAnalyzerTest.java    From word with Apache License 2.0
@Test
public void test2() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "叔叔亲了我妈妈也亲了我");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[叔叔, 亲了, 我, 妈妈, 也, 亲了, 我]";
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("tokenization failed: " + e.getMessage());
    }
}
 
Example 9
Source File: KuromojiUDF.java    From incubator-hivemall with Apache License 2.0
private static void analyzeTokens(@Nonnull final TokenStream stream,
        @Nonnull final List<Text> tokens) throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAttr.toString();
        tokens.add(new Text(term));
    }
}
 
Example 10
Source File: DelimitedBoostTokenFilterTest.java    From lucene-solr with Apache License 2.0
void assertTermEquals(String expected, TokenStream stream, float expectedBoost) throws Exception {
  CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
  assertTrue(stream.incrementToken());
  assertEquals(expected, termAtt.toString());
  float actualBoost = boostAtt.getBoost();
  assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost);
}
 
Example 11
Source File: SmartcnUDF.java    From incubator-hivemall with Apache License 2.0
private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
        throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAttr.toString();
        results.add(new Text(term));
    }
}
 
Example 12
Source File: QueryBuilder.java    From lucene-solr with Apache License 2.0
/**
 * Creates simple term query from the cached tokenstream contents 
 */
protected Query analyzeTerm(String field, TokenStream stream) throws IOException {
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
  
  stream.reset();
  if (!stream.incrementToken()) {
    throw new AssertionError();
  }
  
  return newTermQuery(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost());
}
 
Example 13
Source File: LuceneUtil.java    From antsdb with GNU Lesser General Public License v3.0
static void tokenize(String text, BiConsumer<String, String> lambda) {
	try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
		TokenStream stream = analyzer.tokenStream("", text);
		CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
		TypeAttribute type = stream.getAttribute(TypeAttribute.class);
		stream.reset();
		while (stream.incrementToken()) {
			lambda.accept(type.type(), term.toString());
		}
	}
	catch (IOException x) {
		throw new RuntimeException(x);
	}
}
 
Example 14
Source File: BaseformTokenFilterTests.java    From elasticsearch-analysis-baseform with Apache License 2.0
private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        assertTrue(i < expected.length);
        assertEquals(expected[i], termAttr.toString());
        i++;
    }
    assertEquals(i, expected.length);
    stream.close();
}
 
Example 15
Source File: GermanBaseformTokenFilterTests.java    From elasticsearch-analysis-baseform with Apache License 2.0
private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        assertTrue(i < expected.length);
        assertEquals(expected[i++], termAttr.toString());
    }
    assertEquals(i, expected.length);
    stream.close();
}
 
Example 16
Source File: TestJapaneseTokenizer.java    From lucene-solr with Apache License 2.0
private ArrayList<String> makeTokenList(Analyzer a, String in) throws Exception {
  ArrayList<String> list = new ArrayList<>();
  TokenStream ts = a.tokenStream("dummy", in);
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);

  ts.reset();
  while (ts.incrementToken()) {
    list.add(termAtt.toString());
  }
  ts.end();
  ts.close();
  return list;
}
 
Example 17
Source File: ChineseMatcher.java    From zxl with Apache License 2.0
public double oneWayMatch(String text1, String text2) {
    try {
        Set<String> set = new HashSet<String>(10);
        TokenStream tokenStream = smartChineseAnalyzer.tokenStream("field", text1);
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            set.add(charTermAttribute.toString());
        }
        int originalCount = set.size();
        tokenStream.end();
        tokenStream.close();
        tokenStream = smartChineseAnalyzer.tokenStream("field", text2);
        charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        int smallWeightWordsCount = 0;
        int denominator = 0;
        while (tokenStream.incrementToken()) {
            denominator++;
            String word = charTermAttribute.toString();
            int tempSize = set.size();
            set.add(word);
            if (tempSize + 1 == set.size() && smallWeightWords.contains(word)) {
                smallWeightWordsCount++;
            }
        }
        int numerator = set.size() - originalCount;
        double unmatchRate = (smallWeightWordsCount * smallWeight + numerator - ((double) smallWeightWordsCount)) / denominator;
        tokenStream.end();
        tokenStream.close();
        return unmatchRate;
    } catch (IOException e) {
        return 1D;
    }
}
 
Example 18
Source File: QueryBuilder.java    From lucene-solr with Apache License 2.0
/** 
 * Creates simple boolean query from the cached tokenstream contents 
 */
protected Query analyzeBoolean(String field, TokenStream stream) throws IOException {
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
  
  stream.reset();
  List<TermAndBoost> terms = new ArrayList<>();
  while (stream.incrementToken()) {
    terms.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
  }
  
  return newSynonymQuery(terms.toArray(new TermAndBoost[0]));
}
 
Example 19
Source File: Zemberek2StemFilterFactory.java    From lucene-solr-analysis-turkish with Apache License 2.0
public static void main(String[] args) throws IOException {
    StringReader reader = new StringReader("elması utansın ortaklar çekişme ile");

    Map<String, String> map = new HashMap<>();
    map.put("strategy", "frequency");

    Zemberek2StemFilterFactory factory = new Zemberek2StemFilterFactory(map);

    WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
    whitespaceTokenizer.setReader(reader);

    TokenStream stream = factory.create(whitespaceTokenizer);

    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
        String term = termAttribute.toString();
        System.out.println(term);
    }
    stream.end();
    reader.close();
}
 
Example 20
Source File: TokenSourcesTest.java    From lucene-solr with Apache License 2.0
public void testPayloads() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  myFieldType.setStoreTermVectors(true);
  myFieldType.setStoreTermVectorOffsets(true);
  myFieldType.setStoreTermVectorPositions(true);
  myFieldType.setStoreTermVectorPayloads(true);

  curOffset = 0;

  Token[] tokens = new Token[] {
    getToken("foxes"),
    getToken("can"),
    getToken("jump"),
    getToken("high")
  };

  Document doc = new Document();
  doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
  writer.addDocument(doc);

  IndexReader reader = writer.getReader();
  writer.close();
  assertEquals(1, reader.numDocs());

  TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);

  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
  PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);

  ts.reset();
  for(Token token : tokens) {
    assertTrue(ts.incrementToken());
    assertEquals(token.toString(), termAtt.toString());
    assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
    assertEquals(token.getPayload(), payloadAtt.getPayload());
    assertEquals(token.startOffset(), offsetAtt.startOffset());
    assertEquals(token.endOffset(), offsetAtt.endOffset());
  }

  assertFalse(ts.incrementToken());

  reader.close();
  dir.close();
}