Java Code Examples for org.apache.lucene.analysis.TokenStream#getAttribute()

The following examples show how to use org.apache.lucene.analysis.TokenStream#getAttribute(). They are drawn from open-source projects; the source file, project, and license are noted above each example.
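As a minimal sketch of the pattern these examples share (assuming Lucene's StandardAnalyzer is available; the class name, field name "body", and sample text below are placeholders): getAttribute() returns an attribute instance only if the analysis chain has already registered it, and throws IllegalArgumentException otherwise, whereas addAttribute() registers the attribute on demand. This is why several examples below use addAttribute() for attributes a given chain may not produce, such as BoostAttribute or PayloadAttribute.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class GetAttributeSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("body", "Hello token streams")) {
            // The tokenizer already registered these attributes, so getAttribute() is safe here;
            // it would throw IllegalArgumentException for an attribute the chain never added.
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
            stream.reset(); // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term + " [" + offset.startOffset() + ".." + offset.endOffset() + "]");
            }
            stream.end(); // finalize offsets after the last token
        }
    }
}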
Example 1
Source File: VietnameseAnalysisTest.java    From elasticsearch-analysis-vietnamese with Apache License 2.0
public void testTokenOffset() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    NamedAnalyzer analyzer = analysis.indexAnalyzers.get("vi_analyzer");
    assertNotNull(analyzer);

    TokenStream ts = analyzer.analyzer().tokenStream("test", "Phụ tùng xe Mazda bán tải dưới 7 chỗ: ống dẫn gió tới két làm mát khí nạp- cao su lưu hóa, mới 100%, phục vụ BHBD. Ms:1D0013246A");
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
    ts.reset();
    String[] expected = new String[]{"phụ tùng", "xe", "mazda", "bán", "tải", "7", "chỗ", "ống", "dẫn", "gió", "tới", "két", "làm", "mát", "khí", "nạp", "cao su", "lưu hóa", "mới", "100%", "phục vụ", "bhbd", "ms", "1", "d0", "013246", "a"};
    int[] expectedOffset = new int[]{0, 9, 12, 18, 22, 31, 33, 38, 42, 46, 50, 54, 58, 62, 66, 70, 75, 82, 91, 95, 101, 109, 115, 118, 119, 121, 127};

    for (int i = 0; i < expected.length; i++) {
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term.toString(), equalTo(expected[i]));
        assertTrue(offset.startOffset() == expectedOffset[i]);
    }
    assertThat(ts.incrementToken(), equalTo(false));
}
 
Example 2
Source File: TestNGramFilters.java    From lucene-solr with Apache License 2.0
/**
 * Test EdgeNGramFilterFactory on tokens with payloads
 */
public void testEdgeNGramFilterPayload() throws Exception {
  Reader reader = new StringReader("test|0.1");
  TokenStream stream = whitespaceMockTokenizer(reader);
  stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
  stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream);

  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    BytesRef payData = payAttr.getPayload();
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData.bytes);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example 3
Source File: PinyinAnalysisTest.java    From elasticsearch-analysis-lc-pinyin with Artistic License 2.0
@Test
public void testSearch() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "重qing");

    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    tokenStream.reset();
    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals(charTermAttribute.toString(), "重");
    Assert.assertEquals(offsetAttribute.startOffset(), 0);
    Assert.assertEquals(offsetAttribute.endOffset(), 1);
    Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);

    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals(charTermAttribute.toString(), "qing");
    Assert.assertEquals(offsetAttribute.startOffset(), 1);
    Assert.assertEquals(offsetAttribute.endOffset(), 5);
    Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);

    tokenStream.close();
}
 
Example 4
Source File: LuceneToken.java    From jstarcraft-nlp with Apache License 2.0
public LuceneToken(TokenStream stream) {
    this.stream = stream;
    this.term = stream.getAttribute(CharTermAttribute.class);
    this.offset = stream.getAttribute(OffsetAttribute.class);
    try {
        this.flag = this.stream.incrementToken();
        if (!flag) {
            this.stream.close();
        }
    } catch (Exception exception) {
        try {
            this.stream.close();
        } catch (Exception throwable) {
            // Ignore failures while closing; the original exception is wrapped and rethrown below.
        }
        throw new RuntimeException(exception);
    }
}
 
Example 5
Source File: LuceneUtils.java    From modernmt with Apache License 2.0
public static Set<String> analyze(Analyzer analyzer, String content) throws IOException {
    HashSet<String> terms = new HashSet<>();

    TokenStream stream = null;

    try {
        stream = analyzer.tokenStream("none", content);
        stream.reset();

        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }

        stream.close();
    } finally {
        IOUtils.closeQuietly(stream);
    }

    return terms;
}
 
Example 6
Source File: TestDelimitedPayloadTokenFilterFactory.java    From lucene-solr with Apache License 2.0
public void testDelim() throws Exception {
  Reader reader = new StringReader("the*0.1 quick*0.1 red*0.1");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
  stream = tokenFilterFactory("DelimitedPayload",
      "encoder", "float",
      "delimiter", "*").create(stream);
  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    byte[] payData = payAttr.getPayload().bytes;
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example 7
Source File: QueryBuilder.java    From lucene-solr with Apache License 2.0
/** 
 * Creates complex boolean query from the cached tokenstream contents 
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
  BooleanQuery.Builder q = newBooleanQuery();
  List<TermAndBoost> currentQuery = new ArrayList<>();
  
  // The term and position-increment attributes are always present on an analysis chain,
  // so getAttribute() is safe; BoostAttribute may be absent, so addAttribute() registers
  // it (with its default boost) when needed.
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);

  stream.reset();
  while (stream.incrementToken()) {
    if (posIncrAtt.getPositionIncrement() != 0) {
      add(q, currentQuery, operator);
      currentQuery.clear();
    }
    currentQuery.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
  }
  add(q, currentQuery, operator);
  
  return q.build();
}
 
Example 8
Source File: ChineseWordAnalyzerTest.java    From word with Apache License 2.0
@Test
public void test2() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "叔叔亲了我妈妈也亲了我");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[叔叔, 亲了, 我, 妈妈, 也, 亲了, 我]";
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("tokenization failed: " + e.getMessage());
    }
}
 
Example 9
Source File: KuromojiUDF.java    From incubator-hivemall with Apache License 2.0
private static void analyzeTokens(@Nonnull final TokenStream stream,
        @Nonnull final List<Text> tokens) throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAttr.toString();
        tokens.add(new Text(term));
    }
}
 
Example 10
Source File: DelimitedBoostTokenFilterTest.java    From lucene-solr with Apache License 2.0
void assertTermEquals(String expected, TokenStream stream, float expectedBoost) throws Exception {
  CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
  assertTrue(stream.incrementToken());
  assertEquals(expected, termAtt.toString());
  float actualBoost = boostAtt.getBoost();
  assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost);
}
 
Example 11
Source File: SmartcnUDF.java    From incubator-hivemall with Apache License 2.0
private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
        throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAttr.toString();
        results.add(new Text(term));
    }
}
 
Example 12
Source File: QueryBuilder.java    From lucene-solr with Apache License 2.0
/**
 * Creates simple term query from the cached tokenstream contents 
 */
protected Query analyzeTerm(String field, TokenStream stream) throws IOException {
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
  
  stream.reset();
  if (!stream.incrementToken()) {
    throw new AssertionError();
  }
  
  return newTermQuery(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost());
}
 
Example 13
Source File: LuceneUtil.java    From antsdb with GNU Lesser General Public License v3.0
static void tokenize(String text, BiConsumer<String, String> lambda) {
	try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
		TokenStream stream = analyzer.tokenStream("", text);
		CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
		TypeAttribute type = stream.getAttribute(TypeAttribute.class);
		stream.reset();
		while (stream.incrementToken()) {
			lambda.accept(type.type(), term.toString());
		}
	}
	catch (IOException x) {
		throw new RuntimeException(x);
	}
}
 
Example 14
Source File: BaseformTokenFilterTests.java    From elasticsearch-analysis-baseform with Apache License 2.0
private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        assertTrue(i < expected.length);
        assertEquals(expected[i], termAttr.toString());
        i++;
    }
    assertEquals(i, expected.length);
    stream.close();
}
 
Example 15
Source File: GermanBaseformTokenFilterTests.java    From elasticsearch-analysis-baseform with Apache License 2.0
private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        assertTrue(i < expected.length);
        assertEquals(expected[i++], termAttr.toString());
    }
    assertEquals(i, expected.length);
    stream.close();
}
 
Example 16
Source File: TestJapaneseTokenizer.java    From lucene-solr with Apache License 2.0
private ArrayList<String> makeTokenList(Analyzer a, String in) throws Exception {
  ArrayList<String> list = new ArrayList<>();
  TokenStream ts = a.tokenStream("dummy", in);
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);

  ts.reset();
  while (ts.incrementToken()) {
    list.add(termAtt.toString());
  }
  ts.end();
  ts.close();
  return list;
}
 
Example 17
Source File: ChineseMatcher.java    From zxl with Apache License 2.0
public double oneWayMatch(String text1, String text2) {
    try {
        Set<String> set = new HashSet<String>(10);
        TokenStream tokenStream = smartChineseAnalyzer.tokenStream("field", text1);
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            set.add(charTermAttribute.toString());
        }
        int originalCount = set.size();
        tokenStream.end();
        tokenStream.close();
        tokenStream = smartChineseAnalyzer.tokenStream("field", text2);
        charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        int smallWeightWordsCount = 0;
        int denominator = 0;
        while (tokenStream.incrementToken()) {
            denominator++;
            String word = charTermAttribute.toString();
            int tempSize = set.size();
            set.add(word);
            if (tempSize + 1 == set.size() && smallWeightWords.contains(word)) {
                smallWeightWordsCount++;
            }
        }
        int numerator = set.size() - originalCount;
        double unmatchRate = (smallWeightWordsCount * smallWeight + numerator - ((double) smallWeightWordsCount)) / denominator;
        tokenStream.end();
        tokenStream.close();
        return unmatchRate;
    } catch (IOException e) {
        return 1D;
    }
}
 
Example 18
Source File: QueryBuilder.java    From lucene-solr with Apache License 2.0
/** 
 * Creates simple boolean query from the cached tokenstream contents 
 */
protected Query analyzeBoolean(String field, TokenStream stream) throws IOException {
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
  
  stream.reset();
  List<TermAndBoost> terms = new ArrayList<>();
  while (stream.incrementToken()) {
    terms.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
  }
  
  return newSynonymQuery(terms.toArray(new TermAndBoost[0]));
}
 
Example 19
Source File: Zemberek2StemFilterFactory.java    From lucene-solr-analysis-turkish with Apache License 2.0
public static void main(String[] args) throws IOException {
    StringReader reader = new StringReader("elması utansın ortaklar çekişme ile");

    Map<String, String> map = new HashMap<>();
    map.put("strategy", "frequency");

    Zemberek2StemFilterFactory factory = new Zemberek2StemFilterFactory(map);

    WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
    whitespaceTokenizer.setReader(reader);

    TokenStream stream = factory.create(whitespaceTokenizer);

    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
        String term = termAttribute.toString();
        System.out.println(term);
    }
    stream.end();
    reader.close();
}
 
Example 20
Source File: TokenSourcesTest.java    From lucene-solr with Apache License 2.0
public void testPayloads() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  myFieldType.setStoreTermVectors(true);
  myFieldType.setStoreTermVectorOffsets(true);
  myFieldType.setStoreTermVectorPositions(true);
  myFieldType.setStoreTermVectorPayloads(true);

  curOffset = 0;

  Token[] tokens = new Token[] {
    getToken("foxes"),
    getToken("can"),
    getToken("jump"),
    getToken("high")
  };

  Document doc = new Document();
  doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
  writer.addDocument(doc);

  IndexReader reader = writer.getReader();
  writer.close();
  assertEquals(1, reader.numDocs());

  TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);

  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
  PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);

  ts.reset();
  for(Token token : tokens) {
    assertTrue(ts.incrementToken());
    assertEquals(token.toString(), termAtt.toString());
    assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
    assertEquals(token.getPayload(), payloadAtt.getPayload());
    assertEquals(token.startOffset(), offsetAtt.startOffset());
    assertEquals(token.endOffset(), offsetAtt.endOffset());
  }

  assertFalse(ts.incrementToken());

  reader.close();
  dir.close();
}