Java Code Examples for org.apache.lucene.analysis.TokenStream#close()

The following examples show how to use org.apache.lucene.analysis.TokenStream#close(). You can go to the original project or source file by following the links above each example.
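All of the examples share the same TokenStream lifecycle: obtain the stream, reset() it, iterate with incrementToken(), call end(), and finally close(). The snippet below is a minimal sketch of that lifecycle, not taken from any of the projects listed here; the analyzer, field name, and text are hypothetical placeholders.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamLifecycleSketch {
  // Prints every term produced by the analyzer for the given text.
  // "field" is a placeholder field name; any value works for ad-hoc analysis.
  static void printTokens(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("field", text);
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    try {
      stream.reset();                    // required before the first incrementToken()
      while (stream.incrementToken()) {  // advance to the next token
        System.out.println(termAtt.toString());
      }
      stream.end();                      // record the final offset state
    } finally {
      stream.close();                    // release resources even if iteration failed
    }
  }
}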
Example 1
Source File: TestNGramFilters.java    From lucene-solr with Apache License 2.0
/**
 * Test EdgeNGramFilterFactory on tokens with payloads
 */
public void testEdgeNGramFilterPayload() throws Exception {
  Reader reader = new StringReader("test|0.1");
  TokenStream stream = whitespaceMockTokenizer(reader);
  stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
  stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream);

  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    BytesRef payData = payAttr.getPayload();
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData.bytes);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example 2
Source File: AnalyzerTest.java    From mmseg4j-solr with Apache License 2.0
public static void printlnToken(String txt, Analyzer analyzer) throws IOException {
	System.out.println("---------"+txt.length()+"\n"+txt);
	TokenStream ts = analyzer.tokenStream("text", new StringReader(txt));
	/*// For Lucene 2.9 and below:
	for(Token t= new Token(); (t=ts.next(t)) !=null;) {
		System.out.println(t);
	}*/
	/*while(ts.incrementToken()) {
		TermAttribute termAtt = (TermAttribute)ts.getAttribute(TermAttribute.class);
		OffsetAttribute offsetAtt = (OffsetAttribute)ts.getAttribute(OffsetAttribute.class);
		TypeAttribute typeAtt = (TypeAttribute)ts.getAttribute(TypeAttribute.class);

		System.out.println("("+termAtt.term()+","+offsetAtt.startOffset()+","+offsetAtt.endOffset()+",type="+typeAtt.type()+")");
	}*/
	ts.reset();
	for(PackedTokenAttributeImpl t= new PackedTokenAttributeImpl(); (t=TokenUtils.nextToken(ts, t)) !=null;) {
		System.out.println(t);
	}
	ts.close();
}
 
Example 3
Source File: LuceneUtils.java    From SciGraph with Apache License 2.0
public static List<String> getTokenization(Analyzer analyzer, CharSequence term) {
  List<String> ret = Lists.newArrayList();

  try {
    TokenStream stream = analyzer.tokenStream("", new StringReader(term.toString()));
    CharTermAttribute token = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      ret.add(token.toString());
    }
    stream.close();
  } catch (IOException e) {
    e.printStackTrace();
  }
  return ret;
}
 
Example 4
Source File: ReadTokensTask.java    From lucene-solr with Apache License 2.0
@Override
public int doLogic() throws Exception {
  List<IndexableField> fields = doc.getFields();
  Analyzer analyzer = getRunData().getAnalyzer();
  int tokenCount = 0;
  for(final IndexableField field : fields) {
    if (field.fieldType().indexOptions() == IndexOptions.NONE ||
        field.fieldType().tokenized() == false) {
      continue;
    }
    
    final TokenStream stream = field.tokenStream(analyzer, null);
    // reset the TokenStream to the first token
    stream.reset();

    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    while(stream.incrementToken()) {
      termAtt.getBytesRef();
      tokenCount++;
    }
    stream.end();
    stream.close();
  }
  totalTokenCount += tokenCount;
  return tokenCount;
}
 
Example 5
Source File: ChineseWordAnalyzerTest.java    From word with Apache License 2.0
@Test
public void test2() {
    try{
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "叔叔亲了我妈妈也亲了我");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while(tokenStream.incrementToken()){
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[叔叔, 亲了, 我, 妈妈, 也, 亲了, 我]";
        assertEquals(expResult, words.toString());
    }catch(IOException e){
        fail("Tokenization failed: " + e.getMessage());
    }
}
 
Example 6
Source File: AnalyzerFactoryTestCase.java    From airsonic-advanced with GNU General Public License v3.0
private List<String> toTermString(String field, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzerFactory.getAnalyzer().tokenStream(field,
                new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString()
                    .replaceAll("^term\\=", ""));
        }
        stream.close();
    } catch (IOException e) {
        LoggerFactory.getLogger(AnalyzerFactoryTestCase.class)
                .error("Error during Token processing.", e);
    }
    return result;
}
 
Example 7
Source File: SuggestUtils.java    From Elasticsearch with Apache License 2.0
/** NOTE: this method closes the TokenStream, even on exception, which is awkward
 *  because really the caller who called {@link Analyzer#tokenStream} should close it,
 *  but when trying that there are recursion issues when we try to use the same
 *  TokenStream twice in the same recursion... */
public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
    int numTokens = 0;
    boolean success = false;
    try {
        stream.reset();
        consumer.reset(stream);
        while (stream.incrementToken()) {
            consumer.nextToken();
            numTokens++;
        }
        consumer.end();
        success = true;
    } finally {
        if (success) {
            stream.close();
        } else {
            IOUtils.closeWhileHandlingException(stream);
        }
    }
    return numTokens;
}
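As the comment above notes, it is usually cleaner for the caller of Analyzer#tokenStream to close the stream itself. Since TokenStream implements Closeable, that caller-side ownership can be expressed with try-with-resources; the sketch below is illustrative only, with a hypothetical analyzer, field name, and text, and assumes the consuming code does not also close the stream.

// Caller-owned lifecycle: try-with-resources closes the stream on every path,
// so the consuming code only has to reset, iterate, and end it.
try (TokenStream stream = analyzer.tokenStream("field", text)) {
    stream.reset();
    while (stream.incrementToken()) {
        // read token attributes here
    }
    stream.end();
}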
 
Example 8
Source File: TestDelimitedPayloadTokenFilterFactory.java    From lucene-solr with Apache License 2.0
public void testEncoder() throws Exception {
  Reader reader = new StringReader("the|0.1 quick|0.1 red|0.1");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
  stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);

  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    byte[] payData = payAttr.getPayload().bytes;
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example 9
Source File: Tokenizers.java    From ache with Apache License 2.0
public List<String> tokenize(String cleanText) {
    try {
        TokenStream ts = analyzer.tokenStream("cleanText", cleanText);
        CharTermAttribute cattr = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        List<String> tokens = new ArrayList<String>();
        while (ts.incrementToken()) {
            String token = cattr.toString();
            tokens.add(token);
        }
        ts.close();
        return tokens;
    } catch (IOException e) {
        throw new RuntimeException(
                "Shingle tokenization failed for string: " + cleanText, e);
    }
}
 
Example 10
Source File: ChineseMatcher.java    From zxl with Apache License 2.0
public double oneWayMatch(String text1, String text2) {
	try {
		Set<String> set = new HashSet<String>(10);
		TokenStream tokenStream = smartChineseAnalyzer.tokenStream("field", text1);
		CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
		tokenStream.reset();
		while (tokenStream.incrementToken()) {
			set.add(charTermAttribute.toString());
		}
		int originalCount = set.size();
		tokenStream.end();
		tokenStream.close();
		tokenStream = smartChineseAnalyzer.tokenStream("field", text2);
		charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
		tokenStream.reset();
		int smallWeightWordsCount = 0;
		int denominator = 0;
		while (tokenStream.incrementToken()) {
			denominator++;
			String word = charTermAttribute.toString();
			int tempSize = set.size();
			set.add(word);
			if (tempSize + 1 == set.size() && smallWeightWords.contains(word)) {
				smallWeightWordsCount++;
			}
		}
		int numerator = set.size() - originalCount;
		double unmatchRate = (smallWeightWordsCount * smallWeight + numerator - ((double) smallWeightWordsCount)) / denominator;
		tokenStream.end();
		tokenStream.close();
		return unmatchRate;
	} catch (IOException e) {
		return 1D;
	}
}
 
Example 11
Source File: MinHashFilterTest.java    From lucene-solr with Apache License 2.0
private ArrayList<String> getTokens(TokenStream ts) throws IOException {
  ArrayList<String> tokens = new ArrayList<>();
  ts.reset();
  while (ts.incrementToken()) {
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    String token = new String(termAttribute.buffer(), 0, termAttribute.length());
    tokens.add(token);
  }
  ts.end();
  ts.close();

  return tokens;
}
 
Example 12
Source File: FlexibleQuery.java    From linden with Apache License 2.0
private List<SegToken> parseToTokens(String content, float boost) throws IOException {
  List<SegToken> tokens = new ArrayList<>();
  TokenStream stream = analyzer.tokenStream("", new StringReader(content));
  try {
    CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      tokens.add(new SegToken(term.toString(), boost));
    }
  } finally {
    if (stream != null) stream.close();
  }
  return tokens;
}
 
Example 13
Source File: MinHashQParser.java    From lucene-solr with Apache License 2.0
private void getHashesFromTokenStream(String analyserField, ArrayList<BytesRef> hashes) throws Exception {
  TokenStream ts = getReq().getSchema().getIndexAnalyzer().tokenStream(analyserField, qstr);
  TermToBytesRefAttribute termAttribute = ts.getAttribute(TermToBytesRefAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    BytesRef term = termAttribute.getBytesRef();
    hashes.add(BytesRef.deepCopyOf(term));
  }
  ts.end();
  ts.close();
}
 
Example 14
Source File: EdismaxQueryConverter.java    From solr-researcher with Apache License 2.0
protected String[] analyze(String text, Analyzer analyzer) throws IOException {
  List<String> result = new ArrayList<String>();
  TokenStream stream = analyzer.tokenStream("", new StringReader(text));
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    result.add(new String(termAtt.buffer(), 0, termAtt.length()));
  }
  stream.end();
  stream.close();

  return result.toArray(new String[result.size()]);
}
 
Example 15
Source File: MinHashFilterTest.java    From lucene-solr with Apache License 2.0
@Test
public void testTokenStream2() throws IOException {
  TokenStream ts = createTokenStream(5, "woof woof woof woof woof" + " " + "woof woof woof woof puff", 100, 1, 1,
      false);
  ArrayList<String> tokens = getTokens(ts);
  ts.close();

  assertEquals(100, tokens.size());
}
 
Example 16
Source File: AbstractSearchTest.java    From database with GNU General Public License v2.0
private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
	TokenStream s = a.tokenStream(null, new StringReader(text));
	int ix = 0;
	
	s.reset();
	
	while (s.incrementToken()) {
		final CharTermAttribute term = s.getAttribute(CharTermAttribute.class);
		final String word = term.toString();
		assertTrue(ix < expected.length);
		assertEquals(expected[ix++], word);
	}
	s.close();
	assertEquals(ix, expected.length);
}
 
Example 17
Source File: TestLegacyFieldReuse.java    From lucene-solr with Apache License 2.0
private void assertNumericContents(int value, TokenStream ts) throws IOException {
  assertTrue(ts instanceof LegacyNumericTokenStream);
  LegacyNumericTermAttribute numericAtt = ts.getAttribute(LegacyNumericTermAttribute.class);
  ts.reset();
  boolean seen = false;
  while (ts.incrementToken()) {
    if (numericAtt.getShift() == 0) {
      assertEquals(value, numericAtt.getRawValue());
      seen = true;
    }
  }
  ts.end();
  ts.close();
  assertTrue(seen);
}
 
Example 18
Source File: SortFormTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private BytesRef sortKeyFromTokenStream(TokenStream stream) throws Exception {
    TermToBytesRefAttribute termAttr = stream.getAttribute(TermToBytesRefAttribute.class);
    BytesRefBuilder b = new BytesRefBuilder();
    stream.reset();
    while (stream.incrementToken()) {
        b.append(termAttr.getBytesRef());
    }
    stream.close();
    return b.get();
}
 
Example 19
Source File: SolrInformationServer.java    From SearchServices with GNU Lesser General Public License v3.0
private void addContentPropertyToDocUsingAlfrescoRepository(
        SolrInputDocument doc,
        QName propertyQName,
        long dbId,
        String locale) throws AuthenticationException, IOException
{
    long start = System.nanoTime();

    // Expensive call to be done with ContentTracker
    try (GetTextContentResponse response = repositoryClient.getTextContent(dbId, propertyQName, null)) {
        addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.SpecializedFieldType.TRANSFORMATION_STATUS, response);
        addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.SpecializedFieldType.TRANSFORMATION_EXCEPTION, response);
        addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.SpecializedFieldType.TRANSFORMATION_TIME, response);

        final String textContent = textContentFrom(response);

        if (fingerprintHasBeenEnabledOnThisInstance && !textContent.isBlank()) {
            Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
            TokenStream ts = analyzer.tokenStream("dummy_field", textContent);
            CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                StringBuilder tokenBuff = new StringBuilder();
                char[] buff = termAttribute.buffer();

                for (int i = 0; i < termAttribute.length(); i++) {
                    tokenBuff.append(Integer.toHexString(buff[i]));
                }
                doc.addField(FINGERPRINT_FIELD, tokenBuff.toString());

            }
            ts.end();
            ts.close();
        }

        this.getTrackerStats().addDocTransformationTime(System.nanoTime() - start);

        String storedField = dataModel.getStoredContentField(propertyQName);
        doc.setField(storedField, "\u0000" + languageFrom(locale) + "\u0000" + textContent);

        dataModel.getIndexedFieldNamesForProperty(propertyQName)
                .getFields()
                .forEach(field -> addFieldIfNotSet(doc, field.getField()));
    }
}
 
Example 20
Source File: NGramTokenFilterTest.java    From lucene-solr with Apache License 2.0
public void testSupplementaryCharacters() throws IOException {
  for (int i = 0; i < 20; i++) {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;

    TokenStream tk = new KeywordTokenizer();
    ((Tokenizer)tk).setReader(new StringReader(s));
    tk = new NGramTokenFilter(tk, minGram, maxGram, preserveOriginal);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();

    if (codePointCount < minGram && preserveOriginal) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      assertEquals(s, termAtt.toString());
    }
    
    for (int start = 0; start < codePointCount; ++start) {
      for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        final int startIndex = Character.offsetByCodePoints(s, 0, start);
        final int endIndex = Character.offsetByCodePoints(s, 0, end);
        assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
      }
    }
    
    if (codePointCount > maxGram && preserveOriginal) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      assertEquals(s, termAtt.toString());
    }
    
    assertFalse(tk.incrementToken());
    tk.close();
  }
}