Java Code Examples for org.apache.lucene.analysis.TokenStream#close()

The following examples show how to use org.apache.lucene.analysis.TokenStream#close(). You can go to the original project or source file by following the links above each example.
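All of the examples share the same TokenStream lifecycle: obtain the stream, reset() it, iterate with incrementToken(), call end(), and finally close(). The snippet below is a minimal sketch of that lifecycle, not taken from any of the projects listed here; the analyzer, field name, and text are hypothetical placeholders.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamLifecycleSketch {
  // Prints every term produced by the analyzer for the given text.
  // "field" is a placeholder field name; any value works for ad-hoc analysis.
  static void printTokens(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("field", text);
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    try {
      stream.reset();                    // required before the first incrementToken()
      while (stream.incrementToken()) {  // advance to the next token
        System.out.println(termAtt.toString());
      }
      stream.end();                      // record the final offset state
    } finally {
      stream.close();                    // release resources even if iteration failed
    }
  }
}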
Example 1
Source File: TestNGramFilters.java    From lucene-solr with Apache License 2.0
/**
 * Test EdgeNGramFilterFactory on tokens with payloads
 */
public void testEdgeNGramFilterPayload() throws Exception {
  Reader reader = new StringReader("test|0.1");
  TokenStream stream = whitespaceMockTokenizer(reader);
  stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
  stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream);

  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    BytesRef payData = payAttr.getPayload();
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData.bytes);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example 2
Source File: AnalyzerTest.java    From mmseg4j-solr with Apache License 2.0
public static void printlnToken(String txt, Analyzer analyzer) throws IOException {
	System.out.println("---------"+txt.length()+"\n"+txt);
	TokenStream ts = analyzer.tokenStream("text", new StringReader(txt));
	/*// For Lucene 2.9 and below:
	for(Token t= new Token(); (t=ts.next(t)) !=null;) {
		System.out.println(t);
	}*/
	/*while(ts.incrementToken()) {
		TermAttribute termAtt = (TermAttribute)ts.getAttribute(TermAttribute.class);
		OffsetAttribute offsetAtt = (OffsetAttribute)ts.getAttribute(OffsetAttribute.class);
		TypeAttribute typeAtt = (TypeAttribute)ts.getAttribute(TypeAttribute.class);

		System.out.println("("+termAtt.term()+","+offsetAtt.startOffset()+","+offsetAtt.endOffset()+",type="+typeAtt.type()+")");
	}*/
	ts.reset();
	for(PackedTokenAttributeImpl t= new PackedTokenAttributeImpl(); (t=TokenUtils.nextToken(ts, t)) !=null;) {
		System.out.println(t);
	}
	ts.close();
}
 
Example 3
Source File: LuceneUtils.java    From SciGraph with Apache License 2.0
public static List<String> getTokenization(Analyzer analyzer, CharSequence term) {
  List<String> ret = Lists.newArrayList();

  try {
    TokenStream stream = analyzer.tokenStream("", new StringReader(term.toString()));
    CharTermAttribute token = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      ret.add(token.toString());
    }
    stream.close();
  } catch (IOException e) {
    e.printStackTrace();
  }
  return ret;
}
 
Example 4
Source File: ReadTokensTask.java    From lucene-solr with Apache License 2.0
@Override
public int doLogic() throws Exception {
  List<IndexableField> fields = doc.getFields();
  Analyzer analyzer = getRunData().getAnalyzer();
  int tokenCount = 0;
  for(final IndexableField field : fields) {
    if (field.fieldType().indexOptions() == IndexOptions.NONE ||
        field.fieldType().tokenized() == false) {
      continue;
    }
    
    final TokenStream stream = field.tokenStream(analyzer, null);
    // reset the TokenStream to the first token
    stream.reset();

    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    while(stream.incrementToken()) {
      termAtt.getBytesRef();
      tokenCount++;
    }
    stream.end();
    stream.close();
  }
  totalTokenCount += tokenCount;
  return tokenCount;
}
 
Example 5
Source File: ChineseWordAnalyzerTest.java    From word with Apache License 2.0
@Test
public void test2() {
    try{
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "叔叔亲了我妈妈也亲了我");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while(tokenStream.incrementToken()){
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[叔叔, 亲了, 我, 妈妈, 也, 亲了, 我]";
        assertEquals(expResult, words.toString());
    }catch(IOException e){
        fail("Tokenization failed: " + e.getMessage());
    }
}
 
Example 6
Source File: AnalyzerFactoryTestCase.java    From airsonic-advanced with GNU General Public License v3.0
private List<String> toTermString(String field, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzerFactory.getAnalyzer().tokenStream(field,
                new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString()
                    .replaceAll("^term\\=", ""));
        }
        stream.close();
    } catch (IOException e) {
        LoggerFactory.getLogger(AnalyzerFactoryTestCase.class)
                .error("Error during Token processing.", e);
    }
    return result;
}
 
Example 7
Source File: SuggestUtils.java    From Elasticsearch with Apache License 2.0
/** NOTE: this method closes the TokenStream, even on exception, which is awkward
 *  because really the caller who called {@link Analyzer#tokenStream} should close it,
 *  but when trying that there are recursion issues when we try to use the same
 *  TokenStream twice in the same recursion... */
public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
    int numTokens = 0;
    boolean success = false;
    try {
        stream.reset();
        consumer.reset(stream);
        while (stream.incrementToken()) {
            consumer.nextToken();
            numTokens++;
        }
        consumer.end();
        success = true;
    } finally {
        if (success) {
            stream.close();
        } else {
            IOUtils.closeWhileHandlingException(stream);
        }
    }
    return numTokens;
}
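As the comment above notes, it is usually cleaner for the caller of Analyzer#tokenStream to close the stream itself. Since TokenStream implements Closeable, that caller-side ownership can be expressed with try-with-resources; the sketch below is illustrative only, with a hypothetical analyzer, field name, and text, and assumes the consuming code does not also close the stream.

// Caller-owned lifecycle: try-with-resources closes the stream on every path,
// so the consuming code only has to reset, iterate, and end it.
try (TokenStream stream = analyzer.tokenStream("field", text)) {
    stream.reset();
    while (stream.incrementToken()) {
        // read token attributes here
    }
    stream.end();
}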
 
Example 8
Source File: TestDelimitedPayloadTokenFilterFactory.java    From lucene-solr with Apache License 2.0
public void testEncoder() throws Exception {
  Reader reader = new StringReader("the|0.1 quick|0.1 red|0.1");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
  stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);

  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    byte[] payData = payAttr.getPayload().bytes;
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example 9
Source File: Tokenizers.java    From ache with Apache License 2.0
public List<String> tokenize(String cleanText) {
    try {
        TokenStream ts = analyzer.tokenStream("cleanText", cleanText);
        CharTermAttribute cattr = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        List<String> tokens = new ArrayList<String>();
        while (ts.incrementToken()) {
            String token = cattr.toString();
            tokens.add(token);
        }
        ts.close();
        return tokens;
    } catch (IOException e) {
        throw new RuntimeException(
                "Shingle tokenization failed for string: " + cleanText, e);
    }
}
 
Example 10
Source File: ChineseMatcher.java    From zxl with Apache License 2.0
public double oneWayMatch(String text1, String text2) {
	try {
		Set<String> set = new HashSet<String>(10);
		TokenStream tokenStream = smartChineseAnalyzer.tokenStream("field", text1);
		CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
		tokenStream.reset();
		while (tokenStream.incrementToken()) {
			set.add(charTermAttribute.toString());
		}
		int originalCount = set.size();
		tokenStream.end();
		tokenStream.close();
		tokenStream = smartChineseAnalyzer.tokenStream("field", text2);
		charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
		tokenStream.reset();
		int smallWeightWordsCount = 0;
		int denominator = 0;
		while (tokenStream.incrementToken()) {
			denominator++;
			String word = charTermAttribute.toString();
			int tempSize = set.size();
			set.add(word);
			if (tempSize + 1 == set.size() && smallWeightWords.contains(word)) {
				smallWeightWordsCount++;
			}
		}
		int numerator = set.size() - originalCount;
		double unmatchRate = (smallWeightWordsCount * smallWeight + numerator - ((double) smallWeightWordsCount)) / denominator;
		tokenStream.end();
		tokenStream.close();
		return unmatchRate;
	} catch (IOException e) {
		return 1D;
	}
}
 
Example 11
Source File: MinHashFilterTest.java    From lucene-solr with Apache License 2.0
private ArrayList<String> getTokens(TokenStream ts) throws IOException {
  ArrayList<String> tokens = new ArrayList<>();
  ts.reset();
  while (ts.incrementToken()) {
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    String token = new String(termAttribute.buffer(), 0, termAttribute.length());
    tokens.add(token);
  }
  ts.end();
  ts.close();

  return tokens;
}
 
Example 12
Source File: FlexibleQuery.java    From linden with Apache License 2.0
private List<SegToken> parseToTokens(String content, float boost) throws IOException {
  List<SegToken> tokens = new ArrayList<>();
  TokenStream stream = analyzer.tokenStream("", new StringReader(content));
  try {
    CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      tokens.add(new SegToken(term.toString(), boost));
    }
  } finally {
    if (stream != null) stream.close();
  }
  return tokens;
}
 
Example 13
Source File: MinHashQParser.java    From lucene-solr with Apache License 2.0
private void getHashesFromTokenStream(String analyserField, ArrayList<BytesRef> hashes) throws Exception {
  TokenStream ts = getReq().getSchema().getIndexAnalyzer().tokenStream(analyserField, qstr);
  TermToBytesRefAttribute termAttribute = ts.getAttribute(TermToBytesRefAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    BytesRef term = termAttribute.getBytesRef();
    hashes.add(BytesRef.deepCopyOf(term));
  }
  ts.end();
  ts.close();
}
 
Example 14
Source File: EdismaxQueryConverter.java    From solr-researcher with Apache License 2.0
protected String[] analyze(String text, Analyzer analyzer) throws IOException {
  List<String> result = new ArrayList<String>();
  TokenStream stream = analyzer.tokenStream("", new StringReader(text));
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    result.add(new String(termAtt.buffer(), 0, termAtt.length()));
  }
  stream.end();
  stream.close();

  return result.toArray(new String[result.size()]);
}
 
Example 15
Source File: MinHashFilterTest.java    From lucene-solr with Apache License 2.0
@Test
public void testTokenStream2() throws IOException {
  TokenStream ts = createTokenStream(5, "woof woof woof woof woof" + " " + "woof woof woof woof puff", 100, 1, 1,
      false);
  ArrayList<String> tokens = getTokens(ts);
  ts.close();

  assertEquals(100, tokens.size());
}
 
Example 16
Source File: AbstractSearchTest.java    From database with GNU General Public License v2.0
private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
	TokenStream s = a.tokenStream(null, new StringReader(text));
	int ix = 0;
	
	s.reset();
	
	while (s.incrementToken()) {
		final CharTermAttribute term = s.getAttribute(CharTermAttribute.class);
		final String word = term.toString();
		assertTrue(ix < expected.length);
		assertEquals(expected[ix++], word);
	}
	s.close();
	assertEquals(ix, expected.length);
}
 
Example 17
Source File: TestLegacyFieldReuse.java    From lucene-solr with Apache License 2.0
private void assertNumericContents(int value, TokenStream ts) throws IOException {
  assertTrue(ts instanceof LegacyNumericTokenStream);
  LegacyNumericTermAttribute numericAtt = ts.getAttribute(LegacyNumericTermAttribute.class);
  ts.reset();
  boolean seen = false;
  while (ts.incrementToken()) {
    if (numericAtt.getShift() == 0) {
      assertEquals(value, numericAtt.getRawValue());
      seen = true;
    }
  }
  ts.end();
  ts.close();
  assertTrue(seen);
}
 
Example 18
Source File: SortFormTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private BytesRef sortKeyFromTokenStream(TokenStream stream) throws Exception {
    TermToBytesRefAttribute termAttr = stream.getAttribute(TermToBytesRefAttribute.class);
    BytesRefBuilder b = new BytesRefBuilder();
    stream.reset();
    while (stream.incrementToken()) {
        b.append(termAttr.getBytesRef());
    }
    stream.close();
    return b.get();
}
 
Example 19
Source File: SolrInformationServer.java    From SearchServices with GNU Lesser General Public License v3.0
private void addContentPropertyToDocUsingAlfrescoRepository(
        SolrInputDocument doc,
        QName propertyQName,
        long dbId,
        String locale) throws AuthenticationException, IOException
{
    long start = System.nanoTime();

    // Expensive call to be done with ContentTracker
    try (GetTextContentResponse response = repositoryClient.getTextContent(dbId, propertyQName, null)) {
        addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.SpecializedFieldType.TRANSFORMATION_STATUS, response);
        addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.SpecializedFieldType.TRANSFORMATION_EXCEPTION, response);
        addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.SpecializedFieldType.TRANSFORMATION_TIME, response);

        final String textContent = textContentFrom(response);

        if (fingerprintHasBeenEnabledOnThisInstance && !textContent.isBlank()) {
            Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
            TokenStream ts = analyzer.tokenStream("dummy_field", textContent);
            CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                StringBuilder tokenBuff = new StringBuilder();
                char[] buff = termAttribute.buffer();

                for (int i = 0; i < termAttribute.length(); i++) {
                    tokenBuff.append(Integer.toHexString(buff[i]));
                }
                doc.addField(FINGERPRINT_FIELD, tokenBuff.toString());

            }
            ts.end();
            ts.close();
        }

        this.getTrackerStats().addDocTransformationTime(System.nanoTime() - start);

        String storedField = dataModel.getStoredContentField(propertyQName);
        doc.setField(storedField, "\u0000" + languageFrom(locale) + "\u0000" + textContent);

        dataModel.getIndexedFieldNamesForProperty(propertyQName)
                .getFields()
                .forEach(field -> addFieldIfNotSet(doc, field.getField()));
    }
}
 
Example 20
Source File: NGramTokenFilterTest.java    From lucene-solr with Apache License 2.0
public void testSupplementaryCharacters() throws IOException {
  for (int i = 0; i < 20; i++) {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;

    TokenStream tk = new KeywordTokenizer();
    ((Tokenizer)tk).setReader(new StringReader(s));
    tk = new NGramTokenFilter(tk, minGram, maxGram, preserveOriginal);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();

    if (codePointCount < minGram && preserveOriginal) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      assertEquals(s, termAtt.toString());
    }
    
    for (int start = 0; start < codePointCount; ++start) {
      for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        final int startIndex = Character.offsetByCodePoints(s, 0, start);
        final int endIndex = Character.offsetByCodePoints(s, 0, end);
        assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
      }
    }
    
    if (codePointCount > maxGram && preserveOriginal) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      assertEquals(s, termAtt.toString());
    }
    
    assertFalse(tk.incrementToken());
    tk.close();
  }
}