Java Code Examples for org.apache.lucene.analysis.TokenStream#end()

The following examples show how to use org.apache.lucene.analysis.TokenStream#end(). Each example is taken from an open-source project; the source file and license are listed above the code.
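All of the examples follow the same consumption contract: call reset() before the first incrementToken(), iterate until incrementToken() returns false, call end() so that end-of-stream attributes (for example the final offset) are updated, and finally call close() to release resources. Below is a minimal, self-contained sketch of that pattern; the StandardAnalyzer, field name, and input text are placeholder choices for illustration (assuming a Lucene version with a no-argument StandardAnalyzer constructor), not taken from any of the projects listed here.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamEndSketch {

    public static void main(String[] args) throws IOException {
        // "body" is an arbitrary field name used only for this sketch.
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("body", "how to use TokenStream end")) {

            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);

            stream.reset();   // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString() + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
            }
            stream.end();     // records end-of-stream state, e.g. the final offset
            System.out.println("final offset: " + offset.endOffset());
        }                     // try-with-resources closes the stream and the analyzer
    }
}

Note that end() must be called even when the stream produces no tokens at all, as Example 16 (TestEmptyTokenStream) demonstrates.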
Example 1
Source File: TokenStreamAssertions.java    From elasticsearch-analysis-openkoreantext with Apache License 2.0
public static void assertTokenStream(TokenStream tokenStream, String[] expectedCharTerms, String[] expectedTypes, int[] expectedStartOffsets, int[] expectedEndOffsets) throws IOException {
    tokenStream.reset();
    int index = 0;
    while (tokenStream.incrementToken()) {
        assertEquals(expectedCharTerms[index], tokenStream.getAttribute(CharTermAttribute.class).toString());

        if(expectedTypes != null) {
            assertEquals(expectedTypes[index], tokenStream.getAttribute(TypeAttribute.class).type());
        }

        OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class);

        if(expectedStartOffsets != null) {
            assertEquals(expectedStartOffsets[index], offsets.startOffset());
        }

        if(expectedEndOffsets != null) {
            assertEquals(expectedEndOffsets[index], offsets.endOffset());
        }

        index++;
    }
    tokenStream.end();
}
 
Example 2
Source File: TreatmentCurator.java    From hmftools with GNU General Public License v3.0
@NotNull
private static List<SearchToken> generateSearchTokens(@NotNull String searchTerm) {
    Set<SearchToken> searchTokens = Sets.newHashSet();
    TokenStream tokenStream = getSpellCheckedShingleStream(searchTerm);
    try {
        tokenStream.reset();

        while (tokenStream.incrementToken()) {
            String searchToken = tokenStream.getAttribute(CharTermAttribute.class).toString();
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            searchTokens.add(ImmutableSearchToken.of(searchToken, offsetAttribute.startOffset(), offsetAttribute.endOffset()));
        }
        tokenStream.end();
        tokenStream.close();
        return searchTokens.stream()
                .sorted(Comparator.comparing(SearchToken::length).reversed().thenComparing(SearchToken::startOffset))
                .collect(Collectors.toList());
    } catch (IOException exception) {
        LOGGER.warn("Caught IOException in treatment curation: {}", exception.getMessage());
        return Lists.newArrayList();
    }
}
 
Example 3
Source File: TestNGramFilters.java    From lucene-solr with Apache License 2.0
/**
 * Test NGramFilterFactory on tokens with payloads
 */
public void testNGramFilterPayload() throws Exception {
  Reader reader = new StringReader("test|0.1");
  TokenStream stream = whitespaceMockTokenizer(reader);
  stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
  stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);

  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    BytesRef payData = payAttr.getPayload();
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData.bytes);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example 4
Source File: AnalysisRequestHandlerBase.java    From lucene-solr with Apache License 2.0
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    tokenStream.end(); // TODO should we capture?
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }

  return tokens;
}
 
Example 5
Source File: SpellingQueryConverter.java    From lucene-solr with Apache License 2.0
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
  TokenStream stream = analyzer.tokenStream("", text);
  // TODO: support custom attributes
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {      
    Token token = new Token();
    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    token.setOffset(offset + offsetAtt.startOffset(), 
                    offset + offsetAtt.endOffset());
    token.setFlags(flagsAttValue); //overwriting any flags already set...
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    token.setPositionIncrement(posIncAtt.getPositionIncrement());
    result.add(token);
  }
  stream.end();
  stream.close();
}
 
Example 6
Source File: TestBeiderMorseFilter.java    From lucene-solr with Apache License 2.0
public void testCustomAttribute() throws IOException {
  TokenStream stream = new MockTokenizer(MockTokenizer.KEYWORD, false);
  ((Tokenizer)stream).setReader(new StringReader("D'Angelo"));
  stream = new PatternKeywordMarkerFilter(stream, Pattern.compile(".*"));
  stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
  KeywordAttribute keyAtt = stream.addAttribute(KeywordAttribute.class);
  stream.reset();
  int i = 0;
  while(stream.incrementToken()) {
    assertTrue(keyAtt.isKeyword());
    i++;
  }
  assertEquals(12, i);
  stream.end();
  stream.close();
}
 
Example 7
Source File: TestDuelingAnalyzers.java    From lucene-solr with Apache License 2.0
public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
  left.reset();
  right.reset();
  CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
  CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
  OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
  OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
  PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);
  
  while (left.incrementToken()) {
    assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
    assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
    assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
    assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
    assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
  }
  assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
  left.end();
  right.end();
  assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
  left.close();
  right.close();
}
 
Example 8
Source File: ReadTokensTask.java    From lucene-solr with Apache License 2.0
@Override
public int doLogic() throws Exception {
  List<IndexableField> fields = doc.getFields();
  Analyzer analyzer = getRunData().getAnalyzer();
  int tokenCount = 0;
  for(final IndexableField field : fields) {
    if (field.fieldType().indexOptions() == IndexOptions.NONE ||
        field.fieldType().tokenized() == false) {
      continue;
    }
    
    final TokenStream stream = field.tokenStream(analyzer, null);
    // reset the TokenStream to the first token
    stream.reset();

    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    while(stream.incrementToken()) {
      termAtt.getBytesRef();
      tokenCount++;
    }
    stream.end();
    stream.close();
  }
  totalTokenCount += tokenCount;
  return tokenCount;
}
 
Example 9
Source File: FeatureExtractorUtilities.java    From samantha with MIT License
static public Map<String, Integer> getTermFreq(Analyzer analyzer, String text, String termField) {
    TokenStream ts = analyzer.tokenStream(termField, text);
    CharTermAttribute cattr = ts.addAttribute(CharTermAttribute.class);
    Map<String, Integer> termFreq = new HashMap<>();
    try {
        ts.reset();
        while (ts.incrementToken()) {
            String term = cattr.toString();
            int cnt = termFreq.getOrDefault(
                    FeatureExtractorUtilities.composeKey(termField, term), 0);
            termFreq.put(term, cnt + 1);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error("{}", e.getMessage());
        throw new BadRequestException(e);
    }
    return termFreq;
}
 
Example 10
Source File: MLAnalayserTest.java    From SearchServices with GNU Lesser General Public License v3.0
/**
 * Check that the TokenStream yields the exact tokens specified.
 * Note that order is not checked, since the map of locales will not provide a
 * predictable ordering when enumerated.
 * 
 * The expected list of tokens may contain the same token more than once and
 * the number of instances will have to match the number found in the stream.
 * 
 * @param ts              TokenStream to inspect.
 * @param expectedTokens  List of tokens expected from the stream (order is not checked).
 * @throws IOException
 */
private void verifyTokenStream(TokenStream ts, List<String> expectedTokens) throws IOException
{
    final int expectedCount = expectedTokens.size();
    int count = 0;
    
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    
    try
    {
        ts.reset();
        while (ts.incrementToken())
        {
            count++;
            System.out.println("Token: " + termAtt.toString());
            if (expectedTokens.contains(termAtt.toString()))
            {
                // remove an instance of the term text so that it is not matched again
                expectedTokens.remove(termAtt.toString());
            }
            else
            {
                fail("Unexpected token: " + termAtt.toString());
            }
        }
        ts.end();
    }
    finally
    {
        ts.close();
    }
    
    assertEquals("Incorrect number of tokens generated.", expectedCount, count);
}
 
Example 11
Source File: TransportAnalyzeAction.java    From Elasticsearch with Apache License 2.0
private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes) {
    try {
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                lastPosition = lastPosition + increment;
            }
            tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
                lastOffset + offset.endOffset(), type.type(), extractExtendedAttributes(stream, includeAttributes)));

        }
        stream.end();
        lastOffset += offset.endOffset();
        lastPosition += posIncr.getPositionIncrement();

        lastPosition += analyzer.getPositionIncrementGap(field);
        lastOffset += analyzer.getOffsetGap(field);

    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
}
 
Example 12
Source File: MinHashQParser.java    From lucene-solr with Apache License 2.0
private void getHashesFromTokenStream(String analyserField, ArrayList<BytesRef> hashes) throws Exception {
  TokenStream ts = getReq().getSchema().getIndexAnalyzer().tokenStream(analyserField, qstr);
  TermToBytesRefAttribute termAttribute = ts.getAttribute(TermToBytesRefAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    BytesRef term = termAttribute.getBytesRef();
    hashes.add(BytesRef.deepCopyOf(term));
  }
  ts.end();
  ts.close();
}
 
Example 13
Source File: ChineseMatcher.java    From zxl with Apache License 2.0
public double oneWayMatch(String text1, String text2) {
    try {
        Set<String> set = new HashSet<String>(10);
        TokenStream tokenStream = smartChineseAnalyzer.tokenStream("field", text1);
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            set.add(charTermAttribute.toString());
        }
        int originalCount = set.size();
        tokenStream.end();
        tokenStream.close();
        tokenStream = smartChineseAnalyzer.tokenStream("field", text2);
        charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        int smallWeightWordsCount = 0;
        int denominator = 0;
        while (tokenStream.incrementToken()) {
            denominator++;
            String word = charTermAttribute.toString();
            int tempSize = set.size();
            set.add(word);
            if (tempSize + 1 == set.size() && smallWeightWords.contains(word)) {
                smallWeightWordsCount++;
            }
        }
        int numerator = set.size() - originalCount;
        double unmatchRate = (smallWeightWordsCount * smallWeight + numerator - ((double) smallWeightWordsCount)) / denominator;
        tokenStream.end();
        tokenStream.close();
        return unmatchRate;
    } catch (IOException e) {
        return 1D;
    }
}
 
Example 14
Source File: SimpleNaiveBayesDocumentClassifier.java    From lucene-solr with Apache License 2.0
/**
 * Returns a token array from the given {@link org.apache.lucene.analysis.TokenStream}
 *
 * @param tokenizedText the tokenized content of a field
 * @return a {@code String} array of the resulting tokens
 * @throws java.io.IOException If tokenization fails because there is a low-level I/O error
 */
protected String[] getTokenArray(TokenStream tokenizedText) throws IOException {
  Collection<String> tokens = new LinkedList<>();
  CharTermAttribute charTermAttribute = tokenizedText.addAttribute(CharTermAttribute.class);
  tokenizedText.reset();
  while (tokenizedText.incrementToken()) {
    tokens.add(charTermAttribute.toString());
  }
  tokenizedText.end();
  tokenizedText.close();
  return tokens.toArray(new String[0]);
}
 
Example 15
Source File: MinHashFilterTest.java    From lucene-solr with Apache License 2.0
private ArrayList<String> getTokens(TokenStream ts) throws IOException {
  ArrayList<String> tokens = new ArrayList<>();
  ts.reset();
  while (ts.incrementToken()) {
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    String token = new String(termAttribute.buffer(), 0, termAttribute.length());
    tokens.add(token);
  }
  ts.end();
  ts.close();

  return tokens;
}
 
Example 16
Source File: TestEmptyTokenStream.java    From lucene-solr with Apache License 2.0
public void testConsume() throws IOException {
  TokenStream ts = new EmptyTokenStream();
  ts.reset();
  assertFalse(ts.incrementToken());
  ts.end();
  ts.close();
  // try again with reuse:
  ts.reset();
  assertFalse(ts.incrementToken());
  ts.end();
  ts.close();
}
 
Example 17
Source File: TokenSourcesTest.java    From lucene-solr with Apache License 2.0
public void testMaxStartOffsetConsistency() throws IOException {
  FieldType tvFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  tvFieldType.setStoreTermVectors(true);
  tvFieldType.setStoreTermVectorOffsets(true);
  tvFieldType.setStoreTermVectorPositions(true);

  Directory dir = newDirectory();

  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setEnableChecks(false);//we don't necessarily consume the whole stream because of limiting by startOffset
  Document doc = new Document();
  final String TEXT = " f gg h";
  doc.add(new Field("fld_tv", analyzer.tokenStream("fooFld", TEXT), tvFieldType));
  doc.add(new TextField("fld_notv", analyzer.tokenStream("barFld", TEXT)));

  IndexReader reader;
  try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
    writer.addDocument(doc);
    reader = writer.getReader();
  }
  try {
    Fields tvFields = reader.getTermVectors(0);
    for (int maxStartOffset = -1; maxStartOffset <= TEXT.length(); maxStartOffset++) {
      TokenStream tvStream = TokenSources.getTokenStream("fld_tv", tvFields, TEXT, analyzer, maxStartOffset);
      TokenStream anaStream = TokenSources.getTokenStream("fld_notv", tvFields, TEXT, analyzer, maxStartOffset);

      //assert have same tokens, none of which has a start offset > maxStartOffset
      final OffsetAttribute tvOffAtt = tvStream.addAttribute(OffsetAttribute.class);
      final OffsetAttribute anaOffAtt = anaStream.addAttribute(OffsetAttribute.class);
      tvStream.reset();
      anaStream.reset();
      while (tvStream.incrementToken()) {
        assertTrue(anaStream.incrementToken());
        assertEquals(tvOffAtt.startOffset(), anaOffAtt.startOffset());
        if (maxStartOffset >= 0)
          assertTrue(tvOffAtt.startOffset() <= maxStartOffset);
      }
      assertFalse(anaStream.incrementToken());
      tvStream.end();
      anaStream.end();
      tvStream.close();
      anaStream.close();
    }

  } finally {
    reader.close();
  }

  dir.close();
}
 
Example 18
Source File: Zemberek2StemFilterFactory.java    From lucene-solr-analysis-turkish with Apache License 2.0
public static void main(String[] args) throws IOException {

    StringReader reader = new StringReader("elması utansın ortaklar çekişme ile");

    Map<String, String> map = new HashMap<>();
    map.put("strategy", "frequency");

    Zemberek2StemFilterFactory factory = new Zemberek2StemFilterFactory(map);

    WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
    whitespaceTokenizer.setReader(reader);

    TokenStream stream = factory.create(whitespaceTokenizer);

    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
        String term = termAttribute.toString();
        System.out.println(term);
    }
    stream.end();
    reader.close();
}
 
Example 19
Source File: Zemberek2DeASCIIfyFilterFactory.java    From lucene-solr-analysis-turkish with Apache License 2.0
public static void main(String[] args) throws IOException {

    StringReader reader = new StringReader("kus asisi ortaklar çekişme masali");

    Map<String, String> map = new HashMap<>();

    Zemberek2DeASCIIfyFilterFactory factory = new Zemberek2DeASCIIfyFilterFactory(map);
    WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
    whitespaceTokenizer.setReader(reader);

    TokenStream stream = factory.create(whitespaceTokenizer);

    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
        String term = termAttribute.toString();
        System.out.println(term);
    }
    stream.end();
    reader.close();
}
 
Example 20
Source File: FullTextIndex.java    From database with GNU General Public License v2.0
/**
 * Index a field in a document.
 * <p>
 * Note: This method does NOT force a write on the indices. If the <i>buffer</i>
 * overflows, then there will be an index write. Once the caller is done
 * indexing, they MUST invoke {@link TokenBuffer#flush()} to force any data
 * remaining in their <i>buffer</i> to the indices.
 * <p>
 * Note: If a document is pre-existing, then the existing data for that
 * document MUST be removed unless you know that the fields to be found in
 * the document will not have changed (they may have different contents, but
 * the same fields exist in the old and new versions of the document).
 *
 * @param buffer
 *            Used to buffer writes onto the text index.
 * @param docId
 *            The document identifier.
 * @param fieldId
 *            The field identifier.
 * @param languageCode
 *            The language code -or- <code>null</code> to use the default
 *            {@link Locale}.
 * @param r
 *            A reader on the text to be indexed.
 * @param filterStopwords
 *            if true, filter stopwords from the token stream
 *
 * @see TokenBuffer#flush()
 */
public void index(final TokenBuffer<V> buffer, final V docId,
        final int fieldId, final String languageCode, final Reader r,
        final boolean filterStopwords) {

    /*
     * Note: You can invoke this on a read-only index. It is only overflow
     * of the TokenBuffer that requires a writable index. Overflow itself
     * will only occur on {document,field} tuple boundaries, so it will
     * never overflow when indexing a search query.
     */
//  assertWritable();

    int n = 0;

    // tokenize (note: docId,fieldId are not on the tokenStream, but the field could be).
    final TokenStream tokenStream = getTokenStream(languageCode, r,
            filterStopwords);

    try {

        tokenStream.reset();

        while (tokenStream.incrementToken()) {

            final CharTermAttribute term = tokenStream
                    .getAttribute(CharTermAttribute.class);

            buffer.add(docId, fieldId, term.toString());

            n++;

        }

        tokenStream.end();

        tokenStream.close();

    } catch (IOException ioe) {

        throw new RuntimeException(ioe);

    }

    if (log.isInfoEnabled())
        log.info("Indexed " + n + " tokens: docId=" + docId + ", fieldId="
                + fieldId);

}