Java Code Examples for org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute#getBytesRef()

The following examples show how to use org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute#getBytesRef(). Each example comes from a real project; the project and source file are named above the code.
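All of the examples share one pattern: obtain the TermToBytesRefAttribute from a TokenStream, call reset(), read getBytesRef() after each successful incrementToken(), and call end() before the stream is closed. The minimal sketch below illustrates that pattern; the StandardAnalyzer, the "body" field name, and the printTerms helper are illustrative assumptions, not taken from any example on this page.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;

static void printTerms(String text) throws java.io.IOException {
    Analyzer analyzer = new StandardAnalyzer();   // illustrative analyzer choice
    try (TokenStream ts = analyzer.tokenStream("body", text)) {   // "body" is a placeholder field name
        TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        ts.reset();                               // must be called before the first incrementToken()
        while (ts.incrementToken()) {
            // getBytesRef() exposes an internal buffer that is reused across tokens;
            // deep-copy it if the bytes must outlive this loop iteration
            BytesRef term = BytesRef.deepCopyOf(termAtt.getBytesRef());
            System.out.println(term.utf8ToString());
        }
        ts.end();                                 // signal end of stream; try-with-resources closes it
    }
}

Because getBytesRef() returns a buffer owned by the attribute, several of the examples below call BytesRef.deepCopyOf() before storing the bytes.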
Example 1
Source File: ReadTokensTask.java    From lucene-solr with Apache License 2.0
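This benchmark task analyzes every tokenized field of a document and counts the tokens, calling getBytesRef() on each one to force the term bytes to be produced even though the value itself is not used.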
@Override
public int doLogic() throws Exception {
  List<IndexableField> fields = doc.getFields();
  Analyzer analyzer = getRunData().getAnalyzer();
  int tokenCount = 0;
  for(final IndexableField field : fields) {
    if (field.fieldType().indexOptions() == IndexOptions.NONE ||
        field.fieldType().tokenized() == false) {
      continue;
    }
    
    final TokenStream stream = field.tokenStream(analyzer, null);
    // reset the TokenStream to the first token
    stream.reset();

    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    while(stream.incrementToken()) {
      // materialize the term bytes; the benchmark only counts tokens,
      // so the returned BytesRef is deliberately discarded
      termAtt.getBytesRef();
      tokenCount++;
    }
    stream.end();
    stream.close();
  }
  totalTokenCount += tokenCount;
  return tokenCount;
}
 
Example 2
Source File: TestPerfTasksLogic.java    From lucene-solr with Apache License 2.0
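This test asserts that two analyzers collate a string identically: each stream must produce exactly one token, and the two tokens' BytesRef values must be equal.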
private void assertEqualCollation(Analyzer a1, Analyzer a2, String text)
    throws Exception {
  TokenStream ts1 = a1.tokenStream("bogus", text);
  TokenStream ts2 = a2.tokenStream("bogus", text);
  ts1.reset();
  ts2.reset();
  TermToBytesRefAttribute termAtt1 = ts1.addAttribute(TermToBytesRefAttribute.class);
  TermToBytesRefAttribute termAtt2 = ts2.addAttribute(TermToBytesRefAttribute.class);
  assertTrue(ts1.incrementToken());
  assertTrue(ts2.incrementToken());
  BytesRef bytes1 = termAtt1.getBytesRef();
  BytesRef bytes2 = termAtt2.getBytesRef();
  assertEquals(bytes1, bytes2);
  assertFalse(ts1.incrementToken());
  assertFalse(ts2.incrementToken());
  ts1.end();
  ts2.end();
  ts1.close();
  ts2.close();
}
 
Example 3
Source File: PhraseCountQueryBuilder.java    From pyramid with Apache License 2.0
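This Elasticsearch query builder analyzes the query text, builds a Term from each token's bytes, and assembles the terms into a PhraseCountQuery of span term clauses.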
protected Query doToQuery(QueryShardContext context) throws IOException {
    // Analyzer analyzer = context.getMapperService().searchAnalyzer();
    Analyzer analyzer = new WhitespaceAnalyzer();
    try (TokenStream source = analyzer.tokenStream(fieldName, value.toString())) {
        CachingTokenFilter stream = new CachingTokenFilter(new LowerCaseFilter(source));
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        if (termAtt == null) {
            return null;
        }
        List<CustomSpanTermQuery> clauses = new ArrayList<>();
        stream.reset();
        while (stream.incrementToken()) {
            Term term = new Term(fieldName, termAtt.getBytesRef());
            clauses.add(new CustomSpanTermQuery(term));
        }
        return new PhraseCountQuery(clauses.toArray(new CustomSpanTermQuery[clauses.size()]), slop, inOrder, weightedCount);
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing query text", e);
    }
}
 
Example 4
Source File: AlfrescoFieldType.java    From SearchServices with GNU Lesser General Public License v3.0
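This helper analyzes one part of a multi-term query and returns a deep copy of the single term the analyzer must produce, raising a SolrException if it yields no terms or more than one.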
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn)
{
    if (part == null || analyzerIn == null)
        return null;

    TokenStream source = null;
    try
    {
        source = analyzerIn.tokenStream(field, part);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);

        if (!source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "analyzer returned no terms for multiTerm term: " + part);
        // read and copy the bytes only after incrementToken(): the attribute's
        // buffer holds the current token's encoding once a token has been produced
        BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
        if (source.incrementToken())
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "analyzer returned too many terms for multiTerm term: " + part);

        source.end();
        return bytes;
    }
    catch (IOException e)
    {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    }
    finally
    {
        IOUtils.closeWhileHandlingException(source);
    }
}
 
Example 5
Source File: TestLongPostings.java    From lucene-solr with Apache License 2.0
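This test helper draws random strings until it finds one that analyzes to exactly one token whose bytes equal the original string.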
private String getRandomTerm(String other) throws IOException {
  Analyzer a = new MockAnalyzer(random());
  while(true) {
    String s = TestUtil.randomRealisticUnicodeString(random());
    if (other != null && s.equals(other)) {
      continue;
    }
    try (TokenStream ts = a.tokenStream("foo", s)) {
      final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
      ts.reset();

      int count = 0;
      boolean changed = false;

      while(ts.incrementToken()) {
        final BytesRef termBytes = termAtt.getBytesRef();
        if (count == 0 && !termBytes.utf8ToString().equals(s)) {
          // The value was changed during analysis.  Keep iterating so the
          // tokenStream is exhausted.
          changed = true;
        }
        count++;
      }

      ts.end();
      // Did we iterate just once and the value was unchanged?
      if (!changed && count == 1) {
        return s;
      }
    }
  }
}
 
Example 6
Source File: MinHashQParser.java    From lucene-solr with Apache License 2.0
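This query parser feeds the query string through the field's index analyzer and collects a deep copy of each emitted MinHash term.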
private void getHashesFromTokenStream(String analyserField, ArrayList<BytesRef> hashes) throws Exception {
  TokenStream ts = getReq().getSchema().getIndexAnalyzer().tokenStream(analyserField, qstr);
  TermToBytesRefAttribute termAttribute = ts.getAttribute(TermToBytesRefAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    BytesRef term = termAttribute.getBytesRef();
    // the attribute reuses its internal buffer, so each hash is deep-copied before storing
    hashes.add(BytesRef.deepCopyOf(term));
  }
  ts.end();
  ts.close();
}
 
Example 7
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
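This test helper drains a TokenStream and appends each token's bytes to a BytesRefBuilder, returning the concatenated bytes as a single BytesRef.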
private BytesRef bytesFromTokenStream(TokenStream stream) throws Exception {
    TermToBytesRefAttribute termAttr = stream.getAttribute(TermToBytesRefAttribute.class);
    stream.reset();
    BytesRefBuilder bytesRefBuilder = new BytesRefBuilder();
    while (stream.incrementToken()) {
        BytesRef bytesRef = termAttr.getBytesRef();
        bytesRefBuilder.append(bytesRef);
    }
    stream.end();   // end() must be called before close() per the TokenStream contract
    stream.close();
    return bytesRefBuilder.toBytesRef();
}
 
Example 8
Source File: IcuCollationKeyAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
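This test records the collation key produced for each of 100 random terms, then re-analyzes the same terms from several threads concurrently, asserting that every thread observes exactly the bytes recorded up front.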
private void assertThreadSafe(final Random random, final Analyzer analyzer) throws Exception {
    int numTestPoints = 100;
    int numThreads = randomIntBetween(random, 3, 5);
    final HashMap<String, BytesRef> map = new HashMap<>();
    for (int i = 0; i < numTestPoints; i++) {
        String term = randomSimpleString(random, 10);
        try (TokenStream ts = analyzer.tokenStream("fake", term)) {
            TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
            ts.reset();
            assertTrue(ts.incrementToken());
            // read the bytes only after incrementToken(), and make a copy:
            // the attribute's buffer is reused for every token
            map.put(term, BytesRef.deepCopyOf(termAtt.getBytesRef()));
            assertFalse(ts.incrementToken());
            ts.end();
        }
    }

    Thread[] threads = new Thread[numThreads];
    for (int i = 0; i < numThreads; i++) {
        threads[i] = new Thread(() -> {
            try {
                for (Map.Entry<String, BytesRef> mapping : map.entrySet()) {
                    String term = mapping.getKey();
                    BytesRef expected = mapping.getValue();
                    try (TokenStream ts = analyzer.tokenStream("fake", term)) {
                        TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
                        ts.reset();
                        ts.incrementToken();
                        // again, fetch the bytes only after incrementToken()
                        BytesRef bytes = termAtt.getBytesRef();
                        if (!expected.utf8ToString().equals(bytes.utf8ToString())) {
                            throw new IOException("unexpected: bytes=" + bytes.utf8ToString() + " expected=" + expected.utf8ToString());
                        }
                        ts.incrementToken();
                        ts.end();
                    }
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        });
    }
    for (int i = 0; i < numThreads; i++) {
        threads[i].start();
    }
    for (int i = 0; i < numThreads; i++) {
        threads[i].join();
    }
}