Java Code Examples for org.apache.lucene.index.TermsEnum#seekCeil()

The following examples show how to use org.apache.lucene.index.TermsEnum#seekCeil(). They are extracted from open source projects; you can go to the original project or source file by following the reference above each example.
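TermsEnum#seekCeil(BytesRef) positions the enumeration on the smallest term that is greater than or equal to the given text and reports the outcome as a TermsEnum.SeekStatus: FOUND (the exact term exists), NOT_FOUND (the enum is on the next greater term), or END (no term is greater than or equal to the text). Before the examples, here is a minimal sketch of that contract; it is not taken from any of the projects below, and the IndexReader argument and the "body" field name are placeholder assumptions.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class SeekCeilSketch {
  static void demoSeekCeil(IndexReader reader) throws IOException {
    Terms terms = MultiTerms.getTerms(reader, "body"); // "body" is an assumed field name
    if (terms == null) {
      return; // field does not exist or has no indexed terms
    }
    TermsEnum termsEnum = terms.iterator();
    TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef("pi"));
    if (status == TermsEnum.SeekStatus.FOUND) {
      // positioned exactly on "pi"
      System.out.println("exact term: " + termsEnum.term().utf8ToString());
    } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
      // positioned on the smallest term greater than "pi"
      System.out.println("ceiling term: " + termsEnum.term().utf8ToString());
    } else {
      // SeekStatus.END: every term in the field is smaller than "pi"; the enum is unpositioned
      System.out.println("no term >= pi");
    }
  }
}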
Example 1
Source File: DocToDoubleVectorUtils.java    From lucene-solr with Apache License 2.0
/**
 * create a sparse <code>Double</code> vector given doc and field term vectors using local frequency of the terms in the doc
 *
 * @param docTerms   term vectors for a given document
 * @param fieldTerms field term vectors
 * @return a sparse vector of <code>Double</code>s as an array
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException {
  TermsEnum fieldTermsEnum = fieldTerms.iterator();
  Double[] freqVector = null;
  if (docTerms != null && fieldTerms.size() > -1) {
    freqVector = new Double[(int) fieldTerms.size()];
    int i = 0;
    TermsEnum docTermsEnum = docTerms.iterator();
    BytesRef term;
    while ((term = fieldTermsEnum.next()) != null) {
      TermsEnum.SeekStatus seekStatus = docTermsEnum.seekCeil(term);
      if (seekStatus.equals(TermsEnum.SeekStatus.END)) {
        docTermsEnum = docTerms.iterator();
      }
      if (seekStatus.equals(TermsEnum.SeekStatus.FOUND)) {
        long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document
        freqVector[i] = Long.valueOf(termFreqLocal).doubleValue();
      } else {
        freqVector[i] = 0d;
      }
      i++;
    }
  }
  return freqVector;
}
 
Example 2
Source File: SrndTermQuery.java    From lucene-solr with Apache License 2.0
@Override
public void visitMatchingTerms(
  IndexReader reader,
  String fieldName,
  MatchingTermVisitor mtv) throws IOException
{
  /* check term presence in index here for symmetry with other SimpleTerm's */
  Terms terms = MultiTerms.getTerms(reader, fieldName);
  if (terms != null) {
    TermsEnum termsEnum = terms.iterator();

    TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getTermText()));
    if (status == TermsEnum.SeekStatus.FOUND) {
      mtv.visitMatchingTerm(getLuceneTerm(fieldName));
    }
  }
}
 
Example 3
Source File: LuceneIndexCorpus.java    From word2vec-lucene with Apache License 2.0
@Override
public void learnVocab() throws IOException {
  super.learnVocab();

  final String field = ((LuceneIndexConfig)config).getField();
  final Terms terms = MultiFields.getTerms(reader, field);
  final BytesRef maxTerm = terms.getMax();
  final BytesRef minTerm = terms.getMin();
  Query q = new TermRangeQuery(field, minTerm, maxTerm, true, true);
  IndexSearcher searcher = new IndexSearcher(reader);
  topDocs = searcher.search(q, Integer.MAX_VALUE);

  TermsEnum termsEnum = null;
  termsEnum = terms.iterator(termsEnum);

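  // seekCeil on an empty BytesRef positions the enum on the first (smallest) term in the field, if any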
  termsEnum.seekCeil(new BytesRef());
  BytesRef term = termsEnum.term();
  while(term != null){
    int p = addWordToVocab(term.utf8ToString());
    vocab[p].setCn((int)termsEnum.totalTermFreq());
    term = termsEnum.next();
  }
}
 
Example 4
Source File: SrndTruncQuery.java    From lucene-solr with Apache License 2.0
@Override
public void visitMatchingTerms(
  IndexReader reader,
  String fieldName,
  MatchingTermVisitor mtv) throws IOException
{
  int prefixLength = prefix.length();
  Terms terms = MultiTerms.getTerms(reader, fieldName);
  if (terms != null) {
    Matcher matcher = pattern.matcher("");
    try {
      TermsEnum termsEnum = terms.iterator();

      TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);
      BytesRef text;
      if (status == TermsEnum.SeekStatus.FOUND) {
        text = prefixRef;
      } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
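        // NOT_FOUND: the enum is positioned on the smallest term greater than prefixRef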
        text = termsEnum.term();
      } else {
        text = null;
      }

      while(text != null) {
        if (text != null && StringHelper.startsWith(text, prefixRef)) {
          String textString = text.utf8ToString();
          matcher.reset(textString.substring(prefixLength));
          if (matcher.matches()) {
            mtv.visitMatchingTerm(new Term(fieldName, textString));
          }
        } else {
          break;
        }
        text = termsEnum.next();
      }
    } finally {
      matcher.reset();
    }
  }
}
 
Example 5
Source File: SrndPrefixQuery.java    From lucene-solr with Apache License 2.0
@Override
public void visitMatchingTerms(
  IndexReader reader,
  String fieldName,
  MatchingTermVisitor mtv) throws IOException
{
  /* inspired by PrefixQuery.rewrite(): */
  Terms terms = MultiTerms.getTerms(reader, fieldName);
  if (terms != null) {
    TermsEnum termsEnum = terms.iterator();

    boolean skip = false;
    TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getPrefix()));
    if (status == TermsEnum.SeekStatus.FOUND) {
      mtv.visitMatchingTerm(getLucenePrefixTerm(fieldName));
    } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
      if (StringHelper.startsWith(termsEnum.term(), prefixRef)) {
        mtv.visitMatchingTerm(new Term(fieldName, termsEnum.term().utf8ToString()));
      } else {
        skip = true;
      }
    } else {
      // EOF
      skip = true;
    }

    if (!skip) {
      while(true) {
        BytesRef text = termsEnum.next();
        if (text != null && StringHelper.startsWith(text, prefixRef)) {
          mtv.visitMatchingTerm(new Term(fieldName, text.utf8ToString()));
        } else {
          break;
        }
      }
    }
  }
}
 
Example 6
Source File: SecureAtomicReaderTestBase.java    From incubator-retired-blur with Apache License 2.0
private int getTermWithSeekCount(Fields fields, String field) throws IOException {
  Terms terms = fields.terms(field);
  TermsEnum termsEnum = terms.iterator(null);
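  // seeking to an empty term positions the enum on the first term in the field, or returns END if there are none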
  SeekStatus seekStatus = termsEnum.seekCeil(new BytesRef(""));
  if (seekStatus == SeekStatus.END) {
    return 0;
  }
  System.out.println(termsEnum.term().utf8ToString());
  int count = 1;
  while (termsEnum.next() != null) {
    count++;
  }
  return count;
}
 
Example 7
Source File: MultiPhrasePrefixQuery.java    From crate with Apache License 2.0
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
        Terms _terms = leaf.reader().terms(field);
        if (_terms == null) {
            continue;
        }

        TermsEnum termsEnum = _terms.iterator();
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
        if (TermsEnum.SeekStatus.END == seekStatus) {
            continue;
        }

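        // a non-END seekCeil leaves the enum positioned on the first term >= the prefix, so term() is the starting point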
        for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
            if (!StringHelper.startsWith(term, prefix.bytes())) {
                break;
            }

            terms.add(new Term(field, BytesRef.deepCopyOf(term)));
            if (terms.size() >= maxExpansions) {
                return;
            }
        }
    }
}
 
Example 8
Source File: TestMultiPhraseQuery.java    From lucene-solr with Apache License 2.0
public void testPhrasePrefix() throws IOException {
  Directory indexStore = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
  add("blueberry pie", writer);
  add("blueberry strudel", writer);
  add("blueberry pizza", writer);
  add("blueberry chewing gum", writer);
  add("bluebird pizza", writer);
  add("bluebird foobar pizza", writer);
  add("piccadilly circus", writer);
  
  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  
  // search for "blueberry pi*":
  MultiPhraseQuery.Builder query1builder = new MultiPhraseQuery.Builder();
  // search for "strawberry pi*":
  MultiPhraseQuery.Builder query2builder = new MultiPhraseQuery.Builder();
  query1builder.add(new Term("body", "blueberry"));
  query2builder.add(new Term("body", "strawberry"));
  
  LinkedList<Term> termsWithPrefix = new LinkedList<>();
  
  // this TermEnum gives "piccadilly", "pie" and "pizza".
  String prefix = "pi";
  TermsEnum te = MultiTerms.getTerms(reader,"body").iterator();
  te.seekCeil(new BytesRef(prefix));
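  // after seekCeil, te is positioned on the first term >= "pi"; the loop below reads from that position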
  do {
    String s = te.term().utf8ToString();
    if (s.startsWith(prefix)) {
      termsWithPrefix.add(new Term("body", s));
    } else {
      break;
    }
  } while (te.next() != null);
  
  query1builder.add(termsWithPrefix.toArray(new Term[0]));
  MultiPhraseQuery query1 = query1builder.build();
  assertEquals("body:\"blueberry (piccadilly pie pizza)\"", query1.toString());
  
  query2builder.add(termsWithPrefix.toArray(new Term[0]));
  MultiPhraseQuery query2 = query2builder.build();
  assertEquals("body:\"strawberry (piccadilly pie pizza)\"", query2.toString());
  
  ScoreDoc[] result;
  result = searcher.search(query1, 1000).scoreDocs;
  assertEquals(2, result.length);
  result = searcher.search(query2, 1000).scoreDocs;
  assertEquals(0, result.length);
  
  // search for "blue* pizza":
  MultiPhraseQuery.Builder query3builder = new MultiPhraseQuery.Builder();
  termsWithPrefix.clear();
  prefix = "blue";
  te.seekCeil(new BytesRef(prefix));
  
  do {
    if (te.term().utf8ToString().startsWith(prefix)) {
      termsWithPrefix.add(new Term("body", te.term().utf8ToString()));
    }
  } while (te.next() != null);
  
  query3builder.add(termsWithPrefix.toArray(new Term[0]));
  query3builder.add(new Term("body", "pizza"));
  
  MultiPhraseQuery query3 = query3builder.build();
  
  result = searcher.search(query3, 1000).scoreDocs;
  assertEquals(2, result.length); // blueberry pizza, bluebird pizza
  assertEquals("body:\"(blueberry bluebird) pizza\"", query3.toString());
  
  // test slop:
  query3builder.setSlop(1);
  query3 = query3builder.build();
  result = searcher.search(query3, 1000).scoreDocs;
  
  // just make sure no exc:
  searcher.explain(query3, 0);
  
  assertEquals(3, result.length); // blueberry pizza, bluebird pizza, bluebird
                                  // foobar pizza
  
  MultiPhraseQuery.Builder query4builder = new MultiPhraseQuery.Builder();
  expectThrows(IllegalArgumentException.class, () -> {
    query4builder.add(new Term("field1", "foo"));
    query4builder.add(new Term("field2", "foobar"));
  });
  
  writer.close();
  reader.close();
  indexStore.close();
}
 
Example 9
Source File: TestPhrasePrefixQuery.java    From lucene-solr with Apache License 2.0
public void testPhrasePrefix() throws IOException {
  Directory indexStore = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), indexStore);
  Document doc1 = new Document();
  Document doc2 = new Document();
  Document doc3 = new Document();
  Document doc4 = new Document();
  Document doc5 = new Document();
  doc1.add(newTextField("body", "blueberry pie", Field.Store.YES));
  doc2.add(newTextField("body", "blueberry strudel", Field.Store.YES));
  doc3.add(newTextField("body", "blueberry pizza", Field.Store.YES));
  doc4.add(newTextField("body", "blueberry chewing gum", Field.Store.YES));
  doc5.add(newTextField("body", "piccadilly circus", Field.Store.YES));
  writer.addDocument(doc1);
  writer.addDocument(doc2);
  writer.addDocument(doc3);
  writer.addDocument(doc4);
  writer.addDocument(doc5);
  IndexReader reader = writer.getReader();
  writer.close();
  
  IndexSearcher searcher = newSearcher(reader);
  
  // PhrasePrefixQuery query1 = new PhrasePrefixQuery();
  MultiPhraseQuery.Builder query1builder = new MultiPhraseQuery.Builder();
  // PhrasePrefixQuery query2 = new PhrasePrefixQuery();
  MultiPhraseQuery.Builder query2builder = new MultiPhraseQuery.Builder();
  query1builder.add(new Term("body", "blueberry"));
  query2builder.add(new Term("body", "strawberry"));
  
  LinkedList<Term> termsWithPrefix = new LinkedList<>();
  
  // this TermEnum gives "piccadilly", "pie" and "pizza".
  String prefix = "pi";
  TermsEnum te = MultiTerms.getTerms(reader, "body").iterator();
  te.seekCeil(new BytesRef(prefix));
  do {
    String s = te.term().utf8ToString();
    if (s.startsWith(prefix)) {
      termsWithPrefix.add(new Term("body", s));
    } else {
      break;
    }
  } while (te.next() != null);
  
  query1builder.add(termsWithPrefix.toArray(new Term[0]));
  query2builder.add(termsWithPrefix.toArray(new Term[0]));
  
  ScoreDoc[] result;
  result = searcher.search(query1builder.build(), 1000).scoreDocs;
  assertEquals(2, result.length);
  
  result = searcher.search(query2builder.build(), 1000).scoreDocs;
  assertEquals(0, result.length);
  reader.close();
  indexStore.close();
}
 
Example 10
Source File: TestBlockPostingsFormat3.java    From lucene-solr with Apache License 2.0
private void assertTermsSeeking(Terms leftTerms, Terms rightTerms) throws Exception {
  TermsEnum leftEnum = null;
  TermsEnum rightEnum = null;
  
  // just an upper bound
  int numTests = atLeast(20);
  Random random = random();
  
  // collect this number of terms from the left side
  HashSet<BytesRef> tests = new HashSet<>();
  int numPasses = 0;
  while (numPasses < 10 && tests.size() < numTests) {
    leftEnum = leftTerms.iterator();
    BytesRef term = null;
    while ((term = leftEnum.next()) != null) {
      int code = random.nextInt(10);
      if (code == 0) {
        // the term
        tests.add(BytesRef.deepCopyOf(term));
      } else if (code == 1) {
        // truncated subsequence of term
        term = BytesRef.deepCopyOf(term);
        if (term.length > 0) {
          // truncate it
          term.length = random.nextInt(term.length);
        }
      } else if (code == 2) {
        // term, but ensure a non-zero offset
        byte newbytes[] = new byte[term.length+5];
        System.arraycopy(term.bytes, term.offset, newbytes, 5, term.length);
        tests.add(new BytesRef(newbytes, 5, term.length));
      }
    }
    numPasses++;
  }
  
  ArrayList<BytesRef> shuffledTests = new ArrayList<>(tests);
  Collections.shuffle(shuffledTests, random);
  
  for (BytesRef b : shuffledTests) {
    leftEnum = leftTerms.iterator();
    rightEnum = rightTerms.iterator();
    
    assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b));
    assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b));
    
    SeekStatus leftStatus;
    SeekStatus rightStatus;
    
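    // both enums must agree on the SeekStatus and, when positioned, on the term itself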
    leftStatus = leftEnum.seekCeil(b);
    rightStatus = rightEnum.seekCeil(b);
    assertEquals(leftStatus, rightStatus);
    if (leftStatus != SeekStatus.END) {
      assertEquals(leftEnum.term(), rightEnum.term());
    }
    
    leftStatus = leftEnum.seekCeil(b);
    rightStatus = rightEnum.seekCeil(b);
    assertEquals(leftStatus, rightStatus);
    if (leftStatus != SeekStatus.END) {
      assertEquals(leftEnum.term(), rightEnum.term());
    }
  }
}