Java Code Examples for org.apache.lucene.index.TermsEnum#seekExact()

The following examples show how to use org.apache.lucene.index.TermsEnum#seekExact(). They are taken from open-source projects; each example notes its original project, source file, and license.
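As a quick orientation: seekExact(BytesRef) positions the enum on an exact term and returns true only if that term exists, while seekExact(long) seeks by term ordinal (supported where the underlying TermsEnum exposes ords, as in the MemoryIndex and DocTermOrds examples below). The snippet that follows is a minimal sketch, not taken from any of the examples; the leafReader variable, the "body" field, and the "lucene" term are placeholders.

// Minimal sketch with placeholder names: look up one term and walk its postings.
Terms terms = leafReader.terms("body");
if (terms != null) {
  TermsEnum te = terms.iterator();
  if (te.seekExact(new BytesRef("lucene"))) {   // false means the term is not in this segment
    PostingsEnum postings = te.postings(null, PostingsEnum.NONE);
    for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
      // process doc ...
    }
  }
  // Ordinal-based variant, where ords are supported:
  // te.seekExact(0); BytesRef firstTerm = te.term();
}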
Example 1
Source File: DisjunctionMatchesIterator.java    From lucene-solr with Apache License 2.0
/**
 * Create a {@link DisjunctionMatchesIterator} over a list of terms extracted from a {@link BytesRefIterator}
 *
 * Only terms that have at least one match in the given document will be included
 */
static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException {
  Objects.requireNonNull(field);
  Terms t = context.reader().terms(field);
  if (t == null)
    return null;
  TermsEnum te = t.iterator();
  PostingsEnum reuse = null;
  for (BytesRef term = terms.next(); term != null; term = terms.next()) {
    if (te.seekExact(term)) {
      PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
      if (pe.advance(doc) == doc) {
        return new TermsEnumDisjunctionMatchesIterator(new TermMatchesIterator(query, pe), terms, te, doc, query);
      }
      else {
        reuse = pe;
      }
    }
  }
  return null;
}
 
Example 2
Source File: PayloadFilteredTermIntervalsSource.java    From lucene-solr with Apache License 2.0
@Override
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
  Terms terms = ctx.reader().terms(field);
  if (terms == null)
    return null;
  if (terms.hasPositions() == false) {
    throw new IllegalArgumentException("Cannot create an IntervalIterator over field " + field + " because it has no indexed positions");
  }
  if (terms.hasPayloads() == false) {
    throw new IllegalArgumentException("Cannot create a payload-filtered iterator over field " + field + " because it has no indexed payloads");
  }
  TermsEnum te = terms.iterator();
  if (te.seekExact(term) == false) {
    return null;
  }
  return intervals(te);
}
 
Example 3
Source File: PayloadFilteredTermIntervalsSource.java    From lucene-solr with Apache License 2.0
@Override
public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
  Terms terms = ctx.reader().terms(field);
  if (terms == null)
    return null;
  if (terms.hasPositions() == false) {
    throw new IllegalArgumentException("Cannot create an IntervalIterator over field " + field + " because it has no indexed positions");
  }
  if (terms.hasPayloads() == false) {
    throw new IllegalArgumentException("Cannot create a payload-filtered iterator over field " + field + " because it has no indexed payloads");
  }
  TermsEnum te = terms.iterator();
  if (te.seekExact(term) == false) {
    return null;
  }
  return matches(te, doc);
}
 
Example 4
Source File: TestRTGBase.java    From lucene-solr with Apache License 2.0
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
  Terms terms = MultiTerms.getTerms(r, t.field());
  if (terms == null) return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator();
  if (!termsEnum.seekExact(termBytes)) {
    return -1;
  }
  PostingsEnum docs = termsEnum.postings(null, PostingsEnum.NONE);
  docs = BitsFilteredPostingsEnum.wrap(docs, MultiBits.getLiveDocs(r));
  int id = docs.nextDoc();
  if (id != DocIdSetIterator.NO_MORE_DOCS) {
    int next = docs.nextDoc();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, next);
  }
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
 
Example 5
Source File: MutatableAction.java    From incubator-retired-blur with Apache License 2.0
private IterableRow getIterableRow(String rowId, IndexSearcherCloseable searcher) throws IOException {
  IndexReader indexReader = searcher.getIndexReader();
  BytesRef rowIdRef = new BytesRef(rowId);
  List<AtomicReaderTermsEnum> possibleRowIds = new ArrayList<AtomicReaderTermsEnum>();
  for (AtomicReaderContext atomicReaderContext : indexReader.leaves()) {
    AtomicReader atomicReader = atomicReaderContext.reader();
    Fields fields = atomicReader.fields();
    if (fields == null) {
      continue;
    }
    Terms terms = fields.terms(BlurConstants.ROW_ID);
    if (terms == null) {
      continue;
    }
    // Older Lucene 4.x API used by Blur: iterator(reuse) and seekExact(term, useCache).
    TermsEnum termsEnum = terms.iterator(null);
    if (!termsEnum.seekExact(rowIdRef, true)) {
      continue;
    }
    // need atomic read as well...
    possibleRowIds.add(new AtomicReaderTermsEnum(atomicReader, termsEnum));
  }
  if (possibleRowIds.isEmpty()) {
    return null;
  }
  return new IterableRow(rowId, getRecords(possibleRowIds));
}
 
Example 6
Source File: FuzzyLikeThisQuery.java    From lucene-solr with Apache License 2.0
private Query newTermQuery(IndexReader reader, Term term) throws IOException {
  if (ignoreTF) {
    return new ConstantScoreQuery(new TermQuery(term));
  } else {
    // we build an artificial TermStates that will give an overall df and ttf
    // equal to 1
    TermStates context = new TermStates(reader.getContext());
    for (LeafReaderContext leafContext : reader.leaves()) {
      Terms terms = leafContext.reader().terms(term.field());
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(term.bytes())) {
          int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1
          context.register(termsEnum.termState(), leafContext.ord, freq, freq);
        }
      }
    }
    return new TermQuery(term, context);
  }
}
 
Example 7
Source File: TermsIncludingScoreQuery.java    From lucene-solr with Apache License 2.0
@Override
protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum) throws IOException {
  BytesRef spare = new BytesRef();
  PostingsEnum postingsEnum = null;
  for (int i = 0; i < terms.size(); i++) {
    if (termsEnum.seekExact(terms.get(ords[i], spare))) {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
      float score = TermsIncludingScoreQuery.this.scores[ords[i]];
      for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) {
        // I prefer this:
        /*if (scores[doc] < score) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }*/
        // But this behaves the same as MVInnerScorer and only then the tests will pass:
        if (!matchingDocs.get(doc)) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }
      }
    }
  }
}
 
Example 8
Source File: TermsIncludingScoreQuery.java    From lucene-solr with Apache License 2.0
protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum) throws IOException {
  BytesRef spare = new BytesRef();
  PostingsEnum postingsEnum = null;
  for (int i = 0; i < terms.size(); i++) {
    if (termsEnum.seekExact(terms.get(ords[i], spare))) {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
      float score = TermsIncludingScoreQuery.this.scores[ords[i]];
      for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) {
        matchingDocs.set(doc);
        // In the case the same doc is also related to another doc, a score might be overwritten. I think this
        // can only happen in a many-to-many relation
        scores[doc] = score;
      }
    }
  }
}
 
Example 9
Source File: TestMemoryIndex.java    From lucene-solr with Apache License 2.0
public void testSeekByTermOrd() throws IOException {
  MemoryIndex mi = new MemoryIndex();
  mi.addField("field", "some terms be here", analyzer);
  IndexSearcher searcher = mi.createSearcher();
  LeafReader reader = (LeafReader) searcher.getIndexReader();
  TermsEnum terms = reader.terms("field").iterator();
  terms.seekExact(0);
  assertEquals("be", terms.term().utf8ToString());
  TestUtil.checkReader(reader);
}
 
Example 10
Source File: FeatureSortField.java    From lucene-solr with Apache License 2.0
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
  Terms terms = context.reader().terms(field);
  if (terms == null) {
    currentReaderPostingsValues = null;
  } else {
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(featureName) == false) {
      currentReaderPostingsValues = null;
    } else {
      currentReaderPostingsValues = termsEnum.postings(currentReaderPostingsValues, PostingsEnum.FREQS);
    }
  }
}
 
Example 11
Source File: FeatureDoubleValuesSource.java    From lucene-solr with Apache License 2.0
@Override
public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
  Terms terms = ctx.reader().terms(field);
  if (terms == null) {
    return DoubleValues.EMPTY;
  } else {
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(featureName) == false) {
      return DoubleValues.EMPTY;
    } else {
      PostingsEnum currentReaderPostingsValues = termsEnum.postings(null, PostingsEnum.FREQS);
      return new FeatureDoubleValues(currentReaderPostingsValues);
    }
  }
}
 
Example 12
Source File: TermGroupFacetCollector.java    From lucene-solr with Apache License 2.0
SegmentResult(int[] counts, int total, int missingCountIndex, TermsEnum tenum, int startFacetOrd, int endFacetOrd) throws IOException {
  super(counts, total - counts[missingCountIndex], counts[missingCountIndex],
      endFacetOrd == missingCountIndex + 1 ?  missingCountIndex : endFacetOrd);
  this.tenum = tenum;
  this.mergePos = startFacetOrd;
  if (tenum != null) {
    tenum.seekExact(mergePos);
    mergeTerm = tenum.term();
  }
}
 
Example 13
Source File: TermIntervalsSource.java    From lucene-solr with Apache License 2.0
@Override
public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
  Terms terms = ctx.reader().terms(field);
  if (terms == null)
    return null;
  if (terms.hasPositions() == false) {
    throw new IllegalArgumentException("Cannot create an IntervalIterator over field " + field + " because it has no indexed positions");
  }
  TermsEnum te = terms.iterator();
  if (te.seekExact(term) == false) {
    return null;
  }
  return matches(te, doc, field);
}
 
Example 14
Source File: TermIntervalsSource.java    From lucene-solr with Apache License 2.0
@Override
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
  Terms terms = ctx.reader().terms(field);
  if (terms == null)
    return null;
  if (terms.hasPositions() == false) {
    throw new IllegalArgumentException("Cannot create an IntervalIterator over field " + field + " because it has no indexed positions");
  }
  TermsEnum te = terms.iterator();
  if (te.seekExact(term) == false) {
    return null;
  }
  return intervals(term, te);
}
 
Example 15
Source File: BlurUtil.java    From incubator-retired-blur with Apache License 2.0
private static void applyFamily(OpenBitSet bits, String family, AtomicReader atomicReader, int primeDocRowId,
    int numberOfDocsInRow, Bits liveDocs) throws IOException {
  Fields fields = atomicReader.fields();
  Terms terms = fields.terms(BlurConstants.FAMILY);
  // Older Lucene 4.x API, as in Example 5: iterator(reuse), seekExact(term, useCache) and docs(...).
  TermsEnum iterator = terms.iterator(null);
  BytesRef text = new BytesRef(family);
  int lastDocId = primeDocRowId + numberOfDocsInRow;
  if (iterator.seekExact(text, true)) {
    DocsEnum docs = iterator.docs(liveDocs, null, DocsEnum.FLAG_NONE);
    int doc = primeDocRowId;
    while ((doc = docs.advance(doc)) < lastDocId) {
      bits.set(doc - primeDocRowId);
    }
  }
}
 
Example 16
Source File: FieldOffsetStrategy.java    From lucene-solr with Apache License 2.0
protected void createOffsetsEnumsForTerms(BytesRef[] sourceTerms, Terms termsIndex, int doc, List<OffsetsEnum> results) throws IOException {
  TermsEnum termsEnum = termsIndex.iterator();//does not return null
  for (BytesRef term : sourceTerms) {
    if (termsEnum.seekExact(term)) {
      PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
      if (postingsEnum == null) {
        // no offsets or positions available
        throw new IllegalArgumentException("field '" + getField() + "' was indexed without offsets, cannot highlight");
      }
      if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
        results.add(new OffsetsEnum.OfPostings(term, postingsEnum));
      }
    }
  }
}
 
Example 17
Source File: DirectoryTaxonomyWriter.java    From lucene-solr with Apache License 2.0
/**
 * Look up the given category in the cache and/or the on-disk storage,
 * returning the category's ordinal, or a negative number in case the
 * category does not yet exist in the taxonomy.
 */
protected synchronized int findCategory(FacetLabel categoryPath) throws IOException {
  // If we can find the category in the cache, or we know the cache is
  // complete, we can return the response directly from it
  int res = cache.get(categoryPath);
  if (res >= 0 || cacheIsComplete) {
    return res;
  }

  cacheMisses.incrementAndGet();
  // After a few cache misses, it makes sense to read all the categories
  // from disk and into the cache. The reason not to do this on the first
  // cache miss (or even when opening the writer) is that it will
  // significantly slow down the case when a taxonomy is opened just to
  // add one category. The idea of only spending a long time on reading
  // after enough time was spent on cache misses is known as an "online
  // algorithm".
  perhapsFillCache();
  res = cache.get(categoryPath);
  if (res >= 0 || cacheIsComplete) {
    // if after filling the cache from the info on disk, the category is in it
    // or the cache is complete, return whatever cache.get returned.
    return res;
  }

  // if we get here, it means the category is not in the cache, and it is not
  // complete, and therefore we must look for the category on disk.
  
  // We need to get an answer from the on-disk index.
  initReaderManager();

  int doc = -1;
  DirectoryReader reader = readerManager.acquire();
  try {
    final BytesRef catTerm = new BytesRef(FacetsConfig.pathToString(categoryPath.components, categoryPath.length));
    PostingsEnum docs = null; // reuse
    for (LeafReaderContext ctx : reader.leaves()) {
      Terms terms = ctx.reader().terms(Consts.FULL);
      if (terms != null) {
        // TODO: share per-segment TermsEnum here!
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(catTerm)) {
          // liveDocs=null because the taxonomy has no deletes
          docs = termsEnum.postings(docs, 0 /* freqs not required */);
          // if the term was found, we know it has exactly one document.
          doc = docs.nextDoc() + ctx.docBase;
          break;
        }
      }
    }
  } finally {
    readerManager.release(reader);
  }
  if (doc > 0) {
    addToCache(categoryPath, doc);
  }
  return doc;
}
 
Example 18
Source File: SpellChecker.java    From lucene-solr with Apache License 2.0
/**
 * Indexes the data from the given {@link Dictionary}.
 * @param dict Dictionary to index
 * @param config {@link IndexWriterConfig} to use
 * @param fullMerge whether or not the spellcheck index should be fully merged
 * @throws AlreadyClosedException if the Spellchecker is already closed
 * @throws IOException If there is a low-level I/O error.
 */
public final void indexDictionary(Dictionary dict, IndexWriterConfig config, boolean fullMerge) throws IOException {
  synchronized (modifyCurrentIndexLock) {
    ensureOpen();
    final Directory dir = this.spellIndex;
    final IndexWriter writer = new IndexWriter(dir, config);
    IndexSearcher indexSearcher = obtainSearcher();
    final List<TermsEnum> termsEnums = new ArrayList<>();

    final IndexReader reader = searcher.getIndexReader();
    if (reader.maxDoc() > 0) {
      for (final LeafReaderContext ctx : reader.leaves()) {
        Terms terms = ctx.reader().terms(F_WORD);
        if (terms != null)
          termsEnums.add(terms.iterator());
      }
    }
    
    boolean isEmpty = termsEnums.isEmpty();

    try { 
      BytesRefIterator iter = dict.getEntryIterator();
      BytesRef currentTerm;
      
      terms: while ((currentTerm = iter.next()) != null) {

        String word = currentTerm.utf8ToString();
        int len = word.length();
        if (len < 3) {
          continue; // too short we bail but "too long" is fine...
        }

        if (!isEmpty) {
          for (TermsEnum te : termsEnums) {
            if (te.seekExact(currentTerm)) {
              continue terms;
            }
          }
        }

        // ok index the word
        Document doc = createDocument(word, getMin(len), getMax(len));
        writer.addDocument(doc);
      }
    } finally {
      releaseSearcher(indexSearcher);
    }
    if (fullMerge) {
      writer.forceMerge(1);
    }
    // close writer
    writer.close();
    // TODO: this isn't that great, maybe in the future SpellChecker should take
    // IWC in its ctor / keep its writer open?
    
    // also re-open the spell index to see our own changes when the next suggestion
    // is fetched:
    swapSearcher(dir);
  }
}
 
Example 19
Source File: DocTermOrds.java    From lucene-solr with Apache License 2.0
/** Returns the term ({@link BytesRef}) corresponding to
 *  the provided ordinal. */
public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException {
  termsEnum.seekExact(ord);
  return termsEnum.term();
}
 
Example 20
Source File: DocSetInfoCommand.java    From clue with Apache License 2.0
@Override
public void execute(Namespace args, PrintStream out) throws Exception {
  String field = args.getString("field");
  String termVal = null;
  int bucketSize = args.getInt("size");

  if (field != null){
    String[] parts = field.split(":");
    if (parts.length > 1){
      field = parts[0];
      termVal = parts[1];
    }
  }
  
  IndexReader reader = ctx.getIndexReader();
  List<LeafReaderContext> leaves = reader.leaves();
  

  PostingsEnum postingsEnum = null;
  for (LeafReaderContext leaf : leaves) {
    LeafReader atomicReader = leaf.reader();
    Terms terms = atomicReader.terms(field);
    if (terms == null) {
      continue;
    }
    if (termVal != null) {
      TermsEnum te = terms.iterator();
      
      if (te.seekExact(new BytesRef(termVal))){
        postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);
        
        int docFreq = te.docFreq();
        
        int minDocId = -1, maxDocId = -1;
        int doc, count = 0;
        
        int[] percentDocs = new int[PERCENTILES.length];
        
        int percentileIdx = 0;
        
        while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          maxDocId = doc;
          if (minDocId == -1) {
            minDocId = doc;
          }
          count ++;
          
          double perDocs = (double) count / (double) docFreq * 100.0;
          while (percentileIdx < percentDocs.length) {
            if (perDocs > PERCENTILES[percentileIdx]) {
              percentDocs[percentileIdx] = doc;
              percentileIdx++;
            } else {
              break;
            }
          }
        }
        
        // calculate histogram          
        int[] buckets = null;
        if (maxDocId > 0) {
          buckets = new int[maxDocId / bucketSize + 1];
          
          postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);
          while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            int bucketIdx = doc / bucketSize;
            buckets[bucketIdx]++;
          }
        }
        
        double density = (double) docFreq / (double) (maxDocId - minDocId) ; 
        out.println(String.format("min: %d, max: %d, count: %d, density: %.2f", minDocId, maxDocId, docFreq, density));
        out.println("percentiles: " + Arrays.toString(PERCENTILES) + " => " + Arrays.toString(percentDocs));
        out.println("histogram: (bucketsize=" + bucketSize+")");
        out.println(Arrays.toString(buckets));
      }
    }
  }
}