Java Code Examples for org.apache.lucene.index.PostingsEnum#nextDoc()

The following examples show how to use org.apache.lucene.index.PostingsEnum#nextDoc(). Each example links to the original project and source file.
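
Before the examples, here is a minimal sketch of the canonical nextDoc() loop. The reader variable, the "body" field, and the term "lucene" are placeholder assumptions, not taken from any example below: nextDoc() advances the enum to the next document containing the term and returns its ID, or DocIdSetIterator.NO_MORE_DOCS once the postings are exhausted.

Terms terms = MultiTerms.getTerms(reader, "body");
if (terms != null) {
  TermsEnum termsEnum = terms.iterator();
  if (termsEnum.seekExact(new BytesRef("lucene"))) {
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
    for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
      System.out.println("doc=" + doc + " freq=" + postings.freq());
    }
  }
}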
Example 1
Source File: TestBlockPostingsFormat3.java    From lucene-solr with Apache License 2.0
/**
 * checks docs + freqs + positions + payloads, sequentially
 */
public void assertDocsAndPositionsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
  assertNotNull(leftDocs);
  assertNotNull(rightDocs);
  assertEquals(-1, leftDocs.docID());
  assertEquals(-1, rightDocs.docID());
  int docid;
  while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    assertEquals(docid, rightDocs.nextDoc());
    int freq = leftDocs.freq();
    assertEquals(freq, rightDocs.freq());
    for (int i = 0; i < freq; i++) {
      assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
      // we don't assert offsets/payloads, they are allowed to be different
    }
  }
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
}
 
Example 2
Source File: TermsIncludingScoreQuery.java    From lucene-solr with Apache License 2.0
@Override
protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum) throws IOException {
  BytesRef spare = new BytesRef();
  PostingsEnum postingsEnum = null;
  for (int i = 0; i < terms.size(); i++) {
    if (termsEnum.seekExact(terms.get(ords[i], spare))) {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
      float score = TermsIncludingScoreQuery.this.scores[ords[i]];
      for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) {
        // I prefer this:
        /*if (scores[doc] < score) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }*/
      // But this matches MVInnerScorer's behavior, and only then do the tests pass:
        if (!matchingDocs.get(doc)) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }
      }
    }
  }
}
 
Example 3
Source File: TestRTGBase.java    From lucene-solr with Apache License 2.0
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
  Terms terms = MultiTerms.getTerms(r, t.field());
  if (terms == null) return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator();
  if (!termsEnum.seekExact(termBytes)) {
    return -1;
  }
  PostingsEnum docs = termsEnum.postings(null, PostingsEnum.NONE);
  docs = BitsFilteredPostingsEnum.wrap(docs, MultiBits.getLiveDocs(r));
  int id = docs.nextDoc();
  if (id != DocIdSetIterator.NO_MORE_DOCS) {
    int next = docs.nextDoc();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, next);
  }
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
 
Example 4
Source File: LuceneUtils.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Gets 1 - entropy (i.e. 1 + sum(p log p)) of a term,
 * a function that favors terms that are focally distributed.
 * We use the definition of log-entropy weighting provided in
 * Martin and Berry (2007):
 * Entropy = 1 + sum((Pij log2(Pij)) / log2(n))
 * where Pij = frequency of term i in doc j / global frequency of term i
 *       n   = number of documents in collection
 * @param term the term whose entropy you want
 * Thanks to Vidya Vasuki for adding the hash table to
 * eliminate redundant calculation
 */
private float getEntropy(Term term) {
  if (termEntropy.containsKey(term.field()+"_"+term.text()))
    return termEntropy.get(term.field()+"_"+term.text());
  int gf = getGlobalTermFreq(term);
  double entropy = 0;
  try {
    PostingsEnum docsEnum = this.getDocsForTerm(term);
    while ((docsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      double p = docsEnum.freq(); // frequency of the term in this document
      p = p / gf;                 // Pij = local frequency / global frequency
      entropy += p * (Math.log(p) / Math.log(2)); // accumulate Pij * log2(Pij)
    }
    int n = this.getNumDocs();
    double log2n = Math.log(n) / Math.log(2);
    entropy = entropy / log2n;
  } catch (IOException e) {
    logger.info("Couldn't get term entropy for term " + term.text());
  }
  termEntropy.put(term.field()+"_"+term.text(), 1 + (float) entropy);
  return (float) (1 + entropy);
}
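
The log-entropy formula from the javadoc can also be checked in isolation. A minimal, self-contained sketch; the method name and the int[] of per-document frequencies are made-up inputs, assumed non-empty:

static float logEntropyWeight(int[] perDocFreqs, int numDocs) {
  long gf = 0;
  for (int f : perDocFreqs) gf += f;            // global frequency of the term
  double entropy = 0;
  for (int f : perDocFreqs) {
    double p = (double) f / gf;                 // Pij
    entropy += p * (Math.log(p) / Math.log(2)); // Pij * log2(Pij)
  }
  double log2n = Math.log(numDocs) / Math.log(2);
  return (float) (1 + entropy / log2n);         // 1 + sum(Pij log2 Pij) / log2(n)
}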
 
Example 5
Source File: LukeRequestHandler.java    From lucene-solr with Apache License 2.0
private static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
  PostingsEnum postingsEnum = null;
  TermsEnum termsEnum = terms.iterator();
  BytesRef text;
  // Deal with the chance that the first bunch of terms are in deleted documents. Is there a better way?
  for (int idx = 0; idx < 1000; ++idx) {
    text = termsEnum.next();
    if (text == null) { // Ran off the end of the terms enum without finding any live docs with that field in them.
      return null;
    }
    postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
    final Bits liveDocs = reader.getLiveDocs();
    if (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
        continue; // the first doc for this term is deleted; try the next term
      }
      return reader.document(postingsEnum.docID());
    }
  }
  return null;
}
 
Example 6
Source File: TermVectorEntry.java    From lucene-solr with Apache License 2.0
/**
 * Returns a new term vector entry representing the specified term, and optionally, positions.
 *
 * @param te - positioned terms iterator
 * @return term vector entry
 * @throws IOException - if there is a low level IO error.
 */
static TermVectorEntry of(TermsEnum te) throws IOException {
  Objects.requireNonNull(te);

  String termText = BytesRefUtils.decode(te.term());

  List<TermVectorEntry.TermVectorPosition> tvPositions = new ArrayList<>();
  PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
  pe.nextDoc();
  int freq = pe.freq();
  for (int i = 0; i < freq; i++) {
    int pos = pe.nextPosition();
    if (pos < 0) {
      // no position information available
      continue;
    }
    TermVectorPosition tvPos = TermVectorPosition.of(pos, pe);
    tvPositions.add(tvPos);
  }

  return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions);
}
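
A term vector behaves like a single-document inverted index, which is why the method above calls pe.nextDoc() exactly once before reading freq() and positions. A hedged usage sketch; reader, docId, and the "content" field are assumptions:

Terms tv = reader.getTermVector(docId, "content");
if (tv != null) {
  TermsEnum te = tv.iterator();
  while (te.next() != null) {
    TermVectorEntry entry = TermVectorEntry.of(te); // one entry per term
    // use entry ...
  }
}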
 
Example 7
Source File: ESIndex.java    From pyramid with Apache License 2.0
private Map<Integer,String> getTermVectorWithException(String field, String id) throws IOException {
    TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false).setPositions(true).setFieldStatistics(false)
            .setTermStatistics(false)
            .setSelectedFields(field)
            .execute().actionGet();

    Map<Integer,String> map = new HashMap<>();
    Terms terms = response.getFields().terms(field);
    if (terms==null){
        return map;
    }
    TermsEnum iterator = terms.iterator();
    PostingsEnum postings = null;
    
    for (BytesRef termBytes = null; (termBytes = iterator.next()) != null; ) {
        String term = termBytes.utf8ToString();

        postings = iterator.postings(postings, PostingsEnum.ALL);

        // There can only be one doc, since we fetched the term vector by id;
        // get that doc and record each position of the term.
        postings.nextDoc();

        int tf = postings.freq();

        for (int i = 0; i < tf; i++) {
            int pos = postings.nextPosition();
            map.put(pos, term);
        }
    }
    
    return map;
}
 
Example 8
Source File: TestBlockPostingsFormat3.java    From lucene-solr with Apache License 2.0
/**
 * checks advancing docs
 */
public void assertDocsSkipping(int docFreq, PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
  if (leftDocs == null) {
    assertNull(rightDocs);
    return;
  }
  int docid = -1;
  int averageGap = MAXDOC / (1+docFreq);
  int skipInterval = 16;

  while (true) {
    if (random().nextBoolean()) {
      // nextDoc()
      docid = leftDocs.nextDoc();
      assertEquals(docid, rightDocs.nextDoc());
    } else {
      // advance()
      int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap));
      docid = leftDocs.advance(skip);
      assertEquals(docid, rightDocs.advance(skip));
    }
    
    if (docid == DocIdSetIterator.NO_MORE_DOCS) {
      return;
    }
    // we don't assert freqs, they are allowed to be different
  }
}
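
For reference, advance(target) positions the enum on the first document whose ID is at or after target and returns that ID (or NO_MORE_DOCS); the contract requires targets to increase across calls. A tiny sketch, assuming postings was just obtained from termsEnum.postings(...):

int doc = postings.advance(42); // first doc ID >= 42 that contains the term
if (doc != DocIdSetIterator.NO_MORE_DOCS) {
  doc = postings.nextDoc();     // sequential iteration may resume from here
}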
 
Example 9
Source File: TestBlockPostingsFormat3.java    From lucene-solr with Apache License 2.0
/**
 * checks docs + freqs, sequentially
 */
public void assertDocsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
  if (leftDocs == null) {
    assertNull(rightDocs);
    return;
  }
  assertEquals(-1, leftDocs.docID());
  assertEquals(-1, rightDocs.docID());
  int docid;
  while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    assertEquals(docid, rightDocs.nextDoc());
    // we don't assert freqs, they are allowed to be different
  }
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
}
 
Example 10
Source File: TermPrefixCursor.java    From SolrTextTagger with Apache License 2.0
/**
 * Returns an IntsRef either cached or read from postingsEnum. Not null.
 * @param postingsEnum the postings to consume when there is no cached entry
 */
private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException {
  // (The cache can have empty IntsRefs)

  //lookup prefixBuf in a cache
  if (docIdsCache != null) {
    docIds = docIdsCache.get(prefixBuf);
    if (docIds != null) {
      return docIds;
    }
  }

  //read postingsEnum
  docIds = new IntsRef(termsEnum.docFreq());
  int docId;
  while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
    if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
      continue;
    }
    docIds.ints[docIds.length++] = docId;
  }
  if (docIds.length == 0)
    docIds = EMPTY_INTSREF;

  //cache
  if (docIdsCache != null) {
    ensureBufIsACopy();
    //clone is shallow; that's okay as the prefix isn't overwritten; it's just appended to
    docIdsCache.put(prefixBuf.clone(), docIds);
  }
  return docIds;
}
 
Example 11
Source File: FrequencyCtxSentenceBasedFBWorker.java    From jate with GNU Lesser General Public License v3.0
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
    List<MWESentenceContext> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); // this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                int sentenceId = -1;
                if (payload != null) {
                    sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
                }
                result.add(new MWESentenceContext(tString, sentenceId, start, end));
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
 
Example 12
Source File: TestBlockPostingsFormat3.java    From lucene-solr with Apache License 2.0
/**
 * checks advancing docs + positions
 */
public void assertPositionsSkipping(int docFreq, PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
  if (leftDocs == null || rightDocs == null) {
    assertNull(leftDocs);
    assertNull(rightDocs);
    return;
  }
  
  int docid = -1;
  int averageGap = MAXDOC / (1+docFreq);
  int skipInterval = 16;

  while (true) {
    if (random().nextBoolean()) {
      // nextDoc()
      docid = leftDocs.nextDoc();
      assertEquals(docid, rightDocs.nextDoc());
    } else {
      // advance()
      int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap));
      docid = leftDocs.advance(skip);
      assertEquals(docid, rightDocs.advance(skip));
    }
    
    if (docid == DocIdSetIterator.NO_MORE_DOCS) {
      return;
    }
    int freq = leftDocs.freq();
    assertEquals(freq, rightDocs.freq());
    for (int i = 0; i < freq; i++) {
      assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
      // we don't compare the payloads, it's allowed that one is empty etc
    }
  }
}
 
Example 13
Source File: AlfrescoLukeRequestHandler.java    From SearchServices with GNU Lesser General Public License v3.0
protected static Document getFirstLiveDoc(Terms terms, LeafReader reader)
		throws IOException {
	TermsEnum termsEnum = terms.iterator();
	if (termsEnum.next() == null) { // Ran off the end of the terms enum without finding any live docs with that field in them.
		return null;
	}
	PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
	final Bits liveDocs = reader.getLiveDocs();
	if (postingsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS
			|| (liveDocs != null && !liveDocs.get(postingsEnum.docID()))) {
		// no postings at all, or the first doc for this term is deleted
		return null;
	}
	return reader.document(postingsEnum.docID());
}
 
Example 14
Source File: FieldCacheImpl.java    From lucene-solr with Apache License 2.0
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
    throws IOException {

  final int maxDoc = reader.maxDoc();

  Terms terms = reader.terms(key.field);

  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

  final PagedBytes bytes = new PagedBytes(15);

  int startTermsBPV;

  // TODO: use Uninvert?
  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }

      startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
    } else {
      startTermsBPV = 1;
    }
  } else {
    startTermsBPV = 1;
  }

  PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
  final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);

  int termOrd = 0;

  // TODO: use Uninvert?

  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;

    while(true) {
      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      if (termOrd >= maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }

      termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // Store 1+ ord into packed bits
        docToTermOrd.set(docID, 1+termOrd);
      }
      termOrd++;
    }
  }

  // maybe an int-only impl?
  return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd);
}
 
Example 15
Source File: DirectoryTaxonomyReader.java    From lucene-solr with Apache License 2.0
@Override
public int getOrdinal(FacetLabel cp) throws IOException {
  ensureOpen();
  if (cp.length == 0) {
    return ROOT_ORDINAL;
  }

  // First try to find the answer in the LRU cache:
  synchronized (ordinalCache) {
    Integer res = ordinalCache.get(cp);
    if (res != null) {
      if (res.intValue() < indexReader.maxDoc()) {
        // Since the cache is shared with DTR instances allocated from
        // doOpenIfChanged, we need to ensure that the ordinal is one that
        // this DTR instance recognizes.
        return res.intValue();
      } else {
        // if we get here, it means that the category was found in the cache,
        // but is not recognized by this TR instance. Therefore there's no
        // need to continue search for the path on disk, because we won't find
        // it there too.
        return TaxonomyReader.INVALID_ORDINAL;
      }
    }
  }

  // If we're still here, we have a cache miss. We need to fetch the
  // value from disk, and then also put it in the cache:
  int ret = TaxonomyReader.INVALID_ORDINAL;
  PostingsEnum docs = MultiTerms.getTermPostingsEnum(indexReader, Consts.FULL, new BytesRef(FacetsConfig.pathToString(cp.components, cp.length)), 0);
  if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    ret = docs.docID();
    
    // we only store the fact that a category exists, not its nonexistence.
    // This is required because the caches are shared with new DTR instances
    // that are allocated from doOpenIfChanged. Therefore, if we only store
    // information about found categories, we cannot accidentally tell a new
    // generation of DTR that a category does not exist.
    synchronized (ordinalCache) {
      ordinalCache.put(cp, Integer.valueOf(ret));
    }
  }

  return ret;
}
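
MultiTerms.getTermPostingsEnum, used above, is a convenience that seeks a single term across all segments of the reader and returns ready-to-iterate postings, or null when the term does not exist. A hedged sketch; the field name and term are placeholders:

PostingsEnum docs = MultiTerms.getTermPostingsEnum(indexReader, "field", new BytesRef("term"), PostingsEnum.NONE);
if (docs != null) {
  for (int d = docs.nextDoc(); d != DocIdSetIterator.NO_MORE_DOCS; d = docs.nextDoc()) {
    // d is the ID of a document containing the term
  }
}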
 
Example 16
Source File: DocVectors.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Creates doc vectors, iterating over terms.
 */
private void trainDocVectors() throws IOException {
  VerbatimLogger.info("Building document vectors ... ");
  Enumeration<ObjectVector> termEnum = termVectors.getAllVectors();
  try {
    int tc = 0;
    while (termEnum.hasMoreElements()) {
      // Output progress counter.
      if ((tc % 10000 == 0) || (tc < 10000 && tc % 1000 == 0)) {
        VerbatimLogger.info("Processed " + tc + " terms ... ");
      }
      tc++;

      ObjectVector termVectorObject = termEnum.nextElement();
      Vector termVector = termVectorObject.getVector();
      String word = (String) termVectorObject.getObject();

      // Go through checking terms for each fieldName.
      for (String fieldName : flagConfig.contentsfields()) {
        Term term = new Term(fieldName, word);
        float globalweight = luceneUtils.getGlobalTermWeight(term);
        float fieldweight = 1;

        // Get any docs for this term.
        PostingsEnum docsEnum = this.luceneUtils.getDocsForTerm(term);

        // This may occur frequently if one term vector store is derived from multiple fields
        if (docsEnum == null)  { continue; }

        while (docsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
          String externalDocID = luceneUtils.getExternalDocId(docsEnum.docID());
          // Add vector from this term, taking freq into account.
          Vector docVector = this.docVectors.getVector(externalDocID);
          float localweight = docsEnum.freq();

          if (flagConfig.fieldweight()) {
            //field weight: 1/sqrt(number of terms in field)
            TermsEnum terms = luceneUtils.getTermVector(docsEnum.docID(), fieldName).iterator();
            int numTerms = 0;
            while (terms.next() != null) {
              numTerms++;
            }
            fieldweight = (float) (1/Math.sqrt(numTerms));
          }

          docVector.superpose(
              termVector, localweight * globalweight * fieldweight, null);
        }
      }
    }
  }
  catch (IOException e) { // catches from indexReader.
    e.printStackTrace();
  }

  VerbatimLogger.info("\nNormalizing doc vectors ...\n");
  
  Enumeration<ObjectVector> docEnum = docVectors.getAllVectors();
  while (docEnum.hasMoreElements()) {
    docEnum.nextElement().getVector().normalize();
  }
}
 
Example 17
Source File: VectorScoreQuery.java    From solr-vector-scoring with Apache License 2.0
@Override
protected CustomScoreProvider getCustomScoreProvider(LeafReaderContext context) throws IOException {
	return new CustomScoreProvider(context) {
		@Override
		public float customScore(int docID, float subQueryScore, float valSrcScore) throws IOException {
			float score = 0;
			double docVectorNorm = 0;
			LeafReader reader = context.reader();
			Terms terms = reader.getTermVector(docID, field);
			if (vector.size() != terms.size()) {
				throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "indexed and input vector array must have same length");
			}
			TermsEnum iter = terms.iterator();
			BytesRef text;
			while ((text = iter.next()) != null) {
				String term = text.utf8ToString();
				float payloadValue = 0f;
				PostingsEnum postings = iter.postings(null, PostingsEnum.ALL);
				while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
					int freq = postings.freq();
					while (freq-- > 0) postings.nextPosition();

					BytesRef payload = postings.getPayload();
					payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);

					if (cosine)
						docVectorNorm += Math.pow(payloadValue, 2.0);
				}

				score = (float) (score + payloadValue * (vector.get(Integer.parseInt(term))));
			}

			if (cosine) {
				if ((docVectorNorm == 0) || (queryVectorNorm == 0)) return 0f;
				return (float) (score / (Math.sqrt(docVectorNorm) * Math.sqrt(queryVectorNorm)));
			}

			return score;
		}
	};
}
 
Example 18
Source File: FieldCacheImpl.java    From lucene-solr with Apache License 2.0
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
    throws IOException {

  // TODO: would be nice to first check if DocTermsIndex
  // was already cached for this field and then return
  // that instead, to avoid insanity

  final int maxDoc = reader.maxDoc();
  Terms terms = reader.terms(key.field);

  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

  final int termCountHardLimit = maxDoc;

  // Holds the actual term data, expanded.
  final PagedBytes bytes = new PagedBytes(15);

  int startBPV;

  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > termCountHardLimit) {
        numUniqueTerms = termCountHardLimit;
      }
      startBPV = PackedInts.bitsRequired(numUniqueTerms*4);
    } else {
      startBPV = 1;
    }
  } else {
    startBPV = 1;
  }

  final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);
  
  // pointer==0 means not set
  bytes.copyUsingLengthPrefix(new BytesRef());

  if (terms != null) {
    int termCount = 0;
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while(true) {
      if (termCount++ == termCountHardLimit) {
        // app is misusing the API (there is more than
        // one term per doc); in this case we make best
        // effort to load what we can (see LUCENE-2142)
        break;
      }

      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      final long pointer = bytes.copyUsingLengthPrefix(term);
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        docToOffset.set(docID, pointer);
      }
    }
  }

  final PackedInts.Reader offsetReader = docToOffset.getMutable();
  Bits docsWithField = new Bits() {
    @Override
    public boolean get(int index) {
      return offsetReader.get(index) != 0;
    }

    @Override
    public int length() {
      return maxDoc;
    }
  };

  wrapper.setDocsWithField(reader, key.field, docsWithField, null);
  // maybe an int-only impl?
  return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader, docsWithField);
}
 
Example 19
Source File: FilterableTermsEnum.java    From Elasticsearch with Apache License 2.0
@Override
public boolean seekExact(BytesRef text) throws IOException {
    int docFreq = 0;
    long totalTermFreq = 0;
    for (Holder anEnum : enums) {
        if (anEnum.termsEnum.seekExact(text)) {
            if (anEnum.bits == null) {
                docFreq += anEnum.termsEnum.docFreq();
                if (docsEnumFlag == PostingsEnum.FREQS) {
                    long leafTotalTermFreq = anEnum.termsEnum.totalTermFreq();
                    if (totalTermFreq == -1 || leafTotalTermFreq == -1) {
                        totalTermFreq = -1;
                        continue;
                    }
                    totalTermFreq += leafTotalTermFreq;
                }
            } else {
                final PostingsEnum docsEnum = anEnum.docsEnum = anEnum.termsEnum.postings(anEnum.docsEnum, docsEnumFlag);
                // 2 choices for performing same heavy loop - one attempts to calculate totalTermFreq and other does not
                if (docsEnumFlag == PostingsEnum.FREQS) {
                    for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                        if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
                            continue;
                        }
                        docFreq++;
                        // docsEnum.freq() returns 1 if doc indexed with IndexOptions.DOCS_ONLY so no way of knowing if value
                        // is really 1 or unrecorded when filtering like this
                        totalTermFreq += docsEnum.freq();
                    }
                } else {
                    for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                        if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
                            continue;
                        }
                        // docsEnum.freq() behaviour is undefined if docsEnumFlag==PostingsEnum.FLAG_NONE so don't bother with call
                        docFreq++;
                    }
                }
            }
        }
    }
    if (docFreq > 0) {
        currentDocFreq = docFreq;
        currentTotalTermFreq = totalTermFreq;
        current = text;
        return true;
    } else {
        currentDocFreq = NOT_FOUND;
        currentTotalTermFreq = NOT_FOUND;
        current = null;
        return false;
    }
}
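
The flag passed to postings() decides which per-document statistics may be read; as the comments above note, freq() is only meaningful when FREQS (or a superset such as ALL) was requested. A short sketch, assuming termsEnum is positioned on a term:

PostingsEnum docsOnly = termsEnum.postings(null, PostingsEnum.NONE);   // doc IDs only; freq() is undefined
PostingsEnum withFreqs = termsEnum.postings(null, PostingsEnum.FREQS); // doc IDs plus freq()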
 
Example 20
Source File: TermVectorComponent.java    From lucene-solr with Apache License 2.0
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
  NamedList<Object> fieldNL = new NamedList<>();
  docNL.add(field, fieldNL);

  BytesRef text;
  PostingsEnum dpEnum = null;
  while((text = termsEnum.next()) != null) {
    String term = text.utf8ToString();
    NamedList<Object> termInfo = new NamedList<>();
    fieldNL.add(term, termInfo);
    final int freq = (int) termsEnum.totalTermFreq();
    if (fieldOptions.termFreq) {
      termInfo.add("tf", freq);
    }

    int dpEnumFlags = 0;
    dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
    //payloads require offsets
    dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
    dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
    dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);

    boolean atNextDoc = false;
    if (dpEnum != null) {
      dpEnum.nextDoc();
      atNextDoc = true;
    }

    if (atNextDoc && dpEnumFlags != 0) {
      NamedList<Integer> positionsNL = null;
      NamedList<Number> theOffsets = null;
      NamedList<String> thePayloads = null;

      for (int i = 0; i < freq; i++) {
        final int pos = dpEnum.nextPosition();
        if (fieldOptions.positions && pos >= 0) {
          if (positionsNL == null) {
            positionsNL = new NamedList<>();
            termInfo.add("positions", positionsNL);
          }
          positionsNL.add("position", pos);
        }

        int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
        if (startOffset >= 0) {
          if (theOffsets == null) {
            theOffsets = new NamedList<>();
            termInfo.add("offsets", theOffsets);
          }
          theOffsets.add("start", dpEnum.startOffset());
          theOffsets.add("end", dpEnum.endOffset());
        }

        BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
        if (payload != null) {
          if (thePayloads == null) {
            thePayloads = new NamedList<>();
            termInfo.add("payloads", thePayloads);
          }
          thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
        }
      }
    }
    
    int df = 0;
    if (fieldOptions.docFreq || fieldOptions.tfIdf) {
      df = reader.docFreq(new Term(field, text));
    }

    if (fieldOptions.docFreq) {
      termInfo.add("df", df);
    }

    // TODO: this is not TF/IDF by anyone's definition!
    if (fieldOptions.tfIdf) {
      double tfIdfVal = ((double) freq) / df;
      termInfo.add("tf-idf", tfIdfVal);
    }
  }
}