Java Code Examples for org.apache.lucene.index.PostingsEnum#NO_MORE_DOCS

The following examples show how to use org.apache.lucene.index.PostingsEnum#NO_MORE_DOCS. They are drawn from open source projects; the line above each example names its source file, project, and license.
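
PostingsEnum inherits NO_MORE_DOCS from DocIdSetIterator; once nextDoc() or advance() returns it, the enum is exhausted. As orientation before the examples, here is a minimal sketch of the canonical iteration idiom; the reader, field name, and term are hypothetical:

// Minimal sketch (hypothetical reader, field, and term): walk a term's
// postings until the enum is exhausted.
Terms terms = leafReader.terms("body");
if (terms != null) {
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(new BytesRef("lucene"))) {
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
        for (int doc = postings.nextDoc(); doc != PostingsEnum.NO_MORE_DOCS; doc = postings.nextDoc()) {
            // doc is a segment-local document ID; postings.freq() is the term frequency
        }
    }
}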
Example 1
Source File: FieldFeatureTFExtractorFactory.java    From ltr4l with Apache License 2.0
@Override
public FieldFeatureExtractor[] create(LeafReaderContext context, Set<Integer> allDocs) throws IOException {
  FieldFeatureExtractor[] extractors = new FieldFeatureExtractor[terms.length];
  int i = 0;
  for (Term term : terms) {
    final TermsEnum termsEnum = getTermsEnum(context, term);
    if (termsEnum == null) {
      extractors[i] = new FieldFeatureNullExtractor();
    } else {
      extractors[i] = new FieldFeatureTFExtractor(termsEnum.postings(null, PostingsEnum.FREQS));
      // request the postings a second time without reuse, so the extractor above
      // keeps an independent enum while this one is consumed to record matching docs
      PostingsEnum docs = termsEnum.postings(null, PostingsEnum.FREQS);
      for (int docId = docs.nextDoc(); docId != PostingsEnum.NO_MORE_DOCS; docId = docs.nextDoc()) {
        allDocs.add(docId);
      }
    }
    i++;
  }
  return extractors;
}
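
The postings(null, ...) call appears twice above because of the reuse parameter: passing a previous PostingsEnum lets the codec recycle it, while passing null guarantees a fresh, independent enum. A small sketch of that contract, assuming a positioned termsEnum:

PostingsEnum first = termsEnum.postings(null, PostingsEnum.FREQS);   // always a fresh enum
PostingsEnum second = termsEnum.postings(first, PostingsEnum.FREQS); // may recycle 'first'
// 'first' and 'second' may now be the same object, so 'first' must not
// be iterated independently anymore.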
 
Example 2
Source File: LuceneIndex.java    From rdf4j with BSD 3-Clause "New" or "Revised" License
private static Document getDocument(LeafReader reader, Term term) throws IOException {
	PostingsEnum docs = reader.postings(term);
	if (docs != null) {
		int docId = docs.nextDoc();
		// the PostingsEnum may contain deleted documents, so we have to cope with them
		while (docId != PostingsEnum.NO_MORE_DOCS) {

			// if document is deleted, skip and continue
			Bits liveDocs = reader.getLiveDocs();
			if (liveDocs != null && !liveDocs.get(docId)) {
				docId = docs.nextDoc();
				continue;
			}
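			// any further posting means the term matches more than one document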
			if (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
				throw new IllegalStateException("Multiple Documents for term " + term.text());
			}
			return readDocument(reader, docId, null);
		}
	}
	return null;
}
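
This method and Example 7 repeat the same live-docs check. One way to factor it out is a helper along these lines (hypothetical, not part of the rdf4j sources):

// Hypothetical helper: advance to the next live (non-deleted) document,
// returning NO_MORE_DOCS once the postings are exhausted.
private static int nextLiveDoc(PostingsEnum docs, Bits liveDocs) throws IOException {
	int docId = docs.nextDoc();
	while (docId != PostingsEnum.NO_MORE_DOCS && liveDocs != null && !liveDocs.get(docId)) {
		docId = docs.nextDoc();
	}
	return docId;
}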
 
Example 3
Source File: DocumentsImpl.java    From lucene-solr with Apache License 2.0
@Override
public Optional<Integer> firstTermDoc() {
  if (tenum == null) {
    // terms enum is not set
    log.warn("Terms enum un-positioned.");
    return Optional.empty();
  }

  try {
    setPostingsIterator(tenum.postings(penum, PostingsEnum.ALL));

    if (penum.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
      // no docs available for this term
      resetPostingsIterator();
      log.warn("No docs available for term: {} in field: {}.", BytesRefUtils.decode(tenum.term()), curField);
      return Optional.empty();
    } else {
      return Optional.of(penum.docID());
    }
  } catch (IOException e) {
    resetPostingsIterator();
    throw new LukeException(String.format(Locale.ENGLISH, "Term docs not available for field: %s.", curField), e);
  }
}
 
Example 4
Source File: DocumentsImpl.java    From lucene-solr with Apache License 2.0
@Override
public Optional<Integer> nextTermDoc() {
  if (penum == null) {
    // postings enum is not initialized
    log.warn("Postings enum un-positioned for field: {}.", curField);
    return Optional.empty();
  }

  try {
    if (penum.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
      // end of the iterator
      resetPostingsIterator();
      if (log.isInfoEnabled()) {
        log.info("Reached the end of the postings iterator for term: {} in field: {}", BytesRefUtils.decode(tenum.term()), curField);
      }
      return Optional.empty();
    } else {
      return Optional.of(penum.docID());
    }
  } catch (IOException e) {
    resetPostingsIterator();
    throw new LukeException(String.format(Locale.ENGLISH, "Term docs not available for field: %s.", curField), e);
  }
}
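
Both methods lean on the DocIdSetIterator contract that PostingsEnum inherits: nextDoc() advances and returns the new document ID, docID() reports the current position without advancing, and once NO_MORE_DOCS is returned the enum is permanently exhausted. A compressed sketch (hypothetical postings enum):

int doc = postings.nextDoc();    // advances; returns the new docID or NO_MORE_DOCS
assert doc == postings.docID();  // docID() reports the current position
if (doc == PostingsEnum.NO_MORE_DOCS) {
  // exhausted: the enum must not be advanced again
}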
 
Example 5
Source File: TestIDVersionPostingsFormat.java    From lucene-solr with Apache License 2.0
/** Returns docID if found, else -1. */
public int lookup(BytesRef id, long version) throws IOException {
  for (int seg = 0; seg < numSegs; seg++) {
    if (((IDVersionSegmentTermsEnum) termsEnums[seg]).seekExact(id, version)) {
      if (VERBOSE) {
        System.out.println("  found in seg=" + termsEnums[seg]);
      }
      postingsEnums[seg] = termsEnums[seg].postings(postingsEnums[seg], 0);
      int docID = postingsEnums[seg].nextDoc();
      if (docID != PostingsEnum.NO_MORE_DOCS && (liveDocs[seg] == null || liveDocs[seg].get(docID))) {
        lastVersion = ((IDVersionSegmentTermsEnum) termsEnums[seg]).getVersion();
        return docBases[seg] + docID;
      }
      assert hasDeletions;
    }
  }

  return -1;
}
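
The docBases[seg] + docID addition converts a segment-local document ID into an index-wide one. The same mapping is available through the leaf contexts of an IndexReader; a sketch (the reader and local ID are hypothetical):

for (LeafReaderContext leaf : indexReader.leaves()) {
  int globalDocId = leaf.docBase + segmentLocalDocId; // docBase is the segment's offset
}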
 
Example 6
Source File: LuceneUtils.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Gets 1 minus the normalized entropy of a term (i.e. 1 + sum(p log p)),
 * a weighting function that favors focally distributed terms.
 * We use the definition of log-entropy weighting provided in
 * Martin and Berry (2007):
 * entropy weight = 1 + sum((Pij * log2(Pij)) / log2(n))
 * where Pij = frequency of term i in doc j / global frequency of term i
 * and n = number of documents in the collection.
 * Thanks to Vidya Vasuki for adding the hash table to
 * eliminate redundant calculation.
 * @param term the term whose entropy is wanted
 */
private float getEntropy(Term term) {
  if (termEntropy.containsKey(term.field()+"_"+term.text()))
    return termEntropy.get(term.field()+"_"+term.text());
  int gf = getGlobalTermFreq(term);
  double entropy = 0;
  try {
    PostingsEnum docsEnum = this.getDocsForTerm(term);
    while (docsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
      double p = docsEnum.freq(); // term frequency in this document
      p = p / gf;                 // Pij: local frequency over global frequency
      entropy += p * (Math.log(p) / Math.log(2)); // sum of Pij * log2(Pij)
    }
    int n = this.getNumDocs();
    double log2n = Math.log(n) / Math.log(2);
    entropy = entropy / log2n;
  } catch (IOException e) {
    logger.info("Couldn't get term entropy for term " + term.text());
  }
  termEntropy.put(term.field()+"_"+term.text(), 1 + (float) entropy);
  return (float) (1 + entropy);
}
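
The loop above implements the log-entropy weight from the javadoc. The same computation on plain term-frequency counts, outside Lucene, might look like this hypothetical helper:

// Hypothetical helper: log-entropy weight for a term with per-document
// frequencies tf, over a collection of n documents.
static double logEntropyWeight(int[] tf, int n) {
  int gf = 0; // global frequency of the term
  for (int f : tf) gf += f;
  double sum = 0;
  for (int f : tf) {
    if (f > 0) {
      double p = (double) f / gf; // Pij
      sum += p * (Math.log(p) / Math.log(2));
    }
  }
  return 1 + sum / (Math.log(n) / Math.log(2));
}

A term spread evenly over all n documents gets a weight near 0; a term concentrated in a single document gets a weight of 1.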
 
Example 7
Source File: LuceneIndex.java    From rdf4j with BSD 3-Clause "New" or "Revised" License
private static void addDocuments(LeafReader reader, Term term, Collection<Document> documents) throws IOException {
	PostingsEnum docs = reader.postings(term);
	if (docs != null) {
		int docId;
		while ((docId = docs.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
			Bits liveDocs = reader.getLiveDocs();
			// some of the docs may have been deleted; check for that too
			if (liveDocs != null && !liveDocs.get(docId)) {
				continue;
			}
			Document document = readDocument(reader, docId, null);
			documents.add(document);
		}
	}
}
 
Example 8
Source File: FrequencyCtxSentenceBasedFBWorker.java    From jate with GNU Lesser General Public License v3.0
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
    List<MWESentenceContext> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); // a term vector covers exactly one document, so at most one doc is expected
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                int sentenceId = -1;
                if (payload != null) {
                    sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
                }
                result.add(new MWESentenceContext(tString, sentenceId, start, end));
            }
        }
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
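
PostingsEnum.ALL asks the codec for everything it indexed for the term. The flag constants form a hierarchy; a sketch of the options (same tiRef as above):

PostingsEnum freqs     = tiRef.postings(null, PostingsEnum.FREQS);     // doc IDs + term frequencies
PostingsEnum positions = tiRef.postings(null, PostingsEnum.POSITIONS); // + positions
PostingsEnum offsets   = tiRef.postings(null, PostingsEnum.OFFSETS);   // + start/end character offsets
PostingsEnum payloads  = tiRef.postings(null, PostingsEnum.PAYLOADS);  // + per-position payloads
PostingsEnum all       = tiRef.postings(null, PostingsEnum.ALL);       // everything above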
 
Example 9
Source File: TermPrefixCursor.java    From SolrTextTagger with Apache License 2.0
/** Returns an IntsRef of matching doc IDs, either cached or read from postingsEnum. Never null.
 * @param postingsEnum the postings to consume when there is no cached entry */
private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException {
  // (The cache can have empty IntsRefs)

  //lookup prefixBuf in a cache
  if (docIdsCache != null) {
    docIds = docIdsCache.get(prefixBuf);
    if (docIds != null) {
      return docIds;
    }
  }

  //read postingsEnum
  docIds = new IntsRef(termsEnum.docFreq());
  int docId;
  while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
    if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
      continue;
    }
    docIds.ints[docIds.length++] = docId;
  }
  if (docIds.length == 0)
    docIds = EMPTY_INTSREF;

  //cache
  if (docIdsCache != null) {
    ensureBufIsACopy();
    //clone is shallow; that's okay as the prefix isn't overwritten; it's just appended to
    docIdsCache.put(prefixBuf.clone(), docIds);
  }
  return docIds;
}
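
The "clone is shallow" comment refers to BytesRef#clone(), which copies offset and length but shares the underlying byte array. When an independent copy is needed, Lucene provides a deep copy; a sketch:

BytesRef shallow = prefixBuf.clone();           // shares the underlying byte[]
BytesRef deep = BytesRef.deepCopyOf(prefixBuf); // independent byte[] copy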
 
Example 10
Source File: LuceneIndexTest.java    From rdf4j with BSD 3-Clause "New" or "Revised" License
private static boolean next(PostingsEnum docs) throws IOException {
	return (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS);
}
 
Example 11
Source File: DrillSidewaysScorer.java    From lucene-solr with Apache License 2.0
/** Used when base query is highly constraining vs the
 *  drilldowns, or when the docs must be scored at once
 *  (i.e., like BooleanScorer2, not BooleanScorer).  In
 *  this case we just .next() on base and .advance() on
 *  the dim filters. */ 
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException {
  //if (DEBUG) {
  //  System.out.println("  doQueryFirstScoring");
  //}
  int docID = baseScorer.docID();

  nextDoc: while (docID != PostingsEnum.NO_MORE_DOCS) {
    if (acceptDocs != null && acceptDocs.get(docID) == false) {
      docID = baseIterator.nextDoc();
      continue;
    }
    LeafCollector failedCollector = null;
    for (DocsAndCost dim : dims) {
      // TODO: should we sort this 2nd dimension of
      // docsEnums from most frequent to least?
      if (dim.approximation.docID() < docID) {
        dim.approximation.advance(docID);
      }

      boolean matches = false;
      if (dim.approximation.docID() == docID) {
        if (dim.twoPhase == null) {
          matches = true;
        } else {
          matches = dim.twoPhase.matches();
        }
      }

      if (matches == false) {
        if (failedCollector != null) {
          // More than one dim fails on this document, so
          // it's neither a hit nor a near-miss; move to
          // next doc:
          docID = baseIterator.nextDoc();
          continue nextDoc;
        } else {
          failedCollector = dim.sidewaysLeafCollector;
        }
      }
    }

    collectDocID = docID;

    // TODO: we could score on demand instead, since we
    // are doc-at-a-time (DAAT) here:
    collectScore = baseScorer.score();

    if (failedCollector == null) {
      // Hit passed all filters, so it's "real":
      collectHit(collector, dims);
    } else {
      // Hit missed exactly one filter:
      collectNearMiss(failedCollector);
    }

    docID = baseIterator.nextDoc();
  }
}
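
The dim.approximation.advance(docID) call relies on the DocIdSetIterator#advance contract: position the iterator on the first document >= target (or NO_MORE_DOCS), never moving backwards. A sketch of the leap-frog pattern built on it (both iterators hypothetical):

int doc = lead.nextDoc();
while (doc != DocIdSetIterator.NO_MORE_DOCS) {
  if (follower.docID() < doc) {
    follower.advance(doc); // jump to the first doc >= doc
  }
  if (follower.docID() == doc) {
    // both iterators are on the same document
  }
  doc = lead.nextDoc();
}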
 
Example 12
Source File: DocVectors.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Creates doc vectors, iterating over terms.
 */
private void trainDocVectors() throws IOException {
  VerbatimLogger.info("Building document vectors ... ");
  Enumeration<ObjectVector> termEnum = termVectors.getAllVectors();
  try {
    int tc = 0;
    while (termEnum.hasMoreElements()) {
      // Output progress counter.
      if ((tc % 10000 == 0) || (tc < 10000 && tc % 1000 == 0)) {
        VerbatimLogger.info("Processed " + tc + " terms ... ");
      }
      tc++;

      ObjectVector termVectorObject = termEnum.nextElement();
      Vector termVector = termVectorObject.getVector();
      String word = (String) termVectorObject.getObject();

      // Go through checking terms for each fieldName.
      for (String fieldName : flagConfig.contentsfields()) {
        Term term = new Term(fieldName, word);
        float globalweight = luceneUtils.getGlobalTermWeight(term);
        float fieldweight = 1;

        // Get any docs for this term.
        PostingsEnum docsEnum = this.luceneUtils.getDocsForTerm(term);

        // This may occur frequently if one term vector store is derived from multiple fields
        if (docsEnum == null)  { continue; }

        while (docsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
          String externalDocID = luceneUtils.getExternalDocId(docsEnum.docID());
          // Add vector from this term, taking freq into account.
          Vector docVector = this.docVectors.getVector(externalDocID);
          float localweight = docsEnum.freq();

          if (flagConfig.fieldweight()) {
            //field weight: 1/sqrt(number of terms in field)
            TermsEnum terms = luceneUtils.getTermVector(docsEnum.docID(), fieldName).iterator();
            int numTerms = 0;
            while (terms.next() != null) {
              numTerms++;
            }
            fieldweight = (float) (1/Math.sqrt(numTerms));
          }

          docVector.superpose(
              termVector, localweight * globalweight * fieldweight, null);
        }
      }
    }
  }
  catch (IOException e) { // catches from indexReader.
    e.printStackTrace();
  }

  VerbatimLogger.info("\nNormalizing doc vectors ...\n");
  
  Enumeration<ObjectVector> docEnum = docVectors.getAllVectors();
  while (docEnum.hasMoreElements()) {
    docEnum.nextElement().getVector().normalize();
  }
}
 
Example 13
Source File: FrequencyCtxWindowBasedFBWorker.java    From jate with GNU Lesser General Public License v3.0
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
                                                            Map<Integer, Integer> sentenceBoundaries) throws IOException {
    List<MWEInSentence> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); // a term vector covers exactly one document, so at most one doc is expected
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                SentenceContext sentenceContextInfo = null;
                if (payload != null) {
                    sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
                }
                if (sentenceContextInfo == null)
                    result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
                else {
                    result.add(new MWEInSentence(tString, start, end,
                            sentenceContextInfo.getFirstTokenIdx(),
                            sentenceContextInfo.getLastTokenIdx(),
                            sentenceContextInfo.getSentenceId()));

                    Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
                    if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx())
                        sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                                sentenceContextInfo.getLastTokenIdx());
                }
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}