Java Code Examples for org.apache.lucene.index.PostingsEnum#freq()

The following examples show how to use org.apache.lucene.index.PostingsEnum#freq(). They are extracted from open source projects; follow the link above each example to view the original project or source file.
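Before looking at the project code, here is a minimal, self-contained sketch of the canonical call pattern: request at least PostingsEnum.FREQS, then position the enum with nextDoc() (or advance()) before reading freq(). The index path, the field name "body", and the term "lucene" are hypothetical placeholders.

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class FreqDemo {
    public static void main(String[] args) throws Exception {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            Terms terms = MultiTerms.getTerms(reader, "body"); // hypothetical field name
            if (terms == null) return;
            TermsEnum termsEnum = terms.iterator();
            if (termsEnum.seekExact(new BytesRef("lucene"))) { // hypothetical term
                // FREQS (or a superset such as ALL) is required for freq() to be well defined
                PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
                while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                    System.out.println("doc=" + postings.docID() + " freq=" + postings.freq());
                }
            }
        }
    }
}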
Example 1
Source File: TermVectorsResponse.java    From Elasticsearch with Apache License 2.0
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
    // start term, optimized writing
    BytesRef term = termIter.next();
    spare.copyUTF8Bytes(term);
    builder.startObject(spare.toString());
    buildTermStatistics(builder, termIter);
    // finally write the term vectors
    PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL); // NB: this enum comes from a term vector, so it covers exactly one document
    int termFreq = posEnum.freq();
    builder.field(FieldStrings.TERM_FREQ, termFreq);
    initMemory(curTerms, termFreq);
    initValues(curTerms, posEnum, termFreq);
    buildValues(builder, curTerms, termFreq);
    buildScore(builder, boostAtt);
    builder.endObject();
}
 
Example 2
Source File: PhraseHelper.java    From lucene-solr with Apache License 2.0
@Override
public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
  if (!fieldMatcher.test(term.field())) {
    return;
  }

  SpanCollectedOffsetsEnum offsetsEnum = termToOffsetsEnums.get(term.bytes());
  if (offsetsEnum == null) {
    // If it's pos insensitive we handle it outside of PhraseHelper.  term.field() is from the Query.
    if (positionInsensitiveTerms.contains(term.bytes())) {
      return;
    }
    offsetsEnum = new SpanCollectedOffsetsEnum(term.bytes(), postings.freq());
    termToOffsetsEnums.put(term.bytes(), offsetsEnum);
  }
  offsetsEnum.add(postings.startOffset(), postings.endOffset());
}
 
Example 3
Source File: LuceneUtils.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Gets the 1 - entropy (i.e. 1 + sum(p log p)) of a term,
 * a function that favors terms that are focally distributed.
 * We use the definition of log-entropy weighting provided in
 * Martin and Berry (2007):
 * Entropy = 1 + sum((Pij log2(Pij)) / log2(n))
 * where Pij = frequency of term i in doc j / global frequency of term i
 *       n   = number of documents in the collection
 * @param term the term whose entropy is wanted
 * Thanks to Vidya Vasuki for adding the hash table to
 * eliminate redundant calculation
 */
private float getEntropy(Term term) {
  if (termEntropy.containsKey(term.field()+"_"+term.text()))
    return termEntropy.get(term.field()+"_"+term.text());
  int gf = getGlobalTermFreq(term);
  double entropy = 0;
  try {
    PostingsEnum docsEnum = this.getDocsForTerm(term);
    while ((docsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      double p = docsEnum.freq(); //frequency in this document
      p = p / gf;    //frequency across all documents
      entropy += p * (Math.log(p) / Math.log(2)); //sum of Plog(P)
    }
    int n = this.getNumDocs();
    double log2n = Math.log(n) / Math.log(2);
    entropy = entropy / log2n;
  } catch (IOException e) {
    logger.info("Couldn't get term entropy for term " + term.text());
  }
  termEntropy.put(term.field()+"_"+term.text(), 1 + (float) entropy);
  return (float) (1 + entropy);
}
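As a quick sanity check on the formula: a term spread evenly over all n documents has Pij = 1/n in every document, so the sum works out to -log2(n), the division by log2(n) yields -1, and the returned weight is 1 + (-1) = 0; a term concentrated in a single document has Pij = 1, log2(1) = 0, and weight 1. This is the sense in which the weight favors focally distributed terms.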
 
Example 4
Source File: CodecCollector.java    From mtas with Apache License 2.0
/**
 * Compute termvector number full.
 *
 * @param docSet
 *          the doc set
 * @param termDocId
 *          the term doc id
 * @param termsEnum
 *          the terms enum
 * @param lrc
 *          the leaf reader context
 * @param postingsEnum
 *          the postings enum
 * @param positionsData
 *          the positions data
 * @return the termvector number full
 * @throws IOException
 *           Signals that an I/O exception has occurred.
 */
private static TermvectorNumberFull computeTermvectorNumberFull(
    List<Integer> docSet, int termDocId, TermsEnum termsEnum,
    LeafReaderContext lrc, PostingsEnum postingsEnum,
    Map<Integer, Integer> positionsData) throws IOException {
  TermvectorNumberFull result = new TermvectorNumberFull(docSet.size());
  Iterator<Integer> docIterator = docSet.iterator();
  int localTermDocId = termDocId;
  postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.FREQS);
  while (docIterator.hasNext()) {
    int docId = docIterator.next() - lrc.docBase;
    if (docId >= localTermDocId && ((docId == localTermDocId)
        || ((localTermDocId = postingsEnum.advance(docId)) == docId))) {
      result.args[result.docNumber] = postingsEnum.freq();
      result.positions[result.docNumber] = (positionsData == null) ? 0
          : positionsData.get(docId + lrc.docBase);
      result.docNumber++;
    }
  }
  return result;
}
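Note the guard in the loop above: DocIdSetIterator.advance() must never be called with a target behind the enum's current document, so the code only calls advance(docId) when docId is at or past localTermDocId, and skips the call entirely when the enum is already positioned on docId. The same pattern appears again in Example 9 below.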
 
Example 5
Source File: TermVectorEntry.java    From lucene-solr with Apache License 2.0
/**
 * Returns a new term vector entry representing the specified term, and optionally, positions.
 *
 * @param te - positioned terms iterator
 * @return term vector entry
 * @throws IOException - if there is a low level IO error.
 */
static TermVectorEntry of(TermsEnum te) throws IOException {
  Objects.requireNonNull(te);

  String termText = BytesRefUtils.decode(te.term());

  List<TermVectorEntry.TermVectorPosition> tvPositions = new ArrayList<>();
  PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
  pe.nextDoc();
  int freq = pe.freq();
  for (int i = 0; i < freq; i++) {
    int pos = pe.nextPosition();
    if (pos < 0) {
      // no position information available
      continue;
    }
    TermVectorPosition tvPos = TermVectorPosition.of(pos, pe);
    tvPositions.add(tvPos);
  }

  return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions);
}
 
Example 6
Source File: TestBlockPostingsFormat3.java    From lucene-solr with Apache License 2.0
/**
 * checks docs + freqs + positions + payloads, sequentially
 */
public void assertDocsAndPositionsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
  assertNotNull(leftDocs);
  assertNotNull(rightDocs);
  assertEquals(-1, leftDocs.docID());
  assertEquals(-1, rightDocs.docID());
  int docid;
  while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    assertEquals(docid, rightDocs.nextDoc());
    int freq = leftDocs.freq();
    assertEquals(freq, rightDocs.freq());
    for (int i = 0; i < freq; i++) {
      assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
      // we don't assert offsets/payloads, they are allowed to be different
    }
  }
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
}
 
Example 7
Source File: FrequencyCtxSentenceBasedFBWorker.java    From jate with GNU Lesser General Public License v3.0
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
    List<MWESentenceContext> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);

        int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                int sentenceId = -1;
                if (payload != null) {
                    sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
                }
                result.add(new MWESentenceContext(tString, sentenceId, start, end));
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
 
Example 8
Source File: ESIndex.java    From pyramid with Apache License 2.0
private Map<Integer,String> getTermVectorWithException(String field, String id) throws IOException {
    TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false).setPositions(true).setFieldStatistics(false)
            .setTermStatistics(false)
            .setSelectedFields(field)
            .execute().actionGet();

    Map<Integer,String> map = new HashMap<>();
    Terms terms = response.getFields().terms(field);
    if (terms==null){
        return map;
    }
    TermsEnum iterator = terms.iterator();
    PostingsEnum postings = null;
    
    for (BytesRef termBytes = null; (termBytes = iterator.next()) != null; ) {
        String term = termBytes.utf8ToString();

        postings = iterator.postings(postings, PostingsEnum.ALL);

        // there can only be one doc since we fetch by id; position on it before reading freq()
        postings.nextDoc();

        int tf = postings.freq();

        for (int i = 0; i < tf; i++) {
            int pos = postings.nextPosition();
            map.put(pos, term);
        }
    }
    
    return map;
}
 
Example 9
Source File: CodecCollector.java    From mtas with Apache License 2.0
/**
 * Compute termvector number basic.
 *
 * @param docSet
 *          the doc set
 * @param termDocId
 *          the term doc id
 * @param termsEnum
 *          the terms enum
 * @param r
 *          the leaf reader
 * @param lrc
 *          the leaf reader context
 * @param postingsEnum
 *          the postings enum
 * @return the termvector number basic
 * @throws IOException
 *           Signals that an I/O exception has occurred.
 */
private static TermvectorNumberBasic computeTermvectorNumberBasic(
    List<Integer> docSet, int termDocId, TermsEnum termsEnum, LeafReader r,
    LeafReaderContext lrc, PostingsEnum postingsEnum) throws IOException {
  TermvectorNumberBasic result = new TermvectorNumberBasic();
  boolean hasDeletedDocuments = (r.getLiveDocs() != null);
  if ((docSet.size() == r.numDocs()) && !hasDeletedDocuments) {
    try {
      return computeTermvectorNumberBasic(termsEnum, r);
    } catch (IOException e) {
      log.debug("problem", e);
      // problem
    }
  }
  result.docNumber = 0;
  result.valueSum[0] = 0;
  int localTermDocId = termDocId;
  Iterator<Integer> docIterator = docSet.iterator();
  postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.FREQS);
  int docId;
  while (docIterator.hasNext()) {
    docId = docIterator.next() - lrc.docBase;
    if (docId >= localTermDocId && ((docId == localTermDocId)
        || ((localTermDocId = postingsEnum.advance(docId)) == docId))) {
      result.docNumber++;
      result.valueSum[0] += postingsEnum.freq();
    }
    if (localTermDocId == DocIdSetIterator.NO_MORE_DOCS) {
      break;
    }
  }
  return result;
}
 
Example 10
Source File: TestBlockPostingsFormat3.java    From lucene-solr with Apache License 2.0
/**
 * checks advancing docs + positions
 */
public void assertPositionsSkipping(int docFreq, PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
  if (leftDocs == null || rightDocs == null) {
    assertNull(leftDocs);
    assertNull(rightDocs);
    return;
  }
  
  int docid = -1;
  int averageGap = MAXDOC / (1+docFreq);
  int skipInterval = 16;

  while (true) {
    if (random().nextBoolean()) {
      // nextDoc()
      docid = leftDocs.nextDoc();
      assertEquals(docid, rightDocs.nextDoc());
    } else {
      // advance()
      int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap));
      docid = leftDocs.advance(skip);
      assertEquals(docid, rightDocs.advance(skip));
    }
    
    if (docid == DocIdSetIterator.NO_MORE_DOCS) {
      return;
    }
    int freq = leftDocs.freq();
    assertEquals(freq, rightDocs.freq());
    for (int i = 0; i < freq; i++) {
      assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
      // we don't compare the payloads, it's allowed that one is empty etc
    }
  }
}
 
Example 11
Source File: TaxonomyIndexArrays.java    From lucene-solr with Apache License 2.0
private void initParents(IndexReader reader, int first) throws IOException {
  if (reader.maxDoc() == first) {
    return;
  }
  
  // it's ok to use MultiTerms because we only iterate on one posting list.
  // breaking it to loop over the leaves() only complicates code for no
  // apparent gain.
  PostingsEnum positions = MultiTerms.getTermPostingsEnum(reader,
      Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
      PostingsEnum.PAYLOADS);

  // shouldn't really happen, if it does, something's wrong
  if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
    throw new CorruptIndexException("Missing parent data for category " + first, reader.toString());
  }
  
  int num = reader.maxDoc();
  for (int i = first; i < num; i++) {
    if (positions.docID() == i) {
      if (positions.freq() == 0) { // shouldn't happen
        throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
      }
      
      parents[i] = positions.nextPosition();
      
      if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
        if (i + 1 < num) {
          throw new CorruptIndexException("Missing parent data for category "+ (i + 1), reader.toString());
        }
        break;
      }
    } else { // this shouldn't happen
      throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
    }
  }
}
 
Example 12
Source File: TermMatchesIterator.java    From lucene-solr with Apache License 2.0
/**
 * Create a new {@link TermMatchesIterator} for the given term and postings list
 */
TermMatchesIterator(Query query, PostingsEnum pe) throws IOException {
  this.pe = pe;
  this.query = query;
  this.upto = pe.freq();
}
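The constructor stores freq() as an upper bound on how many positions can be pulled from the enum. The rest of the class is not shown here, but a plausible next() following the same pattern as Examples 13 and 14 below (a sketch, not necessarily the exact code of TermMatchesIterator) would be:

@Override
public boolean next() throws IOException {
  if (upto <= 0) {
    return false;          // all freq() positions for this document consumed
  }
  upto--;
  pos = pe.nextPosition(); // assumes a field 'pos' holding the current position
  return true;
}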
 
Example 13
Source File: TermIntervalsSource.java    From lucene-solr with Apache License 2.0
static IntervalMatchesIterator matches(TermsEnum te, int doc, String field) throws IOException {
  TermQuery query = new TermQuery(new Term(field, te.term()));
  PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
  if (pe.advance(doc) != doc) {
    return null;
  }
  return new IntervalMatchesIterator() {

    @Override
    public int gaps() {
      return 0;
    }

    @Override
    public int width() {
      return 1;
    }

    int upto = pe.freq();
    int pos = -1;

    @Override
    public boolean next() throws IOException {
      if (upto <= 0) {
        pos = IntervalIterator.NO_MORE_INTERVALS;
        return false;
      }
      upto--;
      pos = pe.nextPosition();
      return true;
    }

    @Override
    public int startPosition() {
      return pos;
    }

    @Override
    public int endPosition() {
      return pos;
    }

    @Override
    public int startOffset() throws IOException {
      return pe.startOffset();
    }

    @Override
    public int endOffset() throws IOException {
      return pe.endOffset();
    }

    @Override
    public MatchesIterator getSubMatches() {
      return null;
    }

    @Override
    public Query getQuery() {
      return query;
    }
  };
}
 
Example 14
Source File: PayloadFilteredTermIntervalsSource.java    From lucene-solr with Apache License 2.0
private IntervalMatchesIterator matches(TermsEnum te, int doc) throws IOException {
  PostingsEnum pe = te.postings(null, PostingsEnum.ALL);
  if (pe.advance(doc) != doc) {
    return null;
  }
  return new IntervalMatchesIterator() {

    @Override
    public int gaps() {
      return 0;
    }

    @Override
    public int width() {
      return 1;
    }

    int upto = pe.freq();
    int pos = -1;

    @Override
    public boolean next() throws IOException {
      do {
        if (upto <= 0) {
          pos = IntervalIterator.NO_MORE_INTERVALS;
          return false;
        }
        upto--;
        pos = pe.nextPosition();
      }
      while (filter.test(pe.getPayload()) == false);
      return true;
    }

    @Override
    public int startPosition() {
      return pos;
    }

    @Override
    public int endPosition() {
      return pos;
    }

    @Override
    public int startOffset() throws IOException {
      return pe.startOffset();
    }

    @Override
    public int endOffset() throws IOException {
      return pe.endOffset();
    }

    @Override
    public MatchesIterator getSubMatches() {
      return null;
    }

    @Override
    public Query getQuery() {
      throw new UnsupportedOperationException();
    }
  };
}
 
Example 15
Source File: TestPerfTasksLogic.java    From lucene-solr with Apache License 2.0
/**
 * Test ReadTokensTask
 */
public void testReadTokens() throws Exception {

  // We will call ReadTokens on this many docs
  final int NUM_DOCS = 20;

  // Read tokens from first NUM_DOCS docs from Reuters and
  // then build index from the same docs
  String algLines1[] = {
    "# ----- properties ",
    "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
    "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
    "docs.file=" + getReuters20LinesFile(),
    "# ----- alg ",
    "{ReadTokens}: " + NUM_DOCS,
    "ResetSystemErase",
    "CreateIndex",
    "{AddDoc}: " + NUM_DOCS,
    "CloseIndex",
  };

  // Run algo
  Benchmark benchmark = execBenchmark(algLines1);

  List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();

  // Count how many tokens all ReadTokens saw
  int totalTokenCount1 = 0;
  for (final TaskStats stat : stats) {
    if (stat.getTask().getName().equals("ReadTokens")) {
      totalTokenCount1 += stat.getCount();
    }
  }

  // Separately count how many tokens are actually in the index:
  IndexReader reader = DirectoryReader.open(benchmark.getRunData().getDirectory());
  assertEquals(NUM_DOCS, reader.numDocs());

  int totalTokenCount2 = 0;

  Collection<String> fields = FieldInfos.getIndexedFields(reader);

  for (String fieldName : fields) {
    if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD) || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
      continue;
    }
    Terms terms = MultiTerms.getTerms(reader, fieldName);
    if (terms == null) {
      continue;
    }
    TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while (termsEnum.next() != null) {
      docs = TestUtil.docs(random(), termsEnum, docs, PostingsEnum.FREQS);
      while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        totalTokenCount2 += docs.freq();
      }
    }
  }
  reader.close();

  // Make sure they are the same
  assertEquals(totalTokenCount1, totalTokenCount2);
}
 
Example 16
Source File: OffsetsEnum.java    From lucene-solr with Apache License 2.0
public OfPostings(BytesRef term, PostingsEnum postingsEnum) throws IOException {
  this(term, postingsEnum.freq(), postingsEnum);
}
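Note that this convenience constructor reads postingsEnum.freq() eagerly while delegating, so it implicitly assumes the enum has already been positioned on a document via nextDoc() or advance(); freq() is undefined on an unpositioned enum.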
 
Example 17
Source File: DocVectors.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Creates doc vectors, iterating over terms.
 */
private void trainDocVectors() throws IOException {
  VerbatimLogger.info("Building document vectors ... ");
  Enumeration<ObjectVector> termEnum = termVectors.getAllVectors();
  try {
    int tc = 0;
    while (termEnum.hasMoreElements()) {
      // Output progress counter.
      if ((tc % 10000 == 0) || (tc < 10000 && tc % 1000 == 0)) {
        VerbatimLogger.info("Processed " + tc + " terms ... ");
      }
      tc++;

      ObjectVector termVectorObject = termEnum.nextElement();
      Vector termVector = termVectorObject.getVector();
      String word = (String) termVectorObject.getObject();

      // Go through checking terms for each fieldName.
      for (String fieldName : flagConfig.contentsfields()) {
        Term term = new Term(fieldName, word);
        float globalweight = luceneUtils.getGlobalTermWeight(term);
        float fieldweight = 1;

        // Get any docs for this term.
        PostingsEnum docsEnum = this.luceneUtils.getDocsForTerm(term);

        // This may occur frequently if one term vector store is derived from multiple fields
        if (docsEnum == null)  { continue; }

        while (docsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
          String externalDocID = luceneUtils.getExternalDocId(docsEnum.docID());
          // Add vector from this term, taking freq into account.
          Vector docVector = this.docVectors.getVector(externalDocID);
          float localweight = docsEnum.freq();

          if (flagConfig.fieldweight()) {
            //field weight: 1/sqrt(number of terms in field)
            TermsEnum terms = luceneUtils.getTermVector(docsEnum.docID(), fieldName).iterator();
            int numTerms = 0;
            while (terms.next() != null) {
              numTerms++;
            }
            fieldweight = (float) (1/Math.sqrt(numTerms));
          }

          docVector.superpose(
              termVector, localweight * globalweight * fieldweight, null);
        }
      }
    }
  }
  catch (IOException e) { // catches from indexReader.
    e.printStackTrace();
  }

  VerbatimLogger.info("\nNormalizing doc vectors ...\n");
  
  Enumeration<ObjectVector> docEnum = docVectors.getAllVectors();
  while (docEnum.hasMoreElements())
    docEnum.nextElement().getVector().normalize();
}
 
Example 18
Source File: VectorScoreQuery.java    From solr-vector-scoring with Apache License 2.0
@Override
protected CustomScoreProvider getCustomScoreProvider(LeafReaderContext context) throws IOException {
    return new CustomScoreProvider(context) {
        @Override
        public float customScore(int docID, float subQueryScore, float valSrcScore) throws IOException {
            float score = 0;
            double docVectorNorm = 0;
            LeafReader reader = context.reader();
            Terms terms = reader.getTermVector(docID, field);
            if (vector.size() != terms.size()) {
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "indexed and input vector array must have same length");
            }
            TermsEnum iter = terms.iterator();
            BytesRef text;
            while ((text = iter.next()) != null) {
                String term = text.utf8ToString();
                float payloadValue = 0f;
                PostingsEnum postings = iter.postings(null, PostingsEnum.ALL);
                while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                    int freq = postings.freq();
                    while (freq-- > 0) postings.nextPosition(); // exhaust positions so getPayload() reflects the last one

                    BytesRef payload = postings.getPayload();
                    payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);

                    if (cosine)
                        docVectorNorm += Math.pow(payloadValue, 2.0);
                }

                score = (float) (score + payloadValue * (vector.get(Integer.parseInt(term))));
            }

            if (cosine) {
                if ((docVectorNorm == 0) || (queryVectorNorm == 0)) return 0f;
                return (float) (score / (Math.sqrt(docVectorNorm) * Math.sqrt(queryVectorNorm)));
            }

            return score;
        }
    };
}
 
Example 19
Source File: FrequencyCtxWindowBasedFBWorker.java    From jate with GNU Lesser General Public License v3.0
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
                                                            Map<Integer, Integer> sentenceBoundaries) throws IOException {
    List<MWEInSentence> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);

        int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                SentenceContext sentenceContextInfo = null;
                if (payload != null) {
                    sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
                }
                if (sentenceContextInfo == null)
                    result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
                else {
                    result.add(new MWEInSentence(tString, start, end,
                            sentenceContextInfo.getFirstTokenIdx(),
                            sentenceContextInfo.getLastTokenIdx(),
                            sentenceContextInfo.getSentenceId()));

                    Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
                    if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx())
                        sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                                sentenceContextInfo.getLastTokenIdx());
                }
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
 
Example 20
Source File: FilterableTermsEnum.java    From Elasticsearch with Apache License 2.0
@Override
public boolean seekExact(BytesRef text) throws IOException {
    int docFreq = 0;
    long totalTermFreq = 0;
    for (Holder anEnum : enums) {
        if (anEnum.termsEnum.seekExact(text)) {
            if (anEnum.bits == null) {
                docFreq += anEnum.termsEnum.docFreq();
                if (docsEnumFlag == PostingsEnum.FREQS) {
                    long leafTotalTermFreq = anEnum.termsEnum.totalTermFreq();
                    if (totalTermFreq == -1 || leafTotalTermFreq == -1) {
                        totalTermFreq = -1;
                        continue;
                    }
                    totalTermFreq += leafTotalTermFreq;
                }
            } else {
                final PostingsEnum docsEnum = anEnum.docsEnum = anEnum.termsEnum.postings(anEnum.docsEnum, docsEnumFlag);
                // 2 choices for performing same heavy loop - one attempts to calculate totalTermFreq and other does not
                if (docsEnumFlag == PostingsEnum.FREQS) {
                    for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                        if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
                            continue;
                        }
                        docFreq++;
                        // docsEnum.freq() returns 1 if the doc was indexed with IndexOptions.DOCS_ONLY, so there is
                        // no way of knowing whether the value is really 1 or simply unrecorded when filtering like this
                        totalTermFreq += docsEnum.freq();
                    }
                } else {
                    for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                        if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
                            continue;
                        }
                        // docsEnum.freq() behaviour is undefined if docsEnumFlag == PostingsEnum.NONE, so don't bother calling it
                        docFreq++;
                    }
                }
            }
        }
    }
    if (docFreq > 0) {
        currentDocFreq = docFreq;
        currentTotalTermFreq = totalTermFreq;
        current = text;
        return true;
    } else {
        currentDocFreq = NOT_FOUND;
        currentTotalTermFreq = NOT_FOUND;
        current = null;
        return false;
    }
}
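A closing note that applies to all of the examples above: freq() is only guaranteed to be meaningful when the postings were requested with at least PostingsEnum.FREQS (positions, offsets, and payloads require the correspondingly higher flags, and PostingsEnum.ALL requests everything), and the enum must be positioned on a document before freq() is read. A minimal illustration of the flag choice, assuming a TermsEnum already positioned on a term:

// Freqs only: the cheapest option that still makes freq() reliable.
PostingsEnum freqs = termsEnum.postings(null, PostingsEnum.FREQS);

// Positions, offsets and payloads as well, as used by the term-vector examples above.
PostingsEnum everything = termsEnum.postings(null, PostingsEnum.ALL);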