Java Code Examples for org.apache.lucene.index.PostingsEnum#nextPosition()

The following examples show how to use org.apache.lucene.index.PostingsEnum#nextPosition(). Each example links to the original project and source file.
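All of the examples below follow the same basic contract: obtain a PostingsEnum from a TermsEnum with the flags you need (PostingsEnum.POSITIONS, OFFSETS, PAYLOADS, or ALL), advance it to a document with nextDoc() or advance(), read freq(), and then call nextPosition() exactly once per occurrence; startOffset(), endOffset(), and getPayload() are only valid after each nextPosition() call. The sketch below is a minimal illustration of that pattern, not taken from any project on this page; the reader setup and the "body" field name are assumptions.

// Minimal sketch (assumed setup): the "body" field must have been indexed with positions.
IndexReader reader = DirectoryReader.open(dir);
Terms terms = MultiTerms.getTerms(reader, "body");
if (terms != null) {
    TermsEnum termsEnum = terms.iterator();
    while (termsEnum.next() != null) {
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.POSITIONS);
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            int freq = postings.freq();
            for (int i = 0; i < freq; i++) {
                int pos = postings.nextPosition(); // positions within a doc come back in increasing order
                // use pos here
            }
        }
    }
}
reader.close();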
Example 1
Source File: TermVectorsResponse.java    From Elasticsearch with Apache License 2.0
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
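    // nextPosition() must be called once per occurrence even when only offsets
    // or payloads are needed: startOffset(), endOffset(), and getPayload()
    // all refer to the current position.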
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            currentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}
 
Example 2
Source File: TermVectorEntry.java    From lucene-solr with Apache License 2.0
/**
 * Returns a new term vector entry representing the specified term and, optionally, its positions.
 *
 * @param te positioned terms iterator
 * @return term vector entry
 * @throws IOException if there is a low-level IO error
 */
static TermVectorEntry of(TermsEnum te) throws IOException {
  Objects.requireNonNull(te);

  String termText = BytesRefUtils.decode(te.term());

  List<TermVectorEntry.TermVectorPosition> tvPositions = new ArrayList<>();
  PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
  pe.nextDoc();
  int freq = pe.freq();
  for (int i = 0; i < freq; i++) {
    int pos = pe.nextPosition();
    if (pos < 0) {
      // no position information available
      continue;
    }
    TermVectorPosition tvPos = TermVectorPosition.of(pos, pe);
    tvPositions.add(tvPos);
  }

  return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions);
}
 
Example 3
Source File: BlendedInfixSuggester.java    From lucene-solr with Apache License 2.0
/**
 * Create the coefficient to transform the weight.
 *
 * @param searcher searcher used to read the document's term vector
 * @param doc id of the document
 * @param matchedTokens tokens found in the query
 * @param prefixToken unfinished token in the query
 * @return the coefficient
 * @throws IOException if there are problems reading term vectors from the underlying Lucene index
 */
private double createCoefficient(IndexSearcher searcher, int doc, Set<String> matchedTokens, String prefixToken) throws IOException {

  Terms tv = searcher.getIndexReader().getTermVector(doc, TEXT_FIELD_NAME);
  TermsEnum it = tv.iterator();

  Integer position = Integer.MAX_VALUE;
  BytesRef term;
  // find the closest token position
  while ((term = it.next()) != null) {

    String docTerm = term.utf8ToString();

    if (matchedTokens.contains(docTerm) || (prefixToken != null && docTerm.startsWith(prefixToken))) {
 
      PostingsEnum docPosEnum = it.postings(null, PostingsEnum.OFFSETS);
      docPosEnum.nextDoc();

      // use the first occurrence of the term
      int p = docPosEnum.nextPosition();
      if (p < position) {
        position = p;
      }
    }
  }

  // create corresponding coefficient based on position
  return calculateCoefficient(position);
}
 
Example 4
Source File: TaxonomyIndexArrays.java    From lucene-solr with Apache License 2.0
private void initParents(IndexReader reader, int first) throws IOException {
  if (reader.maxDoc() == first) {
    return;
  }
  
  // it's ok to use MultiTerms because we only iterate on one posting list.
  // breaking it to loop over the leaves() only complicates code for no
  // apparent gain.
  PostingsEnum positions = MultiTerms.getTermPostingsEnum(reader,
      Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
      PostingsEnum.PAYLOADS);

  // shouldn't really happen, if it does, something's wrong
  if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
    throw new CorruptIndexException("Missing parent data for category " + first, reader.toString());
  }
  
  int num = reader.maxDoc();
  for (int i = first; i < num; i++) {
    if (positions.docID() == i) {
      if (positions.freq() == 0) { // shouldn't happen
        throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
      }
      
      parents[i] = positions.nextPosition();
      
      if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
        if (i + 1 < num) {
          throw new CorruptIndexException("Missing parent data for category "+ (i + 1), reader.toString());
        }
        break;
      }
    } else { // this shouldn't happen
      throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
    }
  }
}
 
Example 5
Source File: TestTeeSinkTokenFilter.java    From lucene-solr with Apache License 2.0
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  TokenStream tokenStream = analyzer.tokenStream("field", "abcd   ");
  TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
  TokenStream sink = tee.newSinkTokenStream();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  ft.setStoreTermVectorPositions(true);
  Field f1 = new Field("field", tee, ft);
  Field f2 = new Field("field", sink, ft);
  doc.add(f1);
  doc.add(f2);
  w.addDocument(doc);
  w.close();

  IndexReader r = DirectoryReader.open(dir);
  Terms vector = r.getTermVectors(0).terms("field");
  assertEquals(1, vector.size());
  TermsEnum termsEnum = vector.iterator();
  termsEnum.next();
  assertEquals(2, termsEnum.totalTermFreq());
  PostingsEnum positions = termsEnum.postings(null, PostingsEnum.ALL);
  assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(2, positions.freq());
  positions.nextPosition();
  assertEquals(0, positions.startOffset());
  assertEquals(4, positions.endOffset());
  positions.nextPosition();
  assertEquals(8, positions.startOffset());
  assertEquals(12, positions.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
  r.close();
  dir.close();
  analyzer.close();
}
 
Example 6
Source File: ESIndex.java    From pyramid with Apache License 2.0
private Map<Integer,String> getTermVectorWithException(String field, String id) throws IOException {
    TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false).setPositions(true).setFieldStatistics(false)
            .setTermStatistics(false)
            .setSelectedFields(field)
            .execute().actionGet();

    Map<Integer,String> map = new HashMap<>();
    Terms terms = response.getFields().terms(field);
    if (terms == null) {
        return map;
    }
    TermsEnum iterator = terms.iterator();
    PostingsEnum postings = null;

    for (BytesRef termBytes = null; (termBytes = iterator.next()) != null; ) {
        String term = termBytes.utf8ToString();

        postings = iterator.postings(postings, PostingsEnum.ALL);

        // there can only be one doc since we are getting by id; get the doc and the positions
        postings.nextDoc();

        int tf = postings.freq();

        for (int i = 0; i < tf; i++) {
            int pos = postings.nextPosition();
            map.put(pos, term);
        }
    }

    return map;
}
 
Example 7
Source File: FrequencyCtxSentenceBasedFBWorker.java    From jate with GNU Lesser General Public License v3.0
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
    List<MWESentenceContext> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);

        // this should be just one doc, i.e., the constraint for getting this term vector
        int doc = postingsEnum.nextDoc();
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                int sentenceId = -1;
                if (payload != null) {
                    sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
                }
                result.add(new MWESentenceContext(tString, sentenceId, start, end));
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
 
Example 8
Source File: WindowBuildingTVM.java    From wiseowl with MIT License
public void map(Terms terms, Spans spans) throws IOException {
  int primStart = spanStart - primaryWS;
  int primEnd = spanEnd + primaryWS;
  // stores the start and end of the adjacent previous and following windows
  int adjLBStart = primStart - adjWS;
  int adjLBEnd = primStart - 1;   // don't overlap
  int adjUBStart = primEnd + 1;   // don't overlap
  int adjUBEnd = primEnd + adjWS;
  // stores the start and end of the secondary previous and secondary following windows
  int secLBStart = adjLBStart - secWS;
  int secLBEnd = adjLBStart - 1;  // don't overlap the adjacent window
  int secUBStart = adjUBEnd + 1;
  int secUBEnd = adjUBEnd + secWS;
  WindowTerm lastWT = null;
  if (terms == null) { // no term vector for this field, so nothing to map
    return;
  }
  TermsEnum termsEnum = terms.iterator();
  BytesRef termref = null;
  String term = null;

  while ((termref = termsEnum.next()) != null) {
    term = termsEnum.term().utf8ToString();
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.PAYLOADS | PostingsEnum.OFFSETS);
    postings.nextDoc();
    // filter out the types, as we don't need them here
    if (term.startsWith(NameFilter.NE_PREFIX) == false
        && term.startsWith(PassageRankingComponent.NE_PREFIX_LOWER) == false) {
      // construct the windows, which means we need a bunch of bracketing
      // variables to know what window we are in; we make the boundaries
      // inclusive and do an upfront check so we can skip over anything
      // that is outside of all windows
      int position = postings.nextPosition();
      if (position >= secLBStart && position <= secUBEnd) {
        // fill in the windows; offsets aren't required, but they are nice to have
        WindowTerm wt = new WindowTerm(term, position, postings.startOffset(), postings.endOffset());
        if (position >= primStart && position <= primEnd) { // are we in the primary window?
          passage.terms.add(wt);
          // we only keep bigrams for the primary window; you could do it for the other windows, too
          if (lastWT != null) {
            WindowTerm bigramWT = new WindowTerm(lastWT.term + "," + term, lastWT.position); // we don't care about offsets for bigrams
            passage.bigrams.add(bigramWT);
          }
          lastWT = wt;
        } else if (position >= secLBStart && position <= secLBEnd) { // are we in the secondary previous window?
          passage.secPrevTerms.add(wt);
        } else if (position >= secUBStart && position <= secUBEnd) { // are we in the secondary following window?
          passage.secFollowTerms.add(wt);
        } else if (position >= adjLBStart && position <= adjLBEnd) { // are we in the adjacent previous window?
          passage.prevTerms.add(wt);
        } else if (position >= adjUBStart && position <= adjUBEnd) { // are we in the adjacent following window?
          passage.followTerms.add(wt);
        }
      }
    }
  }
}
 
Example 9
Source File: VectorScoreQuery.java    From solr-vector-scoring with Apache License 2.0
@Override
protected CustomScoreProvider getCustomScoreProvider(LeafReaderContext context) throws IOException {
  return new CustomScoreProvider(context) {
    @Override
    public float customScore(int docID, float subQueryScore, float valSrcScore) throws IOException {
      float score = 0;
      double docVectorNorm = 0;
      LeafReader reader = context.reader();
      Terms terms = reader.getTermVector(docID, field);
      if (vector.size() != terms.size()) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "indexed and input vector array must have same length");
      }
      TermsEnum iter = terms.iterator();
      BytesRef text;
      while ((text = iter.next()) != null) {
        String term = text.utf8ToString();
        float payloadValue = 0f;
        PostingsEnum postings = iter.postings(null, PostingsEnum.ALL);
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          // exhaust the positions so getPayload() returns the payload of the last position
          int freq = postings.freq();
          while (freq-- > 0) postings.nextPosition();

          BytesRef payload = postings.getPayload();
          payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);

          if (cosine)
            docVectorNorm += Math.pow(payloadValue, 2.0);
        }

        score = (float) (score + payloadValue * (vector.get(Integer.parseInt(term))));
      }

      if (cosine) {
        if ((docVectorNorm == 0) || (queryVectorNorm == 0)) return 0f;
        return (float) (score / (Math.sqrt(docVectorNorm) * Math.sqrt(queryVectorNorm)));
      }

      return score;
    }
  };
}
 
Example 10
Source File: PayloadFilteredTermIntervalsSource.java    From lucene-solr with Apache License 2.0
private IntervalMatchesIterator matches(TermsEnum te, int doc) throws IOException {
  PostingsEnum pe = te.postings(null, PostingsEnum.ALL);
  if (pe.advance(doc) != doc) {
    return null;
  }
  return new IntervalMatchesIterator() {

    @Override
    public int gaps() {
      return 0;
    }

    @Override
    public int width() {
      return 1;
    }

    int upto = pe.freq();
    int pos = -1;

    @Override
    public boolean next() throws IOException {
      do {
        if (upto <= 0) {
          pos = IntervalIterator.NO_MORE_INTERVALS;
          return false;
        }
        upto--;
        pos = pe.nextPosition();
      }
      while (filter.test(pe.getPayload()) == false);
      return true;
    }

    @Override
    public int startPosition() {
      return pos;
    }

    @Override
    public int endPosition() {
      return pos;
    }

    @Override
    public int startOffset() throws IOException {
      return pe.startOffset();
    }

    @Override
    public int endOffset() throws IOException {
      return pe.endOffset();
    }

    @Override
    public MatchesIterator getSubMatches() {
      return null;
    }

    @Override
    public Query getQuery() {
      throw new UnsupportedOperationException();
    }
  };
}
 
Example 11
Source File: TermIntervalsSource.java    From lucene-solr with Apache License 2.0
static IntervalMatchesIterator matches(TermsEnum te, int doc, String field) throws IOException {
  TermQuery query = new TermQuery(new Term(field, te.term()));
  PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
  if (pe.advance(doc) != doc) {
    return null;
  }
  return new IntervalMatchesIterator() {

    @Override
    public int gaps() {
      return 0;
    }

    @Override
    public int width() {
      return 1;
    }

    int upto = pe.freq();
    int pos = -1;

    @Override
    public boolean next() throws IOException {
      if (upto <= 0) {
        pos = IntervalIterator.NO_MORE_INTERVALS;
        return false;
      }
      upto--;
      pos = pe.nextPosition();
      return true;
    }

    @Override
    public int startPosition() {
      return pos;
    }

    @Override
    public int endPosition() {
      return pos;
    }

    @Override
    public int startOffset() throws IOException {
      return pe.startOffset();
    }

    @Override
    public int endOffset() throws IOException {
      return pe.endOffset();
    }

    @Override
    public MatchesIterator getSubMatches() {
      return null;
    }

    @Override
    public Query getQuery() {
      return query;
    }
  };
}
 
Example 12
Source File: TermVectorComponent.java    From lucene-solr with Apache License 2.0
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
  NamedList<Object> fieldNL = new NamedList<>();
  docNL.add(field, fieldNL);

  BytesRef text;
  PostingsEnum dpEnum = null;
  while((text = termsEnum.next()) != null) {
    String term = text.utf8ToString();
    NamedList<Object> termInfo = new NamedList<>();
    fieldNL.add(term, termInfo);
    final int freq = (int) termsEnum.totalTermFreq();
    if (fieldOptions.termFreq) {
      termInfo.add("tf", freq);
    }

    int dpEnumFlags = 0;
    dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
    //payloads require offsets
    dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
    dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
    dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);

    boolean atNextDoc = false;
    if (dpEnum != null) {
      dpEnum.nextDoc();
      atNextDoc = true;
    }

    if (atNextDoc && dpEnumFlags != 0) {
      NamedList<Integer> positionsNL = null;
      NamedList<Number> theOffsets = null;
      NamedList<String> thePayloads = null;

      for (int i = 0; i < freq; i++) {
        final int pos = dpEnum.nextPosition();
        if (fieldOptions.positions && pos >= 0) {
          if (positionsNL == null) {
            positionsNL = new NamedList<>();
            termInfo.add("positions", positionsNL);
          }
          positionsNL.add("position", pos);
        }

        int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
        if (startOffset >= 0) {
          if (theOffsets == null) {
            theOffsets = new NamedList<>();
            termInfo.add("offsets", theOffsets);
          }
          theOffsets.add("start", dpEnum.startOffset());
          theOffsets.add("end", dpEnum.endOffset());
        }

        BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
        if (payload != null) {
          if (thePayloads == null) {
            thePayloads = new NamedList<>();
            termInfo.add("payloads", thePayloads);
          }
          thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
        }
      }
    }
    
    int df = 0;
    if (fieldOptions.docFreq || fieldOptions.tfIdf) {
      df = reader.docFreq(new Term(field, text));
    }

    if (fieldOptions.docFreq) {
      termInfo.add("df", df);
    }

    // TODO: this is not TF/IDF by anyone's definition!
    if (fieldOptions.tfIdf) {
      double tfIdfVal = ((double) freq) / df;
      termInfo.add("tf-idf", tfIdfVal);
    }
  }
}
 
Example 13
Source File: FrequencyCtxWindowBasedFBWorker.java    From jate with GNU Lesser General Public License v3.0
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
                                                            Map<Integer, Integer> sentenceBoundaries) throws IOException {
    List<MWEInSentence> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);

        // this should be just one doc, i.e., the constraint for getting this term vector
        int doc = postingsEnum.nextDoc();
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                SentenceContext sentenceContextInfo = null;
                if (payload != null) {
                    sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
                }
                if (sentenceContextInfo == null)
                    result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
                else {
                    result.add(new MWEInSentence(tString, start, end,
                            sentenceContextInfo.getFirstTokenIdx(),
                            sentenceContextInfo.getLastTokenIdx(),
                            sentenceContextInfo.getSentenceId()));

                    Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
                    if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx())
                        sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                                sentenceContextInfo.getLastTokenIdx());
                }
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}