org.apache.lucene.index.PostingsEnum#endOffset

Source File: TermVectorsResponse.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Copies the per-occurrence data of the current term into the reusable buffers
 * {@code currentPositions}, {@code currentStartOffset}, {@code currentEndOffset}
 * and {@code currentPayloads}, one slot per occurrence.
 *
 * @param curTerms the terms of the current field; its {@code hasPositions()},
 *                 {@code hasOffsets()} and {@code hasPayloads()} flags decide which buffers are filled
 * @param posEnum  postings iterator for the current term
 * @param termFreq number of occurrences to read from {@code posEnum}
 * @throws IOException if reading from the postings iterator fails
 */
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        // The iterator is advanced on every pass, whether or not the position is stored.
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            currentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                // A BytesRef is a slice (bytes, offset, length) of a possibly larger array,
                // so the slice's own offset must be honoured instead of assuming it starts at 0.
                currentPayloads[j] = new BytesArray(curPayload.bytes, curPayload.offset, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}

Source File: TermPosting.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Builds a {@code TermPosting} from the given position and the data the postings
 * iterator currently exposes.
 *
 * @param position term position to record
 * @param penum    postings iterator to read offsets and payload from
 * @return a new posting; offsets are set only when both are non-negative, and the
 *         payload only when one is present
 * @throws IOException if reading from the postings iterator fails
 */
static TermPosting of(int position, PostingsEnum penum) throws IOException {
  TermPosting posting = new TermPosting();

  // set position
  posting.position = position;

  // set offset (if available)
  int sOffset = penum.startOffset();
  int eOffset = penum.endOffset();
  if (sOffset >= 0 && eOffset >= 0) {
    posting.startOffset = sOffset;
    posting.endOffset = eOffset;
  }

  // set payload (if available); fetched once so the null check and the copy
  // are guaranteed to look at the same reference
  BytesRef payload = penum.getPayload();
  if (payload != null) {
    posting.payload = BytesRef.deepCopyOf(payload);
  }

  return posting;
}

Source File: TermVectorEntry.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Creates a position entry for the given posting. Start and end offsets are attached
 * only when the postings iterator reports both as non-negative.
 * @param pos - term position
 * @param pe - positioned postings iterator; must not be null
 * @return position entry, with or without offsets
 * @throws IOException - if there is a low level IO error.
 */
static TermVectorPosition of(int pos, PostingsEnum pe) throws IOException {
  Objects.requireNonNull(pe);

  final int start = pe.startOffset();
  final int end = pe.endOffset();
  final boolean offsetsMissing = start < 0 || end < 0;

  return offsetsMissing
      ? new TermVectorPosition(pos)
      : new TermVectorPosition(pos, start, end);
}

Source File: FrequencyCtxSentenceBasedFBWorker.java From jate with GNU Lesser General Public License v3.0

5 votes

/**
 * Walks every term of the given term vector and, for each non-empty term that is
 * contained in {@code allCandidates}, records one {@code MWESentenceContext} per occurrence.
 *
 * @param termVectorLookup terms to scan
 * @return the collected contexts, sorted by their natural ordering
 * @throws IOException if reading terms or postings fails
 */
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
    final List<MWESentenceContext> collected = new ArrayList<>();
    final TermsEnum termsEnum = termVectorLookup.iterator();

    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
        if (term.length == 0) {
            continue;
        }
        final String candidate = term.utf8ToString();
        if (!allCandidates.contains(candidate)) {
            continue;
        }

        final PostingsEnum postings = termsEnum.postings(null, PostingsEnum.ALL);
        // Only the first document of the postings is consulted.
        if (postings.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
            continue;
        }

        final int occurrences = postings.freq();
        for (int seen = 0; seen < occurrences; seen++) {
            postings.nextPosition();
            final int start = postings.startOffset();
            final int end = postings.endOffset();
            final BytesRef payload = postings.getPayload();

            // Without a payload the sentence id stays at the -1 sentinel.
            final int sentenceId = (payload == null)
                    ? -1
                    : new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();

            collected.add(new MWESentenceContext(candidate, sentenceId, start, end));
        }
    }

    Collections.sort(collected);
    return collected;
}

Source File: WindowBuildingTVM.java From wiseowl with MIT License

4 votes

/**
 * Sorts the terms of a document into the passage windows around the current span:
 * the primary window, the adjacent previous/following windows and the secondary
 * previous/following windows. Window bounds are derived from {@code spanStart},
 * {@code spanEnd}, {@code primaryWS}, {@code adjWS} and {@code secWS}; results are
 * added to the lists of {@code passage}.
 *
 * Terms starting with {@code NameFilter.NE_PREFIX} or
 * {@code PassageRankingComponent.NE_PREFIX_LOWER} are skipped.
 *
 * @param terms the terms to distribute; a {@code null} value is treated as "no terms"
 * @param spans not read by this method
 * @throws IOException if reading terms or postings fails
 */
public void map(Terms terms,Spans spans) throws IOException {
    if (terms == null) {
        // Nothing to map; the original empty null-check fell through to an NPE.
        return;
    }

    // start and end of the primary window
    int primStart = spanStart - primaryWS;
    int primEnd = spanEnd + primaryWS;
    // start and end of the adjacent previous and following windows
    int adjLBStart = primStart - adjWS;
    int adjLBEnd = primStart - 1;//don't overlap
    int adjUBStart = primEnd + 1;//don't overlap
    int adjUBEnd = primEnd + adjWS;
    // start and end of the secondary previous and secondary following windows
    int secLBStart = adjLBStart - secWS;
    int secLBEnd = adjLBStart - 1; //don't overlap the adjacent window
    int secUBStart = adjUBEnd + 1;
    int secUBEnd = adjUBEnd + secWS;
    WindowTerm lastWT = null;

    TermsEnum termsEnum = terms.iterator();
    BytesRef termref;
    while ((termref = termsEnum.next()) != null) {
        String term = termref.utf8ToString();
        //filter out the types, as we don't need them here
        if (term.startsWith(NameFilter.NE_PREFIX) || term.startsWith(PassageRankingComponent.NE_PREFIX_LOWER)) {
            continue;
        }

        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.PAYLOADS | PostingsEnum.OFFSETS);
        if (postings.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
            // No document to read positions from.
            continue;
        }

        // Visit every occurrence of the term, not just the first one, so that
        // repeated terms land in each window they actually appear in.
        int freq = postings.freq();
        for (int i = 0; i < freq; i++) {
            int position = postings.nextPosition();
            // inclusive of the boundaries; skip anything that is outside of all windows
            if (position < secLBStart || position > secUBEnd) {
                continue;
            }

            //offsets aren't required, but they are nice to have
            WindowTerm wt = new WindowTerm(term, position, postings.startOffset(), postings.endOffset());

            if (position >= primStart && position <= primEnd) {//are we in the primary window
                passage.terms.add(wt);
                //we are only going to keep bigrams for the primary window.  You could do it for the other windows, too
                if (lastWT != null) {
                    WindowTerm bigramWT = new WindowTerm(lastWT.term + "," + term, lastWT.position);//we don't care about offsets for bigrams
                    passage.bigrams.add(bigramWT);
                }
                lastWT = wt;
            } else if (position >= secLBStart && position <= secLBEnd) {//are we in the secondary previous window?
                passage.secPrevTerms.add(wt);
            } else if (position >= secUBStart && position <= secUBEnd) {//are we in the secondary following window?
                passage.secFollowTerms.add(wt);
            } else if (position >= adjLBStart && position <= adjLBEnd) {//are we in the adjacent previous window?
                passage.prevTerms.add(wt);
            } else if (position >= adjUBStart && position <= adjUBEnd) {//are we in the adjacent following window?
                passage.followTerms.add(wt);
            }
        }
    }
}

Source File: PayloadFilteredTermIntervalsSource.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Returns an iterator over the positions of the current term in {@code doc} whose
 * payload is accepted by {@code filter}, or {@code null} when the postings cannot be
 * advanced exactly to {@code doc}.
 *
 * @param te  terms enum positioned on the term to match
 * @param doc target document id
 * @return a matches iterator, or {@code null} if the term has no posting for {@code doc}
 * @throws IOException if reading postings fails
 */
private IntervalMatchesIterator matches(TermsEnum te, int doc) throws IOException {
  final PostingsEnum postings = te.postings(null, PostingsEnum.ALL);
  final boolean landedOnDoc = postings.advance(doc) == doc;
  if (!landedOnDoc) {
    return null;
  }

  return new IntervalMatchesIterator() {

    // Positions still unread in this document.
    int upto = postings.freq();
    // Current position; -1 until next() has been called.
    int pos = -1;

    @Override
    public boolean next() throws IOException {
      // Consume positions until one passes the payload filter.
      while (upto > 0) {
        upto--;
        pos = postings.nextPosition();
        if (filter.test(postings.getPayload())) {
          return true;
        }
      }
      pos = IntervalIterator.NO_MORE_INTERVALS;
      return false;
    }

    @Override
    public int gaps() {
      return 0;
    }

    @Override
    public int width() {
      return 1;
    }

    @Override
    public int startPosition() {
      return pos;
    }

    @Override
    public int endPosition() {
      return pos;
    }

    @Override
    public int startOffset() throws IOException {
      return postings.startOffset();
    }

    @Override
    public int endOffset() throws IOException {
      return postings.endOffset();
    }

    @Override
    public MatchesIterator getSubMatches() {
      return null;
    }

    @Override
    public Query getQuery() {
      throw new UnsupportedOperationException();
    }
  };
}

Source File: TermIntervalsSource.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Returns an iterator over every position of the current term in {@code doc}, or
 * {@code null} when the postings cannot be advanced exactly to {@code doc}.
 *
 * @param te    terms enum positioned on the term to match
 * @param doc   target document id
 * @param field field name used to build the {@code TermQuery} reported by the iterator
 * @return a matches iterator, or {@code null} if the term has no posting for {@code doc}
 * @throws IOException if reading postings fails
 */
static IntervalMatchesIterator matches(TermsEnum te, int doc, String field) throws IOException {
  final TermQuery termQuery = new TermQuery(new Term(field, te.term()));
  final PostingsEnum postings = te.postings(null, PostingsEnum.OFFSETS);
  final boolean landedOnDoc = postings.advance(doc) == doc;
  if (!landedOnDoc) {
    return null;
  }

  return new IntervalMatchesIterator() {

    // Positions still unread in this document.
    int upto = postings.freq();
    // Current position; -1 until next() has been called.
    int pos = -1;

    @Override
    public boolean next() throws IOException {
      if (upto > 0) {
        upto--;
        pos = postings.nextPosition();
        return true;
      }
      pos = IntervalIterator.NO_MORE_INTERVALS;
      return false;
    }

    @Override
    public int gaps() {
      return 0;
    }

    @Override
    public int width() {
      return 1;
    }

    @Override
    public int startPosition() {
      return pos;
    }

    @Override
    public int endPosition() {
      return pos;
    }

    @Override
    public int startOffset() throws IOException {
      return postings.startOffset();
    }

    @Override
    public int endOffset() throws IOException {
      return postings.endOffset();
    }

    @Override
    public MatchesIterator getSubMatches() {
      return null;
    }

    @Override
    public Query getQuery() {
      return termQuery;
    }
  };
}

Source File: TestMatchesIterator.java From lucene-solr with Apache License 2.0

4 votes

public TermMatch(PostingsEnum pe, int position) throws IOException {
  this.position = position;
  this.startOffset = pe.startOffset();
  this.endOffset = pe.endOffset();
}

Source File: FrequencyCtxWindowBasedFBWorker.java From jate with GNU Lesser General Public License v3.0

4 votes

/**
 * Walks every term of the given term vector and, for each non-empty term that is
 * contained in {@code allCandidates}, records one {@code MWEInSentence} per occurrence.
 * Occurrences without a payload get zeroed sentence information. Occurrences with a
 * payload also update {@code sentenceBoundaries}, so that each sentence id maps to the
 * largest last-token index seen for it.
 *
 * @param termVectorLookup   terms to scan
 * @param sentenceBoundaries map from sentence id to last-token index; modified in place
 * @return the collected entries, sorted by their natural ordering
 * @throws IOException if reading terms or postings fails
 */
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
                                                            Map<Integer, Integer> sentenceBoundaries) throws IOException {
    final List<MWEInSentence> collected = new ArrayList<>();
    final TermsEnum termsEnum = termVectorLookup.iterator();

    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
        if (term.length == 0) {
            continue;
        }
        final String candidate = term.utf8ToString();
        if (!allCandidates.contains(candidate)) {
            continue;
        }

        final PostingsEnum postings = termsEnum.postings(null, PostingsEnum.ALL);
        // Only the first document of the postings is consulted.
        if (postings.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
            continue;
        }

        final int occurrences = postings.freq();
        for (int seen = 0; seen < occurrences; seen++) {
            postings.nextPosition();
            final int start = postings.startOffset();
            final int end = postings.endOffset();
            final BytesRef payload = postings.getPayload();

            if (payload == null) {
                // No sentence metadata for this occurrence: fall back to zeros.
                collected.add(new MWEInSentence(candidate, start, end, 0, 0, 0));
                continue;
            }

            final SentenceContext context =
                    new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
            collected.add(new MWEInSentence(candidate, start, end,
                    context.getFirstTokenIdx(),
                    context.getLastTokenIdx(),
                    context.getSentenceId()));

            // Keep the largest last-token index seen so far for this sentence.
            final Integer knownEnd = sentenceBoundaries.get(context.getSentenceId());
            if (knownEnd == null || knownEnd < context.getLastTokenIdx()) {
                sentenceBoundaries.put(context.getSentenceId(), context.getLastTokenIdx());
            }
        }
    }

    Collections.sort(collected);
    return collected;
}

Java Code Examples for org.apache.lucene.index.PostingsEnum#endOffset()