Java Code Examples for org.apache.lucene.search.spans.Spans#nextStartPosition()

The following examples show how to use org.apache.lucene.search.spans.Spans#nextStartPosition(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestPayloadSpans.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Iterates over every start position of every document exposed by {@code spans}, collects
 * the payloads at each position and checks them against the expected values.
 *
 * @param spans                 the spans to walk; must not be {@code null}
 * @param expectedNumSpans      total number of positions expected across all documents
 * @param expectedNumPayloads   number of payloads expected to be collected at each position
 * @param expectedPayloadLength forwarded to {@code VerifyingCollector.verify}
 * @param expectedFirstByte     forwarded to {@code VerifyingCollector.verify}
 * @throws IOException if advancing or collecting from the spans fails
 */
private void checkSpans(Spans spans, int expectedNumSpans, int expectedNumPayloads,
                        int expectedPayloadLength, int expectedFirstByte) throws IOException {
  assertTrue("spans is null and it shouldn't be", spans != null);
  // One collector is reused for all positions; it is reset before each collect() call.
  final VerifyingCollector payloadCollector = new VerifyingCollector();
  int spanCount = 0;
  for (int doc = spans.nextDoc(); doc != Spans.NO_MORE_DOCS; doc = spans.nextDoc()) {
    for (int start = spans.nextStartPosition();
         start != Spans.NO_MORE_POSITIONS;
         start = spans.nextStartPosition()) {
      payloadCollector.reset();
      spans.collect(payloadCollector);
      payloadCollector.verify(expectedPayloadLength, expectedFirstByte);
      assertEquals("expectedNumPayloads", expectedNumPayloads, payloadCollector.payloads.size());
      spanCount++;
    }
  }
  assertEquals("expectedNumSpans", expectedNumSpans, spanCount);
}
 
Example 2
Source File: TestPayloadSpans.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Walks all positions of all documents in {@code spans} and checks, position by position,
 * that the number of collected payloads matches {@code numPayloads}.
 *
 * @param spans       the spans to walk
 * @param numSpans    total number of positions expected across all documents
 * @param numPayloads expected payload count for each position, indexed in iteration order
 * @throws IOException if advancing or collecting from the spans fails
 */
private void checkSpans(Spans spans, int numSpans, int[] numPayloads) throws IOException {
  final VerifyingCollector payloadCollector = new VerifyingCollector();
  int spanIndex = 0;
  for (int doc = spans.nextDoc(); doc != Spans.NO_MORE_DOCS; doc = spans.nextDoc()) {
    for (int start = spans.nextStartPosition();
         start != Spans.NO_MORE_POSITIONS;
         start = spans.nextStartPosition()) {
      if (VERBOSE) {
        System.out.println("\nSpans Dump --");
      }
      // Reset first so that only the payloads of the current position are counted.
      payloadCollector.reset();
      spans.collect(payloadCollector);
      assertEquals("payload size", numPayloads[spanIndex], payloadCollector.payloads.size());
      spanIndex++;
    }
  }

  assertEquals("expected numSpans", numSpans, spanIndex);
}
 
Example 3
Source File: TestPayloadSpans.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Indexes the single document "a b c d e f g h i j a k", runs an in-order
 * {@code SpanNearQuery} for "a" followed by "k" with slop 1, and asserts that the payloads
 * collected from the matching spans are exactly "a:Noise:10" and "k:Noise:11".
 *
 * @throws IOException if indexing or searching fails
 */
public void testShrinkToAfterShortestMatch() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter indexWriter =
      new RandomIndexWriter(random(), dir, newIndexWriterConfig(new TestPayloadAnalyzer()));

  Document document = new Document();
  document.add(new TextField("content", new StringReader("a b c d e f g h i j a k")));
  indexWriter.addDocument(document);

  // Obtain the reader (and a searcher on its only leaf) before the writer is closed.
  IndexReader indexReader = indexWriter.getReader();
  IndexSearcher leafSearcher = newSearcher(getOnlyLeafReader(indexReader), false);
  indexWriter.close();

  SpanQuery[] clauses = {
      new SpanTermQuery(new Term("content", "a")),
      new SpanTermQuery(new Term("content", "k"))
  };
  SpanNearQuery nearQuery = new SpanNearQuery(clauses, 1, true);
  VerifyingCollector payloadCollector = new VerifyingCollector();
  SpanWeight weight = nearQuery.createWeight(leafSearcher, ScoreMode.COMPLETE_NO_SCORES, 1f);
  Spans spans = weight.getSpans(leafSearcher.getIndexReader().leaves().get(0),
      SpanWeight.Postings.PAYLOADS);

  TopDocs topDocs = leafSearcher.search(nearQuery, 1);
  Set<String> seenPayloads = new HashSet<>();
  // The spans are only walked when the search produced at least one hit.
  for (int hit = 0; hit < topDocs.scoreDocs.length; hit++) {
    for (int doc = spans.nextDoc(); doc != Spans.NO_MORE_DOCS; doc = spans.nextDoc()) {
      for (int start = spans.nextStartPosition();
           start != Spans.NO_MORE_POSITIONS;
           start = spans.nextStartPosition()) {
        payloadCollector.reset();
        spans.collect(payloadCollector);
        for (final BytesRef payload : payloadCollector.payloads) {
          seenPayloads.add(Term.toString(payload));
        }
      }
    }
  }
  assertEquals(2, seenPayloads.size());
  assertTrue(seenPayloads.contains("a:Noise:10"));
  assertTrue(seenPayloads.contains("k:Noise:11"));
  indexReader.close();
  dir.close();
}
 
Example 4
Source File: TestPayloadSpans.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Indexes the single document "a b a d k f a h i k a k", runs an in-order
 * {@code SpanNearQuery} for "a" followed by "k" with slop 0, and asserts that the payloads
 * collected from the matching spans are exactly "a:Noise:10" and "k:Noise:11".
 *
 * @throws IOException if indexing or searching fails
 */
public void testShrinkToAfterShortestMatch2() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter indexWriter =
      new RandomIndexWriter(random(), dir, newIndexWriterConfig(new TestPayloadAnalyzer()));

  Document document = new Document();
  document.add(new TextField("content", new StringReader("a b a d k f a h i k a k")));
  indexWriter.addDocument(document);
  // Obtain the reader (and a searcher on its only leaf) before the writer is closed.
  IndexReader indexReader = indexWriter.getReader();
  IndexSearcher leafSearcher = newSearcher(getOnlyLeafReader(indexReader), false);
  indexWriter.close();

  SpanQuery[] clauses = {
      new SpanTermQuery(new Term("content", "a")),
      new SpanTermQuery(new Term("content", "k"))
  };
  SpanNearQuery nearQuery = new SpanNearQuery(clauses, 0, true);
  VerifyingCollector payloadCollector = new VerifyingCollector();
  SpanWeight weight = nearQuery.createWeight(leafSearcher, ScoreMode.COMPLETE_NO_SCORES, 1f);
  Spans spans = weight.getSpans(leafSearcher.getIndexReader().leaves().get(0),
      SpanWeight.Postings.PAYLOADS);

  TopDocs topDocs = leafSearcher.search(nearQuery, 1);
  Set<String> seenPayloads = new HashSet<>();
  // The spans are only walked when the search produced at least one hit.
  for (int hit = 0; hit < topDocs.scoreDocs.length; hit++) {
    for (int doc = spans.nextDoc(); doc != Spans.NO_MORE_DOCS; doc = spans.nextDoc()) {
      for (int start = spans.nextStartPosition();
           start != Spans.NO_MORE_POSITIONS;
           start = spans.nextStartPosition()) {
        payloadCollector.reset();
        spans.collect(payloadCollector);
        for (final BytesRef payload : payloadCollector.payloads) {
          seenPayloads.add(Term.toString(payload));
        }
      }
    }
  }
  assertEquals(2, seenPayloads.size());
  assertTrue(seenPayloads.contains("a:Noise:10"));
  assertTrue(seenPayloads.contains("k:Noise:11"));
  indexReader.close();
  dir.close();
}
 
Example 5
Source File: TestPayloadTermQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Searches {@code PayloadHelper.MULTI_FIELD} for "seventy" using a max-payload score query and
 * checks three things: exactly 100 hits are returned; hits whose doc id is a multiple of 10
 * score 4.0 (there must be exactly 10 of them) while all other hits score 2; and the spans of
 * the first index leaf expose 200 positions in total.
 *
 * @throws Exception if searching or span iteration fails
 */
public void testMultipleMatchesPerDoc() throws Exception {
  SpanQuery query = new PayloadScoreQuery(
      new SpanTermQuery(new Term(PayloadHelper.MULTI_FIELD, "seventy")),
      new MaxPayloadFunction(), PayloadDecoder.FLOAT_DECODER);
  TopDocs hits = searcher.search(query, 100);
  assertTrue("hits is null and it shouldn't be", hits != null);
  assertTrue("hits Size: " + hits.totalHits.value + " is not: " + 100, hits.totalHits.value == 100);

  // Hits on doc ids divisible by 10 are expected to score 4, every other hit 2.
  int multiplesOfTen = 0;
  for (ScoreDoc hit : hits.scoreDocs) {
    if (hit.doc % 10 != 0) {
      assertTrue(hit.score + " does not equal: " + 2, hit.score == 2);
      continue;
    }
    multiplesOfTen++;
    assertTrue(hit.score + " does not equal: " + 4.0, hit.score == 4.0);
  }
  assertTrue(multiplesOfTen + " does not equal: " + 10, multiplesOfTen == 10);
  CheckHits.checkExplanations(query, "field", searcher, true);

  SpanWeight weight = query.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1f);
  Spans spans = weight.getSpans(searcher.getIndexReader().leaves().get(0),
      SpanWeight.Postings.POSITIONS);
  assertTrue("spans is null and it shouldn't be", spans != null);

  // Count every position of every document; the test expects 200 in total.
  int positionCount = 0;
  for (int doc = spans.nextDoc(); doc != Spans.NO_MORE_DOCS; doc = spans.nextDoc()) {
    for (int start = spans.nextStartPosition();
         start != Spans.NO_MORE_POSITIONS;
         start = spans.nextStartPosition()) {
      positionCount++;
    }
  }
  assertTrue(positionCount + " does not equal: " + 200, positionCount == 200);
}
 
Example 6
Source File: MtasDocumentIndex.java    From inception with Apache License 2.0 4 votes vote down vote up
/**
 * Counts the span matches of the given query across all index segments, restricted to the
 * documents that are relevant for the requesting user.
 * <p>
 * A document's matches are skipped when:
 * <ul>
 * <li>the indexed document lacks a source or annotation document id,</li>
 * <li>the request is limited to a document other than this one,</li>
 * <li>it is a source document (annotation document id {@code -1}) that is listed among the
 * annotatable documents returned by {@code listAnnotatableDocuments}, or</li>
 * <li>it is an annotation document whose user differs from the requesting user.</li>
 * </ul>
 *
 * @param searcher
 *            the searcher whose index reader's leaves are scanned
 * @param aRequest
 *            the request supplying project, user and the optional document restriction
 * @param q
 *            the span query to count matches for
 * @return the number of matching span positions, or {@code -1} if processing any segment
 *         failed
 * @throws IOException
 *             if rewriting the query or creating its weight fails
 */
private long doCountResults(IndexSearcher searcher,
    SearchQueryRequest aRequest, MtasSpanQuery q) throws IOException
{
    ListIterator<LeafReaderContext> leafReaderContextIterator = searcher.getIndexReader()
            .leaves().listIterator();

    Map<Long, Long> annotatableDocuments = listAnnotatableDocuments(aRequest.getProject(),
        aRequest.getUser());

    final float boost = 0;
    SpanWeight spanweight = q.rewrite(searcher.getIndexReader()).createWeight(searcher, false,
            boost);

    long numResults = 0;

    while (leafReaderContextIterator.hasNext()) {
        LeafReaderContext leafReaderContext = leafReaderContextIterator.next();
        try {
            Spans spans = spanweight.getSpans(leafReaderContext, SpanWeight.Postings.POSITIONS);
            SegmentReader segmentReader = (SegmentReader) leafReaderContext.reader();
            if (spans != null) {
                while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
                    // Live-docs are only consulted when the segment's numDocs and maxDoc
                    // differ; otherwise every document is processed.
                    if (segmentReader.numDocs() == segmentReader.maxDoc()
                            || segmentReader.getLiveDocs().get(spans.docID())) {
                        Document document = segmentReader.document(spans.docID());

                        // Retrieve user
                        String user = document.get(FIELD_USER);

                        // Retrieve source and annotation document ids
                        String rawSourceDocumentId = document.get(FIELD_SOURCE_DOCUMENT_ID);
                        String rawAnnotationDocumentId = document
                                .get(FIELD_ANNOTATION_DOCUMENT_ID);
                        if (rawSourceDocumentId == null || rawAnnotationDocumentId == null) {
                            log.trace("Indexed document lacks source/annotation document IDs"
                                    + " - source: {}, annotation: {}", rawSourceDocumentId,
                                rawAnnotationDocumentId);
                            continue;

                        }
                        // parseLong yields the primitive directly instead of boxing and
                        // immediately unboxing as Long.valueOf would.
                        long sourceDocumentId = Long.parseLong(rawSourceDocumentId);
                        long annotationDocumentId = Long.parseLong(rawAnnotationDocumentId);

                        // If the query is limited to a given document, skip any results
                        // which are not in the given document
                        Optional<SourceDocument> limitedToDocument = aRequest
                                .getLimitedToDocument();
                        if (limitedToDocument.isPresent() && !Objects
                            .equals(limitedToDocument.get().getId(), sourceDocumentId)) {
                            log.trace("Query limited to document {}, skipping results for "
                                    + "document {}", limitedToDocument.get().getId(),
                                sourceDocumentId);
                            continue;
                        }

                        if (annotatableDocuments.containsKey(sourceDocumentId)
                            && annotationDocumentId == -1) {
                            // Exclude result if the retrieved document is a source document
                            // (that is, has annotationDocument = -1) AND it has a
                            // corresponding annotation document for this user
                            log.trace("Skipping results from indexed source document {} in "
                                + "favor of results from the corresponding annotation "
                                + "document", sourceDocumentId);
                            continue;
                        }
                        else if (annotationDocumentId != -1 && !aRequest.getUser().getUsername()
                            .equals(user)) {
                            // Exclude result if the retrieved document is an annotation
                            // document (that is, annotationDocument != -1) and its username
                            // is different from the querying user
                            log.trace("Skipping results from annotation document for user {} "
                                    + "which does not match the requested user {}", user,
                                aRequest.getUser().getUsername());
                            continue;
                        }

                        // Every start position within an accepted document is one result.
                        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
                            numResults++;
                        }
                    }
                }
            }
        }
        catch (Exception e) {
            log.error("Unable to process query results", e);
            // Stop at the first failure: if scanning continued, later segments would
            // increment the -1 sentinel and the caller could no longer tell an error
            // from a (wrong) partial count.
            return -1;
        }
    }
    return numResults;
}
 
Example 7
Source File: MtasUimaParserLuceneTest.java    From inception with Apache License 2.0 4 votes vote down vote up
/**
 * Runs the given MTAS span query against every leaf of {@code indexReader} and prints, for
 * each match position of each processed document, the prefix-filtered tokens and the
 * positioned terms covering the match range to standard output.
 *
 * @param indexReader
 *            the reader whose leaves are searched
 * @param field
 *            the indexed field to read terms and MTAS codec information from
 * @param q
 *            the span query to execute
 * @param prefixes
 *            the MTAS prefixes used to filter the reported tokens and terms
 * @throws IOException
 *             if reading from the index fails
 */
private static void doQuery(IndexReader indexReader, String field, MtasSpanQuery q,
        List<String> prefixes)
    throws IOException
{
    List<LeafReaderContext> leaves = indexReader.leaves();
    IndexSearcher searcher = new IndexSearcher(indexReader);
    final float boost = 0;
    SpanWeight spanweight = q.rewrite(indexReader).createWeight(searcher, false, boost);

    for (LeafReaderContext lrc : leaves) {
        System.out.println("#### new iteration ####");
        Spans spans = spanweight.getSpans(lrc, SpanWeight.Postings.POSITIONS);
        SegmentReader segmentReader = (SegmentReader) lrc.reader();
        CodecInfo mtasCodecInfo = CodecInfo.getCodecInfoFromTerms(segmentReader.terms(field));
        if (spans == null) {
            // Nothing to report for this leaf.
            continue;
        }

        while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
            // Live-docs are only consulted when numDocs and maxDoc differ.
            boolean processDocument = segmentReader.numDocs() == segmentReader.maxDoc()
                    || segmentReader.getLiveDocs().get(spans.docID());
            if (!processDocument) {
                continue;
            }

            String idValue = segmentReader.document(spans.docID()).getField(FIELD_ID)
                    .stringValue();
            System.out.println("********  New doc " + spans.docID() + "-" + idValue);

            while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
                // The end position is decremented before being passed to the MTAS lookups.
                int firstPosition = spans.startPosition();
                int lastPosition = spans.endPosition() - 1;
                int reportedDocId = lrc.docBase + spans.docID();

                System.out.println("------");
                List<MtasTokenString> tokens = mtasCodecInfo
                        .getPrefixFilteredObjectsByPositions(field, spans.docID(), prefixes,
                                firstPosition, lastPosition);
                for (MtasTokenString token : tokens) {
                    String positionSuffix = Objects.equals(token.getPositionEnd(),
                            token.getPositionStart()) ? "" : "-" + token.getPositionEnd();
                    String postfixSuffix = token.getPostfix() == null ? ""
                            : ":" + token.getPostfix();

                    System.out.print("docId: " + reportedDocId + ", ");
                    System.out.print(" position: " + token.getPositionStart()
                            + positionSuffix);
                    System.out.print(" offset: " + token.getOffsetStart() + "-"
                            + token.getOffsetEnd());
                    System.out.print(" mtasId: " + token.getId());
                    System.out.println(" " + token.getPrefix() + postfixSuffix + ", ");
                }

                System.out.println("------");
                List<MtasTreeHit<String>> hits = mtasCodecInfo
                        .getPositionedTermsByPrefixesAndPositionRange(field, spans.docID(),
                                prefixes, firstPosition, lastPosition);
                for (MtasTreeHit<String> hit : hits) {
                    String positionSuffix = hit.endPosition == hit.startPosition ? ""
                            : "-" + hit.endPosition;
                    String valueSuffix = CodecUtil.termValue(hit.data) == null ? ""
                            : ":" + CodecUtil.termValue(hit.data);

                    System.out.print("docId: " + reportedDocId + ", ");
                    System.out.print("position: " + hit.startPosition + positionSuffix);
                    System.out.println(" " + CodecUtil.termPrefix(hit.data) + valueSuffix
                            + ", ");
                }
            }
        }
    }
}
 
Example 8
Source File: TestPositionIncrement.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Indexes the single document "a a b c d e a f g h i j a b k k" with a
 * {@code MockPayloadAnalyzer} and checks three things: the postings of "a" report frequency 4
 * at positions 0, 1, 3 and 6 in exactly one document; an unordered {@code SpanNearQuery} for
 * "a" and "k" with slop 30 yields 8 payloads in total and at least one span starting at
 * position 0 when payloads are requested; and the same query yields 4 spans, at least one
 * starting at position 0, when only positions are requested.
 *
 * @throws Exception if indexing or searching fails
 */
public void testPayloadsPos0() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockPayloadAnalyzer());
  Document document = new Document();
  document.add(new TextField("content",
      new StringReader("a a b c d e a f g h i j a b k k")));
  writer.addDocument(document);

  final IndexReader readerFromWriter = writer.getReader();
  LeafReader leaf = getOnlyLeafReader(readerFromWriter);

  PostingsEnum postings = leaf.postings(new Term("content", "a"), PostingsEnum.ALL);

  assertTrue(postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  // The postings for "a" must report four occurrences, at these positions and in this order.
  assertEquals(4, postings.freq());
  final int[] expectedPositions = { 0, 1, 3, 6 };
  for (int expectedPosition : expectedPositions) {
    assertEquals(expectedPosition, postings.nextPosition());
  }

  // No second document may contain "a".
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());

  IndexSearcher is = newSearcher(getOnlyLeafReader(readerFromWriter));

  SpanQuery[] clauses = {
      new SpanTermQuery(new Term("content", "a")),
      new SpanTermQuery(new Term("content", "k"))
  };
  SpanNearQuery snq = new SpanNearQuery(clauses, 30, false);

  int count = 0;
  boolean sawZero = false;
  if (VERBOSE) {
    System.out.println("\ngetPayloadSpans test");
  }
  PayloadSpanCollector collector = new PayloadSpanCollector();
  SpanWeight payloadWeight = snq.createWeight(is, ScoreMode.COMPLETE_NO_SCORES, 1f);
  Spans pspans = payloadWeight.getSpans(is.getIndexReader().leaves().get(0),
      SpanWeight.Postings.PAYLOADS);
  for (int doc = pspans.nextDoc(); doc != Spans.NO_MORE_DOCS; doc = pspans.nextDoc()) {
    for (int start = pspans.nextStartPosition();
         start != Spans.NO_MORE_POSITIONS;
         start = pspans.nextStartPosition()) {
      if (VERBOSE) {
        System.out.println("doc " + pspans.docID() + ": span " + pspans.startPosition()
            + " to " + pspans.endPosition());
      }
      collector.reset();
      pspans.collect(collector);
      if (pspans.startPosition() == 0) {
        sawZero = true;
      }
      // Count every payload collected for this span.
      for (BytesRef payload : collector.payloads) {
        count++;
        if (VERBOSE) {
          System.out.println("  payload: " + Term.toString(payload));
        }
      }
    }
  }
  assertTrue(sawZero);
  assertEquals(8, count);

  // Second pass: the same query with positions only, counting spans instead of payloads.
  SpanWeight positionWeight = snq.createWeight(is, ScoreMode.COMPLETE_NO_SCORES, 1f);
  Spans spans = positionWeight.getSpans(is.getIndexReader().leaves().get(0),
      SpanWeight.Postings.POSITIONS);
  count = 0;
  sawZero = false;
  for (int doc = spans.nextDoc(); doc != Spans.NO_MORE_DOCS; doc = spans.nextDoc()) {
    for (int start = spans.nextStartPosition();
         start != Spans.NO_MORE_POSITIONS;
         start = spans.nextStartPosition()) {
      count++;
      if (spans.startPosition() == 0) {
        sawZero = true;
      }
    }
  }
  assertEquals(4, count);
  assertTrue(sawZero);

  writer.close();
  is.getIndexReader().close();
  dir.close();
}