Java Code Examples for org.apache.lucene.index.LeafReader#getLiveDocs()

The following examples show how to use org.apache.lucene.index.LeafReader#getLiveDocs(). Each example is shown with the source file it comes from, the originating project, and that project's license.
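
Before diving into the examples, it helps to know the contract of this method: getLiveDocs() returns a Bits instance with one bit per document in the segment, where a set bit marks a live (non-deleted) document, and it returns null when the segment has no deletions at all. The sketch below applies this null-check-then-get pattern to count live documents; it is an illustrative, self-contained program rather than code from any of the projects that follow, and the class name and command-line handling are assumptions.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;

public class LiveDocsCounter {

  // Counts live (non-deleted) documents, segment by segment.
  static long countLiveDocs(IndexReader reader) throws IOException {
    long live = 0;
    for (LeafReaderContext context : reader.leaves()) {
      LeafReader leafReader = context.reader();
      // null means this segment has no deletions, so every doc is live
      Bits liveDocs = leafReader.getLiveDocs();
      for (int docId = 0; docId < leafReader.maxDoc(); docId++) {
        if (liveDocs == null || liveDocs.get(docId)) {
          live++;
        }
      }
    }
    return live;
  }

  public static void main(String[] args) throws IOException {
    try (Directory dir = FSDirectory.open(Paths.get(args[0]));
         IndexReader reader = DirectoryReader.open(dir)) {
      System.out.println("live docs: " + countLiveDocs(reader));
    }
  }
}

For the whole index the same count is available directly from IndexReader#numDocs(); walking liveDocs by hand, as the examples below do, is the pattern you need when you process documents segment by segment.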
Example 1
Source File: LuceneIndex.java    From rdf4j with BSD 3-Clause "New" or "Revised" License
private static Document getDocument(LeafReader reader, Term term) throws IOException {
	PostingsEnum docs = reader.postings(term);
	if (docs != null) {
		// PostingsEnum may enumerate deleted documents, so we have to account for that
		Bits liveDocs = reader.getLiveDocs();
		int docId = docs.nextDoc();
		while (docId != PostingsEnum.NO_MORE_DOCS) {

			// if the document is deleted, skip it and continue with the next one
			if (liveDocs != null && !liveDocs.get(docId)) {
				docId = docs.nextDoc();
				continue;
			}
			// found a live document; any further live document for this term is an error
			int nextId = docs.nextDoc();
			while (nextId != PostingsEnum.NO_MORE_DOCS && liveDocs != null && !liveDocs.get(nextId)) {
				nextId = docs.nextDoc();
			}
			if (nextId != PostingsEnum.NO_MORE_DOCS) {
				throw new IllegalStateException("Multiple Documents for term " + term.text());
			}
			return readDocument(reader, docId, null);
		}
	}
	return null;
}
 
Example 2
Source File: IndexSizeEstimator.java    From lucene-solr with Apache License 2.0
private void estimateTermVectors(Map<String, Object> result) throws IOException {
  log.info("- estimating term vectors...");
  Map<String, Map<String, Object>> stats = new HashMap<>();
  for (LeafReaderContext leafReaderContext : reader.leaves()) {
    LeafReader leafReader = leafReaderContext.reader();
    Bits liveDocs = leafReader.getLiveDocs();
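    // sample every samplingStep-th document, skipping documents that are deleted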
    for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
      if (liveDocs != null && !liveDocs.get(docId)) {
        continue;
      }
      Fields termVectors = leafReader.getTermVectors(docId);
      if (termVectors == null) {
        continue;
      }
      for (String field : termVectors) {
        Terms terms = termVectors.terms(field);
        if (terms == null) {
          continue;
        }
        estimateTermStats(field, terms, stats, true);
      }
    }
  }
  result.put(TERM_VECTORS, stats);
}
 
Example 3
Source File: LukeRequestHandler.java    From lucene-solr with Apache License 2.0
private static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
  PostingsEnum postingsEnum = null;
  TermsEnum termsEnum = terms.iterator();
  BytesRef text;
  // Deal with the chance that the first bunch of terms are in deleted documents: try up to 1000 terms.
  for (int idx = 0; idx < 1000; ++idx) {
    text = termsEnum.next();
    if (text == null) { // Ran off the end of the terms enum without finding any live docs with that field in them.
      return null;
    }
    postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
    final Bits liveDocs = reader.getLiveDocs();
    if (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      // a set bit in liveDocs marks a live document; skip this term if its first document is deleted
      if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
        continue;
      }
      return reader.document(postingsEnum.docID());
    }
  }
  return null;
}
 
Example 4
Source File: LuceneBatchIterator.java    From crate with Apache License 2.0
private boolean innerMoveNext() throws IOException {
    while (tryAdvanceDocIdSetIterator()) {
        LeafReader reader = currentLeaf.reader();
        Bits liveDocs = reader.getLiveDocs();
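        // liveDocs is null when the segment has no deletions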
        int doc;
        while ((doc = currentDocIdSetIt.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            if (docDeleted(liveDocs, doc) || belowMinScore(currentScorer)) {
                continue;
            }
            onDoc(doc);
            return true;
        }
        currentDocIdSetIt = null;
    }
    clearState();
    return false;
}
 
Example 5
Source File: AlfrescoLukeRequestHandler.java    From SearchServices with GNU Lesser General Public License v3.0
protected static Document getFirstLiveDoc(Terms terms, LeafReader reader)
		throws IOException {
	TermsEnum termsEnum = terms.iterator();
	if (termsEnum.next() == null) { // Ran off the end of the terms enum without finding any live docs with that field in them.
		return null;
	}
	PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
	final Bits liveDocs = reader.getLiveDocs();
	// give up if there is no live first document: either no postings at all, or the first one is deleted
	if (postingsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS
			|| (liveDocs != null && !liveDocs.get(postingsEnum.docID()))) {
		return null;
	}
	return reader.document(postingsEnum.docID());
}
 
Example 6
Source File: LuceneIndex.java    From rdf4j with BSD 3-Clause "New" or "Revised" License
private static void addDocuments(LeafReader reader, Term term, Collection<Document> documents) throws IOException {
	PostingsEnum docs = reader.postings(term);
	if (docs != null) {
		Bits liveDocs = reader.getLiveDocs();
		int docId;
		while ((docId = docs.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
			// some of the docs may have been deleted; check for that too
			if (liveDocs != null && !liveDocs.get(docId)) {
				continue;
			}
			Document document = readDocument(reader, docId, null);
			documents.add(document);
		}
	}
}
 
Example 7
Source File: IndexSizeEstimator.java    From lucene-solr with Apache License 2.0
private void estimateStoredFields(Map<String, Object> result) throws IOException {
  log.info("- estimating stored fields...");
  Map<String, Map<String, Object>> stats = new HashMap<>();
  for (LeafReaderContext context : reader.leaves()) {
    LeafReader leafReader = context.reader();
    EstimatingVisitor visitor = new EstimatingVisitor(stats, topN, maxLength, samplingStep);
    Bits liveDocs = leafReader.getLiveDocs();
    if (leafReader instanceof CodecReader) {
      CodecReader codecReader = (CodecReader)leafReader;
      StoredFieldsReader storedFieldsReader = codecReader.getFieldsReader();
      // this instance may be faster for a full sequential pass
      StoredFieldsReader mergeInstance = storedFieldsReader.getMergeInstance();
      for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
        if (liveDocs != null && !liveDocs.get(docId)) {
          continue;
        }
        mergeInstance.visitDocument(docId, visitor);
      }
      if (mergeInstance != storedFieldsReader) {
        mergeInstance.close();
      }
    } else {
      for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
        if (liveDocs != null && !liveDocs.get(docId)) {
          continue;
        }
        leafReader.document(docId, visitor);
      }
    }
  }
  result.put(STORED_FIELDS, stats);
}
 
Example 8
Source File: CodecCollector.java    From mtas with Apache License 2.0
/**
 * Compute basic termvector numbers (document frequency and total term
 * frequency) straight from the terms enum. This shortcut is only valid
 * when the segment has no deleted documents.
 *
 * @param termsEnum
 *          the terms enum, positioned on the current term
 * @param r
 *          the leaf reader
 * @return the basic termvector numbers
 * @throws IOException
 *           if the numbers cannot be taken from the terms enum, e.g. because
 *           the segment contains deleted documents
 */
private static TermvectorNumberBasic computeTermvectorNumberBasic(
    TermsEnum termsEnum, LeafReader r) throws IOException {
  TermvectorNumberBasic result = new TermvectorNumberBasic();
  boolean hasDeletedDocuments = (r.getLiveDocs() != null);
  if (!hasDeletedDocuments) {
    result.valueSum[0] = termsEnum.totalTermFreq();
    result.docNumber = termsEnum.docFreq();
    if (result.valueSum[0] > -1) {
      return result;
    }
  }
  throw new IOException("should not call this");
}
 
Example 9
Source File: CodecCollector.java    From mtas with Apache License 2.0
/**
 * Compute basic termvector numbers (document count and frequency sum) for an
 * explicit set of documents, delegating to the enum-based variant when the
 * doc set covers all live documents of a segment without deletions.
 *
 * @param docSet
 *          the index-wide ids of the documents to count
 * @param termDocId
 *          the current document id within the postings for this term
 * @param termsEnum
 *          the terms enum, positioned on the current term
 * @param r
 *          the leaf reader
 * @param lrc
 *          the leaf reader context, used for the docBase offset
 * @param postingsEnum
 *          a reusable postings enum
 * @return the basic termvector numbers
 * @throws IOException
 *           Signals that an I/O exception has occurred.
 */
private static TermvectorNumberBasic computeTermvectorNumberBasic(
    List<Integer> docSet, int termDocId, TermsEnum termsEnum, LeafReader r,
    LeafReaderContext lrc, PostingsEnum postingsEnum) throws IOException {
  TermvectorNumberBasic result = new TermvectorNumberBasic();
  boolean hasDeletedDocuments = (r.getLiveDocs() != null);
  if ((docSet.size() == r.numDocs()) && !hasDeletedDocuments) {
    try {
      return computeTermvectorNumberBasic(termsEnum, r);
    } catch (IOException e) {
      log.debug("problem", e);
      // problem
    }
  }
  result.docNumber = 0;
  result.valueSum[0] = 0;
  int localTermDocId = termDocId;
  Iterator<Integer> docIterator = docSet.iterator();
  postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.FREQS);
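  // walk the doc set and the postings in lockstep, advancing the postings enum only when needed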
  int docId;
  while (docIterator.hasNext()) {
    docId = docIterator.next() - lrc.docBase;
    if (docId >= localTermDocId && ((docId == localTermDocId)
        || ((localTermDocId = postingsEnum.advance(docId)) == docId))) {
      result.docNumber++;
      result.valueSum[0] += postingsEnum.freq();
    }
    if (localTermDocId == DocIdSetIterator.NO_MORE_DOCS) {
      break;
    }
  }
  return result;
}
 
Example 10
Source File: TestDocTermOrdsUninvertLimit.java    From lucene-solr with Apache License 2.0
@SuppressWarnings({"ConstantConditions", "PointlessBooleanExpression"})
@Nightly
public void testTriggerUnInvertLimit() throws IOException {
  final boolean SHOULD_TRIGGER = false; // Set this to true to use the test with the old implementation

  // Ensure enough terms inside of a single UnInvert-pass-structure to trigger the limit
  final int REF_LIMIT = (int) Math.pow(2, 24); // Maximum number of references within a single pass-structure
  final int DOCS = (1<<16)-1;                  // The number of documents within a single pass (simplified)
  final int TERMS = REF_LIMIT/DOCS;            // Each document needs this many terms for the reference count to hit the limit

  // disk based Directory and IWC settings to reduce risk of OOM
  Directory dir = newFSDirectory(createTempDir("TestDocTermOrdsUninvertLimit"));
  final IndexWriter w = new IndexWriter(dir,
                                        new IndexWriterConfig(new MockAnalyzer(random()))
                                        .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
                                        .setRAMBufferSizeMB(256.0)
                                        .setMergeScheduler(new ConcurrentMergeScheduler())
                                        .setMergePolicy(newLogMergePolicy(false, 10))
                                        .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
                                        .setCodec(TestUtil.getDefaultCodec()));
  
  Document doc = new Document();
  Field field = newTextField("field", "", Field.Store.NO);
  doc.add(field);

  StringBuilder sb = new StringBuilder(TERMS*(Integer.toString(TERMS).length()+1));
  for (int i = 0 ; i < TERMS ; i++) {
    sb.append(" ").append(Integer.toString(i));
  }
  field.setStringValue(sb.toString());

  for (int i = 0 ; i < DOCS ; i++) {
    w.addDocument(doc);
  }
  //System.out.println("\n Finished adding " + DOCS + " documents of " + TERMS + " unique terms");
  w.close();
  
  final IndexReader r = DirectoryReader.open(dir);
  try {
    final LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
    TestUtil.checkReader(ar);
    final DocTermOrds dto = new DocTermOrds(ar, ar.getLiveDocs(), "field"); // bigTerms turned off
    if (SHOULD_TRIGGER) {
      fail("DocTermOrds should have failed with a \"Too many values for UnInvertedField\" message");
    }
  } catch (IllegalStateException e) {
    if (!SHOULD_TRIGGER) {
      fail("DocTermsOrd should not have failed with this implementation, but got exception " +
          e.getClass().getSimpleName() + " with message " + e.getMessage());
    }
    // This is (hopefully) "Too many values for UnInvertedField faceting on field field", so all is as expected
  } finally {
    r.close();
    dir.close();
  }
}
 
Example 11
Source File: TestDocTermOrds.java    From lucene-solr with Apache License 2.0
public void testSimple() throws Exception {
  Directory dir = newDirectory();
  final RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy()));
  Document doc = new Document();
  Field field = newTextField("field", "", Field.Store.NO);
  doc.add(field);
  field.setStringValue("a b c");
  w.addDocument(doc);

  field.setStringValue("d e f");
  w.addDocument(doc);

  field.setStringValue("a f");
  w.addDocument(doc);
  
  final IndexReader r = w.getReader();
  w.close();

  final LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
  TestUtil.checkReader(ar);
  final DocTermOrds dto = new DocTermOrds(ar, ar.getLiveDocs(), "field");
  SortedSetDocValues iter = dto.iterator(ar);
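  // three docs over terms a..f: sorted term ordinals are a=0, b=1, c=2, d=3, e=4, f=5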
  
  assertEquals(0, iter.nextDoc());
  assertEquals(0, iter.nextOrd());
  assertEquals(1, iter.nextOrd());
  assertEquals(2, iter.nextOrd());
  assertEquals(SortedSetDocValues.NO_MORE_ORDS, iter.nextOrd());
  
  assertEquals(1, iter.nextDoc());
  assertEquals(3, iter.nextOrd());
  assertEquals(4, iter.nextOrd());
  assertEquals(5, iter.nextOrd());
  assertEquals(SortedSetDocValues.NO_MORE_ORDS, iter.nextOrd());

  assertEquals(2, iter.nextDoc());
  assertEquals(0, iter.nextOrd());
  assertEquals(5, iter.nextOrd());
  assertEquals(SortedSetDocValues.NO_MORE_ORDS, iter.nextOrd());

  r.close();
  dir.close();
}
 
Example 12
Source File: IndexSizeEstimatorTest.java    From lucene-solr with Apache License 2.0
@Test
public void testEstimator() throws Exception {
  JettySolrRunner jetty = cluster.getRandomJetty(random());
  String randomCoreName = jetty.getCoreContainer().getAllCoreNames().iterator().next();
  SolrCore core = jetty.getCoreContainer().getCore(randomCoreName);
  RefCounted<SolrIndexSearcher> searcherRef = core.getSearcher();
  try {
    SolrIndexSearcher searcher = searcherRef.get();
    // limit the max length
    IndexSizeEstimator estimator = new IndexSizeEstimator(searcher.getRawReader(), 20, 50, true, true);
    IndexSizeEstimator.Estimate estimate = estimator.estimate();
    Map<String, Long> fieldsBySize = estimate.getFieldsBySize();
    assertFalse("empty fieldsBySize", fieldsBySize.isEmpty());
    assertEquals(fieldsBySize.toString(), fields.size(), fieldsBySize.size());
    fieldsBySize.forEach((k, v) -> assertTrue("unexpected size of " + k + ": " + v, v > 0));
    Map<String, Long> typesBySize = estimate.getTypesBySize();
    assertFalse("empty typesBySize", typesBySize.isEmpty());
    assertTrue("expected at least 8 types: " + typesBySize.toString(), typesBySize.size() >= 8);
    typesBySize.forEach((k, v) -> assertTrue("unexpected size of " + k + ": " + v, v > 0));
    Map<String, Object> summary = estimate.getSummary();
    assertNotNull("summary", summary);
    assertFalse("empty summary", summary.isEmpty());
    assertEquals(summary.keySet().toString(), fields.size(), summary.keySet().size());
    Map<String, Object> details = estimate.getDetails();
    assertNotNull("details", details);
    assertFalse("empty details", details.isEmpty());
    // by type
    assertEquals(details.keySet().toString(), 6, details.keySet().size());

    // check sampling
    estimator.setSamplingThreshold(searcher.getRawReader().maxDoc() / 2);
    IndexSizeEstimator.Estimate sampledEstimate = estimator.estimate();
    Map<String, Long> sampledFieldsBySize = sampledEstimate.getFieldsBySize();
    assertFalse("empty fieldsBySize", sampledFieldsBySize.isEmpty());
    // verify that the sampled values are within 50% of the original values
    fieldsBySize.forEach((field, size) -> {
      Long sampledSize = sampledFieldsBySize.get(field);
      assertNotNull("sampled size for " + field + " is missing in " + sampledFieldsBySize, sampledSize);
      double delta = (double) size * 0.5;
      assertEquals("sampled size of " + field + " is wildly off", (double)size, (double)sampledSize, delta);
    });
    // verify the reader is still usable - SOLR-13694
    IndexReader reader = searcher.getRawReader();
    for (LeafReaderContext context : reader.leaves()) {
      LeafReader leafReader = context.reader();
      assertTrue("unexpected LeafReader class: " + leafReader.getClass().getName(), leafReader instanceof CodecReader);
      Bits liveDocs = leafReader.getLiveDocs();
      CodecReader codecReader = (CodecReader) leafReader;
      StoredFieldsReader storedFieldsReader = codecReader.getFieldsReader();
      StoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
      assertNotNull(storedFieldsReader);
      for (int docId = 0; docId < leafReader.maxDoc(); docId++) {
        if (liveDocs != null && !liveDocs.get(docId)) {
          continue;
        }
        storedFieldsReader.visitDocument(docId, visitor);
      }
    }
  } finally {
    searcherRef.decref();
    core.close();
  }
}