Java Code Examples for org.apache.lucene.index.DirectoryReader#numDocs()

The following examples show how to use org.apache.lucene.index.DirectoryReader#numDocs(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
/**
 * Verifies shard routing: for each shard i, every document stored in that
 * shard's index must have an "id" whose hash partition equals i.
 *
 * @param totalShardCount number of shard directories written under {@code path}
 * @throws IOException if a shard index cannot be opened or read
 */
private void assertData(int totalShardCount) throws IOException {
  Partitioner<IntWritable, IntWritable> partitioner = new HashPartitioner<IntWritable, IntWritable>();
  for (int i = 0; i < totalShardCount; i++) {
    HdfsDirectory directory = new HdfsDirectory(configuration, new Path(path, ShardUtil.getShardName(i)));
    // try-with-resources: the original leaked the reader if document() or
    // the assertion threw before the explicit close().
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
      int numDocs = reader.numDocs();
      for (int d = 0; d < numDocs; d++) {
        Document document = reader.document(d);
        IndexableField field = document.getField("id");
        Integer id = (Integer) field.numericValue();
        int partition = partitioner.getPartition(new IntWritable(id), null, totalShardCount);
        assertEquals(i, partition);
      }
    }
  }
}
 
Example 2
Source Project: dremio-oss   File: LuceneSearchIndex.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns the number of live (non-deleted) documents visible through a
 * freshly acquired searcher; the searcher is released when the try block exits.
 */
public int getLiveRecords() {
  checkIfChanged();
  try (Searcher searcher = acquireSearcher()) {
    final DirectoryReader directoryReader =
        (DirectoryReader) searcher.searcher.getIndexReader();
    return directoryReader.numDocs();
  }
}
 
Example 3
/**
 * Takes the categories from the given taxonomy directory, and adds the
 * missing ones to this taxonomy. Additionally, it fills the given
 * {@link OrdinalMap} with a mapping from the original ordinal to the new
 * ordinal.
 *
 * @param taxoDir directory holding the source taxonomy index
 * @param map receives old-ordinal to new-ordinal mappings; sized and
 *        finalized ({@code addDone()}) by this method
 * @throws IOException if the source taxonomy cannot be read
 */
public void addTaxonomy(Directory taxoDir, OrdinalMap map) throws IOException {
  ensureOpen();
  // try-with-resources replaces the original try/finally close.
  try (DirectoryReader r = DirectoryReader.open(taxoDir)) {
    final int size = r.numDocs();
    final OrdinalMap ordinalMap = map;
    ordinalMap.setSize(size);
    int base = 0;
    PostingsEnum docs = null;
    for (final LeafReaderContext ctx : r.leaves()) {
      final LeafReader ar = ctx.reader();
      final Terms terms = ar.terms(Consts.FULL);
      if (terms == null) {
        // Segment has no category field (terms() returns null in that
        // case); the original would have thrown NPE on iterator().
        base += ar.maxDoc();
        continue;
      }
      // TODO: share per-segment TermsEnum here!
      TermsEnum te = terms.iterator();
      while (te.next() != null) {
        FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(te.term().utf8ToString()));
        final int ordinal = addCategory(cp);
        docs = te.postings(docs, PostingsEnum.NONE);
        // Each category term maps to exactly one doc in a taxonomy index.
        ordinalMap.addMapping(docs.nextDoc() + base, ordinal);
      }
      base += ar.maxDoc(); // no deletions, so we're ok
    }
    ordinalMap.addDone();
  }
}
 
Example 4
/**
 * Indexes a multiple-of-3 number of docs, splits the index three ways with
 * the given split method, and asserts each split received exactly a third.
 *
 * @param splitMethod the split strategy under test
 * @throws Exception if indexing, splitting, or verification fails
 */
private void doTestSplitAlternately(SolrIndexSplitter.SplitMethod splitMethod) throws Exception {
  LocalSolrQueryRequest request = null;
  try {
    // add a number of docs divisible by 3 so the three-way split is exact
    int max = (1 + random().nextInt(10)) * 3;
    log.info("Adding {} number of documents", max);
    for (int i = 0; i < max; i++) {
      assertU(adoc("id", String.valueOf(i)));
    }
    assertU(commit());

    request = lrf.makeRequest("q", "dummy");
    SolrQueryResponse rsp = new SolrQueryResponse();
    SplitIndexCommand command = new SplitIndexCommand(request, rsp,
        Lists.newArrayList(indexDir1.getAbsolutePath(), indexDir2.getAbsolutePath(), indexDir3.getAbsolutePath()),
        null, null, new PlainIdRouter(), null, null, splitMethod);
    doSplit(command);

    // The helper closes the reader and releases the directory even on
    // failure; the original leaked both if open()/numDocs() threw.
    int numDocs1 = countDocsInSplitIndex(indexDir1.getAbsolutePath());
    int numDocs2 = countDocsInSplitIndex(indexDir2.getAbsolutePath());
    int numDocs3 = countDocsInSplitIndex(indexDir3.getAbsolutePath());
    assertEquals("split indexes lost some documents!", max, numDocs1 + numDocs2 + numDocs3);
    assertEquals("split index1 has wrong number of documents", max / 3, numDocs1);
    assertEquals("split index2 has wrong number of documents", max / 3, numDocs2);
    assertEquals("split index3 has wrong number of documents", max / 3, numDocs3);
  } finally {
    if (request != null) request.close(); // decrefs the searcher
  }
}

/**
 * Opens the index at {@code indexDirPath}, returns its live doc count, and
 * guarantees the reader is closed and the directory released on every path.
 */
private int countDocsInSplitIndex(String indexDirPath) throws IOException {
  Directory directory = h.getCore().getDirectoryFactory().get(indexDirPath,
      DirectoryFactory.DirContext.DEFAULT, h.getCore().getSolrConfig().indexConfig.lockType);
  try (DirectoryReader reader = DirectoryReader.open(directory)) {
    return reader.numDocs();
  } finally {
    h.getCore().getDirectoryFactory().release(directory);
  }
}
 
Example 5
/**
 * Benchmarks stored-field reads of one large document under the default
 * Blur024Codec versus its high-compression variant (both indexes live in the
 * same RAMDirectory), then prints the per-codec .fdt file sizes.
 */
@Test
public void testLargeDocs() throws IOException {
  Random random = new Random();
  Iterable<? extends IndexableField> doc = getLargeDoc(random);
  RAMDirectory directory = new RAMDirectory();
  IndexWriterConfig conf1 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
  conf1.setCodec(new Blur024Codec());
  IndexWriter writer1 = new IndexWriter(directory, conf1);
  writer1.addDocument(doc);
  writer1.close();

  // reader1 is deliberately opened before the second write so it only
  // sees the first (default-codec) document.
  DirectoryReader reader1 = DirectoryReader.open(directory);
  try {
    assertEquals(1, reader1.numDocs());

    IndexWriterConfig conf2 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
    conf2.setCodec(new Blur024Codec(1 << 16, CompressionMode.HIGH_COMPRESSION));
    IndexWriter writer2 = new IndexWriter(directory, conf2);
    writer2.addDocument(doc);
    writer2.close();

    DirectoryReader reader2 = DirectoryReader.open(directory);
    try {
      assertEquals(2, reader2.numDocs());

      // Two passes: the second read is warm, showing cache effects.
      for (int i = 0; i < 2; i++) {
        long t1 = System.nanoTime();
        Document document1 = reader1.document(0);
        long t2 = System.nanoTime();
        Document document2 = reader2.document(1);
        long t3 = System.nanoTime();

        System.out.println((t3 - t2) / 1000000.0);
        System.out.println((t2 - t1) / 1000000.0);

        System.out.println("doc1 " + document1.hashCode());
        System.out.println("doc2 " + document2.hashCode());
      }

      // Report stored-fields (.fdt) file sizes for a compression comparison.
      for (String name : directory.listAll()) {
        if (name.endsWith(".fdt")) {
          System.out.println(name);
          System.out.println(directory.fileLength(name));
        }
      }
    } finally {
      reader2.close(); // the original never closed either reader
    }
  } finally {
    reader1.close();
  }
}
 
Example 6
/**
 * Benchmarks stored-field reads of 1000 small documents under the default
 * Blur024Codec versus its high-compression variant (same RAMDirectory, same
 * seeded random content), then prints the per-codec .fdt file sizes.
 */
@Test
public void testSmallDocs() throws IOException {

  RAMDirectory directory = new RAMDirectory();
  IndexWriterConfig conf1 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
  conf1.setCodec(new Blur024Codec());
  Random random1 = new Random(1);
  IndexWriter writer1 = new IndexWriter(directory, conf1);
  for (int i = 0; i < 1000; i++) {
    writer1.addDocument(getSmallDoc(random1));
  }
  writer1.close();

  // reader1 is opened before the second batch so it sees only docs 0-999.
  DirectoryReader reader1 = DirectoryReader.open(directory);
  try {
    assertEquals(1000, reader1.numDocs());

    IndexWriterConfig conf2 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
    conf2.setCodec(new Blur024Codec(1 << 16, CompressionMode.HIGH_COMPRESSION));
    // Same seed (1) so both codecs index identical document content.
    Random random2 = new Random(1);
    IndexWriter writer2 = new IndexWriter(directory, conf2);
    for (int i = 0; i < 1000; i++) {
      writer2.addDocument(getSmallDoc(random2));
    }
    writer2.close();

    DirectoryReader reader2 = DirectoryReader.open(directory);
    try {
      assertEquals(2000, reader2.numDocs());

      // Two passes: the second read is warm, showing cache effects.
      for (int i = 0; i < 2; i++) {
        long t1 = System.nanoTime();
        long hash1 = 0;
        long hash2 = 0;
        for (int d = 0; d < 1000; d++) {
          Document document1 = reader1.document(d);
          hash1 += document1.hashCode();
        }
        long t2 = System.nanoTime();
        // Docs 1000-1999 are the high-compression batch.
        for (int d = 0; d < 1000; d++) {
          Document document2 = reader2.document(d + 1000);
          hash2 += document2.hashCode();
        }
        long t3 = System.nanoTime();

        System.out.println((t3 - t2) / 1000000.0);
        System.out.println((t2 - t1) / 1000000.0);

        System.out.println("doc1 " + hash1);
        System.out.println("doc2 " + hash2);
      }

      // Report stored-fields (.fdt) file sizes for a compression comparison.
      for (String name : directory.listAll()) {
        if (name.endsWith(".fdt")) {
          System.out.println(name);
          System.out.println(directory.fileLength(name));
        }
      }
    } finally {
      reader2.close(); // the original never closed either reader
    }
  } finally {
    reader1.close();
  }
}
 
Example 7
/**
 * Benchmarks stored-field reads of one large document under the default
 * Blur022Codec versus its high-compression variant (both indexes live in the
 * same RAMDirectory), then prints the per-codec .fdt file sizes.
 */
@Test
public void testLargeDocs() throws IOException {
  Random random = new Random();
  Iterable<? extends IndexableField> doc = getLargeDoc(random);
  RAMDirectory directory = new RAMDirectory();
  IndexWriterConfig conf1 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
  conf1.setCodec(new Blur022Codec());
  IndexWriter writer1 = new IndexWriter(directory, conf1);
  writer1.addDocument(doc);
  writer1.close();

  // reader1 is deliberately opened before the second write so it only
  // sees the first (default-codec) document.
  DirectoryReader reader1 = DirectoryReader.open(directory);
  try {
    assertEquals(1, reader1.numDocs());

    IndexWriterConfig conf2 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
    conf2.setCodec(new Blur022Codec(1 << 16, CompressionMode.HIGH_COMPRESSION));
    IndexWriter writer2 = new IndexWriter(directory, conf2);
    writer2.addDocument(doc);
    writer2.close();

    DirectoryReader reader2 = DirectoryReader.open(directory);
    try {
      assertEquals(2, reader2.numDocs());

      // Two passes: the second read is warm, showing cache effects.
      for (int i = 0; i < 2; i++) {
        long t1 = System.nanoTime();
        Document document1 = reader1.document(0);
        long t2 = System.nanoTime();
        Document document2 = reader2.document(1);
        long t3 = System.nanoTime();

        System.out.println((t3 - t2) / 1000000.0);
        System.out.println((t2 - t1) / 1000000.0);

        System.out.println("doc1 " + document1.hashCode());
        System.out.println("doc2 " + document2.hashCode());
      }

      // Report stored-fields (.fdt) file sizes for a compression comparison.
      for (String name : directory.listAll()) {
        if (name.endsWith(".fdt")) {
          System.out.println(name);
          System.out.println(directory.fileLength(name));
        }
      }
    } finally {
      reader2.close(); // the original never closed either reader
    }
  } finally {
    reader1.close();
  }
}
 
Example 8
/**
 * Benchmarks stored-field reads of 1000 small documents under the default
 * Blur022Codec versus its high-compression variant (same RAMDirectory, same
 * seeded random content), then prints the per-codec .fdt file sizes.
 */
@Test
public void testSmallDocs() throws IOException {

  RAMDirectory directory = new RAMDirectory();
  IndexWriterConfig conf1 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
  conf1.setCodec(new Blur022Codec());
  Random random1 = new Random(1);
  IndexWriter writer1 = new IndexWriter(directory, conf1);
  for (int i = 0; i < 1000; i++) {
    writer1.addDocument(getSmallDoc(random1));
  }
  writer1.close();

  // reader1 is opened before the second batch so it sees only docs 0-999.
  DirectoryReader reader1 = DirectoryReader.open(directory);
  try {
    assertEquals(1000, reader1.numDocs());

    IndexWriterConfig conf2 = new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43));
    conf2.setCodec(new Blur022Codec(1 << 16, CompressionMode.HIGH_COMPRESSION));
    // Same seed (1) so both codecs index identical document content.
    Random random2 = new Random(1);
    IndexWriter writer2 = new IndexWriter(directory, conf2);
    for (int i = 0; i < 1000; i++) {
      writer2.addDocument(getSmallDoc(random2));
    }
    writer2.close();

    DirectoryReader reader2 = DirectoryReader.open(directory);
    try {
      assertEquals(2000, reader2.numDocs());

      // Two passes: the second read is warm, showing cache effects.
      for (int i = 0; i < 2; i++) {
        long t1 = System.nanoTime();
        long hash1 = 0;
        long hash2 = 0;
        for (int d = 0; d < 1000; d++) {
          Document document1 = reader1.document(d);
          hash1 += document1.hashCode();
        }
        long t2 = System.nanoTime();
        // Docs 1000-1999 are the high-compression batch.
        for (int d = 0; d < 1000; d++) {
          Document document2 = reader2.document(d + 1000);
          hash2 += document2.hashCode();
        }
        long t3 = System.nanoTime();

        System.out.println((t3 - t2) / 1000000.0);
        System.out.println((t2 - t1) / 1000000.0);

        System.out.println("doc1 " + hash1);
        System.out.println("doc2 " + hash2);
      }

      // Report stored-fields (.fdt) file sizes for a compression comparison.
      for (String name : directory.listAll()) {
        if (name.endsWith(".fdt")) {
          System.out.println(name);
          System.out.println(directory.fileLength(name));
        }
      }
    } finally {
      reader2.close(); // the original never closed either reader
    }
  } finally {
    reader1.close();
  }
}
 
Example 9
/**
 * End-to-end MapReduce test: indexes 80,000 records (75,000 + 5,000) into a
 * 2-shard table using a heap-size-bounded document buffer, then verifies the
 * total live-doc count across the committed shard indexes.
 */
@Test
public void testBlurOutputFormatOverFlowMultipleReducersTest() throws IOException, InterruptedException,
    ClassNotFoundException {
  Path input = getInDir();
  Path output = getOutDir();
  _fileSystem.delete(input, true);
  _fileSystem.delete(output, true);
  // 1500 * 50 = 75,000
  writeRecordsFile(new Path(input, "part1"), 1, 50, 1, 1500, "cf1");
  // 100 * 50 = 5,000
  writeRecordsFile(new Path(input, "part2"), 1, 50, 2000, 100, "cf1");

  Job job = Job.getInstance(_conf, "blur index");
  job.setJarByClass(BlurOutputFormatTest.class);
  job.setMapperClass(CsvBlurMapper.class);
  job.setInputFormatClass(TextInputFormat.class);

  FileInputFormat.addInputPath(job, input);
  CsvBlurMapper.addColumns(job, "cf1", "col");

  Path tablePath = new Path(new Path(_root, "table"), "test");

  TableDescriptor tableDescriptor = new TableDescriptor();
  tableDescriptor.setShardCount(2);
  tableDescriptor.setTableUri(tablePath.toString());
  tableDescriptor.setName("test");

  createShardDirectories(output, 2);

  BlurOutputFormat.setupJob(job, tableDescriptor);
  BlurOutputFormat.setOutputPath(job, output);
  BlurOutputFormat.setIndexLocally(job, false);
  BlurOutputFormat.setDocumentBufferStrategy(job, DocumentBufferStrategyHeapSize.class);
  BlurOutputFormat.setMaxDocumentBufferHeapSize(job, 128 * 1024);

  assertTrue(job.waitForCompletion(true));
  Counters ctrs = job.getCounters();
  System.out.println("Counters: " + ctrs);

  long total = 0;
  for (int i = 0; i < tableDescriptor.getShardCount(); i++) {
    Path path = new Path(output, ShardUtil.getShardName(i));
    Collection<Path> commitedTasks = getCommitedTasks(path);
    assertEquals(1, commitedTasks.size());

    // try-with-resources: the original leaked the reader if numDocs() threw.
    try (DirectoryReader reader = DirectoryReader.open(new HdfsDirectory(_conf, commitedTasks.iterator().next()))) {
      total += reader.numDocs();
    }
  }
  assertEquals(80000, total);

}
 
Example 10
/**
 * End-to-end MapReduce test: indexes 80,000 records (75,000 + 5,000) into a
 * 7-shard table with a reducer multiplier of 2 (so each shard has multiple
 * committed task indexes), then verifies the total live-doc count.
 */
@Test
public void testBlurOutputFormatOverFlowMultipleReducersWithReduceMultiplierTest() throws IOException,
    InterruptedException, ClassNotFoundException {
  Path input = getInDir();
  Path output = getOutDir();
  _fileSystem.delete(input, true);
  _fileSystem.delete(output, true);

  // 1500 * 50 = 75,000
  writeRecordsFile(new Path(input, "part1"), 1, 50, 1, 1500, "cf1");
  // 100 * 50 = 5,000
  writeRecordsFile(new Path(input, "part2"), 1, 50, 2000, 100, "cf1");

  Job job = Job.getInstance(_conf, "blur index");
  job.setJarByClass(BlurOutputFormatTest.class);
  job.setMapperClass(CsvBlurMapper.class);
  job.setInputFormatClass(TextInputFormat.class);

  FileInputFormat.addInputPath(job, input);
  CsvBlurMapper.addColumns(job, "cf1", "col");

  Path tablePath = new Path(new Path(_root, "table"), "test");

  TableDescriptor tableDescriptor = new TableDescriptor();
  tableDescriptor.setShardCount(7);
  tableDescriptor.setTableUri(tablePath.toString());
  tableDescriptor.setName("test");

  createShardDirectories(output, 7);

  BlurOutputFormat.setupJob(job, tableDescriptor);
  BlurOutputFormat.setOutputPath(job, output);
  int multiple = 2;
  BlurOutputFormat.setReducerMultiplier(job, multiple);

  assertTrue(job.waitForCompletion(true));
  Counters ctrs = job.getCounters();
  System.out.println("Counters: " + ctrs);

  long total = 0;
  for (int i = 0; i < tableDescriptor.getShardCount(); i++) {
    Path path = new Path(output, ShardUtil.getShardName(i));
    Collection<Path> commitedTasks = getCommitedTasks(path);
    // With a multiplier, each shard should have at least that many task indexes.
    assertTrue(commitedTasks.size() >= multiple);
    for (Path p : commitedTasks) {
      // try-with-resources: the original leaked the reader if numDocs() threw.
      try (DirectoryReader reader = DirectoryReader.open(new HdfsDirectory(_conf, p))) {
        total += reader.numDocs();
      }
    }
  }
  assertEquals(80000, total);

}