Java Code Examples for org.apache.lucene.index.IndexWriterConfig#setRAMBufferSizeMB()

The following examples show how to use org.apache.lucene.index.IndexWriterConfig#setRAMBufferSizeMB(). You can go to the original project or source file by following the links above each example.
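
setRAMBufferSizeMB() controls how much RAM IndexWriter may use to buffer added and deleted documents before it flushes a new segment to the Directory; Lucene's default is 16 MB (IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB). A minimal usage sketch, assuming a modern (5.x+) Lucene and an illustrative index path:

Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setRAMBufferSizeMB(256.0); // flush a segment once ~256 MB of documents are buffered
try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index")); // hypothetical path
     IndexWriter writer = new IndexWriter(dir, config)) {
    Document doc = new Document();
    doc.add(new TextField("body", "hello lucene", Field.Store.YES));
    writer.addDocument(doc);
} // closing the writer commits by default (see setCommitOnClose in the examples below)
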
Example 1
Source File: WordFrequencyStore.java    From SourcererCC with GNU General Public License v3.0
public void prepareIndex() throws IOException {
    File globalWFMDir = new File(Util.GTPM_INDEX_DIR);
    if (!globalWFMDir.exists()) {
        Util.createDirs(Util.GTPM_INDEX_DIR);
    }
    KeywordAnalyzer keywordAnalyzer = new KeywordAnalyzer();
    IndexWriterConfig wfmIndexWriterConfig = new IndexWriterConfig(Version.LUCENE_46, keywordAnalyzer);
    wfmIndexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
    wfmIndexWriterConfig.setRAMBufferSizeMB(1024); // buffer up to 1 GB in memory before flushing segments

    logger.info("PREPARE INDEX");
    try {
        wfmIndexWriter = new IndexWriter(FSDirectory.open(new File(Util.GTPM_INDEX_DIR)), wfmIndexWriterConfig);
        wfmIndexWriter.commit();
        wfmIndexer = new DocumentMaker(wfmIndexWriter);
    } catch (IOException e) {
        // note: the failure is only printed; callers are not notified and wfmIndexer stays null
        e.printStackTrace();
    }
}
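
This example targets the Lucene 4.x API: IndexWriterConfig still takes a Version and FSDirectory.open() a File. On Lucene 5.0 and later the Version argument is gone and FSDirectory.open() takes a java.nio.file.Path, so the same setup, keeping the 1 GB buffer, would look roughly like this sketch:

KeywordAnalyzer keywordAnalyzer = new KeywordAnalyzer();
IndexWriterConfig wfmIndexWriterConfig = new IndexWriterConfig(keywordAnalyzer);
wfmIndexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
wfmIndexWriterConfig.setRAMBufferSizeMB(1024);
IndexWriter wfmIndexWriter =
    new IndexWriter(FSDirectory.open(Paths.get(Util.GTPM_INDEX_DIR)), wfmIndexWriterConfig);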
 
Example 2
Source File: test.java    From vscode-extension with MIT License
private IndexWriterConfig getIndexWriterConfig() {
    final IndexWriterConfig iwc = new IndexWriterConfig(engineConfig.getAnalyzer());
    iwc.setCommitOnClose(false); // we by default don't commit on close
    iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
    iwc.setIndexDeletionPolicy(combinedDeletionPolicy);
    // with tests.verbose, lucene sets this up: plumb to align with filesystem stream
    boolean verbose = false;
    try {
        verbose = Boolean.parseBoolean(System.getProperty("tests.verbose"));
    } catch (Exception ignore) {
    }
    iwc.setInfoStream(verbose ? InfoStream.getDefault() : new LoggerInfoStream(logger));
    iwc.setMergeScheduler(mergeScheduler);
    // Give us the opportunity to upgrade old segments while performing
    // background merges
    MergePolicy mergePolicy = config().getMergePolicy();
    // always configure soft-deletes field so an engine with soft-deletes disabled can open a Lucene index with soft-deletes.
    iwc.setSoftDeletesField(Lucene.SOFT_DELETES_FIELD);
    if (softDeleteEnabled) {
        mergePolicy = new RecoverySourcePruneMergePolicy(SourceFieldMapper.RECOVERY_SOURCE_NAME, softDeletesPolicy::getRetentionQuery,
            new SoftDeletesRetentionMergePolicy(Lucene.SOFT_DELETES_FIELD, softDeletesPolicy::getRetentionQuery, mergePolicy));
    }
    iwc.setMergePolicy(new ElasticsearchMergePolicy(mergePolicy));
    iwc.setSimilarity(engineConfig.getSimilarity());
    iwc.setRAMBufferSizeMB(engineConfig.getIndexingBufferSize().getMbFrac());
    iwc.setCodec(engineConfig.getCodec());
    iwc.setUseCompoundFile(true); // always use compound on flush - reduces # of file-handles on refresh
    if (config().getIndexSort() != null) {
        iwc.setIndexSort(config().getIndexSort());
    }
    return iwc;
}
 
Example 3
Source File: LindenConfig.java    From linden with Apache License 2.0
public IndexWriterConfig createIndexWriterConfig() throws IOException {
  IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LATEST, getIndexAnalyzerInstance());
  indexWriterConfig.setRAMBufferSizeMB(48);

  MergePolicy mergePolicy = getPluginManager().getInstance(LindenConfigBuilder.MERGE_POLICY, MergePolicy.class);
  if (mergePolicy != null) {
    indexWriterConfig.setMergePolicy(mergePolicy);
  }
  LOGGER.info("Merge policy : {}", mergePolicy == null ? "Default" : mergePolicy);

  ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
  cms.setMaxMergesAndThreads(8, 1);
  indexWriterConfig.setMergeScheduler(cms);
  return indexWriterConfig;
}
 
Example 4
Source File: TestLucene80DocValuesFormat.java    From lucene-solr with Apache License 2.0
private IndexWriter createFastIndexWriter(Directory dir, int maxBufferedDocs) throws IOException {
  IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
  conf.setMaxBufferedDocs(maxBufferedDocs);
  conf.setRAMBufferSizeMB(-1); // == IndexWriterConfig.DISABLE_AUTO_FLUSH: flush by doc count only
  conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
  return new IndexWriter(dir, conf);
}
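
The -1 passed to setRAMBufferSizeMB() is Lucene's IndexWriterConfig.DISABLE_AUTO_FLUSH constant: it turns off flushing by memory usage, so this writer flushes only when maxBufferedDocs is reached. Using the named constant states that intent directly; a short sketch:

IndexWriterConfig conf = new IndexWriterConfig(new StandardAnalyzer());
conf.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH); // same as -1
conf.setMaxBufferedDocs(1000); // flushing is now driven purely by document count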
 
Example 5
Source File: TestLucene80DocValuesFormat.java    From lucene-solr with Apache License 2.0
private void doTestSparseNumericBlocksOfVariousBitsPerValue(double density) throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
  conf.setMaxBufferedDocs(atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE));
  conf.setRAMBufferSizeMB(-1); // disable flush-by-RAM; maxBufferedDocs drives flushing
  conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
  IndexWriter writer = new IndexWriter(dir, conf);
  Document doc = new Document();
  Field storedField = newStringField("stored", "", Field.Store.YES);
  Field dvField = new NumericDocValuesField("dv", 0);
  doc.add(storedField);
  doc.add(dvField);

  final int numDocs = atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE*3);
  final LongSupplier longs = blocksOfVariousBPV();
  for (int i = 0; i < numDocs; i++) {
    if (random().nextDouble() > density) {
      writer.addDocument(new Document());
      continue;
    }
    long value = longs.getAsLong();
    storedField.setStringValue(Long.toString(value));
    dvField.setLongValue(value);
    writer.addDocument(doc);
  }

  writer.forceMerge(1);

  writer.close();
  
  // compare
  assertDVIterate(dir);
  assertDVAdvance(dir, 1); // Tests all jump-lengths from 1 to maxDoc (quite slow ~= 1 minute for 200K docs)

  dir.close();
}
 
Example 6
Source File: SolrIndexConfig.java    From lucene-solr with Apache License 2.0
public IndexWriterConfig toIndexWriterConfig(SolrCore core) throws IOException {
  IndexSchema schema = core.getLatestSchema();
  IndexWriterConfig iwc = new IndexWriterConfig(new DelayedSchemaAnalyzer(core));
  if (maxBufferedDocs != -1)
    iwc.setMaxBufferedDocs(maxBufferedDocs);

  if (ramBufferSizeMB != -1)
    iwc.setRAMBufferSizeMB(ramBufferSizeMB);

  if (ramPerThreadHardLimitMB != -1) {
    iwc.setRAMPerThreadHardLimitMB(ramPerThreadHardLimitMB);
  }

  iwc.setSimilarity(schema.getSimilarity());
  MergePolicy mergePolicy = buildMergePolicy(core.getResourceLoader(), schema);
  iwc.setMergePolicy(mergePolicy);
  MergeScheduler mergeScheduler = buildMergeScheduler(core.getResourceLoader());
  iwc.setMergeScheduler(mergeScheduler);
  iwc.setInfoStream(infoStream);

  if (mergePolicy instanceof SortingMergePolicy) {
    Sort indexSort = ((SortingMergePolicy) mergePolicy).getSort();
    iwc.setIndexSort(indexSort);
  }

  iwc.setUseCompoundFile(useCompoundFile);

  if (mergedSegmentWarmerInfo != null) {
    // TODO: add infostream -> normal logging system (there is an issue somewhere)
    @SuppressWarnings({"rawtypes"})
    IndexReaderWarmer warmer = core.getResourceLoader().newInstance(mergedSegmentWarmerInfo.className,
                                                                      IndexReaderWarmer.class,
                                                                      null,
                                                                      new Class[] { InfoStream.class },
                                                                      new Object[] { iwc.getInfoStream() });
    iwc.setMergedSegmentWarmer(warmer);
  }

  return iwc;
}
 
Example 7
Source File: TextSearchQueriesTest.java    From incubator-pinot with Apache License 2.0
@Test
public void testMultiThreadedLuceneRealtime() throws Exception {
  File indexFile = new File(INDEX_DIR.getPath() + "/realtime-test3.index");
  Directory indexDirectory = FSDirectory.open(indexFile.toPath());
  StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
  // create and open a writer
  IndexWriterConfig indexWriterConfig = new IndexWriterConfig(standardAnalyzer);
  indexWriterConfig.setRAMBufferSizeMB(500);
  IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig);

  // create an NRT index reader
  SearcherManager searcherManager = new SearcherManager(indexWriter, false, false, null);

  // background thread to refresh NRT reader
  ControlledRealTimeReopenThread<IndexSearcher> controlledRealTimeReopenThread =
      new ControlledRealTimeReopenThread<>(indexWriter, searcherManager, 0.01, 0.01);
  controlledRealTimeReopenThread.start();

  // start writer and reader
  Thread writer = new Thread(new RealtimeWriter(indexWriter));
  Thread realtimeReader = new Thread(new RealtimeReader(searcherManager, standardAnalyzer));

  writer.start();
  realtimeReader.start();

  writer.join();
  realtimeReader.join();
  controlledRealTimeReopenThread.join();
}
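
The RealtimeWriter and RealtimeReader runnables are test-local classes not shown here, but the core read pattern against a SearcherManager is acquire/release. A minimal sketch of what a reader thread would do, with an illustrative query:

IndexSearcher searcher = searcherManager.acquire();
try {
    TopDocs hits = searcher.search(new MatchAllDocsQuery(), 10);
    // ... inspect hits.scoreDocs ...
} finally {
    searcherManager.release(searcher); // always release; never close an acquired searcher
}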
 
Example 8
Source File: InternalEngine.java    From crate with Apache License 2.0
private IndexWriterConfig getIndexWriterConfig() {
    final IndexWriterConfig iwc = new IndexWriterConfig(engineConfig.getAnalyzer());
    iwc.setCommitOnClose(false); // we by default don't commit on close
    iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
    iwc.setReaderAttributes(getReaderAttributes(store.directory()));
    iwc.setIndexDeletionPolicy(combinedDeletionPolicy);
    // with tests.verbose, lucene sets this up: plumb to align with filesystem stream
    boolean verbose = false;
    try {
        verbose = Boolean.parseBoolean(System.getProperty("tests.verbose"));
    } catch (Exception ignore) {
        // ignored
    }
    iwc.setInfoStream(verbose ? InfoStream.getDefault() : new LoggerInfoStream(logger));
    iwc.setMergeScheduler(mergeScheduler);
    // Give us the opportunity to upgrade old segments while performing
    // background merges
    MergePolicy mergePolicy = config().getMergePolicy();
    // always configure soft-deletes field so an engine with soft-deletes disabled can open a Lucene index with soft-deletes.
    iwc.setSoftDeletesField(Lucene.SOFT_DELETES_FIELD);
    if (softDeleteEnabled) {
        mergePolicy = new RecoverySourcePruneMergePolicy(SourceFieldMapper.RECOVERY_SOURCE_NAME, softDeletesPolicy::getRetentionQuery,
            new SoftDeletesRetentionMergePolicy(Lucene.SOFT_DELETES_FIELD, softDeletesPolicy::getRetentionQuery, mergePolicy));
    }
    iwc.setMergePolicy(new ElasticsearchMergePolicy(mergePolicy));
    iwc.setRAMBufferSizeMB(engineConfig.getIndexingBufferSize().getMbFrac());
    iwc.setCodec(engineConfig.getCodec());
    iwc.setUseCompoundFile(true); // always use compound on flush - reduces # of file-handles on refresh
    return iwc;
}
 
Example 9
Source File: DocumentIndexer.java    From act with GNU General Public License v3.0
public static void main(String[] args) throws Exception {
  System.out.println("Starting up...");
  System.out.flush();
  Options opts = new Options();
  opts.addOption(Option.builder("i").
      longOpt("input").hasArg().required().desc("Input file or directory to index").build());
  opts.addOption(Option.builder("x").
      longOpt("index").hasArg().required().desc("Path to index file to generate").build());
  opts.addOption(Option.builder("h").longOpt("help").desc("Print this help message and exit").build());
  opts.addOption(Option.builder("v").longOpt("verbose").desc("Print verbose log output").build());

  HelpFormatter helpFormatter = new HelpFormatter();
  CommandLineParser cmdLineParser = new DefaultParser();
  CommandLine cmdLine = null;
  try {
    cmdLine = cmdLineParser.parse(opts, args);
  } catch (ParseException e) {
    System.out.println("Caught exception when parsing command line: " + e.getMessage());
    helpFormatter.printHelp("DocumentIndexer", opts);
    System.exit(1);
  }

  if (cmdLine.hasOption("help")) {
    helpFormatter.printHelp("DocumentIndexer", opts);
    System.exit(0);
  }

  if (cmdLine.hasOption("verbose")) {
    // With help from http://stackoverflow.com/questions/23434252/programmatically-change-log-level-in-log4j2
    LoggerContext ctx = (LoggerContext) LogManager.getContext(false);
    Configuration ctxConfig = ctx.getConfiguration();
    LoggerConfig logConfig = ctxConfig.getLoggerConfig(LogManager.ROOT_LOGGER_NAME);
    logConfig.setLevel(Level.DEBUG);

    ctx.updateLoggers();
    LOGGER.debug("Verbose logging enabled");
  }

  LOGGER.info("Opening index at " + cmdLine.getOptionValue("index"));
  Directory indexDir = FSDirectory.open(new File(cmdLine.getOptionValue("index")).toPath());

  /* The standard analyzer is too aggressive with chemical entities (it strips structural annotations, for one
   * thing), and the whitespace analyzer doesn't do any case normalization or stop word elimination.  This custom
   * analyzer appears to treat chemical entities better than the standard analyzer without admitting too much
   * cruft to the index. */
  Analyzer analyzer = CustomAnalyzer.builder().
      withTokenizer("whitespace").
      addTokenFilter("lowercase").
      addTokenFilter("stop").
      build();

  IndexWriterConfig writerConfig = new IndexWriterConfig(analyzer);
  writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
  writerConfig.setRAMBufferSizeMB(1 << 10); // 1 << 10 == 1024 MB (1 GB)
  IndexWriter indexWriter = new IndexWriter(indexDir, writerConfig);

  String inputFileOrDir = cmdLine.getOptionValue("input");
  File splitFileOrDir = new File(inputFileOrDir);
  if (!(splitFileOrDir.exists())) {
    LOGGER.error("Unable to find directory at " + inputFileOrDir);
    System.exit(1);
  }

  DocumentIndexer indexer = new DocumentIndexer(indexWriter);
  PatentCorpusReader corpusReader = new PatentCorpusReader(indexer, splitFileOrDir);
  corpusReader.readPatentCorpus();
  indexer.commitAndClose();
}
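
The comment above motivates the custom analyzer; to see what it actually emits, run sample text through its token stream. A small sketch using Lucene's TokenStream API (the sample string is illustrative):

try (TokenStream ts = analyzer.tokenStream("body", "2,3-Dihydroxybutanedioic Acid and THE rest")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString()); // "2,3-dihydroxybutanedioic", "acid", "rest": stop words dropped
    }
    ts.end();
}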
 
Example 10
Source File: TestLucene80DocValuesFormat.java    From lucene-solr with Apache License 2.0
private void doTestSortedNumericBlocksOfVariousBitsPerValue(LongSupplier counts) throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
  conf.setMaxBufferedDocs(atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE));
  conf.setRAMBufferSizeMB(-1); // disable flush-by-RAM; maxBufferedDocs drives flushing
  conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
  IndexWriter writer = new IndexWriter(dir, conf);
  
  final int numDocs = atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE*3);
  final LongSupplier values = blocksOfVariousBPV();
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    
    int valueCount = (int) counts.getAsLong();
    long[] valueArray = new long[valueCount];
    for (int j = 0; j < valueCount; j++) {
      long value = values.getAsLong();
      valueArray[j] = value;
      doc.add(new SortedNumericDocValuesField("dv", value));
    }
    Arrays.sort(valueArray);
    for (int j = 0; j < valueCount; j++) {
      doc.add(new StoredField("stored", Long.toString(valueArray[j])));
    }
    writer.addDocument(doc);
    if (random().nextInt(31) == 0) {
      writer.commit();
    }
  }
  writer.forceMerge(1);

  writer.close();
  
  // compare
  DirectoryReader ir = DirectoryReader.open(dir);
  TestUtil.checkReader(ir);
  for (LeafReaderContext context : ir.leaves()) {
    LeafReader r = context.reader();
    SortedNumericDocValues docValues = DocValues.getSortedNumeric(r, "dv");
    for (int i = 0; i < r.maxDoc(); i++) {
      if (i > docValues.docID()) {
        docValues.nextDoc();
      }
      String[] expected = r.document(i).getValues("stored");
      if (i < docValues.docID()) {
        assertEquals(0, expected.length);
      } else {
        String[] actual = new String[docValues.docValueCount()];
        for (int j = 0; j < actual.length; j++) {
          actual[j] = Long.toString(docValues.nextValue());
        }
        assertArrayEquals(expected, actual);
      }
    }
  }
  ir.close();
  dir.close();
}
 
Example 11
Source File: LuceneTextIndexCreator.java    From incubator-pinot with Apache License 2.0
/**
 * Called by {@link org.apache.pinot.core.segment.creator.impl.SegmentColumnarIndexCreator}
 * when building an offline segment. Similar to how it creates per column
 * dictionary, forward and inverted index, a text index is also created
 * if text search is enabled on a column.
 * @param column column name
 * @param segmentIndexDir segment index directory
 * @param commit true if the index should be committed (at the end after all documents have
 *               been added), false if index should not be committed
 * Note on commit:
 *               Once {@link org.apache.pinot.core.segment.creator.impl.SegmentColumnarIndexCreator}
 *               finishes indexing all documents/rows for the segment, we need to commit and close
 *               the Lucene index which will internally persist the index on disk, do the necessary
 *               resource cleanup etc. We commit during {@link InvertedIndexCreator#seal()}
 *               and close during {@link InvertedIndexCreator#close()}.
 *               This Lucene index writer is used by both offline and realtime code paths (both
 *               during in-memory MutableSegment indexing and later during conversion to offline).
 *               Since realtime segment conversion is again going to go through the offline
 *               indexing path and will do everything (indexing, commit, close etc), there is
 *               no need to commit the index from the realtime side. So when the realtime segment
 *               is destroyed (which is after the realtime segment has been committed and converted
 *               to offline), we close this lucene index writer to release resources but don't commit.
 *               This is the reason to have commit flag part of the constructor.
 */
public LuceneTextIndexCreator(String column, File segmentIndexDir, boolean commit) {
  _textColumn = column;
  try {
    // segment generation is always in V1 and later we convert (as part of post creation processing)
    // to V3 if segmentVersion is set to V3 in SegmentGeneratorConfig.
    File indexFile = getV1TextIndexFile(segmentIndexDir);
    _indexDirectory = FSDirectory.open(indexFile.toPath());
    StandardAnalyzer standardAnalyzer = new StandardAnalyzer(ENGLISH_STOP_WORDS_SET);
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(standardAnalyzer);
    indexWriterConfig.setRAMBufferSizeMB(LUCENE_INDEX_MAX_BUFFER_SIZE_MB);
    indexWriterConfig.setCommitOnClose(commit);
    _indexWriter = new IndexWriter(_indexDirectory, indexWriterConfig);
  } catch (Exception e) {
    LOGGER.error("Failed to instantiate Lucene text index creator for column {}, exception {}", column, e.getMessage());
    throw new RuntimeException(e);
  }
}
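
The commit flag in this constructor works through Lucene's setCommitOnClose call above: with commit-on-close disabled, closing the writer releases resources without publishing changes made since the last commit. A hedged sketch of the two lifecycles in plain Lucene terms, with an illustrative path:

IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());
cfg.setCommitOnClose(false);
try (Directory dir = FSDirectory.open(Paths.get("/tmp/text-index"));
     IndexWriter writer = new IndexWriter(dir, cfg)) {
    writer.addDocument(new Document());
    writer.commit(); // offline path: persist explicitly, as InvertedIndexCreator#seal() does
}   // realtime path would skip commit(); close() then leaves the changes unpublished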