Java Code Examples for org.apache.lucene.index.IndexWriter#setMergeFactor()

The following examples show how to use org.apache.lucene.index.IndexWriter#setMergeFactor() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: IndexWriterWorker.java    From olat with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a worker that writes its own partial index into a unique
 * sub-directory ("part" + id) of the given temporary index directory.
 *
 * @param id
 *            Unique index ID. Is used to generate unique directory name.
 * @param tempIndexDir
 *            Absolute directory-path where the temporary index can be generated.
 * @param fullIndexer
 *            Reference to full-index
 */
public IndexWriterWorker(final int id, final File tempIndexDir, final OlatFullIndexer fullIndexer) {
    this.id = id;
    this.indexPartDir = new File(tempIndexDir, "part" + id);
    this.fullIndexer = fullIndexer;
    try {
        final Directory luceneIndexPartDir = FSDirectory.open(indexPartDir);
        // create=true: each worker starts from an empty partial index
        indexWriter = new IndexWriter(luceneIndexPartDir, new StandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.UNLIMITED);
        indexWriter.setMergeFactor(fullIndexer.getSearchModuleConfig().getIndexerWriterMergeFactor());
        log.info("IndexWriter config MergeFactor=" + indexWriter.getMergeFactor());
        indexWriter.setRAMBufferSizeMB(fullIndexer.getSearchModuleConfig().getIndexerWriterRamBuffer());
        log.info("IndexWriter config RAMBufferSizeMB=" + indexWriter.getRAMBufferSizeMB());
        indexWriter.setUseCompoundFile(false); // for better indexing performance
    } catch (final IOException e) {
        // Log with the cause instead of swallowing it (indexWriter stays null).
        log.warn("Can not create IndexWriter", e);
    }
}
 
Example 2
Source File: TestMixedDirectory.java    From RDFS with Apache License 2.0 5 votes vote down vote up
/**
 * Adds {@code numDocs} documents, numbered {@code base} .. {@code base + numDocs - 1},
 * to the index in {@code dir} using the supplied deletion policy.
 */
public void updateIndex(Directory dir, int base, int numDocs,
    IndexDeletionPolicy policy) throws IOException {
  final IndexWriter writer =
      new IndexWriter(dir, false, new StandardAnalyzer(), policy);
  writer.setMaxBufferedDocs(maxBufferedDocs);
  writer.setMergeFactor(1000);
  int docId = base;
  final int end = base + numDocs;
  while (docId < end) {
    addDoc(writer, docId);
    docId++;
  }
  writer.close();
}
 
Example 3
Source File: TestMixedDirectory.java    From hadoop-gpu with Apache License 2.0 5 votes vote down vote up
/**
 * Appends {@code numDocs} documents to the index in {@code dir}, numbering
 * them consecutively starting at {@code base}.
 */
public void updateIndex(Directory dir, int base, int numDocs,
    IndexDeletionPolicy policy) throws IOException {
  IndexWriter writer = new IndexWriter(dir, false, new StandardAnalyzer(), policy);
  writer.setMaxBufferedDocs(maxBufferedDocs);
  writer.setMergeFactor(1000);
  for (int offset = 0; offset < numDocs; offset++) {
    addDoc(writer, base + offset);
  }
  writer.close();
}
 
Example 4
Source File: BuildIndexForEntityFragments.java    From gAnswer with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Builds the Lucene index over entity fragments.
 * Reads "&lt;entityId&gt;\t&lt;fragment&gt;" lines from the source file, resolves each id
 * to an entity name via EntityFragmentFields, and writes one document per entity
 * with a tokenized name plus stored (unindexed) id and fragment.
 *
 * @throws Exception if the id-to-name map cannot be loaded or indexing fails
 */
public void indexforentity() throws Exception
{
	if(EntityFragmentFields.entityId2Name == null)
		EntityFragmentFields.load();
	
	long startTime = new Date().getTime();
	
	//Try update KB index to DBpedia2015. by husen 2016-04-08
	//Try update KB index to DBpedia2016. by husen 2018-8-22
	File indexDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/entity_fragment_index");
	File sourceDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt");
	
	Analyzer luceneAnalyzer_en = new StandardAnalyzer();
	// create=true: rebuild the index from scratch
	IndexWriter indexWriter_en = new IndexWriter(indexDir_en, luceneAnalyzer_en, true);
	
	int mergeFactor = 100000;    //default 10
	int maxBufferedDoc = 1000;   //default 10
	int maxMergeDoc = Integer.MAX_VALUE;  //INF
	
	indexWriter_en.setMergeFactor(mergeFactor);
	indexWriter_en.setMaxBufferedDocs(maxBufferedDoc);
	indexWriter_en.setMaxMergeDocs(maxMergeDoc);
	
	int count = 0;
	BufferedReader br = new BufferedReader(
			new InputStreamReader(new FileInputStream(sourceDir_en), "UTF-8"));
	try
	{
		String line;
		while((line = br.readLine()) != null)
		{
			count++;
			if(count % 100000 == 0)
				System.out.println(count);
			
			// Expect exactly "<entityId>\t<fragment>"; skip malformed lines.
			String temp[] = line.split("\t");
			if(temp.length != 2)
				continue;
			
			int entity_id = Integer.parseInt(temp[0]);
			if(!EntityFragmentFields.entityId2Name.containsKey(entity_id))
				continue;
			
			String entity_name = EntityFragmentFields.entityId2Name.get(entity_id);
			String entity_fragment = temp[1];
			// Underscore runs encode spaces in entity names; replace longest first.
			entity_name = entity_name.replace("____", " ");
			entity_name = entity_name.replace("__", " ");
			entity_name = entity_name.replace("_", " ");
			
			Document document = new Document();
			
			// Only the name is tokenized/searchable; id and fragment are stored payload.
			Field EntityName = new Field("EntityName", entity_name, Field.Store.YES,
					Field.Index.TOKENIZED,
					Field.TermVector.WITH_POSITIONS_OFFSETS);
			Field EntityId = new Field("EntityId", String.valueOf(entity_id),
					Field.Store.YES, Field.Index.NO);
			Field EntityFragment = new Field("EntityFragment", entity_fragment,
					Field.Store.YES, Field.Index.NO);
			
			document.add(EntityName);
			document.add(EntityId);
			document.add(EntityFragment);
			indexWriter_en.addDocument(document);
		}
		
		indexWriter_en.optimize();
	}
	finally
	{
		// Close resources even when indexing fails (previously leaked on error).
		indexWriter_en.close();
		br.close();
	}

	// report the time of the index build
	long endTime = new Date().getTime();
	System.out.println("entity_name index has build ->" + count + " " + "Time:" + (endTime - startTime));
}
 
Example 5
Source File: BuildIndexForTypeShortName.java    From gAnswer with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Builds the Lucene index over type short names. For each type name, a
 * word-split, lower-cased form is indexed as the tokenized field
 * "SplittedTypeShortName", and the original name is stored unanalyzed as
 * "TypeShortName".
 *
 * @param typeShortName2IdList map from type short name to its id list
 *            (only the keys are indexed here)
 * @throws Exception if index creation or writing fails
 */
public static void buildIndex(HashMap<String, ArrayList<Integer>> typeShortName2IdList) throws Exception
{
	long startTime = new Date().getTime();
	File indexDir_li = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/type_fragment_index");
	
	Analyzer luceneAnalyzer_li = new StandardAnalyzer();
	// create=true: rebuild the index from scratch
	IndexWriter indexWriter_li = new IndexWriter(indexDir_li, luceneAnalyzer_li, true);
	
	int mergeFactor = 100000;    //default 10
	int maxBufferedDoc = 1000;   //default 10
	int maxMergeDoc = Integer.MAX_VALUE;  //INF
	
	indexWriter_li.setMergeFactor(mergeFactor);
	indexWriter_li.setMaxBufferedDocs(maxBufferedDoc);
	indexWriter_li.setMaxMergeDocs(maxMergeDoc);
	
	int count = 0;
	try
	{
		for (String sn : typeShortName2IdList.keySet())
		{
			if (sn.length() == 0) {
				continue;
			}
			
			count++;
			
			String splittedSn = splitShortName(sn);
			System.out.println("SplittedType: " + splittedSn); // fixed typo "SplitttedType"
			
			Document document = new Document();
			
			Field SplittedTypeShortName = new Field("SplittedTypeShortName", splittedSn,
					Field.Store.YES,
					Field.Index.TOKENIZED,
					Field.TermVector.WITH_POSITIONS_OFFSETS);
			Field TypeShortName = new Field("TypeShortName", sn,
					Field.Store.YES, Field.Index.NO);
			
			document.add(SplittedTypeShortName);
			document.add(TypeShortName);
			indexWriter_li.addDocument(document);
		}
		
		indexWriter_li.optimize();
	}
	finally
	{
		// Close the writer even when indexing fails (previously leaked on error).
		indexWriter_li.close();
	}

	// report the time of the index build
	long endTime = new Date().getTime();
	System.out.println("TypeShortName index has build ->" + count + " " + "Time:" + (endTime - startTime));
}

/**
 * Splits a type short name into lower-cased, space-separated words.
 * Underscore-separated names are split on "_"; otherwise every
 * non-lowercase character starts a new word (camelCase splitting).
 * The input must be non-empty.
 */
private static String splitShortName(String sn)
{
	StringBuilder splittedSn = new StringBuilder("");
	
	if(sn.contains("_"))
	{
		String nsn = sn.replace("_", " ");
		splittedSn.append(nsn.toLowerCase());
	}
	else
	{
		int last = 0, i = 0;
		for(i = 0; i < sn.length(); i ++)
		{
			// if it were not a small letter, then break it.
			if(!(sn.charAt(i)>='a' && sn.charAt(i)<='z'))
			{
				splittedSn.append(sn.substring(last, i).toLowerCase());
				splittedSn.append(' ');
				last = i;
			}
		}
		splittedSn.append(sn.substring(last, i).toLowerCase());
		// strip the leading space(s) produced when sn starts with a capital
		while(splittedSn.charAt(0) == ' ') {
			splittedSn.deleteCharAt(0);
		}
	}
	return splittedSn.toString();
}
 
Example 6
Source File: OlatFullIndexer.java    From olat with Apache License 2.0 4 votes vote down vote up
/**
 * Create index-writer object. In multi-threaded mode creates an array of index-workers. Start indexing with main-index as root object. Index recursive all elements.
 * At the end optimize and close new index. The new index is stored in [temporary-index-path]/main
 * 
 * @throws InterruptedException
 */
private void doIndex() throws InterruptedException {
    try {
        final File tempIndexDir = new File(tempIndexPath);
        final Directory indexPath = FSDirectory.open(new File(tempIndexDir, "main"));
        final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        indexWriter = new IndexWriter(indexPath, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
        indexWriter.deleteAll();
        indexWriter.setMergeFactor(INDEX_MERGE_FACTOR); // for better performance
        indexWriter.setRAMBufferSizeMB(ramBufferSizeMB);// for better performance set to 48MB (see lucene docu 'how to make indexing faster")
        log.info("IndexWriter config RAMBufferSizeMB=" + indexWriter.getRAMBufferSizeMB());
        indexWriter.setUseCompoundFile(useCompoundFile); // for better performance (see lucene docu 'how to make indexing faster")
        log.info("IndexWriter config UseCompoundFile=" + indexWriter.getUseCompoundFile());
        // Create IndexWriterWorker
        log.info("Running with " + numberIndexWriter + " IndexerWriterWorker");
        indexWriterWorkers = new IndexWriterWorker[numberIndexWriter];
        final Directory[] partIndexDirs = new Directory[numberIndexWriter];
        for (int i = 0; i < numberIndexWriter; i++) {
            final IndexWriterWorker indexWriterWorker = new IndexWriterWorker(i, tempIndexDir, this);
            indexWriterWorkers[i] = indexWriterWorker;
            indexWriterWorkers[i].start();
            partIndexDirs[i] = indexWriterWorkers[i].getIndexDir();
        }

        final SearchResourceContext searchResourceContext = new SearchResourceContext();
        log.info("doIndex start. OlatFullIndexer with Debug output");
        mainIndexer.doIndex(searchResourceContext, null /* no parent */, this);

        log.info("Wait until every folder indexer is finished");

        DBFactory.getInstance().commitAndCloseSession();
        // check if every folder indexer is finished max waiting-time 10Min (=waitingCount-limit = 60)
        int waitingCount = 0;
        final int MAX_WAITING_COUNT = 60;// = 10Min
        while (FolderIndexerWorkerPool.getInstance().isIndexerRunning() && (waitingCount++ < MAX_WAITING_COUNT)) {
            Thread.sleep(10000);
        }
        if (waitingCount >= MAX_WAITING_COUNT) {
            log.info("Finished with max waiting time!");
        }
        log.info("Set Finish-flag for each indexWriterWorkers");
        // Set Finish-flag
        for (int i = 0; i < numberIndexWriter; i++) {
            indexWriterWorkers[i].finishIndexing();
        }

        log.info("Wait until every indexworker is finished");
        // check if every indexworker is finished max waiting-time 10Min (=waitingCount-limit = 60)
        waitingCount = 0;
        while (!areIndexingDone() && (waitingCount++ < MAX_WAITING_COUNT)) {
            Thread.sleep(10000);
        }
        if (waitingCount >= MAX_WAITING_COUNT) {
            log.info("Finished with max waiting time!");
        }

        // Merge all partIndex
        DBFactory.getInstance().commitAndCloseSession();
        if (partIndexDirs.length > 0) {
            log.info("Start merging part Indexes");
            indexWriter.addIndexesNoOptimize(partIndexDirs);
            log.info("Added all part Indexes");
        }
        fullIndexerStatus.setIndexSize(indexWriter.maxDoc());
        indexWriter.optimize();
        indexWriter.close();
    } catch (final IOException e) {
        // log.warn already records the stack trace; no need for printStackTrace()
        log.warn("Can not create IndexWriter, indexname=" + tempIndexPath, e);
    } finally {
        DBFactory.getInstance().commitAndCloseSession();
        log.debug("doIndex: commit & close session");
    }
}