Java Code Examples for org.apache.lucene.index.IndexWriter#setMaxBufferedDocs()

The following examples show how to use org.apache.lucene.index.IndexWriter#setMaxBufferedDocs(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: TestMixedDirectory.java, from project RDFS (Apache License 2.0) - 5 votes
/**
 * Opens an {@link IndexWriter} on {@code dir} and adds {@code numDocs}
 * documents to it through {@code addDoc}.
 *
 * <p>{@code addDoc} is called with the values {@code base},
 * {@code base + 1}, ..., {@code base + numDocs - 1}. The writer's buffered
 * document limit is taken from the {@code maxBufferedDocs} field and its merge
 * factor is set to 1000.
 *
 * @param dir     directory holding the index to update
 * @param base    value passed to {@code addDoc} for the first document
 * @param numDocs number of documents to add
 * @param policy  deletion policy handed to the writer
 * @throws IOException if the writer cannot be opened, written to or closed
 */
public void updateIndex(Directory dir, int base, int numDocs,
    IndexDeletionPolicy policy) throws IOException {
  IndexWriter writer =
      new IndexWriter(dir, false, new StandardAnalyzer(), policy);
  try {
    writer.setMaxBufferedDocs(maxBufferedDocs);
    writer.setMergeFactor(1000);
    for (int i = 0; i < numDocs; i++) {
      addDoc(writer, base + i);
    }
  } finally {
    // Close even when adding a document fails, so the writer is never
    // left open on the directory.
    writer.close();
  }
}
 
Example 2
Source File: TestMixedDirectory.java, from project hadoop-gpu (Apache License 2.0) - 5 votes
/**
 * Opens an {@link IndexWriter} on {@code dir} and adds {@code numDocs}
 * documents to it through {@code addDoc}.
 *
 * <p>{@code addDoc} is called with the values {@code base},
 * {@code base + 1}, ..., {@code base + numDocs - 1}. The writer's buffered
 * document limit is taken from the {@code maxBufferedDocs} field and its merge
 * factor is set to 1000.
 *
 * @param dir     directory holding the index to update
 * @param base    value passed to {@code addDoc} for the first document
 * @param numDocs number of documents to add
 * @param policy  deletion policy handed to the writer
 * @throws IOException if the writer cannot be opened, written to or closed
 */
public void updateIndex(Directory dir, int base, int numDocs,
    IndexDeletionPolicy policy) throws IOException {
  IndexWriter writer =
      new IndexWriter(dir, false, new StandardAnalyzer(), policy);
  try {
    writer.setMaxBufferedDocs(maxBufferedDocs);
    writer.setMergeFactor(1000);
    for (int i = 0; i < numDocs; i++) {
      addDoc(writer, base + i);
    }
  } finally {
    // Close even when adding a document fails, so the writer is never
    // left open on the directory.
    writer.close();
  }
}
 
Example 3
Source File: BuildIndexForEntityFragments.java, from project gAnswer (BSD 3-Clause "New" or "Revised" License) - 4 votes
/**
 * Builds the entity-fragment Lucene index from a tab-separated source file.
 *
 * <p>Each input line is expected to have exactly two tab-separated columns:
 * an integer entity id and the entity's fragment text. Lines with a different
 * column count, and ids missing from
 * {@code EntityFragmentFields.entityId2Name}, are skipped. For every remaining
 * line one document is added with three fields: {@code EntityName} (stored and
 * tokenized, underscores replaced by spaces), {@code EntityId} and
 * {@code EntityFragment} (both stored, not indexed).
 *
 * <p>The index and source locations are hard-coded below. Progress is printed
 * every 100000 lines read, and a summary line is printed at the end.
 *
 * @throws Exception if loading the entity names, reading the source file,
 *                   parsing an entity id or writing the index fails
 */
public void indexforentity() throws Exception
{
	if(EntityFragmentFields.entityId2Name == null)
		EntityFragmentFields.load();
	
	long startTime = System.currentTimeMillis();
	
	//Try update KB index to DBpedia2015. by husen 2016-04-08
	//Try update KB index to DBpedia2016. by husen 2018-8-22
	File indexDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/entity_fragment_index");
	File sourceDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt");
	
	Analyzer luceneAnalyzer_en = new StandardAnalyzer();
	IndexWriter indexWriter_en = new IndexWriter(indexDir_en, luceneAnalyzer_en,true);
	
	int count = 0;   // number of lines read, including skipped ones
	try
	{
		int mergeFactor = 100000;
		int maxBufferedDoc = 1000;
		int maxMergeDoc = Integer.MAX_VALUE;  //INF
		
		indexWriter_en.setMergeFactor(mergeFactor);
		indexWriter_en.setMaxBufferedDocs(maxBufferedDoc);
		indexWriter_en.setMaxMergeDocs(maxMergeDoc);
		
		BufferedReader br = new BufferedReader(
				new InputStreamReader(new FileInputStream(sourceDir_en), "UTF-8"));
		try
		{
			String line;
			while((line = br.readLine()) != null)
			{
				count++;
				if(count % 100000 == 0)
					System.out.println(count);
				
				String temp[] = line.split("\t");
				if(temp.length != 2)
					continue;
				
				int entity_id = Integer.parseInt(temp[0]);
				// Single lookup; an id without a name is skipped.
				String entity_name = EntityFragmentFields.entityId2Name.get(entity_id);
				if(entity_name == null)
					continue;
				
				String entity_fragment = temp[1];
				// Longest underscore runs are replaced first, so a run of
				// four or two collapses to a single space.
				entity_name = entity_name.replace("____", " ");
				entity_name = entity_name.replace("__", " ");
				entity_name = entity_name.replace("_", " ");
				
				Document document = new Document();
				
				Field EntityName = new Field("EntityName", entity_name, Field.Store.YES,
						Field.Index.TOKENIZED,
						Field.TermVector.WITH_POSITIONS_OFFSETS);
				Field EntityId = new Field("EntityId", String.valueOf(entity_id),
						Field.Store.YES, Field.Index.NO);
				Field EntityFragment = new Field("EntityFragment", entity_fragment,
						Field.Store.YES, Field.Index.NO);
				
				document.add(EntityName);
				document.add(EntityId);
				document.add(EntityFragment);
				indexWriter_en.addDocument(document);
			}
		}
		finally
		{
			// Release the source file even if a line fails to parse or index.
			br.close();
		}
		
		indexWriter_en.optimize();
	}
	finally
	{
		// Always close the writer so it is not left open after a failure.
		indexWriter_en.close();
	}

	// report the time taken to build the index
	long endTime = System.currentTimeMillis();
	System.out.println("entity_name index has build ->" + count + " " + "Time:" + (endTime - startTime));
}
 
Example 4
Source File: BuildIndexForTypeShortName.java, from project gAnswer (BSD 3-Clause "New" or "Revised" License) - 4 votes
/**
 * Builds the type-short-name Lucene index from the keys of
 * {@code typeShortName2IdList}.
 *
 * <p>For every non-null, non-empty key one document is added with two fields:
 * {@code SplittedTypeShortName} (stored and tokenized; the key split into
 * lower-case words, see {@link #splitTypeShortName(String)}) and
 * {@code TypeShortName} (the original key, stored but not indexed). The map's
 * values are not read. The index location is hard-coded below.
 *
 * @param typeShortName2IdList map whose keys are the type short names to index
 * @throws Exception if the index cannot be created, written or closed
 */
public static void buildIndex(HashMap<String, ArrayList<Integer>> typeShortName2IdList) throws Exception
{
	long startTime = System.currentTimeMillis();
	File indexDir_li = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/type_fragment_index");
	
	Analyzer luceneAnalyzer_li = new StandardAnalyzer();
	IndexWriter indexWriter_li = new IndexWriter(indexDir_li, luceneAnalyzer_li,true);
	
	int count = 0;   // number of short names indexed
	try
	{
		int mergeFactor = 100000;
		int maxBufferedDoc = 1000;
		int maxMergeDoc = Integer.MAX_VALUE;
		
		indexWriter_li.setMergeFactor(mergeFactor);
		indexWriter_li.setMaxBufferedDocs(maxBufferedDoc);
		indexWriter_li.setMaxMergeDocs(maxMergeDoc);
		
		for (String sn : typeShortName2IdList.keySet())
		{
			// HashMap permits a null key; treat it like an empty name.
			if (sn == null || sn.length() == 0) {
				continue;
			}
			
			count ++;
			
			String splittedSn = splitTypeShortName(sn);
			System.out.println("SplitttedType: "+splittedSn);
			
			Document document = new Document();
			
			Field SplittedTypeShortName = new Field("SplittedTypeShortName", splittedSn,
					Field.Store.YES,
					Field.Index.TOKENIZED,
					Field.TermVector.WITH_POSITIONS_OFFSETS);
			Field TypeShortName = new Field("TypeShortName", sn,
					Field.Store.YES, Field.Index.NO);
			
			document.add(SplittedTypeShortName);
			document.add(TypeShortName);
			indexWriter_li.addDocument(document);
		}
		
		indexWriter_li.optimize();
	}
	finally
	{
		// Always close the writer so it is not left open after a failure.
		indexWriter_li.close();
	}

	// report the time taken to build the index
	long endTime = System.currentTimeMillis();
	System.out.println("TypeShortName index has build ->" + count + " " + "Time:" + (endTime - startTime));
}

/**
 * Splits a type short name into lower-case, space-separated words.
 *
 * <p>If the name contains an underscore, every underscore becomes a space and
 * the result is lower-cased. Otherwise a space is inserted before every
 * character that is not in {@code 'a'..'z'}, the result is lower-cased and
 * leading spaces are removed.
 *
 * @param sn non-null short name
 * @return the split, lower-cased form; may be empty if {@code sn} consists
 *         only of spaces
 */
private static String splitTypeShortName(String sn)
{
	if(sn.contains("_"))
	{
		return sn.replace("_", " ").toLowerCase();
	}
	
	StringBuilder splittedSn = new StringBuilder();
	int last = 0;
	for(int i = 0; i < sn.length(); i ++)
	{
		char c = sn.charAt(i);
		// anything that is not a small letter starts a new word
		if(c < 'a' || c > 'z')
		{
			splittedSn.append(sn.substring(last, i).toLowerCase());
			splittedSn.append(' ');
			last = i;
		}
	}
	splittedSn.append(sn.substring(last).toLowerCase());
	
	// The length check prevents charAt(0) from throwing once every
	// character has been removed (e.g. a name made only of spaces).
	while(splittedSn.length() > 0 && splittedSn.charAt(0) == ' ') {
		splittedSn.deleteCharAt(0);
	}
	return splittedSn.toString();
}