Java Code Examples for org.apache.lucene.document.Document#removeFields()

The following examples show how to use org.apache.lucene.document.Document#removeFields(). The source project, file, and license are noted above each example.
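Document#removeFields(String) removes every field with the given name from an in-memory Document; it does not modify documents already written to the index. The pattern shared by all the examples below is therefore: clear a (possibly multi-valued) field, re-add its current values, and hand the document back to the IndexWriter. A minimal sketch of that pattern, assuming the Lucene 4+ field API and an invented "id"/"tag" schema:

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;

public class RemoveFieldsSketch {
    /** Replaces all values of the multi-valued "tag" field and re-indexes the document. */
    static void retag(IndexWriter writer, Document doc, String id, String... newTags) throws IOException {
        doc.removeFields("tag");                        // drops every field named "tag"
        for (String tag : newTags) {
            doc.add(new StringField("tag", tag, Store.YES));
        }
        writer.updateDocument(new Term("id", id), doc); // delete-and-re-add keyed on the "id" term
    }
}

Note that the singular Document#removeField(String) removes only the first field with the given name, while removeFields(String) removes them all.
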
Example 1
Source File: LuceneMessageSearchIndex.java    From james-project with Apache License 2.0
private void update(MailboxId mailboxId, MessageUid uid, Flags f) throws IOException {
    try (IndexSearcher searcher = new IndexSearcher(IndexReader.open(writer, true))) {
        BooleanQuery query = new BooleanQuery();
        query.add(new TermQuery(new Term(MAILBOX_ID_FIELD, mailboxId.serialize())), BooleanClause.Occur.MUST);
        query.add(createQuery(MessageRange.one(uid)), BooleanClause.Occur.MUST);
        query.add(new PrefixQuery(new Term(FLAGS_FIELD, "")), BooleanClause.Occur.MUST);

        TopDocs docs = searcher.search(query, 100000);
        ScoreDoc[] sDocs = docs.scoreDocs;
        for (ScoreDoc sDoc : sDocs) {
            Document doc = searcher.doc(sDoc.doc);

            // rewrite the flags: drop every existing FLAGS_FIELD value, then index the new ones
            doc.removeFields(FLAGS_FIELD);
            indexFlags(doc, f);

            writer.updateDocument(new Term(ID_FIELD, doc.get(ID_FIELD)), doc);
        }
    }
}
 
Example 2
Source File: TermRecognitionRequestHandler.java    From jate with GNU Lesser General Public License v3.0
private void iterateAddDomainTermFields(boolean isBoosted, String domainTermsFieldName,
                                        IndexSchema indexSchema, Document doc,
                                        List<Pair<String, Double>> filteredCandidateTerms) {
    // remove previously added domain-term fields, if any exist
    doc.removeFields(domainTermsFieldName);

    for (Pair<String, Double> filteredTerm : filteredCandidateTerms) {
        if (filteredTerm == null) {
            continue;
        }

        if (isBoosted) {
            // apply the term's score as an index-time field boost
            // (SchemaField#createField(Object, float) in pre-7.0 Solr)
            doc.add(indexSchema.getField(domainTermsFieldName)
                    .createField(filteredTerm.first(), filteredTerm.second().floatValue()));
        } else {
            doc.add(indexSchema.getField(domainTermsFieldName)
                    .createField(filteredTerm.first()));
        }
    }
}
 
Example 3
Source File: TestDuelingCodecs.java    From lucene-solr with Apache License 2.0
/**
 * Populates a writer with random stuff. This must be fully reproducible with the seed!
 */
public static void createRandomIndex(int numdocs, RandomIndexWriter writer, long seed) throws IOException {
  Random random = new Random(seed);
  // primary source for our data is from linefiledocs, it's realistic.
  LineFileDocs lineFileDocs = new LineFileDocs(random);

  // TODO: we should add other fields that use things like docs&freqs but omit positions,
  // because linefiledocs doesn't cover all the possibilities.
  for (int i = 0; i < numdocs; i++) {
    Document document = lineFileDocs.nextDoc();
    // grab the title and add some SortedSet instances for fun
    String title = document.get("titleTokenized");
    String[] split = title.split("\\s+");
    document.removeFields("sortedset");
    for (String trash : split) {
      document.add(new SortedSetDocValuesField("sortedset", new BytesRef(trash)));
    }
    // add a numeric dv field sometimes
    document.removeFields("sparsenumeric");
    if (random.nextInt(4) == 2) {
      document.add(new NumericDocValuesField("sparsenumeric", random.nextInt()));
    }
    // add sortednumeric sometimes
    document.removeFields("sparsesortednum");
    if (random.nextInt(5) == 1) {
      document.add(new SortedNumericDocValuesField("sparsesortednum", random.nextLong()));
      if (random.nextBoolean()) {
        document.add(new SortedNumericDocValuesField("sparsesortednum", random.nextLong()));
      }
    }
    writer.addDocument(document);
  }
  
  lineFileDocs.close();
}
 
Example 4
Source File: AnchorIndexer.java    From tagme with Apache License 2.0
	@Override
	public void makeIndex(String lang, File workingDir) throws IOException
	{
		log.info("Loading support datasets...");
		
		File all_anchors = new WikipediaAnchorParser(lang).getFile();
		long numAnchors = ExternalSortUtils.wcl(all_anchors);
		AnchorIterator iterator = new AnchorIterator(all_anchors);
		
		IntSet people = new PeopleWIDs(lang).getDataset();
		
//		IndexSearcher articles = Indexes.getSearcher(RepositoryDirs.WIKIPEDIA.getPath(lang));
		IndexSearcher articles = openWikipediaIndex(lang);
		//QueryParser queryParser = new QueryParser(Version.LUCENE_34, WikipediaIndexer.FIELD_BODY, new WhitespaceAnalyzer(Version.LUCENE_34));
		QueryParser queryParser = new QueryParser(Version.LUCENE_34, WikipediaIndexer.FIELD_BODY, new StandardAnalyzer(Version.LUCENE_34, new HashSet<String>()));
		
		IndexWriter index = new IndexWriter(FSDirectory.open(workingDir.getAbsoluteFile()), new IndexWriterConfig(Version.LUCENE_34, new KeywordAnalyzer()));
		Document doc = new Document();
		Field fId = new Field(FIELD_ID, "", Store.YES, Index.NOT_ANALYZED);
		Field fText = new Field(FIELD_TEXT, "", Store.YES, Index.NOT_ANALYZED);
		Field fObject = new Field(FIELD_OBJECT, "", Store.YES, Index.NO);
		
		doc.add(fId);
		doc.add(fText);
		doc.add(fObject);
		
//		Field fOriginal = new Field(FIELD_ORIGINAL, "", Store.YES, Index.ANALYZED);
//		Field fWID = new Field(FIELD_WID, "", Store.NO, Index.ANALYZED);
		
		PLogger plog = new PLogger(log, Step.TEN_MINUTES, "lines", "anchors", "searches", "indexed", "0-freq","dropped");
		plog.setEnd(0, numAnchors);
		plog.start("Support datasets loaded, now parsing...");
		int id=0;
		while(iterator.next())
		{
			plog.update(0, iterator.scroll);
			plog.update(1);
			String anchorText = iterator.anchor;
			
			int freq = freq(iterator.originals, articles, queryParser);
			plog.update(2, iterator.originals.size());
			if (freq == 0) plog.update(4);
			
			Anchor anchorObj = Anchor.build(id, iterator.links, freq, people);
			if (anchorObj == null){
				plog.update(5);
				continue;
			}
			
			String anchorSerial = Anchor.serialize(anchorObj);
			fId.setValue(Integer.toString(++id));
			fText.setValue(anchorText);
			fObject.setValue(anchorSerial);
			
			for(int page : anchorObj){
				Field fWID = new Field(FIELD_WID, Integer.toString(page), Store.YES, Index.NOT_ANALYZED);
//				fWID.setBoost(iterator.links.get(page));
				doc.add(fWID);
			}
			for(String original : iterator.originals) {
				doc.add(new Field(FIELD_ORIGINAL, original, Store.YES, Index.NOT_ANALYZED));
			}
			
			index.addDocument(doc);
			plog.update(3);
			
			// the Document instance is reused for every anchor: clear the multi-valued
			// FIELD_ORIGINAL and FIELD_WID values before the next iteration
			doc.removeFields(FIELD_ORIGINAL);
			doc.removeFields(FIELD_WID);
		}
		plog.stop();
		iterator.close();
		
		log.info("Now optimizing...");
		index.optimize();
		
		index.close();
		log.info("Done.");
	}
 
Example 5
Source File: TopicIndexer.java    From tagme with Apache License 2.0
	@Override
	public void makeIndex(String lang, File workingDir) throws IOException
	{
		
		IndexReader articles = Indexes.getReader(RepositoryDirs.WIKIPEDIA.getPath(lang));
		Int2ObjectMap<String> bestAnchorMap = new BestAnchors(lang).getDataset();
		
		IndexWriter index = new IndexWriter(new SimpleFSDirectory(workingDir), new IndexWriterConfig(Version.LUCENE_34, new KeywordAnalyzer()));
		Document doc = new Document();
		Field fWID = new Field(FIELD_WID, "", Store.YES, Index.NOT_ANALYZED);
		Field fTitle = new Field(FIELD_TITLE, "", Store.YES, Index.NOT_ANALYZED);
		Field fAbstract = new Field(FIELD_ABSTRACT, "", Store.YES, Index.NO);
		Field fBestAnchor = new Field(FIELD_BEST_ANCHOR, "", Store.YES, Index.NO);
		doc.add(fWID);
		doc.add(fTitle);
		doc.add(fAbstract);
		doc.add(fBestAnchor);
				
		
		int max = articles.maxDoc();
		PLogger plog = new PLogger(log, Step.TEN_MINUTES, "pages", "indexed", "noBest");
		plog.setEnd(max);
		plog.start("Start indexing...");
		
		for(int i=0; i<max; i++)
		{
			plog.update(0);
			Document oldDoc = articles.document(i);
			PageType type = PageType.valueOf(oldDoc.get(WikipediaIndexer.FIELD_TYPE));
			if (type == PageType.TOPIC)
			{
				int wid = Integer.parseInt(oldDoc.get(WikipediaIndexer.FIELD_WID));
				fWID.setValue(oldDoc.get(WikipediaIndexer.FIELD_WID));
				fAbstract.setValue(oldDoc.get(WikipediaIndexer.FIELD_ABSTRACT));
				fTitle.setValue(oldDoc.get(WikipediaIndexer.FIELD_TITLE));
				
				String bestAnchor = bestAnchorMap.get(wid);
				if (bestAnchor == null || bestAnchor.length() == 0) plog.update(2);
				fBestAnchor.setValue(bestAnchor==null?"":bestAnchor);
				
				String[] cats = oldDoc.getValues(WikipediaIndexer.FIELD_CAT);
				if (cats != null) {
					for (int j=0; j<cats.length; j++)
						doc.add(new Field(FIELD_CAT, cats[j], Store.YES, Index.NOT_ANALYZED));
				}
				
				index.addDocument(doc);
				plog.update(1);
				
				// doc is reused across pages: clear the multi-valued category field
				doc.removeFields(FIELD_CAT);
			}
		}
		
		plog.stop();
		
		log.info("Now optimizing...");
		index.optimize();
		
		index.close();
		
		//we cannot call this because the index is still in the temporary dir
		//so TopicDocs will be created using old index
//		log.info("Index Done, now creating WID->DOC_ID map");
//		
//		TopicDocs td = new TopicDocs(lang);
//		td.forceParsing();
		
		log.info("Done.");
	}