Java Code Examples for org.apache.lucene.index.IndexReader#document()

The following examples show how to use org.apache.lucene.index.IndexReader#document(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check the related API usage in the sidebar.
Example 1
Source File: LuceneTranslationMemory.java    From modernmt with Apache License 2.0 6 votes vote down vote up
/**
 * Feeds the entries found for one memory to {@code consumer}.
 *
 * <p>Searches with the term produced by {@code documentBuilder.makeMemoryTerm(memory)},
 * asking for as many hits as the reader has documents. Hits for which
 * {@code documentBuilder.getMemory(document)} is not greater than zero are skipped.
 * Does nothing when the reader reports no documents.
 *
 * @param memory   value passed to {@code documentBuilder.makeMemoryTerm}
 * @param consumer receives each converted entry
 * @throws IOException if searching or loading a stored document fails
 */
@Override
public void dump(long memory, Consumer<Entry> consumer) throws IOException {
    IndexSearcher searcher = getIndexSearcher();
    IndexReader reader = getIndexReader();

    int docCount = reader.numDocs();
    if (docCount == 0) {
        return;
    }

    TopDocs hits = searcher.search(new TermQuery(documentBuilder.makeMemoryTerm(memory)), docCount);
    ScoreDoc[] scoreDocs = hits.scoreDocs;

    for (int i = 0; i < scoreDocs.length; i++) {
        Document stored = reader.document(scoreDocs[i].doc);
        if (documentBuilder.getMemory(stored) > 0) {
            consumer.accept(documentBuilder.asEntry(stored));
        }
    }
}
 
Example 2
Source File: LuceneTranslationMemory.java    From modernmt with Apache License 2.0 6 votes vote down vote up
/**
 * Feeds the entries of the whole index to {@code consumer}.
 *
 * <p>Runs a {@link MatchAllDocsQuery} sized to the reader's document count and
 * converts each hit with {@code documentBuilder.asEntry}. Hits for which
 * {@code documentBuilder.getMemory(document)} is not greater than zero are skipped.
 * Does nothing when the reader reports no documents.
 *
 * @param consumer receives each converted entry
 * @throws IOException if searching or loading a stored document fails
 */
@Override
public void dumpAll(Consumer<Entry> consumer) throws IOException {
    IndexSearcher searcher = getIndexSearcher();
    IndexReader reader = getIndexReader();

    int docCount = reader.numDocs();
    if (docCount == 0) {
        return;
    }

    ScoreDoc[] scoreDocs = searcher.search(new MatchAllDocsQuery(), docCount).scoreDocs;

    for (int i = 0; i < scoreDocs.length; i++) {
        Document stored = reader.document(scoreDocs[i].doc);
        if (documentBuilder.getMemory(stored) > 0) {
            consumer.accept(documentBuilder.asEntry(stored));
        }
    }
}
 
Example 3
Source File: AbstractLuceneIndexerImpl.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Reports whether any document indexed under the term {@code ID = nodeRef}
 * carries an {@code ISCONTAINER} field.
 *
 * @param nodeRef value looked up in the {@code ID} field
 * @param reader  index reader used to enumerate and load matching documents
 * @return {@code true} as soon as one matching document has an {@code ISCONTAINER} field,
 *         {@code false} if none does
 * @throws LuceneIndexException if the index cannot be read
 */
protected boolean locateContainer(String nodeRef, IndexReader reader)
{
    boolean found = false;
    try
    {
        TermDocs td = reader.termDocs(new Term("ID", nodeRef));
        try
        {
            while (td.next())
            {
                int doc = td.doc();
                Document document = reader.document(doc);
                if (document.getField("ISCONTAINER") != null)
                {
                    found = true;
                    break;
                }
            }
        }
        finally
        {
            // Release the enumeration even when next()/document() throws.
            td.close();
        }
    }
    catch (IOException e)
    {
        // This method only reads; the message now describes the lookup rather than a delete.
        throw new LuceneIndexException("Failed to locate container for " + nodeRef, e);
    }
    return found;
}
 
Example 4
Source File: Catalog.java    From cxf with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the value of the source field of up to 1000 indexed documents as a JSON array.
 *
 * <p>A reader is opened on {@code directory} for this call and closed before returning.
 * Only {@code LuceneDocumentMetadata.SOURCE_FIELD} is loaded for each hit.
 *
 * @return JSON array with one string per hit
 * @throws IOException if the index cannot be opened, searched or read
 */
@GET
@Produces(MediaType.APPLICATION_JSON)
public JsonArray getBooks() throws IOException {
    final IndexReader reader = DirectoryReader.open(directory);
    final IndexSearcher searcher = new IndexSearcher(reader);
    final JsonArrayBuilder builder = Json.createArrayBuilder();

    try {
        final ScoreDoc[] hits = searcher.search(new MatchAllDocsQuery(), 1000).scoreDocs;

        for (int i = 0; i < hits.length; i++) {
            // A fresh visitor per hit, restricted to the source field.
            final DocumentStoredFieldVisitor sourceOnly =
                new DocumentStoredFieldVisitor(LuceneDocumentMetadata.SOURCE_FIELD);
            reader.document(hits[i].doc, sourceOnly);

            final String source = sourceOnly
                    .getDocument()
                    .getField(LuceneDocumentMetadata.SOURCE_FIELD)
                    .stringValue();
            builder.add(source);
        }

        return builder.build();
    } finally {
        reader.close();
    }
}
 
Example 5
Source File: TestBlockJoin.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Walks the control hits and checks that, parent by parent, the join results list
 * the same {@code childID} values in the same order.
 *
 * <p>Whenever the {@code parentID} of a control hit differs from the previous one, the
 * child hits of the previous parent must have been fully consumed, and the child hits
 * for the new parent are taken from {@code joinResults}.
 *
 * @param r           reader used to load the control documents
 * @param joinR       reader used to load the joined child documents
 * @param controlHits hits to compare against
 * @param joinResults child hits keyed by parent ID
 */
private void compareHits(IndexReader r, IndexReader joinR, TopDocs controlHits, Map<Integer, TopDocs> joinResults) throws Exception {
  int activeParent = -1;
  int nextChildSlot = 0;
  TopDocs activeChildHits = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[0]);

  final ScoreDoc[] expectedHits = controlHits.scoreDocs;
  for (int i = 0; i < expectedHits.length; i++) {
    final Document expectedDoc = r.document(expectedHits[i].doc);
    final int parentID = Integer.parseInt(expectedDoc.get("parentID"));

    if (parentID != activeParent) {
      // Every child hit of the previous parent must have been visited.
      assertEquals(nextChildSlot, activeChildHits.scoreDocs.length);
      activeParent = parentID;
      nextChildSlot = 0;
      activeChildHits = joinResults.get(parentID);
    }

    final String expectedChildID = expectedDoc.get("childID");
    final Document actualDoc = joinR.document(activeChildHits.scoreDocs[nextChildSlot++].doc);
    assertEquals(expectedChildID, actualDoc.get("childID"));
  }
}
 
Example 6
Source File: LuceneHelper.java    From dexter with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the stored document for a wiki id.
 *
 * <p>The wiki id is mapped to a Lucene doc id with {@code getLuceneId}. If loading the
 * document throws, the error is logged and the JVM is terminated via
 * {@code System.exit(-1)}.
 *
 * @param wikiId wiki identifier; values {@code <= 0} yield {@code null}
 * @return the stored document, or {@code null} when {@code wikiId <= 0} or
 *         {@code getLuceneId} returns a negative id
 */
private Document getDoc(int wikiId) {
	IndexReader reader = getReader();

	if (wikiId <= 0) {
		return null;
	}

	final int docId = getLuceneId(wikiId);
	if (docId < 0) {
		logger.warn("no id for wikiId {}", wikiId);
		return null;
	}

	logger.debug("get wikiId {}  ->  docId {}", wikiId, docId);
	try {
		return reader.document(docId);
	} catch (Exception e) {
		logger.error("retrieving doc in position {} {}", docId,
				e.toString());
		System.exit(-1);
	}

	// Only reached if System.exit returns control to the caller.
	return null;
}
 
Example 7
Source File: TestBlockJoin.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Asserts that two result sets agree hit by hit: same total hit count, same number of
 * score docs, same {@code childID} per position, and equal sort field values.
 *
 * <p>Scores are deliberately not compared; they are expected to differ.
 *
 * @param r           reader for {@code results}
 * @param joinR       reader for {@code joinResults}
 * @param results     reference hits; each must be a {@link FieldDoc}
 * @param joinResults hits under test; each must be a {@link FieldDoc}
 */
private void compareChildHits(IndexReader r, IndexReader joinR, TopDocs results, TopDocs joinResults) throws Exception {
  assertEquals(results.totalHits.value, joinResults.totalHits.value);
  assertEquals(results.scoreDocs.length, joinResults.scoreDocs.length);

  int hitCount = 0;
  while (hitCount < results.scoreDocs.length) {
    final ScoreDoc expected = results.scoreDocs[hitCount];
    final ScoreDoc actual = joinResults.scoreDocs[hitCount];

    final Document expectedDoc = r.document(expected.doc);
    final Document actualDoc = joinR.document(actual.doc);
    assertEquals("hit " + hitCount + " differs",
                 expectedDoc.get("childID"), actualDoc.get("childID"));

    // Sort values must match exactly.
    assertTrue(expected instanceof FieldDoc);
    assertTrue(actual instanceof FieldDoc);
    assertArrayEquals(((FieldDoc) expected).fields, ((FieldDoc) actual).fields);

    hitCount++;
  }
}
 
Example 8
Source File: SearchTravRetHighlightTask.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Highlights the {@code hlFields} of every hit, visiting hits in the order produced by
 * {@code docIdOrder}.
 *
 * <p>When {@code termVecs} is set, the token stream is obtained through
 * {@code TokenSources.getTokenStream} using the document's term vectors; otherwise the
 * stored text is re-analyzed with {@code analyzer}. The fragment count of the last
 * highlighted field is stored in {@code preventOptimizeAway}.
 *
 * @param searcher searcher whose reader supplies stored fields and term vectors
 * @param q        query used to score fragments
 * @param hits     hits to highlight
 */
@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
  IndexReader reader = searcher.getIndexReader();
  highlighter.setFragmentScorer(new QueryScorer(q));
  // No sentence-aware fragmenter is configured; the highlighter's default is used.
  for (ScoreDoc hit : docIdOrder(hits.scoreDocs)) {
    Document stored = reader.document(hit.doc, hlFields);
    Fields termVectors = null;
    if (termVecs) {
      termVectors = reader.getTermVectors(hit.doc);
    }
    for (IndexableField field : stored) {
      TokenStream tokens = termVecs
          ? TokenSources.getTokenStream(field.name(), termVectors,
              field.stringValue(), analyzer, maxDocCharsToAnalyze)
          : analyzer.tokenStream(field.name(), field.stringValue());
      // getBestFragments closes the TokenStream.
      String[] best = highlighter.getBestFragments(tokens, field.stringValue(), maxFrags);
      preventOptimizeAway = best.length;
    }
  }
}
 
Example 9
Source File: AbstractLuceneIndexerImpl.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Collects the IDs of all documents whose {@code PARENT} term matches one of the given
 * node refs, optionally deleting those documents from the index.
 *
 * <p>For each matching document the last value of its {@code ID} field is recorded.
 *
 * @param nodeRefs parent references to look up
 * @param reader   index reader used to enumerate, load and (optionally) delete documents
 * @param delete   when {@code true}, each matching document is deleted through {@code reader}
 * @return the collected IDs in encounter order
 * @throws LuceneIndexException if the index cannot be read or modified
 */
protected static Set<String> deleteReference(Collection<String> nodeRefs, IndexReader reader, boolean delete)
        throws LuceneIndexException
{

    Set<String> refs = new LinkedHashSet<String>();

    for (String nodeRef : nodeRefs)
    {

        try
        {
            TermDocs td = reader.termDocs(new Term("PARENT", nodeRef));
            try
            {
                while (td.next())
                {
                    int doc = td.doc();
                    Document document = reader.document(doc);
                    String[] ids = document.getValues("ID");
                    refs.add(ids[ids.length - 1]);
                    if (delete)
                    {
                        reader.deleteDocument(doc);
                    }
                }
            }
            finally
            {
                // Release the enumeration even when iteration or deletion fails.
                td.close();
            }
        }
        catch (IOException e)
        {
            throw new LuceneIndexException("Failed to delete node by parent for " + nodeRef, e);
        }
    }

    return refs;

}
 
Example 10
Source File: AbstractLuceneIndexerImpl.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Collects the given node ref and, when cascading, the IDs of all documents whose
 * {@code ANCESTOR} term matches it, optionally deleting those documents.
 *
 * <p>For each cascaded document the last value of its {@code ID} field is recorded.
 *
 * @param nodeRef reference of the container
 * @param reader  index reader used to enumerate, load and (optionally) delete documents
 * @param delete  when {@code true}, documents with {@code ID = nodeRef} and every
 *                cascaded document are deleted through {@code reader}
 * @param cascade when {@code true}, documents below the container are processed as well
 * @return {@code nodeRef} followed by the collected IDs in encounter order
 * @throws LuceneIndexException if the index cannot be read or modified
 */
protected static Set<String> deleteContainerAndBelow(String nodeRef, IndexReader reader, boolean delete,
        boolean cascade) throws LuceneIndexException
{
    Set<String> refs = new LinkedHashSet<String>();

    try
    {
        if (delete)
        {
            reader.deleteDocuments(new Term("ID", nodeRef));
        }
        refs.add(nodeRef);
        if (cascade)
        {
            TermDocs td = reader.termDocs(new Term("ANCESTOR", nodeRef));
            try
            {
                while (td.next())
                {
                    int doc = td.doc();
                    Document document = reader.document(doc);
                    String[] ids = document.getValues("ID");
                    refs.add(ids[ids.length - 1]);
                    if (delete)
                    {
                        reader.deleteDocument(doc);
                    }
                }
            }
            finally
            {
                // Release the enumeration even when iteration or deletion fails.
                td.close();
            }
        }
    }
    catch (IOException e)
    {
        throw new LuceneIndexException("Failed to delete container and below for " + nodeRef, e);
    }
    return refs;
}
 
Example 11
Source File: SearchTravRetLoadFieldSelectorTask.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Loads a document, restricted to {@code fieldsToLoad} when that set is configured.
 *
 * @param ir reader to load from
 * @param id document number
 * @return the full stored document when {@code fieldsToLoad} is {@code null}, otherwise
 *         the document built by a visitor limited to {@code fieldsToLoad}
 * @throws IOException if the stored fields cannot be read
 */
@Override
protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
  if (fieldsToLoad == null) {
    return ir.document(id);
  }
  final DocumentStoredFieldVisitor selective = new DocumentStoredFieldVisitor(fieldsToLoad);
  ir.document(id, selective);
  return selective.getDocument();
}
 
Example 12
Source File: LuceneIndex.java    From rdf4j with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Loads a document through a {@link DocumentStoredFieldVisitor} built from
 * {@code fieldsToLoad}.
 *
 * @param reader       reader to load from
 * @param docId        document number
 * @param fieldsToLoad field names handed to the visitor
 * @return the document assembled by the visitor
 * @throws IOException if the stored fields cannot be read
 */
private static Document readDocument(IndexReader reader, int docId, Set<String> fieldsToLoad) throws IOException {
	final DocumentStoredFieldVisitor fieldCollector = new DocumentStoredFieldVisitor(fieldsToLoad);
	reader.document(docId, fieldCollector);
	final Document loaded = fieldCollector.getDocument();
	return loaded;
}
 
Example 13
Source File: TestBinaryDocument.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Indexes one document holding a binary stored field and a string stored field built
 * from the same value, reads it back, and checks both fields round-trip unchanged.
 */
public void testBinaryFieldInIndex()
  throws Exception
{
  FieldType ft = new FieldType();
  ft.setStored(true);
  StoredField binaryFldStored = new StoredField("binaryStored", binaryValStored.getBytes(StandardCharsets.UTF_8));
  Field stringFldStored = new Field("stringStored", binaryValStored, ft);

  Document doc = new Document();
  
  doc.add(binaryFldStored);
  
  doc.add(stringFldStored);

  // test for field count
  assertEquals(2, doc.getFields().size());
  
  // add the doc to an index
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  writer.addDocument(doc);
  
  // open a reader and fetch the document
  IndexReader reader = writer.getReader();
  Document docFromReader = reader.document(0);
  assertNotNull(docFromReader);
  
  // fetch the binary stored field and compare its content with the original one
  BytesRef bytes = docFromReader.getBinaryValue("binaryStored");
  assertNotNull(bytes);
  String binaryFldStoredTest = new String(bytes.bytes, bytes.offset, bytes.length, StandardCharsets.UTF_8);
  // assertEquals reports expected vs. actual on failure, unlike assertTrue(a.equals(b))
  assertEquals(binaryValStored, binaryFldStoredTest);
  
  // fetch the string field and compare its content with the original one
  String stringFldStoredTest = docFromReader.get("stringStored");
  assertEquals(binaryValStored, stringFldStoredTest);
  
  writer.close();
  reader.close();
  dir.close();
}
 
Example 14
Source File: Catalog.java    From cxf with Apache License 2.0 4 votes vote down vote up
/**
 * Searches the index with the condition taken from the search context and returns up
 * to 1000 hits as a JSON array.
 *
 * <p>Each array element carries the hit's source field value, its score, and a URL built
 * from the request's base URI, the {@code Catalog} resource path and the source. When
 * the visitor yields no query, the array is empty. A reader is opened on
 * {@code directory} for this call and closed before returning.
 *
 * @param searchContext supplies the search condition
 * @param uri           request URI information used to build result URLs
 * @return an OK response wrapping the JSON array
 * @throws IOException if the index cannot be opened, searched or read
 */
@GET
@Produces(MediaType.APPLICATION_JSON)
@CrossOriginResourceSharing(allowAllOrigins = true)
@Path("/search")
public Response findBook(@Context SearchContext searchContext,
        @Context final UriInfo uri) throws IOException {

    final IndexReader reader = DirectoryReader.open(directory);
    final IndexSearcher searcher = new IndexSearcher(reader);
    final JsonArrayBuilder builder = Json.createArrayBuilder();

    try {
        visitor.reset();
        visitor.visit(searchContext.getCondition(SearchBean.class));

        final Query query = visitor.getQuery();
        final ScoreDoc[] hits = (query != null)
            ? searcher.search(query, 1000).scoreDocs
            : new ScoreDoc[0];

        for (final ScoreDoc hit : hits) {
            final Document stored = reader.document(hit.doc);
            final String source = stored
                .getField(LuceneDocumentMetadata.SOURCE_FIELD)
                .stringValue();

            builder.add(
                Json.createObjectBuilder()
                    .add("source", source)
                    .add("score", hit.score)
                    .add("url", uri.getBaseUriBuilder()
                            .path(Catalog.class)
                            .path(source)
                            .build().toString())
            );
        }

        return Response.ok(builder.build()).build();
    } finally {
        reader.close();
    }
}
 
Example 15
Source File: ReadTask.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Loads the stored document with the given number from the reader.
 *
 * @param ir reader to load from
 * @param id document number
 * @return the stored document
 * @throws IOException if the stored fields cannot be read
 */
protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
  final Document stored = ir.document(id);
  return stored;
}
 
Example 16
Source File: TokenSources.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Loads the stored document {@code docId} and delegates to the
 * {@code getTokenStream(Document, String, Analyzer)} overload.
 *
 * @param reader   reader to load the document from
 * @param docId    document number
 * @param field    field name passed to the overload
 * @param analyzer analyzer passed to the overload
 * @return the token stream returned by the overload
 * @throws IOException if the document cannot be read
 */
@Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getTokenStream(IndexReader reader, int docId,
    String field, Analyzer analyzer) throws IOException {
  final Document stored = reader.document(docId);
  final TokenStream stream = getTokenStream(stored, field, analyzer);
  return stream;
}
 
Example 17
Source File: TopicIndexer.java    From tagme with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the topic index for a language in {@code workingDir}.
 *
 * <p>Scans every document number of the Wikipedia article index, keeps pages whose type
 * is {@code PageType.TOPIC}, and writes one document per page with its id, title,
 * abstract, best anchor (empty string when none is known) and categories. A single
 * {@code Document} instance is reused: only its category fields are added and removed
 * per page. The index is optimized and closed at the end.
 *
 * @param lang       language whose article index and best-anchor dataset are read
 * @param workingDir directory the new index is written to
 * @throws IOException if reading the source index or writing the new one fails
 */
@Override
	public void makeIndex(String lang, File workingDir) throws IOException
	{
		IndexReader articles = Indexes.getReader(RepositoryDirs.WIKIPEDIA.getPath(lang));
		Int2ObjectMap<String> bestAnchorMap = new BestAnchors(lang).getDataset();

		IndexWriter index = new IndexWriter(new SimpleFSDirectory(workingDir), new IndexWriterConfig(Version.LUCENE_34, new KeywordAnalyzer()));

		// Reusable document template; field values are overwritten for each topic page.
		Document template = new Document();
		Field fWID = new Field(FIELD_WID, "", Store.YES, Index.NOT_ANALYZED);
		Field fTitle = new Field(FIELD_TITLE, "", Store.YES, Index.NOT_ANALYZED);
		Field fAbstract = new Field(FIELD_ABSTRACT, "", Store.YES, Index.NO);
		Field fBestAnchor = new Field(FIELD_BEST_ANCHOR, "", Store.YES, Index.NO);
		template.add(fWID);
		template.add(fTitle);
		template.add(fAbstract);
		template.add(fBestAnchor);

		int max = articles.maxDoc();
		PLogger plog = new PLogger(log, Step.TEN_MINUTES, "pages", "indexed", "noBest");
		plog.setEnd(max);
		plog.start("Start indexing...");

		for (int docNum = 0; docNum < max; docNum++)
		{
			plog.update(0);
			Document page = articles.document(docNum);
			PageType type = PageType.valueOf(page.get(WikipediaIndexer.FIELD_TYPE));
			if (type != PageType.TOPIC)
			{
				continue;
			}

			String widText = page.get(WikipediaIndexer.FIELD_WID);
			int wid = Integer.parseInt(widText);
			fWID.setValue(widText);
			fAbstract.setValue(page.get(WikipediaIndexer.FIELD_ABSTRACT));
			fTitle.setValue(page.get(WikipediaIndexer.FIELD_TITLE));

			String bestAnchor = bestAnchorMap.get(wid);
			if (bestAnchor == null || bestAnchor.length() == 0)
			{
				// counter 2 is labelled "noBest"
				plog.update(2);
			}
			if (bestAnchor == null)
			{
				fBestAnchor.setValue("");
			}
			else
			{
				fBestAnchor.setValue(bestAnchor);
			}

			String[] cats = page.getValues(WikipediaIndexer.FIELD_CAT);
			if (cats != null)
			{
				for (String cat : cats)
				{
					template.add(new Field(FIELD_CAT, cat, Store.YES, Index.NOT_ANALYZED));
				}
			}

			index.addDocument(template);
			plog.update(1);

			// Drop this page's categories so they do not leak into the next page.
			template.removeFields(FIELD_CAT);
		}

		plog.stop();

		log.info("Now optimizing...");
		index.optimize();

		index.close();

		// The WID->DOC_ID map (TopicDocs) is intentionally not rebuilt here: the index
		// is still in the temporary dir, so TopicDocs would be created from the old index.

		log.info("Done.");
	}
 
Example 18
Source File: AnchorTrieDump.java    From tagme with Apache License 2.0 4 votes vote down vote up
/**
 * Builds an {@code AnchorTrie} from every document number of the anchors index.
 *
 * <p>For each document the anchor text and the serialized anchor object are read; the
 * object is deserialized and added to the trie under its text. Documents that are
 * {@code null} or whose anchor fails to deserialize are counted as skipped; insertions
 * rejected by the trie are counted as duplicates. The trie is trimmed before returning.
 *
 * @return the populated, trimmed trie
 * @throws IOException if the anchors index cannot be read
 */
@Override
protected AnchorTrie parseSet() throws IOException
{
	IndexReader anchors = Indexes.getReader(RepositoryDirs.ANCHORS.getPath(lang));
	AnchorTrie trie = new AnchorTrie();
	int maxdoc = anchors.maxDoc();

	PLogger plog = new PLogger(log, Step.TEN_MINUTES, "anchors", "skipped", "duplicates");
	plog.setEnd(0, maxdoc);
	plog.start("Inserting in to trie...");

	int docNum = 0;
	while (docNum < maxdoc)
	{
		plog.update(0);
		Document stored = anchors.document(docNum);
		docNum++;

		if (stored == null)
		{
			plog.update(1);
			continue;
		}

		String anchorText = stored.get(AnchorIndexer.FIELD_TEXT);
		Anchor anchorObj = Anchor.deserialize(stored.get(AnchorIndexer.FIELD_OBJECT));
		if (anchorObj == null)
		{
			plog.update(1);
			continue;
		}

		boolean inserted = trie.add(anchorText, anchorObj);
		if (!inserted)
		{
			plog.update(2);
		}
	}
	plog.stop();

	log.info("Now trimming...");
	trie.trim();
	log.info("Done.");

	return trie;
}
 
Example 19
Source File: MemoryIndex.java    From netbeans with Apache License 2.0 4 votes vote down vote up
/**
 * Runs the queries and maps each converted matching document to its converted terms.
 *
 * <p>Every query must implement {@code TermCollector.TermCollecting}; a shared term
 * collector is attached to it before it is executed. Matching document numbers are
 * gathered in a bit set, then each document is loaded with the selector, converted, and
 * stored in {@code result} if both the converted value and its collected terms are
 * non-null. The whole operation runs under the read lock and returns silently when no
 * reader is available.
 *
 * @param result        map receiving converted document to converted terms
 * @param convertor     converts a loaded document; a {@code null} result skips the document
 * @param termConvertor converts the collected terms
 * @param selector      fields to load; {@code null} selects all fields
 * @param cancel        optional flag checked before each query and each document
 * @param queries       queries to execute
 * @throws IOException              if searching or loading a document fails
 * @throws InterruptedException     if {@code cancel} is set
 * @throws IllegalArgumentException if a query does not implement {@code TermCollecting}
 */
@Override
public <S, T> void queryDocTerms(
        @NonNull Map<? super T, Set<S>> result,
        @NonNull Convertor<? super Document, T> convertor,
        @NonNull Convertor<? super Term, S> termConvertor,
        @NullAllowed FieldSelector selector,
        @NullAllowed AtomicBoolean cancel,
        @NonNull Query... queries) throws IOException, InterruptedException {
    Parameters.notNull("result", result);   //NOI18N
    Parameters.notNull("convertor", convertor);   //NOI18N
    Parameters.notNull("termConvertor", termConvertor); //NOI18N
    Parameters.notNull("queries", queries);   //NOI18N

    final FieldSelector fieldsToLoad = (selector != null) ? selector : AllFieldsSelector.INSTANCE;

    lock.readLock().lock();
    try {
        final IndexReader in = getReader();
        if (in == null) {
            return;
        }
        final BitSet matches = new BitSet(in.maxDoc());
        final Collector matchCollector = new BitSetCollector(matches);
        final Searcher searcher = new IndexSearcher(in);
        final TermCollector termCollector = new TermCollector(matchCollector);
        try {
            for (Query q : queries) {
                if (cancel != null && cancel.get()) {
                    throw new InterruptedException ();
                }
                if (!(q instanceof TermCollector.TermCollecting)) {
                    throw new IllegalArgumentException (
                            String.format("Query: %s does not implement TermCollecting",    //NOI18N
                            q.getClass().getName()));
                }
                ((TermCollector.TermCollecting)q).attach(termCollector);
                searcher.search(q, termCollector);
            }
        } finally {
            searcher.close();
        }

        for (int docNum = matches.nextSetBit(0); docNum >= 0; docNum = matches.nextSetBit(docNum+1)) {
            if (cancel != null && cancel.get()) {
                throw new InterruptedException ();
            }
            final Document doc = in.document(docNum, fieldsToLoad);
            final T value = convertor.convert(doc);
            if (value == null) {
                continue;
            }
            final Set<Term> terms = termCollector.get(docNum);
            if (terms == null) {
                continue;
            }
            result.put (value, convertTerms(termConvertor, terms));
        }
    } finally {
        lock.readLock().unlock();
    }
}
 
Example 20
Source File: MemoryIndex.java    From netbeans with Apache License 2.0 4 votes vote down vote up
/**
 * Runs the queries and adds each converted matching document to {@code result}.
 *
 * <p>Matching document numbers of all queries are gathered in one bit set, so a document
 * matched by several queries is loaded once. Each document is loaded with the selector
 * and converted; {@code null} conversions are dropped. The whole operation runs under
 * the read lock and returns silently when no reader is available.
 *
 * @param result    collection receiving the converted documents
 * @param convertor converts a loaded document; a {@code null} result skips the document
 * @param selector  fields to load; {@code null} selects all fields
 * @param cancel    optional flag checked before each query and each document
 * @param queries   queries to execute
 * @throws IOException          if searching or loading a document fails
 * @throws InterruptedException if {@code cancel} is set
 */
@Override
public <T> void query(
        @NonNull Collection<? super T> result,
        @NonNull Convertor<? super Document, T> convertor,
        @NullAllowed FieldSelector selector,
        @NullAllowed AtomicBoolean cancel,
        @NonNull Query... queries) throws IOException, InterruptedException {
    Parameters.notNull("queries", queries);   //NOI18N
    Parameters.notNull("convertor", convertor); //NOI18N
    Parameters.notNull("result", result);       //NOI18N

    final FieldSelector fieldsToLoad = (selector != null) ? selector : AllFieldsSelector.INSTANCE;

    lock.readLock().lock();
    try {
        final IndexReader in = getReader();
        if (in == null) {
            return;
        }
        final BitSet matches = new BitSet(in.maxDoc());
        final Collector matchCollector = new BitSetCollector(matches);
        final Searcher searcher = new IndexSearcher(in);
        try {
            for (int i = 0; i < queries.length; i++) {
                if (cancel != null && cancel.get()) {
                    throw new InterruptedException ();
                }
                searcher.search(queries[i], matchCollector);
            }
        } finally {
            searcher.close();
        }
        for (int docNum = matches.nextSetBit(0); docNum >= 0; docNum = matches.nextSetBit(docNum+1)) {
            if (cancel != null && cancel.get()) {
                throw new InterruptedException ();
            }
            final T value = convertor.convert(in.document(docNum, fieldsToLoad));
            if (value == null) {
                continue;
            }
            result.add (value);
        }
    } finally {
        lock.readLock().unlock();
    }
}