Java Code Examples for org.apache.lucene.document.Document#getValues()

The following examples show how to use org.apache.lucene.document.Document#getValues(). Each example is taken from an open-source project; the source file, project, and license are noted above each listing.
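Before the project examples, a minimal, self-contained sketch of the method itself may help. Document#getValues(String) returns all stored values of a (possibly multi-valued) field as a String[]; in recent Lucene versions it returns an empty array for a missing field and never null. The sketch below assumes a recent Lucene release (8.x or later); the class name, field names, and values are illustrative only, not taken from any of the projects that follow.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class GetValuesSketch {
    public static void main(String[] args) throws Exception {
        try (Directory dir = new ByteBuffersDirectory()) {
            // Index one document carrying a multi-valued stored field.
            try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
                Document doc = new Document();
                doc.add(new StringField("tag", "lucene", Store.YES));
                doc.add(new StringField("tag", "search", Store.YES));
                writer.addDocument(doc);
            }
            try (DirectoryReader reader = DirectoryReader.open(dir)) {
                Document stored = reader.document(0);
                // getValues() returns every stored value of the field; for an
                // absent field it returns an empty array, never null.
                String[] tags = stored.getValues("tag");     // ["lucene", "search"]
                String[] none = stored.getValues("missing"); // length 0
                System.out.println(tags.length + " values for 'tag', " + none.length + " for a missing field");
            }
        }
    }
}

Note that several of the examples below target much older Lucene releases (TermDocs, IndexReader.deleteDocument, mutable Field values), so their surrounding APIs differ from current ones; the getValues() usage pattern is the same, though some very old releases documented a possible null return, which is why a few examples still check for null.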
Example 1
Source File: LuceneAnchorSearcher.java    From tagme with Apache License 2.0
public String[] getOriginals(Query q)
{
	try {
		TopDocs td = searcher.search(q, 1);
		if (td.totalHits == 0) return null;
		else {
			Document doc = searcher.doc(td.scoreDocs[0].doc);
			return doc.getValues(AnchorIndexer.FIELD_ORIGINAL);
		}
	} catch (IOException ioe) {
		throw new RuntimeException("Unable to search in the anchor index!", ioe);
	}
}
 
Example 2
Source File: LuceneWorkflowInstanceRepository.java    From oodt with Apache License 2.0
private WorkflowTaskConfiguration toTaskConfig(String taskId, Document doc) {
    WorkflowTaskConfiguration taskConfig = new WorkflowTaskConfiguration();

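    // Property names and values are stored as parallel multi-valued fields keyed by task ID.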
    String[] propNames = doc.getValues(taskId + "_config_property_name");
    String[] propValues = doc.getValues(taskId + "_config_property_value");

    if (propNames == null || propValues == null) {
        return taskConfig;
    }

    if (propNames.length != propValues.length) {
        LOG.log(Level.WARNING,
                "Task Config prop name and value arrays are not "
                        + "of same size!");
        return null;
    }

    for (int i = 0; i < propNames.length; i++) {
        taskConfig.addConfigProperty(propNames[i], propValues[i]);
    }

    return taskConfig;
}
 
Example 3
Source File: AbstractLuceneIndexerImpl.java    From alfresco-repository with GNU Lesser General Public License v3.0
protected static Set<String> deletePrimary(Collection<String> nodeRefs, IndexReader reader, boolean delete)
        throws LuceneIndexException
{

    Set<String> refs = new LinkedHashSet<String>();

    for (String nodeRef : nodeRefs)
    {

        try
        {
            TermDocs td = reader.termDocs(new Term("PRIMARYPARENT", nodeRef));
            while (td.next())
            {
                int doc = td.doc();
                Document document = reader.document(doc);
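                // "ID" is a multi-valued field; its last value identifies the node itself.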
                String[] ids = document.getValues("ID");
                refs.add(ids[ids.length - 1]);
                if (delete)
                {
                    reader.deleteDocument(doc);
                }
            }
            td.close();
        }
        catch (IOException e)
        {
            throw new LuceneIndexException("Failed to delete node by primary parent for " + nodeRef, e);
        }
    }

    return refs;

}
 
Example 4
Source File: AbstractLuceneIndexerImpl.java    From alfresco-repository with GNU Lesser General Public License v3.0
protected static Set<String> deleteReference(Collection<String> nodeRefs, IndexReader reader, boolean delete)
        throws LuceneIndexException
{

    Set<String> refs = new LinkedHashSet<String>();

    for (String nodeRef : nodeRefs)
    {

        try
        {
            TermDocs td = reader.termDocs(new Term("PARENT", nodeRef));
            while (td.next())
            {
                int doc = td.doc();
                Document document = reader.document(doc);
                String[] ids = document.getValues("ID");
                refs.add(ids[ids.length - 1]);
                if (delete)
                {
                    reader.deleteDocument(doc);
                }
            }
            td.close();
        }
        catch (IOException e)
        {
            throw new LuceneIndexException("Failed to delete node by parent for " + nodeRef, e);
        }
    }

    return refs;

}
 
Example 5
Source File: AbstractLuceneIndexerImpl.java    From alfresco-repository with GNU Lesser General Public License v3.0
protected static Set<String> deleteContainerAndBelow(String nodeRef, IndexReader reader, boolean delete,
        boolean cascade) throws LuceneIndexException
{
    Set<String> refs = new LinkedHashSet<String>();

    try
    {
        if (delete)
        {
            reader.deleteDocuments(new Term("ID", nodeRef));
        }
        refs.add(nodeRef);
        if (cascade)
        {
            TermDocs td = reader.termDocs(new Term("ANCESTOR", nodeRef));
            while (td.next())
            {
                int doc = td.doc();
                Document document = reader.document(doc);
                String[] ids = document.getValues("ID");
                refs.add(ids[ids.length - 1]);
                if (delete)
                {
                    reader.deleteDocument(doc);
                }
            }
            td.close();
        }
    }
    catch (IOException e)
    {
        throw new LuceneIndexException("Failed to delete container and below for " + nodeRef, e);
    }
    return refs;
}
 
Example 6
Source File: KNearestNeighborDocumentClassifier.java    From lucene-solr with Apache License 2.0
/**
 * Returns the top k results from a More Like This query based on the input document
 *
 * @param document the document to use for More Like This search
 * @return the top results for the MLT query
 * @throws IOException If there is a low-level I/O error
 */
private TopDocs knnSearch(Document document) throws IOException {
  BooleanQuery.Builder mltQuery = new BooleanQuery.Builder();

  for (String fieldName : textFieldNames) {
    String boost = null;
    if (fieldName.contains("^")) {
      String[] field2boost = fieldName.split("\\^");
      fieldName = field2boost[0];
      boost = field2boost[1];
    }
    String[] fieldValues = document.getValues(fieldName);
    mlt.setBoost(true); // always use the boost coming from TF * IDF of the term
    if (boost != null) {
      mlt.setBoostFactor(Float.parseFloat(boost)); // this is an additional multiplicative boost coming from the field boost
    }
    mlt.setAnalyzer(field2analyzer.get(fieldName));
    for (String fieldContent : fieldValues) {
      mltQuery.add(new BooleanClause(mlt.like(fieldName, new StringReader(fieldContent)), BooleanClause.Occur.SHOULD));
    }
    mlt.setBoostFactor(1); // restore neutral boost for the next field
  }
  Query classFieldQuery = new WildcardQuery(new Term(classFieldName, "*"));
  mltQuery.add(new BooleanClause(classFieldQuery, BooleanClause.Occur.MUST));
  if (query != null) {
    mltQuery.add(query, BooleanClause.Occur.MUST);
  }
  return indexSearcher.search(mltQuery.build(), k);
}
 
Example 7
Source File: LuceneIndexCorpus.java    From word2vec-lucene with Apache License 2.0
@Override
public String nextWord() throws IOException {
  
  while(true){
    // check the tokenStream first
    if(tokenStream != null && tokenStream.incrementToken()){
      return new String(termAtt.buffer(), 0, termAtt.length());
    }

    if(tokenStream != null)
      tokenStream.close();
    if(valPos < values.length){
      tokenStream = analyzer.tokenStream(field, values[valPos++]);
      termAtt = tokenStream.getAttribute(CharTermAttribute.class);
      tokenStream.reset();
      eoc = false;
      return null;
    }
    else{
      if(tdPos >= topDocs.totalHits){
        tokenStream = null;
        eoc = true;
        return null;   // end of index == end of corpus
      }
      Document doc = reader.document(topDocs.scoreDocs[tdPos++].doc);
      values = doc.getValues(field);   // This method returns an empty array when there are no matching fields.
                                       // It never returns null.
      valPos = 0;
      tokenStream = null;
    }
  }
}
 
Example 8
Source File: LuceneWorkflowInstanceRepository.java    From oodt with Apache License 2.0
private List toTasks(Document doc) {
    List taskList = new Vector();

    String[] taskIds = doc.getValues("task_id");
    String[] taskNames = doc.getValues("task_name");
    String[] taskOrders = doc.getValues("task_order");
    String[] taskClasses = doc.getValues("task_class");

    if (taskIds.length != taskNames.length
            || taskIds.length != taskOrders.length
            || taskIds.length != taskClasses.length) {
        LOG.log(Level.WARNING,
                "task arrays are not of same size when rebuilding "
                        + "task list from Document!");
        return null;
    }

    for (int i = 0; i < taskIds.length; i++) {
        WorkflowTask task = new WorkflowTask();
        task.setOrder(Integer.parseInt(taskOrders[i]));
        task.setTaskName(taskNames[i]);
        task.setTaskId(taskIds[i]);
        task.setTaskInstanceClassName(taskClasses[i]);

        task.setConditions(toConditions(task.getTaskId(), doc));
        task.setTaskConfig(toTaskConfig(task.getTaskId(), doc));
        taskList.add(task);
    }

    return taskList;
}
 
Example 9
Source File: ReferenceCountingReadOnlyIndexReaderFactory.java    From alfresco-repository with GNU Lesser General Public License v3.0
public String[] getLinkAspects(int n) throws IOException
{
    // return getStringValues(n, "LINKASPECT");
    Document d = document(n, new SingleFieldSelector("LINKASPECT", false));
    return d.getValues("LINKASPECT");
}
 
Example 10
Source File: ReferenceCountingReadOnlyIndexReaderFactory.java    From alfresco-repository with GNU Lesser General Public License v3.0
public String[] getParents(int n) throws IOException
{
    // return getStringValues(n, "PARENT");
    Document d = document(n, new SingleFieldSelector("PARENT", false));
    return d.getValues("PARENT");
}
 
Example 11
Source File: TopicIndexer.java    From tagme with Apache License 2.0
@Override
	public void makeIndex(String lang, File workingDir) throws IOException
	{
		
		IndexReader articles = Indexes.getReader(RepositoryDirs.WIKIPEDIA.getPath(lang));
		Int2ObjectMap<String> bestAnchorMap = new BestAnchors(lang).getDataset();
		
		IndexWriter index = new IndexWriter(new SimpleFSDirectory(workingDir), new IndexWriterConfig(Version.LUCENE_34, new KeywordAnalyzer()));
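		// Reuse a single Document and its Field instances for every page, updating
		// the field values per article to avoid per-document allocations.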
		Document doc = new Document();
		Field fWID = new Field(FIELD_WID, "", Store.YES, Index.NOT_ANALYZED);
		Field fTitle = new Field(FIELD_TITLE, "", Store.YES, Index.NOT_ANALYZED);
		Field fAbstract = new Field(FIELD_ABSTRACT, "", Store.YES, Index.NO);
		Field fBestAnchor = new Field(FIELD_BEST_ANCHOR, "", Store.YES, Index.NO);
		doc.add(fWID);
		doc.add(fTitle);
		doc.add(fAbstract);
		doc.add(fBestAnchor);

		int max = articles.maxDoc();
		PLogger plog = new PLogger(log, Step.TEN_MINUTES, "pages", "indexed", "noBest");
		plog.setEnd(max);
		plog.start("Start indexing...");
		
		for(int i=0; i<max; i++)
		{
			plog.update(0);
			Document oldDoc = articles.document(i);
			PageType type = PageType.valueOf(oldDoc.get(WikipediaIndexer.FIELD_TYPE));
			if (type == PageType.TOPIC)
			{
				int wid = Integer.parseInt(oldDoc.get(WikipediaIndexer.FIELD_WID));
				fWID.setValue(oldDoc.get(WikipediaIndexer.FIELD_WID));
				fAbstract.setValue(oldDoc.get(WikipediaIndexer.FIELD_ABSTRACT));
				fTitle.setValue(oldDoc.get(WikipediaIndexer.FIELD_TITLE));
				
				String bestAnchor = bestAnchorMap.get(wid);
				if (bestAnchor == null || bestAnchor.length() == 0) plog.update(2);
				fBestAnchor.setValue(bestAnchor==null?"":bestAnchor);
				
				String[] cats = oldDoc.getValues(WikipediaIndexer.FIELD_CAT);
				if (cats != null) {
					for (int j=0; j<cats.length; j++)
						doc.add(new Field(FIELD_CAT, cats[j], Store.YES, Index.NOT_ANALYZED));
				}
				
				index.addDocument(doc);
				plog.update(1);
				
				doc.removeFields(FIELD_CAT);
			}
		}
		
		plog.stop();
		
		log.info("Now optimizing...");
		index.optimize();
		
		index.close();
		
		//we cannot call this because the index is still in the temporary dir
		//so TopicDocs will be created using old index
//		log.info("Index Done, now creating WID->DOC_ID map");
//		
//		TopicDocs td = new TopicDocs(lang);
//		td.forceParsing();
		
		log.info("Done.");
	}
 
Example 12
Source File: LuceneIndex.java    From rdf4j with BSD 3-Clause "New" or "Revised" License
@SuppressWarnings("unused")
private void logIndexStats() {
	try {
		IndexReader reader = null;
		try {
			reader = getIndexReader();

			Document doc;
			int totalFields = 0;

			Set<String> ids = new HashSet<>();
			String[] idArray;
			int count = 0;
			for (int i = 0; i < reader.maxDoc(); i++) {
				if (isDeleted(reader, i)) {
					continue;
				}
				doc = readDocument(reader, i, null);
				totalFields += doc.getFields().size();
				count++;
				idArray = doc.getValues("id");
				for (String id : idArray) {
					ids.add(id);
				}

			}

			logger.info("Total documents in the index: " + reader.numDocs()
					+ ", number of deletable documents in the index: " + reader.numDeletedDocs()
					+ ", valid documents: " + count + ", total fields in all documents: " + totalFields
					+ ", average number of fields per document: " + ((double) totalFields) / reader.numDocs());
			logger.info("Distinct ids in the index: " + ids.size());

		} finally {
			ReaderMonitor toCloseCurrentMonitor = currentMonitor;
			currentMonitor = null;
			if (toCloseCurrentMonitor != null) {
				toCloseCurrentMonitor.closeWhenPossible();
			}
		}
	} catch (IOException e) {
		logger.warn(e.getMessage(), e);
	}

}
 
Example 13
Source File: ConfusionMatrixGenerator.java    From lucene-solr with Apache License 2.0
/**
 * get the {@link org.apache.lucene.classification.utils.ConfusionMatrixGenerator.ConfusionMatrix} of a given {@link Classifier},
 * generated on the given {@link IndexReader}, class and text fields.
 *
 * @param reader              the {@link IndexReader} containing the index used for creating the {@link Classifier}
 * @param classifier          the {@link Classifier} whose confusion matrix has to be generated
 * @param classFieldName      the name of the Lucene field used as the classifier's output
 * @param textFieldName       the name of the Lucene field used as the classifier's input
 * @param timeoutMilliseconds timeout to wait before stopping creating the confusion matrix
 * @param <T>                 the return type of the {@link ClassificationResult} returned by the given {@link Classifier}
 * @return a {@link org.apache.lucene.classification.utils.ConfusionMatrixGenerator.ConfusionMatrix}
 * @throws IOException if problems occur while reading the index or using the classifier
 */
public static <T> ConfusionMatrix getConfusionMatrix(IndexReader reader, Classifier<T> classifier, String classFieldName,
                                                     String textFieldName, long timeoutMilliseconds) throws IOException {

  ExecutorService executorService = Executors.newFixedThreadPool(1, new NamedThreadFactory("confusion-matrix-gen-"));

  try {

    Map<String, Map<String, Long>> counts = new HashMap<>();
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    TopDocs topDocs = indexSearcher.search(new TermRangeQuery(classFieldName, null, null, true, true), Integer.MAX_VALUE);
    double time = 0d;

    int counter = 0;
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {

      if (timeoutMilliseconds > 0 && time >= timeoutMilliseconds) {
        break;
      }

      Document doc = reader.document(scoreDoc.doc);
      String[] correctAnswers = doc.getValues(classFieldName);

      if (correctAnswers != null && correctAnswers.length > 0) {
        Arrays.sort(correctAnswers);
        ClassificationResult<T> result;
        String text = doc.get(textFieldName);
        if (text != null) {
          try {
            // fail if classification takes more than 5s
            long start = System.currentTimeMillis();
            result = executorService.submit(() -> classifier.assignClass(text)).get(5, TimeUnit.SECONDS);
            long end = System.currentTimeMillis();
            time += end - start;

            if (result != null) {
              T assignedClass = result.getAssignedClass();
              if (assignedClass != null) {
                counter++;
                String classified = assignedClass instanceof BytesRef ? ((BytesRef) assignedClass).utf8ToString() : assignedClass.toString();

                String correctAnswer;
                if (Arrays.binarySearch(correctAnswers, classified) >= 0) {
                  correctAnswer = classified;
                } else {
                  correctAnswer = correctAnswers[0];
                }

                Map<String, Long> stringLongMap = counts.get(correctAnswer);
                if (stringLongMap != null) {
                  Long aLong = stringLongMap.get(classified);
                  if (aLong != null) {
                    stringLongMap.put(classified, aLong + 1);
                  } else {
                    stringLongMap.put(classified, 1L);
                  }
                } else {
                  stringLongMap = new HashMap<>();
                  stringLongMap.put(classified, 1L);
                  counts.put(correctAnswer, stringLongMap);
                }

              }
            }
          } catch (TimeoutException timeoutException) {
            // add classification timeout
            time += 5000;
          } catch (ExecutionException | InterruptedException executionException) {
            throw new RuntimeException(executionException);
          }

        }
      }
    }
    return new ConfusionMatrix(counts, time / counter, counter);
  } finally {
    executorService.shutdown();
  }
}
 
Example 14
Source File: SearcherDAO.java    From entando-core with GNU Lesser General Public License v3.0
protected FacetedContentsResult searchContents(SearchEngineFilter[] filters,
        Collection<ITreeNode> categories, Collection<String> allowedGroups, boolean faceted) throws ApsSystemException {
    FacetedContentsResult result = new FacetedContentsResult();
    List<String> contentsId = new ArrayList<String>();
    IndexSearcher searcher = null;
    try {
        searcher = this.getSearcher();
        Query query = null;
        if ((null == filters || filters.length == 0)
                && (null == categories || categories.isEmpty())
                && (allowedGroups != null && allowedGroups.contains(Group.ADMINS_GROUP_NAME))) {
            query = new MatchAllDocsQuery();
        } else {
            query = this.createQuery(filters, categories, allowedGroups);
        }
        TopDocs topDocs = searcher.search(query, 1000);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        Map<String, Integer> occurrences = new HashMap<String, Integer>();
        if (scoreDocs.length > 0) {
            for (int index = 0; index < scoreDocs.length; index++) {
                Document doc = searcher.doc(scoreDocs[index].doc);
                contentsId.add(doc.get(IIndexerDAO.DATAOBJECT_ID_FIELD_NAME));
                if (faceted) {
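                    // Collect the category codes on this document and bump each code's occurrence count.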
                    Set<String> codes = new HashSet<String>();
                    String[] categoryPaths = doc.getValues(IIndexerDAO.DATAOBJECT_CATEGORY_FIELD_NAME);
                    for (int i = 0; i < categoryPaths.length; i++) {
                        String categoryPath = categoryPaths[i];
                        String[] paths = categoryPath.split(IIndexerDAO.DATAOBJECT_CATEGORY_SEPARATOR);
                        codes.addAll(Arrays.asList(paths));
                    }
                    Iterator<String> iter = codes.iterator();
                    while (iter.hasNext()) {
                        String code = iter.next();
                        Integer value = occurrences.get(code);
                        if (null == value) {
                            value = 0;
                        }
                        occurrences.put(code, (value + 1));
                    }
                }
            }
        }
        result.setOccurrences(occurrences);
        result.setContentsId(contentsId);
    } catch (IndexNotFoundException inf) {
        logger.error("no index was found in the Directory", inf);
    } catch (Throwable t) {
        logger.error("Error extracting documents", t);
        throw new ApsSystemException("Error extracting documents", t);
    } finally {
        this.releaseResources(searcher);
    }
    return result;
}
 
Example 15
Source File: LuceneWorkflowInstanceRepository.java    From oodt with Apache License 2.0
private WorkflowInstance toWorkflowInstance(Document doc) {
    WorkflowInstance inst = new WorkflowInstance();

    // first read all the instance info
    inst.setId(doc.get("workflow_inst_id"));
    
    inst.setTimesBlocked(Integer.parseInt(
            doc.get("workflow_inst_timesblocked") != null ? doc.get("workflow_inst_timesblocked") : "0"));
    
    // try and construct a state
    WorkflowState state = new WorkflowState();
    state.setName(doc.get("workflow_inst_status"));
    if(doc.get("workflow_inst_state_category") != null){
      WorkflowLifecycleStage category = new WorkflowLifecycleStage();
      category.setName(doc.get("workflow_inst_state_category"));
      state.setCategory(category);
    }
    
    if(doc.get("workflow_inst_state_desc") != null){
      state.setDescription(doc.get("workflow_inst_state_desc"));
    }
    
    if(doc.get("workflow_inst_state_message") != null){
      state.setMessage(doc.get("workflow_inst_state_message"));
    }        
    inst.setState(state);
    inst.setCurrentTaskId(doc.get("workflow_inst_current_task_id"));
    inst.setCurrentTaskStartDateTimeIsoStr(doc
            .get("workflow_inst_currenttask_startdatetime"));
    inst.setCurrentTaskEndDateTimeIsoStr(doc
            .get("workflow_inst_currenttask_enddatetime"));
    inst.setStartDateTimeIsoStr(doc.get("workflow_inst_startdatetime"));
    inst.setEndDateTimeIsoStr(doc.get("workflow_inst_enddatetime"));
    inst.setPriority(Priority.getPriority(doc.get("workflow_inst_priority") != null
            ? Double.valueOf(doc.get("workflow_inst_priority"))
            : Priority.getDefault().getValue()));

    // read the workflow instance metadata
    Metadata sharedContext = new Metadata();
    String[] instMetFields = doc.getValues("workflow_inst_met_flds");
    if (instMetFields != null && instMetFields.length > 0) {
        for (String fldName : instMetFields) {
            String[] vals = doc.getValues(fldName);
            if (vals != null && vals.length > 0) {
                for (String val : vals) {
                    sharedContext.addMetadata(fldName, val);
                }
            }
        }
    }

    inst.setSharedContext(sharedContext);

    // now read all of the workflow info

    Workflow workflow = new Workflow();

    workflow.setId(doc.get("workflow_id"));
    workflow.setName(doc.get("workflow_name"));
    workflow.setTasks(toTasks(doc));
    workflow.setConditions(toConditions("workflow_condition_"+workflow.getId(), doc));

    inst.setWorkflow(workflow);

    return inst;
}
 
Example 16
Source File: LuceneWorkflowInstanceRepository.java    From oodt with Apache License 2.0
private List toConditions(String taskId, Document doc) {
    List condList = new Vector();

    String[] condNames = doc.getValues(taskId + "_condition_name");
    String[] condClasses = doc.getValues(taskId + "_condition_class");
    String[] condOrders = doc.getValues(taskId + "_condition_order");
    String[] condIds = doc.getValues(taskId + "_condition_id");
    String[] condTimeouts = doc.getValues(taskId + "_condition_timeout");
    String[] condOptionals = doc.getValues(taskId + "_condition_optional");

    if (condNames == null) {
        return condList;
    }
    
    if (condNames.length != condClasses.length
            || condNames.length != condOrders.length
            || condNames.length != condIds.length 
            || (condTimeouts != null && condNames.length != condTimeouts.length)
            || (condOptionals != null && condNames.length != condOptionals.length)) {
        LOG.log(Level.WARNING,
                "Condition arrays are not of same size when "
                        + "rebuilding from given Document");
        return null;
    }
    
    for (int i = 0; i < condNames.length; i++) {
        WorkflowCondition cond = new WorkflowCondition();
        cond.setConditionId(condIds[i]);
        cond.setConditionInstanceClassName(condClasses[i]);
        cond.setConditionName(condNames[i]);
        cond.setOrder(Integer.parseInt(condOrders[i]));
        if(condTimeouts != null){
          cond.setTimeoutSeconds(Long.parseLong(condTimeouts[i]));
        }
        if(condOptionals != null){
          cond.setOptional(Boolean.valueOf(condOptionals[i]));
        }
        condList.add(cond);
    }
    
    return condList;
}