Java Code Examples for org.apache.lucene.index.IndexReader#docFreq()

The following examples show how to use org.apache.lucene.index.IndexReader#docFreq(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: QueryTermExtractor.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Extracts the term texts of a query and scales each term's weight by its
 * inverse document frequency (IDF).
 *
 * @param query     query to extract term texts from
 * @param reader    index used to look up document frequencies; the resulting IDF can be used to
 *                  a) score selected fragments better, or b) drive graded highlights,
 *                  e.g. changing the intensity of the font color
 * @param fieldName the field on which the IDF calculations are based
 * @return an array of the terms used in the query, with their IDF-scaled weights
 */
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName)
{
  final WeightedTerm[] weightedTerms = getTerms(query, false, fieldName);
  final int maxDoc = reader.maxDoc();
  for (WeightedTerm weightedTerm : weightedTerms) {
    try {
      final int df = reader.docFreq(new Term(fieldName, weightedTerm.term));
      // IDF formula taken from the ClassicSimilarity class
      final double ratio = maxDoc / (double) (df + 1);
      weightedTerm.weight *= (float) (Math.log(ratio) + 1.0);
    } catch (IOException e) {
      // ignore: the term simply keeps its unscaled weight
    }
  }
  return weightedTerms;
}
 
Example 2
Source File: LuceneHelper.java    From dexter with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the TFIDF vector and returns its Euclidean (L2) norm.
 * <p>
 * For every term the score is {@code tf * (log2(collectionSize / df + 1) + 1)},
 * where {@code df} is the document frequency of the term in {@code field}.
 * A term that does not occur in the index ({@code df == 0}) is treated as if
 * it occurred in a single document, so that its score stays finite instead of
 * becoming {@code Infinity} through a division by zero.
 * 
 * @param tfidf
 *            - the vector containing for each term its TFIDF score, it will
 *            be populated by this method
 * @param freq
 *            - the vector containing for each term its frequency
 * @param field
 *            - the field on which to compute the inverse document frequency
 * 
 * @return the norm of the TFIDF vector
 * 
 */
private double tfidfVector(Map<String, Double> tfidf,
		Map<String, Integer> freq, String field) {
	IndexReader reader = getReader();

	double norm = 0;
	for (Map.Entry<String, Integer> entry : freq.entrySet()) {
		Term t = new Term(field, entry.getKey());
		int df = 0;
		try {
			df = reader.docFreq(t);
		} catch (IOException e) {
			logger.error("computing tfidfVector ({}) ", e.toString());
			System.exit(-1);
		}
		// Guard against df == 0 (term absent from the index): dividing by
		// zero would yield an infinite idf and poison the norm. Clamping to
		// 1 gives unseen terms the largest finite idf and leaves every
		// df >= 1 result unchanged.
		double effectiveDf = Math.max(df, 1);
		double idf = Math.log(collectionSize / effectiveDf + 1)
				/ Math.log(2) + 1;
		double tfidfValue = entry.getValue() * idf;
		norm += tfidfValue * tfidfValue;
		tfidf.put(entry.getKey(), tfidfValue);
	}
	return Math.sqrt(norm);

}
 
Example 3
Source File: PassageRankingComponent.java    From wiseowl with MIT License 5 votes vote down vote up
/**
 * Computes the weight of a term as the reciprocal of its document frequency.
 *
 * @param term   the term to weigh
 * @param reader the index reader used to look up the document frequency
 * @return {@code 1 / docFreq}, or {@code 0} when the document frequency is zero
 * @throws IOException if the document frequency lookup fails
 */
protected float calculateWeight(Term term, IndexReader reader) throws IOException {
  final int df = reader.docFreq(term);

  // if a term is not in the index, then its weight is 0
  if (df == 0) {
    log.warn("Couldn't find doc freq for term {}", term);
    return 0f;
  }

  log.warn("Term {} doc freq.{}", term.toString(), df);
  return 1.0f / df;
}
 
Example 4
Source File: WeightedSpanTermExtractor.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>,
 * then scales every term weight by an IDF factor computed from the supplied <code>IndexReader</code>
 * (for gradient highlighting).
 *
 * @param query
 *          that caused hit
 * @param boost
 *          passed through to the term extraction step together with the query
 * @param tokenStream
 *          of text to be highlighted
 * @param fieldName
 *          restricts Term's used based on field name
 * @param reader
 *          to use for scoring
 * @return Map of WeightedSpanTerms with quasi tf/idf scores
 * @throws IOException If there is a low-level I/O error
 */
public Map<String,WeightedSpanTerm> getWeightedSpanTermsWithScores(Query query, float boost, TokenStream tokenStream, String fieldName,
    IndexReader reader) throws IOException {
  // null or not, the given field name is what gets stored
  this.fieldName = fieldName;
  this.tokenStream = tokenStream;

  final Map<String,WeightedSpanTerm> spanTerms = new PositionCheckingMap<>();
  extract(query, boost, spanTerms);

  final int maxDoc = reader.maxDoc();
  final Set<String> termTexts = spanTerms.keySet();

  try {
    for (String termText : termTexts) {
      final WeightedSpanTerm spanTerm = spanTerms.get(termText);
      final int df = reader.docFreq(new Term(fieldName, spanTerm.term));
      // IDF formula taken from the ClassicSimilarity class
      final double idf = Math.log(maxDoc / (double) (df + 1)) + 1.0;
      spanTerm.weight *= (float) idf;
    }
  } finally {
    // the internal reader is released whether or not the weighting succeeded
    IOUtils.close(internalReader);
  }

  return spanTerms;
}
 
Example 5
Source File: WordBreakSpellChecker.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Wraps a piece of text in a {@link SuggestWord} whose frequency is the
 * document frequency of that text in the given field and whose score is 1.
 *
 * @param ir        reader used to look up the document frequency
 * @param fieldname field the term belongs to
 * @param text      term text, also used as the suggestion string
 * @return the populated suggestion
 * @throws IOException if the document frequency lookup fails
 */
private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException {
  final int docFreq = ir.docFreq(new Term(fieldname, text));

  final SuggestWord suggestion = new SuggestWord();
  suggestion.string = text;
  suggestion.score = 1;
  suggestion.freq = docFreq;
  return suggestion;
}
 
Example 6
Source File: RandomSamplingFacetsCollector.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Note: if you use a counting {@link Facets} implementation, you can amortize the
 * sampled counts by calling this method. Uses the {@link FacetsConfig} and
 * the {@link IndexSearcher} to determine the upper bound for each facet value.
 * <p>
 * Each sampled count is divided by the sampling rate and then capped by the
 * document frequency of the facet's indexed path; the total is capped by the
 * number of documents in the reader.
 *
 * @param res      the sampled result; returned unchanged when {@code null} or when
 *                 the total hit count does not exceed the sample size
 * @param config   supplies the dimension configuration (index field name) for {@code res.dim}
 * @param searcher supplies the index reader used for the upper bounds
 * @return a new result with corrected counts, or {@code res} itself (see above)
 * @throws IOException if a document frequency lookup fails
 */
public FacetResult amortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher) throws IOException {
  if (res == null || totalHits <= sampleSize) {
    return res;
  }

  final LabelAndValue[] amortized = new LabelAndValue[res.labelValues.length];
  final IndexReader indexReader = searcher.getIndexReader();
  final DimConfig dimConfig = config.getDimConfig(res.dim);

  // layout: [dim, path..., child label]; only the last slot changes per label
  final int pathLength = res.path.length;
  final String[] pathParts = new String[pathLength + 2];
  pathParts[0] = res.dim;
  System.arraycopy(res.path, 0, pathParts, 1, pathLength);
  final int childSlot = pathLength + 1;

  int slot = 0;
  for (LabelAndValue sampled : res.labelValues) {
    pathParts[childSlot] = sampled.label;
    final String indexedPath = FacetsConfig.pathToString(pathParts, pathParts.length);
    final int upperBound = indexReader.docFreq(new Term(dimConfig.indexFieldName, indexedPath));
    final int scaled = (int) (sampled.value.doubleValue() / samplingRate);
    amortized[slot++] = new LabelAndValue(sampled.label, Math.min(upperBound, scaled));
  }

  // cap the total count on the total number of non-deleted documents in the reader
  int total = res.value.intValue();
  if (total > 0) {
    final int scaledTotal = (int) (res.value.doubleValue() / samplingRate);
    total = Math.min(indexReader.numDocs(), scaledTotal);
  }

  return new FacetResult(res.dim, res.path, total, amortized, res.childCount);
}
 
Example 7
Source File: TermQuery.java    From alfresco-repository with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Builds the score explanation for one document as the product of a query
 * weight (boost, idf, queryNorm) and a field weight (tf, idf, fieldNorm).
 *
 * @param reader reader used for the document frequency, document count, scorer and norms
 * @param doc    the document number to explain
 * @return the field-weight explanation alone when the query weight is exactly
 *         {@code 1.0f}; otherwise the combined explanation
 * @throws IOException if reading from the index fails
 */
public Explanation explain(IndexReader reader, int doc)
  throws IOException {

  final ComplexExplanation combined = new ComplexExplanation();
  combined.setDescription("weight("+getQuery()+" in "+doc+"), product of:");

  // the idf detail is shared by the query-weight and the field-weight parts
  final Explanation idfPart =
    new Explanation(idf, "idf(docFreq=" + reader.docFreq(term) +
        ", numDocs=" + reader.numDocs() + ")");

  // ---- query weight ----
  final Explanation queryPart = new Explanation();
  queryPart.setDescription("queryWeight(" + getQuery() + "), product of:");

  // the boost always enters the product but is only listed when it is not 1
  final Explanation boostPart = new Explanation(getBoost(), "boost");
  if (getBoost() != 1.0f) {
    queryPart.addDetail(boostPart);
  }
  queryPart.addDetail(idfPart);

  final Explanation queryNormPart = new Explanation(queryNorm,"queryNorm");
  queryPart.addDetail(queryNormPart);

  queryPart.setValue(boostPart.getValue() *
                     idfPart.getValue() *
                     queryNormPart.getValue());

  combined.addDetail(queryPart);

  // ---- field weight ----
  final String field = term.field();
  final ComplexExplanation fieldPart = new ComplexExplanation();
  fieldPart.setDescription("fieldWeight("+term+" in "+doc+
                           "), product of:");

  final Explanation tfPart = scorer(reader).explain(doc);
  fieldPart.addDetail(tfPart);
  fieldPart.addDetail(idfPart);

  final Explanation fieldNormPart = new Explanation();
  final byte[] norms = reader.norms(field);
  // a field without norms contributes a norm of 0
  float decodedNorm = 0.0f;
  if (norms != null) {
    decodedNorm = Similarity.decodeNorm(norms[doc]);
  }
  fieldNormPart.setValue(decodedNorm);
  fieldNormPart.setDescription("fieldNorm(field="+field+", doc="+doc+")");
  fieldPart.addDetail(fieldNormPart);

  fieldPart.setMatch(Boolean.valueOf(tfPart.isMatch()));
  fieldPart.setValue(tfPart.getValue() *
                     idfPart.getValue() *
                     fieldNormPart.getValue());

  combined.addDetail(fieldPart);
  combined.setMatch(fieldPart.getMatch());

  // ---- combine them ----
  combined.setValue(queryPart.getValue() * fieldPart.getValue());

  // a query weight of exactly 1 adds nothing, so only the field part is returned
  if (queryPart.getValue() == 1.0f) {
    return fieldPart;
  }
  return combined;
}
 
Example 8
Source File: WordBreakSpellChecker.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * <p>
 * Generate suggestions by breaking the passed-in term into multiple words.
 * The scores returned are equal to the number of word breaks needed so a
 * lower score is generally preferred over a higher score.
 * </p>
 * 
 * @param term
 *          - the term to break up
 * @param maxSuggestions
 *          - values below 1 yield an empty result
 * @param ir
 *          - reader used to look up the document frequency of the original
 *          term and passed on to the break-up generation
 * @param suggestMode
 *          - default = {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}
 * @param sortMethod
 *          - default =
 *          {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}
 * @return one or more arrays of words formed by breaking up the original term;
 *         empty when {@code maxSuggestions < 1}, or when the term is in the
 *         index and the mode is {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}
 * @throws IOException If there is a low-level I/O error.
 */
public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions,
    IndexReader ir, SuggestMode suggestMode,
    BreakSuggestionSortMethod sortMethod) throws IOException {
  if (maxSuggestions < 1) {
    return new SuggestWord[0][0];
  }

  // fall back to the documented defaults for omitted options
  final SuggestMode mode =
      (suggestMode != null) ? suggestMode : SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
  final BreakSuggestionSortMethod sort =
      (sortMethod != null) ? sortMethod : BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;

  final Comparator<SuggestWordArrayWrapper> ordering;
  if (sort == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
    ordering = new LengthThenMaxFreqComparator();
  } else {
    ordering = new LengthThenSumFreqComparator();
  }
  final Queue<SuggestWordArrayWrapper> candidates =
      new PriorityQueue<>(Math.min(maxSuggestions, 10), ordering);

  final int originalFreq = ir.docFreq(term);
  if (mode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && originalFreq > 0) {
    return new SuggestWord[0][];
  }

  // in "more popular" mode the frequency of the original term (at least 1 when
  // it is absent) replaces the configured minimum
  int minFrequency = minSuggestionFrequency;
  if (mode == SuggestMode.SUGGEST_MORE_POPULAR) {
    minFrequency = (originalFreq != 0) ? originalFreq : 1;
  }

  generateBreakUpSuggestions(term, ir, 1, maxSuggestions,
      minFrequency, new SuggestWord[0], candidates, 0,
      sort);

  // drain the queue, filling the result from the last slot to the first
  final SuggestWord[][] result = new SuggestWord[candidates.size()][];
  int slot = result.length;
  while (slot > 0) {
    result[--slot] = candidates.remove().suggestWords;
  }

  return result;
}
 
Example 9
Source File: TermVectorComponent.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Adds a new {@code NamedList} for {@code field} to {@code docNL} and fills it
 * with one entry per term returned by {@code termsEnum}. Depending on
 * {@code fieldOptions}, each term entry receives "tf", "positions", "offsets",
 * "payloads", "df" and "tf-idf" values.
 *
 * @param docNL        the per-document list that receives the field entry
 * @param fieldOptions flags selecting which pieces of information are emitted
 * @param reader       used to look up the document frequency of each term
 * @param docID        not referenced in this method body
 * @param termsEnum    the terms to walk; iterated until {@code next()} returns null
 * @param field        the field name, used as list key and for the df lookup
 * @throws IOException if reading terms, postings or document frequencies fails
 */
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
  NamedList<Object> fieldNL = new NamedList<>();
  docNL.add(field, fieldNL);

  BytesRef text;
  // kept across iterations so it can be handed back to termsEnum.postings() for reuse
  PostingsEnum dpEnum = null;
  while((text = termsEnum.next()) != null) {
    String term = text.utf8ToString();
    NamedList<Object> termInfo = new NamedList<>();
    fieldNL.add(term, termInfo);
    // the total term frequency is narrowed to int; it also bounds the position loop below
    final int freq = (int) termsEnum.totalTermFreq();
    if (fieldOptions.termFreq == true) {
      termInfo.add("tf", freq);
    }

    // request only the postings features that the options ask for
    int dpEnumFlags = 0;
    dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
    //payloads require offsets
    dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
    dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
    dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);

    // advance the postings onto a document before any position is read
    boolean atNextDoc = false;
    if (dpEnum != null) {
      dpEnum.nextDoc();
      atNextDoc = true;
    }

    // positions are only walked when at least one postings feature was requested
    if (atNextDoc && dpEnumFlags != 0) {
      // the sub-lists are created lazily, so a term without any such data gets no empty list
      NamedList<Integer> positionsNL = null;
      NamedList<Number> theOffsets = null;
      NamedList<String> thePayloads = null;

      for (int i = 0; i < freq; i++) {
        // nextPosition() is called for every occurrence, even when positions are not
        // reported, because offsets and payloads are read after it
        final int pos = dpEnum.nextPosition();
        if (fieldOptions.positions && pos >= 0) {
          if (positionsNL == null) {
            positionsNL = new NamedList<>();
            termInfo.add("positions", positionsNL);
          }
          positionsNL.add("position", pos);
        }

        // a negative start offset is treated as "no offset available"
        int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
        if (startOffset >= 0) {
          if (theOffsets == null) {
            theOffsets = new NamedList<>();
            termInfo.add("offsets", theOffsets);
          }
          theOffsets.add("start", dpEnum.startOffset());
          theOffsets.add("end", dpEnum.endOffset());
        }

        // payload bytes are emitted Base64-encoded
        BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
        if (payload != null) {
          if (thePayloads == null) {
            thePayloads = new NamedList<>();
            termInfo.add("payloads", thePayloads);
          }
          thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
        }
      }
    }
    
    // the document frequency is looked up once and shared by "df" and "tf-idf"
    int df = 0;
    if (fieldOptions.docFreq || fieldOptions.tfIdf) {
      df = reader.docFreq(new Term(field, text));
    }

    if (fieldOptions.docFreq) {
      termInfo.add("df", df);
    }

    // TODO: this is not TF/IDF by anyone's definition!
    // NOTE(review): this is a double division, so df == 0 would yield Infinity or NaN
    // rather than throw - confirm that docFreq cannot return 0 for these terms
    if (fieldOptions.tfIdf) {
      double tfIdfVal = ((double) freq) / df;
      termInfo.add("tf-idf", tfIdfVal);
    }
  }
}
 
Example 10
Source File: IndexManager.java    From incubator-retired-blur with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the document frequency that {@code reader} reports for the term
 * built by {@code getTerm} from the given column family, column name and value.
 *
 * @param reader       the index reader to query
 * @param columnFamily the column family part of the term
 * @param columnName   the column name part of the term
 * @param value        the value part of the term
 * @return the document frequency, widened to {@code long}
 * @throws IOException if the lookup fails
 */
public static long recordFrequency(IndexReader reader, String columnFamily, String columnName, String value)
    throws IOException {
  final long frequency = reader.docFreq(getTerm(columnFamily, columnName, value));
  return frequency;
}