org.apache.lucene.index.IndexReader#docFreq

Source File: QueryTermExtractor.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Extracts the term texts of a given Query into an array of WeightedTerms and
 * multiplies each term's weight by its inverse document frequency (IDF).
 *
 * @param query     Query to extract term texts from
 * @param reader    used to compute IDF, which can be used to a) score selected fragments better
 *                  b) use graded highlights, e.g. changing intensity of font color
 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
 * @return an array of the terms used in a query, plus their weights.
 */
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) {
  final WeightedTerm[] weightedTerms = getTerms(query, false, fieldName);
  final int maxDoc = reader.maxDoc();
  for (WeightedTerm weightedTerm : weightedTerms) {
    try {
      final int df = reader.docFreq(new Term(fieldName, weightedTerm.term));
      // IDF algorithm taken from ClassicSimilarity class
      final double docRatio = maxDoc / (double) (df + 1);
      final float idf = (float) (Math.log(docRatio) + 1.0);
      weightedTerm.weight *= idf;
    } catch (IOException ignored) {
      // best effort: when the document frequency cannot be read,
      // the term simply keeps its unscaled weight
    }
  }
  return weightedTerms;
}

Source File: LuceneHelper.java From dexter with Apache License 2.0

6 votes

/**
 * Builds the TFIDF vector and returns its Euclidean (L2) norm.
 * 
 * <p>
 * For every term in {@code freq} the score is
 * {@code tf * (log2(collectionSize / df + 1) + 1)}, where {@code df} is the
 * document frequency of the term in {@code field} as reported by the index
 * reader. A term that the index does not contain (document frequency 0) is
 * treated as if it occurred in exactly one document, so that its score stays
 * finite instead of becoming Infinity/NaN through a division by zero.
 * 
 * @param tfidf
 *            - the vector containing for each term its TFIDF score, it will
 *            be populated by this method
 * @param freq
 *            - the vector containing for each term its frequency
 * @param field
 *            - the field on which to compute the inverse document frequency
 * 
 * @return the norm of the TFIDF vector
 * 
 */
private double tfidfVector(Map<String, Double> tfidf,
		Map<String, Integer> freq, String field) {
	IndexReader reader = getReader();

	double norm = 0;
	for (Map.Entry<String, Integer> entry : freq.entrySet()) {
		String termText = entry.getKey();
		int df = 0;
		try {
			df = reader.docFreq(new Term(field, termText));
		} catch (IOException e) {
			logger.error("computing tfidfVector ({}) ", e.toString());
			System.exit(-1);
		}
		// A document frequency of 0 would make the division below yield
		// Infinity (or NaN for an empty collection), which would poison both
		// the vector and the returned norm; clamp it to 1 instead.
		int smoothedDf = Math.max(df, 1);
		double idf = Math.log(collectionSize / (double) smoothedDf + 1)
				/ Math.log(2) + 1;
		double tfidfValue = entry.getValue() * idf;
		norm += tfidfValue * tfidfValue;
		tfidf.put(termText, tfidfValue);
	}
	return Math.sqrt(norm);

}

Source File: PassageRankingComponent.java From wiseowl with MIT License

5 votes

/**
 * Computes the weight of a term as the reciprocal of its document frequency.
 *
 * @param term   the term to weigh
 * @param reader the reader queried for the term's document frequency
 * @return {@code 1 / docFreq}, or 0 when the document frequency is 0
 * @throws IOException if the document frequency lookup fails
 */
protected float calculateWeight(Term term, IndexReader reader) throws IOException {
  final int df = reader.docFreq(term);
  // if a term is not in the index, then its weight is 0
  if (df == 0) {
    log.warn("Couldn't find doc freq for term {}", term);
    return 0f;
  }
  log.warn("Term {} doc freq.{}",term.toString(),df);
  return 1.0f / df;
}

Source File: WeightedSpanTermExtractor.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Builds a Map of <code>WeightedSpanTerms</code> for the given <code>Query</code> and <code>TokenStream</code>, then
 * scales every term's weight by an IDF factor computed from the supplied <code>IndexReader</code>
 * (for gradient highlighting).
 *
 * <p>
 * The <code>internalReader</code> field is closed before this method returns, whether or not weighting succeeded.
 *
 * @param query
 *          that caused hit
 * @param boost
 *          passed through to the term extraction step
 * @param tokenStream
 *          of text to be highlighted
 * @param fieldName
 *          restricts Term's used based on field name
 * @param reader
 *          to use for scoring
 * @return Map of WeightedSpanTerms with quasi tf/idf scores
 * @throws IOException If there is a low-level I/O error
 */
public Map<String,WeightedSpanTerm> getWeightedSpanTermsWithScores(Query query, float boost, TokenStream tokenStream, String fieldName,
    IndexReader reader) throws IOException {
  // a null fieldName is stored as-is
  this.fieldName = fieldName;
  this.tokenStream = tokenStream;

  final Map<String,WeightedSpanTerm> spanTerms = new PositionCheckingMap<>();
  extract(query, boost, spanTerms);

  final int maxDoc = reader.maxDoc();
  final Set<String> termKeys = spanTerms.keySet();

  try {
    for (String termKey : termKeys) {
      final WeightedSpanTerm spanTerm = spanTerms.get(termKey);
      final int df = reader.docFreq(new Term(fieldName, spanTerm.term));
      // IDF algorithm taken from ClassicSimilarity class
      final double docRatio = maxDoc / (double) (df + 1);
      final float idf = (float) (Math.log(docRatio) + 1.0);
      spanTerm.weight *= idf;
    }
  } finally {
    IOUtils.close(internalReader);
  }

  return spanTerms;
}

Source File: WordBreakSpellChecker.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Wraps {@code text} in a {@link SuggestWord} whose frequency is the document
 * frequency of the term ({@code fieldname}, {@code text}) in {@code ir} and
 * whose score is fixed at 1.
 */
private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException {
  final int docFreq = ir.docFreq(new Term(fieldname, text));
  final SuggestWord suggestion = new SuggestWord();
  suggestion.string = text;
  suggestion.score = 1;
  suggestion.freq = docFreq;
  return suggestion;
}

Source File: RandomSamplingFacetsCollector.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Note: if you use a counting {@link Facets} implementation, you can amortize the
 * sampled counts by calling this method. Uses the {@link FacetsConfig} and
 * the {@link IndexSearcher} to determine the upper bound for each facet value.
 * <p>
 * Each sampled count is divided by the sampling rate and capped at the document
 * frequency of the facet's full path; {@code res} is returned untouched when it is
 * {@code null} or when the total hits did not exceed the sample size.
 */
public FacetResult amortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher) throws IOException {
  if (res == null || totalHits <= sampleSize) {
    return res;
  }

  final int numLabels = res.labelValues.length;
  final LabelAndValue[] amortized = new LabelAndValue[numLabels];
  final IndexReader indexReader = searcher.getIndexReader();
  final DimConfig dimConfig = config.getDimConfig(res.dim);

  // layout: [dimension, path..., child label]; only the last slot changes per label
  final int pathLength = res.path.length;
  final String[] fullPathParts = new String[pathLength + 2];
  fullPathParts[0] = res.dim;
  System.arraycopy(res.path, 0, fullPathParts, 1, pathLength);
  final int childSlot = pathLength + 1;

  for (int idx = 0; idx < numLabels; idx++) {
    final LabelAndValue sampled = res.labelValues[idx];
    fullPathParts[childSlot] = sampled.label;
    final String indexedPath = FacetsConfig.pathToString(fullPathParts, fullPathParts.length);
    final int upperBound = indexReader.docFreq(new Term(dimConfig.indexFieldName, indexedPath));
    final int scaledCount = (int) (sampled.value.doubleValue() / samplingRate);
    amortized[idx] = new LabelAndValue(sampled.label, Math.min(upperBound, scaledCount));
  }

  // cap the total count on the total number of non-deleted documents in the reader
  int amortizedTotal = res.value.intValue();
  if (amortizedTotal > 0) {
    final int scaledTotal = (int) (res.value.doubleValue() / samplingRate);
    amortizedTotal = Math.min(indexReader.numDocs(), scaledTotal);
  }

  return new FacetResult(res.dim, res.path, amortizedTotal, amortized, res.childCount);
}

Source File: TermQuery.java From alfresco-repository with GNU Lesser General Public License v3.0

4 votes

/**
 * Explains the score of document {@code doc} for this term weight as the product
 * of a query weight (boost * idf * queryNorm) and a field weight
 * (tf * idf * fieldNorm).
 *
 * @param reader the reader supplying document frequency, norms and the scorer
 * @param doc    the document to explain
 * @return the combined explanation, or just the field-weight explanation when
 *         the query weight is exactly 1
 * @throws IOException if reading from the index fails
 */
public Explanation explain(IndexReader reader, int doc)
  throws IOException {

  final ComplexExplanation overall = new ComplexExplanation();
  overall.setDescription("weight("+getQuery()+" in "+doc+"), product of:");

  final String idfDescription =
    "idf(docFreq=" + reader.docFreq(term) + ", numDocs=" + reader.numDocs() + ")";
  final Explanation idfPart = new Explanation(idf, idfDescription);

  // ---- query weight: boost (only listed when it is not 1) * idf * queryNorm ----
  final Explanation queryPart = new Explanation();
  queryPart.setDescription("queryWeight(" + getQuery() + "), product of:");

  final Explanation boostPart = new Explanation(getBoost(), "boost");
  if (getBoost() != 1.0f) {
    queryPart.addDetail(boostPart);
  }
  queryPart.addDetail(idfPart);

  final Explanation queryNormPart = new Explanation(queryNorm,"queryNorm");
  queryPart.addDetail(queryNormPart);

  final float queryWeight =
    boostPart.getValue() * idfPart.getValue() * queryNormPart.getValue();
  queryPart.setValue(queryWeight);

  overall.addDetail(queryPart);

  // ---- field weight: tf * idf * fieldNorm ----
  final String fieldName = term.field();
  final ComplexExplanation fieldPart = new ComplexExplanation();
  fieldPart.setDescription("fieldWeight("+term+" in "+doc+
                           "), product of:");

  final Explanation tfPart = scorer(reader).explain(doc);
  fieldPart.addDetail(tfPart);
  fieldPart.addDetail(idfPart);

  final Explanation fieldNormPart = new Explanation();
  final byte[] norms = reader.norms(fieldName);
  final float decodedNorm;
  if (norms != null) {
    decodedNorm = Similarity.decodeNorm(norms[doc]);
  } else {
    // no norms available for this field
    decodedNorm = 0.0f;
  }
  fieldNormPart.setValue(decodedNorm);
  fieldNormPart.setDescription("fieldNorm(field="+fieldName+", doc="+doc+")");
  fieldPart.addDetail(fieldNormPart);

  fieldPart.setMatch(Boolean.valueOf(tfPart.isMatch()));
  final float fieldWeight =
    tfPart.getValue() * idfPart.getValue() * fieldNormPart.getValue();
  fieldPart.setValue(fieldWeight);

  overall.addDetail(fieldPart);
  overall.setMatch(fieldPart.getMatch());

  // combine them
  overall.setValue(queryPart.getValue() * fieldPart.getValue());

  // a query weight of exactly 1 adds nothing, so report the field weight alone
  if (queryPart.getValue() == 1.0f) {
    return fieldPart;
  }
  return overall;
}

Source File: WordBreakSpellChecker.java From lucene-solr with Apache License 2.0

4 votes

/**
 * <p>
 * Generate suggestions by breaking the passed-in term into multiple words.
 * The scores returned are equal to the number of word breaks needed so a
 * lower score is generally preferred over a higher score.
 * </p>
 * 
 * @param term
 *          - the term to break up
 * @param maxSuggestions
 *          - the maximum number of suggestions requested; a value below 1
 *          yields an empty result
 * @param ir
 *          - the reader used to look up document frequencies
 * @param suggestMode
 *          - default = {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}
 * @param sortMethod
 *          - default =
 *          {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}
 * @return one or more arrays of words formed by breaking up the original term
 * @throws IOException If there is a low-level I/O error.
 */
public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions,
    IndexReader ir, SuggestMode suggestMode,
    BreakSuggestionSortMethod sortMethod) throws IOException {
  if (maxSuggestions < 1) {
    return new SuggestWord[0][0];
  }
  // fall back to the documented defaults
  if (suggestMode == null) {
    suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
  }
  if (sortMethod == null) {
    sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
  }

  final int initialCapacity = Math.min(maxSuggestions, 10);
  final Comparator<SuggestWordArrayWrapper> ordering;
  if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
    ordering = new LengthThenMaxFreqComparator();
  } else {
    ordering = new LengthThenSumFreqComparator();
  }
  final Queue<SuggestWordArrayWrapper> candidates = new PriorityQueue<>(
      initialCapacity, ordering);

  final int originalFreq = ir.docFreq(term);
  // the term already exists in the index and the caller only wants
  // suggestions for unknown terms
  if (originalFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX) {
    return new SuggestWord[0][];
  }

  int requiredFrequency = minSuggestionFrequency;
  if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) {
    if (originalFreq == 0) {
      requiredFrequency = 1;
    } else {
      requiredFrequency = originalFreq;
    }
  }

  generateBreakUpSuggestions(term, ir, 1, maxSuggestions,
      requiredFrequency, new SuggestWord[0], candidates, 0,
      sortMethod);

  // drain the queue, filling the result from the last slot to the first
  final SuggestWord[][] result = new SuggestWord[candidates.size()][];
  int slot = result.length - 1;
  while (slot >= 0) {
    result[slot] = candidates.remove().suggestWords;
    slot--;
  }

  return result;
}

Source File: TermVectorComponent.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Adds one entry per term of {@code termsEnum} to a new per-field list stored in
 * {@code docNL} under {@code field}. Depending on {@code fieldOptions}, each term
 * entry receives "tf", "positions", "offsets", "payloads", "df" and "tf-idf" values.
 */
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
  final NamedList<Object> perField = new NamedList<>();
  docNL.add(field, perField);

  PostingsEnum postings = null; // reused across terms
  for (BytesRef termBytes = termsEnum.next(); termBytes != null; termBytes = termsEnum.next()) {
    final String termText = termBytes.utf8ToString();
    final NamedList<Object> termInfo = new NamedList<>();
    perField.add(termText, termInfo);
    final int termFreq = (int) termsEnum.totalTermFreq();
    if (fieldOptions.termFreq == true) {
      termInfo.add("tf", termFreq);
    }

    int postingsFlags = 0;
    if (fieldOptions.positions) {
      postingsFlags |= PostingsEnum.POSITIONS;
    }
    //payloads require offsets
    if (fieldOptions.offsets || fieldOptions.payloads) {
      postingsFlags |= PostingsEnum.OFFSETS;
    }
    if (fieldOptions.payloads) {
      postingsFlags |= PostingsEnum.PAYLOADS;
    }
    postings = termsEnum.postings(postings, postingsFlags);

    if (postings != null) {
      postings.nextDoc();

      // per-occurrence details are only collected when at least one flag was requested
      if (postingsFlags != 0) {
        NamedList<Integer> positionList = null;
        NamedList<Number> offsetList = null;
        NamedList<String> payloadList = null;

        for (int occurrence = 0; occurrence < termFreq; occurrence++) {
          final int position = postings.nextPosition();
          if (fieldOptions.positions && position >= 0) {
            if (positionList == null) {
              positionList = new NamedList<>();
              termInfo.add("positions", positionList);
            }
            positionList.add("position", position);
          }

          final int offsetStart = fieldOptions.offsets ? postings.startOffset() : -1;
          if (offsetStart >= 0) {
            if (offsetList == null) {
              offsetList = new NamedList<>();
              termInfo.add("offsets", offsetList);
            }
            offsetList.add("start", postings.startOffset());
            offsetList.add("end", postings.endOffset());
          }

          final BytesRef payloadBytes = fieldOptions.payloads ? postings.getPayload() : null;
          if (payloadBytes != null) {
            if (payloadList == null) {
              payloadList = new NamedList<>();
              termInfo.add("payloads", payloadList);
            }
            payloadList.add("payload", Base64.byteArrayToBase64(payloadBytes.bytes, payloadBytes.offset, payloadBytes.length));
          }
        }
      }
    }

    // the document frequency is looked up only when one of its consumers is enabled
    int docFreq = 0;
    if (fieldOptions.docFreq || fieldOptions.tfIdf) {
      docFreq = reader.docFreq(new Term(field, termBytes));
    }

    if (fieldOptions.docFreq) {
      termInfo.add("df", docFreq);
    }

    // TODO: this is not TF/IDF by anyone's definition!
    if (fieldOptions.tfIdf) {
      final double tfIdf = ((double) termFreq) / docFreq;
      termInfo.add("tf-idf", tfIdf);
    }
  }
}

Source File: IndexManager.java From incubator-retired-blur with Apache License 2.0

4 votes

/**
 * Returns the document frequency, in {@code reader}, of the term that
 * {@code getTerm} builds from the given column family, column name and value.
 *
 * @throws IOException if the document frequency lookup fails
 */
public static long recordFrequency(IndexReader reader, String columnFamily, String columnName, String value)
    throws IOException {
  final Term recordTerm = getTerm(columnFamily, columnName, value);
  return reader.docFreq(recordTerm);
}

Java Code Examples for org.apache.lucene.index.IndexReader#docFreq()