Java Code Examples for org.apache.lucene.search.CollectionStatistics#sumTotalTermFreq()

The following examples show how to use org.apache.lucene.search.CollectionStatistics#sumTotalTermFreq() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SMARTBNNBNNSimilarity.java — from the lucene4ir project (Apache License 2.0)
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
                                     TermStatistics... termStats)
{
    // Average document length: total tokens in the field divided by maxDoc.
    final float numDocs = collectionStats.maxDoc();
    final float avgDocLength = collectionStats.sumTotalTermFreq() / numDocs;

    float idf = 1.0f;
    if (termStats.length == 1) {
        // Single term: idf is exactly log(N/n), replacing the initial value.
        idf = log(numDocs / termStats[0].docFreq());
    } else {
        // Phrase: idf contributions accumulate on top of the initial 1.0.
        for (final TermStatistics stat : termStats) {
            idf += log(numDocs / stat.docFreq());
        }
    }

    return new TFIDFWeight(collectionStats.field(), idf, avgDocLength);
}
 
Example 2
Source File: TermVectorsWriter.java — from the Elasticsearch project (Apache License 2.0)
/**
 * Serializes one field's collection statistics to the output. Each statistic
 * may legitimately be -1 (unsupported), hence the "potentially negative"
 * variable-length encoding.
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
    final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
    assert sumTotalTermFreq >= -1;
    writePotentiallyNegativeVLong(sumTotalTermFreq);

    final long sumDocFreq = fieldStats.sumDocFreq();
    assert sumDocFreq >= -1;
    writePotentiallyNegativeVLong(sumDocFreq);

    final int docCount = (int) fieldStats.docCount();
    assert docCount >= -1;
    writePotentiallyNegativeVInt(docCount);
}
 
Example 3
Source File: CollectionStats.java — from the lucene-solr project (Apache License 2.0)
/**
 * Snapshots the values of a Lucene {@code CollectionStatistics} into this
 * object's fields (field name, maxDoc, docCount, sumTotalTermFreq, sumDocFreq).
 *
 * @param stats the per-field statistics to copy; individual statistics may be
 *              -1 when the index does not support them — TODO confirm callers
 *              handle that sentinel.
 */
public CollectionStats(CollectionStatistics stats) {
  this.field = stats.field();
  this.maxDoc = stats.maxDoc();
  this.docCount = stats.docCount();
  this.sumTotalTermFreq = stats.sumTotalTermFreq();
  this.sumDocFreq = stats.sumDocFreq();
}
 
Example 4
Source File: BM25Similarity.java — from the lucene4ir project (Apache License 2.0)
/** Computes the average field length as <code>sumTotalTermFreq / docCount</code>,
 *  or returns <code>1</code> when the index does not store sumTotalTermFreq
 *  (e.g. the field omits frequency information, or does not exist). */
protected float avgFieldLength(CollectionStatistics collectionStats) {
  final long totalTokens = collectionStats.sumTotalTermFreq();
  if (totalTokens <= 0) {
    // Field does not exist, or the statistic is unsupported.
    return 1f;
  }
  // Prefer the per-field docCount; fall back to maxDoc when unsupported (-1).
  long docs = collectionStats.docCount();
  if (docs == -1) {
    docs = collectionStats.maxDoc();
  }
  return (float) (totalTokens / (double) docs);
}
 
Example 5
Source File: OKAPIBM25Similarity.java — from the lucene4ir project (Apache License 2.0)
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
                                     TermStatistics... termStats)
{
    // Number of documents: prefer the per-field docCount, falling back to
    // maxDoc when the statistic is unsupported (-1).
    long numDocs = collectionStats.docCount();
    if (numDocs == -1) {
        numDocs = collectionStats.maxDoc();
    }

    // BUG FIX: the original computed sumTotalTermFreq() / N with both operands
    // long, so the average document length was truncated to a whole number
    // before the float assignment. Divide in double precision instead.
    final float avgDocLength = (float) (collectionStats.sumTotalTermFreq() / (double) numDocs);

    float idf = 1.0f;
    if (termStats.length == 1) {
        // Single term: idf is exactly idf(n, N), replacing the initial value.
        idf = idf(termStats[0].docFreq(), numDocs);
    } else { /* computation for a phrase */
        // Phrase: idf contributions accumulate on top of the initial 1.0.
        for (final TermStatistics stat : termStats) {
            idf += idf(stat.docFreq(), numDocs);
        }
    }

    return new TFIDFWeight(collectionStats.field(), idf, avgDocLength);
}
 
Example 6
Source File: DumpTermsApp.java — from the lucene4ir project (Apache License 2.0)
/**
 * Prints collection-level statistics (token count, doc count, sum of doc
 * frequencies, average doc length) for the ALL, TITLE and CONTENT fields.
 *
 * @throws IOException if reading the index statistics fails
 */
public void reportCollectionStatistics()throws IOException {

        IndexSearcher searcher = new IndexSearcher(reader);
        // The original repeated this block three times; factored into a helper.
        reportFieldStatistics(searcher, "ALL", lucene4ir.Lucene4IRConstants.FIELD_ALL);
        reportFieldStatistics(searcher, "TITLE", lucene4ir.Lucene4IRConstants.FIELD_TITLE);
        reportFieldStatistics(searcher, "CONTENT", lucene4ir.Lucene4IRConstants.FIELD_CONTENT);
    }

    /** Prints token count, doc count, sum doc freq and average doc length for one field. */
    private void reportFieldStatistics(IndexSearcher searcher, String label, String field) throws IOException {
        CollectionStatistics collectionStats = searcher.collectionStatistics(field);
        long token_count = collectionStats.sumTotalTermFreq();
        long doc_count = collectionStats.docCount();
        long sum_doc_count = collectionStats.sumDocFreq();
        // Guard against an empty field to avoid ArithmeticException (divide by zero).
        long avg_doc_length = doc_count > 0 ? token_count / doc_count : 0;

        System.out.println(label + ": Token count: " + token_count + " Doc Count: " + doc_count
                + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
    }
 
Example 7
Source File: ExampleStatsApp.java — from the lucene4ir project (Apache License 2.0)
/**
 * Prints collection-level statistics (token count, doc count, sum of doc
 * frequencies, average doc length) for the ALL, TITLE and CONTENT fields.
 *
 * @throws IOException if reading the index statistics fails
 */
public void reportCollectionStatistics()throws IOException {

        IndexSearcher searcher = new IndexSearcher(reader);
        // The original repeated this block three times; factored into a helper.
        reportFieldStats(searcher, "ALL", Lucene4IRConstants.FIELD_ALL);
        reportFieldStats(searcher, "TITLE", Lucene4IRConstants.FIELD_TITLE);
        reportFieldStats(searcher, "CONTENT", Lucene4IRConstants.FIELD_CONTENT);
    }

    /** Prints token count, doc count, sum doc freq and average doc length for one field. */
    private void reportFieldStats(IndexSearcher searcher, String label, String field) throws IOException {
        CollectionStatistics collectionStats = searcher.collectionStatistics(field);
        long token_count = collectionStats.sumTotalTermFreq();
        long doc_count = collectionStats.docCount();
        long sum_doc_count = collectionStats.sumDocFreq();
        // Guard against an empty field to avoid ArithmeticException (divide by zero).
        long avg_doc_length = doc_count > 0 ? token_count / doc_count : 0;

        System.out.println(label + ": Token count: " + token_count + " Doc Count: " + doc_count
                + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
    }
 
Example 8
Source File: BaseSimilarityTestCase.java — from the lucene-solr project (Apache License 2.0)
/**
 * Returns a new random term whose statistics fit within the bounds of the corpus.
 */
static TermStatistics newTerm(Random random, CollectionStatistics corpus) {
  // Pick a document frequency: rare, ubiquitous, or somewhere in between.
  final long docFreq;
  final int specificity = random.nextInt(3);
  if (specificity == 0) {
    docFreq = 1;                                 // rare term
  } else if (specificity == 1) {
    docFreq = corpus.docCount();                 // common term
  } else {
    docFreq = TestUtil.nextLong(random, 1, corpus.docCount()); // random specificity
  }

  // Largest admissible totalTermFreq: no single document may hold > 2B tokens,
  // so cap at docFreq * Integer.MAX_VALUE unless that product overflows.
  long maxTotalFreq;
  try {
    maxTotalFreq = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE));
  } catch (ArithmeticException overflow) {
    maxTotalFreq = corpus.sumTotalTermFreq();
  }

  final long totalTermFreq;
  if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) {
    // omitTF: every posting carries frequency 1.
    totalTermFreq = docFreq;
  } else {
    final int repetition = random.nextInt(3);
    if (repetition == 0) {
      totalTermFreq = docFreq;                   // no repetition
    } else if (repetition == 1) {
      totalTermFreq = maxTotalFreq;              // maximum repetition
    } else {
      totalTermFreq = TestUtil.nextLong(random, docFreq, maxTotalFreq); // random repetition
    }
  }
  return new TermStatistics(TERM, docFreq, totalTermFreq);
}
 
Example 9
Source File: BM25Similarity.java — from the lucene-solr project (Apache License 2.0)
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code> */
protected float avgFieldLength(CollectionStatistics collectionStats) {
  // Divide in double precision, then narrow to float for the caller.
  final double totalTokens = collectionStats.sumTotalTermFreq();
  return (float) (totalTokens / collectionStats.docCount());
}