Java Code Examples for org.apache.lucene.search.CollectionStatistics#sumDocFreq()

The following examples show how to use org.apache.lucene.search.CollectionStatistics#sumDocFreq(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: Elasticsearch   File: TermVectorsWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes the field-level statistics in a fixed order: sumTotalTermFreq,
 * sumDocFreq, then docCount.
 *
 * <p>Each value is asserted to be at least -1 before it is handed to the
 * "potentially negative" writer helpers (presumably -1 marks an unavailable
 * statistic; confirm against the helpers' encoding).
 *
 * @param fieldStats statistics of the field being written
 * @throws IOException declared for the underlying write helpers
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
    final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
    assert sumTotalTermFreq >= -1;
    writePotentiallyNegativeVLong(sumTotalTermFreq);

    final long sumDocFreq = fieldStats.sumDocFreq();
    assert sumDocFreq >= -1;
    writePotentiallyNegativeVLong(sumDocFreq);

    // docCount is narrowed to int because it is written with the VInt helper.
    final int docCount = (int) fieldStats.docCount();
    assert docCount >= -1;
    writePotentiallyNegativeVInt(docCount);
}
 
Example 2
Source Project: lucene-solr   File: CollectionStats.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a snapshot by copying every value exposed by the given
 * {@code CollectionStatistics} into this object's own fields.
 *
 * @param stats the statistics to copy from
 */
public CollectionStats(CollectionStatistics stats) {
  // Which field these statistics belong to.
  this.field = stats.field();

  // Document-level counts.
  this.docCount = stats.docCount();
  this.maxDoc = stats.maxDoc();

  // Frequency sums.
  this.sumDocFreq = stats.sumDocFreq();
  this.sumTotalTermFreq = stats.sumTotalTermFreq();
}
 
Example 3
Source Project: lucene4ir   File: DumpTermsApp.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Prints one line of collection-level statistics to standard output for each
 * of the ALL, TITLE and CONTENT fields, in that order.
 *
 * <p>Each line reports {@code sumTotalTermFreq()} ("Token count"),
 * {@code docCount()} ("Doc Count"), {@code sumDocFreq()} ("sum doc") and the
 * average document length, computed as token count / doc count using integer
 * division.
 *
 * <p>If the searcher returns no statistics for a field ({@code null}), all
 * values are reported as 0. If the document count is not positive, the
 * average is reported as 0 rather than failing with a division by zero.
 *
 * @throws IOException if reading the statistics from the index fails
 */
public void reportCollectionStatistics() throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);

    // Parallel arrays: labels[i] is the printed prefix for fields[i].
    final String[] labels = {"ALL", "TITLE", "CONTENT"};
    final String[] fields = {
        lucene4ir.Lucene4IRConstants.FIELD_ALL,
        lucene4ir.Lucene4IRConstants.FIELD_TITLE,
        lucene4ir.Lucene4IRConstants.FIELD_CONTENT
    };

    for (int i = 0; i < fields.length; i++) {
        CollectionStatistics collectionStats = searcher.collectionStatistics(fields[i]);

        long tokenCount = 0;
        long docCount = 0;
        long sumDocCount = 0;
        // Guard against a missing result so an absent field does not abort the report.
        if (collectionStats != null) {
            tokenCount = collectionStats.sumTotalTermFreq();
            docCount = collectionStats.docCount();
            sumDocCount = collectionStats.sumDocFreq();
        }

        // Avoid ArithmeticException (division by zero) for fields without documents.
        long avgDocLength = docCount > 0 ? tokenCount / docCount : 0;

        System.out.println(labels[i] + ": Token count: " + tokenCount + " Doc Count: " + docCount
                + " sum doc: " + sumDocCount + " avg doc len: " + avgDocLength);
    }
}
 
Example 4
Source Project: lucene4ir   File: ExampleStatsApp.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Prints one line of collection-level statistics to standard output for each
 * of the ALL, TITLE and CONTENT fields, in that order.
 *
 * <p>Each line reports {@code sumTotalTermFreq()} ("Token count"),
 * {@code docCount()} ("Doc Count"), {@code sumDocFreq()} ("sum doc") and the
 * average document length, computed as token count / doc count using integer
 * division.
 *
 * <p>If the searcher returns no statistics for a field ({@code null}), all
 * values are reported as 0. If the document count is not positive, the
 * average is reported as 0 rather than failing with a division by zero.
 *
 * @throws IOException if reading the statistics from the index fails
 */
public void reportCollectionStatistics() throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);

    // Parallel arrays: labels[i] is the printed prefix for fields[i].
    final String[] labels = {"ALL", "TITLE", "CONTENT"};
    final String[] fields = {
        Lucene4IRConstants.FIELD_ALL,
        Lucene4IRConstants.FIELD_TITLE,
        Lucene4IRConstants.FIELD_CONTENT
    };

    for (int i = 0; i < fields.length; i++) {
        CollectionStatistics collectionStats = searcher.collectionStatistics(fields[i]);

        long tokenCount = 0;
        long docCount = 0;
        long sumDocCount = 0;
        // Guard against a missing result so an absent field does not abort the report.
        if (collectionStats != null) {
            tokenCount = collectionStats.sumTotalTermFreq();
            docCount = collectionStats.docCount();
            sumDocCount = collectionStats.sumDocFreq();
        }

        // Avoid ArithmeticException (division by zero) for fields without documents.
        long avgDocLength = docCount > 0 ? tokenCount / docCount : 0;

        System.out.println(labels[i] + ": Token count: " + tokenCount + " Doc Count: " + docCount
                + " sum doc: " + sumDocCount + " avg doc len: " + avgDocLength);
    }
}
 
Example 5
/**
 * Creates a random {@code TermStatistics} whose document frequency and total
 * term frequency stay within the bounds given by the corpus statistics.
 *
 * @param random source of randomness
 * @param corpus collection statistics the new term must fit into
 * @return statistics for {@code TERM} with the chosen docFreq and totalTermFreq
 */
static TermStatistics newTerm(Random random, CollectionStatistics corpus) {
  // Choose the document frequency: 0 = rare term, 1 = common term, otherwise random specificity.
  final int docFreqMode = random.nextInt(3);
  final long df;
  if (docFreqMode == 0) {
    df = 1;
  } else if (docFreqMode == 1) {
    df = corpus.docCount();
  } else {
    df = TestUtil.nextLong(random, 1, corpus.docCount());
  }

  // Cap the total term frequency: a single doc can't be required to hold > 2B tokens,
  // so the limit is min(sumTotalTermFreq, df * Integer.MAX_VALUE), falling back to
  // sumTotalTermFreq when the product overflows a long.
  final long sumTtf = corpus.sumTotalTermFreq();
  long ceiling;
  try {
    ceiling = Math.min(sumTtf, Math.multiplyExact(df, Integer.MAX_VALUE));
  } catch (ArithmeticException overflow) {
    ceiling = sumTtf;
  }

  final long ttf;
  if (sumTtf == corpus.sumDocFreq()) {
    // omitTF: every occurrence counts once, so totalTermFreq equals docFreq
    ttf = df;
  } else {
    // 0 = no repetition, 1 = maximum repetition, otherwise random repetition
    final int repetitionMode = random.nextInt(3);
    if (repetitionMode == 0) {
      ttf = df;
    } else if (repetitionMode == 1) {
      ttf = ceiling;
    } else {
      ttf = TestUtil.nextLong(random, df, ceiling);
    }
  }

  return new TermStatistics(TERM, df, ttf);
}