Java Code Examples for org.apache.lucene.search.CollectionStatistics

The following examples show how to use org.apache.lucene.search.CollectionStatistics. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: Elasticsearch   Source File: DfsSearchResult.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Deserializes per-field collection statistics from the stream.
 * If {@code fieldStatistics} is null a new no-null-keys map is allocated;
 * otherwise entries are added to the supplied map. Returns the populated map.
 */
public static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(StreamInput in, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics) throws IOException {
    final int count = in.readVInt();
    if (fieldStatistics == null) {
        fieldStatistics = HppcMaps.newNoNullKeysMap(count);
    }
    for (int idx = 0; idx < count; idx++) {
        final String fieldName = in.readString();
        assert fieldName != null;
        final long maxDoc = in.readVLong();
        // docCount/sumTotalTermFreq/sumDocFreq were written shifted by one
        // (see writeFieldStats/addOne) — presumably so -1 ("not available")
        // fits in an unsigned VLong; subOne undoes that here.
        final long docCount = subOne(in.readVLong());
        final long sumTotalTermFreq = subOne(in.readVLong());
        final long sumDocFreq = subOne(in.readVLong());
        fieldStatistics.put(fieldName,
                new CollectionStatistics(fieldName, maxDoc, docCount, sumTotalTermFreq, sumDocFreq));
    }
    return fieldStatistics;
}
 
Example 2
Source Project: lucene-solr   Source File: NormValueSource.java    License: Apache License 2.0 6 votes vote down vote up
@Override
public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext) throws IOException {
  IndexSearcher indexSearcher = (IndexSearcher) context.get("searcher");
  final TFIDFSimilarity tfidf = IDFValueSource.asTFIDF(indexSearcher.getSimilarity(), field);
  if (tfidf == null) {
    throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)");
  }
  // Only works if the contribution of the tf is 1 when the freq is 1 and the
  // contribution of the idf is 1 when docCount == docFreq == 1, so feed the
  // scorer degenerate statistics where everything equals 1.
  final SimScorer scorer = tfidf.scorer(1f,
      new CollectionStatistics(field, 1, 1, 1, 1),
      new TermStatistics(new BytesRef("bogus"), 1, 1));
  final LeafSimScorer normScorer = new LeafSimScorer(scorer, readerContext.reader(), field, true);

  return new FloatDocValues(this) {
    // Doc IDs must be requested in non-decreasing order.
    int previousDocID = -1;

    @Override
    public float floatVal(int docID) throws IOException {
      if (docID < previousDocID) {
        throw new AssertionError("docs out of order: lastDocID=" + previousDocID + " docID=" + docID);
      }
      previousDocID = docID;
      return normScorer.score(docID, 1f);
    }
  };
}
 
Example 3
Source Project: lucene-solr   Source File: SpanWeight.java    License: Apache License 2.0 6 votes vote down vote up
private Similarity.SimScorer buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException {
  if (termStates == null || termStates.size() == 0 || query.getField() == null)
    return null;
  TermStatistics[] termStats = new TermStatistics[termStates.size()];
  int termUpTo = 0;
  for (Map.Entry<Term, TermStates> entry : termStates.entrySet()) {
    TermStates ts = entry.getValue();
    if (ts.docFreq() > 0) {
      termStats[termUpTo++] = searcher.termStatistics(entry.getKey(), ts.docFreq(), ts.totalTermFreq());
    }
  }
  CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField());
  if (termUpTo > 0) {
    return similarity.scorer(boost, collectionStats, ArrayUtil.copyOfSubArray(termStats, 0, termUpTo));
  } else {
    return null; // no terms at all exist, we won't use similarity
  }
}
 
Example 4
Source Project: lucene4ir   Source File: SMARTBNNBNNSimilarity.java    License: Apache License 2.0 6 votes vote down vote up
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
                                     TermStatistics... termStats) {
  final float maxDoc = collectionStats.maxDoc();
  // Average document length: total token count over maxDoc (float division).
  final float avgDocLen = collectionStats.sumTotalTermFreq() / maxDoc;

  float idf = 1.0f;
  if (termStats.length == 1) {
    // Single term: the computed idf replaces the initial 1.0.
    final float docFreq = termStats[0].docFreq();
    idf = log(maxDoc / docFreq);
  } else {
    // Multiple terms: accumulate idf per term.
    // NOTE(review): the initial 1.0f stays in the sum here but is discarded in
    // the single-term branch — looks inconsistent; preserved as-is, confirm upstream.
    for (final TermStatistics stat : termStats) {
      final float docFreq = stat.docFreq();
      idf += log(maxDoc / docFreq);
    }
  }
  return new TFIDFWeight(collectionStats.field(), idf, avgDocLen);
}
 
Example 5
Source Project: Elasticsearch   Source File: DfsSearchResult.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Serializes per-field collection statistics to the stream: a VInt entry
 * count followed by (fieldName, maxDoc, docCount+1, sumTotalTermFreq+1,
 * sumDocFreq+1) per field. Mirrored by readFieldStats/subOne.
 */
public static void writeFieldStats(StreamOutput out, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics) throws IOException {
    out.writeVInt(fieldStatistics.size());

    for (ObjectObjectCursor<String, CollectionStatistics> cursor : fieldStatistics) {
        out.writeString(cursor.key);
        CollectionStatistics stats = cursor.value;
        assert stats.maxDoc() >= 0;
        out.writeVLong(stats.maxDoc());
        // The remaining stats may be -1 ("not available"); addOne presumably
        // shifts them into unsigned VLong range — undone by subOne on read.
        out.writeVLong(addOne(stats.docCount()));
        out.writeVLong(addOne(stats.sumTotalTermFreq()));
        out.writeVLong(addOne(stats.sumDocFreq()));
    }
}
 
Example 6
Source Project: Elasticsearch   Source File: TermVectorsWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes sumTotalTermFreq, sumDocFreq and docCount for a field.
 * Each statistic may be -1 meaning "not available", hence the
 * potentially-negative encodings.
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
    final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
    assert sumTotalTermFreq >= -1;
    writePotentiallyNegativeVLong(sumTotalTermFreq);

    final long sumDocFreq = fieldStats.sumDocFreq();
    assert sumDocFreq >= -1;
    writePotentiallyNegativeVLong(sumDocFreq);

    // NOTE(review): docCount is narrowed from long to int — assumes it fits in
    // an int; confirm against the corresponding reader.
    final int docCount = (int) fieldStats.docCount();
    assert docCount >= -1;
    writePotentiallyNegativeVInt(docCount);
}
 
Example 7
Source Project: linden   Source File: LindenSimilarity.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long docFreq = termStats.docFreq();
  final long maxDoc = collectionStats.maxDoc();
  // The idf value comes from the external idfManager keyed by the term text;
  // docFreq/maxDoc are only reported in the explanation string.
  final float idf = idfManager.getIDF(termStats.term().utf8ToString());
  return new Explanation(idf, "idf(docFreq=" + docFreq + ", maxDocs=" + maxDoc + ")");
}
 
Example 8
Source Project: lucene-solr   Source File: TestMemoryIndex.java    License: Apache License 2.0 5 votes vote down vote up
@Test
public void testSimilarities() throws IOException {
  MemoryIndex index = new MemoryIndex();
  index.addField("f1", "a long text field that contains many many terms", analyzer);

  IndexSearcher searcher = index.createSearcher();
  LeafReader leafReader = (LeafReader) searcher.getIndexReader();

  NumericDocValues normValues = leafReader.getNormValues("f1");
  assertEquals(0, normValues.nextDoc());
  float defaultNorm = normValues.longValue();

  // Norms are re-computed when we change the Similarity
  index.setSimilarity(new Similarity() {

    @Override
    public long computeNorm(FieldInvertState state) {
      return 74; // arbitrary constant; only needs to differ from the default norm
    }

    @Override
    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
      throw new UnsupportedOperationException();
    }

  });

  normValues = leafReader.getNormValues("f1");
  assertEquals(0, normValues.nextDoc());
  float customNorm = normValues.longValue();

  assertTrue(defaultNorm != customNorm);
  TestUtil.checkReader(leafReader);
}
 
Example 9
Source Project: lucene-solr   Source File: AssertingSimilarity.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  // Validate the Similarity#scorer contract before delegating.
  assert boost >= 0;
  assert collectionStats != null;
  assert termStats.length > 0;
  for (TermStatistics stats : termStats) {
    assert stats != null;
  }
  // TODO: check that TermStats is in bounds with respect to collection? e.g. docFreq <= maxDoc
  SimScorer inner = delegate.scorer(boost, collectionStats, termStats);
  assert inner != null;
  return new AssertingSimScorer(inner, boost);
}
 
Example 10
Source Project: lucene-solr   Source File: SimilarityBase.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  // Build one scorer per term, each backed by its own BasicStats.
  final SimScorer[] perTerm = new SimScorer[termStats.length];
  for (int t = 0; t < termStats.length; t++) {
    BasicStats basicStats = newStats(collectionStats.field(), boost);
    fillBasicStats(basicStats, collectionStats, termStats[t]);
    perTerm[t] = new BasicSimScorer(basicStats);
  }
  // Single term: skip the multi-scorer wrapper.
  return perTerm.length == 1 ? perTerm[0] : new MultiSimilarity.MultiSimScorer(perTerm);
}
 
Example 11
Source Project: lucene-solr   Source File: SimilarityBase.java    License: Apache License 2.0 5 votes vote down vote up
/** Fills all member fields defined in {@code BasicStats} in {@code stats}.
 *  Subclasses can override this method to fill additional stats. */
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
  // TODO: validate this for real, somewhere else
  // A single term cannot occur more often, or in more documents, than the
  // field-wide totals allow.
  assert termStats.totalTermFreq() <= collectionStats.sumTotalTermFreq();
  assert termStats.docFreq() <= collectionStats.sumDocFreq();

  // TODO: add sumDocFreq for field (numberOfFieldPostings)
  final long documents = collectionStats.docCount();
  final long fieldTokens = collectionStats.sumTotalTermFreq();
  stats.setNumberOfDocuments(documents);
  stats.setNumberOfFieldTokens(fieldTokens);
  stats.setAvgFieldLength(fieldTokens / (double) documents);
  stats.setDocFreq(termStats.docFreq());
  stats.setTotalTermFreq(termStats.totalTermFreq());
}
 
Example 12
Source Project: lucene-solr   Source File: BM25Similarity.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
  float avgdl = avgFieldLength(collectionStats);

  float[] cache = new float[256];
  for (int i = 0; i < cache.length; i++) {
    cache[i] = 1f / (k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl));
  }
  return new BM25Scorer(boost, k1, b, idf, avgdl, cache);
}
 
Example 13
Source Project: lucene-solr   Source File: MultiSimilarity.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  // Obtain a scorer from every wrapped similarity and combine them.
  SimScorer[] delegates = new SimScorer[sims.length];
  for (int s = 0; s < delegates.length; s++) {
    delegates[s] = sims[s].scorer(boost, collectionStats, termStats);
  }
  return new MultiSimScorer(delegates);
}
 
Example 14
Source Project: lucene-solr   Source File: LMSimilarity.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Computes the collection probability of the current term in addition to the
 * usual statistics.
 */
@Override
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
  super.fillBasicStats(stats, collectionStats, termStats);
  // stats is expected to be an LMStats instance for language-model similarities.
  ((LMStats) stats).setCollectionProbability(collectionModel.computeProbability(stats));
}
 
Example 15
Source Project: lucene-solr   Source File: ClassicSimilarity.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long docFreq = termStats.docFreq();
  final long docCount = collectionStats.docCount();
  return Explanation.match(
      idf(docFreq, docCount),
      "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
      Explanation.match(docFreq, "docFreq, number of documents containing term"),
      Explanation.match(docCount, "docCount, total number of documents with field"));
}
 
Example 16
Source Project: lucene-solr   Source File: TestSimilarityBase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Converts a BasicStats into a randomized but internally consistent
 * CollectionStatistics: sumDocFreq is drawn within the bounds implied by
 * the token count, and docCount &lt;= maxDoc.
 */
private CollectionStatistics toCollectionStats(BasicStats stats) {
  final long sumTotalTermFreq = stats.getNumberOfFieldTokens();
  final long numDocs = stats.getNumberOfDocuments();
  final long sumDocFreq;
  if (sumTotalTermFreq == -1) {
    // Token count unavailable: any sumDocFreq >= numDocs is plausible.
    sumDocFreq = TestUtil.nextLong(random(), numDocs, 2L * numDocs);
  } else {
    // Otherwise sumDocFreq lies between min(numDocs, tokens) and tokens.
    sumDocFreq = TestUtil.nextLong(random(), Math.min(numDocs, sumTotalTermFreq), sumTotalTermFreq);
  }
  final int docCount = Math.toIntExact(Math.min(sumDocFreq, numDocs));
  final int maxDoc = TestUtil.nextInt(random(), docCount, docCount + 10);

  return new CollectionStatistics(stats.field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
}
 
Example 17
Source Project: lucene-solr   Source File: TestMaxTermFrequency.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  // Neutral similarity for testing: every match scores zero regardless of
  // frequency or norm.
  return new SimScorer() {
    @Override
    public float score(float freq, long norm) {
      return 0;
    }
  };
}
 
Example 18
Source Project: lucene-solr   Source File: LRUStatsCache.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public CollectionStatistics collectionStatistics(SolrIndexSearcher localSearcher, String field)
    throws IOException {
  CollectionStats cached = currentGlobalColStats.get(field);
  if (cached != null) {
    return cached.toCollectionStatistics();
  }
  // Global stats missing for this field: record the miss and fall back to
  // the local searcher's statistics (or null if there is no local searcher).
  log.debug("## Missing global colStats info: {}, using local", field);
  missingColStats.add(field);
  metrics.missingGlobalFieldStats.increment();
  return localSearcher != null ? localSearcher.localCollectionStatistics(field) : null;
}
 
Example 19
Source Project: lucene-solr   Source File: ExactStatsCache.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public CollectionStatistics collectionStatistics(SolrIndexSearcher localSearcher, String field)
    throws IOException {
  CollectionStats cached = colStatsCache.get(field);
  if (cached != null) {
    return cached.toCollectionStatistics();
  }
  // No cached global stats: count the miss and fall back to local statistics
  // (null when no local searcher is available).
  log.debug("Missing global colStats info for field={}, using local", field);
  metrics.missingGlobalFieldStats.increment();
  return localSearcher != null ? localSearcher.localCollectionStatistics(field) : null;
}
 
Example 20
Source Project: lucene-solr   Source File: StatsCache.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public CollectionStatistics collectionStatistics(String field) throws IOException {
  // Track fields for which the stats source has no data, then delegate.
  CollectionStatistics fromSource = statsSource.collectionStatistics(null, field);
  if (fromSource == null) {
    missingFieldStats.accept(field);
    missingFieldsCount++;
  }
  return super.collectionStatistics(field);
}
 
Example 21
Source Project: lucene-solr   Source File: CollectionStats.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Copies all values from a Lucene {@code CollectionStatistics} into this
 * holder's fields (field name, maxDoc, docCount, sumTotalTermFreq, sumDocFreq).
 */
public CollectionStats(CollectionStatistics stats) {
  this.field = stats.field();
  this.maxDoc = stats.maxDoc();
  this.docCount = stats.docCount();
  this.sumTotalTermFreq = stats.sumTotalTermFreq();
  this.sumDocFreq = stats.sumDocFreq();
}
 
Example 22
Source Project: lucene4ir   Source File: BM25Similarity.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Computes the average field length as {@code sumTotalTermFreq / docCount},
 * using {@code maxDoc} when {@code docCount} is unavailable (-1). Returns
 * {@code 1} when the index does not store sumTotalTermFreq (e.g. a field
 * that omits frequency information) or the field does not exist.
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
  final long totalTokens = collectionStats.sumTotalTermFreq();
  if (totalTokens <= 0) {
    return 1f; // field does not exist, or stat is unsupported
  }
  final long documents = collectionStats.docCount() == -1
      ? collectionStats.maxDoc()
      : collectionStats.docCount();
  return (float) (totalTokens / (double) documents);
}
 
Example 23
Source Project: lucene4ir   Source File: BM25Similarity.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
  // Single-term and phrase queries use different idf explanations.
  Explanation idf = termStats.length == 1
      ? idfExplain(collectionStats, termStats[0])
      : idfExplain(collectionStats, termStats);

  float avgdl = avgFieldLength(collectionStats);

  // compute freq-independent part of bm25 equation across all norm values
  float[] normCache = new float[256];
  for (int norm = 0; norm < normCache.length; norm++) {
    normCache[norm] = k1 * ((1 - b) + b * decodeNormValue((byte) norm) / avgdl);
  }
  return new BM25Stats(collectionStats.field(), idf, avgdl, normCache);
}
 
Example 24
Source Project: lucene4ir   Source File: OKAPIBM25Similarity.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
                                     TermStatistics... termStats) {
  // Number of documents with the field; docCount() is -1 when that statistic
  // is unavailable, in which case fall back to maxDoc().
  long numDocs = collectionStats.docCount();
  if (numDocs == -1) {
    numDocs = collectionStats.maxDoc();
  }

  // Average document length.
  // BUGFIX: the original divided two longs (sumTotalTermFreq / N), truncating
  // the average to a whole number before the float assignment; divide in
  // double precision instead.
  final float avdl = (float) (collectionStats.sumTotalTermFreq() / (double) numDocs);

  float idf_ = 1.0f;
  if (termStats.length == 1) {
    // Single term: the computed idf replaces the initial 1.0.
    idf_ = idf(termStats[0].docFreq(), numDocs);
  } else { /* computation for a phrase */
    // NOTE(review): for phrases the initial 1.0f stays in the sum, unlike the
    // single-term branch — preserved as-is; looks inconsistent, confirm upstream.
    for (final TermStatistics stat : termStats) {
      idf_ += idf(stat.docFreq(), numDocs);
    }
  }

  return new TFIDFWeight(collectionStats.field(), idf_, avdl);
}
 
Example 25
Source Project: lucene4ir   Source File: DumpTermsApp.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Prints summary collection statistics (token count, doc count, sum of doc
 * frequencies, average doc length) for the ALL, TITLE and CONTENT fields.
 */
public void reportCollectionStatistics() throws IOException {

        IndexSearcher searcher = new IndexSearcher(reader);

        printFieldCollectionStats(searcher, "ALL", lucene4ir.Lucene4IRConstants.FIELD_ALL);
        printFieldCollectionStats(searcher, "TITLE", lucene4ir.Lucene4IRConstants.FIELD_TITLE);
        printFieldCollectionStats(searcher, "CONTENT", lucene4ir.Lucene4IRConstants.FIELD_CONTENT);
    }

    /** Prints one line of collection statistics for a single field. */
    private void printFieldCollectionStats(IndexSearcher searcher, String label, String field) throws IOException {
        CollectionStatistics collectionStats = searcher.collectionStatistics(field);
        long token_count = collectionStats.sumTotalTermFreq();
        long doc_count = collectionStats.docCount();
        long sum_doc_count = collectionStats.sumDocFreq();
        // Guard against division by zero on an empty field.
        long avg_doc_length = doc_count == 0 ? 0 : token_count / doc_count;

        System.out.println(label + ": Token count: " + token_count + " Doc Count: " + doc_count
                + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
    }
 
Example 26
Source Project: lucene4ir   Source File: ExampleStatsApp.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Prints summary collection statistics (token count, doc count, sum of doc
 * frequencies, average doc length) for the ALL, TITLE and CONTENT fields.
 */
public void reportCollectionStatistics() throws IOException {

        IndexSearcher searcher = new IndexSearcher(reader);

        reportFieldStats(searcher, "ALL", Lucene4IRConstants.FIELD_ALL);
        reportFieldStats(searcher, "TITLE", Lucene4IRConstants.FIELD_TITLE);
        reportFieldStats(searcher, "CONTENT", Lucene4IRConstants.FIELD_CONTENT);
    }

    /** Prints one line of collection statistics for a single field. */
    private void reportFieldStats(IndexSearcher searcher, String label, String field) throws IOException {
        CollectionStatistics collectionStats = searcher.collectionStatistics(field);
        long token_count = collectionStats.sumTotalTermFreq();
        long doc_count = collectionStats.docCount();
        long sum_doc_count = collectionStats.sumDocFreq();
        // Guard against division by zero on an empty field.
        long avg_doc_length = doc_count == 0 ? 0 : token_count / doc_count;

        System.out.println(label + ": Token count: " + token_count + " Doc Count: " + doc_count
                + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
    }
 
Example 27
Source Project: Elasticsearch   Source File: AggregatedDfs.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates an aggregated distributed-frequency view: per-term statistics,
 * per-field collection statistics, and the aggregate maxDoc.
 */
public AggregatedDfs(ObjectObjectHashMap<Term, TermStatistics> termStatistics, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics, long maxDoc) {
    this.termStatistics = termStatistics;
    this.fieldStatistics = fieldStatistics;
    this.maxDoc = maxDoc;
}
 
Example 28
Source Project: Elasticsearch   Source File: AggregatedDfs.java    License: Apache License 2.0 4 votes vote down vote up
/** Returns the per-field collection statistics held by this instance. */
public ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics() {
    return fieldStatistics;
}
 
Example 29
Source Project: Elasticsearch   Source File: DfsSearchResult.java    License: Apache License 2.0 4 votes vote down vote up
/** Returns the per-field collection statistics gathered for this result. */
public ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics() {
    return fieldStatistics;
}
 
Example 30
Source Project: Elasticsearch   Source File: DfsSearchResult.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Reads per-field collection statistics from the stream into a freshly
 * allocated map; convenience overload of the two-argument readFieldStats.
 */
public static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(StreamInput in) throws IOException {
    return readFieldStats(in, null);
}