org.apache.lucene.search.CollectionStatistics Java Examples

The following examples show how to use org.apache.lucene.search.CollectionStatistics. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DfsSearchResult.java    From Elasticsearch with Apache License 2.0 6 votes vote down vote up
/**
 * Deserializes per-field collection statistics from a stream.
 *
 * <p>The stream is expected to hold an entry count followed by that many entries, each made
 * of a field name, maxDoc, docCount, sumTotalTermFreq and sumDocFreq. The last three values
 * are passed through {@code subOne} after being read.
 *
 * @param in stream to read from
 * @param fieldStatistics map to fill; when {@code null} a new map sized for the incoming
 *        entry count is created
 * @return the map holding the decoded statistics (the argument itself, or the new map)
 * @throws IOException if reading from the stream fails
 */
public static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(StreamInput in, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics) throws IOException {
    final int entryCount = in.readVInt();
    ObjectObjectHashMap<String, CollectionStatistics> target = fieldStatistics;
    if (target == null) {
        target = HppcMaps.newNoNullKeysMap(entryCount);
    }
    int remaining = entryCount;
    while (remaining-- > 0) {
        // Values must be consumed in exactly the order they were written.
        final String fieldName = in.readString();
        assert fieldName != null;
        final long maxDoc = in.readVLong();
        final long docCount = subOne(in.readVLong());
        final long sumTotalTermFreq = subOne(in.readVLong());
        final long sumDocFreq = subOne(in.readVLong());
        target.put(fieldName, new CollectionStatistics(fieldName, maxDoc, docCount, sumTotalTermFreq, sumDocFreq));
    }
    return target;
}
 
Example #2
Source File: NormValueSource.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Produces per-document float values obtained by scoring every document with a scorer built
 * from unit collection and term statistics.
 *
 * @param context must hold the current {@code IndexSearcher} under the key {@code "searcher"}
 * @param readerContext the leaf whose documents are scored
 * @throws UnsupportedOperationException if the searcher's similarity cannot be viewed as a
 *         {@code TFIDFSimilarity}
 * @throws IOException declared by the overridden method and by the leaf scorer
 */
@Override
public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext) throws IOException {
  final IndexSearcher indexSearcher = (IndexSearcher) context.get("searcher");
  final TFIDFSimilarity tfidf = IDFValueSource.asTFIDF(indexSearcher.getSimilarity(), field);
  if (tfidf == null) {
    throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)");
  }
  // This is only valid when tf contributes 1 for freq == 1 and idf contributes 1
  // for docCount == docFreq == 1, hence the all-ones statistics below.
  final CollectionStatistics unitCollectionStats = new CollectionStatistics(field, 1, 1, 1, 1);
  final TermStatistics unitTermStats = new TermStatistics(new BytesRef("bogus"), 1, 1);
  final SimScorer unitScorer = tfidf.scorer(1f, unitCollectionStats, unitTermStats);
  final LeafSimScorer perLeafScorer = new LeafSimScorer(unitScorer, readerContext.reader(), field, true);

  return new FloatDocValues(this) {
    // Highest doc id seen so far; callers must not go backwards.
    int previousDocID = -1;

    @Override
    public float floatVal(int docID) throws IOException {
      if (previousDocID > docID) {
        throw new AssertionError("docs out of order: lastDocID=" + previousDocID + " docID=" + docID);
      }
      previousDocID = docID;
      return perLeafScorer.score(docID, 1f);
    }
  };
}
 
Example #3
Source File: SpanWeight.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the similarity scorer for a span query from the statistics of its terms.
 *
 * @return the scorer, or {@code null} when there are no term states, the query has no field,
 *         or none of the terms has a positive document frequency
 */
private Similarity.SimScorer buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException {
  if (termStates == null || termStates.size() == 0 || query.getField() == null) {
    return null;
  }
  final TermStatistics[] collected = new TermStatistics[termStates.size()];
  int collectedCount = 0;
  for (Map.Entry<Term, TermStates> termEntry : termStates.entrySet()) {
    final TermStates states = termEntry.getValue();
    if (states.docFreq() <= 0) {
      continue; // term does not occur, contributes no statistics
    }
    collected[collectedCount] = searcher.termStatistics(termEntry.getKey(), states.docFreq(), states.totalTermFreq());
    collectedCount++;
  }
  final CollectionStatistics fieldStats = searcher.collectionStatistics(query.getField());
  if (collectedCount == 0) {
    return null; // no terms at all exist, we won't use similarity
  }
  // Only the filled prefix of the array is handed to the similarity.
  return similarity.scorer(boost, fieldStats, ArrayUtil.copyOfSubArray(collected, 0, collectedCount));
}
 
Example #4
Source File: SMARTBNNBNNSimilarity.java    From lucene4ir with Apache License 2.0 6 votes vote down vote up
/**
 * Computes a {@code TFIDFWeight} from the collection's maxDoc and the document
 * frequencies of the given terms.
 *
 * <p>For a single term the idf is {@code log(N / n)}; for several terms each
 * {@code log(N / n)} is added to an accumulator.
 * NOTE(review): the accumulator starts at 1.0f, so the multi-term sum includes an
 * extra 1 -- confirm this is intended.
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
                                     TermStatistics... termStats) {
    final float numDocs = collectionStats.maxDoc();
    final float avgDocLength = collectionStats.sumTotalTermFreq() / numDocs;

    float idf = 1.0f;
    if (termStats.length == 1) {
        final float docFreq = termStats[0].docFreq();
        idf = log(numDocs / docFreq);
    } else {
        for (final TermStatistics stat : termStats) {
            final float docFreq = stat.docFreq();
            idf += log(numDocs / docFreq);
        }
    }

    return new TFIDFWeight(collectionStats.field(), idf, avgDocLength);
}
 
Example #5
Source File: AssertingSimilarity.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Validates the arguments with assertions, delegates to the wrapped similarity and wraps
 * the resulting scorer in an {@code AssertingSimScorer}.
 */
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  assert boost >= 0;
  assert collectionStats != null;
  assert termStats.length > 0;
  for (int i = 0; i < termStats.length; i++) {
    assert termStats[i] != null;
  }
  // TODO: check that TermStats is in bounds with respect to collection? e.g. docFreq <= maxDoc
  final SimScorer wrapped = delegate.scorer(boost, collectionStats, termStats);
  assert wrapped != null;
  return new AssertingSimScorer(wrapped, boost);
}
 
Example #6
Source File: MultiSimilarity.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Asks every configured similarity for a scorer, using the same arguments for each, and
 * combines the results into a {@code MultiSimScorer}.
 */
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final SimScorer[] perSimilarity = new SimScorer[sims.length];
  for (int idx = 0; idx < sims.length; idx++) {
    perSimilarity[idx] = sims[idx].scorer(boost, collectionStats, termStats);
  }
  return new MultiSimScorer(perSimilarity);
}
 
Example #7
Source File: LRUStatsCache.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the global collection statistics held for {@code field}.
 *
 * <p>When no global entry exists the miss is logged, the field is added to
 * {@code missingColStats}, the missing-field metric is incremented, and the local searcher's
 * statistics are returned instead ({@code null} when {@code localSearcher} is {@code null}).
 */
@Override
public CollectionStatistics collectionStatistics(SolrIndexSearcher localSearcher, String field)
    throws IOException {
  final CollectionStats globalStats = currentGlobalColStats.get(field);
  if (globalStats != null) {
    return globalStats.toCollectionStatistics();
  }
  log.debug("## Missing global colStats info: {}, using local", field);
  missingColStats.add(field);
  metrics.missingGlobalFieldStats.increment();
  if (localSearcher == null) {
    return null;
  }
  return localSearcher.localCollectionStatistics(field);
}
 
Example #8
Source File: ExactStatsCache.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the cached global collection statistics for {@code field}.
 *
 * <p>When the cache has no entry the miss is logged, the missing-field metric is incremented,
 * and the local searcher's statistics are returned instead ({@code null} when
 * {@code localSearcher} is {@code null}).
 */
@Override
public CollectionStatistics collectionStatistics(SolrIndexSearcher localSearcher, String field)
    throws IOException {
  final CollectionStats cachedStats = colStatsCache.get(field);
  if (cachedStats != null) {
    return cachedStats.toCollectionStatistics();
  }
  log.debug("Missing global colStats info for field={}, using local", field);
  metrics.missingGlobalFieldStats.increment();
  if (localSearcher == null) {
    return null;
  }
  return localSearcher.localCollectionStatistics(field);
}
 
Example #9
Source File: TestMaxTermFrequency.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Returns a scorer whose score is always zero, whatever the frequency and norm. */
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  return new SimScorer() {
    @Override
    public float score(float freq, long norm) {
      return 0f;
    }
  };
}
 
Example #10
Source File: BM25Similarity.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code BM25Scorer} from the idf explanation, the average field length and a
 * 256-entry table of precomputed factors.
 *
 * <p>A single term uses the one-term {@code idfExplain} overload; otherwise the array
 * overload is used.
 */
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final Explanation idf;
  if (termStats.length == 1) {
    idf = idfExplain(collectionStats, termStats[0]);
  } else {
    idf = idfExplain(collectionStats, termStats);
  }
  final float avgdl = avgFieldLength(collectionStats);

  // One entry per LENGTH_TABLE index, so the scorer can look the factor up
  // instead of recomputing it.
  final float[] inverseFactors = new float[256];
  for (int slot = 0; slot < inverseFactors.length; slot++) {
    inverseFactors[slot] = 1f / (k1 * ((1 - b) + b * LENGTH_TABLE[slot] / avgdl));
  }
  return new BM25Scorer(boost, k1, b, idf, avgdl, inverseFactors);
}
 
Example #11
Source File: StatsCache.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the superclass after checking whether {@code statsSource} has statistics for
 * the field; a miss is reported to {@code missingFieldStats} and counted.
 */
@Override
public CollectionStatistics collectionStatistics(String field) throws IOException {
  final boolean missingFromSource = statsSource.collectionStatistics(null, field) == null;
  if (missingFromSource) {
    missingFieldStats.accept(field);
    missingFieldsCount++;
  }
  return super.collectionStatistics(field);
}
 
Example #12
Source File: LMSimilarity.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Fills the usual statistics through the superclass and then stores the collection
 * probability computed by the collection model. {@code stats} is cast to {@code LMStats}.
 */
@Override
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
  super.fillBasicStats(stats, collectionStats, termStats);
  ((LMStats) stats).setCollectionProbability(collectionModel.computeProbability(stats));
}
 
Example #13
Source File: SimilarityBase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Populates every member field of {@code BasicStats} on {@code stats} from the given
 * collection and term statistics. Subclasses may override this to fill in more.
 */
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
  final long fieldTokens = collectionStats.sumTotalTermFreq();

  // TODO: validate this for real, somewhere else
  assert termStats.totalTermFreq() <= fieldTokens;
  assert termStats.docFreq() <= collectionStats.sumDocFreq();

  // TODO: add sumDocFreq for field (numberOfFieldPostings)
  final long docsWithField = collectionStats.docCount();
  stats.setNumberOfDocuments(docsWithField);
  stats.setNumberOfFieldTokens(fieldTokens);
  // Divide in double precision so the average keeps its fractional part.
  stats.setAvgFieldLength(fieldTokens / (double) docsWithField);
  stats.setDocFreq(termStats.docFreq());
  stats.setTotalTermFreq(termStats.totalTermFreq());
}
 
Example #14
Source File: SimilarityBase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates one {@code BasicSimScorer} per term. A single term's scorer is returned directly;
 * any other number of terms is wrapped in a {@code MultiSimilarity.MultiSimScorer}.
 */
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final SimScorer[] perTerm = new SimScorer[termStats.length];
  int slot = 0;
  for (TermStatistics termStat : termStats) {
    final BasicStats stats = newStats(collectionStats.field(), boost);
    fillBasicStats(stats, collectionStats, termStat);
    perTerm[slot++] = new BasicSimScorer(stats);
  }
  if (perTerm.length != 1) {
    return new MultiSimilarity.MultiSimScorer(perTerm);
  }
  return perTerm[0];
}
 
Example #15
Source File: CollectionStats.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Copies field name, maxDoc, docCount, sumTotalTermFreq and sumDocFreq from the given
 * {@code CollectionStatistics}.
 */
public CollectionStats(CollectionStatistics stats) {
  // Identity of the field the numbers belong to.
  this.field = stats.field();
  // Document-level counts.
  this.maxDoc = stats.maxDoc();
  this.docCount = stats.docCount();
  // Postings-level sums.
  this.sumDocFreq = stats.sumDocFreq();
  this.sumTotalTermFreq = stats.sumTotalTermFreq();
}
 
Example #16
Source File: TestSimilarityBase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Derives a {@code CollectionStatistics} from {@code stats}, picking random values for
 * sumDocFreq and maxDoc within bounds taken from {@code stats}.
 */
private CollectionStatistics toCollectionStats(BasicStats stats) {
  final long sumTtf = stats.getNumberOfFieldTokens();
  final long numDocs = stats.getNumberOfDocuments();

  // When sumTtf is -1 the range is [numDocs, 2 * numDocs]; otherwise it is capped by sumTtf.
  final boolean ttfMissing = sumTtf == -1;
  final long lowerBound = ttfMissing ? numDocs : Math.min(numDocs, sumTtf);
  final long upperBound = ttfMissing ? 2L * numDocs : sumTtf;
  final long sumDf = TestUtil.nextLong(random(), lowerBound, upperBound);

  final int docCount = Math.toIntExact(Math.min(sumDf, numDocs));
  final int maxDoc = TestUtil.nextInt(random(), docCount, docCount + 10);

  return new CollectionStatistics(stats.field, maxDoc, docCount, sumTtf, sumDf);
}
 
Example #17
Source File: BM25Similarity.java    From lucene4ir with Apache License 2.0 5 votes vote down vote up
/**
 * Computes the average field length as <code>sumTotalTermFreq / docCount</code>.
 * Returns <code>1</code> when sumTotalTermFreq is not positive, and uses
 * <code>maxDoc</code> as the divisor when docCount is <code>-1</code>.
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
  final long totalTokens = collectionStats.sumTotalTermFreq();
  if (totalTokens <= 0) {
    return 1f;       // field does not exist, or stat is unsupported
  }
  long divisor = collectionStats.docCount();
  if (divisor == -1) {
    divisor = collectionStats.maxDoc();
  }
  return (float) (totalTokens / (double) divisor);
}
 
Example #18
Source File: BM25Similarity.java    From lucene4ir with Apache License 2.0 5 votes vote down vote up
/**
 * Builds {@code BM25Stats} from the idf explanation, the average field length and a
 * 256-entry table holding the frequency-independent part of the formula for every
 * possible norm byte.
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
  final Explanation idf;
  if (termStats.length == 1) {
    idf = idfExplain(collectionStats, termStats[0]);
  } else {
    idf = idfExplain(collectionStats, termStats);
  }

  final float avgdl = avgFieldLength(collectionStats);

  // Precompute the freq-independent part of the bm25 equation for each norm value.
  final float[] normFactors = new float[256];
  for (int norm = 0; norm < normFactors.length; norm++) {
    normFactors[norm] = k1 * ((1 - b) + b * decodeNormValue((byte)norm) / avgdl);
  }
  return new BM25Stats(collectionStats.field(), idf, avgdl, normFactors);
}
 
Example #19
Source File: OKAPIBM25Similarity.java    From lucene4ir with Apache License 2.0 5 votes vote down vote up
/**
 * Computes a {@code TFIDFWeight} carrying the idf of the given terms and the average
 * document length of the field.
 *
 * <p>The document count is {@code docCount()}, falling back to {@code maxDoc()} when
 * docCount is {@code -1}. The average document length is computed with floating-point
 * division; dividing the two {@code long} values directly would truncate it to a whole
 * number and throw {@code ArithmeticException} for a document count of zero. When
 * sumTotalTermFreq or the document count is not positive, the average falls back to
 * {@code 1}, the same convention {@code BM25Similarity.avgFieldLength} uses.
 *
 * <p>For a single term the idf is {@code idf(n, N)}; for several terms (a phrase) each
 * term's idf is added to an accumulator.
 * NOTE(review): the accumulator starts at 1.0f, so the phrase sum includes an extra 1 --
 * confirm this is intended.
 *
 * @param collectionStats statistics of the field being scored
 * @param termStats statistics of the query term(s)
 * @return the weight holding field name, idf and average document length
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
                                     TermStatistics... termStats) {
    long numDocs = collectionStats.docCount();
    if (numDocs == -1) {
        numDocs = collectionStats.maxDoc();
    }

    final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
    final float avdl;
    if (sumTotalTermFreq <= 0 || numDocs <= 0) {
        // Field does not exist or the statistic is unsupported.
        avdl = 1f;
    } else {
        avdl = (float) (sumTotalTermFreq / (double) numDocs);
    }

    float idfSum = 1.0f;
    if (termStats.length == 1) {
        idfSum = idf(termStats[0].docFreq(), numDocs);
    } else { /* computation for a phrase */
        for (final TermStatistics stat : termStats) {
            idfSum += idf(stat.docFreq(), numDocs);
        }
    }

    return new TFIDFWeight(collectionStats.field(), idfSum, avdl);
}
 
Example #20
Source File: TestMemoryIndex.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that the norm value read for field "f1" changes after the MemoryIndex is given
 * a Similarity whose computeNorm returns the constant 74.
 */
@Test
public void testSimilarities() throws IOException {

  MemoryIndex mi = new MemoryIndex();
  mi.addField("f1", "a long text field that contains many many terms", analyzer);

  // Read the norm of the single document under the initial similarity.
  IndexSearcher searcher = mi.createSearcher();
  LeafReader reader = (LeafReader) searcher.getIndexReader();
  NumericDocValues norms = reader.getNormValues("f1");
  assertEquals(0, norms.nextDoc());
  float n1 = norms.longValue();

  // Norms are re-computed when we change the Similarity
  mi.setSimilarity(new Similarity() {

    @Override
    public long computeNorm(FieldInvertState state) {
      return 74;
    }

    // Scoring is not exercised by this test.
    @Override
    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
      throw new UnsupportedOperationException();
    }

  });
  // Fetch the norms again through the same reader and compare with the first reading.
  norms = reader.getNormValues("f1");
  assertEquals(0, norms.nextDoc());
  float n2 = norms.longValue();

  assertTrue(n1 != n2);
  TestUtil.checkReader(reader);
}
 
Example #21
Source File: LindenSimilarity.java    From linden with Apache License 2.0 5 votes vote down vote up
/**
 * Explains the idf of a term. The idf value itself comes from {@code idfManager}, looked up
 * by the term's text; docFreq and maxDoc only appear in the description.
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long docFreq = termStats.docFreq();
  final long maxDocs = collectionStats.maxDoc();
  final String termText = termStats.term().utf8ToString();
  final float idf = idfManager.getIDF(termText);
  final String description = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")";
  return new Explanation(idf, description);
}
 
Example #22
Source File: TermVectorsWriter.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Writes sumTotalTermFreq, sumDocFreq and docCount of a field, in that order. Each value is
 * asserted to be at least {@code -1} and written with the "potentially negative" encoders.
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
    final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
    assert (sumTotalTermFreq >= -1);
    writePotentiallyNegativeVLong(sumTotalTermFreq);

    final long sumDocFreq = fieldStats.sumDocFreq();
    assert (sumDocFreq >= -1);
    writePotentiallyNegativeVLong(sumDocFreq);

    // docCount is narrowed to int before being written.
    final int docCount = (int) fieldStats.docCount();
    assert (docCount >= -1);
    writePotentiallyNegativeVInt(docCount);
}
 
Example #23
Source File: DumpTermsApp.java    From lucene4ir with Apache License 2.0 5 votes vote down vote up
/**
 * Prints token count, document count, summed document frequency and average document
 * length for the ALL, TITLE and CONTENT fields to standard output, one line per field.
 *
 * @throws IOException if the collection statistics cannot be read
 */
public void reportCollectionStatistics() throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);

    printFieldStatistics(searcher, "ALL", lucene4ir.Lucene4IRConstants.FIELD_ALL);
    printFieldStatistics(searcher, "TITLE", lucene4ir.Lucene4IRConstants.FIELD_TITLE);
    printFieldStatistics(searcher, "CONTENT", lucene4ir.Lucene4IRConstants.FIELD_CONTENT);
}

/** Prints one line of collection statistics for {@code field}, prefixed with {@code label}. */
private void printFieldStatistics(IndexSearcher searcher, String label, String field) throws IOException {
    CollectionStatistics collectionStats = searcher.collectionStatistics(field);
    long tokenCount = collectionStats.sumTotalTermFreq();
    long docCount = collectionStats.docCount();
    long sumDocCount = collectionStats.sumDocFreq();
    // Guard the division: a non-positive document count would otherwise throw
    // ArithmeticException (0) or print a meaningless negative average (-1).
    long avgDocLength = docCount > 0 ? tokenCount / docCount : 0;

    System.out.println(label + ": Token count: " + tokenCount + " Doc Count: " + docCount + " sum doc: " + sumDocCount + " avg doc len: " + avgDocLength);
}
 
Example #24
Source File: DfsSearchResult.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Serializes per-field collection statistics: the entry count, then for every entry the
 * field name, maxDoc, and docCount, sumTotalTermFreq and sumDocFreq each passed through
 * {@code addOne} before being written.
 */
public static void writeFieldStats(StreamOutput out, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics) throws IOException {
    out.writeVInt(fieldStatistics.size());

    for (ObjectObjectCursor<String, CollectionStatistics> cursor : fieldStatistics) {
        final CollectionStatistics stats = cursor.value;
        out.writeString(cursor.key);
        assert stats.maxDoc() >= 0;
        out.writeVLong(stats.maxDoc());
        // The remaining values go through addOne before being written as VLongs.
        out.writeVLong(addOne(stats.docCount()));
        out.writeVLong(addOne(stats.sumTotalTermFreq()));
        out.writeVLong(addOne(stats.sumDocFreq()));
    }
}
 
Example #25
Source File: ExampleStatsApp.java    From lucene4ir with Apache License 2.0 5 votes vote down vote up
/**
 * Prints token count, document count, summed document frequency and average document
 * length for the ALL, TITLE and CONTENT fields to standard output, one line per field.
 *
 * @throws IOException if the collection statistics cannot be read
 */
public void reportCollectionStatistics() throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);

    printFieldStatistics(searcher, "ALL", Lucene4IRConstants.FIELD_ALL);
    printFieldStatistics(searcher, "TITLE", Lucene4IRConstants.FIELD_TITLE);
    printFieldStatistics(searcher, "CONTENT", Lucene4IRConstants.FIELD_CONTENT);
}

/** Prints one line of collection statistics for {@code field}, prefixed with {@code label}. */
private void printFieldStatistics(IndexSearcher searcher, String label, String field) throws IOException {
    CollectionStatistics collectionStats = searcher.collectionStatistics(field);
    long tokenCount = collectionStats.sumTotalTermFreq();
    long docCount = collectionStats.docCount();
    long sumDocCount = collectionStats.sumDocFreq();
    // Guard the division: a non-positive document count would otherwise throw
    // ArithmeticException (0) or print a meaningless negative average (-1).
    long avgDocLength = docCount > 0 ? tokenCount / docCount : 0;

    System.out.println(label + ": Token count: " + tokenCount + " Doc Count: " + docCount + " sum doc: " + sumDocCount + " avg doc len: " + avgDocLength);
}
 
Example #26
Source File: ClassicSimilarity.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Explains the idf of a single term: the value returned by {@code idf(docFreq, docCount)},
 * with docFreq and docCount attached as sub-explanations.
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long docFreq = termStats.docFreq();
  final long docsWithField = collectionStats.docCount();
  final float idfValue = idf(docFreq, docsWithField);

  final Explanation docFreqDetail =
      Explanation.match(docFreq, "docFreq, number of documents containing term");
  final Explanation docCountDetail =
      Explanation.match(docsWithField, "docCount, total number of documents with field");
  return Explanation.match(idfValue, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
      docFreqDetail, docCountDetail);
}
 
Example #27
Source File: TestOmitTf.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/** Returns a fixed explanation with value 1.0, ignoring both arguments. */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
  final Explanation fixed = Explanation.match(1.0f, "Inexplicable");
  return fixed;
}
 
Example #28
Source File: TestIndexSorting.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/** Forwards the call unchanged to the wrapped similarity {@code in}. */
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final SimScorer delegated = in.scorer(boost, collectionStats, termStats);
  return delegated;
}
 
Example #29
Source File: TestUniqueTermCount.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Not supported by this similarity: always throws.
 *
 * @throws UnsupportedOperationException on every call
 */
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  throw new UnsupportedOperationException();
}
 
Example #30
Source File: TFSimilarity.java    From lumongo with Apache License 2.0 4 votes vote down vote up
/**
 * Returns a {@code BooleanWeight} built from the boost alone; the collection and term
 * statistics are not used.
 */
@Override
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
	final TFSimilarity.BooleanWeight weight = new TFSimilarity.BooleanWeight(boost);
	return weight;
}