Java Code Examples for org.apache.lucene.search.TermStatistics

The following examples show how to use org.apache.lucene.search.TermStatistics. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: Elasticsearch   Source File: AggregatedDfs.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Serializes this object to {@code out}.
 *
 * <p>Layout, in order: the number of term entries; for every entry the key term's field
 * and bytes followed by the statistics' term bytes, document frequency and total term
 * frequency (the latter passed through {@code DfsSearchResult.addOne}); the field
 * statistics; and finally {@code maxDoc}.
 *
 * @param out the stream to write to
 * @throws IOException if writing to the stream fails
 */
@Override
public void writeTo(final StreamOutput out) throws IOException {
    out.writeVInt(termStatistics.size());

    for (ObjectObjectCursor<Term, TermStatistics> cursor : termStatistics()) {
        final Term key = cursor.key;
        final TermStatistics value = cursor.value;

        // Map key: field name, then the raw term bytes.
        out.writeString(key.field());
        out.writeBytesRef(key.bytes());

        // Map value: term bytes, docFreq, shifted totalTermFreq.
        // NOTE(review): addOne presumably lets a -1 "unknown" value fit a VLong - confirm.
        out.writeBytesRef(value.term());
        out.writeVLong(value.docFreq());
        out.writeVLong(DfsSearchResult.addOne(value.totalTermFreq()));
    }

    DfsSearchResult.writeFieldStats(out, fieldStatistics);
    out.writeVLong(maxDoc);
}
 
Example 2
Source Project: Elasticsearch   Source File: DfsSearchResult.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads an array of term statistics from {@code in}.
 *
 * <p>The stream holds a count followed, per entry, by a document frequency and a total
 * term frequency (the latter is passed through {@code subOne} after reading). The term
 * bytes are not read from the stream; they are taken from {@code terms} at the same index.
 *
 * @param in    the stream to read from
 * @param terms the terms the statistics belong to, expected to have as many entries as
 *              the count stored in the stream
 * @return the statistics, or {@code EMPTY_TERM_STATS} when the stored count is zero
 * @throws IOException if reading from the stream fails
 */
public static TermStatistics[] readTermStats(StreamInput in, Term[] terms) throws IOException {
    final int count = in.readVInt();
    if (count == 0) {
        return EMPTY_TERM_STATS;
    }

    final TermStatistics[] result = new TermStatistics[count];
    assert terms.length == count;
    for (int idx = 0; idx < result.length; idx++) {
        final BytesRef termBytes = terms[idx].bytes();
        final long docFreq = in.readVLong();
        assert docFreq >= 0;
        final long totalTermFreq = subOne(in.readVLong());
        result[idx] = new TermStatistics(termBytes, docFreq, totalTermFreq);
    }
    return result;
}
 
Example 3
Source Project: lucene-solr   Source File: NormValueSource.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds per-document values for the configured field using the TF-IDF similarity of
 * the searcher stored in {@code context} under the key {@code "searcher"}.
 *
 * @param context       the function context; must hold an {@code IndexSearcher} under "searcher"
 * @param readerContext the leaf whose documents are scored
 * @return values whose {@code floatVal} is the leaf scorer's score for a frequency of 1
 * @throws UnsupportedOperationException if the searcher's similarity is not TF-IDF based
 * @throws IOException                   if the underlying reader fails
 */
@Override
public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext) throws IOException {
  final IndexSearcher indexSearcher = (IndexSearcher) context.get("searcher");
  final TFIDFSimilarity tfidf = IDFValueSource.asTFIDF(indexSearcher.getSimilarity(), field);
  if (tfidf == null) {
    throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)");
  }

  // This approach is only valid when tf contributes 1 for freq == 1 and idf contributes 1
  // for docCount == docFreq == 1, hence the all-ones statistics below.
  final CollectionStatistics unitCollectionStats = new CollectionStatistics(field, 1, 1, 1, 1);
  final TermStatistics unitTermStats = new TermStatistics(new BytesRef("bogus"), 1, 1);
  final SimScorer unitScorer = tfidf.scorer(1f, unitCollectionStats, unitTermStats);
  final LeafSimScorer normScorer = new LeafSimScorer(unitScorer, readerContext.reader(), field, true);

  return new FloatDocValues(this) {
    // Highest doc id seen so far; callers must not go backwards.
    int previousDocID = -1;

    @Override
    public float floatVal(int docID) throws IOException {
      if (docID < previousDocID) {
        throw new AssertionError("docs out of order: lastDocID=" + previousDocID + " docID=" + docID);
      }
      previousDocID = docID;
      return normScorer.score(docID, 1f);
    }
  };
}
 
Example 4
Source Project: lucene-solr   Source File: SpanWeight.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates the similarity scorer for a span query from the statistics of its terms.
 *
 * @param query      the span query being weighted
 * @param searcher   supplies term and collection statistics
 * @param termStates per-term states; may be {@code null} or empty
 * @param boost      the boost handed to the similarity
 * @return the scorer, or {@code null} when there are no term states, the query has no
 *         field, or none of the terms has a positive document frequency
 * @throws IOException if gathering statistics fails
 */
private Similarity.SimScorer buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException {
  final boolean noStates = termStates == null || termStates.size() == 0;
  if (noStates || query.getField() == null) {
    return null;
  }

  // Sized for the worst case; only the first `count` slots end up populated.
  final TermStatistics[] collected = new TermStatistics[termStates.size()];
  int count = 0;
  for (Map.Entry<Term, TermStates> e : termStates.entrySet()) {
    final TermStates states = e.getValue();
    if (states.docFreq() <= 0) {
      continue; // term matches no document, skip it
    }
    collected[count++] = searcher.termStatistics(e.getKey(), states.docFreq(), states.totalTermFreq());
  }

  final CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField());
  if (count <= 0) {
    return null; // no terms at all exist, we won't use similarity
  }
  return similarity.scorer(boost, collectionStats, ArrayUtil.copyOfSubArray(collected, 0, count));
}
 
Example 5
Source Project: lucene4ir   Source File: SMARTBNNBNNSimilarity.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Computes the query-time weight from collection and term statistics.
 *
 * <p>With exactly one term the idf is {@code log(N / n)}, where {@code N} is
 * {@code maxDoc()} and {@code n} the term's document frequency. With any other number of
 * terms the idf starts at 1.0 and {@code log(N / n)} is added for every term. The average
 * document length is {@code sumTotalTermFreq() / N}.
 *
 * @param collectionStats statistics of the field being scored
 * @param termStats       statistics of the query term(s)
 * @return a {@code TFIDFWeight} carrying the field name, idf and average document length
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
                                     TermStatistics... termStats) {
  final float numDocs = collectionStats.maxDoc();
  final float avgDocLength = collectionStats.sumTotalTermFreq() / numDocs;

  float idf = 1.0f;
  if (termStats.length == 1) {
    final float docFreq = termStats[0].docFreq();
    idf = log(numDocs / docFreq);
  } else {
    for (final TermStatistics stat : termStats) {
      final float docFreq = stat.docFreq();
      idf += log(numDocs / docFreq);
    }
  }

  return new TFIDFWeight(collectionStats.field(), idf, avgDocLength);
}
 
Example 6
Source Project: Elasticsearch   Source File: AggregatedDfs.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Populates this object from {@code in}.
 *
 * <p>Reads an entry count, then per entry a key term (field and bytes) and its statistics
 * (term bytes, document frequency, and a total term frequency passed through
 * {@code DfsSearchResult.subOne}), then the field statistics and {@code maxDoc}.
 *
 * @param in the stream to read from
 * @throws IOException if reading from the stream fails
 */
@Override
public void readFrom(StreamInput in) throws IOException {
    final int entries = in.readVInt();
    termStatistics = HppcMaps.newMap(entries);
    for (int remaining = entries; remaining > 0; remaining--) {
        final Term key = new Term(in.readString(), in.readBytesRef());
        // The reads below happen in stream order: term bytes, docFreq, totalTermFreq.
        termStatistics.put(key, new TermStatistics(
                in.readBytesRef(),
                in.readVLong(),
                DfsSearchResult.subOne(in.readVLong())));
    }
    fieldStatistics = DfsSearchResult.readFieldStats(in);
    maxDoc = in.readVLong();
}
 
Example 7
Source Project: Elasticsearch   Source File: TermVectorsWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes the document frequency and total term frequency of {@code termStatistics}.
 *
 * <p>Both values are asserted to be at least -1 and are written with the
 * "potentially negative" variable-length writers.
 *
 * @param termStatistics the statistics to write
 * @throws IOException if writing fails
 */
private void writeTermStatistics(TermStatistics termStatistics) throws IOException {
    final int documentFrequency = (int) termStatistics.docFreq();
    assert documentFrequency >= -1;
    writePotentiallyNegativeVInt(documentFrequency);

    final long totalTermFrequency = termStatistics.totalTermFreq();
    assert totalTermFrequency >= -1;
    writePotentiallyNegativeVLong(totalTermFrequency);
}
 
Example 8
Source Project: linden   Source File: LindenSimilarity.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Explains the idf of a single term.
 *
 * <p>The idf value itself comes from {@code idfManager}, looked up by the term's UTF-8
 * text; the document frequency and max doc count only appear in the description.
 *
 * @param collectionStats statistics of the collection
 * @param termStats       statistics of the term
 * @return an explanation holding the idf value and a description
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long docFreq = termStats.docFreq();
  final long maxDocs = collectionStats.maxDoc();
  final String termText = termStats.term().utf8ToString();
  final float idfValue = idfManager.getIDF(termText);
  final String description = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")";
  return new Explanation(idfValue, description);
}
 
Example 9
Source Project: lucene-solr   Source File: TestMemoryIndex.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks that the norm of field "f1" changes after a different {@code Similarity} is set
 * on the {@code MemoryIndex}, and that the reader still passes {@code TestUtil.checkReader}.
 */
@Test
public void testSimilarities() throws IOException {
  final MemoryIndex index = new MemoryIndex();
  index.addField("f1", "a long text field that contains many many terms", analyzer);

  final IndexSearcher indexSearcher = index.createSearcher();
  final LeafReader leafReader = (LeafReader) indexSearcher.getIndexReader();

  // Norm under the similarity the index starts with.
  NumericDocValues normValues = leafReader.getNormValues("f1");
  assertEquals(0, normValues.nextDoc());
  final float normBefore = normValues.longValue();

  // Norms are re-computed when we change the Similarity
  final Similarity constantNormSimilarity = new Similarity() {

    @Override
    public long computeNorm(FieldInvertState state) {
      return 74;
    }

    @Override
    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
      throw new UnsupportedOperationException();
    }

  };
  index.setSimilarity(constantNormSimilarity);

  // Norm after the similarity swap.
  normValues = leafReader.getNormValues("f1");
  assertEquals(0, normValues.nextDoc());
  final float normAfter = normValues.longValue();

  assertTrue(normBefore != normAfter);
  TestUtil.checkReader(leafReader);
}
 
Example 10
Source Project: lucene-solr   Source File: AssertingSimilarity.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Validates the arguments with assertions, delegates to the wrapped similarity and wraps
 * the resulting scorer in an {@code AssertingSimScorer}.
 *
 * @param boost           asserted to be non-negative
 * @param collectionStats asserted to be non-null
 * @param termStats       asserted to be non-empty and free of {@code null} entries
 * @return the asserting wrapper around the delegate's scorer
 */
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  assert boost >= 0;
  assert collectionStats != null;
  assert termStats.length > 0;
  for (int i = 0; i < termStats.length; i++) {
    assert termStats[i] != null;
  }
  // TODO: check that TermStats is in bounds with respect to collection? e.g. docFreq <= maxDoc
  final SimScorer wrapped = delegate.scorer(boost, collectionStats, termStats);
  assert wrapped != null;
  return new AssertingSimScorer(wrapped, boost);
}
 
Example 11
Source Project: lucene-solr   Source File: SimilarityBase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds one {@code BasicSimScorer} per term from freshly filled {@code BasicStats}.
 *
 * @param boost           the boost passed to {@code newStats}
 * @param collectionStats statistics of the field being scored
 * @param termStats       statistics of the query term(s)
 * @return the single scorer when exactly one term was given, otherwise a
 *         {@code MultiSimilarity.MultiSimScorer} over all per-term scorers
 */
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final int termCount = termStats.length;
  final SimScorer[] perTerm = new SimScorer[termCount];
  int slot = 0;
  for (TermStatistics termStat : termStats) {
    final BasicStats basicStats = newStats(collectionStats.field(), boost);
    fillBasicStats(basicStats, collectionStats, termStat);
    perTerm[slot++] = new BasicSimScorer(basicStats);
  }
  return termCount == 1 ? perTerm[0] : new MultiSimilarity.MultiSimScorer(perTerm);
}
 
Example 12
Source Project: lucene-solr   Source File: SimilarityBase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Copies collection-level and term-level statistics into {@code stats}: number of
 * documents, number of field tokens, average field length, document frequency and total
 * term frequency. Subclasses may override this to populate further statistics.
 *
 * @param stats           the object to fill
 * @param collectionStats statistics of the field
 * @param termStats       statistics of the term
 */
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
  // TODO: validate this for real, somewhere else
  assert termStats.totalTermFreq() <= collectionStats.sumTotalTermFreq();
  assert termStats.docFreq() <= collectionStats.sumDocFreq();

  final long documentCount = collectionStats.docCount();
  final long fieldTokenCount = collectionStats.sumTotalTermFreq();

  // TODO: add sumDocFreq for field (numberOfFieldPostings)
  stats.setNumberOfDocuments(documentCount);
  stats.setNumberOfFieldTokens(fieldTokenCount);
  // Cast to double so the average is not truncated.
  stats.setAvgFieldLength(fieldTokenCount / (double) documentCount);
  stats.setDocFreq(termStats.docFreq());
  stats.setTotalTermFreq(termStats.totalTermFreq());
}
 
Example 13
Source Project: lucene-solr   Source File: BM25Similarity.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates the BM25 scorer for the given statistics.
 *
 * <p>Precomputes a 256-entry table, indexed like {@code LENGTH_TABLE}, holding
 * {@code 1 / (k1 * ((1 - b) + b * length / avgdl))}.
 *
 * @param boost           the query boost
 * @param collectionStats statistics of the field being scored
 * @param termStats       statistics of the query term(s)
 * @return a {@code BM25Scorer} built from boost, k1, b, the idf explanation, the average
 *         field length and the precomputed table
 */
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  // Single terms and multi-term (array) inputs use different idfExplain overloads.
  final Explanation idf;
  if (termStats.length == 1) {
    idf = idfExplain(collectionStats, termStats[0]);
  } else {
    idf = idfExplain(collectionStats, termStats);
  }
  final float avgdl = avgFieldLength(collectionStats);

  final float[] normInverseCache = new float[256];
  for (int norm = 0; norm < normInverseCache.length; norm++) {
    normInverseCache[norm] = 1f / (k1 * ((1 - b) + b * LENGTH_TABLE[norm] / avgdl));
  }
  return new BM25Scorer(boost, k1, b, idf, avgdl, normInverseCache);
}
 
Example 14
Source Project: lucene-solr   Source File: MultiSimilarity.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Asks every similarity in {@code sims} for a scorer and combines them.
 *
 * @param boost           the query boost, forwarded unchanged
 * @param collectionStats statistics of the field, forwarded unchanged
 * @param termStats       statistics of the query term(s), forwarded unchanged
 * @return a {@code MultiSimScorer} over the scorers, in the order of {@code sims}
 */
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final int count = sims.length;
  final SimScorer[] perSimilarity = new SimScorer[count];
  for (int idx = 0; idx < count; idx++) {
    perSimilarity[idx] = sims[idx].scorer(boost, collectionStats, termStats);
  }
  return new MultiSimScorer(perSimilarity);
}
 
Example 15
Source Project: lucene-solr   Source File: LMSimilarity.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Fills the usual statistics via the superclass and additionally stores the collection
 * probability of the current term, as computed by {@code collectionModel}.
 *
 * @param stats           the object to fill; cast to {@code LMStats}
 * @param collectionStats statistics of the field
 * @param termStats       statistics of the term
 */
@Override
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
  super.fillBasicStats(stats, collectionStats, termStats);
  // The probability is computed from the stats that were just filled above.
  ((LMStats) stats).setCollectionProbability(collectionModel.computeProbability(stats));
}
 
Example 16
Source Project: lucene-solr   Source File: ClassicSimilarity.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Explains the idf of a single term, with the document frequency and the document count
 * of the field attached as sub-explanations.
 *
 * @param collectionStats statistics of the field
 * @param termStats       statistics of the term
 * @return the idf explanation
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long docFreq = termStats.docFreq();
  final long numDocsWithField = collectionStats.docCount();
  final float idfValue = idf(docFreq, numDocsWithField);

  final Explanation docFreqDetail =
      Explanation.match(docFreq, "docFreq, number of documents containing term");
  final Explanation docCountDetail =
      Explanation.match(numDocsWithField, "docCount, total number of documents with field");

  return Explanation.match(idfValue, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
      docFreqDetail, docCountDetail);
}
 
Example 17
Source Project: lucene-solr   Source File: TestMaxTermFrequency.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns a scorer that ignores all statistics and scores every document as 0.
 */
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final SimScorer constantZero = new SimScorer() {

    @Override
    public float score(float freq, long norm) {
      return 0f;
    }

  };
  return constantZero;
}
 
Example 18
Source Project: lucene-solr   Source File: LRUStatsCache.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the global statistics of {@code term}, keyed by {@code term.toString()}.
 *
 * <p>When no global entry exists the miss is logged, the term is recorded in
 * {@code missingTermStats}, the miss metric is incremented and the local statistics are
 * used instead.
 *
 * @param localSearcher fallback source of local statistics; may be {@code null}
 * @param term          the term to look up
 * @param docFreq       local document frequency, used only for the fallback
 * @param totalTermFreq local total term frequency, used only for the fallback
 * @return the global statistics if present, else the local ones, or {@code null} when
 *         global statistics are missing and {@code localSearcher} is {@code null}
 * @throws IOException if computing local statistics fails
 */
@Override
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq)
    throws IOException {
  final TermStats global = currentGlobalTermStats.get(term.toString());
  if (global != null) {
    return global.toTermStatistics();
  }

  log.debug("## Missing global termStats info: {}, using local", term);
  missingTermStats.add(term);
  metrics.missingGlobalTermStats.increment();
  if (localSearcher == null) {
    return null;
  }
  return localSearcher.localTermStatistics(term, docFreq, totalTermFreq);
}
 
Example 19
Source Project: lucene-solr   Source File: ExactStatsCache.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the cached global statistics of {@code term}, keyed by {@code term.toString()}.
 *
 * <p>On a miss the event is logged, the miss metric is incremented and the local
 * statistics are used instead.
 *
 * @param localSearcher fallback source of local statistics; may be {@code null}
 * @param term          the term to look up
 * @param docFreq       local document frequency, used only for the fallback
 * @param totalTermFreq local total term frequency, used only for the fallback
 * @return the global statistics if cached, else the local ones, or {@code null} when
 *         nothing is cached and {@code localSearcher} is {@code null}
 * @throws IOException if computing local statistics fails
 */
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq)
    throws IOException {
  final TermStats cached = termStatsCache.get(term.toString());
  if (cached != null) {
    return cached.toTermStatistics();
  }

  // A miss also happens for terms without any docFreq: per returnLocalStats, terms with
  // docFreq == 0 are never added. Not sure a warning is warranted here.
  log.debug("Missing global termStats info for term={}, using local stats", term);
  metrics.missingGlobalTermStats.increment();
  if (localSearcher == null) {
    return null;
  }
  return localSearcher.localTermStatistics(term, docFreq, totalTermFreq);
}
 
Example 20
Source Project: lucene-solr   Source File: StatsCache.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Records terms for which {@code statsSource} has no statistics, then defers to the
 * superclass for the actual result.
 *
 * <p>{@code statsSource} is queried with a {@code null} local searcher; a {@code null}
 * answer is reported to {@code missingTermStats} and counted in {@code missingTermsCount}.
 *
 * @param term          the term to look up
 * @param docFreq       document frequency of the term
 * @param totalTermFreq total term frequency of the term
 * @return whatever the superclass returns for the same arguments
 * @throws IOException if a lookup fails
 */
@Override
public TermStatistics termStatistics(Term term, int docFreq, long totalTermFreq) throws IOException {
  final boolean missingInSource = statsSource.termStatistics(null, term, docFreq, totalTermFreq) == null;
  if (missingInSource) {
    missingTermStats.accept(term);
    missingTermsCount++;
  }
  return super.termStatistics(term, docFreq, totalTermFreq);
}
 
Example 21
Source Project: lucene4ir   Source File: BM25Similarity.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Computes the BM25 weight for the given statistics.
 *
 * <p>Precomputes, for each of the 256 possible norm bytes, the frequency-independent
 * term {@code k1 * ((1 - b) + b * decodedLength / avgdl)}.
 *
 * @param collectionStats statistics of the field being scored
 * @param termStats       statistics of the query term(s)
 * @return a {@code BM25Stats} holding the field name, idf explanation, average field
 *         length and the precomputed table
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
  // Single terms and multi-term (array) inputs use different idfExplain overloads.
  final Explanation idf;
  if (termStats.length == 1) {
    idf = idfExplain(collectionStats, termStats[0]);
  } else {
    idf = idfExplain(collectionStats, termStats);
  }

  final float avgdl = avgFieldLength(collectionStats);

  // compute freq-independent part of bm25 equation across all norm values
  final float[] normCache = new float[256];
  for (int norm = 0; norm < normCache.length; norm++) {
    normCache[norm] = k1 * ((1 - b) + b * decodeNormValue((byte) norm) / avgdl);
  }
  return new BM25Stats(collectionStats.field(), idf, avgdl, normCache);
}
 
Example 22
Source Project: lucene4ir   Source File: OKAPIBM25Similarity.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Computes the query-time weight from collection and term statistics.
 *
 * <p>The document count {@code N} is {@code docCount()}; a value of -1 is treated as
 * "not available" and replaced by {@code maxDoc()}. With exactly one term the idf is
 * {@code idf(n, N)}, where {@code n} is the term's document frequency. Otherwise (the
 * phrase case) the idf starts at 1.0 and {@code idf(n, N)} is added for every term.
 *
 * <p>The average document length is {@code sumTotalTermFreq() / N}, computed in floating
 * point. Dividing the two integral values directly would truncate the average to a whole
 * number before it is stored as a float. As a consequence, {@code N == 0} now yields an
 * infinite or NaN average instead of an {@code ArithmeticException}.
 *
 * @param collectionStats statistics of the field being scored
 * @param termStats       statistics of the query term(s)
 * @return a {@code TFIDFWeight} carrying the field name, idf and average document length
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
                                     TermStatistics... termStats) {
  long numDocs = collectionStats.docCount();
  if (numDocs == -1) {
    numDocs = collectionStats.maxDoc();
  }

  // Force floating-point division so the fractional part of the average is kept.
  final float avdl = (float) (collectionStats.sumTotalTermFreq() / (double) numDocs);

  float idf_ = 1.0f;
  if (termStats.length == 1) {
    final long docFreq = termStats[0].docFreq();
    idf_ = idf(docFreq, numDocs);
  } else { /* computation for a phrase */
    for (final TermStatistics stat : termStats) {
      final long docFreq = stat.docFreq();
      idf_ += idf(docFreq, numDocs);
    }
  }

  return new TFIDFWeight(collectionStats.field(), idf_, avdl);
}
 
Example 23
Source Project: Elasticsearch   Source File: AggregatedDfs.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates an instance that stores the given maps and document count as-is (no copies
 * are made).
 *
 * @param termStatistics  statistics keyed by term
 * @param fieldStatistics collection statistics keyed by field name
 * @param maxDoc          the max doc value to store
 */
public AggregatedDfs(ObjectObjectHashMap<Term, TermStatistics> termStatistics,
                     ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics,
                     long maxDoc) {
    this.maxDoc = maxDoc;
    this.fieldStatistics = fieldStatistics;
    this.termStatistics = termStatistics;
}
 
Example 24
Source Project: Elasticsearch   Source File: AggregatedDfs.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the term statistics map held by this object (the stored instance, not a copy).
 *
 * @return statistics keyed by term
 */
public ObjectObjectHashMap<Term, TermStatistics> termStatistics() {
    return this.termStatistics;
}
 
Example 25
Source Project: Elasticsearch   Source File: DfsSearchResult.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Stores the given terms and their statistics on this result.
 *
 * @param terms          the terms
 * @param termStatistics the statistics for those terms
 * @return this instance, to allow call chaining
 */
public DfsSearchResult termsStatistics(Term[] terms, TermStatistics[] termStatistics) {
    this.termStatistics = termStatistics;
    this.terms = terms;
    return this;
}
 
Example 26
Source Project: Elasticsearch   Source File: DfsSearchResult.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the term statistics array held by this result (the stored array, not a copy).
 *
 * @return the term statistics
 */
public TermStatistics[] termStatistics() {
    return this.termStatistics;
}
 
Example 27
Source Project: Elasticsearch   Source File: DfsSearchResult.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Writes the array length followed by every entry, each via {@code writeSingleTermStats}.
 *
 * @param out            the stream to write to
 * @param termStatistics the statistics to write
 * @throws IOException if writing to the stream fails
 */
public static void writeTermStats(StreamOutput out, TermStatistics[] termStatistics) throws IOException {
    final int count = termStatistics.length;
    out.writeVInt(count);
    for (int idx = 0; idx < count; idx++) {
        writeSingleTermStats(out, termStatistics[idx]);
    }
}
 
Example 28
Source Project: Elasticsearch   Source File: DfsSearchResult.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Writes one entry: the document frequency, then the total term frequency passed
 * through {@code addOne}.
 *
 * @param out           the stream to write to
 * @param termStatistic the statistics to write; its document frequency is asserted to
 *                      be non-negative
 * @throws IOException if writing to the stream fails
 */
public static void writeSingleTermStats(StreamOutput out, TermStatistics termStatistic) throws IOException {
    final long docFreq = termStatistic.docFreq();
    assert docFreq >= 0;
    out.writeVLong(docFreq);

    // NOTE(review): addOne presumably lets a -1 "unknown" value fit a VLong - confirm.
    final long shiftedTotalTermFreq = addOne(termStatistic.totalTermFreq());
    out.writeVLong(shiftedTotalTermFreq);
}
 
Example 29
Source Project: Elasticsearch   Source File: TermVectorsFilter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the statistics for {@code term}.
 *
 * <p>When {@code dfs} is set, the statistics are looked up there. Otherwise they are
 * built from the current position of {@code termsEnum}.
 *
 * @param termsEnum enum positioned on the term; only used when {@code dfs} is {@code null}
 * @param term      the term to look up in {@code dfs}
 * @return the term statistics
 * @throws IOException if reading from {@code termsEnum} fails
 */
private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException {
    if (dfs == null) {
        return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq());
    }
    return dfs.termStatistics().get(term);
}
 
Example 30
Source Project: lucene-solr   Source File: TestPayloadScoreQuery.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns a fixed explanation with value 1.0, regardless of the supplied statistics.
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
  final float constantIdf = 1.0f;
  return Explanation.match(constantIdf, "Inexplicable");
}