Java Code Examples for org.apache.lucene.index.Terms#size()

The following examples show how to use org.apache.lucene.index.Terms#size(). Each example is taken from an open source project; the source file and project are noted above each one.
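Before working through the examples, it helps to keep the contract of Terms#size() in mind: it returns the number of unique terms in the field, or -1 if that count cannot be computed. Most of the examples below guard against the -1 case. Here is a minimal sketch of that guard (countUniqueTerms is an illustrative helper for this page, not part of any of the projects below):

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;

public class TermsSizeExample {
  /**
   * Sums the unique-term counts of a field across all index segments.
   * Returns -1 if any segment cannot report its term count.
   */
  public static long countUniqueTerms(IndexReader reader, String field) throws IOException {
    long total = 0;
    for (LeafReaderContext leaf : reader.leaves()) {
      Terms terms = leaf.reader().terms(field); // null if the segment has no such field
      if (terms == null) {
        continue;
      }
      long size = terms.size(); // -1 means the codec cannot report the count
      if (size == -1) {
        return -1;
      }
      total += size;
    }
    return total;
  }
}

Returning -1 rather than a partial total mirrors how Terms#size() itself signals an unknown count; Example 9 below uses the same convention.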
Example 1
Source File: DocToDoubleVectorUtils.java    From lucene-solr with Apache License 2.0
/**
 * create a sparse <code>Double</code> vector given doc and field term vectors using local frequency of the terms in the doc
 *
 * @param docTerms   term vectors for a given document
 * @param fieldTerms field term vectors
 * @return a sparse vector of <code>Double</code>s as an array
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException {
  TermsEnum fieldTermsEnum = fieldTerms.iterator();
  Double[] freqVector = null;
  if (docTerms != null && fieldTerms.size() > -1) {
    freqVector = new Double[(int) fieldTerms.size()];
    int i = 0;
    TermsEnum docTermsEnum = docTerms.iterator();
    BytesRef term;
    while ((term = fieldTermsEnum.next()) != null) {
      TermsEnum.SeekStatus seekStatus = docTermsEnum.seekCeil(term);
      if (seekStatus.equals(TermsEnum.SeekStatus.END)) {
        // the enum is exhausted, so reset it to allow seeking the remaining field terms
        docTermsEnum = docTerms.iterator();
      }
      if (seekStatus.equals(TermsEnum.SeekStatus.FOUND)) {
        long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document
        freqVector[i] = Long.valueOf(termFreqLocal).doubleValue();
      } else {
        freqVector[i] = 0d;
      }
      i++;
    }
  }
  return freqVector;
}
 
Example 2
Source File: DocToDoubleVectorUtils.java    From lucene-solr with Apache License 2.0
/**
 * create a dense <code>Double</code> vector given doc and field term vectors using local frequency of the terms in the doc
 *
 * @param docTerms term vectors for a given document
 * @return a dense vector of <code>Double</code>s as an array
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toDenseLocalFreqDoubleArray(Terms docTerms) throws IOException {
  Double[] freqVector = null;
  if (docTerms != null) {
    freqVector = new Double[(int) docTerms.size()];
    int i = 0;
    TermsEnum docTermsEnum = docTerms.iterator();

    while (docTermsEnum.next() != null) {
      long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document
      freqVector[i] = Long.valueOf(termFreqLocal).doubleValue();
      i++;
    }
  }
  return freqVector;
}
 
Example 3
Source File: GroupByOptimizedIterator.java    From crate with Apache License 2.0
static boolean hasHighCardinalityRatio(Supplier<Engine.Searcher> acquireSearcher, String fieldName) {
    // acquire separate searcher:
    // Can't use sharedShardContexts() yet, if we bail out the "getOrCreateContext" causes issues later on in the fallback logic
    try (Engine.Searcher searcher = acquireSearcher.get()) {
        for (LeafReaderContext leaf : searcher.reader().leaves()) {
            Terms terms = leaf.reader().terms(fieldName);
            if (terms == null) {
                return true;
            }
            double cardinalityRatio = terms.size() / (double) leaf.reader().numDocs();
            if (cardinalityRatio > CARDINALITY_RATIO_THRESHOLD) {
                return true;
            }
        }
    } catch (IOException e) {
        return true;
    }
    return false;
}
 
Example 4
Source File: WordScorer.java    From Elasticsearch with Apache License 2.0
public WordScorer(IndexReader reader, Terms terms, String field, double realWordLikelyHood, BytesRef separator) throws IOException {
    this.field = field;
    if (terms == null) {
        throw new IllegalArgumentException("Field: [" + field + "] does not exist");
    }
    this.terms = terms;
    final long vocSize = terms.getSumTotalTermFreq();
    this.vocabluarySize = vocSize == -1 ? reader.maxDoc() : vocSize;
    this.useTotalTermFreq = vocSize != -1;
    this.numTerms = terms.size();
    this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now
    this.reader = reader;
    this.realWordLikelyhood = realWordLikelyHood;
    this.separator = separator;
}
 
Example 5
Source File: TermVectorsResponse.java    From Elasticsearch with Apache License 2.0
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter) throws IOException {
    String fieldName = fieldIter.next();
    builder.startObject(fieldName);
    Terms curTerms = theFields.terms(fieldName);
    // write field statistics
    buildFieldStatistics(builder, curTerms);
    builder.startObject(FieldStrings.TERMS);
    TermsEnum termIter = curTerms.iterator();
    BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
    for (int i = 0; i < curTerms.size(); i++) {
        buildTerm(builder, spare, curTerms, termIter, boostAtt);
    }
    builder.endObject();
    builder.endObject();
}
 
Example 6
Source File: DocToDoubleVectorUtilsTest.java    From lucene-solr with Apache License 2.0
@Test
public void testSparseFreqDoubleArrayConversion() throws Exception {
  Terms fieldTerms = MultiTerms.getTerms(index, "text");
  if (fieldTerms != null && fieldTerms.size() != -1) {
    IndexSearcher indexSearcher = new IndexSearcher(index);
    for (ScoreDoc scoreDoc : indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE).scoreDocs) {
      Terms docTerms = index.getTermVector(scoreDoc.doc, "text");
      Double[] vector = DocToDoubleVectorUtils.toSparseLocalFreqDoubleArray(docTerms, fieldTerms);
      assertNotNull(vector);
      assertTrue(vector.length > 0);
    }
  }
}
 
Example 7
Source File: TestBlockPostingsFormat3.java    From lucene-solr with Apache License 2.0
/** 
 * checks collection-level statistics on Terms 
 */
public void assertTermsStatistics(Terms leftTerms, Terms rightTerms) throws Exception {
  assertEquals(leftTerms.getDocCount(), rightTerms.getDocCount());
  assertEquals(leftTerms.getSumDocFreq(), rightTerms.getSumDocFreq());
  if (leftTerms.hasFreqs() && rightTerms.hasFreqs()) {
    assertEquals(leftTerms.getSumTotalTermFreq(), rightTerms.getSumTotalTermFreq());
  }
  if (leftTerms.size() != -1 && rightTerms.size() != -1) {
    assertEquals(leftTerms.size(), rightTerms.size());
  }
}
 
Example 8
Source File: TermsSet.java    From lucene4ir with Apache License 2.0
private Set<String> getTerms(IndexReader ir) {
    Set<String> t = new HashSet<>();
    for (int i = 0; i < ir.leaves().size(); i++) {
        Terms termsList;
        try {
            // Get all the terms at this level of the tree.
            termsList = ir.leaves().get(i).reader().terms(Lucene4IRConstants.FIELD_ALL);
            if (termsList != null && termsList.size() > 0) {
                TermsEnum te = termsList.iterator();
                BytesRef termBytes;
                while ((termBytes = te.next()) != null) {
                    t.add(termBytes.utf8ToString());
                }
            }

            // Get all the terms at the next level of the tree.
            if (ir.leaves().get(i).children() != null && ir.leaves().get(i).children().size() > 0) {
                for (IndexReaderContext c : ir.leaves().get(i).children()) {
                    t.addAll(getTerms(c.reader()));
                }
            }

        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    return t;
}
 
Example 9
Source File: LookupBuilderReducer.java    From incubator-retired-blur with Apache License 2.0
private long getTotalNumberOfRowIds(DirectoryReader reader) throws IOException {
  long total = 0;
  List<AtomicReaderContext> leaves = reader.leaves();
  for (AtomicReaderContext context : leaves) {
    AtomicReader atomicReader = context.reader();
    Terms terms = atomicReader.terms(BlurConstants.ROW_ID);
    long expectedInsertions = terms.size();
    if (expectedInsertions < 0) {
      // size() returned -1: the term count is unknown, so report the total as unknown too
      return -1;
    }
    total += expectedInsertions;
  }
  return total;
}
 
Example 10
Source File: VectorScoreQuery.java    From solr-vector-scoring with Apache License 2.0
@Override
protected CustomScoreProvider getCustomScoreProvider(LeafReaderContext context) throws IOException {
  return new CustomScoreProvider(context) {
    @Override
    public float customScore(int docID, float subQueryScore, float valSrcScore) throws IOException {
      float score = 0;
      double docVectorNorm = 0;
      LeafReader reader = context.reader();
      Terms terms = reader.getTermVector(docID, field);
      if (vector.size() != terms.size()) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "indexed and input vector array must have same length");
      }
      TermsEnum iter = terms.iterator();
      BytesRef text;
      while ((text = iter.next()) != null) {
        String term = text.utf8ToString();
        float payloadValue = 0f;
        PostingsEnum postings = iter.postings(null, PostingsEnum.ALL);
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          int freq = postings.freq();
          while (freq-- > 0) postings.nextPosition();

          BytesRef payload = postings.getPayload();
          payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);

          if (cosine) {
            docVectorNorm += Math.pow(payloadValue, 2.0);
          }
        }

        score = (float) (score + payloadValue * (vector.get(Integer.parseInt(term))));
      }

      if (cosine) {
        if ((docVectorNorm == 0) || (queryVectorNorm == 0)) return 0f;
        return (float) (score / (Math.sqrt(docVectorNorm) * Math.sqrt(queryVectorNorm)));
      }

      return score;
    }
  };
}
 
Example 11
Source File: PhraseWildcardQuery.java    From lucene-solr with Apache License 2.0
private long getTermsSize(LeafReaderContext leafReaderContext) throws IOException {
  Terms terms = leafReaderContext.reader().terms(field);
  return terms == null ? 0 : terms.size();
}
 
Example 12
Source File: FieldCacheImpl.java    From lucene-solr with Apache License 2.0
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
    throws IOException {

  final int maxDoc = reader.maxDoc();

  Terms terms = reader.terms(key.field);

  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

  final PagedBytes bytes = new PagedBytes(15);

  int startTermsBPV;

  // TODO: use Uninvert?
  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }

      startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
    } else {
      startTermsBPV = 1;
    }
  } else {
    startTermsBPV = 1;
  }

  PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
  final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);

  int termOrd = 0;

  // TODO: use Uninvert?

  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;

    while(true) {
      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      if (termOrd >= maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }

      termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // Store 1+ ord into packed bits
        docToTermOrd.set(docID, 1+termOrd);
      }
      termOrd++;
    }
  }

  // maybe an int-only impl?
  return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd);
}
 
Example 13
Source File: FieldCacheImpl.java    From lucene-solr with Apache License 2.0
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
    throws IOException {

  // TODO: would be nice to first check if DocTermsIndex
  // was already cached for this field and then return
  // that instead, to avoid insanity

  final int maxDoc = reader.maxDoc();
  Terms terms = reader.terms(key.field);

  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

  final int termCountHardLimit = maxDoc;

  // Holds the actual term data, expanded.
  final PagedBytes bytes = new PagedBytes(15);

  int startBPV;

  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > termCountHardLimit) {
        numUniqueTerms = termCountHardLimit;
      }
      startBPV = PackedInts.bitsRequired(numUniqueTerms*4);
    } else {
      startBPV = 1;
    }
  } else {
    startBPV = 1;
  }

  final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);
  
  // pointer==0 means not set
  bytes.copyUsingLengthPrefix(new BytesRef());

  if (terms != null) {
    int termCount = 0;
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while(true) {
      if (termCount++ == termCountHardLimit) {
        // app is misusing the API (there is more than
        // one term per doc); in this case we make best
        // effort to load what we can (see LUCENE-2142)
        break;
      }

      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      final long pointer = bytes.copyUsingLengthPrefix(term);
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        docToOffset.set(docID, pointer);
      }
    }
  }

  final PackedInts.Reader offsetReader = docToOffset.getMutable();
  Bits docsWithField = new Bits() {
    @Override
    public boolean get(int index) {
      return offsetReader.get(index) != 0;
    }

    @Override
    public int length() {
      return maxDoc;
    }
  };

  wrapper.setDocsWithField(reader, key.field, docsWithField, null);
  // maybe an int-only impl?
  return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader, docsWithField);
}