Java Code Examples for org.apache.lucene.index.Terms#iterator()

The following examples show how to use org.apache.lucene.index.Terms#iterator(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File:    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Create a {@link DisjunctionMatchesIterator} over a list of terms extracted from a {@link BytesRefIterator}.
 * Only terms that have at least one match in the given document will be included.
 */
static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException {
  Terms t = context.reader().terms(field);
  if (t == null)
    return null;
  TermsEnum te = t.iterator();
  PostingsEnum reuse = null;
  for (BytesRef term =; term != null; term = {
    if (te.seekExact(term)) {
      PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
      if (pe.advance(doc) == doc) {
        return new TermsEnumDisjunctionMatchesIterator(new TermMatchesIterator(query, pe), terms, te, doc, query);
      else {
        reuse = pe;
  return null;
Example 2
Source File:    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void write(Fields fields, NormsProducer norms) throws IOException {

  for(String field : fields) {

    Terms terms = fields.terms(field);
    if (terms == null) {

    TermsEnum termsEnum = terms.iterator();

    TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));

    while (true) {
      BytesRef term =;
      if (term == null) {

      termsWriter.write(term, termsEnum, norms);

Example 3
Source File:    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Calculate probabilities for all classes for a given input text.
 *
 * @param inputDocument the input text as a {@code String}
 * @return a {@code List} of {@code ClassificationResult}, one for each existing class
 * @throws IOException if assigning probabilities fails
 */
private List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
  List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();

  Terms classes = MultiTerms.getTerms(indexReader, classFieldName);
  TermsEnum classesEnum = classes.iterator();
  BytesRef next;
  String[] tokenizedText = tokenize(inputDocument);
  while ((next = != null) {
    if (next.length > 0) {
      Term term = new Term(this.classFieldName, next);
      assignedClasses.add(new ClassificationResult<>(term.bytes(), calculateLogPrior(term) + calculateLogLikelihood(tokenizedText, term)));

  return normClassificationResults(assignedClasses);
Example 4
Source File:    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the enum over {@code terms} and initialises the {@code positioned} / {@code curr} state.
 *
 * <p>When {@code terms} is {@code null}, only {@code positioned} is set to {@code true}.
 * Otherwise the terms iterator is stored in {@code te} and, if {@code lower} is non-null,
 * {@code te.seekCeil(lower)} is called and its status decides the initial state.
 *
 * <p>NOTE(review): this listing appears truncated — the closing braces and whatever happens
 * when {@code lower} is {@code null} are not visible here; confirm against the full source.
 *
 * @param terms the terms to iterate over; may be {@code null}
 * @throws IOException declared by the signature; presumably raised by the iterator/seek calls
 */
public RangeTermsEnum(Terms terms) throws IOException {
  if (terms == null) {
    // no terms at all: mark as positioned without creating an iterator
    positioned = true;
  } else {
    te = terms.iterator();
    if (lower != null) {
      TermsEnum.SeekStatus status = te.seekCeil(lower);
      if (status == TermsEnum.SeekStatus.END) {
        // seek ran off the end: there is no current term
        positioned = true;
        curr = null;
      } else if (status == SeekStatus.FOUND) {
        // landed exactly on the lower bound: includeLower() decides whether we count as positioned
        positioned = includeLower();
        curr = te.term();
      } else {
        // lower bound not found, so includeLower is irrelevant
        positioned = true;
        curr = te.term();
Example 5
Source File:    From linden with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a per-reader map from term string to document id for {@code key.field}.
 *
 * <p>An anonymous {@code Uninvert} remembers the term most recently passed to
 * {@code visitTerm} and stores {@code (term, docID)} for every {@code visitDoc} call.
 * The resulting map is wrapped in a {@code PerReaderUIDMaps} together with the
 * reader context's {@code ord}.
 *
 * <p>NOTE(review): the order in which {@code Uninvert} invokes the callbacks is not visible
 * here; presumably {@code visitTerm} precedes the {@code visitDoc} calls for that term.
 * The listing has also lost its closing braces.
 *
 * @param reader reader whose field is uninverted
 * @param key cache key; only {@code key.field} is read here
 * @param setDocsWithField passed through unchanged to {@code uninvert}
 * @return a new {@code PerReaderUIDMaps} holding the collected map
 * @throws IOException declared by the signature; presumably raised while uninverting
 */
protected Accountable createValue(final AtomicReader reader, CacheKey key, boolean setDocsWithField)
    throws IOException {
  final Map<String, Integer> uidMap = new HashMap<>();

  Uninvert u = new Uninvert() {
    // UTF-8 decoded form of the term most recently passed to visitTerm
    private String currentValue;

    public void visitTerm(BytesRef term) {
      currentValue = term.utf8ToString();

    public void visitDoc(int docID) {
      // Map.put overwrites: if a term is reported for several docs, the last docID wins
      uidMap.put(currentValue, docID);

    protected TermsEnum termsEnum(Terms terms) throws IOException {
      return terms.iterator(null);
  u.uninvert(reader, key.field, setDocsWithField);
  return new PerReaderUIDMaps(reader.getContext().ord, uidMap);
Example 6
Source File:    From jate with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Retrieve term candidates from a Solr field.
 *      see @code {}
 * The method assumes that the term candidates are extracted at index time and stored in a pre-configured field.
 *
 * @return Set, a set of term candidate surface forms
 * @throws JATEException
 * @throws IOException
 */
protected Set<String> getUniqueTerms() throws JATEException, IOException {
    Terms terms =SolrUtil.getTermVector(properties.getSolrFieldNameJATECTerms(),solrIndexSearcher);

    /*TermsEnum source = terms.iterator();
    String term = //"thrownawayorusedjustforelementarystatistical profile";
    "l hierar hy";

            if (source.seekExact(new BytesRef(term.getBytes("UTF-8")))) {
                PostingsEnum docEnum = source.postings(null);
                int doc = 0;
                while ((doc = docEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
                    int tfid = docEnum.freq();  //tf in document


            } else {


    TermsEnum termsEnum = terms.iterator();
    Set<String> allTermCandidates = new HashSet<>();

    while ( != null) {
        BytesRef t = termsEnum.term();
        if (t.length == 0)
    return allTermCandidates;
Example 7
Source File:    From jate with GNU Lesser General Public License v3.0 5 votes vote down vote up
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
    List<MWESentenceContext> result = new ArrayList<>();

    TermsEnum tiRef= termVectorLookup.iterator();
    BytesRef luceneTerm =;
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm =;
        String tString = luceneTerm.utf8ToString();
        if(!allCandidates.contains(tString)) {

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload=postingsEnum.getPayload();
                int sentenceId=-1;
                    sentenceId=new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
                result.add(new MWESentenceContext(tString,sentenceId, start, end));
        luceneTerm =;
    return result;
Example 8
Source File:    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Refreshes {@code currentReaderPostingsValues} for the segment described by {@code context}.
 *
 * <p>The field is set to {@code null} when the segment has no terms for {@code field} or when
 * {@code featureName} cannot be found by {@code seekExact}; otherwise it is set to the
 * postings of {@code featureName}, requested with {@code PostingsEnum.FREQS}.
 *
 * <p>NOTE(review): the closing braces of this listing appear to have been lost in extraction.
 *
 * @param context the leaf reader context to read terms from
 * @throws IOException declared by the signature; presumably raised by the index reads
 */
protected void doSetNextReader(LeafReaderContext context) throws IOException {
  Terms terms = context.reader().terms(field);
  if (terms == null) {
    currentReaderPostingsValues = null;
  } else {
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(featureName) == false) {
      currentReaderPostingsValues = null;
    } else {
      // the previous value is handed back in as the first (reuse) argument
      currentReaderPostingsValues = termsEnum.postings(currentReaderPostingsValues, PostingsEnum.FREQS);
Example 9
Source File:    From incubator-retired-blur with Apache License 2.0 5 votes vote down vote up
private int getTermWithSeekCount(Fields fields, String field) throws IOException {
  Terms terms = fields.terms(field);
  TermsEnum termsEnum = terms.iterator(null);
  SeekStatus seekStatus = termsEnum.seekCeil(new BytesRef(""));
  if (seekStatus == SeekStatus.END) {
    return 0;
  int count = 1;
  while ( != null) {
  return count;
Example 10
Source File:    From incubator-retired-blur with Apache License 2.0 5 votes vote down vote up
public Boolean execute(IndexContext context) throws IOException, InterruptedException {
  try {
    IndexReader indexReader = context.getIndexReader();
    while (true) {
      long hash = 0;
      for (AtomicReaderContext atomicReaderContext : indexReader.leaves()) {
        AtomicReader reader = atomicReaderContext.reader();
        for (String field : reader.fields()) {
          Terms terms = reader.terms(field);
          BytesRef bytesRef;
          TermsEnum iterator = terms.iterator(null);
          while ((bytesRef = != null) {
            hash += bytesRef.hashCode();
      System.out.println("hashcode = " + hash);
  } catch (IOException e) {
    throw e;
  } catch (Throwable t) {
    if (t instanceof InterruptedException) {
      throw t;
    } else if (t instanceof RuntimeException) {
      throw (RuntimeException) t;
    throw new RuntimeException(t);
Example 11
Source File:    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  TokenStream tokenStream = analyzer.tokenStream("field", "abcd   ");
  TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
  TokenStream sink = tee.newSinkTokenStream();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  Field f1 = new Field("field", tee, ft);
  Field f2 = new Field("field", sink, ft);

  IndexReader r =;
  Terms vector = r.getTermVectors(0).terms("field");
  assertEquals(1, vector.size());
  TermsEnum termsEnum = vector.iterator();;
  assertEquals(2, termsEnum.totalTermFreq());
  PostingsEnum positions = termsEnum.postings(null, PostingsEnum.ALL);
  assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(2, positions.freq());
  assertEquals(0, positions.startOffset());
  assertEquals(4, positions.endOffset());
  assertEquals(8, positions.startOffset());
  assertEquals(12, positions.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
Example 12
Source File:    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter) throws IOException {
    String fieldName =;
    Terms curTerms = theFields.terms(fieldName);
    // write field statistics
    buildFieldStatistics(builder, curTerms);
    TermsEnum termIter = curTerms.iterator();
    BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
    for (int i = 0; i < curTerms.size(); i++) {
        buildTerm(builder, spare, curTerms, termIter, boostAtt);
Example 13
Source File:    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Create the coefficient to transform the weight.
 *
 * @param doc id of the document
 * @param matchedTokens tokens found in the query
 * @param prefixToken unfinished token in the query
 * @return the coefficient
 * @throws IOException If there are problems reading term vectors from the underlying Lucene index.
 */
private double createCoefficient(IndexSearcher searcher, int doc, Set<String> matchedTokens, String prefixToken) throws IOException {

  Terms tv = searcher.getIndexReader().getTermVector(doc, TEXT_FIELD_NAME);
  TermsEnum it = tv.iterator();

  Integer position = Integer.MAX_VALUE;
  BytesRef term;
  // find the closest token position
  while ((term = != null) {

    String docTerm = term.utf8ToString();

    if (matchedTokens.contains(docTerm) || (prefixToken != null && docTerm.startsWith(prefixToken))) {
      PostingsEnum docPosEnum = it.postings(null, PostingsEnum.OFFSETS);

      // use the first occurrence of the term
      int p = docPosEnum.nextPosition();
      if (p < position) {
        position = p;

  // create corresponding coefficient based on position
  return calculateCoefficient(position);
Example 14
Source File:    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public void testPerformance() throws Exception {
  MockAnalyzer analyzer = new MockAnalyzer(random());
  int numDocs = atLeast(10);
  LeafReader leafReader = getRandomIndex(analyzer, numDocs);
  try {
    Classifier<BytesRef> classifier = new KNearestFuzzyClassifier(leafReader, null, analyzer, null, 3, categoryFieldName, textFieldName);

    ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
        classifier, categoryFieldName, textFieldName, -1);

    double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
    assertTrue(avgClassificationTime >= 0);

    double accuracy = confusionMatrix.getAccuracy();
    assertTrue(accuracy >= 0d);
    assertTrue(accuracy <= 1d);

    double recall = confusionMatrix.getRecall();
    assertTrue(recall >= 0d);
    assertTrue(recall <= 1d);

    double precision = confusionMatrix.getPrecision();
    assertTrue(precision >= 0d);
    assertTrue(precision <= 1d);

    Terms terms = MultiTerms.getTerms(leafReader, categoryFieldName);
    TermsEnum iterator = terms.iterator();
    BytesRef term;
    while ((term = != null) {
      String s = term.utf8ToString();
      recall = confusionMatrix.getRecall(s);
      assertTrue(recall >= 0d);
      assertTrue(recall <= 1d);
      precision = confusionMatrix.getPrecision(s);
      assertTrue(precision >= 0d);
      assertTrue(precision <= 1d);
      double f1Measure = confusionMatrix.getF1Measure(s);
      assertTrue(f1Measure >= 0d);
      assertTrue(f1Measure <= 1d);
  } finally {
Example 15
Source File:    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Returns a {@code SimplePrefixTermsEnum} built from {@code terms.iterator()} and {@code prefix}.
 *
 * <p>NOTE(review): the closing brace of this listing appears to have been lost in extraction.
 *
 * @param terms the terms whose iterator is wrapped
 * @param atts not used in the visible body
 * @return the wrapping terms enum
 * @throws IOException declared by the signature; presumably raised when obtaining the iterator
 */
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  return new SimplePrefixTermsEnum(terms.iterator(), prefix);
Example 16
Source File:    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public void finish() throws IOException {
  NamedList<Double> analytics = new NamedList<Double>();
  @SuppressWarnings({"unchecked", "rawtypes"})
  NamedList<Integer> topFreq = new NamedList();

  @SuppressWarnings({"unchecked", "rawtypes"})
  NamedList<Integer> allFreq = new NamedList();

  rb.rsp.add("featuredTerms", analytics);
  rb.rsp.add("docFreq", topFreq);
  rb.rsp.add("numDocs", count);

  TreeSet<TermWithScore> topTerms = new TreeSet<>();

  double numDocs = count;
  double pc = numPositiveDocs / numDocs;
  double entropyC = binaryEntropy(pc);

  Terms terms = ((SolrIndexSearcher)searcher).getSlowAtomicReader().terms(field);
  TermsEnum termsEnum = terms == null ? TermsEnum.EMPTY : terms.iterator();
  BytesRef term;
  PostingsEnum postingsEnum = null;
  while ((term = != null) {
    postingsEnum = termsEnum.postings(postingsEnum);
    int xc = 0;
    int nc = 0;
    while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      if (positiveSet.get(postingsEnum.docID())) {
      } else if (negativeSet.get(postingsEnum.docID())) {

    int docFreq = xc+nc;

    double entropyContainsTerm = binaryEntropy( (double) xc / docFreq );
    double entropyNotContainsTerm = binaryEntropy( (double) (numPositiveDocs - xc) / (numDocs - docFreq + 1) );
    double score = entropyC - ( (docFreq / numDocs) * entropyContainsTerm + (1.0 - docFreq / numDocs) * entropyNotContainsTerm);

    topFreq.add(term.utf8ToString(), docFreq);
    if (topTerms.size() < numTerms) {
      topTerms.add(new TermWithScore(term.utf8ToString(), score));
    } else  {
      if (topTerms.first().score < score) {
        topTerms.add(new TermWithScore(term.utf8ToString(), score));

  for (TermWithScore topTerm : topTerms) {
    analytics.add(topTerm.term, topTerm.score);
    topFreq.add(topTerm.term, allFreq.get(topTerm.term));

  if (this.delegate instanceof DelegatingCollector) {
    ((DelegatingCollector) this.delegate).finish();
Example 17
Source File:    From SearchServices with GNU Lesser General Public License v3.0 4 votes vote down vote up
private static void getDetailedFieldInfo(SolrQueryRequest req,
		String field, SimpleOrderedMap<Object> fieldMap) throws IOException {

	SolrParams params = req.getParams();
	final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);

	TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to
														// collect the top N
														// terms in.

	final CharsRefBuilder spare = new CharsRefBuilder();

	Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(),
	if (terms == null) { // field does not exist
	TermsEnum termsEnum = terms.iterator();
	BytesRef text;
	int[] buckets = new int[HIST_ARRAY_SIZE];
	while ((text = != null) {
		int freq = termsEnum.docFreq(); // This calculation seems odd, but
										// it gives the same results as it
										// used to.
		int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
		buckets[slot] = buckets[slot] + 1;
		if (numTerms > 0 && freq > tiq.minFreq) {
			String t = spare.toString();

			tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum
			if (tiq.size() > numTerms) { // if tiq full
				tiq.pop(); // remove lowest in tiq
				tiq.minFreq = tiq.getTopTermInfo().docFreq;
	fieldMap.add("distinct", tiq.distinctTerms);

	// Include top terms
	fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

	// Add a histogram
	fieldMap.add("histogram", tiq.histogram.toNamedList());
Example 18
Source File:    From stratio-cassandra with Apache License 2.0 4 votes vote down vote up
/**
 * Returns a {@code FullKeyDataRangeFilteredTermsEnum} wrapping {@code terms.iterator(null)}.
 *
 * <p>NOTE(review): the closing brace of this listing appears to have been lost in extraction.
 *
 * @param terms the terms whose iterator is wrapped
 * @param atts not used in the visible body
 * @return the filtering terms enum
 * @throws IOException declared by the signature; presumably raised when obtaining the iterator
 */
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
    // null is passed as the iterator's argument (no existing enum supplied)
    return new FullKeyDataRangeFilteredTermsEnum(terms.iterator(null));
Example 19
Source File:    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public void testPerformance() throws Exception {
  MockAnalyzer analyzer = new MockAnalyzer(random());
  int numDocs = atLeast(10);
  LeafReader leafReader = getRandomIndex(analyzer,  numDocs);
  try {
    CachingNaiveBayesClassifier simpleNaiveBayesClassifier = new CachingNaiveBayesClassifier(leafReader,
        analyzer, null, categoryFieldName, textFieldName);

    ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
        simpleNaiveBayesClassifier, categoryFieldName, textFieldName, -1);

    double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
    assertTrue(avgClassificationTime >= 0);
    double accuracy = confusionMatrix.getAccuracy();
    assertTrue(accuracy >= 0d);
    assertTrue(accuracy <= 1d);

    double recall = confusionMatrix.getRecall();
    assertTrue(recall >= 0d);
    assertTrue(recall <= 1d);

    double precision = confusionMatrix.getPrecision();
    assertTrue(precision >= 0d);
    assertTrue(precision <= 1d);

    Terms terms = MultiTerms.getTerms(leafReader, categoryFieldName);
    TermsEnum iterator = terms.iterator();
    BytesRef term;
    while ((term = != null) {
      String s = term.utf8ToString();
      recall = confusionMatrix.getRecall(s);
      assertTrue(recall >= 0d);
      assertTrue(recall <= 1d);
      precision = confusionMatrix.getPrecision(s);
      assertTrue(precision >= 0d);
      assertTrue(precision <= 1d);
      double f1Measure = confusionMatrix.getF1Measure(s);
      assertTrue(f1Measure >= 0d);
      assertTrue(f1Measure <= 1d);
  } finally {

Example 20
Source File:    From Siamese with GNU General Public License v3.0 4 votes vote down vote up
private static void analyseTermFreq(String indexName, String field, String freqType, String outputFileName) {
        String indexFile = elasticsearchLoc + "/data/stackoverflow/nodes/0/indices/"
                + indexName + "/0/index";
        DecimalFormat df = new DecimalFormat("#.00");
        int printEvery = 100000;
        File outputFile = new File(outputFileName);
        if (outputFile.exists()) {
            if (!outputFile.delete()) {
                System.out.println("ERROR: cannot delete the output file.");
        /* adapted from
        int count = 0;
        try {
            IndexReader reader =;
            Fields fields = MultiFields.getFields(reader);
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            int size = 0;
            // TODO: is there a better solution?
            // iterate to get the size
            while ( != null) {
//            String[] termArr = new String[size];
            long[] freqArr = new long[size];
            // do the real work
            termsEnum = terms.iterator();
            while ( != null) {
//                String term = termsEnum.term().utf8ToString();
                long tfreq = 0;
                if (freqType.equals("tf"))
                    tfreq = termsEnum.totalTermFreq();
                else if (freqType.equals("df"))
                    tfreq = termsEnum.docFreq();
                else {
                    System.out.println("Wrong frequency. Quit!");
//                termArr[count] = term;
                freqArr[count] = tfreq;
                if (count % printEvery == 0) {
                    System.out.println("processed: " + count + " terms "
                            + " [" + df.format(((long)count * 100)/size) + "%]");
            System.out.println(field + ": total = " + count);
            double[] data = new double[size];
            String output = "freq\n";
            for (int i = 0; i < freqArr.length; i++) {
                data[i] = freqArr[i];
                output += freqArr[i] + "\n";
                if (i > 0 && i % printEvery == 0) {
                    MyUtils.writeToFile("./", outputFileName, output, true);
                    System.out.println("written: " + i + " terms "
                            + " [" + df.format(((long)i * 100)/size) + "%]");
                    output = "";
            // write the rest to the file
            MyUtils.writeToFile("./",outputFileName, output, true);
        } catch (IOException e) {