Java Code Examples for org.apache.lucene.index.TermsEnum#next()

The following examples show how to use org.apache.lucene.index.TermsEnum#next(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: QueryAutoStopWordAnalyzer.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Builds a QueryAutoStopWordAnalyzer whose per-field stopword sets contain
 * every term of the selected fields whose document frequency is strictly
 * greater than {@code maxDocFreq}. A field without terms is registered with
 * an empty stopword set.
 *
 * @param delegate Analyzer whose TokenStream will be filtered
 * @param indexReader IndexReader the stopwords are derived from
 * @param fields fields for which stopwords are computed
 * @param maxDocFreq document frequency a term must exceed to become a stopword
 * @throws IOException if reading from the IndexReader fails
 */
public QueryAutoStopWordAnalyzer(
    Analyzer delegate,
    IndexReader indexReader,
    Collection<String> fields,
    int maxDocFreq) throws IOException {
  super(delegate.getReuseStrategy());
  this.delegate = delegate;

  for (String fieldName : fields) {
    final Set<String> frequentTerms = new HashSet<>();
    final Terms fieldTerms = MultiTerms.getTerms(indexReader, fieldName);
    final CharsRefBuilder utf16 = new CharsRefBuilder();
    if (fieldTerms != null) {
      final TermsEnum cursor = fieldTerms.iterator();
      for (BytesRef bytes = cursor.next(); bytes != null; bytes = cursor.next()) {
        // Terms at or below the threshold are not stopwords.
        if (cursor.docFreq() <= maxDocFreq) {
          continue;
        }
        utf16.copyUTF8Bytes(bytes);
        frequentTerms.add(utf16.toString());
      }
    }
    stopWordsPerField.put(fieldName, frequentTerms);
  }
}

Example 2

Source File: LuceneIndexCorpus.java From word2vec-lucene with Apache License 2.0

6 votes

/**
 * Learns the vocabulary from the terms indexed in the configured field.
 *
 * <p>First runs a range query spanning the field's smallest and largest term
 * and stores the result in {@code topDocs}; then adds every term of the field
 * to the vocabulary and records its total term frequency as the word count.
 *
 * <p>A field without any indexed terms is handled gracefully: the range query
 * is unbounded (and matches nothing) and no words are added.
 *
 * @throws IOException if the index cannot be read
 */
@Override
public void learnVocab() throws IOException {
  super.learnVocab();

  final String field = ((LuceneIndexConfig) config).getField();
  // getTerms yields null when the field has no indexed terms at all.
  final Terms terms = MultiFields.getTerms(reader, field);
  final BytesRef minTerm = terms == null ? null : terms.getMin();
  final BytesRef maxTerm = terms == null ? null : terms.getMax();
  Query q = new TermRangeQuery(field, minTerm, maxTerm, true, true);
  IndexSearcher searcher = new IndexSearcher(reader);
  topDocs = searcher.search(q, Integer.MAX_VALUE);

  if (terms == null) {
    return;
  }

  // A fresh enum is positioned before the first term, so next() visits every
  // term in order. Unlike seekCeil() followed by term(), this never reads from
  // an unpositioned enum when the field turns out to be empty.
  final TermsEnum termsEnum = terms.iterator(null);
  for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
    int p = addWordToVocab(term.utf8ToString());
    vocab[p].setCn((int) termsEnum.totalTermFreq());
  }
}

Example 3

Source File: MultiTermIntervalsSource.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Builds one interval iterator per term accepted by the automaton in the
 * given field and returns their disjunction.
 *
 * @param field field whose terms are expanded
 * @param ctx leaf reader context to read the terms from
 * @return the disjunction iterator, or {@code null} when the field has no
 *         terms in this leaf or no term is accepted
 * @throws IOException if the terms cannot be read
 * @throws IllegalStateException if more than {@code maxExpansions} terms are accepted
 */
@Override
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
  final Terms fieldTerms = ctx.reader().terms(field);
  if (fieldTerms == null) {
    return null;
  }
  final List<IntervalIterator> perTerm = new ArrayList<>();
  final TermsEnum matching = automaton.getTermsEnum(fieldTerms);
  int expanded = 0;
  for (BytesRef current = matching.next(); current != null; current = matching.next()) {
    perTerm.add(TermIntervalsSource.intervals(current, matching));
    expanded++;
    if (expanded > maxExpansions) {
      throw new IllegalStateException("Automaton [" + this.pattern + "] expanded to too many terms (limit " + maxExpansions + ")");
    }
  }
  return perTerm.isEmpty()
      ? null
      : new DisjunctionIntervalsSource.DisjunctionIntervalIterator(perTerm);
}

Example 4

Source File: ReconstructCommand.java From clue with Apache License 2.0

6 votes

/**
 * Rebuilds the text of a document from a terms enumeration, without
 * positional information.
 *
 * <p>Every remaining term of {@code te} whose postings contain {@code docid}
 * is appended to the result in enumeration order, each followed by a single
 * space (so a non-empty result ends with a space).
 *
 * @param te terms to scan; iteration continues from its current position
 * @param docid document whose terms are collected
 * @param liveDocs not consulted by this method
 * @return the space-separated terms, or an empty string if none match
 * @throws IOException if the postings cannot be read
 */
public String reconstructNoPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException{
  StringBuilder buf = new StringBuilder();
  PostingsEnum postings = null;
  BytesRef text;
  while ((text = te.next()) != null) {
    // Only document membership is tested, so no frequencies are requested.
    postings = te.postings(postings, PostingsEnum.NONE);
    if (postings.advance(docid) == docid) {
      buf.append(text.utf8ToString()).append(' ');
    }
  }
  return buf.toString();
}

Example 5

Source File: ReadmeSimilarityCalculator.java From scava with Eclipse Public License 2.0

6 votes

/**
 * Collects every distinct term found in the {@code FIELD_CONTENT} term
 * vectors of all documents and assigns each a unique position.
 *
 * <p>Positions are consecutive integers starting at 0, handed out in the
 * iteration order of the returned map. Documents that have no term vector
 * for the field are skipped.
 *
 * @return map from term text to its assigned position
 * @throws IOException if a term vector cannot be read
 */
private HashMap<String, Integer> getAllTerms() throws IOException {
	HashMap<String, Integer> allTerms = new HashMap<>();
	for (int docId = 0; docId < getTotalDocumentInIndex(); docId++) {
		Terms vector = getIndexReader().getTermVector(docId, FIELD_CONTENT);
		if (vector == null) {
			// No term vector stored for this document and field.
			continue;
		}
		TermsEnum termsEnum = vector.iterator();
		BytesRef text;
		while ((text = termsEnum.next()) != null) {
			// Placeholder value; the real position is assigned below.
			allTerms.put(text.utf8ToString(), 0);
		}
	}

	// Assign the final positions.
	int pos = 0;
	for (Entry<String, Integer> s : allTerms.entrySet()) {
		s.setValue(pos++);
	}
	return allTerms;
}

Example 6

Source File: HashTermStatistics.java From liresolr with GNU General Public License v2.0

6 votes

/**
 * Collects the document frequency of every term of {@code field} and
 * registers the result in {@code termstats}, unless statistics for that
 * field are already present.
 *
 * <p>The map is registered only after it has been completely filled, so a
 * failure while reading the index never leaves a partial entry behind that
 * later calls would mistake for complete statistics. A field without terms
 * is registered with an empty map. No locking is performed here.
 *
 * @param searcher searcher whose index is read
 * @param field field to collect statistics for
 * @throws IOException if the terms cannot be read
 */
public static void addToStatistics(SolrIndexSearcher searcher, String field) throws IOException {
    // Nothing to do when this field is already in the stats.
    if (termstats.get(field) != null) {
        return;
    }

    Terms terms = searcher.getSlowAtomicReader().terms(field);
    HashMap<String, Integer> term2docFreq = new HashMap<String, Integer>(1000);
    if (terms != null) {
        TermsEnum termsEnum = terms.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            term2docFreq.put(term.utf8ToString(), termsEnum.docFreq());
        }
    }
    // Publish only the fully populated map.
    termstats.put(field, term2docFreq);
}

Example 7

Source File: BlockTermsWriter.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Writes the terms of every field: for each field that has terms, a terms
 * writer is obtained via {@code addField}, every term is passed to it in
 * enumeration order, and the writer is then finished. Fields whose terms are
 * {@code null} are skipped.
 *
 * @param fields fields to write
 * @param norms norms producer handed through to the terms writer
 * @throws IOException if writing fails
 */
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
  for (String fieldName : fields) {
    final Terms fieldTerms = fields.terms(fieldName);
    if (fieldTerms != null) {
      final TermsEnum cursor = fieldTerms.iterator();
      final TermsWriter fieldWriter = addField(fieldInfos.fieldInfo(fieldName));

      for (BytesRef current = cursor.next(); current != null; current = cursor.next()) {
        fieldWriter.write(current, cursor, norms);
      }

      fieldWriter.finish();
    }
  }
}

Example 8

Source File: TestPhraseWildcardQuery.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Expands a wildcard term into the distinct matching terms found across all
 * leaves of the reader, stopping as soon as {@code maxExpansions} distinct
 * terms have been collected.
 *
 * @param field field to search
 * @param term wildcard pattern
 * @param maxExpansions upper bound on collected terms; 0 yields an empty array
 * @return the collected terms, in no particular order
 * @throws IOException if the terms cannot be read
 */
protected Term[] expandMultiTerm(String field, String term, int maxExpansions) throws IOException {
  if (maxExpansions == 0) {
    return new Term[0];
  }
  final Set<Term> collected = new HashSet<>();
  final WildcardQuery wildcard = new WildcardQuery(new Term(field, term));
  leaves:
  for (final LeafReaderContext leaf : reader.leaves()) {
    final Terms leafTerms = leaf.reader().terms(field);
    if (leafTerms == null) {
      continue;
    }
    final TermsEnum matches = wildcard.getTermsEnum(leafTerms);
    while (matches.next() != null) {
      collected.add(new Term(field, matches.term()));
      if (collected.size() >= maxExpansions) {
        // Limit reached: abandon the remaining terms and leaves.
        break leaves;
      }
    }
  }
  return collected.toArray(new Term[0]);
}

Example 9

Source File: TermVectorsAdapter.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Reads the term vector of one field of one document and converts each of
 * its terms into a {@code TermVectorEntry}.
 * When the field has no term vector, a warning is logged and an empty list
 * is returned.
 *
 * @param docid - document id
 * @param field - field name
 * @return list of term vector elements, in term enumeration order
 * @throws IOException - if there is a low level IO error.
 */
List<TermVectorEntry> getTermVector(int docid, String field) throws IOException {
  final Terms vector = reader.getTermVector(docid, field);
  if (vector != null) {
    final List<TermVectorEntry> entries = new ArrayList<>();
    final TermsEnum cursor = vector.iterator();
    while (cursor.next() != null) {
      entries.add(TermVectorEntry.of(cursor));
    }
    return entries;
  }

  // no term vector available
  log.warn("No term vector indexed for doc: #{} and field: {}", docid, field);
  return Collections.emptyList();
}

Example 10

Source File: TestMultiThreadTermVectors.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Asserts that the terms of {@code vector}, concatenated in enumeration
 * order and trimmed, equal the trimmed English spelling of {@code num}.
 *
 * @param vector terms to concatenate
 * @param num number whose English spelling is expected
 * @throws IOException if the terms cannot be read
 */
static void verifyVector(TermsEnum vector, int num) throws IOException {
  final StringBuilder joined = new StringBuilder();
  while (true) {
    if (vector.next() == null) {
      break;
    }
    joined.append(vector.term().utf8ToString());
  }
  final String expected = English.intToEnglish(num).trim();
  final String actual = joined.toString().trim();
  assertEquals(expected, actual);
}

Example 11

Source File: QueryAutoFilteringComponent.java From query-autofiltering-component with Apache License 2.0

5 votes

/**
 * Rebuilds {@code fieldMap} and {@code termMap} from the values of all
 * string fields of the current searcher.
 *
 * <p>Every value of every string field is passed to {@code addTerm}; then
 * {@code addDistributedTerms} is invoked, and finally both synonym maps are
 * built and stored.
 *
 * @param rb response builder providing the request and its searcher
 * @throws IOException if the index cannot be read
 */
private void buildFieldMap( ResponseBuilder rb ) throws IOException {
  Log.debug( "buildFieldMap" );
  SolrIndexSearcher indexSearcher = rb.req.getSearcher();
  // Two synonym-map builders are filled from the field values below.
  SynonymMap.Builder fieldBuilder = new SynonymMap.Builder( true );
  SynonymMap.Builder termBuilder = new SynonymMap.Builder( true );

  ArrayList<String> searchFields = getStringFields( indexSearcher );

  for (String fieldName : searchFields ) {
    Log.debug( "adding searchField " + fieldName );
    CharsRef fieldNameChars = new CharsRef( fieldName );
    SortedSetDocValues docValues = FieldCache.DEFAULT.getDocTermOrds( indexSearcher.getAtomicReader( ), fieldName );
    if (docValues == null) {
      continue;
    }
    Log.debug( "got SortedSetDocValues for " + fieldName );
    TermsEnum values = docValues.termsEnum();
    while (values.next() != null) {
      BytesRef value = values.term();
      addTerm ( fieldNameChars, value.utf8ToString( ), fieldBuilder, termBuilder );
    }
  }

  addDistributedTerms( rb, fieldBuilder, termBuilder, searchFields );

  fieldMap = fieldBuilder.build( );
  termMap = termBuilder.build( );
}

Example 12

Source File: CountingQuery.java From lucene-query-example with Apache License 2.0

5 votes

/**
 * Scores a document by the number of distinct terms in its term vector for
 * {@code _field}. The sub-query score and value-source scores are ignored.
 *
 * @param doc document to score
 * @param subQueryScore ignored
 * @param valSrcScores ignored
 * @return the number of terms in the document's term vector, or 0 when the
 *         document has no term vector for the field
 * @throws IOException if the term vector cannot be read
 */
public float customScore(int doc, float subQueryScore, float valSrcScores[]) throws IOException {
	IndexReader r = context.reader();
	Terms tv = r.getTermVector(doc, _field);
	if (tv == null) {
		// No term vector stored for this document/field: nothing to count.
		return 0f;
	}
	TermsEnum termsEnum = tv.iterator();
	int numTerms = 0;
	while (termsEnum.next() != null) {
		numTerms++;
	}
	return (float) numTerms;
}

Example 13

Source File: FSTTermsWriter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Writes the terms of every field. For each field that has terms, every term
 * is handed to the postings writer; terms for which it returns a non-null
 * state are recorded with the field's terms writer and added to the running
 * statistics. The terms writer is then finished with those statistics, where
 * the total term frequency is reported as -1 if the field does not index
 * frequencies.
 *
 * @param fields fields to write
 * @param norms norms producer handed through to the postings writer
 * @throws IOException if writing fails
 */
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
  for (String fieldName : fields) {
    final Terms fieldTerms = fields.terms(fieldName);
    if (fieldTerms == null) {
      continue;
    }
    final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);
    final boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    final TermsEnum cursor = fieldTerms.iterator();
    final TermsWriter termsWriter = new TermsWriter(fieldInfo);

    long totalTermFreqSum = 0;
    long docFreqSum = 0;
    final FixedBitSet docsSeen = new FixedBitSet(maxDoc);

    for (BytesRef term = cursor.next(); term != null; term = cursor.next()) {
      final BlockTermState state = postingsWriter.writeTerm(term, cursor, docsSeen, norms);
      if (state == null) {
        // The postings writer produced no state for this term; skip it.
        continue;
      }
      termsWriter.finishTerm(term, state);
      totalTermFreqSum += state.totalTermFreq;
      docFreqSum += state.docFreq;
    }

    final long reportedTotalTermFreq = hasFreq ? totalTermFreqSum : -1;
    termsWriter.finish(reportedTotalTermFreq, docFreqSum, docsSeen.cardinality());
  }
}

Example 14

Source File: CompletionFieldsConsumer.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Delegates the regular write, then builds the completion lookup for every
 * field: all terms of the field are fed to a fresh
 * {@code CompletionTermWriter}, and if finishing it against {@code dictOut}
 * succeeds, the field's metadata (start file pointer, min/max weight, type)
 * is recorded in {@code seenFields}.
 *
 * @param fields fields to write
 * @param norms norms producer handed to the delegate consumer
 * @throws IOException if writing fails
 */
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
  delegateFieldsConsumer.write(fields, norms);

  for (String fieldName : fields) {
    final CompletionTermWriter writer = new CompletionTermWriter();
    final Terms fieldTerms = fields.terms(fieldName);
    // null can happen from ghost fields, where the incoming Fields iterator
    // claims a field exists but it does not
    if (fieldTerms != null) {
      // write terms
      final TermsEnum cursor = fieldTerms.iterator();
      for (BytesRef current = cursor.next(); current != null; current = cursor.next()) {
        writer.write(current, cursor);
      }

      // store lookup, if needed
      final long lookupStart = dictOut.getFilePointer();
      if (writer.finish(dictOut)) {
        final CompletionMetaData metaData = new CompletionMetaData(
            lookupStart, writer.minWeight, writer.maxWeight, writer.type);
        seenFields.put(fieldName, metaData);
      }
    }
  }
}

Example 15

Source File: QualityQueriesFinder.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Selects terms of {@code field} through a bounded priority queue, ignoring
 * terms whose document frequency is at least one tenth of the index's
 * {@code maxDoc}. The queue's contents are returned in pop order and each
 * entry is also printed to standard output.
 *
 * @param field field whose terms are examined
 * @param numTerms size passed to the priority queue
 * @return the words popped from the queue
 * @throws IOException if the index cannot be opened or read
 */
private String[] bestTerms(String field, int numTerms) throws IOException {
  final PriorityQueue<TermDf> queue = new TermsDfQueue(numTerms);
  final IndexReader indexReader = DirectoryReader.open(dir);
  try {
    final int tooCommon = indexReader.maxDoc() / 10; // ignore words too common.
    final Terms fieldTerms = MultiTerms.getTerms(indexReader, field);
    if (fieldTerms != null) {
      final TermsEnum cursor = fieldTerms.iterator();
      while (cursor.next() != null) {
        final int df = cursor.docFreq();
        if (df >= tooCommon) {
          continue;
        }
        queue.insertWithOverflow(new TermDf(cursor.term().utf8ToString(), df));
      }
    }
  } finally {
    indexReader.close();
  }
  final String[] best = new String[queue.size()];
  for (int i = 0; queue.size() > 0; ) {
    final TermDf entry = queue.pop();
    best[i] = entry.word;
    i++;
    System.out.println(i+".   word:  "+entry.df+"   "+entry.word);
  }
  return best;
}

Example 16

Source File: TermVectorsFromLucene.java From semanticvectors with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Initializes {@code termVectors} with elemental term vectors.
 *
 * <p>If the {@code initialtermvectors} flag is non-empty and not
 * {@code "random"}, the vectors are read from that vector store into a
 * {@code VectorStoreRAM}; the input store is closed even when reading fails.
 * Otherwise a fresh {@code ElementalVectorStore} is used and a vector is
 * requested for every term of the configured content fields that passes
 * {@code luceneUtils.termFilter}.
 *
 * @throws IOException if the vector store or the index cannot be read
 */
private void createTermBasedRRIVectorsImpl() throws IOException, RuntimeException {
  this.termVectors = new ElementalVectorStore(flagConfig);

  if (!flagConfig.initialtermvectors().isEmpty() && !flagConfig.initialtermvectors().equals("random")) {
    VerbatimLogger.info("Using elemental term vectors from file " + flagConfig.initialtermvectors());
    CloseableVectorStore inputReader = VectorStoreReader.openVectorStore(flagConfig.initialtermvectors(), flagConfig);
    int count = 0;
    try {
      Enumeration<ObjectVector> termEnumeration = inputReader.getAllVectors();
      VectorStoreRAM loaded = new VectorStoreRAM(flagConfig);
      this.termVectors = loaded;

      while (termEnumeration.hasMoreElements()) {
        ObjectVector next = termEnumeration.nextElement();
        loaded.putVector(next.getObject(), next.getVector());
        count++;
      }
    } finally {
      // Release the input store even if reading a vector fails.
      inputReader.close();
    }
    logger.info("Read in " + count + " vectors");
  } else {
    logger.info("Generating new elemental term vectors");
    // termVectors already holds the fresh ElementalVectorStore created above.
    for (String fieldName : flagConfig.contentsfields()) {
      TermsEnum terms = luceneUtils.getTermsForField(fieldName).iterator();
      BytesRef bytes;
      while ((bytes = terms.next()) != null) {
        Term term = new Term(fieldName, bytes);
        // Skip terms that don't pass the filter.
        if (!luceneUtils.termFilter(term)) {
          continue;
        }
        // Automatically triggers elemental vector generation.
        this.termVectors.getVector(term.text());
      }
    }
  }
}

Example 17

Source File: QueryIndex.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Builds the filter by collecting, per field name, every term present in
 * any leaf of the reader. A hash is registered for each field listed in a
 * leaf's field infos, even if that field has no terms there.
 *
 * @param reader index to collect the terms from
 * @throws IOException if the terms cannot be read
 */
QueryTermFilter(IndexReader reader) throws IOException {
  for (LeafReaderContext leaf : reader.leaves()) {
    for (FieldInfo fieldInfo : leaf.reader().getFieldInfos()) {
      final BytesRefHash fieldTerms = termsHash.computeIfAbsent(fieldInfo.name, name -> new BytesRefHash());
      final Terms indexed = leaf.reader().terms(fieldInfo.name);
      if (indexed == null) {
        continue;
      }
      final TermsEnum cursor = indexed.iterator();
      for (BytesRef value = cursor.next(); value != null; value = cursor.next()) {
        fieldTerms.add(value);
      }
    }
  }
}

Example 18

Source File: FieldCacheImpl.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Builds the sorted doc values for {@code key.field} by walking all terms of
 * the field and, for each term, all documents in its postings.
 *
 * <p>Each term's bytes are appended to a paged byte store and its offset is
 * recorded; each document visited is mapped to {@code 1 + ord} of the term.
 * An {@link IllegalStateException} is thrown when the field has more terms
 * than the reader's {@code maxDoc}.
 *
 * @param reader leaf reader to read from
 * @param key cache key carrying the field name and, in {@code custom}, the
 *            acceptable overhead ratio as a {@code Float}
 * @return the resulting {@code SortedDocValuesImpl}
 * @throws IOException if the index cannot be read
 */
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
    throws IOException {

  final int maxDoc = reader.maxDoc();
  final Terms terms = reader.terms(key.field);
  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();
  final PagedBytes bytes = new PagedBytes(15);

  // Try for coarse estimate for number of bits; this
  // should be an underestimate most of the time, which
  // is fine -- GrowableWriter will reallocate as needed.
  // Falls back to 1 when there are no terms or their count is unknown (-1).
  // TODO: use Uninvert?
  int startTermsBPV = 1;
  if (terms != null) {
    final long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }
      startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
    }
  }

  final PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
  final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);

  int termOrd = 0;

  // TODO: use Uninvert?
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;

    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
      if (termOrd >= maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }

      termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      for (int docID = docs.nextDoc(); docID != DocIdSetIterator.NO_MORE_DOCS; docID = docs.nextDoc()) {
        // Store 1+ ord into packed bits
        docToTermOrd.set(docID, 1 + termOrd);
      }
      termOrd++;
    }
  }

  // maybe an int-only impl?
  return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd);
}

Example 19

Source File: AlfrescoLukeRequestHandler.java From SearchServices with GNU Lesser General Public License v3.0

4 votes

/**
 * Adds detailed statistics about one field to {@code fieldMap}: the number
 * of distinct terms ("distinct"), the most frequent terms ("topTerms") and a
 * histogram of document frequencies ("histogram").
 * Nothing is added when the field has no terms.
 *
 * @param req request supplying the parameters, the searcher and its schema
 * @param field field to inspect
 * @param fieldMap receives the computed entries
 * @throws IOException if the terms cannot be read
 */
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req,
		String field, SimpleOrderedMap<Object> fieldMap) throws IOException {

	final SolrParams params = req.getParams();
	final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);

	// Collects the top N terms; sized with one spare slot.
	final TopTermQueue topTerms = new TopTermQueue(numTerms + 1);

	final CharsRefBuilder spare = new CharsRefBuilder();

	final Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(), field);
	if (terms == null) { // field does not exist
		return;
	}
	final TermsEnum termsEnum = terms.iterator();
	final int[] buckets = new int[HIST_ARRAY_SIZE];
	for (BytesRef text = termsEnum.next(); text != null; text = termsEnum.next()) {
		++topTerms.distinctTerms;
		final int freq = termsEnum.docFreq();
		// This calculation seems odd, but it gives the same results as it
		// used to.
		final int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
		buckets[slot]++;
		if (numTerms <= 0 || freq <= topTerms.minFreq) {
			continue;
		}
		spare.copyUTF8Bytes(text);
		final String termText = spare.toString();

		topTerms.add(new TopTermQueue.TermInfo(new Term(field, termText), freq));
		if (topTerms.size() > numTerms) { // queue is full
			topTerms.pop(); // remove lowest entry
			topTerms.minFreq = topTerms.getTopTermInfo().docFreq;
		}
	}
	topTerms.histogram.add(buckets);
	fieldMap.add("distinct", topTerms.distinctTerms);

	// Include top terms
	fieldMap.add("topTerms", topTerms.toNamedList(req.getSearcher().getSchema()));

	// Add a histogram
	fieldMap.add("histogram", topTerms.histogram.toNamedList());
}

Example 20

Source File: BooleanPerceptronClassifierTest.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Trains a {@code BooleanPerceptronClassifier} on a random index, computes
 * its confusion matrix, and checks that the average classification time is
 * non-negative and that the overall and per-class F1, accuracy, recall and
 * precision values all lie within [0, 1].
 */
@Test
public void testPerformance() throws Exception {
  MockAnalyzer analyzer = new MockAnalyzer(random());
  int numDocs = atLeast(10);
  LeafReader leafReader = getRandomIndex(analyzer, numDocs);
  try {
    BooleanPerceptronClassifier classifier = new BooleanPerceptronClassifier(leafReader, analyzer, null, 1, null, booleanFieldName, textFieldName);

    ConfusionMatrixGenerator.ConfusionMatrix matrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
        classifier, booleanFieldName, textFieldName, -1);
    assertNotNull(matrix);

    double avgClassificationTime = matrix.getAvgClassificationTime();
    assertTrue(avgClassificationTime >= 0);

    // Overall measures.
    double overallF1 = matrix.getF1Measure();
    assertTrue(overallF1 >= 0d);
    assertTrue(overallF1 <= 1d);

    double overallAccuracy = matrix.getAccuracy();
    assertTrue(overallAccuracy >= 0d);
    assertTrue(overallAccuracy <= 1d);

    double overallRecall = matrix.getRecall();
    assertTrue(overallRecall >= 0d);
    assertTrue(overallRecall <= 1d);

    double overallPrecision = matrix.getPrecision();
    assertTrue(overallPrecision >= 0d);
    assertTrue(overallPrecision <= 1d);

    // Per-class measures, one class per term of the boolean field.
    Terms classTerms = MultiTerms.getTerms(leafReader, booleanFieldName);
    TermsEnum classes = classTerms.iterator();
    for (BytesRef classTerm = classes.next(); classTerm != null; classTerm = classes.next()) {
      String className = classTerm.utf8ToString();

      double classRecall = matrix.getRecall(className);
      assertTrue(classRecall >= 0d);
      assertTrue(classRecall <= 1d);

      double classPrecision = matrix.getPrecision(className);
      assertTrue(classPrecision >= 0d);
      assertTrue(classPrecision <= 1d);

      double classF1 = matrix.getF1Measure(className);
      assertTrue(classF1 >= 0d);
      assertTrue(classF1 <= 1d);
    }
  } finally {
    leafReader.close();
  }
}