Java Code Examples for org.apache.lucene.index.TermsEnum#next()

The following examples show how to use org.apache.lucene.index.TermsEnum#next(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: QueryAutoStopWordAnalyzer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a QueryAutoStopWordAnalyzer whose per-field stopword sets contain
 * every term of the field whose document frequency exceeds {@code maxDocFreq}.
 *
 * @param delegate Analyzer whose TokenStream will be filtered
 * @param indexReader IndexReader the stopwords are derived from
 * @param fields Fields for which stopwords are calculated
 * @param maxDocFreq Document frequency a term must exceed to become a stopword
 * @throws IOException Can be thrown while reading from the IndexReader
 */
public QueryAutoStopWordAnalyzer(
    Analyzer delegate,
    IndexReader indexReader,
    Collection<String> fields,
    int maxDocFreq) throws IOException {
  super(delegate.getReuseStrategy());
  this.delegate = delegate;

  for (String fieldName : fields) {
    final Set<String> frequentTerms = new HashSet<>();
    final Terms fieldTerms = MultiTerms.getTerms(indexReader, fieldName);
    final CharsRefBuilder utf16Scratch = new CharsRefBuilder();
    if (fieldTerms != null) {
      final TermsEnum cursor = fieldTerms.iterator();
      for (BytesRef current = cursor.next(); current != null; current = cursor.next()) {
        if (cursor.docFreq() <= maxDocFreq) {
          // not frequent enough to be treated as a stopword
          continue;
        }
        utf16Scratch.copyUTF8Bytes(current);
        frequentTerms.add(utf16Scratch.toString());
      }
    }
    // A field without terms still gets an (empty) entry.
    stopWordsPerField.put(fieldName, frequentTerms);
  }
}
 
Example 2
Source File: LuceneIndexCorpus.java    From word2vec-lucene with Apache License 2.0 6 votes vote down vote up
/**
 * Learns the vocabulary from the configured field of the Lucene index.
 *
 * <p>Runs a {@link TermRangeQuery} spanning the field's minimum and maximum
 * term and stores the result in {@code topDocs}, then adds every term of the
 * field to the vocabulary, using the term's total term frequency as its count.
 *
 * @throws IOException if reading the index fails
 */
@Override
public void learnVocab() throws IOException {
  super.learnVocab();

  final String field = ((LuceneIndexConfig)config).getField();
  // NOTE(review): getTerms presumably returns null when the field has no
  // indexed terms, in which case the next line throws NPE - confirm whether
  // callers can ever configure such a field.
  final Terms terms = MultiFields.getTerms(reader, field);
  final BytesRef maxTerm = terms.getMax();
  final BytesRef minTerm = terms.getMin();
  Query q = new TermRangeQuery(field, minTerm, maxTerm, true, true);
  IndexSearcher searcher = new IndexSearcher(reader);
  topDocs = searcher.search(q, Integer.MAX_VALUE);

  TermsEnum termsEnum = terms.iterator(null);

  // A freshly obtained TermsEnum is unpositioned; next() moves it to the
  // first term and returns null once exhausted. This replaces the previous
  // seekCeil(empty) + term() sequence, which read term() even when the seek
  // reported END (i.e. when the field has no terms at all).
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    int p = addWordToVocab(term.utf8ToString());
    // Clamp instead of a bare narrowing cast so that very large frequencies
    // do not wrap around to a negative count; -1 ("unavailable") is kept.
    vocab[p].setCn((int) Math.min(Integer.MAX_VALUE, termsEnum.totalTermFreq()));
  }
}
 
Example 3
Source File: MultiTermIntervalsSource.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Expands the automaton against the terms of {@code field} in the given leaf
 * and returns a disjunction over the intervals of every matching term.
 *
 * @return the combined iterator, or {@code null} if the field has no terms in
 *         this leaf or no term matches
 * @throws IllegalStateException if more than {@code maxExpansions} terms match
 */
@Override
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
  final Terms fieldTerms = ctx.reader().terms(field);
  if (fieldTerms == null) {
    return null;
  }
  final List<IntervalIterator> perTermIterators = new ArrayList<>();
  final TermsEnum matching = automaton.getTermsEnum(fieldTerms);
  int expanded = 0;
  for (BytesRef current = matching.next(); current != null; current = matching.next()) {
    perTermIterators.add(TermIntervalsSource.intervals(current, matching));
    expanded++;
    if (expanded > maxExpansions) {
      throw new IllegalStateException("Automaton [" + this.pattern + "] expanded to too many terms (limit " + maxExpansions + ")");
    }
  }
  return perTermIterators.isEmpty()
      ? null
      : new DisjunctionIntervalsSource.DisjunctionIntervalIterator(perTermIterators);
}
 
Example 4
Source File: ReconstructCommand.java    From clue with Apache License 2.0 6 votes vote down vote up
/**
 * Rebuilds a document's text from a terms enumeration without position
 * information: every term whose postings contain {@code docid} is appended,
 * in enumeration order, followed by a single space.
 *
 * @param te       terms to scan
 * @param docid    document to reconstruct
 * @param liveDocs not used by this method
 * @return the matching terms, each followed by a space (empty if none match)
 */
public String reconstructNoPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException{
  final StringBuilder joined = new StringBuilder();
  PostingsEnum reusable = null;
  for (BytesRef current = te.next(); current != null; current = te.next()) {
    reusable = te.postings(reusable, PostingsEnum.FREQS);
    // advance() lands on the first doc >= docid; equality means the term occurs in docid
    if (reusable.advance(docid) == docid) {
      joined.append(current.utf8ToString()).append(" ");
    }
  }
  return joined.toString();
}
 
Example 5
Source File: ReadmeSimilarityCalculator.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Collects every distinct term found in the {@code FIELD_CONTENT} term vectors
 * of all documents and assigns each term a unique position in
 * {@code [0, size)}, following the map's own iteration order.
 *
 * @return map from term text to its assigned position
 * @throws IOException if reading the index fails
 */
private HashMap<String, Integer> getAllTerms() throws IOException {
	HashMap<String, Integer> allTerms = new HashMap<>();
	for (int docId = 0; docId < getTotalDocumentInIndex(); docId++) {
		Terms vector = getIndexReader().getTermVector(docId, FIELD_CONTENT);
		if (vector == null) {
			// No term vector stored for this document/field: nothing to collect.
			continue;
		}
		TermsEnum termsEnum = vector.iterator();
		BytesRef text;
		while ((text = termsEnum.next()) != null) {
			// Placeholder value; the real position is assigned below.
			allTerms.put(text.utf8ToString(), 0);
		}
	}

	// Update position: number the distinct terms consecutively.
	int pos = 0;
	for (Entry<String, Integer> s : allTerms.entrySet()) {
		s.setValue(pos++);
	}
	return allTerms;
}
 
Example 6
Source File: HashTermStatistics.java    From liresolr with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Registers document-frequency statistics for {@code field} unless they are
 * already present in {@code termstats}. The (initially empty) map is
 * registered first and then filled with one entry per term of the field.
 *
 * @param searcher searcher providing the reader to take the terms from
 * @param field    name of the field to collect statistics for
 * @throws IOException if reading the terms fails
 */
public static void addToStatistics(SolrIndexSearcher searcher, String field) throws IOException {
        // Statistics for this field were collected before: nothing to do.
        // (No synchronization is performed around this check.)
        if (termstats.get(field) != null) {
            return;
        }
        final Terms fieldTerms = searcher.getSlowAtomicReader().terms(field);
        final HashMap<String, Integer> docFreqByTerm = new HashMap<String, Integer>(1000);
        termstats.put(field, docFreqByTerm);
        if (fieldTerms == null) {
            // Field has no terms; the empty map stays registered.
            return;
        }
        final TermsEnum cursor = fieldTerms.iterator();
        for (BytesRef current = cursor.next(); current != null; current = cursor.next()) {
            docFreqByTerm.put(current.utf8ToString(), cursor.docFreq());
        }
    }
 
Example 7
Source File: BlockTermsWriter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Writes all terms of every field: for each field that has terms, a
 * TermsWriter is obtained via {@code addField}, fed each term in enumeration
 * order, and then finished.
 */
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
  for (String fieldName : fields) {
    final Terms fieldTerms = fields.terms(fieldName);
    if (fieldTerms != null) {
      final TermsEnum cursor = fieldTerms.iterator();
      final TermsWriter fieldWriter = addField(fieldInfos.fieldInfo(fieldName));

      for (BytesRef current = cursor.next(); current != null; current = cursor.next()) {
        fieldWriter.write(current, cursor, norms);
      }

      fieldWriter.finish();
    }
  }
}
 
Example 8
Source File: TestPhraseWildcardQuery.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Expands a wildcard term into the distinct matching terms found across all
 * leaves of {@code reader}, stopping as soon as {@code maxExpansions} distinct
 * terms have been collected.
 *
 * @param field         field the wildcard applies to
 * @param term          wildcard pattern text
 * @param maxExpansions upper bound on collected terms; 0 yields an empty array
 * @return the collected terms (in no particular order)
 */
protected Term[] expandMultiTerm(String field, String term, int maxExpansions) throws IOException {
  if (maxExpansions == 0) {
    return new Term[0];
  }
  final Set<Term> collected = new HashSet<>();
  final WildcardQuery wildcard = new WildcardQuery(new Term(field, term));
  for (final LeafReaderContext leaf : reader.leaves()) {
    final Terms leafTerms = leaf.reader().terms(field);
    if (leafTerms == null) {
      continue;
    }
    final TermsEnum matches = wildcard.getTermsEnum(leafTerms);
    while (matches.next() != null) {
      collected.add(new Term(field, matches.term()));
      if (collected.size() >= maxExpansions) {
        // Limit reached: skip the remaining terms and leaves.
        return collected.toArray(new Term[0]);
      }
    }
  }
  return collected.toArray(new Term[0]);
}
 
Example 9
Source File: TermVectorsAdapter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the term vector of one field of one document and converts each of its
 * terms into a {@code TermVectorEntry}. When no term vector exists for the
 * field, a warning is logged and an empty list is returned.
 *
 * @param docid - document id
 * @param field - field name
 * @return list of term vector elements, possibly empty
 * @throws IOException - if there is a low level IO error.
 */
List<TermVectorEntry> getTermVector(int docid, String field) throws IOException {
  final Terms vector = reader.getTermVector(docid, field);
  if (vector != null) {
    final List<TermVectorEntry> entries = new ArrayList<>();
    for (TermsEnum cursor = vector.iterator(); cursor.next() != null; ) {
      entries.add(TermVectorEntry.of(cursor));
    }
    return entries;
  }

  // no term vector available
  log.warn("No term vector indexed for doc: #{} and field: {}", docid, field);
  return Collections.emptyList();
}
 
Example 10
Source File: TestMultiThreadTermVectors.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Concatenates all terms of {@code vector} in enumeration order and asserts
 * that the trimmed result equals the trimmed English spelling of {@code num}.
 */
static void verifyVector(TermsEnum vector, int num) throws IOException {
  final StringBuilder concatenated = new StringBuilder();
  while (vector.next() != null) {
    final String termText = vector.term().utf8ToString();
    concatenated.append(termText);
  }
  final String expected = English.intToEnglish(num).trim();
  final String actual = concatenated.toString().trim();
  assertEquals(expected, actual);
}
 
Example 11
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0 5 votes vote down vote up
/**
 * Populates {@code fieldMap} and {@code termMap}: every value of every string
 * field is passed to {@code addTerm} together with both synonym-map builders,
 * distributed terms are added afterwards, and finally both maps are built.
 */
private void buildFieldMap( ResponseBuilder rb ) throws IOException {
  Log.debug( "buildFieldMap" );
  final SolrIndexSearcher indexSearcher = rb.req.getSearcher();
  // Two synonym maps are assembled side by side from the field values.
  final SynonymMap.Builder fieldBuilder = new SynonymMap.Builder( true );
  final SynonymMap.Builder termBuilder = new SynonymMap.Builder( true );

  final ArrayList<String> stringFields = getStringFields( indexSearcher );

  for (String fieldName : stringFields) {
    Log.debug( "adding searchField " + fieldName );
    final CharsRef fieldNameChars = new CharsRef( fieldName );
    final SortedSetDocValues ords = FieldCache.DEFAULT.getDocTermOrds( indexSearcher.getAtomicReader( ), fieldName );
    if (ords != null) {
      Log.debug( "got SortedSetDocValues for " + fieldName );
      final TermsEnum values = ords.termsEnum();
      while (values.next() != null) {
        final String fieldValue = values.term().utf8ToString( );
        addTerm( fieldNameChars, fieldValue, fieldBuilder, termBuilder );
      }
    }
  }

  addDistributedTerms( rb, fieldBuilder, termBuilder, stringFields );

  fieldMap = fieldBuilder.build( );
  termMap = termBuilder.build( );
}
 
Example 12
Source File: CountingQuery.java    From lucene-query-example with Apache License 2.0 5 votes vote down vote up
/**
 * Scores a document by the number of terms in its term vector for
 * {@code _field}; the sub-query and value-source scores are ignored.
 *
 * @param doc           document id within the current reader
 * @param subQueryScore ignored
 * @param valSrcScores  ignored
 * @return the number of terms in the document's term vector, or 0 when the
 *         document has no term vector for the field
 * @throws IOException if reading the term vector fails
 */
public float customScore(int doc, float subQueryScore, float valSrcScores[]) throws IOException {
	IndexReader r = context.reader();
	Terms tv = r.getTermVector(doc, _field);
	if (tv == null) {
		// No term vector stored for this document/field: zero terms
		// (previously a NullPointerException).
		return 0f;
	}
	TermsEnum termsEnum = tv.iterator();
	int numTerms = 0;
	while (termsEnum.next() != null) {
		numTerms++;
	}
	return (float) numTerms;
}
 
Example 13
Source File: FSTTermsWriter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the terms of every field that has any: each term's postings are
 * written through {@code postingsWriter}; terms for which a state is returned
 * are recorded in the field's TermsWriter, and the accumulated statistics are
 * handed to {@code finish}.
 */
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
  for (String fieldName : fields) {
    final Terms fieldTerms = fields.terms(fieldName);
    if (fieldTerms == null) {
      continue;
    }
    final FieldInfo info = fieldInfos.fieldInfo(fieldName);
    final boolean tracksFreqs = info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    final TermsEnum cursor = fieldTerms.iterator();
    final TermsWriter fieldWriter = new TermsWriter(info);

    long totalTermFreqSum = 0;
    long docFreqSum = 0;
    final FixedBitSet visitedDocs = new FixedBitSet(maxDoc);

    for (BytesRef current = cursor.next(); current != null; current = cursor.next()) {
      final BlockTermState state = postingsWriter.writeTerm(current, cursor, visitedDocs, norms);
      if (state == null) {
        // writeTerm returned no state for this term: nothing to record
        continue;
      }
      fieldWriter.finishTerm(current, state);
      totalTermFreqSum += state.totalTermFreq;
      docFreqSum += state.docFreq;
    }

    // -1 is passed as the total term frequency when the field does not index frequencies.
    final long reportedTotalTermFreq = tracksFreqs ? totalTermFreqSum : -1;
    fieldWriter.finish(reportedTotalTermFreq, docFreqSum, visitedDocs.cardinality());
  }
}
 
Example 14
Source File: CompletionFieldsConsumer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates the regular write, then feeds every term of each field to a fresh
 * CompletionTermWriter. When that writer's {@code finish} reports success, the
 * field is recorded in {@code seenFields} together with the dictionary file
 * pointer taken just before {@code finish} and the writer's min/max weight
 * and type.
 */
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
  delegateFieldsConsumer.write(fields, norms);

  for (String fieldName : fields) {
    final CompletionTermWriter completionWriter = new CompletionTermWriter();
    final Terms fieldTerms = fields.terms(fieldName);
    // terms can be null for ghost fields: the Fields iterator claims the field exists but it does not
    if (fieldTerms != null) {
      final TermsEnum cursor = fieldTerms.iterator();

      // write terms
      for (BytesRef current = cursor.next(); current != null; current = cursor.next()) {
        completionWriter.write(current, cursor);
      }

      // store lookup, if needed
      final long dictStart = dictOut.getFilePointer();
      final boolean lookupWritten = completionWriter.finish(dictOut);
      if (lookupWritten) {
        final CompletionMetaData metaData = new CompletionMetaData(
            dictStart,
            completionWriter.minWeight,
            completionWriter.maxWeight,
            completionWriter.type);
        seenFields.put(fieldName, metaData);
      }
    }
  }
}
 
Example 15
Source File: QualityQueriesFinder.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Selects up to {@code numTerms} terms of {@code field} whose document
 * frequency is below one tenth of the index's {@code maxDoc}, using a
 * bounded {@code TermsDfQueue}. The selected words are returned in the order
 * the queue pops them, and each one is also printed to standard output.
 *
 * @param field    field to take terms from
 * @param numTerms capacity of the selection queue
 * @return the selected words
 */
private String [] bestTerms(String field,int numTerms) throws IOException {
  final PriorityQueue<TermDf> queue = new TermsDfQueue(numTerms);
  final IndexReader indexReader = DirectoryReader.open(dir);
  try {
    // ignore words too common.
    final int threshold = indexReader.maxDoc() / 10;
    final Terms fieldTerms = MultiTerms.getTerms(indexReader, field);
    if (fieldTerms != null) {
      final TermsEnum cursor = fieldTerms.iterator();
      while (cursor.next() != null) {
        final int docFreq = cursor.docFreq();
        if (docFreq >= threshold) {
          continue;
        }
        final String word = cursor.term().utf8ToString();
        queue.insertWithOverflow(new TermDf(word, docFreq));
      }
    }
  } finally {
    indexReader.close();
  }
  final String[] words = new String[queue.size()];
  // rank is the 1-based output position of the popped entry
  for (int rank = 1; queue.size() > 0; rank++) {
    final TermDf popped = queue.pop();
    words[rank - 1] = popped.word;
    System.out.println(rank+".   word:  "+popped.df+"   "+popped.word);
  }
  return words;
}
 
Example 16
Source File: TermVectorsFromLucene.java    From semanticvectors with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Initializes {@code termVectors} with elemental term vectors.
 *
 * <p>If the {@code initialtermvectors} flag is non-empty and not "random", all
 * vectors are copied from that vector store file into an in-memory store.
 * Otherwise a new ElementalVectorStore is created and a vector is requested
 * for every term of the configured content fields that passes the term filter.
 *
 * @throws IOException if reading the vector store or the index fails
 */
private void createTermBasedRRIVectorsImpl() throws IOException, RuntimeException {
  this.termVectors = new ElementalVectorStore(flagConfig);

  if (!flagConfig.initialtermvectors().isEmpty() && !flagConfig.initialtermvectors().equals("random")) {
    VerbatimLogger.info("Using elemental term vectors from file " + flagConfig.initialtermvectors());
    CloseableVectorStore inputReader = VectorStoreReader.openVectorStore(flagConfig.initialtermvectors(), flagConfig);
    int count = 0;
    try {
      Enumeration<ObjectVector> termEnumeration = inputReader.getAllVectors();
      VectorStoreRAM ramStore = new VectorStoreRAM(flagConfig);
      this.termVectors = ramStore;

      while (termEnumeration.hasMoreElements()) {
        ObjectVector next = termEnumeration.nextElement();
        ramStore.putVector(next.getObject(), next.getVector());
        count++;
      }
    } finally {
      // Release the store even when copying the vectors fails part-way.
      inputReader.close();
    }
    logger.info("Read in " + count + " vectors");
  } else {
    logger.info("Generating new elemental term vectors");
    this.termVectors = new ElementalVectorStore(flagConfig);
    for(String fieldName : flagConfig.contentsfields()) {
      // NOTE(review): getTermsForField presumably can return null for a field
      // without indexed terms, which would throw here - confirm.
      TermsEnum terms = luceneUtils.getTermsForField(fieldName).iterator();
      BytesRef bytes;
      while ((bytes = terms.next()) != null) {
        Term term = new Term(fieldName, bytes);
        // Skip terms that don't pass the filter.
        if (!luceneUtils.termFilter(term)) {
          continue;
        }
        // Automatically triggers elemental vector generation.
        this.termVectors.getVector(term.text());
      }
    }
  }
}
 
Example 17
Source File: QueryIndex.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Collects, per field name, every term present in any leaf of {@code reader}
 * into {@code termsHash}. A field known to a leaf gets an entry even when the
 * leaf holds no terms for it.
 */
QueryTermFilter(IndexReader reader) throws IOException {
  for (LeafReaderContext leaf : reader.leaves()) {
    for (FieldInfo info : leaf.reader().getFieldInfos()) {
      final BytesRefHash fieldHash = termsHash.computeIfAbsent(info.name, name -> new BytesRefHash());
      final Terms fieldTerms = leaf.reader().terms(info.name);
      if (fieldTerms == null) {
        continue;
      }
      final TermsEnum cursor = fieldTerms.iterator();
      for (BytesRef current = cursor.next(); current != null; current = cursor.next()) {
        fieldHash.add(current);
      }
    }
  }
}
 
Example 18
Source File: FieldCacheImpl.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Builds sorted doc values for {@code key.field} by walking the field's terms:
 * each term's bytes are appended to a paged byte store, its offset is
 * recorded, and every document in its postings is mapped to the term's
 * ordinal plus one.
 *
 * @throws IllegalStateException if the field has more terms than the reader
 *         has documents (reported as being indexed with multiple values per
 *         document)
 */
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
    throws IOException {

  final int maxDoc = reader.maxDoc();
  final Terms terms = reader.terms(key.field);
  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();
  final PagedBytes bytes = new PagedBytes(15);

  // TODO: use Uninvert?
  // Coarse estimate of the bits needed per ordinal; this should be an
  // underestimate most of the time, which is fine -- GrowableWriter will
  // reallocate as needed. Falls back to 1 when the term count is unknown.
  int initialBitsPerValue = 1;
  if (terms != null) {
    final long uniqueTermCount = terms.size();
    if (uniqueTermCount != -1L) {
      if (uniqueTermCount > maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }
      initialBitsPerValue = PackedInts.bitsRequired(uniqueTermCount);
    }
  }

  final PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
  final GrowableWriter docToTermOrd = new GrowableWriter(initialBitsPerValue, maxDoc, acceptableOverheadRatio);

  int termOrd = 0;

  // TODO: use Uninvert?
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;

    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
      if (termOrd >= maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }

      termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      for (int docID = docs.nextDoc(); docID != DocIdSetIterator.NO_MORE_DOCS; docID = docs.nextDoc()) {
        // Store 1+ ord into packed bits
        docToTermOrd.set(docID, 1 + termOrd);
      }
      termOrd++;
    }
  }

  // maybe an int-only impl?
  return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd);
}
 
Example 19
Source File: AlfrescoLukeRequestHandler.java    From SearchServices with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Adds detailed statistics for one field to {@code fieldMap}: the number of
 * distinct terms ("distinct"), the most frequent terms ("topTerms") and a
 * histogram of document frequencies ("histogram"). Nothing is added when the
 * field has no terms.
 *
 * @param req      request supplying the parameters, searcher and schema
 * @param field    name of the field to inspect
 * @param fieldMap map receiving the statistics
 * @throws IOException if reading the index fails
 */
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req,
		String field, SimpleOrderedMap<Object> fieldMap) throws IOException {

	final SolrParams requestParams = req.getParams();
	final int numTerms = requestParams.getInt(NUMTERMS, DEFAULT_COUNT);

	// Something to collect the top N terms in.
	final TopTermQueue topTerms = new TopTermQueue(numTerms + 1);

	final CharsRefBuilder utf16Scratch = new CharsRefBuilder();

	final Terms fieldTerms = MultiFields.getTerms(req.getSearcher().getIndexReader(), field);
	if (fieldTerms == null) {
		// field does not exist
		return;
	}
	final TermsEnum cursor = fieldTerms.iterator();
	final int[] histogramBuckets = new int[HIST_ARRAY_SIZE];
	for (BytesRef current = cursor.next(); current != null; current = cursor.next()) {
		topTerms.distinctTerms++;
		final int docFreq = cursor.docFreq();
		// This calculation seems odd, but it gives the same results as it used to.
		final int bucket = 32 - Integer.numberOfLeadingZeros(Math.max(0, docFreq - 1));
		histogramBuckets[bucket]++;
		if (numTerms <= 0 || docFreq <= topTerms.minFreq) {
			continue;
		}
		utf16Scratch.copyUTF8Bytes(current);
		final Term asTerm = new Term(field, utf16Scratch.toString());
		topTerms.add(new TopTermQueue.TermInfo(asTerm, docFreq));
		if (topTerms.size() > numTerms) {
			// queue is full: remove the lowest entry and raise the entry threshold
			topTerms.pop();
			topTerms.minFreq = topTerms.getTopTermInfo().docFreq;
		}
	}
	topTerms.histogram.add(histogramBuckets);
	fieldMap.add("distinct", topTerms.distinctTerms);

	// Include top terms
	fieldMap.add("topTerms", topTerms.toNamedList(req.getSearcher().getSchema()));

	// Add a histogram
	fieldMap.add("histogram", topTerms.histogram.toNamedList());
}
 
Example 20
Source File: BooleanPerceptronClassifierTest.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Trains a BooleanPerceptronClassifier on a random index and checks that the
 * confusion matrix metrics are sane: the average classification time is
 * non-negative, and F1, accuracy, recall and precision (overall, and
 * recall/precision/F1 per class term) all lie in [0, 1].
 */
@Test
public void testPerformance() throws Exception {
  final MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
  final int docCount = atLeast(10);
  final LeafReader indexReader = getRandomIndex(mockAnalyzer, docCount);
  try {
    final BooleanPerceptronClassifier perceptron = new BooleanPerceptronClassifier(indexReader, mockAnalyzer, null, 1, null, booleanFieldName, textFieldName);

    final ConfusionMatrixGenerator.ConfusionMatrix matrix = ConfusionMatrixGenerator.getConfusionMatrix(indexReader,
        perceptron, booleanFieldName, textFieldName, -1);
    assertNotNull(matrix);

    final double avgTime = matrix.getAvgClassificationTime();
    assertTrue(avgTime >= 0);

    // Overall metrics.
    final double overallF1 = matrix.getF1Measure();
    assertTrue(overallF1 >= 0d);
    assertTrue(overallF1 <= 1d);

    final double overallAccuracy = matrix.getAccuracy();
    assertTrue(overallAccuracy >= 0d);
    assertTrue(overallAccuracy <= 1d);

    final double overallRecall = matrix.getRecall();
    assertTrue(overallRecall >= 0d);
    assertTrue(overallRecall <= 1d);

    final double overallPrecision = matrix.getPrecision();
    assertTrue(overallPrecision >= 0d);
    assertTrue(overallPrecision <= 1d);

    // Per-class metrics, one class per term of the boolean field.
    final TermsEnum classes = MultiTerms.getTerms(indexReader, booleanFieldName).iterator();
    for (BytesRef classBytes = classes.next(); classBytes != null; classBytes = classes.next()) {
      final String className = classBytes.utf8ToString();

      final double classRecall = matrix.getRecall(className);
      assertTrue(classRecall >= 0d);
      assertTrue(classRecall <= 1d);

      final double classPrecision = matrix.getPrecision(className);
      assertTrue(classPrecision >= 0d);
      assertTrue(classPrecision <= 1d);

      final double classF1 = matrix.getF1Measure(className);
      assertTrue(classF1 >= 0d);
      assertTrue(classF1 <= 1d);
    }
  } finally {
    indexReader.close();
  }
}