org.apache.lucene.index.Terms Java Examples

The following examples show how to use org.apache.lucene.index.Terms. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: TransportFieldStatsTransportAction.java From Elasticsearch with Apache License 2.0

7 votes

@Override
protected FieldStatsShardResponse shardOperation(FieldStatsShardRequest request) {
    ShardId shardId = request.shardId();
    Map<String, FieldStats> fieldStats = new HashMap<>();
    IndexService indexServices = indicesService.indexServiceSafe(shardId.getIndex());
    MapperService mapperService = indexServices.mapperService();
    IndexShard shard = indexServices.shardSafe(shardId.id());
    try (Engine.Searcher searcher = shard.acquireSearcher("fieldstats")) {
        for (String field : request.getFields()) {
            MappedFieldType fieldType = mapperService.fullName(field);
            if (fieldType != null) {
                IndexReader reader = searcher.reader();
                Terms terms = MultiFields.getTerms(reader, field);
                if (terms != null) {
                    fieldStats.put(field, fieldType.stats(terms, reader.maxDoc()));
                }
            } else {
                throw new IllegalArgumentException("field [" + field + "] doesn't exist");
            }
        }
    } catch (IOException e) {
        throw ExceptionsHelper.convertToElastic(e);
    }
    return new FieldStatsShardResponse(shardId, fieldStats);
}

Example #2

Source File: TestLegacyTerms.java From lucene-solr with Apache License 2.0

6 votes

public void testFloatFieldMinMax() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  int numDocs = atLeast(100);
  float minValue = Float.POSITIVE_INFINITY;
  float maxValue = Float.NEGATIVE_INFINITY;
  for(int i=0;i<numDocs;i++ ){
    Document doc = new Document();
    float num = random().nextFloat();
    minValue = Math.min(num, minValue);
    maxValue = Math.max(num, maxValue);
    doc.add(new LegacyFloatField("field", num, Field.Store.NO));
    w.addDocument(doc);
  }
  
  IndexReader r = w.getReader();
  Terms terms = MultiTerms.getTerms(r, "field");
  assertEquals(minValue, NumericUtils.sortableIntToFloat(LegacyNumericUtils.getMinInt(terms)), 0.0f);
  assertEquals(maxValue, NumericUtils.sortableIntToFloat(LegacyNumericUtils.getMaxInt(terms)), 0.0f);

  r.close();
  w.close();
  dir.close();
}

Example #3

Source File: FuzzyLikeThisQuery.java From lucene-solr with Apache License 2.0

6 votes

private Query newTermQuery(IndexReader reader, Term term) throws IOException {
  if (ignoreTF) {
    return new ConstantScoreQuery(new TermQuery(term));
  } else {
    // we build an artificial TermStates that will give an overall df and ttf
    // equal to 1
    TermStates context = new TermStates(reader.getContext());
    for (LeafReaderContext leafContext : reader.leaves()) {
      Terms terms = leafContext.reader().terms(term.field());
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(term.bytes())) {
          int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1
          context.register(termsEnum.termState(), leafContext.ord, freq, freq);
        }
      }
    }
    return new TermQuery(term, context);
  }
}

Example #4

Source File: TestLegacyTerms.java From lucene-solr with Apache License 2.0

6 votes

public void testLongFieldMinMax() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  int numDocs = atLeast(100);
  long minValue = Long.MAX_VALUE;
  long maxValue = Long.MIN_VALUE;
  for(int i=0;i<numDocs;i++ ){
    Document doc = new Document();
    long num = random().nextLong();
    minValue = Math.min(num, minValue);
    maxValue = Math.max(num, maxValue);
    doc.add(new LegacyLongField("field", num, Field.Store.NO));
    w.addDocument(doc);
  }
  
  IndexReader r = w.getReader();

  Terms terms = MultiTerms.getTerms(r, "field");
  assertEquals(Long.valueOf(minValue), LegacyNumericUtils.getMinLong(terms));
  assertEquals(Long.valueOf(maxValue), LegacyNumericUtils.getMaxLong(terms));

  r.close();
  w.close();
  dir.close();
}

Example #5

Source File: TestCompressingTermVectorsFormat.java From lucene-solr with Apache License 2.0

6 votes

public void testNoOrds() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  doc.add(new Field("foo", "this is a test", ft));
  iw.addDocument(doc);
  LeafReader ir = getOnlyLeafReader(iw.getReader());
  Terms terms = ir.getTermVector(0, "foo");
  assertNotNull(terms);
  TermsEnum termsEnum = terms.iterator();
  assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("this")));

  expectThrows(UnsupportedOperationException.class, termsEnum::ord);
  expectThrows(UnsupportedOperationException.class, () -> termsEnum.seekExact(0));

  ir.close();
  iw.close();
  dir.close();
}

Example #6

Source File: SimpleNaiveBayesClassifier.java From lucene-solr with Apache License 2.0

6 votes

/**
 * count the number of documents in the index having at least a value for the 'class' field
 *
 * @return the no. of documents having a value for the 'class' field
 * @throws IOException if accessing to term vectors or search fails
 */
protected int countDocsWithClass() throws IOException {
  Terms terms = MultiTerms.getTerms(this.indexReader, this.classFieldName);
  int docCount;
  if (terms == null || terms.getDocCount() == -1) { // in case codec doesn't support getDocCount
    TotalHitCountCollector classQueryCountCollector = new TotalHitCountCollector();
    BooleanQuery.Builder q = new BooleanQuery.Builder();
    q.add(new BooleanClause(new WildcardQuery(new Term(classFieldName, String.valueOf(WildcardQuery.WILDCARD_STRING))), BooleanClause.Occur.MUST));
    if (query != null) {
      q.add(query, BooleanClause.Occur.MUST);
    }
    indexSearcher.search(q.build(),
        classQueryCountCollector);
    docCount = classQueryCountCollector.getTotalHits();
  } else {
    docCount = terms.getDocCount();
  }
  return docCount;
}

Example #7

Source File: TestLegacyTerms.java From lucene-solr with Apache License 2.0

6 votes

public void testIntFieldMinMax() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  int numDocs = atLeast(100);
  int minValue = Integer.MAX_VALUE;
  int maxValue = Integer.MIN_VALUE;
  for(int i=0;i<numDocs;i++ ){
    Document doc = new Document();
    int num = random().nextInt();
    minValue = Math.min(num, minValue);
    maxValue = Math.max(num, maxValue);
    doc.add(new LegacyIntField("field", num, Field.Store.NO));
    w.addDocument(doc);
  }
  
  IndexReader r = w.getReader();
  Terms terms = MultiTerms.getTerms(r, "field");
  assertEquals(Integer.valueOf(minValue), LegacyNumericUtils.getMinInt(terms));
  assertEquals(Integer.valueOf(maxValue), LegacyNumericUtils.getMaxInt(terms));

  r.close();
  w.close();
  dir.close();
}

Example #8

Source File: SrndTermQuery.java From lucene-solr with Apache License 2.0

6 votes

@Override
public void visitMatchingTerms(
  IndexReader reader,
  String fieldName,
  MatchingTermVisitor mtv) throws IOException
{
  /* check term presence in index here for symmetry with other SimpleTerm's */
  Terms terms = MultiTerms.getTerms(reader, fieldName);
  if (terms != null) {
    TermsEnum termsEnum = terms.iterator();

    TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getTermText()));
    if (status == TermsEnum.SeekStatus.FOUND) {
      mtv.visitMatchingTerm(getLuceneTerm(fieldName));
    }
  }
}

Example #9

Source File: TermVectorOffsetStrategy.java From lucene-solr with Apache License 2.0

6 votes

@Override
public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException {
  Terms tvTerms = reader.getTermVector(docId, getField());
  if (tvTerms == null) {
    return OffsetsEnum.EMPTY;
  }

  LeafReader singleDocReader = new TermVectorLeafReader(getField(), tvTerms);
  return createOffsetsEnumFromReader(
      new OverlaySingleDocTermsLeafReader(
          reader,
          singleDocReader,
          getField(),
          docId),
      docId);
}

Example #10

Source File: TokenSources.java From lucene-solr with Apache License 2.0

6 votes

/**
 * A convenience method that tries a number of approaches to getting a token
 * stream. The cost of finding there are no termVectors in the index is
 * minimal (1000 invocations still registers 0 ms). So this "lazy" (flexible?)
 * approach to coding is probably acceptable
 * 
 * @return null if field not stored correctly
 * @throws IOException If there is a low-level I/O error
 */
@Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
    String field, Analyzer analyzer) throws IOException {
  TokenStream ts = null;

  Fields vectors = reader.getTermVectors(docId);
  if (vectors != null) {
    Terms vector = vectors.terms(field);
    if (vector != null) {
      ts = getTokenStream(vector);
    }
  }

  // No token info stored so fall back to analyzing raw content
  if (ts == null) {
    ts = getTokenStream(reader, docId, field, analyzer);
  }
  return ts;
}

Example #11

Source File: BlockTermsWriter.java From lucene-solr with Apache License 2.0

6 votes

@Override
public void write(Fields fields, NormsProducer norms) throws IOException {

  for(String field : fields) {

    Terms terms = fields.terms(field);
    if (terms == null) {
      continue;
    }

    TermsEnum termsEnum = terms.iterator();

    TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));

    while (true) {
      BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }

      termsWriter.write(term, termsEnum, norms);
    }

    termsWriter.finish();
  }
}

Example #12

Source File: PayloadFilteredTermIntervalsSource.java From lucene-solr with Apache License 2.0

6 votes

@Override
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
  Terms terms = ctx.reader().terms(field);
  if (terms == null)
    return null;
  if (terms.hasPositions() == false) {
    throw new IllegalArgumentException("Cannot create an IntervalIterator over field " + field + " because it has no indexed positions");
  }
  if (terms.hasPayloads() == false) {
    throw new IllegalArgumentException("Cannot create a payload-filtered iterator over field " + field + " because it has no indexed payloads");
  }
  TermsEnum te = terms.iterator();
  if (te.seekExact(term) == false) {
    return null;
  }
  return intervals(te);
}

Example #13

Source File: QERetrievalApp.java From lucene4ir with Apache License 2.0

6 votes

/**
 * Combines the individual term vectors of each document into a single list.
 * @param terms
 * @return
 */
public HashMap<String, QETerm> combineTerms(Vector<Terms> terms){
    HashMap<String, QETerm> combinedTerms = new HashMap<String, QETerm>();
    int numDocs = terms.size();
    for(Terms ts : terms){
        try {
            TermsEnum te = ts.iterator();
            BytesRef term;
            while ((term = te.next()) != null) {
                String tString = term.utf8ToString();
                QETerm qet = new QETerm(tString, te.totalTermFreq(),te.docFreq(),numDocs);
                if (combinedTerms.containsKey(tString)){
                    QETerm mergedTerm = qet.combine(combinedTerms.get(tString));
                    combinedTerms.replace(tString,mergedTerm);
                }
                else
                    combinedTerms.put(tString,qet);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return combinedTerms;
}

Example #14

Source File: VersionFieldUpgrader.java From Elasticsearch with Apache License 2.0

6 votes

static CodecReader wrap(CodecReader reader) throws IOException {
    final FieldInfos fieldInfos = reader.getFieldInfos();
    final FieldInfo versionInfo = fieldInfos.fieldInfo(VersionFieldMapper.NAME);
    if (versionInfo != null && versionInfo.getDocValuesType() != DocValuesType.NONE) {
        // the reader is a recent one, it has versions and they are stored
        // in a numeric doc values field
        return reader;
    }
    // The segment is an old one, look at the _uid field
    final Terms terms = reader.terms(UidFieldMapper.NAME);
    if (terms == null || !terms.hasPayloads()) {
        // The segment doesn't have an _uid field or doesn't have payloads
        // don't try to do anything clever. If any other segment has versions
        // all versions of this segment will be initialized to 0
        return reader;
    }
    // convert _uid payloads -> _version docvalues
    return new VersionFieldUpgrader(reader);
}

Example #15

Source File: ReadmeSimilarityCalculator.java From scava with Eclipse Public License 2.0

6 votes

private HashMap<String, Integer> getAllTerms() throws IOException {
	HashMap<String, Integer> allTerms = new HashMap<>();
	int pos = 0;
	for (int docId = 0; docId < getTotalDocumentInIndex(); docId++) {
		Terms vector = getIndexReader().getTermVector(docId, FIELD_CONTENT);
		TermsEnum termsEnum = null;
		termsEnum = vector.iterator();
		BytesRef text = null;
		while ((text = termsEnum.next()) != null) {
			String term = text.utf8ToString();
			allTerms.put(term, pos++);
		}
	}

	// Update postition
	pos = 0;
	for (Entry<String, Integer> s : allTerms.entrySet()) {
		s.setValue(pos++);
	}
	return allTerms;
}

Example #16

Source File: HashTermStatistics.java From liresolr with GNU General Public License v2.0

6 votes

public static void addToStatistics(SolrIndexSearcher searcher, String field) throws IOException {
        // check if this field is already in the stats.
//        synchronized (instance) {
            if (termstats.get(field)!=null) return;
//        }
        // else add it to the stats.
        Terms terms = searcher.getSlowAtomicReader().terms(field);
        HashMap<String, Integer> term2docFreq = new HashMap<String, Integer>(1000);
        termstats.put(field, term2docFreq);
        if (terms!=null) {
            TermsEnum termsEnum = terms.iterator();
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                term2docFreq.put(term.utf8ToString(), termsEnum.docFreq());
            }
        }
    }

Example #17

Source File: TermVectorsResponse.java From Elasticsearch with Apache License 2.0

6 votes

private void buildFieldStatistics(XContentBuilder builder, Terms curTerms) throws IOException {
    long sumDocFreq = curTerms.getSumDocFreq();
    int docCount = curTerms.getDocCount();
    long sumTotalTermFrequencies = curTerms.getSumTotalTermFreq();
    if (docCount > 0) {
        assert ((sumDocFreq > 0)) : "docCount >= 0 but sumDocFreq ain't!";
        assert ((sumTotalTermFrequencies > 0)) : "docCount >= 0 but sumTotalTermFrequencies ain't!";
        builder.startObject(FieldStrings.FIELD_STATISTICS);
        builder.field(FieldStrings.SUM_DOC_FREQ, sumDocFreq);
        builder.field(FieldStrings.DOC_COUNT, docCount);
        builder.field(FieldStrings.SUM_TTF, sumTotalTermFrequencies);
        builder.endObject();
    } else if (docCount == -1) { // this should only be -1 if the field
        // statistics were not requested at all. In
        // this case all 3 values should be -1
        assert ((sumDocFreq == -1)) : "docCount was -1 but sumDocFreq ain't!";
        assert ((sumTotalTermFrequencies == -1)) : "docCount was -1 but sumTotalTermFrequencies ain't!";
    } else {
        throw new IllegalStateException(
                "Something is wrong with the field statistics of the term vector request: Values are " + "\n"
                        + FieldStrings.SUM_DOC_FREQ + " " + sumDocFreq + "\n" + FieldStrings.DOC_COUNT + " " + docCount + "\n"
                        + FieldStrings.SUM_TTF + " " + sumTotalTermFrequencies);
    }
}

Example #18

Source File: IndexToolsImpl.java From lucene-solr with Apache License 2.0

6 votes

public String exportTerms(String destDir, String field, String delimiter) {
  String filename = "terms_" + field + "_" + System.currentTimeMillis() + ".out";
  Path path = Paths.get(destDir, filename);
  try {
    Terms terms = MultiTerms.getTerms(reader, field);
    if (terms == null) {
      throw new LukeException(String.format(Locale.US, "Field %s does not contain any terms to be exported", field));
    }
    try (BufferedWriter writer = Files.newBufferedWriter(path, Charset.forName("UTF-8"))) {
      TermsEnum termsEnum = terms.iterator();
      BytesRef term;
      while (!Thread.currentThread().isInterrupted() && (term = termsEnum.next()) != null) {
        writer.write(String.format(Locale.US, "%s%s%d\n", term.utf8ToString(), delimiter, +termsEnum.docFreq()));
      }
      return path.toString();
    }
  } catch (IOException e) {
    throw new LukeException("Terms file export for field [" + field + "] to file [" + filename + "] has failed.", e);
  }
}

Example #19

Source File: TermVectorsResponse.java From Elasticsearch with Apache License 2.0

6 votes

private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
    // start term, optimized writing
    BytesRef term = termIter.next();
    spare.copyUTF8Bytes(term);
    builder.startObject(spare.toString());
    buildTermStatistics(builder, termIter);
    // finally write the term vectors
    PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
    int termFreq = posEnum.freq();
    builder.field(FieldStrings.TERM_FREQ, termFreq);
    initMemory(curTerms, termFreq);
    initValues(curTerms, posEnum, termFreq);
    buildValues(builder, curTerms, termFreq);
    buildScore(builder, boostAtt);
    builder.endObject();
}

Example #20

Source File: DfsOnlyRequest.java From Elasticsearch with Apache License 2.0

6 votes

public DfsOnlyRequest(Fields termVectorsFields, String[] indices, String[] types, Set<String> selectedFields) throws IOException {
    super(indices);

    // build a search request with a query of all the terms
    final BoolQueryBuilder boolBuilder = boolQuery();
    for (String fieldName : termVectorsFields) {
        if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
            continue;
        }
        Terms terms = termVectorsFields.terms(fieldName);
        TermsEnum iterator = terms.iterator();
        while (iterator.next() != null) {
            String text = iterator.term().utf8ToString();
            boolBuilder.should(QueryBuilders.termQuery(fieldName, text));
        }
    }
    // wrap a search request object
    this.searchRequest = new SearchRequest(indices).types(types).source(new SearchSourceBuilder().query(boolBuilder));
}

Example #21

Source File: ReadmeSimilarityCalculator.java From scava with Eclipse Public License 2.0

6 votes

private DocVector[] getDocumentVectors() throws IOException {
	DocVector[] docVector = new DocVector[getTotalDocumentInIndex()];
	for (int docId = 0; docId < getTotalDocumentInIndex(); docId++) {
		Terms vector = getIndexReader().getTermVector(docId, FIELD_CONTENT);
		TermsEnum termsEnum = null;
		termsEnum = vector.iterator();
		BytesRef text = null;
		docVector[docId] = new DocVector(getAllTerms());
		while ((text = termsEnum.next()) != null) {
			String term = text.utf8ToString();
			int freq = (int) termsEnum.totalTermFreq();
			docVector[docId].setEntry(term, freq);
		}
		docVector[docId].normalize();
	}
	getIndexReader().close();
	return docVector;
}

Example #22

Source File: TermTermVectorsFromLucene.java From semanticvectors with BSD 3-Clause "New" or "Revised" License

5 votes

/**
  * Initialize queue of cached Terms objects
  */


 private synchronized void populateQueue() {
  
  
  
  if (this.totalQueueCount.get() >= luceneUtils.getNumDocs() || randomStartpoints.isEmpty())  
  { if (theQ.size() == 0) exhaustedQ.set(true); return; }

int added = 0;
   int startdoc = randomStartpoints.poll();
   int stopdoc  = Math.min(startdoc+ qsize, luceneUtils.getNumDocs());
   
   for (int a = startdoc; a < stopdoc; a++) {
     for (String field : flagConfig.contentsfields())
       try {
         int docID = a;
         Terms incomingTermVector = luceneUtils.getTermVector(a, field);
         totalQueueCount.incrementAndGet();
         if (incomingTermVector != null){ 
         theQ.add(new DocIdTerms(docID,incomingTermVector));
         added++;
         }
       } catch (IOException e) {
         // TODO Auto-generated catch block
         e.printStackTrace();
       }
   }
   
   if (added > 0)
     System.err.println("Initialized TermVector Queue with " + added + " documents");
  
 }

Example #23

Source File: TermVectorLeafReader.java From lucene-solr with Apache License 2.0

5 votes

public TermVectorLeafReader(String field, Terms terms) {
  fields = new Fields() {
    @Override
    public Iterator<String> iterator() {
      return Collections.singletonList(field).iterator();
    }

    @Override
    public Terms terms(String fld) throws IOException {
      if (!field.equals(fld)) {
        return null;
      }
      return terms;
    }

    @Override
    public int size() {
      return 1;
    }
  };

  IndexOptions indexOptions;
  if (!terms.hasFreqs()) {
    indexOptions = IndexOptions.DOCS;
  } else if (!terms.hasPositions()) {
    indexOptions = IndexOptions.DOCS_AND_FREQS;
  } else if (!terms.hasOffsets()) {
    indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
  } else {
    indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
  }
  FieldInfo fieldInfo = new FieldInfo(field, 0,
                                      true, true, terms.hasPayloads(),
                                      indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0, 0, false);
  fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo});
}

Example #24

Source File: TokenSources.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Returns a token stream generated from a {@link Terms}. This
 * can be used to feed the highlighter with a pre-parsed token
 * stream.  The {@link Terms} must have offsets available. If there are no positions available,
 * all tokens will have position increments reflecting adjacent tokens, or coincident when terms
 * share a start offset. If there are stopwords filtered from the index, you probably want to ensure
 * term vectors have positions so that phrase queries won't match across stopwords.
 *
 * @throws IllegalArgumentException if no offsets are available
 */
@Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getTokenStream(final Terms tpv) throws IOException {

  if (!tpv.hasOffsets()) {
    throw new IllegalArgumentException("Highlighting requires offsets from the TokenStream.");
    //TokenStreamFromTermVector can handle a lack of offsets if there are positions. But
    // highlighters require offsets, so we insist here.
  }

  return new TokenStreamFromTermVector(tpv, -1); // TODO propagate maxStartOffset; see LUCENE-6445
}

Example #25

Source File: JoinDocFreqValueSource.java From lucene-solr with Apache License 2.0

5 votes

@Override
public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext) throws IOException
{
  final BinaryDocValues terms = DocValues.getBinary(readerContext.reader(), field);
  final IndexReader top = ReaderUtil.getTopLevelContext(readerContext).reader();
  Terms t = MultiTerms.getTerms(top, qfield);
  final TermsEnum termsEnum = t == null ? TermsEnum.EMPTY : t.iterator();
  
  return new IntDocValues(this) {

    int lastDocID = -1;

    @Override
    public int intVal(int doc) throws IOException {
      if (doc < lastDocID) {
        throw new IllegalArgumentException("docs were sent out-of-order: lastDocID=" + lastDocID + " vs docID=" + doc);
      }
      lastDocID = doc;
      int curDocID = terms.docID();
      if (doc > curDocID) {
        curDocID = terms.advance(doc);
      }
      if (doc == curDocID) {
        BytesRef term = terms.binaryValue();
        if (termsEnum.seekExact(term)) {
          return termsEnum.docFreq();
        }
      }
      return 0;
    }
  };
}

Example #26

Source File: PostingsWithTermVectorsOffsetStrategy.java From lucene-solr with Apache License 2.0

5 votes

@Override
public OffsetsEnum getOffsetsEnum(LeafReader leafReader, int docId, String content) throws IOException {
  Terms docTerms = leafReader.getTermVector(docId, getField());
  if (docTerms == null) {
    return OffsetsEnum.EMPTY;
  }
  leafReader = new TermVectorFilteredLeafReader(leafReader, docTerms, getField());

  return createOffsetsEnumFromReader(leafReader, docId);
}

Example #27

Source File: TestTopDocsCollector.java From lucene-solr with Apache License 2.0

5 votes

public void testRealisticConcurrentMinimumScore() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  try (LineFileDocs docs = new LineFileDocs(random())) {
    int numDocs = atLeast(100);
    for (int i = 0; i < numDocs; i++) {
      writer.addDocument(docs.nextDoc());
    }
  }

  IndexReader reader = writer.getReader();
  writer.close();

  Terms terms = MultiTerms.getTerms(reader, "body");
  int termCount = 0;
  TermsEnum termsEnum = terms.iterator();
  while(termsEnum.next() != null) {
    termCount++;
  }
  assertTrue(termCount > 0);

  // Target ~10 terms to search:
  double chance = 10.0 / termCount;
  termsEnum = terms.iterator();
  while(termsEnum.next() != null) {
    if (random().nextDouble() <= chance) {
      BytesRef term = BytesRef.deepCopyOf(termsEnum.term());
      Query query = new TermQuery(new Term("body", term));

      TopDocsCollector<ScoreDoc> collector = doSearchWithThreshold(5, 0, query, reader);
      TopDocs tdc = doConcurrentSearchWithThreshold(5, 0, query, reader);
      TopDocs tdc2 = collector.topDocs();

      CheckHits.checkEqual(query, tdc.scoreDocs, tdc2.scoreDocs);
    }
  }

  reader.close();
  dir.close();
}

Example #28

Source File: STUniformSplitTermsWriter.java From lucene-solr with Apache License 2.0

5 votes

@Override
public void merge(MergeState mergeState, NormsProducer normsProducer) throws IOException {
  if (mergeState.needsIndexSort) {
    // This custom merging does not support sorted index.
    // Fall back to the default merge, which is inefficient for this postings format.
    super.merge(mergeState, normsProducer);
    return;
  }
  FieldsProducer[] fieldsProducers = mergeState.fieldsProducers;
  List<TermIterator<SegmentTerms>> segmentTermsList = new ArrayList<>(fieldsProducers.length);
  for (int segmentIndex = 0; segmentIndex < fieldsProducers.length; segmentIndex++) {
    FieldsProducer fieldsProducer = fieldsProducers[segmentIndex];
    // Iterate the FieldInfo provided by mergeState.fieldInfos because they may be
    // filtered by PerFieldMergeState.
    for (FieldInfo fieldInfo : mergeState.fieldInfos[segmentIndex]) {
      // Iterate all fields only the get the *first* Terms instanceof STUniformSplitTerms.
      // See the break below.
      Terms terms = fieldsProducer.terms(fieldInfo.name);
      if (terms != null) {
        if (!(terms instanceof STUniformSplitTerms)) {
          // Terms is not directly an instance of STUniformSplitTerms, it is wrapped/filtered.
          // Fall back to the default merge, which is inefficient for this postings format.
          super.merge(mergeState, normsProducer);
          return;
        }
        STUniformSplitTerms sharedTerms = (STUniformSplitTerms) terms;
        segmentTermsList.add(new SegmentTerms(
            segmentIndex, sharedTerms.createMergingBlockReader(), mergeState.docMaps[segmentIndex]));
        // We have the STUniformSplitTerms for the segment. Break the field
        // loop to iterate the next segment.
        break;
      }
    }
  }
  writeSegment((blockWriter, dictionaryBuilder) -> mergeSegments(mergeState, normsProducer, segmentTermsList, blockWriter, dictionaryBuilder));
}

Example #29

Source File: QualityQueriesFinder.java From lucene-solr with Apache License 2.0

5 votes

private String [] bestTerms(String field,int numTerms) throws IOException {
  PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms);
  IndexReader ir = DirectoryReader.open(dir);
  try {
    int threshold = ir.maxDoc() / 10; // ignore words too common.
    Terms terms = MultiTerms.getTerms(ir, field);
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator();
      while (termsEnum.next() != null) {
        int df = termsEnum.docFreq();
        if (df<threshold) {
          String ttxt = termsEnum.term().utf8ToString();
          pq.insertWithOverflow(new TermDf(ttxt,df));
        }
      }
    }
  } finally {
    ir.close();
  }
  String res[] = new String[pq.size()];
  int i = 0;
  while (pq.size()>0) {
    TermDf tdf = pq.pop(); 
    res[i++] = tdf.word;
    System.out.println(i+".   word:  "+tdf.df+"   "+tdf.word);
  }
  return res;
}

Example #30

Source File: TokenStreamFromTermVector.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Constructor. The uninversion doesn't happen here; it's delayed till the first call to
 * {@link #incrementToken}.
 *
 * @param vector Terms that contains the data for
 *        creating the TokenStream. Must have positions and/or offsets.
 * @param maxStartOffset if a token's start offset exceeds this then the token is not added. -1 disables the limit.
 */
public TokenStreamFromTermVector(Terms vector, int maxStartOffset) throws IOException {
  this.maxStartOffset = maxStartOffset < 0 ? Integer.MAX_VALUE : maxStartOffset;
  assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
  if (!vector.hasPositions() && !vector.hasOffsets()) {
    throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
  }
  assert vector.hasFreqs();
  this.vector = vector;
  termAttribute = addAttribute(CharTermAttribute.class);
  positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
}