org.apache.lucene.index.TermsEnum Java Examples

The following examples show how to use org.apache.lucene.index.TermsEnum. Each example is taken from an open-source project; the source file and license are noted above it.
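Most of the examples below follow the same core pattern: obtain a Terms instance for a field, create a TermsEnum with iterator(), and walk the sorted term dictionary with next(). The minimal sketch below shows that pattern in isolation; the reader variable and field name are illustrative, not taken from any example on this page.

// Minimal sketch of the common TermsEnum iteration pattern.
// Assumes an open IndexReader `reader` and an indexed field "body" (illustrative names).
Terms terms = MultiTerms.getTerms(reader, "body");
if (terms != null) { // the field may be absent or have no indexed terms
  TermsEnum termsEnum = terms.iterator();
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
  }
}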
Example #1
Source File: TestCompressingTermVectorsFormat.java    From lucene-solr with Apache License 2.0
public void testNoOrds() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  doc.add(new Field("foo", "this is a test", ft));
  iw.addDocument(doc);
  LeafReader ir = getOnlyLeafReader(iw.getReader());
  Terms terms = ir.getTermVector(0, "foo");
  assertNotNull(terms);
  TermsEnum termsEnum = terms.iterator();
  assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("this")));

  expectThrows(UnsupportedOperationException.class, termsEnum::ord);
  expectThrows(UnsupportedOperationException.class, () -> termsEnum.seekExact(0));

  ir.close();
  iw.close();
  dir.close();
}
 
Example #2
Source File: TestUtil.java    From lucene-solr with Apache License 2.0
public static PostingsEnum docs(Random random, TermsEnum termsEnum, PostingsEnum reuse, int flags) throws IOException {
  // TODO: simplify this method? it would be easier to randomly either use the flags passed, or do the random selection,
  // FREQS should be part of the random selection instead of outside on its own?
  if (random.nextBoolean()) {
    if (random.nextBoolean()) {
      final int posFlags;
      switch (random.nextInt(4)) {
        case 0: posFlags = PostingsEnum.POSITIONS; break;
        case 1: posFlags = PostingsEnum.OFFSETS; break;
        case 2: posFlags = PostingsEnum.PAYLOADS; break;
        default: posFlags = PostingsEnum.ALL; break;
      }
      return termsEnum.postings(null, posFlags);
    }
    flags |= PostingsEnum.FREQS;
  }
  return termsEnum.postings(reuse, flags);
}
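The flags passed to postings() select how much per-term data the returned PostingsEnum exposes (FREQS, POSITIONS, OFFSETS, PAYLOADS, or ALL). As a hedged sketch, assuming termsEnum is positioned on a term whose field was indexed with offsets, positions and offsets can be read like this:

// Sketch: reading per-document positions and offsets for the current term.
// Assumes the field was indexed with offsets; otherwise startOffset() returns -1.
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
int doc;
while ((doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
  int freq = postings.freq();
  for (int i = 0; i < freq; i++) {
    int position = postings.nextPosition();
    System.out.println("doc=" + doc + " pos=" + position
        + " offsets=" + postings.startOffset() + "-" + postings.endOffset());
  }
}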
 
Example #3
Source File: DfsOnlyRequest.java    From Elasticsearch with Apache License 2.0
public DfsOnlyRequest(Fields termVectorsFields, String[] indices, String[] types, Set<String> selectedFields) throws IOException {
    super(indices);

    // build a search request with a query of all the terms
    final BoolQueryBuilder boolBuilder = boolQuery();
    for (String fieldName : termVectorsFields) {
        if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
            continue;
        }
        Terms terms = termVectorsFields.terms(fieldName);
        TermsEnum iterator = terms.iterator();
        while (iterator.next() != null) {
            String text = iterator.term().utf8ToString();
            boolBuilder.should(QueryBuilders.termQuery(fieldName, text));
        }
    }
    // wrap a search request object
    this.searchRequest = new SearchRequest(indices).types(types).source(new SearchSourceBuilder().query(boolBuilder));
}
 
Example #4
Source File: BlockTermsWriter.java    From lucene-solr with Apache License 2.0
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {

  for(String field : fields) {

    Terms terms = fields.terms(field);
    if (terms == null) {
      continue;
    }

    TermsEnum termsEnum = terms.iterator();

    TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));

    while (true) {
      BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }

      termsWriter.write(term, termsEnum, norms);
    }

    termsWriter.finish();
  }
}
 
Example #5
Source File: SimpleNaiveBayesClassifier.java    From lucene-solr with Apache License 2.0
/**
 * Calculate probabilities for all classes for a given input text
 * @param inputDocument the input text as a {@code String}
 * @return a {@code List} of {@code ClassificationResult}, one for each existing class
 * @throws IOException if assigning probabilities fails
 */
protected List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
  List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();

  Terms classes = MultiTerms.getTerms(indexReader, classFieldName);
  if (classes != null) {
    TermsEnum classesEnum = classes.iterator();
    BytesRef next;
    String[] tokenizedText = tokenize(inputDocument);
    int docsWithClassSize = countDocsWithClass();
    while ((next = classesEnum.next()) != null) {
      if (next.length > 0) {
        Term term = new Term(this.classFieldName, next);
        double clVal = calculateLogPrior(term, docsWithClassSize) + calculateLogLikelihood(tokenizedText, term, docsWithClassSize);
        assignedClasses.add(new ClassificationResult<>(term.bytes(), clVal));
      }
    }
  }
  // normalization; the values are transformed to a 0-1 range
  return normClassificationResults(assignedClasses);
}
 
Example #6
Source File: ReconstructCommand.java    From clue with Apache License 2.0
public String reconstructNoPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException {
  List<String> textList = new ArrayList<String>();
  BytesRef text;
  PostingsEnum postings = null;
  while ((text = te.next()) != null) {
    postings = te.postings(postings, PostingsEnum.FREQS);
    int iterDoc = postings.advance(docid);
    if (iterDoc == docid) {
      textList.add(text.utf8ToString());
    }
  }
  StringBuilder buf = new StringBuilder();
  for (String s : textList) {
    buf.append(s).append(' ');
  }
  return buf.toString();
}
 
Example #7
Source File: TestRTGBase.java    From lucene-solr with Apache License 2.0
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
  Terms terms = MultiTerms.getTerms(r, t.field());
  if (terms == null) return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator();
  if (!termsEnum.seekExact(termBytes)) {
    return -1;
  }
  PostingsEnum docs = termsEnum.postings(null, PostingsEnum.NONE);
  docs = BitsFilteredPostingsEnum.wrap(docs, MultiBits.getLiveDocs(r));
  int id = docs.nextDoc();
  if (id != DocIdSetIterator.NO_MORE_DOCS) {
    int next = docs.nextDoc();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, next);
  }
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
 
Example #8
Source File: ShardSplittingQuery.java    From crate with Apache License 2.0
private static void findSplitDocs(String idField, Predicate<BytesRef> includeInShard, LeafReader leafReader,
                                  IntConsumer consumer) throws IOException {
    Terms terms = leafReader.terms(idField);
    TermsEnum iterator = terms.iterator();
    BytesRef idTerm;
    PostingsEnum postingsEnum = null;
    while ((idTerm = iterator.next()) != null) {
        if (includeInShard.test(idTerm) == false) {
            postingsEnum = iterator.postings(postingsEnum);
            int doc;
            while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                consumer.accept(doc);
            }
        }
    }
}
 
Example #9
Source File: ReadmeSimilarityCalculator.java    From scava with Eclipse Public License 2.0
private DocVector[] getDocumentVectors() throws IOException {
	DocVector[] docVector = new DocVector[getTotalDocumentInIndex()];
	for (int docId = 0; docId < getTotalDocumentInIndex(); docId++) {
		Terms vector = getIndexReader().getTermVector(docId, FIELD_CONTENT);
		TermsEnum termsEnum = vector.iterator();
		BytesRef text;
		docVector[docId] = new DocVector(getAllTerms());
		while ((text = termsEnum.next()) != null) {
			String term = text.utf8ToString();
			int freq = (int) termsEnum.totalTermFreq();
			docVector[docId].setEntry(term, freq);
		}
		docVector[docId].normalize();
	}
	getIndexReader().close();
	return docVector;
}
 
Example #10
Source File: TermVectorsAdapter.java    From lucene-solr with Apache License 2.0
/**
 * Returns the term vectors for the specified field in the specified document.
 * If no term vector is available for the field, an empty list is returned.
 *
 * @param docid - document id
 * @param field - field name
 * @return list of term vector elements
 * @throws IOException - if there is a low level IO error.
 */
List<TermVectorEntry> getTermVector(int docid, String field) throws IOException {
  Terms termVector = reader.getTermVector(docid, field);
  if (termVector == null) {
    // no term vector available
    log.warn("No term vector indexed for doc: #{} and field: {}", docid, field);
    return Collections.emptyList();
  }

  List<TermVectorEntry> res = new ArrayList<>();
  TermsEnum te = termVector.iterator();
  while (te.next() != null) {
    res.add(TermVectorEntry.of(te));
  }
  return res;
}
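A term-vector Terms instance behaves like any other Terms, so per-term statistics come from the same TermsEnum calls. A hedged sketch that reads each term's within-document frequency from a term vector (reader, docid, and field name are illustrative):

// Sketch: reading per-term frequency from one document's term vector.
Terms vector = reader.getTermVector(docid, "content");
if (vector != null) {
  TermsEnum te = vector.iterator();
  BytesRef term;
  while ((term = te.next()) != null) {
    // a term vector covers a single document, so totalTermFreq() is the within-doc frequency
    System.out.println(term.utf8ToString() + " freq=" + te.totalTermFreq());
  }
}

Example #9 above relies on the same property when it casts totalTermFreq() to an int frequency.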
 
Example #11
Source File: IndexImporter.java    From incubator-retired-blur with Apache License 2.0
private void runOldMergeSortRowIdCheckAndDelete(boolean emitDeletes, IndexReader currentIndexReader,
    BlurPartitioner blurPartitioner, Text key, int numberOfShards, int shardId, Action action,
    AtomicReader atomicReader) throws IOException {
  MergeSortRowIdLookup lookup = new MergeSortRowIdLookup(currentIndexReader);
  Fields fields = atomicReader.fields();
  Terms terms = fields.terms(BlurConstants.ROW_ID);
  if (terms != null) {
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef ref = null;
    while ((ref = termsEnum.next()) != null) {
      key.set(ref.bytes, ref.offset, ref.length);
      int partition = blurPartitioner.getPartition(key, null, numberOfShards);
      if (shardId != partition) {
        throw new IOException("Index is corrupted, RowIds are found in wrong shard, partition [" + partition
            + "] does not match shard [" + shardId + "], this can happen when rows are not hashed correctly.");
      }
      if (emitDeletes) {
        lookup.lookup(ref, action);
      }
    }
  }
}
 
Example #12
Source File: LindenFieldCacheImpl.java    From linden with Apache License 2.0
@Override
protected Accountable createValue(final AtomicReader reader, CacheKey key, boolean setDocsWithField)
    throws IOException {
  final Map<String, Integer> uidMap = new HashMap<>();

  Uninvert u = new Uninvert() {
    private String currentValue;

    @Override
    public void visitTerm(BytesRef term) {
      currentValue = term.utf8ToString();
    }

    @Override
    public void visitDoc(int docID) {
      uidMap.put(currentValue, docID);
    }

    @Override
    protected TermsEnum termsEnum(Terms terms) throws IOException {
      return terms.iterator(null);
    }
  };
  u.uninvert(reader, key.field, setDocsWithField);
  return new PerReaderUIDMaps(reader.getContext().ord, uidMap);
}
 
Example #13
Source File: SolrRangeQuery.java    From lucene-solr with Apache License 2.0
public RangeTermsEnum(Terms terms) throws IOException {
  if (terms == null) {
    positioned = true;
  } else {
    te = terms.iterator();
    if (lower != null) {
      TermsEnum.SeekStatus status = te.seekCeil(lower);
      if (status == TermsEnum.SeekStatus.END) {
        positioned = true;
        curr = null;
      } else if (status == SeekStatus.FOUND) {
        positioned = includeLower();
        curr = te.term();
      } else {
        // lower bound not found, so includeLower is irrelevant
        positioned = true;
        curr = te.term();
      }
    }
  }
}
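As this constructor shows, seekCeil() positions the enum on the smallest term greater than or equal to the target and reports how it landed. A small sketch covering all three statuses (the enum te and the target value are illustrative):

// Sketch: the three possible outcomes of TermsEnum.seekCeil.
TermsEnum.SeekStatus status = te.seekCeil(new BytesRef("target"));
switch (status) {
  case FOUND:     // exact match; te.term() equals the target
    break;
  case NOT_FOUND: // te is positioned on the next term greater than the target
    break;
  case END:       // no term >= target; the enum is exhausted and te.term() is undefined
    break;
}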
 
Example #14
Source File: HashTermStatistics.java    From liresolr with GNU General Public License v2.0
public static void addToStatistics(SolrIndexSearcher searcher, String field) throws IOException {
  // check if this field is already in the stats.
//  synchronized (instance) {
    if (termstats.get(field) != null) return;
//  }
  // else add it to the stats.
  Terms terms = searcher.getSlowAtomicReader().terms(field);
  HashMap<String, Integer> term2docFreq = new HashMap<String, Integer>(1000);
  termstats.put(field, term2docFreq);
  if (terms != null) {
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      term2docFreq.put(term.utf8ToString(), termsEnum.docFreq());
    }
  }
}
 
Example #15
Source File: UniformSplitTermsWriter.java    From lucene-solr with Apache License 2.0
@Override
public void write(Fields fields, NormsProducer normsProducer) throws IOException {
  BlockWriter blockWriter = new BlockWriter(blockOutput, targetNumBlockLines, deltaNumLines, blockEncoder);
  ByteBuffersDataOutput fieldsOutput = new ByteBuffersDataOutput();
  int fieldsNumber = 0;
  for (String field : fields) {
    Terms terms = fields.terms(field);
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator();
      FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
      fieldsNumber += writeFieldTerms(blockWriter, fieldsOutput, termsEnum, fieldInfo, normsProducer);
    }
  }
  writeFieldsMetadata(fieldsNumber, fieldsOutput);
  CodecUtil.writeFooter(dictionaryOutput);
}
 
Example #16
Source File: DisjunctionMatchesIterator.java    From lucene-solr with Apache License 2.0
/**
 * Create a {@link DisjunctionMatchesIterator} over a list of terms extracted from a {@link BytesRefIterator}
 *
 * Only terms that have at least one match in the given document will be included
 */
static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException {
  Objects.requireNonNull(field);
  Terms t = context.reader().terms(field);
  if (t == null)
    return null;
  TermsEnum te = t.iterator();
  PostingsEnum reuse = null;
  for (BytesRef term = terms.next(); term != null; term = terms.next()) {
    if (te.seekExact(term)) {
      PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
      if (pe.advance(doc) == doc) {
        return new TermsEnumDisjunctionMatchesIterator(new TermMatchesIterator(query, pe), terms, te, doc, query);
      }
      else {
        reuse = pe;
      }
    }
  }
  return null;
}
 
Example #17
Source File: DocToDoubleVectorUtils.java    From lucene-solr with Apache License 2.0
/**
 * Creates a sparse <code>Double</code> vector from document and field term vectors, using the local frequency of the terms in the document
 *
 * @param docTerms   term vectors for a given document
 * @param fieldTerms field term vectors
 * @return a sparse vector of <code>Double</code>s as an array
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException {
  TermsEnum fieldTermsEnum = fieldTerms.iterator();
  Double[] freqVector = null;
  if (docTerms != null && fieldTerms.size() > -1) {
    freqVector = new Double[(int) fieldTerms.size()];
    int i = 0;
    TermsEnum docTermsEnum = docTerms.iterator();
    BytesRef term;
    while ((term = fieldTermsEnum.next()) != null) {
      TermsEnum.SeekStatus seekStatus = docTermsEnum.seekCeil(term);
      if (seekStatus.equals(TermsEnum.SeekStatus.END)) {
        docTermsEnum = docTerms.iterator();
      }
      if (seekStatus.equals(TermsEnum.SeekStatus.FOUND)) {
        long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document
        freqVector[i] = Long.valueOf(termFreqLocal).doubleValue();
      } else {
        freqVector[i] = 0d;
      }
      i++;
    }
  }
  return freqVector;
}
 
Example #18
Source File: TermGroupFacetCollector.java    From lucene-solr with Apache License 2.0
SegmentResult(int[] counts, int total, TermsEnum tenum, int startFacetOrd, int endFacetOrd) throws IOException {
  super(counts, total - counts[0], counts[0], endFacetOrd+1);
  this.tenum = tenum;
  this.mergePos = startFacetOrd == -1 ? 1 : startFacetOrd+1;
  if (mergePos < maxTermPos) {
    assert tenum != null;
    tenum.seekExact(startFacetOrd == -1 ? 0 : startFacetOrd);
    mergeTerm = tenum.term();
  }
}
 
Example #19
Source File: CompressingTermVectorsReader.java    From lucene-solr with Apache License 2.0
@Override
public TermsEnum iterator() throws IOException {
  TVTermsEnum termsEnum = new TVTermsEnum();
  termsEnum.reset(numTerms, flags, prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths,
      payloadIndex, payloadBytes,
      new ByteArrayDataInput(termBytes.bytes, termBytes.offset, termBytes.length));
  return termsEnum;
}
 
Example #20
Source File: SeekingTermSetTermsEnum.java    From lucene-solr with Apache License 2.0
/**
 * Constructor
 */
public SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefHash terms, int[] ords) {
  super(tenum);
  this.terms = terms;
  this.ords = ords;
  lastElement = terms.size() - 1;
  lastTerm = terms.get(ords[lastElement], new BytesRef());
  seekTerm = terms.get(ords[upto], spare);
}
 
Example #21
Source File: FSTTermsWriter.java    From lucene-solr with Apache License 2.0
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
  for(String field : fields) {
    Terms terms = fields.terms(field);
    if (terms == null) {
      continue;
    }
    FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
    boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    TermsEnum termsEnum = terms.iterator();
    TermsWriter termsWriter = new TermsWriter(fieldInfo);

    long sumTotalTermFreq = 0;
    long sumDocFreq = 0;
    FixedBitSet docsSeen = new FixedBitSet(maxDoc);

    while (true) {
      BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
          
      BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen, norms);
      if (termState != null) {
        termsWriter.finishTerm(term, termState);
        sumTotalTermFreq += termState.totalTermFreq;
        sumDocFreq += termState.docFreq;
      }
    }

    termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
  }
}
 
Example #22
Source File: FieldReader.java    From lucene-solr with Apache License 2.0
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // if (DEBUG) System.out.println("  FieldReader.intersect startTerm=" + BlockTreeTermsWriter.brToString(startTerm));
  //System.out.println("intersect: " + compiled.type + " a=" + compiled.automaton);
  // TODO: we could push "it's a range" or "it's a prefix" down into IntersectTermsEnum?
  // can we optimize knowing that...?
  if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }
  return new IntersectTermsEnum(this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm);
}
 
Example #23
Source File: DisjunctionMatchesIterator.java    From lucene-solr with Apache License 2.0
TermsEnumDisjunctionMatchesIterator(MatchesIterator first, BytesRefIterator terms, TermsEnum te, int doc, Query query) {
  this.first = first;
  this.terms = terms;
  this.te = te;
  this.doc = doc;
  this.query = query;
}
 
Example #24
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0
private void buildFieldMap( ResponseBuilder rb ) throws IOException {
  Log.debug( "buildFieldMap" );
  SolrIndexSearcher searcher = rb.req.getSearcher();
  // build a synonym map from the SortedDocValues -
  // for each field value: lower case, stemmed, lookup synonyms from synonyms.txt - map to fieldValue
  SynonymMap.Builder fieldBuilder = new SynonymMap.Builder( true );
  SynonymMap.Builder termBuilder = new SynonymMap.Builder( true );
    
  ArrayList<String> searchFields = getStringFields( searcher );

  for (String searchField : searchFields ) {
    Log.debug( "adding searchField " + searchField );
    CharsRef fieldChars = new CharsRef( searchField );
    SortedSetDocValues sdv = FieldCache.DEFAULT.getDocTermOrds( searcher.getAtomicReader( ), searchField );
    if (sdv == null) continue;
    Log.debug( "got SortedSetDocValues for " + searchField );
    TermsEnum te = sdv.termsEnum();
    while (te.next() != null) {
      BytesRef term = te.term();
      String fieldValue = term.utf8ToString( );
      addTerm ( fieldChars, fieldValue, fieldBuilder, termBuilder );
    }
  }
    
  addDistributedTerms( rb, fieldBuilder, termBuilder, searchFields );
    
  fieldMap = fieldBuilder.build( );
  termMap = termBuilder.build( );
}
 
Example #25
Source File: JoinDocFreqValueSource.java    From lucene-solr with Apache License 2.0
@Override
public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext) throws IOException
{
  final BinaryDocValues terms = DocValues.getBinary(readerContext.reader(), field);
  final IndexReader top = ReaderUtil.getTopLevelContext(readerContext).reader();
  Terms t = MultiTerms.getTerms(top, qfield);
  final TermsEnum termsEnum = t == null ? TermsEnum.EMPTY : t.iterator();
  
  return new IntDocValues(this) {

    int lastDocID = -1;

    @Override
    public int intVal(int doc) throws IOException {
      if (doc < lastDocID) {
        throw new IllegalArgumentException("docs were sent out-of-order: lastDocID=" + lastDocID + " vs docID=" + doc);
      }
      lastDocID = doc;
      int curDocID = terms.docID();
      if (doc > curDocID) {
        curDocID = terms.advance(doc);
      }
      if (doc == curDocID) {
        BytesRef term = terms.binaryValue();
        if (termsEnum.seekExact(term)) {
          return termsEnum.docFreq();
        }
      }
      return 0;
    }
  };
}
 
Example #26
Source File: TermPrefixCursor.java    From SolrTextTagger with Apache License 2.0
/** Seeks to prefixBuf or the next term that is prefixed by prefixBuf plus the separator char.
 * Sets docIds. **/
private boolean seekPrefix() throws IOException {
  TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefixBuf);

  docIds = null;//invalidate
  switch (seekStatus) {
    case END:
      return false;

    case FOUND:
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
      docIds = postingsEnumToIntsRef(postingsEnum, liveDocs);
      if (docIds.length > 0) {
        return true;
      }

      //Pretend we didn't find it; go to next term
      docIds = null;
      if (termsEnum.next() == null) { // case END
        return false;
      }
      //fall through to NOT_FOUND

    case NOT_FOUND:
      //termsEnum must start with prefixBuf to continue
      BytesRef teTerm = termsEnum.term();

      if (teTerm.length > prefixBuf.length) {
        for (int i = 0; i < prefixBuf.length; i++) {
          if (prefixBuf.bytes[prefixBuf.offset + i] != teTerm.bytes[teTerm.offset + i])
            return false;
        }
        if (teTerm.bytes[teTerm.offset + prefixBuf.length] != SEPARATOR_CHAR)
          return false;
        return true;
      }
      return false;
  }
  throw new IllegalStateException(seekStatus.toString());
}
 
Example #27
Source File: TestOrdsBlockTree.java    From lucene-solr with Apache License 2.0
public void testBasic() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  doc.add(newTextField("field", "a b c", Field.Store.NO));
  w.addDocument(doc);
  IndexReader r = w.getReader();
  TermsEnum te = MultiTerms.getTerms(r, "field").iterator();

  // Test next()
  assertEquals(new BytesRef("a"), te.next());
  assertEquals(0L, te.ord());
  assertEquals(new BytesRef("b"), te.next());
  assertEquals(1L, te.ord());
  assertEquals(new BytesRef("c"), te.next());
  assertEquals(2L, te.ord());
  assertNull(te.next());

  // Test seekExact by term
  assertTrue(te.seekExact(new BytesRef("b")));
  assertEquals(1, te.ord());
  assertTrue(te.seekExact(new BytesRef("a")));
  assertEquals(0, te.ord());
  assertTrue(te.seekExact(new BytesRef("c")));
  assertEquals(2, te.ord());

  // Test seekExact by ord
  te.seekExact(1);
  assertEquals(new BytesRef("b"), te.term());
  te.seekExact(0);
  assertEquals(new BytesRef("a"), te.term());
  te.seekExact(2);
  assertEquals(new BytesRef("c"), te.term());

  r.close();
  w.close();
  dir.close();
}
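Ord-based seeking gives random access into the sorted term dictionary, which makes operations like binary search possible. A hedged sketch, assuming a TermsEnum te whose codec supports ords (otherwise seekExact(long) throws UnsupportedOperationException, as Example #1 demonstrates) and a term count size obtained, for instance, from Terms.size():

// Sketch: binary search for a term by ordinal (requires ord support).
BytesRef target = new BytesRef("b");
long lo = 0, hi = size - 1;
while (lo <= hi) {
  long mid = (lo + hi) >>> 1;
  te.seekExact(mid);          // jump directly to the mid-th term
  int cmp = te.term().compareTo(target);
  if (cmp < 0) lo = mid + 1;
  else if (cmp > 0) hi = mid - 1;
  else break;                 // found: te is positioned on the target
}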
 
Example #28
Source File: BlockTreeTermsWriter.java    From lucene-solr with Apache License 2.0
/** Writes one term's worth of postings. */
public void write(BytesRef text, TermsEnum termsEnum, NormsProducer norms) throws IOException {
  /*
  if (DEBUG) {
    int[] tmp = new int[lastTerm.length];
    System.arraycopy(prefixStarts, 0, tmp, 0, tmp.length);
    System.out.println("BTTW: write term=" + brToString(text) + " prefixStarts=" + Arrays.toString(tmp) + " pending.size()=" + pending.size());
  }
  */

  BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen, norms);
  if (state != null) {

    assert state.docFreq != 0;
    assert fieldInfo.getIndexOptions() == IndexOptions.DOCS || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter;
    pushTerm(text);
   
    PendingTerm term = new PendingTerm(text, state);
    pending.add(term);
    //if (DEBUG) System.out.println("    add pending term = " + text + " pending.size()=" + pending.size());

    sumDocFreq += state.docFreq;
    sumTotalTermFreq += state.totalTermFreq;
    numTerms++;
    if (firstPendingTerm == null) {
      firstPendingTerm = term;
    }
    lastPendingTerm = term;
  }
}
 
Example #29
Source File: SimpleTextTermVectorsReader.java    From lucene-solr with Apache License 2.0
@Override
public long getSumTotalTermFreq() throws IOException {
  // TODO: make it constant-time
  long ttf = 0;
  TermsEnum iterator = iterator();
  for (BytesRef b = iterator.next(); b != null; b = iterator.next()) {
    ttf += iterator.totalTermFreq();
  }
  return ttf;
}
 
Example #30
Source File: MultiPhrasePrefixQuery.java    From crate with Apache License 2.0
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
        Terms _terms = leaf.reader().terms(field);
        if (_terms == null) {
            continue;
        }

        TermsEnum termsEnum = _terms.iterator();
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
        if (TermsEnum.SeekStatus.END == seekStatus) {
            continue;
        }

        for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
            if (!StringHelper.startsWith(term, prefix.bytes())) {
                break;
            }

            terms.add(new Term(field, BytesRef.deepCopyOf(term)));
            if (terms.size() >= maxExpansions) {
                return;
            }
        }
    }
}