Java Code Examples for org.apache.lucene.index.LeafReader#terms()

The following examples show how to use org.apache.lucene.index.LeafReader#terms(). Each example notes its source file, the project it comes from, and that project's license.
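Before the project examples, here is a minimal self-contained sketch of the pattern they all share: ask the LeafReader for the field's Terms, null-check the result (terms() returns null when a segment has no postings for that field), then walk the TermsEnum and, where needed, a reused PostingsEnum. This is an illustrative sketch, not code from any of the projects below; the class name, method name, and counting logic are placeholders.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

public class TermsWalkSketch {

  /** Visits every term and posting of {@code field}, segment by segment. */
  static void walkField(IndexReader reader, String field) throws IOException {
    long visited = 0;
    PostingsEnum postings = null; // reused across terms to avoid reallocation
    for (LeafReaderContext ctx : reader.leaves()) {
      LeafReader leaf = ctx.reader();
      Terms terms = leaf.terms(field); // null if this segment has no postings for the field
      if (terms == null) {
        continue;
      }
      TermsEnum termsEnum = terms.iterator();
      BytesRef term;
      while ((term = termsEnum.next()) != null) {
        postings = termsEnum.postings(postings, PostingsEnum.NONE); // no freqs/positions needed
        int doc;
        while ((doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          // doc is segment-local; add ctx.docBase for an index-wide doc id
          visited++;
        }
      }
    }
    System.out.println(field + ": visited " + visited + " postings");
  }
}

The project examples below follow this same shape, differing mainly in what they do with each term and posting.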
Example 1
Source File: GeoPointArrayIndexFieldData.java    From Elasticsearch with Apache License 2.0
@Override
public AtomicGeoPointFieldData loadDirect(LeafReaderContext context) throws Exception {
    LeafReader reader = context.reader();

    Terms terms = reader.terms(getFieldNames().indexName());
    AtomicGeoPointFieldData data = null;
    // TODO: Use an actual estimator to estimate before loading.
    NonEstimatingEstimator estimator = new NonEstimatingEstimator(breakerService.getBreaker(CircuitBreaker.FIELDDATA));
    if (terms == null) {
        data = AbstractAtomicGeoPointFieldData.empty(reader.maxDoc());
        estimator.afterLoad(null, data.ramBytesUsed());
        return data;
    }
    return (Version.indexCreated(indexSettings).before(Version.V_2_2_0)) ?
        loadLegacyFieldData(reader, estimator, terms, data) : loadFieldData22(reader, estimator, terms, data);
}
 
Example 2
Source File: CodecCollector.java    From mtas with Apache License 2.0
/**
 * Collect collection.
 *
 * @param reader
 *          the reader
 * @param docSet
 *          the doc set
 * @param collectionInfo
 *          the collection info
 * @throws IOException
 *           Signals that an I/O exception has occurred.
 */
public static void collectCollection(IndexReader reader, List<Integer> docSet,
    ComponentCollection collectionInfo) throws IOException {
  if (collectionInfo.action().equals(ComponentCollection.ACTION_CHECK)) {
    // can't do anything in lucene for check
  } else if (collectionInfo.action()
      .equals(ComponentCollection.ACTION_LIST)) {
    // can't do anything in lucene for list
  } else if (collectionInfo.action()
      .equals(ComponentCollection.ACTION_CREATE)) {
    BytesRef term = null;
    PostingsEnum postingsEnum = null;
    Integer docId;
    Integer termDocId = -1;
    Terms terms;
    LeafReaderContext lrc;
    LeafReader r;
    ListIterator<LeafReaderContext> iterator = reader.leaves().listIterator();
    while (iterator.hasNext()) {
      lrc = iterator.next();
      r = lrc.reader();
      for (String field : collectionInfo.fields()) {
        if ((terms = r.terms(field)) != null) {
          TermsEnum termsEnum = terms.iterator();
          while ((term = termsEnum.next()) != null) {
            Iterator<Integer> docIterator = docSet.iterator();
            postingsEnum = termsEnum.postings(postingsEnum,
                PostingsEnum.NONE);
            termDocId = -1;
            while (docIterator.hasNext()) {
              docId = docIterator.next() - lrc.docBase;
              if ((docId >= termDocId) && ((docId.equals(termDocId))
                  || ((termDocId = postingsEnum.advance(docId))
                      .equals(docId)))) {
                collectionInfo.addValue(term.utf8ToString());
                break;
              }
              if (termDocId.equals(PostingsEnum.NO_MORE_DOCS)) {
                break;
              }
            }
          }
        }
      }
    }
  }
}
 
Example 3
Source File: ShardSplittingQuery.java    From crate with Apache License 2.0
private static void findSplitDocs(String idField, Predicate<BytesRef> includeInShard, LeafReader leafReader,
                                  IntConsumer consumer) throws IOException {
    Terms terms = leafReader.terms(idField);
    TermsEnum iterator = terms.iterator();
    BytesRef idTerm;
    PostingsEnum postingsEnum = null;
    while ((idTerm = iterator.next()) != null) {
        if (includeInShard.test(idTerm) == false) {
            postingsEnum = iterator.postings(postingsEnum);
            int doc;
            while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                consumer.accept(doc);
            }
        }
    }
}
 
Example 4
Source File: AbstractPrefixTreeQuery.java    From lucene-solr with Apache License 2.0
public BaseTermsEnumTraverser(LeafReaderContext context) throws IOException {
  this.context = context;
  LeafReader reader = context.reader();
  this.maxDoc = reader.maxDoc();
  terms = reader.terms(fieldName);
  if (terms != null) {
    this.termsEnum = terms.iterator();
  } else {
    this.termsEnum = null;
  }
}
 
Example 5
Source File: CompletionWeight.java    From lucene-solr with Apache License 2.0
@Override
public BulkScorer bulkScorer(final LeafReaderContext context) throws IOException {
  final LeafReader reader = context.reader();
  final Terms terms;
  final NRTSuggester suggester;
  if ((terms = reader.terms(completionQuery.getField())) == null) {
    return null;
  }
  if (terms instanceof CompletionTerms) {
    CompletionTerms completionTerms = (CompletionTerms) terms;
    if ((suggester = completionTerms.suggester()) == null) {
      // a segment can have a null suggester
      // i.e. no FST was built
      return null;
    }
  } else {
    throw new IllegalArgumentException(completionQuery.getField() + " is not a SuggestField");
  }

  BitsProducer filter = completionQuery.getFilter();
  Bits filteredDocs = null;
  if (filter != null) {
    filteredDocs = filter.getBits(context);
    if (filteredDocs.getClass() == Bits.MatchNoBits.class) {
      return null;
    }
  }
  return new CompletionScorer(this, suggester, reader, filteredDocs, filter != null, automaton);
}
 
Example 6
Source File: DirectoryTaxonomyWriter.java    From lucene-solr with Apache License 2.0
/**
 * Takes the categories from the given taxonomy directory, and adds the
 * missing ones to this taxonomy. Additionally, it fills the given
 * {@link OrdinalMap} with a mapping from the original ordinal to the new
 * ordinal.
 */
public void addTaxonomy(Directory taxoDir, OrdinalMap map) throws IOException {
  ensureOpen();
  DirectoryReader r = DirectoryReader.open(taxoDir);
  try {
    final int size = r.numDocs();
    final OrdinalMap ordinalMap = map;
    ordinalMap.setSize(size);
    int base = 0;
    PostingsEnum docs = null;
    for (final LeafReaderContext ctx : r.leaves()) {
      final LeafReader ar = ctx.reader();
      final Terms terms = ar.terms(Consts.FULL);
      // TODO: share per-segment TermsEnum here!
      TermsEnum te = terms.iterator();
      while (te.next() != null) {
        FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(te.term().utf8ToString()));
        final int ordinal = addCategory(cp);
        docs = te.postings(docs, PostingsEnum.NONE);
        ordinalMap.addMapping(docs.nextDoc() + base, ordinal);
      }
      base += ar.maxDoc(); // no deletions, so we're ok
    }
    ordinalMap.addDone();
  } finally {
    r.close();
  }
}
 
Example 7
Source File: TermFilteredPresearcher.java    From lucene-solr with Apache License 2.0
private Query buildFilterClause(LeafReader reader, String field) throws IOException {

    Terms terms = reader.terms(field);
    if (terms == null)
      return null;

    BooleanQuery.Builder bq = new BooleanQuery.Builder();

    int docsInBatch = reader.maxDoc();

    BytesRef term;
    TermsEnum te = terms.iterator();
    while ((term = te.next()) != null) {
      // we need to check that every document in the batch has the same field values, otherwise
      // this filtering will not work
      if (te.docFreq() != docsInBatch)
        throw new IllegalArgumentException("Some documents in this batch do not have a term value of "
            + field + ":" + Term.toString(term));
      bq.add(new TermQuery(new Term(field, BytesRef.deepCopyOf(term))), BooleanClause.Occur.SHOULD);
    }

    BooleanQuery built = bq.build();

    if (built.clauses().size() == 0)
      return null;

    return built;
}
 
Example 8
Source File: CompletionQuery.java    From lucene-solr with Apache License 2.0
@Override
public Query rewrite(IndexReader reader) throws IOException {
  byte type = 0;
  boolean first = true;
  Terms terms;
  for (LeafReaderContext context : reader.leaves()) {
    LeafReader leafReader = context.reader();
    try {
      if ((terms = leafReader.terms(getField())) == null) {
        continue;
      }
    } catch (IOException e) {
      continue;
    }
    if (terms instanceof CompletionTerms) {
      CompletionTerms completionTerms = (CompletionTerms) terms;
      byte t = completionTerms.getType();
      if (first) {
        type = t;
        first = false;
      } else if (type != t) {
        throw new IllegalStateException(getField() + " has values of multiple types");
      }
    }
  }

  if (first == false) {
    if (this instanceof ContextQuery) {
      if (type == SuggestField.TYPE) {
        throw new IllegalStateException(this.getClass().getSimpleName()
            + " can not be executed against a non context-enabled SuggestField: "
            + getField());
      }
    } else {
      if (type == ContextSuggestField.TYPE) {
        return new ContextQuery(this);
      }
    }
  }
  return super.rewrite(reader);
}
 
Example 9
Source File: TermFilteredPresearcher.java    From lucene-solr with Apache License 2.0
@Override
public final Query buildQuery(LeafReader reader, BiPredicate<String, BytesRef> termAcceptor) {
  try {
    DocumentQueryBuilder queryBuilder = getQueryBuilder();
    for (FieldInfo field : reader.getFieldInfos()) {

      Terms terms = reader.terms(field.name);
      if (terms == null) {
        continue;
      }

      TokenStream ts = new TermsEnumTokenStream(terms.iterator());
      for (CustomQueryHandler handler : queryHandlers) {
        ts = handler.wrapTermStream(field.name, ts);
      }

      ts = new FilteringTokenFilter(ts) {
        TermToBytesRefAttribute termAtt = addAttribute(TermToBytesRefAttribute.class);
        @Override
        protected boolean accept() {
          return filterFields.contains(field.name) == false && termAcceptor.test(field.name, termAtt.getBytesRef());
        }
      };

      TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      while (ts.incrementToken()) {
        queryBuilder.addTerm(field.name, BytesRef.deepCopyOf(termAtt.getBytesRef()));
      }
      ts.close();

    }
    Query presearcherQuery = queryBuilder.build();

    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    bq.add(presearcherQuery, BooleanClause.Occur.SHOULD);
    bq.add(new TermQuery(new Term(ANYTOKEN_FIELD, ANYTOKEN)), BooleanClause.Occur.SHOULD);
    presearcherQuery = bq.build();
    if (filterFields.isEmpty() == false) {
      bq = new BooleanQuery.Builder();
      bq.add(presearcherQuery, BooleanClause.Occur.MUST);
      Query filterQuery = buildFilterFields(reader);
      if (filterQuery != null) {
        bq.add(filterQuery, BooleanClause.Occur.FILTER);
        presearcherQuery = bq.build();
      }
    }
    return presearcherQuery;
  } catch (IOException e) {
    // We're a MemoryIndex, so this shouldn't happen...
    throw new RuntimeException(e);
  }
}
 
Example 10
Source File: FieldCacheImpl.java    From lucene-solr with Apache License 2.0
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
    throws IOException {

  final int maxDoc = reader.maxDoc();

  Terms terms = reader.terms(key.field);

  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

  final PagedBytes bytes = new PagedBytes(15);

  int startTermsBPV;

  // TODO: use Uninvert?
  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }

      startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
    } else {
      startTermsBPV = 1;
    }
  } else {
    startTermsBPV = 1;
  }

  PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
  final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);

  int termOrd = 0;

  // TODO: use Uninvert?

  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;

    while(true) {
      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      if (termOrd >= maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }

      termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // Store 1+ ord into packed bits
        docToTermOrd.set(docID, 1+termOrd);
      }
      termOrd++;
    }
  }

  // maybe an int-only impl?
  return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd);
}
 
Example 11
Source File: FieldCacheImpl.java    From lucene-solr with Apache License 2.0
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
    throws IOException {

  // TODO: would be nice to first check if DocTermsIndex
  // was already cached for this field and then return
  // that instead, to avoid insanity

  final int maxDoc = reader.maxDoc();
  Terms terms = reader.terms(key.field);

  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

  final int termCountHardLimit = maxDoc;

  // Holds the actual term data, expanded.
  final PagedBytes bytes = new PagedBytes(15);

  int startBPV;

  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > termCountHardLimit) {
        numUniqueTerms = termCountHardLimit;
      }
      startBPV = PackedInts.bitsRequired(numUniqueTerms*4);
    } else {
      startBPV = 1;
    }
  } else {
    startBPV = 1;
  }

  final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);
  
  // pointer==0 means not set
  bytes.copyUsingLengthPrefix(new BytesRef());

  if (terms != null) {
    int termCount = 0;
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while(true) {
      if (termCount++ == termCountHardLimit) {
        // app is misusing the API (there is more than
        // one term per doc); in this case we make best
        // effort to load what we can (see LUCENE-2142)
        break;
      }

      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      final long pointer = bytes.copyUsingLengthPrefix(term);
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        docToOffset.set(docID, pointer);
      }
    }
  }

  final PackedInts.Reader offsetReader = docToOffset.getMutable();
  Bits docsWithField = new Bits() {
    @Override
    public boolean get(int index) {
      return offsetReader.get(index) != 0;
    }

    @Override
    public int length() {
      return maxDoc;
    }
  };

  wrapper.setDocsWithField(reader, key.field, docsWithField, null);
  // maybe an int-only impl?
  return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader, docsWithField);
}
 
Example 12
Source File: LukeRequestHandler.java    From lucene-solr with Apache License 2.0
private static SimpleOrderedMap<Object> getIndexedFieldsInfo(SolrQueryRequest req)
    throws Exception {

  SolrIndexSearcher searcher = req.getSearcher();
  SolrParams params = req.getParams();

  Set<String> fields = null;
  String fl = params.get(CommonParams.FL);
  if (fl != null) {
    fields = new TreeSet<>(Arrays.asList(fl.split( "[,\\s]+" )));
  }

  LeafReader reader = searcher.getSlowAtomicReader();
  IndexSchema schema = searcher.getSchema();

  // Don't be tempted to put this in the loop below, the whole point here is to alphabetize the fields!
  Set<String> fieldNames = new TreeSet<>();
  for(FieldInfo fieldInfo : reader.getFieldInfos()) {
    fieldNames.add(fieldInfo.name);
  }

  // Walk the term enum and keep a priority queue for each map in our set
  SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();

  for (String fieldName : fieldNames) {
    if (fields != null && ! fields.contains(fieldName) && ! fields.contains("*")) {
      continue; // we're not interested in this field. Still an issue here
    }

    SimpleOrderedMap<Object> fieldMap = new SimpleOrderedMap<>();

    SchemaField sfield = schema.getFieldOrNull( fieldName );
    FieldType ftype = (sfield==null)?null:sfield.getType();

    fieldMap.add( "type", (ftype==null)?null:ftype.getTypeName() );
    fieldMap.add("schema", getFieldFlags(sfield));
    if (sfield != null && schema.isDynamicField(sfield.getName()) && schema.getDynamicPattern(sfield.getName()) != null) {
      fieldMap.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
    }
    Terms terms = reader.terms(fieldName);
    if (terms == null) { // Not indexed, so we need to report what we can (it made it through the fl param if specified)
      finfo.add( fieldName, fieldMap );
      continue;
    }

    if(sfield != null && sfield.indexed() ) {
      if (params.getBool(INCLUDE_INDEX_FIELD_FLAGS,true)) {
        Document doc = getFirstLiveDoc(terms, reader);

        if (doc != null) {
          // Found a document with this field
          try {
            IndexableField fld = doc.getField(fieldName);
            if (fld != null) {
              fieldMap.add("index", getFieldFlags(fld));
            } else {
              // it is a non-stored field...
              fieldMap.add("index", "(unstored field)");
            }
          } catch (Exception ex) {
            log.warn("error reading field: {}", fieldName);
          }
        }
      }
      fieldMap.add("docs", terms.getDocCount());
    }
    if (fields != null && (fields.contains(fieldName) || fields.contains("*"))) {
      getDetailedFieldInfo(req, fieldName, fieldMap);
    }
    // Add the field
    finfo.add( fieldName, fieldMap );
  }
  return finfo;
}
 
Example 13
Source File: DocSetInfoCommand.java    From clue with Apache License 2.0
@Override
public void execute(Namespace args, PrintStream out) throws Exception {
  String field = args.getString("field");
  String termVal = null;
  int bucketSize = args.getInt("size");

  if (field != null){
    String[] parts = field.split(":");
    if (parts.length > 1){
      field = parts[0];
      termVal = parts[1];
    }
  }
  
  IndexReader reader = ctx.getIndexReader();
  List<LeafReaderContext> leaves = reader.leaves();
  

  PostingsEnum postingsEnum = null;
  for (LeafReaderContext leaf : leaves) {
    LeafReader atomicReader = leaf.reader();
    Terms terms = atomicReader.terms(field);
    if (terms == null){
      continue;
    }
    if (terms != null && termVal != null){        
      TermsEnum te = terms.iterator();
      
      if (te.seekExact(new BytesRef(termVal))){
        postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);
        
        int docFreq = te.docFreq();
        
        int minDocId = -1, maxDocId = -1;
        int doc, count = 0;
        
        int[] percentDocs = new int[PERCENTILES.length];
        
        int percentileIdx = 0;
        
        while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          maxDocId = doc;
          if (minDocId == -1) {
            minDocId = doc;
          }
          count ++;
          
          double perDocs = (double) count / (double) docFreq * 100.0;
          while (percentileIdx < percentDocs.length) {
            if (perDocs > PERCENTILES[percentileIdx]) {
              percentDocs[percentileIdx] = doc;
              percentileIdx++;
            } else {
              break;
            }
          }
        }
        
        // calculate histogram          
        int[] buckets = null;
        if (maxDocId > 0) {
          buckets = new int[maxDocId / bucketSize + 1];
          
          postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);
          while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            int bucketIdx = doc / bucketSize;
            buckets[bucketIdx]++;
          }
        }
        
        double density = (double) docFreq / (double) (maxDocId - minDocId) ; 
        out.println(String.format("min: %d, max: %d, count: %d, density: %.2f", minDocId, maxDocId, docFreq, density));
        out.println("percentiles: " + Arrays.toString(PERCENTILES) + " => " + Arrays.toString(percentDocs));
        out.println("histogram: (bucketsize=" + bucketSize+")");
        out.println(Arrays.toString(buckets));
      }
    }
  }
}