Java Code Examples for org.apache.lucene.util.StringHelper#startsWith()

The following examples show how to use org.apache.lucene.util.StringHelper#startsWith(). They are drawn from open-source projects; the source file and project for each example are noted above it.
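Before the project examples, here is a minimal, self-contained sketch of the method's contract (the demo class below is illustrative and is not taken from any of the listed projects). StringHelper.startsWith(BytesRef ref, BytesRef prefix) returns true when the byte content of ref begins with the byte content of prefix; because the comparison is over raw UTF-8 bytes, it is case-sensitive.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;

public class StartsWithDemo {
  public static void main(String[] args) {
    // new BytesRef(String) encodes the string as UTF-8 bytes.
    BytesRef term = new BytesRef("apple");

    // true: the bytes of "apple" begin with the bytes of "app"
    System.out.println(StringHelper.startsWith(term, new BytesRef("app")));

    // false: the check compares raw bytes, so it is case-sensitive
    System.out.println(StringHelper.startsWith(term, new BytesRef("App")));
  }
}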
Example 1
Source File: SimpleTextFieldsReader.java    From lucene-solr with Apache License 2.0
private TreeMap<String,Long> readFields(IndexInput in) throws IOException {
  ChecksumIndexInput input = new BufferedChecksumIndexInput(in);
  BytesRefBuilder scratch = new BytesRefBuilder();
  TreeMap<String,Long> fields = new TreeMap<>();

  while (true) {
    SimpleTextUtil.readLine(input, scratch);
    if (scratch.get().equals(END)) {
      SimpleTextUtil.checkFooter(input);
      return fields;
    } else if (StringHelper.startsWith(scratch.get(), FIELD)) {
      String fieldName = new String(scratch.bytes(), FIELD.length, scratch.length() - FIELD.length, StandardCharsets.UTF_8);
      fields.put(fieldName, input.getFilePointer());
    }
  }
}
 
Example 2
Source File: MultiPhrasePrefixQuery.java    From Elasticsearch with Apache License 2.0
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
        Terms _terms = leaf.reader().terms(field);
        if (_terms == null) {
            continue;
        }

        TermsEnum termsEnum = _terms.iterator();
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
        if (TermsEnum.SeekStatus.END == seekStatus) {
            continue;
        }

        for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
            if (!StringHelper.startsWith(term, prefix.bytes())) {
                break;
            }

            terms.add(new Term(field, BytesRef.deepCopyOf(term)));
            if (terms.size() >= maxExpansions) {
                return;
            }
        }
    }
}
 
Example 3
Source File: SimpleTextUtil.java    From lucene-solr with Apache License 2.0
public static void checkFooter(ChecksumIndexInput input) throws IOException {
  BytesRefBuilder scratch = new BytesRefBuilder();
  String expectedChecksum = String.format(Locale.ROOT, "%020d", input.getChecksum());
  readLine(input, scratch);
  if (StringHelper.startsWith(scratch.get(), CHECKSUM) == false) {
    throw new CorruptIndexException("SimpleText failure: expected checksum line but got " + scratch.get().utf8ToString(), input);
  }
  String actualChecksum = new BytesRef(scratch.bytes(), CHECKSUM.length, scratch.length() - CHECKSUM.length).utf8ToString();
  if (!expectedChecksum.equals(actualChecksum)) {
    throw new CorruptIndexException("SimpleText checksum failure: " + actualChecksum + " != " + expectedChecksum, input);
  }
  if (input.length() != input.getFilePointer()) {
    throw new CorruptIndexException("Unexpected stuff at the end of file, please be careful with your text editor!", input);
  }
}
 
Example 4
Source File: SrndTruncQuery.java    From lucene-solr with Apache License 2.0
@Override
public void visitMatchingTerms(
  IndexReader reader,
  String fieldName,
  MatchingTermVisitor mtv) throws IOException
{
  int prefixLength = prefix.length();
  Terms terms = MultiTerms.getTerms(reader, fieldName);
  if (terms != null) {
    Matcher matcher = pattern.matcher("");
    try {
      TermsEnum termsEnum = terms.iterator();

      TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);
      BytesRef text;
      if (status == TermsEnum.SeekStatus.FOUND) {
        text = prefixRef;
      } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
        text = termsEnum.term();
      } else {
        text = null;
      }

      while (text != null) {
        if (StringHelper.startsWith(text, prefixRef)) {
          String textString = text.utf8ToString();
          matcher.reset(textString.substring(prefixLength));
          if (matcher.matches()) {
            mtv.visitMatchingTerm(new Term(fieldName, textString));
          }
        } else {
          break;
        }
        text = termsEnum.next();
      }
    } finally {
      matcher.reset();
    }
  }
}
 
Example 5
Source File: SrndPrefixQuery.java    From lucene-solr with Apache License 2.0
@Override
public void visitMatchingTerms(
  IndexReader reader,
  String fieldName,
  MatchingTermVisitor mtv) throws IOException
{
  /* inspired by PrefixQuery.rewrite(): */
  Terms terms = MultiTerms.getTerms(reader, fieldName);
  if (terms != null) {
    TermsEnum termsEnum = terms.iterator();

    boolean skip = false;
    TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getPrefix()));
    if (status == TermsEnum.SeekStatus.FOUND) {
      mtv.visitMatchingTerm(getLucenePrefixTerm(fieldName));
    } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
      if (StringHelper.startsWith(termsEnum.term(), prefixRef)) {
        mtv.visitMatchingTerm(new Term(fieldName, termsEnum.term().utf8ToString()));
      } else {
        skip = true;
      }
    } else {
      // EOF
      skip = true;
    }

    if (!skip) {
      while (true) {
        BytesRef text = termsEnum.next();
        if (text != null && StringHelper.startsWith(text, prefixRef)) {
          mtv.visitMatchingTerm(new Term(fieldName, text.utf8ToString()));
        } else {
          break;
        }
      }
    }
  }
}
 
Example 6
Source File: DocTermOrds.java    From lucene-solr with Apache License 2.0
private BytesRef setTerm() throws IOException {
  term = termsEnum.term();
  //System.out.println("  setTerm() term=" + term.utf8ToString() + " vs prefix=" + (prefix == null ? "null" : prefix.utf8ToString()));
  if (prefix != null && !StringHelper.startsWith(term, prefix)) {
    term = null;
  }
  return term;
}
 
Example 7
Source File: MultiPhrasePrefixQuery.java    From crate with Apache License 2.0
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
        Terms _terms = leaf.reader().terms(field);
        if (_terms == null) {
            continue;
        }

        TermsEnum termsEnum = _terms.iterator();
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
        if (TermsEnum.SeekStatus.END == seekStatus) {
            continue;
        }

        for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
            if (!StringHelper.startsWith(term, prefix.bytes())) {
                break;
            }

            terms.add(new Term(field, BytesRef.deepCopyOf(term)));
            if (terms.size() >= maxExpansions) {
                return;
            }
        }
    }
}
 
Example 8
Source File: SimpleTextPointsReader.java    From lucene-solr with Apache License 2.0
private boolean startsWith(BytesRef prefix) {
  return StringHelper.startsWith(scratch.get(), prefix);
}
 
Example 9
Source File: SimpleTextBKDReader.java    From lucene-solr with Apache License 2.0
private boolean startsWith(BytesRefBuilder scratch, BytesRef prefix) {
  return StringHelper.startsWith(scratch.get(), prefix);
}
 
Example 10
Source File: SimpleTextDocValuesReader.java    From lucene-solr with Apache License 2.0
/** Used only in ctor: */
private boolean startsWith(BytesRef prefix) {
  return StringHelper.startsWith(scratch.get(), prefix);
}
 
Example 11
Source File: SimpleTextFieldsReader.java    From lucene-solr with Apache License 2.0
private void loadTerms() throws IOException {
  PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
  final FSTCompiler<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstCompiler;
  final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
  final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
      outputsInner);
  fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
  IndexInput in = SimpleTextFieldsReader.this.in.clone();
  in.seek(termsStart);
  final BytesRefBuilder lastTerm = new BytesRefBuilder();
  long lastDocsStart = -1;
  int docFreq = 0;
  long totalTermFreq = 0;
  FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
  final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
  while (true) {
    SimpleTextUtil.readLine(in, scratch);
    if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
            outputs.newPair(lastDocsStart,
                outputsInner.newPair((long) docFreq, totalTermFreq)));
        sumTotalTermFreq += totalTermFreq;
      }
      break;
    } else if (StringHelper.startsWith(scratch.get(), DOC)) {
      docFreq++;
      sumDocFreq++;
      totalTermFreq++;
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), DOC.length, scratch.length()-DOC.length);
      int docID = ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length());
      visitedDocs.set(docID);
    } else if (StringHelper.startsWith(scratch.get(), FREQ)) {
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), FREQ.length, scratch.length()-FREQ.length);
      totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
    } else if (StringHelper.startsWith(scratch.get(), TERM)) {
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
            outputsInner.newPair((long) docFreq, totalTermFreq)));
      }
      lastDocsStart = in.getFilePointer();
      final int len = scratch.length() - TERM.length;
      lastTerm.grow(len);
      System.arraycopy(scratch.bytes(), TERM.length, lastTerm.bytes(), 0, len);
      lastTerm.setLength(len);
      docFreq = 0;
      sumTotalTermFreq += totalTermFreq;
      totalTermFreq = 0;
      termCount++;
    }
  }
  docCount = visitedDocs.cardinality();
  fst = fstCompiler.compile();
  /*
  PrintStream ps = new PrintStream("out.dot");
  fst.toDot(ps);
  ps.close();
  System.out.println("SAVED out.dot");
  */
  //System.out.println("FST " + fst.sizeInBytes());
}
 
Example 12
Source File: TestPrefixRandom.java    From lucene-solr with Apache License 2.0
@Override
protected AcceptStatus accept(BytesRef term) throws IOException {
  return StringHelper.startsWith(term, prefix) ? AcceptStatus.YES : AcceptStatus.NO;
}
 
Example 13
Source File: SplitOp.java    From lucene-solr with Apache License 2.0
/**
 *   Returns a list of range counts sorted by the range lower bound, using the indexed "id" field (i.e. the terms are full IDs, not just prefixes)
 */
static Collection<RangeCount> getHashHistogramFromId(SolrIndexSearcher searcher, String idField, DocRouter router, DocCollection collection) throws IOException {
  RTimer timer = new RTimer();

  TreeMap<DocRouter.Range, RangeCount> counts = new TreeMap<>();

  Terms terms = MultiTerms.getTerms(searcher.getIndexReader(), idField);
  if (terms == null) {
    return counts.values();
  }

  int numPrefixes = 0;
  int numCollisions = 0;
  long sumBuckets = 0;


  byte sep = (byte) CompositeIdRouter.SEPARATOR.charAt(0);
  TermsEnum termsEnum = terms.iterator();
  BytesRef currPrefix = new BytesRef();  // prefix of the previous "id" term
  int bucketCount = 0; // count of the number of docs in the current bucket

  // We're going to iterate over all terms, so do the minimum amount of work per term.
  // Terms are sorted, so all terms sharing a prefix will be grouped together.  The extra work
  // is really just limited to stepping over all the terms in the id field.
  for (;;) {
    BytesRef term = termsEnum.next();

    // compare to current prefix bucket and see if this new term shares the same prefix
    if (term != null && term.length >= currPrefix.length && currPrefix.length > 0) {
      if (StringHelper.startsWith(term, currPrefix)) {
        bucketCount++;  // use 1 since we are dealing with unique ids
        continue;
      }
    }

    // At this point the prefix did not match, so if we had a bucket we were working on, record it.
    if (currPrefix.length > 0) {
      numPrefixes++;
      sumBuckets += bucketCount;
      String currPrefixStr = currPrefix.utf8ToString();
      DocRouter.Range range = router.getSearchRangeSingle(currPrefixStr, null, collection);

      RangeCount rangeCount = new RangeCount(range, bucketCount);
      bucketCount = 0;

      RangeCount prev = counts.put(rangeCount.range, rangeCount);
      if (prev != null) {
        // we hit a hash collision, so add the buckets together.
        rangeCount.count += prev.count;
        numCollisions++;
      }
    }

    // if the current term is null, we ran out of values
    if (term == null) break;

    // find the new prefix (if any)

    // resize if needed
    if (currPrefix.length < term.length) {
      currPrefix.bytes = new byte[term.length+10];
    }

    // Copy the bytes up to and including the separator, and set the length if the separator is found.
    // If there was no separator, then length remains 0 and it's the indicator that we have no prefix bucket
    currPrefix.length = 0;
    for (int i=0; i<term.length; i++) {
      byte b = term.bytes[i + term.offset];
      currPrefix.bytes[i] = b;
      if (b == sep) {
        currPrefix.length = i + 1;
        bucketCount++;
        break;
      }
    }
  }

  if (log.isInfoEnabled()) {
    log.info("Split histogram from idField {}: ms={}, numBuckets={} sumBuckets={} numPrefixes={} numCollisions={}"
        , idField, timer.getTime(), counts.size(), sumBuckets, numPrefixes, numCollisions);
  }

  return counts.values();
}