org.apache.lucene.util.automaton.CompiledAutomaton Java Examples

The following examples show how to use org.apache.lucene.util.automaton.CompiledAutomaton. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SortedSetDocValues.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@link TermsEnum} over the values, restricted to those accepted by
 * the given {@link CompiledAutomaton}.
 * The returned enum supports {@link TermsEnum#ord()}.
 *
 * @param automaton compiled automaton whose {@code type} selects the enum to return
 * @return {@link TermsEnum#EMPTY} for {@code NONE}, the unfiltered enum for
 *         {@code ALL}, otherwise a filtering wrapper around it
 */
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
  // The underlying enum is always obtained first, whatever the automaton type is.
  final TermsEnum values = termsEnum();
  switch (automaton.type) {
    case NORMAL:
      return new AutomatonTermsEnum(values, automaton);
    case SINGLE:
      return new SingleTermsEnum(values, automaton.term);
    case ALL:
      return values;
    case NONE:
      return TermsEnum.EMPTY;
    default:
      // Not expected to be reached: every type handled above returns.
      throw new RuntimeException("unhandled case");
  }
}
 
Example #2
Source File: MultiTerms.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Intersects every sub with the compiled automaton and merges the resulting
 * enums into a single {@link MultiTermsEnum}.
 *
 * @param compiled automaton forwarded to each sub's {@code intersect}
 * @param startTerm start term forwarded to each sub's {@code intersect}
 * @return the merged enum, or {@link TermsEnum#EMPTY} when no sub returned an enum
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final List<MultiTermsEnum.TermsEnumIndex> matches = new ArrayList<>();
  for (int subIndex = 0; subIndex < subs.length; subIndex++) {
    final TermsEnum perSub = subs[subIndex].intersect(compiled, startTerm);
    if (perSub == null) {
      // This sub contributed nothing; skip it.
      continue;
    }
    matches.add(new MultiTermsEnum.TermsEnumIndex(perSub, subIndex));
  }

  if (matches.isEmpty()) {
    return TermsEnum.EMPTY;
  }
  final MultiTermsEnum merged = new MultiTermsEnum(subSlices);
  return merged.reset(matches.toArray(MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY));
}
 
Example #3
Source File: SortedDocValues.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Returns a {@link TermsEnum} over the values that is filtered by the supplied
 * {@link CompiledAutomaton}.
 * The enum supports {@link TermsEnum#ord()}.
 *
 * @param automaton compiled automaton; its {@code type} decides which enum is returned
 * @return the enum matching the automaton type
 */
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
  // Fetch the backing enum before dispatching on the automaton type.
  final TermsEnum allValues = termsEnum();
  switch (automaton.type) {
    case ALL:
      // Everything is accepted: no filtering wrapper needed.
      return allValues;
    case NONE:
      // Nothing is accepted.
      return TermsEnum.EMPTY;
    case NORMAL:
      return new AutomatonTermsEnum(allValues, automaton);
    case SINGLE:
      return new SingleTermsEnum(allValues, automaton.term);
    default:
      // unreachable
      throw new RuntimeException("unhandled case");
  }
}
 
Example #4
Source File: TestIntervals.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Checks the intervals produced by a multi-term source built from the regular
 * expression {@code p.*e}, then checks that the same expression with an
 * expansion limit of 1 fails with an {@link IllegalStateException}.
 */
public void testMultiTerm() throws IOException {
  final RegExp regexp = new RegExp("p.*e");
  final CompiledAutomaton compiled = new CompiledAutomaton(regexp.toAutomaton());
  final IntervalsSource source = Intervals.multiterm(compiled, regexp.toString());

  final int[][] expected = {
      {},
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 7, 7 },
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 0, 0 }
  };
  checkIntervals(source, "field1", 5, expected);

  // With a limit of one expansion, asking for intervals must fail.
  final IllegalStateException tooManyTerms = expectThrows(IllegalStateException.class, () -> {
    final CompiledAutomaton limitedAutomaton = new CompiledAutomaton(regexp.toAutomaton());
    final IntervalsSource limited = Intervals.multiterm(limitedAutomaton, 1, regexp.toString());
    for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
      limited.intervals("field1", leaf);
    }
  });
  assertEquals("Automaton [\\p(.)*\\e] expanded to too many terms (limit 1)", tooManyTerms.getMessage());

  checkVisits(source, 1);
}
 
Example #5
Source File: TestTermsEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that {@code Terms.intersect} rejects the compiled automaton built
 * from the literal regular expression {@code do_not_match_anything} with an
 * {@link IllegalArgumentException} pointing to
 * {@code CompiledAutomaton.getTermsEnum}.
 *
 * <p>The directory, writer and reader are managed with try-with-resources so
 * they are released even when an assertion fails; they are closed in the order
 * reader, writer, directory.
 */
public void testIntersectRegexp() throws Exception {
  try (Directory d = newDirectory();
       RandomIndexWriter w = new RandomIndexWriter(random(), d)) {
    Document doc = new Document();
    doc.add(newStringField("field", "foobar", Field.Store.NO));
    w.addDocument(doc);
    // The reader is opened only after the document has been added.
    try (IndexReader r = w.getReader()) {
      Terms terms = MultiTerms.getTerms(r, "field");
      CompiledAutomaton automaton = new CompiledAutomaton(new RegExp("do_not_match_anything").toAutomaton());
      String message = expectThrows(IllegalArgumentException.class, () -> {terms.intersect(automaton, null);}).getMessage();
      assertEquals("please use CompiledAutomaton.getTermsEnum instead", message);
    }
  }
}
 
Example #6
Source File: AutomatonTermsEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an enumerator driven by a compiled automaton, enumerating the
 * specified field through the supplied TermsEnum.
 *
 * @lucene.experimental 
 * @param tenum TermsEnum handed to the superclass constructor
 * @param compiled CompiledAutomaton; its type must be {@code NORMAL}
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) {
  super(tenum);
  if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }
  this.automaton = compiled.automaton;
  this.commonSuffixRef = compiled.commonSuffixRef;
  this.runAutomaton = compiled.runAutomaton;
  assert this.runAutomaton != null;
  this.finite = compiled.finite;

  if (finite) {
    // A finite language has no loops, so visited states need not be tracked.
    visited = null;
  } else {
    visited = new short[runAutomaton.getSize()];
  }
}
 
Example #7
Source File: Terms.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Returns a TermsEnum that iterates over all terms and
 *  documents that are accepted by the provided {@link
 *  CompiledAutomaton}.  If the <code>startTerm</code> is
 *  provided then the returned enum will only return terms
 *  {@code > startTerm}, but you still must call
 *  next() first to get to the first term.  Note that the
 *  provided <code>startTerm</code> must be accepted by
 *  the automaton.
 *
 *  <p>This is an expert low-level API and will only work
 *  for {@code NORMAL} compiled automata.  To handle any
 *  compiled automata you should use
 *  {@link CompiledAutomaton#getTermsEnum} instead.
 *
 *  <p><b>NOTE</b>: the returned TermsEnum cannot seek.
 *
 *  @param compiled the compiled automaton; its type must be {@code NORMAL}
 *  @param startTerm optional term after which enumeration starts, or {@code null}
 *  @return an enum over the terms accepted by the automaton
 *  @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 *  @throws IOException if obtaining the underlying iterator fails
 */
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException {
  
  // TODO: could we factor out a common interface b/w
  // CompiledAutomaton and FST?  Then we could pass FST there too,
  // and likely speed up resolving terms to deleted docs ... but
  // AutomatonTermsEnum makes this tricky because of its on-the-fly cycle
  // detection
  
  // TODO: eventually we could support seekCeil/Exact on
  // the returned enum, instead of only being able to seek
  // at the start

  // Validate the argument before doing any work, so an unsupported automaton
  // never causes an iterator to be created just to be thrown away.
  if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }

  TermsEnum termsEnum = iterator();

  if (startTerm == null) {
    return new AutomatonTermsEnum(termsEnum, compiled);
  } else {
    return new AutomatonTermsEnum(termsEnum, compiled) {
      @Override
      protected BytesRef nextSeekTerm(BytesRef term) throws IOException {
        // A null term is replaced by startTerm before delegating.
        if (term == null) {
          term = startTerm;
        }
        return super.nextSeekTerm(term);
      }
    };
  }
}
 
Example #8
Source File: TermInSetQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds one binary automaton per term in {@code termData}, unions them, and
 * returns the run automaton of the compiled union.
 */
private ByteRunAutomaton asByteRunAutomaton() {
  final TermIterator terms = termData.iterator();
  final List<Automaton> perTerm = new ArrayList<>();
  BytesRef current;
  // The iterator signals exhaustion by returning null.
  while ((current = terms.next()) != null) {
    perTerm.add(Automata.makeBinary(current));
  }
  final Automaton union = Operations.union(perTerm);
  return new CompiledAutomaton(union).runAutomaton;
}
 
Example #9
Source File: FuzzyAutomatonBuilder.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Compiles the automaton that {@code levBuilder} produces for {@code maxEdits}
 * and {@code prefix}.
 *
 * @return the compiled automaton
 * @throws FuzzyTermsEnum.FuzzyTermsException wrapping a
 *         {@link TooComplexToDeterminizeException} raised while building it
 */
CompiledAutomaton buildMaxEditAutomaton() {
  final CompiledAutomaton maxEditAutomaton;
  try {
    maxEditAutomaton = new CompiledAutomaton(levBuilder.toAutomaton(maxEdits, prefix), true, false);
  } catch (TooComplexToDeterminizeException tooComplex) {
    throw new FuzzyTermsEnum.FuzzyTermsException(term, tooComplex);
  }
  return maxEditAutomaton;
}
 
Example #10
Source File: FuzzyAutomatonBuilder.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Compiles one automaton per edit count from 0 up to and including
 * {@code maxEdits}; the array index equals the edit count.
 *
 * @return the compiled automata, {@code maxEdits + 1} entries
 * @throws FuzzyTermsEnum.FuzzyTermsException wrapping a
 *         {@link TooComplexToDeterminizeException} raised while building any entry
 */
CompiledAutomaton[] buildAutomatonSet() {
  final CompiledAutomaton[] byEdits = new CompiledAutomaton[maxEdits + 1];
  try {
    for (int edits = 0; edits < byEdits.length; edits++) {
      byEdits[edits] = new CompiledAutomaton(levBuilder.toAutomaton(edits, prefix), true, false);
    }
  } catch (TooComplexToDeterminizeException tooComplex) {
    throw new FuzzyTermsEnum.FuzzyTermsException(term, tooComplex);
  }
  return byEdits;
}
 
Example #11
Source File: FieldReader.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns an {@code IntersectTermsEnum} over this reader for a {@code NORMAL}
 * compiled automaton.
 *
 * @param compiled compiled automaton; its type must be {@code NORMAL}
 * @param startTerm start term forwarded to the enum
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // TODO: we could push "it's a range" or "it's a prefix" down into IntersectTermsEnum?
  // can we optimize knowing that...?
  final boolean isNormal = compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL;
  if (isNormal) {
    return new IntersectTermsEnum(this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
 
Example #12
Source File: LuceneTestCase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Terms api equivalency: asserts that two {@link Terms} agree on statistics,
 * on the offsets/positions/payloads flags, on full enumeration and on seeking.
 * When {@code deep} is set, also compares their intersections with random
 * regular-expression automata.
 *
 * @param info message passed to the individual assertions
 * @param leftReader reader forwarded to the enum comparison
 * @param leftTerms first terms instance, may be {@code null}
 * @param rightTerms second terms instance, may be {@code null}
 * @param deep whether to run the random intersection checks
 */
public void assertTermsEquals(String info, IndexReader leftReader, Terms leftTerms, Terms rightTerms, boolean deep) throws IOException {
  final boolean eitherMissing = leftTerms == null || rightTerms == null;
  if (eitherMissing) {
    // If one side is absent, both are required to be absent.
    assertNull(info, leftTerms);
    assertNull(info, rightTerms);
    return;
  }
  assertTermsStatisticsEquals(info, leftTerms, rightTerms);
  assertEquals("hasOffsets", leftTerms.hasOffsets(), rightTerms.hasOffsets());
  assertEquals("hasPositions", leftTerms.hasPositions(), rightTerms.hasPositions());
  assertEquals("hasPayloads", leftTerms.hasPayloads(), rightTerms.hasPayloads());

  final TermsEnum leftAll = leftTerms.iterator();
  final TermsEnum rightAll = rightTerms.iterator();
  assertTermsEnumEquals(info, leftReader, leftAll, rightAll, true);

  assertTermsSeekingEquals(info, leftTerms, rightTerms);

  if (!deep) {
    return;
  }

  final int rounds = atLeast(3);
  for (int round = 0; round < rounds; round++) {
    final String pattern = AutomatonTestUtil.randomRegexp(random());
    final CompiledAutomaton compiled = new CompiledAutomaton(new RegExp(pattern, RegExp.NONE).toAutomaton());
    if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
      // Only NORMAL automata are intersected here.
      continue;
    }
    // TODO: test start term too
    final TermsEnum leftMatches = leftTerms.intersect(compiled, null);
    final TermsEnum rightMatches = rightTerms.intersect(compiled, null);
    assertTermsEnumEquals(info, leftReader, leftMatches, rightMatches, rarely());
  }
}
 
Example #13
Source File: TestTermsEnum.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Steps the run automaton of {@code c} over the bytes of {@code b}, starting
 * from state 0, and reports whether the final state is accepting. Asserts
 * before each step that the current state is not -1.
 */
private boolean accepts(CompiledAutomaton c, BytesRef b) {
  final int end = b.offset + b.length;
  int state = 0;
  for (int pos = b.offset; pos < end; pos++) {
    assertTrue(state != -1);
    // Mask to feed the byte as an unsigned value.
    final int label = b.bytes[pos] & 0xff;
    state = c.runAutomaton.step(state, label);
  }
  return c.runAutomaton.isAccept(state);
}
 
Example #14
Source File: AssertingLeafReader.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the wrapped instance, asserts that the result is non-null and
 * that {@code bytes} (when given) is valid, and wraps the result in an
 * {@code AssertingTermsEnum}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws IOException {
  final TermsEnum delegateEnum = in.intersect(automaton, bytes);
  assert delegateEnum != null;
  assert bytes == null || bytes.isValid();
  final boolean withFreqs = hasFreqs();
  return new AssertingTermsEnum(delegateEnum, withFreqs);
}
 
Example #15
Source File: FSTTermsReader.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns an {@code IntersectTermsEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new IntersectTermsEnum(compiled, startTerm);
  }
  // Every other automaton type is rejected.
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
 
Example #16
Source File: TestBlockPostingsFormat3.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that two {@link Terms} agree on statistics, full enumeration and
 * seeking. When {@code deep} is set, also compares their intersections with
 * random regular-expression automata.
 *
 * @param leftTerms first terms instance, may be {@code null}
 * @param rightTerms second terms instance, may be {@code null}
 * @param deep whether to run the random intersection checks
 */
public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception {
  final boolean bothPresent = leftTerms != null && rightTerms != null;
  if (!bothPresent) {
    // If one side is absent, both are required to be absent.
    assertNull(leftTerms);
    assertNull(rightTerms);
    return;
  }
  assertTermsStatistics(leftTerms, rightTerms);

  // NOTE: hasOffsets/hasPositions/hasPayloads are not asserted because they are allowed to differ

  final boolean freqsOnBoth = leftTerms.hasFreqs() && rightTerms.hasFreqs();
  final boolean positionsOnBoth = leftTerms.hasPositions() && rightTerms.hasPositions();
  final TermsEnum leftAll = leftTerms.iterator();
  final TermsEnum rightAll = rightTerms.iterator();
  assertTermsEnum(leftAll, rightAll, true, freqsOnBoth, positionsOnBoth);

  assertTermsSeeking(leftTerms, rightTerms);

  if (!deep) {
    return;
  }

  final int rounds = atLeast(3);
  for (int round = 0; round < rounds; round++) {
    final String pattern = AutomatonTestUtil.randomRegexp(random());
    final CompiledAutomaton compiled = new CompiledAutomaton(new RegExp(pattern, RegExp.NONE).toAutomaton());
    if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
      // Only NORMAL automata are intersected here.
      continue;
    }
    // TODO: test start term too
    final TermsEnum leftMatches = leftTerms.intersect(compiled, null);
    final TermsEnum rightMatches = rightTerms.intersect(compiled, null);
    assertTermsEnum(leftMatches, rightMatches, rarely(), freqsOnBoth, positionsOnBoth);
  }
}
 
Example #17
Source File: FSTTermsReader.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Initialises the enum from the enclosing dictionary FST and the run automaton
 * of {@code compiled}, builds the initial frame stack, and sets the
 * {@code pending} flag according to {@code startTerm}.
 *
 * @param compiled compiled automaton; only its {@code runAutomaton} is read here
 * @param startTerm term to seek to via {@code doSeekCeil}, or {@code null} for no seek
 * @throws IOException declared by the constructor; presumably raised by the
 *         frame-loading or seek helpers (not visible here)
 */
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  super();
  //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
  this.fst = dict;
  this.fstReader = fst.getBytesReader();
  this.fstOutputs = dict.outputs;
  this.fsa = compiled.runAutomaton;
  // Starts at -1 and is incremented once below, after the virtual frame is loaded.
  this.level = -1;
  // Pre-populate all 16 stack slots with Frame instances.
  this.stack = new Frame[16];
  for (int i = 0 ; i < stack.length; i++) {
    this.stack[i] = new Frame();
  }

  // Order matters: virtual frame first, then bump the level, then push the first frame.
  // NOTE(review): newFrame() presumably picks a slot based on the current level — confirm.
  loadVirtualFrame(newFrame());
  this.level++;
  pushFrame(loadFirstFrame(newFrame()));

  this.meta = null;
  this.metaUpto = 1;
  this.decoded = false;
  this.pending = false;

  if (startTerm == null) {
    // No start term: pending reflects whether the top frame is accepted.
    pending = isAccept(topFrame());
  } else {
    doSeekCeil(startTerm);
    // Pending only if the seek did not land exactly on startTerm and the top
    // frame is both valid and accepted.
    pending = (term == null || !startTerm.equals(term.get())) && isValid(topFrame()) && isAccept(topFrame());
  }
}
 
Example #18
Source File: BlockTreeTermsReader.java    From incubator-retired-blur with Apache License 2.0 5 votes vote down vote up
/**
 * Returns an {@code IntersectEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final boolean supported = compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL;
  if (!supported) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }
  final TermsEnum intersection = new IntersectEnum(compiled, startTerm);
  return intersection;
}
 
Example #19
Source File: DirectPostingsFormat.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a {@code DirectIntersectTermsEnum} for a {@code NORMAL} compiled
 * automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) {
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new DirectIntersectTermsEnum(compiled, startTerm);
  }
  // Every other automaton type is rejected.
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
 
Example #20
Source File: IntersectBlockReader.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a block reader that intersects terms with a compiled automaton.
 * The first argument group is forwarded unchanged to the superclass; the
 * automaton-related state is then copied out of {@code compiled}.
 *
 * @param compiled source of the automaton, run automaton, finite flag and common suffix
 * @param startTerm stored as the initial seek term
 */
protected IntersectBlockReader(CompiledAutomaton compiled, BytesRef startTerm,
                               IndexDictionary.BrowserSupplier dictionaryBrowserSupplier, IndexInput blockInput,
                               PostingsReaderBase postingsReader, FieldMetadata fieldMetadata,
                               BlockDecoder blockDecoder) throws IOException {
  super(dictionaryBrowserSupplier, blockInput, postingsReader, fieldMetadata, blockDecoder);

  // Copy the automaton state; the original assignment order is kept.
  this.automaton = compiled.automaton;
  this.runAutomaton = compiled.runAutomaton;
  this.finite = compiled.finite;
  this.commonSuffix = compiled.commonSuffixRef;

  // Computed after the fields above are assigned.
  this.minTermLength = getMinTermLength();
  this.nextStringCalculator = new AutomatonNextTermCalculator(compiled);
  this.seekTerm = startTerm;
}
 
Example #21
Source File: MultiTermIntervalsSource.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
MultiTermIntervalsSource(CompiledAutomaton automaton, int maxExpansions, String pattern) {
  this.automaton = automaton;
  if (maxExpansions > IndexSearcher.getMaxClauseCount()) {
    throw new IllegalArgumentException("maxExpansions [" + maxExpansions
        + "] cannot be greater than BooleanQuery.getMaxClauseCount [" + IndexSearcher.getMaxClauseCount() + "]");
  }
  this.maxExpansions = maxExpansions;
  this.pattern = pattern;
}
 
Example #22
Source File: AssertingLeafReader.java    From crate with Apache License 2.0 5 votes vote down vote up
/**
 * Forwards to the wrapped instance, asserts the result is non-null and that
 * {@code bytes} (when given) is valid, then wraps the result in an
 * {@code AssertingTermsEnum}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws IOException {
    final TermsEnum wrapped = in.intersect(automaton, bytes);
    assert wrapped != null;
    assert bytes == null || bytes.isValid();
    final boolean withFreqs = hasFreqs();
    return new AssertingTermsEnum(wrapped, withFreqs);
}
 
Example #23
Source File: OrdsFieldReader.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns an {@code OrdsIntersectTermsEnum} over this reader for a
 * {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final boolean isNormal = compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL;
  if (isNormal) {
    return new OrdsIntersectTermsEnum(this, compiled, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
 
Example #24
Source File: SecureAtomicReader.java    From incubator-retired-blur with Apache License 2.0 4 votes vote down vote up
/**
 * Intersects via the wrapped instance and wraps the result in a
 * {@code SecureTermsEnum} together with the access-control reader and max doc.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final TermsEnum unsecured = in.intersect(compiled, startTerm);
  return new SecureTermsEnum(unsecured, _accessControlReader, _maxDoc);
}
 
Example #25
Source File: IncludeExclude.java    From Elasticsearch with Apache License 2.0 4 votes vote down vote up
private AutomatonBackedOrdinalsFilter(Automaton automaton) {
    this.compiled = new CompiledAutomaton(automaton);
}
 
Example #26
Source File: FilterSortedDocValues.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Pure delegation: returns whatever the wrapped instance's {@code intersect}
 * returns for the given automaton.
 */
@Override
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
  final TermsEnum delegated = in.intersect(automaton);
  return delegated;
}
 
Example #27
Source File: SortingLeafReader.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Intersects via the wrapped instance and wraps the result in a
 * {@code SortingTermsEnum} with this reader's doc map, index options and
 * positions flag.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm)
    throws IOException {
  // The delegate is intersected first; hasPositions() is evaluated afterwards.
  final TermsEnum unsorted = in.intersect(compiled, startTerm);
  final boolean withPositions = hasPositions();
  return new SortingTermsEnum(unsorted, docMap, indexOptions, withPositions);
}
 
Example #28
Source File: SecureAtomicReader.java    From incubator-retired-blur with Apache License 2.0 4 votes vote down vote up
/**
 * Intersects both the read mask and the wrapped instance with the same
 * arguments and combines the two enums in a {@code ReadMaskTermsEnum}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // The mask is intersected before the wrapped instance.
  final TermsEnum maskEnum = _readMask.intersect(compiled, startTerm);
  final TermsEnum dataEnum = in.intersect(compiled, startTerm);
  return new ReadMaskTermsEnum(maskEnum, dataEnum);
}
 
Example #29
Source File: BlockTreeTermsReader.java    From incubator-retired-blur with Apache License 2.0 4 votes vote down vote up
/**
 * Sets up the enum for intersecting with {@code compiled}: clones the
 * enclosing reader's input, allocates the frame stack and arcs, loads the root
 * block into the first frame, and optionally seeks to {@code startTerm}.
 *
 * @param compiled compiled automaton; its {@code runAutomaton} drives the frame state
 * @param startTerm term to seek to via {@code seekToStartTerm}, or {@code null} for no seek
 * @throws IOException declared by the constructor; presumably raised by the
 *         FST, frame-loading or seek calls (not visible here)
 */
public IntersectEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // if (DEBUG) {
  //   System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef));
  // }
  runAutomaton = compiled.runAutomaton;
  compiledAutomaton = compiled;
  // Work on a private clone of the enclosing reader's input.
  in = BlockTreeTermsReader.this.in.clone();
  // Pre-allocate five frames, each constructed with its own stack index.
  stack = new Frame[5];
  for(int idx=0;idx<stack.length;idx++) {
    stack[idx] = new Frame(idx);
  }
  // Fill every slot of the arcs array with a fresh arc.
  for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
    arcs[arcIdx] = new FST.Arc<BytesRef>();
  }

  if (index == null) {
    fstReader = null;
  } else {
    fstReader = index.getBytesReader();
  }

  // TODO: if the automaton is "smallish" we really
  // should use the terms index to seek at least to
  // the initial term and likely to subsequent terms
  // (or, maybe just fallback to ATE for such cases).
  // Else the seek cost of loading the frames will be
  // too costly.

  // NOTE(review): index is dereferenced here unconditionally although the
  // branch above allows it to be null — confirm whether a null index can
  // actually reach this constructor.
  final FST.Arc<BytesRef> arc = index.getFirstArc(arcs[0]);
  // Empty string prefix must have an output in the index!
  assert arc.isFinal();

  // Special pushFrame since it's the first one:
  final Frame f = stack[0];
  f.fp = f.fpOrig = rootBlockFP;
  f.prefix = 0;
  f.setState(runAutomaton.getInitialState());
  f.arc = arc;
  f.outputPrefix = arc.output;
  f.load(rootCode);

  // for assert:
  // The call sits inside an assert, so it only runs when assertions are enabled.
  assert setSavedStartTerm(startTerm);

  currentFrame = f;
  if (startTerm != null) {
    seekToStartTerm(startTerm);
  }
}
 
Example #30
Source File: ExitableReader.java    From incubator-retired-blur with Apache License 2.0 4 votes vote down vote up
/**
 * Intersects via the wrapped terms and wraps the result in an
 * {@code ExitableTermsEnum} together with the exit object.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final TermsEnum delegated = _terms.intersect(compiled, startTerm);
  return new ExitableTermsEnum(delegated, _exitObject);
}