Java Code Examples for org.apache.lucene.util.automaton.CompiledAutomaton

The following examples show how to use org.apache.lucene.util.automaton.CompiledAutomaton. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: lucene-solr   Source File: TestIntervals.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Exercises {@code Intervals.multiterm} with the regular expression {@code p.*e}:
 * checks the intervals reported for {@code field1}, verifies that an expansion
 * limit of 1 is rejected with an {@link IllegalStateException}, and finally
 * checks the visit count of the unlimited source.
 */
public void testMultiTerm() throws IOException {
  final RegExp regexp = new RegExp("p.*e");
  final CompiledAutomaton compiledPattern = new CompiledAutomaton(regexp.toAutomaton());
  final IntervalsSource source = Intervals.multiterm(compiledPattern, regexp.toString());

  // Expected rows handed to checkIntervals for "field1".
  final int[][] expected = {
      {},
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 7, 7 },
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 0, 0 }
  };
  checkIntervals(source, "field1", 5, expected);

  // Same pattern, but capped at a single expansion: asking for intervals must fail.
  final IllegalStateException failure = expectThrows(IllegalStateException.class, () -> {
    IntervalsSource limited = Intervals.multiterm(new CompiledAutomaton(regexp.toAutomaton()), 1, regexp.toString());
    for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
      limited.intervals("field1", leaf);
    }
  });
  assertEquals("Automaton [\\p(.)*\\e] expanded to too many terms (limit 1)", failure.getMessage());

  checkVisits(source, 1);
}
 
Example 2
Source Project: lucene-solr   Source File: SortedSetDocValues.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton}.
 * The enum supports {@link TermsEnum#ord()}.
 *
 * @param automaton compiled automaton whose {@code type} selects the kind of enum returned
 * @return {@link TermsEnum#EMPTY} for {@code NONE}, the unfiltered enum for {@code ALL},
 *         a {@code SingleTermsEnum} for {@code SINGLE}, an {@code AutomatonTermsEnum} for {@code NORMAL}
 */
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
  // The unfiltered enum is always obtained first, whatever the automaton type is.
  final TermsEnum allValues = termsEnum();
  final TermsEnum filtered;
  switch (automaton.type) {
    case NONE:
      filtered = TermsEnum.EMPTY;
      break;
    case ALL:
      filtered = allValues;
      break;
    case SINGLE:
      filtered = new SingleTermsEnum(allValues, automaton.term);
      break;
    case NORMAL:
      filtered = new AutomatonTermsEnum(allValues, automaton);
      break;
    default:
      // unreachable
      throw new RuntimeException("unhandled case");
  }
  return filtered;
}
 
Example 3
Source Project: lucene-solr   Source File: MultiTerms.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Intersects every sub with the compiled automaton and combines the non-null results.
 *
 * @return a {@code MultiTermsEnum} reset over the non-null sub enums, or
 *         {@link TermsEnum#EMPTY} when no sub produced an enum
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final List<MultiTermsEnum.TermsEnumIndex> matching = new ArrayList<>();
  for (int subIndex = 0; subIndex < subs.length; subIndex++) {
    final TermsEnum subEnum = subs[subIndex].intersect(compiled, startTerm);
    if (subEnum == null) {
      continue;
    }
    // Remember which sub the enum came from.
    matching.add(new MultiTermsEnum.TermsEnumIndex(subEnum, subIndex));
  }

  if (matching.isEmpty()) {
    return TermsEnum.EMPTY;
  }
  return new MultiTermsEnum(subSlices).reset(matching.toArray(MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY));
}
 
Example 4
Source Project: lucene-solr   Source File: SortedDocValues.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton}.
 * The enum supports {@link TermsEnum#ord()}.
 *
 * @param automaton compiled automaton; its {@code type} decides which enum is returned
 */
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
  final TermsEnum unfiltered = termsEnum();
  switch (automaton.type) {
    case NONE:
      return TermsEnum.EMPTY;
    case ALL:
      return unfiltered;
    case SINGLE:
      return new SingleTermsEnum(unfiltered, automaton.term);
    case NORMAL:
      return new AutomatonTermsEnum(unfiltered, automaton);
  }
  // unreachable: every automaton type listed above returns from the switch
  throw new RuntimeException("unhandled case");
}
 
Example 5
Source Project: lucene-solr   Source File: FSTTermsReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates an {@code IntersectTermsEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new IntersectTermsEnum(compiled, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
 
Example 6
Source Project: lucene-solr   Source File: FSTTermsReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Initializes the traversal state of this enum from the compiled automaton and,
 * when a start term is supplied, seeks relative to it.
 *
 * @param compiled  compiled automaton; only its {@code runAutomaton} is read here
 * @param startTerm term passed to {@code doSeekCeil}, or {@code null} to skip the seek
 * @throws IOException declared by this constructor (presumably raised by the FST
 *                     reads performed by the helpers called below; confirm)
 */
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  super();
  //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
  this.fst = dict;
  this.fstReader = fst.getBytesReader();
  this.fstOutputs = dict.outputs;
  this.fsa = compiled.runAutomaton;
  // level starts at -1 and is bumped to 0 after the virtual frame is loaded below.
  this.level = -1;
  // Pre-allocate 16 Frame instances up front.
  this.stack = new Frame[16];
  for (int i = 0 ; i < stack.length; i++) {
    this.stack[i] = new Frame();
  }

  // Load the virtual frame, advance the level, then push the first frame.
  loadVirtualFrame(newFrame());
  this.level++;
  pushFrame(loadFirstFrame(newFrame()));

  this.meta = null;
  this.metaUpto = 1;
  this.decoded = false;
  this.pending = false;

  if (startTerm == null) {
    // No start term: pending is true exactly when the top frame is accepted.
    pending = isAccept(topFrame());
  } else {
    doSeekCeil(startTerm);
    // After the seek, pending requires that the current term is absent or differs from
    // startTerm, and that the top frame is both valid and accepted.
    pending = (term == null || !startTerm.equals(term.get())) && isValid(topFrame()) && isAccept(topFrame());
  }
}
 
Example 7
Source Project: lucene-solr   Source File: DirectPostingsFormat.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@code DirectIntersectTermsEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) {
  final boolean isNormal = compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL;
  if (!isNormal) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }
  return new DirectIntersectTermsEnum(compiled, startTerm);
}
 
Example 8
Source Project: lucene-solr   Source File: OrdsFieldReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates an {@code OrdsIntersectTermsEnum} over this reader for a {@code NORMAL}
 * compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  switch (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL ? 1 : 0) {
    case 1:
      return new OrdsIntersectTermsEnum(this, compiled, startTerm);
    default:
      throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }
}
 
Example 9
Source Project: lucene-solr   Source File: IntersectBlockReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds a block reader that copies the automaton-related state out of
 * {@code compiled} and remembers {@code startTerm} as the seek term.
 * All other arguments are forwarded unchanged to the superclass constructor.
 *
 * @param compiled  source of {@code automaton}, {@code runAutomaton}, {@code finite}
 *                  and {@code commonSuffixRef}
 * @param startTerm value stored in {@code seekTerm}
 */
protected IntersectBlockReader(
    CompiledAutomaton compiled,
    BytesRef startTerm,
    IndexDictionary.BrowserSupplier dictionaryBrowserSupplier,
    IndexInput blockInput,
    PostingsReaderBase postingsReader,
    FieldMetadata fieldMetadata,
    BlockDecoder blockDecoder) throws IOException {
  super(dictionaryBrowserSupplier, blockInput, postingsReader, fieldMetadata, blockDecoder);

  // Plain copies of the compiled automaton's state.
  this.automaton = compiled.automaton;
  this.runAutomaton = compiled.runAutomaton;
  this.finite = compiled.finite;
  this.commonSuffix = compiled.commonSuffixRef;

  // Derived state: kept after the copies above, in the original order.
  this.minTermLength = getMinTermLength();
  this.nextStringCalculator = new AutomatonNextTermCalculator(compiled);

  this.seekTerm = startTerm;
}
 
Example 10
Source Project: lucene-solr   Source File: MultiTermIntervalsSource.java    License: Apache License 2.0 5 votes vote down vote up
MultiTermIntervalsSource(CompiledAutomaton automaton, int maxExpansions, String pattern) {
  this.automaton = automaton;
  if (maxExpansions > IndexSearcher.getMaxClauseCount()) {
    throw new IllegalArgumentException("maxExpansions [" + maxExpansions
        + "] cannot be greater than BooleanQuery.getMaxClauseCount [" + IndexSearcher.getMaxClauseCount() + "]");
  }
  this.maxExpansions = maxExpansions;
  this.pattern = pattern;
}
 
Example 11
Source Project: lucene-solr   Source File: AssertingLeafReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the wrapped instance, asserts that the result is non-null and that a
 * non-null {@code bytes} is valid, then wraps the result in an {@code AssertingTermsEnum}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws IOException {
  final TermsEnum delegate = in.intersect(automaton, bytes);
  assert delegate != null;
  if (bytes != null) {
    assert bytes.isValid();
  }
  final boolean withFreqs = hasFreqs();
  return new AssertingTermsEnum(delegate, withFreqs);
}
 
Example 12
Source Project: lucene-solr   Source File: LuceneTestCase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Terms api equivalency: asserts that two {@code Terms} instances agree on statistics,
 * offsets/positions/payloads flags, full enumeration and seeking. When {@code deep} is
 * set, also compares intersections against randomly generated regular expressions.
 *
 * @param info       message passed on to the individual assertions
 * @param leftReader reader passed on to the enum comparisons
 * @param leftTerms  first terms instance; may be {@code null}
 * @param rightTerms second terms instance; may be {@code null}
 * @param deep       whether to also run the random intersection checks
 */
public void assertTermsEquals(String info, IndexReader leftReader, Terms leftTerms, Terms rightTerms, boolean deep) throws IOException {
  // If either side is missing, both must be missing.
  if (leftTerms == null || rightTerms == null) {
    assertNull(info, leftTerms);
    assertNull(info, rightTerms);
    return;
  }

  assertTermsStatisticsEquals(info, leftTerms, rightTerms);
  assertEquals("hasOffsets", leftTerms.hasOffsets(), rightTerms.hasOffsets());
  assertEquals("hasPositions", leftTerms.hasPositions(), rightTerms.hasPositions());
  assertEquals("hasPayloads", leftTerms.hasPayloads(), rightTerms.hasPayloads());

  assertTermsEnumEquals(info, leftReader, leftTerms.iterator(), rightTerms.iterator(), true);

  assertTermsSeekingEquals(info, leftTerms, rightTerms);

  if (!deep) {
    return;
  }

  final int rounds = atLeast(3);
  for (int round = 0; round < rounds; round++) {
    final String regexp = AutomatonTestUtil.randomRegexp(random());
    final CompiledAutomaton compiled = new CompiledAutomaton(new RegExp(regexp, RegExp.NONE).toAutomaton());
    // Terms.intersect is only exercised for NORMAL automata.
    if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
      continue;
    }
    // TODO: test start term too
    final TermsEnum leftMatches = leftTerms.intersect(compiled, null);
    final TermsEnum rightMatches = rightTerms.intersect(compiled, null);
    assertTermsEnumEquals(info, leftReader, leftMatches, rightMatches, rarely());
  }
}
 
Example 13
Source Project: lucene-solr   Source File: FieldReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates an {@code IntersectTermsEnum} over this reader from the automaton, run automaton
 * and common suffix of a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // TODO: we could push "it's a range" or "it's a prefix" down into IntersectTermsEnum?
  // can we optimize knowing that...?
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new IntersectTermsEnum(
        this,
        compiled.automaton,
        compiled.runAutomaton,
        compiled.commonSuffixRef,
        startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
 
Example 14
Source Project: lucene-solr   Source File: FuzzyAutomatonBuilder.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds one compiled automaton per edit distance from 0 up to and including
 * {@code maxEdits}; the array index is the edit distance passed to {@code levBuilder}.
 *
 * @return array of {@code maxEdits + 1} compiled automata
 * @throws FuzzyTermsEnum.FuzzyTermsException wrapping a {@code TooComplexToDeterminizeException}
 */
CompiledAutomaton[] buildAutomatonSet() {
  final int count = maxEdits + 1;
  final CompiledAutomaton[] automata = new CompiledAutomaton[count];
  try {
    for (int edits = 0; edits < count; edits++) {
      automata[edits] = new CompiledAutomaton(levBuilder.toAutomaton(edits, prefix), true, false);
    }
  } catch (TooComplexToDeterminizeException e) {
    // The first failure aborts the whole build.
    throw new FuzzyTermsEnum.FuzzyTermsException(term, e);
  }
  return automata;
}
 
Example 15
Source Project: lucene-solr   Source File: FuzzyAutomatonBuilder.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the compiled automaton for the edit distance {@code maxEdits} only.
 *
 * @throws FuzzyTermsEnum.FuzzyTermsException wrapping a {@code TooComplexToDeterminizeException}
 */
CompiledAutomaton buildMaxEditAutomaton() {
  final CompiledAutomaton maxEditAutomaton;
  try {
    maxEditAutomaton = new CompiledAutomaton(levBuilder.toAutomaton(maxEdits, prefix), true, false);
  } catch (TooComplexToDeterminizeException e) {
    throw new FuzzyTermsEnum.FuzzyTermsException(term, e);
  }
  return maxEditAutomaton;
}
 
Example 16
Source Project: lucene-solr   Source File: TermInSetQuery.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds a binary automaton for every term in {@code termData}, unions them, compiles
 * the union and returns the resulting run automaton.
 */
private ByteRunAutomaton asByteRunAutomaton() {
  final List<Automaton> perTerm = new ArrayList<>();
  final TermIterator terms = termData.iterator();
  BytesRef current;
  // The iterator signals exhaustion by returning null.
  while ((current = terms.next()) != null) {
    perTerm.add(Automata.makeBinary(current));
  }
  final CompiledAutomaton compiledUnion = new CompiledAutomaton(Operations.union(perTerm));
  return compiledUnion.runAutomaton;
}
 
Example 17
Source Project: lucene-solr   Source File: AutomatonTermsEnum.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Construct an enumerator based upon an automaton, enumerating the specified
 * field, working on a supplied TermsEnum
 *
 * @param tenum TermsEnum handed to the superclass constructor
 * @param compiled CompiledAutomaton
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 * @lucene.experimental
 */
public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) {
  super(tenum);
  if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }

  this.automaton = compiled.automaton;
  this.commonSuffixRef = compiled.commonSuffixRef;
  this.runAutomaton = compiled.runAutomaton;
  assert this.runAutomaton != null;
  this.finite = compiled.finite;

  // No need to track visited states for a finite language without loops.
  if (finite) {
    visited = null;
  } else {
    visited = new short[runAutomaton.getSize()];
  }
}
 
Example 18
Source Project: lucene-solr   Source File: Terms.java    License: Apache License 2.0 5 votes vote down vote up
/** Returns a TermsEnum that iterates over all terms and
 *  documents that are accepted by the provided {@link
 *  CompiledAutomaton}.  If the <code>startTerm</code> is
 *  provided then the returned enum will only return terms
 *  {@code > startTerm}, but you still must call
 *  next() first to get to the first term.  Note that the
 *  provided <code>startTerm</code> must be accepted by
 *  the automaton.
 *
 *  <p>This is an expert low-level API and will only work
 *  for {@code NORMAL} compiled automata.  To handle any
 *  compiled automata you should use
 *  {@link CompiledAutomaton#getTermsEnum} instead.
 *
 *  <p><b>NOTE</b>: the returned TermsEnum cannot seek.
 *
 *  @param compiled compiled automaton; its {@code type} must be {@code NORMAL}
 *  @param startTerm term after which enumeration starts, or {@code null} to start
 *         from the beginning
 *  @return an {@code AutomatonTermsEnum} over {@link #iterator()}
 *  @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 *  @throws IOException if obtaining the underlying iterator fails
 */
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException {

  // TODO: could we factor out a common interface b/w
  // CompiledAutomaton and FST?  Then we could pass FST there too,
  // and likely speed up resolving terms to deleted docs ... but
  // AutomatonTermsEnum makes this tricky because of its on-the-fly cycle
  // detection

  // TODO: eventually we could support seekCeil/Exact on
  // the returned enum, instead of only being able to seek
  // at the start

  // Validate the argument before doing any work, so an unsupported automaton type
  // always fails fast with IllegalArgumentException and never pays for an iterator.
  if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }

  TermsEnum termsEnum = iterator();

  if (startTerm == null) {
    return new AutomatonTermsEnum(termsEnum, compiled);
  } else {
    return new AutomatonTermsEnum(termsEnum, compiled) {
      @Override
      protected BytesRef nextSeekTerm(BytesRef term) throws IOException {
        // The very first seek (term == null) begins at the caller's start term.
        if (term == null) {
          term = startTerm;
        }
        return super.nextSeekTerm(term);
      }
    };
  }
}
 
Example 19
Source Project: lucene-solr   Source File: TestTermsEnum.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Steps the run automaton of {@code c} over the bytes of {@code b}, starting in state 0,
 * and reports whether the final state is accepted. Asserts before each step that the
 * current state is not -1.
 */
private boolean accepts(CompiledAutomaton c, BytesRef b) {
  final int end = b.offset + b.length;
  int state = 0;
  for (int pos = b.offset; pos < end; pos++) {
    assertTrue(state != -1);
    // Bytes are stepped as unsigned values.
    final int label = b.bytes[pos] & 0xff;
    state = c.runAutomaton.step(state, label);
  }
  return c.runAutomaton.isAccept(state);
}
 
Example 20
Source Project: lucene-solr   Source File: TestTermsEnum.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Indexes a single document with the term {@code foobar}, then checks that
 * {@code Terms.intersect} rejects the automaton compiled from the regular expression
 * {@code do_not_match_anything} with the expected {@link IllegalArgumentException} message.
 */
public void testIntersectRegexp() throws Exception {
  final Directory dir = newDirectory();
  final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);

  final Document document = new Document();
  document.add(newStringField("field", "foobar", Field.Store.NO));
  writer.addDocument(document);

  final IndexReader reader = writer.getReader();
  final Terms fieldTerms = MultiTerms.getTerms(reader, "field");

  final CompiledAutomaton compiled = new CompiledAutomaton(new RegExp("do_not_match_anything").toAutomaton());
  final IllegalArgumentException failure = expectThrows(IllegalArgumentException.class, () -> {
    fieldTerms.intersect(compiled, null);
  });
  final String message = failure.getMessage();
  assertEquals("please use CompiledAutomaton.getTermsEnum instead", message);

  reader.close();
  writer.close();
  dir.close();
}
 
Example 21
Source Project: lucene-solr   Source File: TestBlockPostingsFormat3.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that two {@code Terms} instances agree on statistics, full enumeration and
 * seeking. When {@code deep} is set, also compares intersections against randomly
 * generated regular expressions.
 *
 * @param leftTerms  first terms instance; may be {@code null}
 * @param rightTerms second terms instance; may be {@code null}
 * @param deep       whether to also run the random intersection checks
 */
public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception {
  // If either side is missing, both must be missing.
  if (leftTerms == null || rightTerms == null) {
    assertNull(leftTerms);
    assertNull(rightTerms);
    return;
  }
  assertTermsStatistics(leftTerms, rightTerms);

  // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different

  final boolean bothHaveFreqs = leftTerms.hasFreqs() && rightTerms.hasFreqs();
  final boolean bothHavePositions = leftTerms.hasPositions() && rightTerms.hasPositions();
  assertTermsEnum(leftTerms.iterator(), rightTerms.iterator(), true, bothHaveFreqs, bothHavePositions);

  assertTermsSeeking(leftTerms, rightTerms);

  if (!deep) {
    return;
  }

  final int rounds = atLeast(3);
  for (int round = 0; round < rounds; round++) {
    final String regexp = AutomatonTestUtil.randomRegexp(random());
    final CompiledAutomaton compiled = new CompiledAutomaton(new RegExp(regexp, RegExp.NONE).toAutomaton());
    // Terms.intersect is only exercised for NORMAL automata.
    if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
      continue;
    }
    // TODO: test start term too
    final TermsEnum leftMatches = leftTerms.intersect(compiled, null);
    final TermsEnum rightMatches = rightTerms.intersect(compiled, null);
    assertTermsEnum(leftMatches, rightMatches, rarely(), bothHaveFreqs, bothHavePositions);
  }
}
 
Example 22
/**
 * Creates an {@code IntersectEnum} for a {@code NORMAL} compiled automaton.
 *
 * @throws IllegalArgumentException if {@code compiled.type} is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final boolean supported = compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL;
  if (supported) {
    return new IntersectEnum(compiled, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
 
Example 23
Source Project: crate   Source File: AssertingLeafReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the wrapped instance, asserts that the result is non-null and that a
 * non-null {@code bytes} is valid, then wraps the result in an {@code AssertingTermsEnum}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws IOException {
    final TermsEnum wrapped = in.intersect(automaton, bytes);
    assert wrapped != null;
    if (bytes != null) {
        assert bytes.isValid();
    }
    return new AssertingTermsEnum(wrapped, hasFreqs());
}
 
Example 24
Source Project: Elasticsearch   Source File: IncludeExclude.java    License: Apache License 2.0 4 votes vote down vote up
private AutomatonBackedOrdinalsFilter(Automaton automaton) {
    this.compiled = new CompiledAutomaton(automaton);
}
 
Example 25
Source Project: lucene-solr   Source File: TermVectorFilteredLeafReader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Pairs a full iterator over the wrapped terms with the intersection computed on
 * {@code filterTerms}, combining both in a {@code TermVectorFilteredTermsEnum}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // The iterator is obtained before the intersection.
  final TermsEnum allTerms = in.iterator();
  final TermsEnum filterMatches = filterTerms.intersect(compiled, startTerm);
  return new TermVectorFilteredTermsEnum(allTerms, filterMatches);
}
 
Example 26
Source Project: lucene-solr   Source File: OrdsIntersectTermsEnum.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Sets up an intersecting enum over the given field reader: pre-allocates the frame
 * stack and FST arcs, loads the root block into the first frame, and seeks to
 * {@code startTerm} when one is given.
 *
 * @param fr        field reader whose index, root block pointer and root code are read here
 * @param compiled  compiled automaton; kept as a whole and also read for its {@code runAutomaton}
 * @param startTerm term passed to {@code seekToStartTerm}, or {@code null} to skip the seek
 * @throws IOException declared by this constructor (presumably raised by the block and
 *                     index reads below; confirm)
 */
public OrdsIntersectTermsEnum(OrdsFieldReader fr, CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // if (DEBUG) {
  //   System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef));
  // }
  this.fr = fr;
  runAutomaton = compiled.runAutomaton;
  compiledAutomaton = compiled;
  // Work on a clone of the parent's input rather than on the parent's input itself.
  in = fr.parent.in.clone();
  // Pre-allocate five frames, each knowing its own index in the stack.
  stack = new OrdsIntersectTermsEnumFrame[5];
  for(int idx=0;idx<stack.length;idx++) {
    stack[idx] = new OrdsIntersectTermsEnumFrame(this, idx);
  }
  // Fill every slot of the arcs array with a fresh arc.
  for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
    arcs[arcIdx] = new FST.Arc<>();
  }

  if (fr.index == null) {
    fstReader = null;
  } else {
    fstReader = fr.index.getBytesReader();
  }

  // TODO: if the automaton is "smallish" we really
  // should use the terms index to seek at least to
  // the initial term and likely to subsequent terms
  // (or, maybe just fallback to ATE for such cases).
  // Else the seek cost of loading the frames will be
  // too costly.

  // NOTE(review): fr.index is dereferenced here without a null check even though the
  // branch above handles fr.index == null; confirm whether a null index can reach this line.
  final FST.Arc<Output> arc = fr.index.getFirstArc(arcs[0]);
  // Empty string prefix must have an output in the index!
  assert arc.isFinal();

  // Special pushFrame since it's the first one:
  final OrdsIntersectTermsEnumFrame f = stack[0];
  f.fp = f.fpOrig = fr.rootBlockFP;
  f.prefix = 0;
  f.setState(0);
  f.arc = arc;
  f.outputPrefix = arc.output();
  f.load(fr.rootCode);

  // for assert:
  assert setSavedStartTerm(startTerm);

  currentFrame = f;
  if (startTerm != null) {
    seekToStartTerm(startTerm);
  }
}
 
Example 27
Source Project: lucene-solr   Source File: BloomFilteringPostingsFormat.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Forwards the intersection to {@code delegateTerms} and returns its result unchanged.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException {
  final TermsEnum delegated = delegateTerms.intersect(compiled, startTerm);
  return delegated;
}
 
Example 28
Source Project: lucene-solr   Source File: IntersectBlockReader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Allocates the {@code visited} array unless the compiled automaton is finite, in which
 * case {@code visited} is left {@code null}.
 *
 * @param compiled compiled automaton; only its {@code finite} flag is read here
 */
protected AutomatonNextTermCalculator(CompiledAutomaton compiled) {
  if (compiled.finite) {
    visited = null;
  } else {
    // runAutomaton is not a parameter; it is resolved outside this constructor.
    visited = new short[runAutomaton.getSize()];
  }
}
 
Example 29
Source Project: lucene-solr   Source File: UniformSplitTerms.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Validates the automaton type via {@code checkIntersectAutomatonType}, then creates an
 * {@code IntersectBlockReader} from the compiled automaton, the start term and this
 * instance's dictionary supplier, block input, postings reader, field metadata and
 * block decoder.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  checkIntersectAutomatonType(compiled);
  return new IntersectBlockReader(
      compiled,
      startTerm,
      dictionaryBrowserSupplier,
      blockInput,
      postingsReader,
      fieldMetadata,
      blockDecoder);
}
 
Example 30
Source Project: lucene-solr   Source File: UniformSplitTerms.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Ensures the compiled automaton can be used for an intersection.
 *
 * @param automaton compiled automaton to check
 * @throws IllegalArgumentException if {@code automaton.type} is not {@code NORMAL}
 */
protected void checkIntersectAutomatonType(CompiledAutomaton automaton) {
  // This check is consistent with other impls and precondition stated in javadoc.
  if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return;
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}