org.apache.lucene.util.automaton.Operations Java Examples

The following examples show how to use org.apache.lucene.util.automaton.Operations. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: XAnalyzingSuggester.java    From Elasticsearch with Apache License 2.0 6 votes vote down vote up
/**
 * Runs {@code key} through the query analyzer, converts the resulting token
 * stream into an automaton, rewrites its separators via {@code replaceSep}
 * and returns the determinized result.
 *
 * @param key the text to look up
 * @return the determinized lookup automaton
 * @throws IOException if the token stream cannot be consumed
 */
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    final Automaton fromTokens;
    try (TokenStream tokens = queryAnalyzer.tokenStream("", key.toString())) {
        fromTokens = getTokenStreamToAutomaton().toAutomaton(tokens);
    }

    // TODO: we can optimize this somewhat by determinizing while we convert
    final Automaton withSeparators = replaceSep(fromTokens);

    // This automaton should not blow up during determinize, hence no state limit:
    return Operations.determinize(withSeparators, Integer.MAX_VALUE);
}
 
Example #2
Source File: IncludeExclude.java    From Elasticsearch with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the automaton of accepted values: the include set minus the exclude set.
 * <p>
 * The include side is taken from {@code include} if set, otherwise from
 * {@code includeValues}, otherwise it accepts any string. The exclude side is
 * taken from {@code exclude} if set, otherwise from {@code excludeValues};
 * when neither is set nothing is subtracted.
 */
private Automaton toAutomaton() {
    final Automaton accepted;
    if (include != null) {
        accepted = include.toAutomaton();
    } else if (includeValues != null) {
        accepted = Automata.makeStringUnion(includeValues);
    } else {
        accepted = Automata.makeAnyString();
    }

    if (exclude != null) {
        return Operations.minus(accepted, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    }
    if (excludeValues != null) {
        return Operations.minus(accepted, Automata.makeStringUnion(excludeValues),
                Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    }
    return accepted;
}
 
Example #3
Source File: ContextMapping.java    From Elasticsearch with Apache License 2.0 6 votes vote down vote up
/**
 * Creates an automaton for the given context queries; this automaton is used
 * to find the matching paths in the FST.
 * <p>
 * Each query is prepended to the automaton built so far, followed by a gap,
 * so the last query in iteration order ends up first in the result.
 *
 * @param preserveSep if set, an additional char (<code>XAnalyzingSuggester.SEP_LABEL</code>)
 *                    is appended to the gap that follows each context query
 * @param queries list of {@link ContextQuery} defining the lookup context
 *
 * @return determinized automaton matching the given queries
 */
public static Automaton toAutomaton(boolean preserveSep, Iterable<ContextQuery> queries) {
    final Automaton separator = Automata.makeChar(ContextMapping.SEPARATOR);
    // If separators are preserved, the FST contains a SEP_LABEL behind each gap,
    // so the query automaton has to include the SEP_LABEL as well.
    final Automaton gap = preserveSep
            ? Operations.concatenate(separator, Automata.makeChar(XAnalyzingSuggester.SEP_LABEL))
            : separator;

    Automaton result = Automata.makeEmptyString();
    for (ContextQuery query : queries) {
        result = Operations.concatenate(Arrays.asList(query.toAutomaton(), gap, result));
    }

    // TODO: should we limit this?  Do any of our ContextQuery impls really create exponential regexps?
    // GeoQuery looks safe (union of strings).
    return Operations.determinize(result, Integer.MAX_VALUE);
}
 
Example #4
Source File: TestRegexpQuery.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that a {@code RegexpQuery} resolves the named automaton
 * {@code <quickBrown>} through a custom {@code AutomatonProvider}.
 */
public void testCustomProvider() throws IOException {
  AutomatonProvider myProvider = new AutomatonProvider() {
    // accepts exactly "quick", "brown" or "bob"
    private final Automaton quickBrownAutomaton = Operations.union(Arrays.asList(
        Automata.makeString("quick"),
        Automata.makeString("brown"),
        Automata.makeString("bob")));

    @Override
    public Automaton getAutomaton(String name) {
      // only the name "quickBrown" is known to this provider
      return name.equals("quickBrown") ? quickBrownAutomaton : null;
    }
  };

  RegexpQuery query = new RegexpQuery(
      newTerm("<quickBrown>"), RegExp.ALL, myProvider, DEFAULT_MAX_DETERMINIZED_STATES);
  assertEquals(1, searcher.search(query, 5).totalHits.value);
}
 
Example #5
Source File: TestDuelingAnalyzers.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds {@code jvmLetter}: a run automaton over repetitions of a single
 * code point for which this JVM's {@link Character#isLetter(int)} is true.
 */
@BeforeClass
public static void beforeClass() throws Exception {
  Automaton oneLetter = new Automaton();
  int start = oneLetter.createState();
  int end = oneLetter.createState();
  oneLetter.setAccept(end, true);

  // one transition per code point that this JVM classifies as a letter
  for (int codePoint = Character.MIN_CODE_POINT; codePoint <= Character.MAX_CODE_POINT; codePoint++) {
    if (Character.isLetter(codePoint)) {
      oneLetter.addTransition(start, end, codePoint);
    }
  }

  jvmLetter = new CharacterRunAutomaton(Operations.repeat(oneLetter));
}
 
Example #6
Source File: TestAutomatonQuery.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Checks the hit counts of some very simple automata, first the primitive
 * ones and then a few built with set operations.
 */
public void testAutomata() throws IOException {
  // primitive automata
  assertAutomatonHits(0, Automata.makeEmpty());
  assertAutomatonHits(0, Automata.makeEmptyString());
  assertAutomatonHits(2, Automata.makeAnyChar());
  assertAutomatonHits(3, Automata.makeAnyString());
  assertAutomatonHits(2, Automata.makeString("doc"));
  assertAutomatonHits(1, Automata.makeChar('a'));
  assertAutomatonHits(2, Automata.makeCharRange('a', 'b'));
  assertAutomatonHits(2, Automata.makeDecimalInterval(1233, 2346, 0));
  assertAutomatonHits(1, Automata.makeDecimalInterval(0, 2000, 0));

  // set operations on single-character automata
  Automaton aOrB = Operations.union(Automata.makeChar('a'), Automata.makeChar('b'));
  assertAutomatonHits(2, aOrB);

  Automaton aAndB = Operations.intersection(Automata.makeChar('a'), Automata.makeChar('b'));
  assertAutomatonHits(0, aAndB);

  Automaton rangeMinusA = Operations.minus(
      Automata.makeCharRange('a', 'b'), Automata.makeChar('a'), DEFAULT_MAX_DETERMINIZED_STATES);
  assertAutomatonHits(1, rangeMinusA);
}
 
Example #7
Source File: FuzzyCompletionQuery.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Analyzes the query term into an automaton, expands it with
 * {@code toLevenshteinAutomata} (which also fills {@code refs}) and wraps the
 * result in a {@code FuzzyCompletionWeight}. When {@code unicodeAware} is set
 * the automaton is converted with {@code UTF32ToUTF8} and determinized first.
 */
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
  final Automaton analyzed;
  try (CompletionTokenStream tokens =
           (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) {
    analyzed = tokens.toAutomaton(unicodeAware);
  }

  final Set<IntsRef> refs = new HashSet<>();
  final Automaton fuzzy = toLevenshteinAutomata(analyzed, refs);
  final Automaton automaton = unicodeAware
      ? Operations.determinize(new UTF32ToUTF8().convert(fuzzy), maxDeterminizedStates)
      : fuzzy;

  // TODO Accumulating all refs is bad, because the resulting set may be very big.
  // TODO Better iterate over automaton again inside FuzzyCompletionWeight?
  return new FuzzyCompletionWeight(this, automaton, refs);
}
 
Example #8
Source File: ContextQuery.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the automaton that matches the context part of a suggestion.
 * <p>
 * If {@code matchAllContexts} is set or {@code contexts} is empty, the result
 * accepts anything followed by the context separator. Otherwise it is the
 * union, over all entries, of the context value (followed by anything when
 * the entry is not {@code exact}) followed by the context separator.
 */
private static Automaton toContextAutomaton(final Map<IntsRef, ContextMetaData> contexts, final boolean matchAllContexts) {
  final Automaton anything = Operations.repeat(Automata.makeAnyString());
  final Automaton separator = Automata.makeChar(ContextSuggestField.CONTEXT_SEPARATOR);
  if (matchAllContexts || contexts.isEmpty()) {
    return Operations.concatenate(anything, separator);
  }

  Automaton allContexts = null;
  for (Map.Entry<IntsRef, ContextMetaData> entry : contexts.entrySet()) {
    final IntsRef value = entry.getKey();
    Automaton oneContext = Automata.makeString(value.ints, value.offset, value.length);
    if (!entry.getValue().exact) {
      // non-exact contexts match as a prefix
      oneContext = Operations.concatenate(oneContext, anything);
    }
    oneContext = Operations.concatenate(oneContext, separator);
    allContexts = (allContexts == null) ? oneContext : Operations.union(allContexts, oneContext);
  }
  return allContexts;
}
 
Example #9
Source File: TermInSetQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a run automaton over the union of one binary automaton per term
 * in {@code termData}.
 */
private ByteRunAutomaton asByteRunAutomaton() {
  List<Automaton> perTerm = new ArrayList<>();
  TermIterator terms = termData.iterator();
  BytesRef term;
  // the iterator signals exhaustion by returning null
  while ((term = terms.next()) != null) {
    perTerm.add(Automata.makeBinary(term));
  }
  Automaton anyTerm = Operations.union(perTerm);
  return new CompiledAutomaton(anyTerm).runAutomaton;
}
 
Example #10
Source File: TestMockAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Test a configuration that behaves a lot like KeepWordFilter */
public void testKeep() throws Exception {
  Automaton fooOrBar = Operations.union(
      Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar")));
  // Everything except "foo" and "bar". The expected output below suggests
  // MockAnalyzer removes the tokens this automaton accepts.
  Automaton allButFooAndBar = Operations.complement(fooOrBar, DEFAULT_MAX_DETERMINIZED_STATES);
  CharacterRunAutomaton keepWords = new CharacterRunAutomaton(allButFooAndBar);

  Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
  String[] expectedTerms = { "foo", "bar", "bar", "foo" };
  int[] expectedPositionIncrements = { 2, 2, 1, 2 };
  assertAnalyzesTo(a, "quick foo brown bar bar fox foo", expectedTerms, expectedPositionIncrements);
}
 
Example #11
Source File: TestFuzzyQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds fuzzy automata for a term long enough to make both builder methods
 * throw, and checks that each exception message contains the term.
 */
public void testErrorMessage() {
  // 45 states per vector from Lev2TParametricDescription
  final int termLength = (Operations.DEFAULT_MAX_DETERMINIZED_STATES / 45) + 10;
  final String term = randomRealisticMultiByteUnicode(termLength);

  FuzzyTermsEnum.FuzzyTermsException fromMaxEdit = expectThrows(
      FuzzyTermsEnum.FuzzyTermsException.class,
      () -> new FuzzyAutomatonBuilder(term, 2, 0, true).buildMaxEditAutomaton());
  assertThat(fromMaxEdit.getMessage(), containsString(term));

  FuzzyTermsEnum.FuzzyTermsException fromAutomatonSet = expectThrows(
      FuzzyTermsEnum.FuzzyTermsException.class,
      () -> new FuzzyAutomatonBuilder(term, 2, 0, true).buildAutomatonSet());
  assertThat(fromAutomatonSet.getMessage(), containsString(term));
}
 
Example #12
Source File: TestAutomatonQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Test that a nondeterministic automaton works correctly. (It should be
 * determinized.)
 */
public void testNFA() throws IOException {
  Automaton acceptsThis = Automata.makeString("this");
  Automaton acceptsThree = Automata.makeString("three");
  // the union is an NFA: two transitions for 't' leave the initial state
  Automaton nfa = Operations.union(acceptsThis, acceptsThree);
  assertAutomatonHits(2, nfa);
}
 
Example #13
Source File: TestAutomatonQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Checks {@code equals}/{@code hashCode} of {@code AutomatonQuery}: queries
 * with the same term and language are equal, while a different term, a
 * different language, a different query class or {@code null} is not.
 */
public void testEquals() {
  AutomatonQuery literal = new AutomatonQuery(newTerm("foobar"), Automata.makeString("foobar"));
  // the very same instance
  AutomatonQuery alias = literal;
  // same term, same language, built by concatenation
  AutomatonQuery concatenated = new AutomatonQuery(newTerm("foobar"),
      Operations.concatenate(Automata.makeString("foo"), Automata.makeString("bar")));
  // same term, different language
  AutomatonQuery otherLanguage = new AutomatonQuery(newTerm("foobar"), Automata.makeString("different"));
  // different term, same language
  AutomatonQuery otherTerm = new AutomatonQuery(newTerm("blah"), Automata.makeString("foobar"));

  assertEquals(literal.hashCode(), alias.hashCode());
  assertEquals(literal, alias);

  assertEquals(literal.hashCode(), concatenated.hashCode());
  assertEquals(literal, concatenated);

  // different query classes over the same term
  AutomatonQuery wildcard = new WildcardQuery(newTerm("foobar"));
  AutomatonQuery regexp = new RegexpQuery(newTerm("foobar"));

  assertFalse(literal.equals(wildcard));
  assertFalse(literal.equals(regexp));
  assertFalse(wildcard.equals(regexp));
  assertFalse(literal.equals(otherLanguage));
  assertFalse(literal.equals(otherTerm));
  assertFalse(literal.equals(null));
}
 
Example #14
Source File: TestAutomatonQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Test that rewriting to a prefix query works as expected and preserves
 * MultiTermQuery semantics.
 */
public void testRewritePrefix() throws IOException {
  // "do" followed by any string, i.e. a prefix automaton
  Automaton doPrefix = Operations.concatenate(Automata.makeString("do"), Automata.makeAnyString());
  AutomatonQuery prefixQuery = new AutomatonQuery(newTerm("bogus"), doPrefix);
  assertEquals(3, automatonQueryNrHits(prefixQuery));
}
 
Example #15
Source File: TestGraphTokenizers.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Token "x" (position length 3) overlaps "a" and two holes before "b";
 * the expected language is either path.
 */
public void testSynOverMultipleHoles() throws Exception {
  final Token[] tokens = {
    token("a", 1, 1),
    token("x", 0, 3),
    token("b", 3, 1),
  };
  final TokenStream ts = new CannedTokenStream(tokens);

  final Automaton throughHoles = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
  final Automaton throughSynonym = join(s2a("x"), SEP_A, s2a("b"));
  final Automaton expected = Operations.union(throughHoles, throughSynonym);
  assertSameLanguage(expected, ts);
}
 
Example #16
Source File: TestGraphTokenizers.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Concatenates the automata of the given strings, with {@code SEP_A}
 * between consecutive strings.
 */
private Automaton join(String ... strings) {
  List<Automaton> parts = new ArrayList<>();
  for (int i = 0; i < strings.length; i++) {
    parts.add(s2a(strings[i]));
    parts.add(SEP_A);
  }
  // every string was followed by a separator; drop the trailing one
  parts.remove(parts.size() - 1);
  return Operations.concatenate(parts);
}
 
Example #17
Source File: TestGraphTokenizers.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Two tokens on top of each other (sausage): the expected language is either token. */
public void testOverlappedTokensSausage() throws Exception {
  final Token[] tokens = {
    token("abc", 1, 1),
    token("xyz", 0, 1)
  };
  final TokenStream ts = new CannedTokenStream(tokens);

  final Automaton expected = Operations.union(s2a("abc"), s2a("xyz"));
  assertSameLanguage(expected, ts);
}
 
Example #18
Source File: TestGraphTokenizers.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** "xyz" (position length 2) spans "abc def": the expected language is either path. */
public void testOverlappedTokensLattice() throws Exception {
  final Token[] tokens = {
    token("abc", 1, 1),
    token("xyz", 0, 2),
    token("def", 1, 1),
  };
  final TokenStream ts = new CannedTokenStream(tokens);

  final Automaton spanning = s2a("xyz");
  final Automaton sequence = join("abc", "def");
  assertSameLanguage(Operations.union(spanning, sequence), ts);
}
 
Example #19
Source File: TestGraphTokenizers.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** "X" (position length 2) overlaps "a" and the hole that follows it; both paths end in "b". */
public void testSynOverHole() throws Exception {
  final Token[] tokens = {
    token("a", 1, 1),
    token("X", 0, 2),
    token("b", 2, 1),
  };
  final TokenStream ts = new CannedTokenStream(tokens);

  final Automaton head = Operations.union(join(s2a("a"), SEP_A, HOLE_A), s2a("X"));
  final Automaton tail = join(SEP_A, s2a("b"));
  assertSameLanguage(Operations.concatenate(head, tail), ts);
}
 
Example #20
Source File: TestGraphTokenizers.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** "abc" (position length 3) spans "xyz", a hole and "def": the expected language is either path. */
public void testSynOverHole2() throws Exception {
  final Token[] tokens = {
    token("xyz", 1, 1),
    token("abc", 0, 3),
    token("def", 2, 1),
  };
  final TokenStream ts = new CannedTokenStream(tokens);

  final Automaton throughHole = join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def"));
  final Automaton expected = Operations.union(throughHole, s2a("abc"));
  assertSameLanguage(expected, ts);
}
 
Example #21
Source File: TestGraphTokenizers.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** "xyz" (position length 3) spans "abc def ghi": the expected language is either path. */
public void testOverlappedTokensLattice2() throws Exception {
  final Token[] tokens = {
    token("abc", 1, 1),
    token("xyz", 0, 3),
    token("def", 1, 1),
    token("ghi", 1, 1),
  };
  final TokenStream ts = new CannedTokenStream(tokens);

  final Automaton spanning = s2a("xyz");
  final Automaton sequence = join("abc", "def", "ghi");
  assertSameLanguage(Operations.union(spanning, sequence), ts);
}
 
Example #22
Source File: TestGraphTokenizers.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** "X" has a position length (10) reaching past the last token; the expected language is "a" or "X". */
public void testSynHangingOverEnd() throws Exception {
  final Token[] tokens = {
    token("a", 1, 1),
    token("X", 0, 10),
  };
  final TokenStream ts = new CannedTokenStream(tokens);

  final Automaton expected = Operations.union(s2a("a"), s2a("X"));
  assertSameLanguage(expected, ts);
}
 
Example #23
Source File: TestGraphTokenizers.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * "xyz" has position length 8, far beyond the tokens that follow it. The
 * expected language is "abc xyz", or "abc", a hole, "def", "ghi".
 */
public void testTokenStreamGraphWithHoles() throws Exception {
  final Token[] tokens = {
    token("abc", 1, 1),
    token("xyz", 1, 8),
    token("def", 1, 1),
    token("ghi", 1, 1),
  };
  final TokenStream ts = new CannedTokenStream(tokens);

  final Automaton viaXyz = join(s2a("abc"), SEP_A, s2a("xyz"));
  final Automaton viaHole = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"), SEP_A, s2a("ghi"));
  assertSameLanguage(Operations.union(viaXyz, viaHole), ts);
}
 
Example #24
Source File: TestReversedWildcardFilterFactory.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Fragile assert: depends on our implementation, but cleanest way to check for now.
 * Returns true if {@code query} parses to an {@code AutomatonQuery} whose
 * determinized automaton has a common prefix starting with U+0001.
 */
private boolean wasReversed(SolrQueryParser qp, String query) throws Exception {
  Query parsed = qp.parse(query);
  if (parsed instanceof AutomatonQuery) {
    Automaton determinized = Operations.determinize(
        ((AutomatonQuery) parsed).getAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    String prefix = Operations.getCommonPrefix(determinized);
    return !prefix.isEmpty() && prefix.charAt(0) == '\u0001';
  }
  return false;
}
 
Example #25
Source File: Regex.java    From crate with Apache License 2.0 5 votes vote down vote up
/**
 * Return an {@link Automaton} that matches the given pattern: every
 * {@code '*'} matches any string, all other characters match literally.
 */
public static Automaton simpleMatchToAutomaton(String pattern) {
    List<Automaton> pieces = new ArrayList<>();
    int literalStart = 0;
    int star = pattern.indexOf('*');
    while (star != -1) {
        // literal text before the wildcard (possibly empty), then the wildcard itself
        pieces.add(Automata.makeString(pattern.substring(literalStart, star)));
        pieces.add(Automata.makeAnyString());
        literalStart = star + 1;
        star = pattern.indexOf('*', literalStart);
    }
    // trailing literal after the last wildcard (the whole pattern if there is none)
    pieces.add(Automata.makeString(pattern.substring(literalStart)));
    return Operations.concatenate(pieces);
}
 
Example #26
Source File: Regex.java    From crate with Apache License 2.0 5 votes vote down vote up
/**
 * Return an Automaton that matches the union of the provided patterns.
 *
 * @param patterns one or more simple-match patterns
 * @throws IllegalArgumentException if no pattern is given
 */
public static Automaton simpleMatchToAutomaton(String... patterns) {
    if (patterns.length == 0) {
        throw new IllegalArgumentException("There must be at least one pattern, zero given");
    }
    List<Automaton> perPattern = new ArrayList<>();
    for (int i = 0; i < patterns.length; i++) {
        perPattern.add(simpleMatchToAutomaton(patterns[i]));
    }
    return Operations.union(perPattern);
}
 
Example #27
Source File: CategoryContextMapping.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/** Returns the union of one literal-string automaton per entry of {@code values}. */
@Override
public Automaton toAutomaton() {
    final List<Automaton> perValue = new ArrayList<>();
    for (CharSequence category : values) {
        final String literal = category.toString();
        perValue.add(Automata.makeString(literal));
    }
    return Operations.union(perValue);
}
 
Example #28
Source File: XAnalyzingSuggester.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Prepends {@code queryPrefix} to {@code a} and determinizes the result;
 * returns {@code a} unchanged when no query prefix is set.
 */
protected Automaton convertAutomaton(Automaton a) {
  if (queryPrefix == null) {
    return a;
  }
  Automaton prefixed = Operations.concatenate(Arrays.asList(queryPrefix, a));
  // This automaton should not blow up during determinize:
  return Operations.determinize(prefixed, Integer.MAX_VALUE);
}
 
Example #29
Source File: TestSynonymGraphFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Renumbers nodes according to their topo sort: state {@code i} of the
 * returned automaton corresponds to the {@code i}-th state in the order
 * returned by {@code Operations.topoSortStates}.
 */
private Automaton topoSort(Automaton in) {
  final int[] sortedToOriginal = Operations.topoSortStates(in);
  final int stateCount = sortedToOriginal.length;
  final int[] originalToSorted = new int[stateCount];

  // First pass: create the renumbered states and carry over the accept flags.
  final Automaton.Builder builder = new Automaton.Builder();
  for (int sorted = 0; sorted < stateCount; sorted++) {
    final int original = sortedToOriginal[sorted];
    builder.createState();
    originalToSorted[original] = sorted;
    if (in.isAccept(original)) {
      builder.setAccept(sorted, true);
    }
  }

  // Second pass: copy every transition, remapping source and destination.
  final Transition scratch = new Transition();
  for (int sorted = 0; sorted < stateCount; sorted++) {
    int remaining = in.initTransition(sortedToOriginal[sorted], scratch);
    while (remaining-- > 0) {
      in.getNextTransition(scratch);
      builder.addTransition(sorted, originalToSorted[scratch.dest], scratch.min, scratch.max);
    }
  }

  return builder.finish();
}
 
Example #30
Source File: ConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Converts the tokenStream to an automaton.  Does *not* close it.
 *
 * @param unicodeAware passed to {@code setUnicodeArcs} on the converter
 * @return the automaton after separator replacement and determinization
 *         (limited by {@code maxGraphExpansions})
 */
public Automaton toAutomaton(boolean unicodeAware) throws IOException {
  // TODO refactor this
  // maybe we could hook up a modified automaton from TermAutomatonQuery here?

  // Create corresponding automaton: labels are bytes from each analyzed
  // token, with byte 0 used as separator between tokens.
  // When we're not preserving sep, we don't steal 0xff byte, so the plain
  // (non-escaping) converter is enough.
  final TokenStreamToAutomaton converter = tokenSeparator != null
      ? new EscapingTokenStreamToAutomaton(tokenSeparator)
      : new TokenStreamToAutomaton();
  converter.setPreservePositionIncrements(preservePositionIncrements);
  converter.setUnicodeArcs(unicodeAware);

  // TODO: we can optimize this somewhat by determinizing while we convert
  final Automaton fromTokens = converter.toAutomaton(inputTokenStream);
  final Automaton withSeparators = replaceSep(fromTokens, tokenSeparator);

  // This automaton should not blow up during determinize:
  return Operations.determinize(withSeparators, maxGraphExpansions);
}