org.apache.lucene.util.automaton.Automaton Java Examples

The following examples show how to use org.apache.lucene.util.automaton.Automaton. They are drawn from several open-source projects (lucene-solr, Elasticsearch, mtas, and crate); the source file and originating project are noted above each example.
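Before the project-specific examples, here is a minimal self-contained sketch (assuming Lucene 8.x) of the core API that recurs throughout this page: states are created explicitly, transitions carry integer labels (Unicode code points or bytes), and finishState() must be called before the automaton is used.

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

// Sketch (not from any of the projects below): an automaton accepting
// exactly the string "ab". State 0, the first state created, is the
// initial state by convention.
Automaton a = new Automaton();
int s0 = a.createState();
int s1 = a.createState();
int s2 = a.createState();
a.setAccept(s2, true);
a.addTransition(s0, s1, 'a');
a.addTransition(s1, s2, 'b');
a.finishState(); // freeze the current state; required before use

CharacterRunAutomaton run = new CharacterRunAutomaton(a);
// run.run("ab") == true, run.run("abc") == false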
Example #1
Source File: TermAutomatonQuery.java    From lucene-solr with Apache License 2.0
public TermAutomatonWeight(Automaton automaton, IndexSearcher searcher, Map<Integer,TermStates> termStates, float boost) throws IOException {
  super(TermAutomatonQuery.this);
  this.automaton = automaton;
  this.termStates = termStates;
  this.similarity = searcher.getSimilarity();
  List<TermStatistics> allTermStats = new ArrayList<>();
  for(Map.Entry<Integer,BytesRef> ent : idToTerm.entrySet()) {
    Integer termID = ent.getKey();
    if (ent.getValue() != null) {
      TermStates ts = termStates.get(termID);
      if (ts.docFreq() > 0) {
        allTermStats.add(searcher.termStatistics(new Term(field, ent.getValue()), ts.docFreq(), ts.totalTermFreq()));
      }
    }
  }

  if (allTermStats.isEmpty()) {
    stats = null; // no terms matched at all, will not use sim
  } else {
    stats = similarity.scorer(boost, searcher.collectionStatistics(field),
                                     allTermStats.toArray(new TermStatistics[allTermStats.size()]));
  }
}
 
Example #2
Source File: TestDuelingAnalyzers.java    From lucene-solr with Apache License 2.0
@BeforeClass
public static void beforeClass() throws Exception {
  Automaton single = new Automaton();
  int initial = single.createState();
  int accept = single.createState();
  single.setAccept(accept, true);

  // build an automaton matching this jvm's letter definition
  for (int i = 0; i <= 0x10FFFF; i++) {
    if (Character.isLetter(i)) {
      single.addTransition(initial, accept, i);
    }
  }
  Automaton repeat = Operations.repeat(single);
  jvmLetter = new CharacterRunAutomaton(repeat);
}
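A brief usage note (a sketch, not part of the original test): CharacterRunAutomaton compiles the automaton for direct matching against strings, so the jvmLetter field built above can be used like this.

// Operations.repeat() produces the Kleene star, so jvmLetter accepts zero
// or more letters, as defined by this JVM's Character.isLetter.
assert jvmLetter.run("abc");
assert jvmLetter.run("abc1") == false;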
 
Example #3
Source File: AnalyzingSuggester.java    From lucene-solr with Apache License 2.0
final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
  // Analyze surface form:
  Automaton automaton;
  try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {

    // Create corresponding automaton: labels are bytes
    // from each analyzed token, with byte 0 used as
    // separator between tokens:
    automaton = ts2a.toAutomaton(ts);
  }

  automaton = replaceSep(automaton);
  automaton = convertAutomaton(automaton);

  // TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings
  // assert SpecialOperations.isFinite(automaton);

  // Get all paths from the automaton (there can be
  // more than one path, eg if the analyzer created a
  // graph using SynFilter or WDF):
  return automaton;
}
 
Example #4
Source File: MtasToken.java    From mtas with Apache License 2.0
/**
 * Creates the automaton map.
 *
 * @param prefix the prefix
 * @param valueList the value list
 * @param filter the filter
 * @return the map
 */
public static Map<String, Automaton> createAutomatonMap(String prefix,
    List<String> valueList, Boolean filter) {
  HashMap<String, Automaton> automatonMap = new HashMap<>();
  if (valueList != null) {
    for (String item : valueList) {
      if (filter) {
        item = item.replaceAll("([\\\"\\)\\(\\<\\>\\.\\@\\#\\]\\[\\{\\}])",
            "\\\\$1");
      }
      automatonMap.put(item,
          new RegExp(prefix + MtasToken.DELIMITER + item + "\u0000*")
              .toAutomaton());
    }
  }
  return automatonMap;
}
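A usage sketch (the prefix and value list here are made up): with filter set to true, regex metacharacters in each value are escaped before the RegExp is compiled, so values containing characters such as parentheses are matched literally.

List<String> values = Arrays.asList("NN", "VB(aux)");
// "VB(aux)" is escaped to "VB\(aux\)", so the parentheses match literally
// instead of being parsed as a RegExp group.
Map<String, Automaton> byValue = MtasToken.createAutomatonMap("t", values, true);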
 
Example #5
Source File: TestSynonymGraphFilter.java    From lucene-solr with Apache License 2.0
/** Just creates a side path from startState to endState with the provided tokens. */
private static void addSidePath(Automaton.Builder a, int startState, int endState, char[] tokens, List<Integer> flatStates) {
  int lastState = startState;
  for(int i=0;i<tokens.length;i++) {
    int nextState;
    if (i == tokens.length-1) {
      nextState = endState;
    } else if (flatStates == null || i >= flatStates.size()) {
      nextState = a.createState();
      if (flatStates != null) {
        assert i == flatStates.size();
        flatStates.add(nextState);
      }
    } else {
      nextState = flatStates.get(i);
    }
    a.addTransition(lastState, nextState, tokens[i]);

    lastState = nextState;
  }
}
 
Example #6
Source File: FuzzySuggester.java    From lucene-solr with Apache License 2.0
@Override
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
                                                                     Automaton lookupAutomaton,
                                                                     FST<Pair<Long,BytesRef>> fst)
  throws IOException {

  // TODO: right now there's no penalty for fuzzy/edits,
  // ie a completion whose prefix matched exactly what the
  // user typed gets no boost over completions that
  // required an edit, which get no boost over completions
  // requiring two edits.  I suspect a multiplicative
  // factor is appropriate (eg, say a fuzzy match must be at
  // least 2X better weight than the non-fuzzy match to
  // "compete") ... in which case I think the wFST needs
  // to be log weights or something ...

  Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
  /*
    Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
    w.write(levA.toDot());
    w.close();
    System.out.println("Wrote LevA to out.dot");
  */
  return FSTUtil.intersectPrefixPaths(levA, fst);
}
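The toLevenshteinAutomata call above is suggester-specific, but the underlying Lucene utility can also be used directly; a minimal sketch (the input string and edit distance are made up):

import org.apache.lucene.util.automaton.LevenshteinAutomata;

// Automaton accepting every string within edit distance 2 of "lucene"
// (the boolean selects whether transpositions count as single edits).
Automaton lev2 = new LevenshteinAutomata("lucene", false).toAutomaton(2);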
 
Example #7
Source File: PrefixQuery.java    From lucene-solr with Apache License 2.0
/** Build an automaton accepting all terms with the specified prefix. */
public static Automaton toAutomaton(BytesRef prefix) {
  final int numStatesAndTransitions = prefix.length+1;
  final Automaton automaton = new Automaton(numStatesAndTransitions, numStatesAndTransitions);
  int lastState = automaton.createState();
  for(int i=0;i<prefix.length;i++) {
    int state = automaton.createState();
    automaton.addTransition(lastState, state, prefix.bytes[prefix.offset+i]&0xff);
    lastState = state;
  }
  automaton.setAccept(lastState, true);
  automaton.addTransition(lastState, lastState, 0, 255);
  automaton.finishState();
  assert automaton.isDeterministic();
  return automaton;
}
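Because the transitions above carry byte labels (note the & 0xff), the result is a binary automaton. A usage sketch (the field and term are made up, assuming Lucene 8.x): wrap it in an AutomatonQuery using the constructor whose final flag marks the automaton as binary.

Automaton a = PrefixQuery.toAutomaton(new BytesRef("do"));
// isBinary = true: labels are byte values, not Unicode code points
Query q = new AutomatonQuery(new Term("body", "do"), a, Integer.MAX_VALUE, true);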
 
Example #8
Source File: GraphTokenStreamFiniteStrings.java    From lucene-solr with Apache License 2.0
/**
 * Returns the articulation points (or cut vertices) of the graph:
 * https://en.wikipedia.org/wiki/Biconnected_component
 */
public int[] articulationPoints() {
  if (det.getNumStates() == 0) {
    return new int[0];
  }
  // build an undirected view of the automaton for the articulation-point DFS below
  Automaton.Builder undirect = new Automaton.Builder();
  undirect.copy(det);
  for (int i = 0; i < det.getNumStates(); i++) {
    int numT = det.initTransition(i, transition);
    for (int j = 0; j < numT; j++) {
      det.getNextTransition(transition);
      undirect.addTransition(transition.dest, i, transition.min);
    }
  }
  int numStates = det.getNumStates();
  BitSet visited = new BitSet(numStates);
  int[] depth = new int[det.getNumStates()];
  int[] low = new int[det.getNumStates()];
  int[] parent = new int[det.getNumStates()];
  Arrays.fill(parent, -1);
  List<Integer> points = new ArrayList<>();
  articulationPointsRecurse(undirect.finish(), 0, 0, depth, low, parent, visited, points);
  Collections.reverse(points);
  return points.stream().mapToInt(p -> p).toArray();
}
 
Example #9
Source File: ContextMapping.java    From Elasticsearch with Apache License 2.0
/**
 * Create an automaton for a given context query. This automaton will be used
 * to find the matching paths within the FST.
 *
 * @param preserveSep set an additional char (<code>XAnalyzingSuggester.SEP_LABEL</code>) between each context query
 * @param queries list of {@link ContextQuery} defining the lookup context
 *
 * @return Automaton matching the given Query
 */
public static Automaton toAutomaton(boolean preserveSep, Iterable<ContextQuery> queries) {
    Automaton a = Automata.makeEmptyString();

    Automaton gap = Automata.makeChar(ContextMapping.SEPARATOR);
    if (preserveSep) {
        // if separators are preserved the fst contains a SEP_LABEL
        // behind each gap. To have a matching automaton, we need to
        // include the SEP_LABEL in the query as well
        gap = Operations.concatenate(gap, Automata.makeChar(XAnalyzingSuggester.SEP_LABEL));
    }

    for (ContextQuery query : queries) {
        a = Operations.concatenate(Arrays.asList(query.toAutomaton(), gap, a));
    }

    // TODO: should we limit this?  Do any of our ContextQuery impls really create exponential regexps?  GeoQuery looks safe (union
    // of strings).
    return Operations.determinize(a, Integer.MAX_VALUE);
}
 
Example #10
Source File: IncludeExclude.java    From Elasticsearch with Apache License 2.0
private Automaton toAutomaton() {
    Automaton a = null;
    if (include != null) {
        a = include.toAutomaton();
    } else if (includeValues != null) {
        a = Automata.makeStringUnion(includeValues);
    } else {
        a = Automata.makeAnyString();
    }
    if (exclude != null) {
        a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    } else if (excludeValues != null) {
        a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    }
    return a;
}
 
Example #11
Source File: TestRegexpQuery.java    From lucene-solr with Apache License 2.0
public void testCustomProvider() throws IOException {
  AutomatonProvider myProvider = new AutomatonProvider() {
    // automaton that matches quick or brown
    private Automaton quickBrownAutomaton = Operations.union(Arrays
        .asList(Automata.makeString("quick"),
        Automata.makeString("brown"),
        Automata.makeString("bob")));
    
    @Override
    public Automaton getAutomaton(String name) {
      if (name.equals("quickBrown")) return quickBrownAutomaton;
      else return null;
    }
  };
  RegexpQuery query = new RegexpQuery(newTerm("<quickBrown>"), RegExp.ALL,
    myProvider, DEFAULT_MAX_DETERMINIZED_STATES);
  assertEquals(1, searcher.search(query, 5).totalHits.value);
}
 
Example #12
Source File: XAnalyzingSuggester.java    From Elasticsearch with Apache License 2.0
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    // Turn tokenstream into automaton:
    Automaton automaton = null;

    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
        automaton = getTokenStreamToAutomaton().toAutomaton(ts);
    }

    automaton = replaceSep(automaton);

    // TODO: we can optimize this somewhat by determinizing
    // while we convert

    // This automaton should not blow up during determinize:
    automaton = Operations.determinize(automaton, Integer.MAX_VALUE);
    return automaton;
}
 
Example #13
Source File: FuzzyCompletionQuery.java    From lucene-solr with Apache License 2.0
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
  final Automaton originalAutomata;
  try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) {
    originalAutomata = stream.toAutomaton(unicodeAware);
  }
  Set<IntsRef> refs = new HashSet<>();
  Automaton automaton = toLevenshteinAutomata(originalAutomata, refs);
  if (unicodeAware) {
    Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
    utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);
    automaton = utf8automaton;
  }
  // TODO Accumulating all refs is bad, because the resulting set may be very big.
  // TODO Better iterate over automaton again inside FuzzyCompletionWeight?
  return new FuzzyCompletionWeight(this, automaton, refs);
}
 
Example #14
Source File: ContextQuery.java    From lucene-solr with Apache License 2.0
private static Automaton toContextAutomaton(final Map<IntsRef, ContextMetaData> contexts, final boolean matchAllContexts) {
  final Automaton matchAllAutomaton = Operations.repeat(Automata.makeAnyString());
  final Automaton sep = Automata.makeChar(ContextSuggestField.CONTEXT_SEPARATOR);
  if (matchAllContexts || contexts.size() == 0) {
    return Operations.concatenate(matchAllAutomaton, sep);
  } else {
    Automaton contextsAutomaton = null;
    for (Map.Entry<IntsRef, ContextMetaData> entry : contexts.entrySet()) {
      final ContextMetaData contextMetaData = entry.getValue();
      final IntsRef ref = entry.getKey();
      Automaton contextAutomaton = Automata.makeString(ref.ints, ref.offset, ref.length);
      if (contextMetaData.exact == false) {
        contextAutomaton = Operations.concatenate(contextAutomaton, matchAllAutomaton);
      }
      contextAutomaton = Operations.concatenate(contextAutomaton, sep);
      if (contextsAutomaton == null) {
        contextsAutomaton = contextAutomaton;
      } else {
        contextsAutomaton = Operations.union(contextsAutomaton, contextAutomaton);
      }
    }
    return contextsAutomaton;
  }
}
 
Example #15
Source File: TestGraphTokenizers.java    From lucene-solr with Apache License 2.0
public void testSynOverMultipleHoles() throws Exception {
  final TokenStream ts = new CannedTokenStream(
    new Token[] {
      token("a", 1, 1),
      token("x", 0, 3),
      token("b", 3, 1),
    });
  final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")); 
  final Automaton a2 = join(s2a("x"), SEP_A, s2a("b")); 
  assertSameLanguage(Operations.union(a1, a2), ts);
}
 
Example #16
Source File: TestAutomatonQuery.java    From lucene-solr with Apache License 2.0
private void assertAutomatonHits(int expected, Automaton automaton)
    throws IOException {
  AutomatonQuery query = new AutomatonQuery(newTerm("bogus"), automaton);
  
  query.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE);
  assertEquals(expected, automatonQueryNrHits(query));
  
  query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
  assertEquals(expected, automatonQueryNrHits(query));
  
  query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE);
  assertEquals(expected, automatonQueryNrHits(query));
}
 
Example #17
Source File: TestAutomatonQueryUnicode.java    From lucene-solr with Apache License 2.0
private void assertAutomatonHits(int expected, Automaton automaton)
    throws IOException {
  AutomatonQuery query = new AutomatonQuery(newTerm("bogus"), automaton);

  query.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE);
  assertEquals(expected, automatonQueryNrHits(query));

  query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
  assertEquals(expected, automatonQueryNrHits(query));

  query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE);
  assertEquals(expected, automatonQueryNrHits(query));
}
 
Example #18
Source File: TestAutomatonQuery.java    From lucene-solr with Apache License 2.0
/**
 * Test that rewriting to a prefix query works as expected, preserves
 * MultiTermQuery semantics.
 */
public void testRewritePrefix() throws IOException {
  Automaton pfx = Automata.makeString("do");
  Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
  AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
  assertEquals(3, automatonQueryNrHits(aq));
}
 
Example #19
Source File: XContentMapValues.java    From crate with Apache License 2.0
/**
 * Returns a function that filters a document map based on the given include and exclude rules.
 * @see #filter(Map, String[], String[]) for details
 */
public static Function<Map<String, ?>, Map<String, Object>> filter(String[] includes, String[] excludes) {
    CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString());

    CharacterRunAutomaton include;
    if (includes == null || includes.length == 0) {
        include = matchAllAutomaton;
    } else {
        Automaton includeA = Regex.simpleMatchToAutomaton(includes);
        includeA = makeMatchDotsInFieldNames(includeA);
        include = new CharacterRunAutomaton(includeA);
    }

    Automaton excludeA;
    if (excludes == null || excludes.length == 0) {
        excludeA = Automata.makeEmpty();
    } else {
        excludeA = Regex.simpleMatchToAutomaton(excludes);
        excludeA = makeMatchDotsInFieldNames(excludeA);
    }
    CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA);

    // NOTE: We cannot use Operations.minus because of the special case that
    // we want all sub properties to match as soon as an object matches

    return (map) -> filter(map,
        include, 0,
        exclude, 0,
        matchAllAutomaton);
}
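A usage sketch (the field names and source map are made up): the returned Function compiles its automata once up front, so it can be applied to many documents cheaply.

Function<Map<String, ?>, Map<String, Object>> f =
    XContentMapValues.filter(new String[] {"name", "address.*"},
                             new String[] {"address.secret"});
Map<String, Object> filtered = f.apply(sourceMap); // sourceMap: the document to filter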
 
Example #20
Source File: TestIndexWriter.java    From lucene-solr with Apache License 2.0
public void testStopwordsPosIncHole2() throws Exception {
  // use two stopfilters for testing here
  Directory dir = newDirectory();
  final Automaton secondSet = Automata.makeString("foobar");
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer();
      TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
      stream = new MockTokenFilter(stream, new CharacterRunAutomaton(secondSet));
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a);
  Document doc = new Document();
  doc.add(new TextField("body", "just a foobar", Field.Store.NO));
  doc.add(new TextField("body", "test of gaps", Field.Store.NO));
  iw.addDocument(doc);
  IndexReader ir = iw.getReader();
  iw.close();
  IndexSearcher is = newSearcher(ir);
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  builder.add(new Term("body", "just"), 0);
  builder.add(new Term("body", "test"), 3);
  PhraseQuery pq = builder.build();
  // body:"just ? ? test"
  assertEquals(1, is.search(pq, 5).totalHits.value);
  ir.close();
  dir.close();
}
 
Example #21
Source File: TestGraphTokenizers.java    From lucene-solr with Apache License 2.0
public void testOverlappedTokensLattice() throws Exception {
  final TokenStream ts = new CannedTokenStream(
    new Token[] {
      token("abc", 1, 1),
      token("xyz", 0, 2),
      token("def", 1, 1),
    });
  final Automaton a1 = s2a("xyz");
  final Automaton a2 = join("abc", "def");
  assertSameLanguage(Operations.union(a1, a2), ts);
}
 
Example #22
Source File: XAnalyzingSuggester.java    From Elasticsearch with Apache License 2.0
/**
 * Creates a new suggester.
 *
 * @param indexAnalyzer Analyzer that will be used for
 *   analyzing suggestions while building the index.
 * @param queryPrefix Optional automaton prepended to the analyzed
 *   query; used for context dependent suggestions.
 * @param queryAnalyzer Analyzer that will be used for
 *   analyzing query text during lookup
 * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
 * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
 *   surface forms to keep for a single analyzed form.
 *   When there are too many surface forms we discard the
 *   lowest weighted ones.
 * @param maxGraphExpansions Maximum number of graph paths
 *   to expand from the analyzed form.  Set this to -1 for
 *   no limit.
 */
public XAnalyzingSuggester(Analyzer indexAnalyzer, Automaton queryPrefix, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                           boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
                           int sepLabel, int payloadSep, int endByte, int holeCharacter) {
  // SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput
  this.indexAnalyzer = indexAnalyzer;
  this.queryAnalyzer = queryAnalyzer;
  this.fst = fst;
  this.hasPayloads = hasPayloads;
  if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
    throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
  }
  this.exactFirst = (options & EXACT_FIRST) != 0;
  this.preserveSep = (options & PRESERVE_SEP) != 0;

  // FLORIAN EDIT: I added <code>queryPrefix</code> for context dependent suggestions
  this.queryPrefix = queryPrefix;

  // NOTE: this is just an implementation limitation; if
  // somehow this is a problem we could fix it by using
  // more than one byte to disambiguate ... but 256 seems
  // like it should be way more than enough.
  if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {
    throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")");
  }
  this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;

  if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {
    throw new IllegalArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
  }
  this.maxGraphExpansions = maxGraphExpansions;
  this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
  this.preservePositionIncrements = preservePositionIncrements;
  this.sepLabel = sepLabel;
  this.payloadSep = payloadSep;
  this.endByte = endByte;
  this.holeCharacter = holeCharacter;
}
 
Example #23
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0
public void testNotDeterminized() throws Exception {
  Automaton a = new Automaton();
  int start = a.createState();
  int mid1 = a.createState();
  int mid2 = a.createState();
  int end = a.createState();
  a.setAccept(end, true);
  a.addTransition(start, mid1, 'a', 'z');
  a.addTransition(start, mid2, 'a', 'z');
  a.addTransition(mid1, end, 'b');
  a.addTransition(mid2, end, 'b');
  expectThrows(IllegalArgumentException.class, () -> {new SimplePatternTokenizer(a);});
}
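The fix for the failure above is to determinize before constructing the tokenizer; a sketch reusing the same automaton a:

Automaton dfa = Operations.determinize(a, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
Tokenizer tokenizer = new SimplePatternTokenizer(dfa); // now accepted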
 
Example #24
Source File: SimplePatternTokenizer.java    From lucene-solr with Apache License 2.0
/** Runs a pre-built automaton. */
public SimplePatternTokenizer(AttributeFactory factory, Automaton dfa) {
  super(factory);

  // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
  // realizing this ctor is otherwise trappy
  if (dfa.isDeterministic() == false) {
    throw new IllegalArgumentException("please determinize the incoming automaton first");
  }

  runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
 
Example #25
Source File: TestSynonymGraphFilter.java    From lucene-solr with Apache License 2.0
private Automaton toAutomaton(TokenStream ts) throws IOException {
  PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
  PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
  CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  Automaton a = new Automaton();
  int srcNode = -1;
  int destNode = -1;
  int state = a.createState();
  while (ts.incrementToken()) {
    assert termAtt.length() == 1;
    char c = termAtt.charAt(0);
    int posInc = posIncAtt.getPositionIncrement();
    if (posInc != 0) {
      srcNode += posInc;
      while (state < srcNode) {
        state = a.createState();
      }
    }
    destNode = srcNode + posLenAtt.getPositionLength();
    while (state < destNode) {
      state = a.createState();
    }
    a.addTransition(srcNode, destNode, c);
  }
  ts.end();
  ts.close();
  a.finishState();
  a.setAccept(destNode, true);
  return a;
}
 
Example #26
Source File: ConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0
@Override
public boolean incrementToken() throws IOException {
  if (finiteStrings == null) {
    if (wasReset == false) {
      throw new IllegalStateException("reset() missing before incrementToken");
    }
    // lazy init/consume
    Automaton automaton = toAutomaton(); // calls reset(), incrementToken() repeatedly, and end() on inputTokenStream
    finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
    //note: would be nice to know the startOffset but toAutomaton doesn't capture it.  We'll assume 0
    endOffset = inputTokenStream.getAttribute(OffsetAttribute.class).endOffset();
  }

  IntsRef string = finiteStrings.next();
  if (string == null) {
    return false;
  }

  clearAttributes();

  if (finiteStrings.size() > 1) { // if number of iterated strings so far is more than one...
    posIncrAtt.setPositionIncrement(0); // stacked
  }

  offsetAtt.setOffset(0, endOffset);

  Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8
  if (charTermAttribute != null) {
    charTermAttribute.setLength(0);
    charTermAttribute.append(bytesAtt.toUTF16());
  }

  return true;
}
 
Example #27
Source File: TestSynonymGraphFilter.java    From lucene-solr with Apache License 2.0
private boolean approxSubsetOf(Automaton a1, Automaton a2) {
  AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(a1);
  for(int i=0;i<2000;i++) {
    int[] ints = ras.getRandomAcceptedString(random());
    IntsRef path = new IntsRef(ints, 0, ints.length);
    if (accepts(a2, path) == false) {
      throw new RuntimeException("a2 does not accept " + path);
    }
  }

  // Presumed true
  return true;
}
 
Example #28
Source File: GraphEdgeCollector.java    From lucene-solr with Apache License 2.0
@Override
public Query getResultQuery(SchemaField matchField, boolean useAutomaton) {
  if (collectorTerms == null || collectorTerms.size() == 0) {
    // return null if there are no terms (edges) to traverse.
    return null;
  } else {
    // Create a query
    Query q = null;

    // TODO: see if we should dynamically select this based on the frontier size.
    if (useAutomaton) {
      // build an automaton based query for the frontier.
      Automaton autn = buildAutomaton(collectorTerms);
      AutomatonQuery autnQuery = new AutomatonQuery(new Term(matchField.getName()), autn);
      q = autnQuery;
    } else {
      List<BytesRef> termList = new ArrayList<>(collectorTerms.size());
      for (int i = 0; i < collectorTerms.size(); i++) {
        BytesRef ref = new BytesRef();
        collectorTerms.get(i, ref);
        termList.add(ref);
      }
      q = (matchField.hasDocValues() && !matchField.indexed())
              ? new DocValuesTermsQuery(matchField.getName(), termList)
              : new TermInSetQuery(matchField.getName(), termList);
    }

    return q;
  }
}
 
Example #29
Source File: Regex.java    From crate with Apache License 2.0
/**
 * Return an Automaton that matches the union of the provided patterns.
 */
public static Automaton simpleMatchToAutomaton(String... patterns) {
    if (patterns.length < 1) {
        throw new IllegalArgumentException("There must be at least one pattern, zero given");
    }
    List<Automaton> automata = new ArrayList<>();
    for (String pattern : patterns) {
        automata.add(simpleMatchToAutomaton(pattern));
    }
    return Operations.union(automata);
}
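A usage sketch (the patterns are made up), consuming the result the same way Example #19 does:

Automaton a = Regex.simpleMatchToAutomaton("foo*", "*bar");
CharacterRunAutomaton run = new CharacterRunAutomaton(a);
// run.run("foox") == true, run.run("xbar") == true, run.run("baz") == false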
 
Example #30
Source File: GraphEdgeCollector.java    From lucene-solr with Apache License 2.0
/**
 * Build an automaton to represent the frontier query
 */
private Automaton buildAutomaton(BytesRefHash termBytesHash) {
  // need to pass a sorted set of terms to the automaton builder (maybe a better way to avoid this?)
  final TreeSet<BytesRef> terms = new TreeSet<BytesRef>();
  for (int i = 0; i < termBytesHash.size(); i++) {
    BytesRef ref = new BytesRef();
    termBytesHash.get(i, ref);
    terms.add(ref);
  }
  final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
  return a;
}
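For reference, a minimal sketch of the same builder outside the collector (the terms are made up): the builder requires sorted input, which is why the method above copies the hash into a TreeSet, and it produces a minimal deterministic automaton.

TreeSet<BytesRef> terms = new TreeSet<>();
terms.add(new BytesRef("apple"));
terms.add(new BytesRef("apply"));
Automaton a = DaciukMihovAutomatonBuilder.build(terms); // minimal DFA over the sorted terms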