Java Code Examples for org.apache.lucene.util.automaton.Operations#determinize()

The following examples show how to use org.apache.lucene.util.automaton.Operations#determinize(). Each example notes its source file, the project it was taken from, and that project's license.
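Before the project-specific examples, here is a minimal self-contained sketch of the API itself (not taken from any project below): Operations.determinize(Automaton, int) turns a possibly nondeterministic automaton into a deterministic one, and throws TooComplexToDeterminizeException if the result would exceed the given number of states.

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;

public class DeterminizeSketch {
  public static void main(String[] args) {
    // Compile a regular expression to an automaton, then determinize it
    // with a state cap to guard against pathological patterns:
    Automaton a = new RegExp("ab*c").toAutomaton();
    Automaton dfa = Operations.determinize(a, Operations.DEFAULT_MAX_DETERMINIZED_STATES);

    // Operations.run requires a deterministic automaton:
    System.out.println(Operations.run(dfa, "abbbc"));  // prints true
  }
}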
Example 1
Source File: XAnalyzingSuggester.java    From Elasticsearch with Apache License 2.0
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    // Turn tokenstream into automaton:
    Automaton automaton = null;

    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
        automaton = getTokenStreamToAutomaton().toAutomaton(ts);
    }

    automaton = replaceSep(automaton);

    // TODO: we can optimize this somewhat by determinizing
    // while we convert

    // This automaton should not blow up during determinize:
    automaton = Operations.determinize(automaton, Integer.MAX_VALUE);
    return automaton;
}
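Note the Integer.MAX_VALUE cap: as the comment says, this automaton is built by analyzing a single lookup key, so it stays small and determinization is not expected to blow up, which makes the cap effectively disabled. A real bound (as in Examples 6, 8 and 9 below) matters mainly when the automaton comes from an arbitrary user-supplied pattern.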
 
Example 2
Source File: ContextMapping.java    From Elasticsearch with Apache License 2.0
/**
 * Creates an automaton for a given context query. This automaton is used
 * to find the matching paths within the FST.
 *
 * @param preserveSep whether to insert an additional char (<code>XAnalyzingSuggester.SEP_LABEL</code>) between each context query
 * @param queries list of {@link ContextQuery} defining the lookup context
 *
 * @return an automaton matching the given queries
 */
public static Automaton toAutomaton(boolean preserveSep, Iterable<ContextQuery> queries) {
    Automaton a = Automata.makeEmptyString();

    Automaton gap = Automata.makeChar(ContextMapping.SEPARATOR);
    if (preserveSep) {
        // if separators are preserved the fst contains a SEP_LABEL
        // behind each gap. To have a matching automaton, we need to
        // include the SEP_LABEL in the query as well
        gap = Operations.concatenate(gap, Automata.makeChar(XAnalyzingSuggester.SEP_LABEL));
    }

    for (ContextQuery query : queries) {
        a = Operations.concatenate(Arrays.asList(query.toAutomaton(), gap, a));
    }

    // TODO: should we limit this?  Do any of our ContextQuery impls really create exponential regexps?  GeoQuery looks safe (union
    // of strings).
    return Operations.determinize(a, Integer.MAX_VALUE);
}
 
Example 3
Source File: FuzzyCompletionQuery.java    From lucene-solr with Apache License 2.0
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
  final Automaton originalAutomata;
  try (CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text()) ) {
    originalAutomata = stream.toAutomaton(unicodeAware);
  }
  Set<IntsRef> refs = new HashSet<>();
  Automaton automaton = toLevenshteinAutomata(originalAutomata, refs);
  if (unicodeAware) {
    Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
    utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);
    automaton = utf8automaton;
  }
  // TODO Accumulating all refs is bad, because the resulting set may be very big.
  // TODO Better iterate over automaton again inside FuzzyCompletionWeight?
  return new FuzzyCompletionWeight(this, automaton, refs);
}
 
Example 4
Source File: XAnalyzingSuggester.java    From Elasticsearch with Apache License 2.0
protected Automaton convertAutomaton(Automaton a) {
  if (queryPrefix != null) {
    a = Operations.concatenate(Arrays.asList(queryPrefix, a));
    // This automaton should not blow up during determinize:
    a = Operations.determinize(a, Integer.MAX_VALUE);
  }
  return a;
}
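The re-determinization here is needed because concatenating two deterministic automata generally yields a nondeterministic one at the join point. Since the prefix is a fixed query prefix rather than user input, the Integer.MAX_VALUE cap is again safe, per the comment in the code.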
 
Example 5
Source File: FuzzySuggester.java    From lucene-solr with Apache License 2.0
@Override
protected Automaton convertAutomaton(Automaton a) {
  if (unicodeAware) {
    Automaton utf8automaton = new UTF32ToUTF8().convert(a);
    utf8automaton = Operations.determinize(utf8automaton, DEFAULT_MAX_DETERMINIZED_STATES);
    return utf8automaton;
  } else {
    return a;
  }
}
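Examples 3 and 5 share a pattern: UTF32ToUTF8 rewrites an automaton whose arcs are Unicode code points into one whose arcs are UTF-8 bytes, and that conversion can leave the result nondeterministic, hence the determinize call. A standalone sketch of just that step:

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.UTF32ToUTF8;

public class Utf8ConvertSketch {
  public static void main(String[] args) {
    // Automaton labeled with Unicode code points:
    Automaton codePoints = new RegExp("caf[eé]").toAutomaton();

    // Rewrite the arcs to UTF-8 bytes, then determinize before matching
    // against byte-level terms:
    Automaton utf8 = new UTF32ToUTF8().convert(codePoints);
    utf8 = Operations.determinize(utf8, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  }
}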
 
Example 6
Source File: AnalyzingSuggester.java    From lucene-solr with Apache License 2.0
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
  // TODO: is there a Reader from a CharSequence?
  // Turn tokenstream into automaton:
  Automaton automaton = null;
  try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
    automaton = getTokenStreamToAutomaton().toAutomaton(ts);
  }

  automaton = replaceSep(automaton);

  // TODO: we can optimize this somewhat by determinizing
  // while we convert
  automaton = Operations.determinize(automaton, DEFAULT_MAX_DETERMINIZED_STATES);
  return automaton;
}
 
Example 7
Source File: ContextQuery.java    From lucene-solr with Apache License 2.0
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
  final CompletionWeight innerWeight = ((CompletionWeight) innerQuery.createWeight(searcher, scoreMode, boost));
  final Automaton innerAutomaton = innerWeight.getAutomaton();

  // If the inner automaton matches nothing, then we return an empty weight to avoid
  // traversing all contexts during scoring.
  if (innerAutomaton.getNumStates() == 0) {
    return new CompletionWeight(this, innerAutomaton);
  }

  // if separators are preserved the fst contains a SEP_LABEL
  // behind each gap. To have a matching automaton, we need to
  // include the SEP_LABEL in the query as well
  Automaton optionalSepLabel = Operations.optional(Automata.makeChar(ConcatenateGraphFilter.SEP_LABEL));
  Automaton prefixAutomaton = Operations.concatenate(optionalSepLabel, innerAutomaton);
  Automaton contextsAutomaton = Operations.concatenate(toContextAutomaton(contexts, matchAllContexts), prefixAutomaton);
  contextsAutomaton = Operations.determinize(contextsAutomaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES);

  final Map<IntsRef, Float> contextMap = new HashMap<>(contexts.size());
  final TreeSet<Integer> contextLengths = new TreeSet<>();
  for (Map.Entry<IntsRef, ContextMetaData> entry : contexts.entrySet()) {
    ContextMetaData contextMetaData = entry.getValue();
    contextMap.put(entry.getKey(), contextMetaData.boost);
    contextLengths.add(entry.getKey().length);
  }
  int[] contextLengthArray = new int[contextLengths.size()];
  final Iterator<Integer> iterator = contextLengths.descendingIterator();
  for (int i = 0; iterator.hasNext(); i++) {
    contextLengthArray[i] = iterator.next();
  }
  return new ContextCompletionWeight(this, contextsAutomaton, innerWeight, contextMap, contextLengthArray);
}
 
Example 8
Source File: SimplePatternSplitTokenizerFactory.java    From lucene-solr with Apache License 2.0
/** Creates a new SimplePatternSplitTokenizerFactory */
public SimplePatternSplitTokenizerFactory(Map<String,String> args) {
  super(args);
  maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates);
  if (args.isEmpty() == false) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}
 
Example 9
Source File: SimplePatternTokenizerFactory.java    From lucene-solr with Apache License 2.0
/** Creates a new SimplePatternTokenizerFactory */
public SimplePatternTokenizerFactory(Map<String,String> args) {
  super(args);
  maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates);
  if (args.isEmpty() == false) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}
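Examples 8 and 9 are nearly identical: each constructor compiles the regexp to a DFA once, up front, so the tokenizer itself never pays a determinization cost, and the maxDeterminizedStates argument caps the DFA size against adversarial patterns. A minimal sketch of driving such a factory directly (assuming direct construction rather than schema-driven loading):

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizerFactory;

public class FactorySketch {
  public static void main(String[] args) {
    Map<String, String> factoryArgs = new HashMap<>();
    factoryArgs.put("pattern", ",");                    // split on commas
    factoryArgs.put("maxDeterminizedStates", "10000");  // cap the compiled DFA size

    // The constructor consumes the map; any leftover keys throw IllegalArgumentException.
    SimplePatternSplitTokenizerFactory factory = new SimplePatternSplitTokenizerFactory(factoryArgs);
    Tokenizer tokenizer = factory.create();
  }
}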
 
Example 10
Source File: ConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0
/**
 * Converts the tokenStream to an automaton.  Does *not* close it.
 */
public Automaton toAutomaton(boolean unicodeAware) throws IOException {
  // TODO refactor this
  // maybe we could hook up a modified automaton from TermAutomatonQuery here?

  // Create corresponding automaton: labels are bytes
  // from each analyzed token, with byte 0 used as
  // separator between tokens:
  final TokenStreamToAutomaton tsta;
  if (tokenSeparator != null) {
    tsta = new EscapingTokenStreamToAutomaton(tokenSeparator);
  } else {
    // When we're not preserving sep, we don't steal 0xff
    // byte, so we don't need to do any escaping:
    tsta = new TokenStreamToAutomaton();
  }
  tsta.setPreservePositionIncrements(preservePositionIncrements);
  tsta.setUnicodeArcs(unicodeAware);

  Automaton automaton = tsta.toAutomaton(inputTokenStream);

  // TODO: we can optimize this somewhat by determinizing
  // while we convert
  automaton = replaceSep(automaton, tokenSeparator);
  // This automaton should not blow up during determinize:
  return Operations.determinize(automaton, maxGraphExpansions);
}