org.apache.lucene.util.automaton.LevenshteinAutomata Java Examples

The following examples show how to use org.apache.lucene.util.automaton.LevenshteinAutomata. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestSimpleQueryParser.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Tests parsing of fuzzy query syntax.
 * <p>
 * Asserts that {@code foobar~2} and a bare {@code foobar~} both parse to a
 * {@link FuzzyQuery} with 2 edits, that a non-numeric or partially numeric
 * suffix ({@code ~a}, {@code ~1a}) parses to a plain {@link TermQuery}, and that
 * a requested distance one above
 * {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} parses to the same query
 * as the maximum supported distance.
 */
public void testFuzzy() throws Exception {
  Query regular = new TermQuery(new Term("field", "foobar"));
  Query expected = new FuzzyQuery(new Term("field", "foobar"), 2);

  assertEquals(expected, parse("foobar~2"));
  assertEquals(expected, parse("foobar~"));
  assertEquals(regular, parse("foobar~a"));
  assertEquals(regular, parse("foobar~1a"));

  BooleanQuery.Builder bool = new BooleanQuery.Builder();
  FuzzyQuery fuzzy = new FuzzyQuery(new Term("field", "foo"), LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
  bool.add(fuzzy, Occur.MUST);
  bool.add(new TermQuery(new Term("field", "bar")), Occur.MUST);

  // Parenthesized on purpose: without the parentheses, "+ 1" is string
  // concatenation and the query text becomes "foo~<max>1" instead of "foo~<max+1>".
  int overMax = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + 1;
  assertEquals(bool.build(), parse("foo~" + overMax + " bar"));
}
 
Example #2
Source File: FuzzyAutomatonBuilder.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Validates the arguments, splits {@code term} into a fixed prefix and a fuzzy
 * suffix (both measured in code points), and creates the
 * {@link LevenshteinAutomata} for the suffix.
 *
 * @param term the term to build automata for
 * @param maxEdits must be between 0 and
 *        {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}, inclusive
 * @param prefixLength number of leading code points kept as a literal prefix;
 *        clamped to the length of the term
 * @param transpositions passed through to the {@link LevenshteinAutomata} constructor
 * @throws IllegalArgumentException if {@code maxEdits} is out of range or
 *         {@code prefixLength} is negative
 */
FuzzyAutomatonBuilder(String term, int maxEdits, int prefixLength, boolean transpositions) {
  final int maxSupported = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
  final boolean editsInRange = maxEdits >= 0 && maxEdits <= maxSupported;
  if (!editsInRange) {
    throw new IllegalArgumentException("max edits must be 0.." + maxSupported + ", inclusive; got: " + maxEdits);
  }
  if (prefixLength < 0) {
    throw new IllegalArgumentException("prefixLength cannot be less than 0");
  }

  this.term = term;
  this.maxEdits = maxEdits;

  final int[] termCodePoints = stringToUTF32(term);
  final int totalLength = termCodePoints.length;
  this.termLength = totalLength;

  // A prefix longer than the term is clamped, which leaves an empty fuzzy suffix.
  final int literalLength = Math.min(prefixLength, totalLength);
  final int[] fuzzyPart = new int[totalLength - literalLength];
  System.arraycopy(termCodePoints, literalLength, fuzzyPart, 0, fuzzyPart.length);

  this.levBuilder = new LevenshteinAutomata(fuzzyPart, Character.MAX_CODE_POINT, transpositions);
  this.prefix = UnicodeUtil.newString(termCodePoints, 0, literalLength);
}
 
Example #3
Source File: FuzzyQuery.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Create a new FuzzyQuery that will match terms with an edit distance 
 * of at most <code>maxEdits</code> to <code>term</code>.
 * If a <code>prefixLength</code> &gt; 0 is specified, a common prefix
 * of that length is also required.
 * 
 * @param term the term to search for
 * @param maxEdits must be {@code >= 0} and {@code <=} {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.
 * @param prefixLength length of common (non-fuzzy) prefix
 * @param maxExpansions the maximum number of terms to match. If this number is
 *  greater than {@link IndexSearcher#getMaxClauseCount} when the query is rewritten,
 *  then the maxClauseCount will be used instead.
 * @param transpositions true if transpositions should be treated as a primitive
 *        edit operation. If this is false, comparisons will implement the classic
 *        Levenshtein algorithm.
 */
public FuzzyQuery(Term term, int maxEdits, int prefixLength, int maxExpansions, boolean transpositions) {
  super(term.field());
  
  if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
  }
  if (prefixLength < 0) {
    throw new IllegalArgumentException("prefixLength cannot be negative.");
  }
  if (maxExpansions <= 0) {
    throw new IllegalArgumentException("maxExpansions must be positive.");
  }
  
  this.term = term;
  this.maxEdits = maxEdits;
  this.prefixLength = prefixLength;
  this.transpositions = transpositions;
  this.maxExpansions = maxExpansions;
  setRewriteMethod(new MultiTermQuery.TopTermsBlendedFreqScoringRewrite(maxExpansions));
}
 
Example #4
Source File: TestFuzzyQuery.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that the five-argument {@code FuzzyQuery} constructor rejects invalid
 * arguments with an {@link IllegalArgumentException} whose message names the
 * offending parameter: negative and too-large {@code maxEdits}, negative
 * {@code prefixLength}, and both negative and zero {@code maxExpansions}.
 */
public void testValidation() {
  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    new FuzzyQuery(new Term("field", "foo"), -1, 0, 1, false);
  });
  assertTrue(expected.getMessage().contains("maxEdits"));

  expected = expectThrows(IllegalArgumentException.class, () -> {
    new FuzzyQuery(new Term("field", "foo"), LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + 1, 0, 1, false);
  });
  assertTrue(expected.getMessage().contains("maxEdits must be between"));

  expected = expectThrows(IllegalArgumentException.class, () -> {
    new FuzzyQuery(new Term("field", "foo"), 1, -1, 1, false);
  });
  assertTrue(expected.getMessage().contains("prefixLength cannot be negative"));

  expected = expectThrows(IllegalArgumentException.class, () -> {
    new FuzzyQuery(new Term("field", "foo"), 1, 0, -1, false);
  });
  assertTrue(expected.getMessage().contains("maxExpansions must be positive"));

  // Boundary case: zero is not a positive number of expansions either.
  expected = expectThrows(IllegalArgumentException.class, () -> {
    new FuzzyQuery(new Term("field", "foo"), 1, 0, 0, false);
  });
  assertTrue(expected.getMessage().contains("maxExpansions must be positive"));
}
 
Example #5
Source File: SuggestUtils.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Applies one direct-spellchecker setting, identified by {@code fieldName}, to
 * {@code suggestion}, reading the value from the parser's current token.
 *
 * @param parser source of the setting's value
 * @param fieldName name of the setting being parsed
 * @param suggestion settings object that receives the value
 * @param parseFieldMatcher matcher used to compare {@code fieldName} against the known fields
 * @return {@code true} if {@code fieldName} was recognized and applied,
 *         {@code false} if it is not one of the settings handled here
 * @throws IllegalArgumentException if the max-edits value is below 1 or above
 *         {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
 * @throws IOException declared by the signature; presumably raised by the parser's value accessors
 */
public static boolean parseDirectSpellcheckerSettings(XContentParser parser, String fieldName,
            DirectSpellcheckerSettings suggestion, ParseFieldMatcher parseFieldMatcher) throws IOException {
    if ("accuracy".equals(fieldName)) {
        suggestion.accuracy(parser.floatValue());
        return true;
    }
    if (parseFieldMatcher.match(fieldName, Fields.SUGGEST_MODE)) {
        suggestion.suggestMode(SuggestUtils.resolveSuggestMode(parser.text()));
        return true;
    }
    if ("sort".equals(fieldName)) {
        suggestion.sort(SuggestUtils.resolveSort(parser.text()));
        return true;
    }
    if (parseFieldMatcher.match(fieldName, Fields.STRING_DISTANCE)) {
        suggestion.stringDistance(SuggestUtils.resolveDistance(parser.text()));
        return true;
    }
    if (parseFieldMatcher.match(fieldName, Fields.MAX_EDITS)) {
        // The value is stored first and then validated through the getter.
        suggestion.maxEdits(parser.intValue());
        final boolean belowMinimum = suggestion.maxEdits() < 1;
        if (belowMinimum || suggestion.maxEdits() > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
            throw new IllegalArgumentException("Illegal max_edits value " + suggestion.maxEdits());
        }
        return true;
    }
    if (parseFieldMatcher.match(fieldName, Fields.MAX_INSPECTIONS)) {
        suggestion.maxInspections(parser.intValue());
        return true;
    }
    if (parseFieldMatcher.match(fieldName, Fields.MAX_TERM_FREQ)) {
        suggestion.maxTermFreq(parser.floatValue());
        return true;
    }
    if (parseFieldMatcher.match(fieldName, Fields.PREFIX_LENGTH)) {
        suggestion.prefixLength(parser.intValue());
        return true;
    }
    if (parseFieldMatcher.match(fieldName, Fields.MIN_WORD_LENGTH)) {
        suggestion.minQueryLength(parser.intValue());
        return true;
    }
    if (parseFieldMatcher.match(fieldName, Fields.MIN_DOC_FREQ)) {
        suggestion.minDocFreq(parser.floatValue());
        return true;
    }
    // Not a setting handled by this method.
    return false;
}
 
Example #6
Source File: FuzzyLikeThisQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Adds user input for "fuzzification".
 * <p>
 * Despite its name, {@code minSimilarity} is interpreted as a whole number of
 * edits: it is truncated to an {@code int} and must survive that conversion
 * unchanged.
 *
 * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
 * @param fieldName The field name stored with this entry
 * @param minSimilarity The minimum similarity of the term variants; must be an integer value between 0 and
 *        {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}, inclusive (see FuzzyTermsEnum)
 * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermsEnum)
 * @throws IllegalArgumentException if {@code minSimilarity} is not a whole number or is out of range
 */
public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength) 
{
  int maxEdits = (int) minSimilarity;
  // maxEdits != minSimilarity rejects fractional values (and NaN, which never compares equal).
  if (maxEdits != minSimilarity || maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    throw new IllegalArgumentException("minSimilarity must be an integer value between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got " + minSimilarity);
  }
  fieldVals.add(new FieldVals(fieldName,maxEdits,prefixLength,queryString));
}
 
Example #7
Source File: NearestFuzzyQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Adds user input for "fuzzification"
 * <p>
 * The edit distance is taken from {@code MIN_SIMILARITY}, which must be a whole
 * number between 0 and {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE},
 * inclusive.
 *
 * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
 * @param fieldName The field name stored with this entry
 * @throws IllegalArgumentException if {@code MIN_SIMILARITY} is not a whole number or is out of range
 */
public void addTerms(String queryString, String fieldName) {
  int maxEdits = (int) MIN_SIMILARITY;
  // Enforce the range that the error message promises, not only integrality.
  if (maxEdits != MIN_SIMILARITY || maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    throw new IllegalArgumentException("MIN_SIMILARITY must be an integer value between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got " + MIN_SIMILARITY);
  }
  fieldVals.add(new FieldVals(fieldName, maxEdits, queryString));
}
 
Example #8
Source File: FuzzyQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a legacy "minimumSimilarity" value into a raw edit distance.
 * <ul>
 *   <li>Values {@code >= 1} are treated as an edit count: truncated to an int
 *       and capped at {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.</li>
 *   <li>Exactly {@code 0} yields 0 edits.</li>
 *   <li>Any other value is scaled as {@code (1 - minimumSimilarity) * termLen},
 *       truncated, and capped at the same maximum.</li>
 * </ul>
 *
 * @param minimumSimilarity scaled similarity
 * @param termLen length (in unicode codepoints) of the term.
 * @return equivalent number of maxEdits
 */
public static int floatToEdits(float minimumSimilarity, int termLen) {
  final int maxSupported = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;

  if (minimumSimilarity >= 1f) {
    // Already an edit count: cap it, then drop any fractional part.
    float capped = Math.min(minimumSimilarity, maxSupported);
    return (int) capped;
  }

  if (minimumSimilarity == 0.0f) {
    // 0 means exact, not infinite # of edits!
    return 0;
  }

  int scaledEdits = (int) ((1D - minimumSimilarity) * termLen);
  return Math.min(scaledEdits, maxSupported);
}
 
Example #9
Source File: FuzzySuggester.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a {@link FuzzySuggester} instance.
 * 
 * @param tempDir passed through to the superclass constructor
 * @param tempFileNamePrefix passed through to the superclass constructor
 * @param indexAnalyzer Analyzer that will be used for
 *        analyzing suggestions while building the index.
 * @param queryAnalyzer Analyzer that will be used for
 *        analyzing query text during lookup
 * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
 * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
 *        surface forms to keep for a single analyzed form.
 *        When there are too many surface forms we discard the
 *        lowest weighted ones.
 * @param maxGraphExpansions Maximum number of graph paths
 *        to expand from the analyzed form.  Set this to -1 for
 *        no limit.
 * @param preservePositionIncrements Whether position holes should appear in the automaton
 * @param maxEdits must be &gt;= 0 and &lt;= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} .
 * @param transpositions <code>true</code> if transpositions should be treated as a primitive 
 *        edit operation. If this is false, comparisons will implement the classic
 *        Levenshtein algorithm.
 * @param nonFuzzyPrefix length of common (non-fuzzy) prefix; must be &gt;= 0
 *        (see default {@link #DEFAULT_NON_FUZZY_PREFIX})
 * @param minFuzzyLength minimum length of lookup key before any edits are allowed; must be &gt;= 0
 *        (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
 * @param unicodeAware operate Unicode code points instead of bytes.
 * @throws IllegalArgumentException if <code>maxEdits</code> is out of range, or if
 *         <code>nonFuzzyPrefix</code> or <code>minFuzzyLength</code> is negative
 */
public FuzzySuggester(Directory tempDir, String tempFileNamePrefix, Analyzer indexAnalyzer, Analyzer queryAnalyzer,
                      int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                      boolean preservePositionIncrements, int maxEdits, boolean transpositions,
                      int nonFuzzyPrefix, int minFuzzyLength, boolean unicodeAware) {
  super(tempDir, tempFileNamePrefix, indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements);
  if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
  }
  // The two messages below previously read "must not be >= 0", the opposite of
  // the condition actually enforced.
  if (nonFuzzyPrefix < 0) {
    throw new IllegalArgumentException("nonFuzzyPrefix must be >= 0 (got " + nonFuzzyPrefix + ")");
  }
  if (minFuzzyLength < 0) {
    throw new IllegalArgumentException("minFuzzyLength must be >= 0 (got " + minFuzzyLength + ")");
  }
  
  this.maxEdits = maxEdits;
  this.transpositions = transpositions;
  this.nonFuzzyPrefix = nonFuzzyPrefix;
  this.minFuzzyLength = minFuzzyLength;
  this.unicodeAware = unicodeAware;
}
 
Example #10
Source File: DirectSpellChecker.java    From lucene-solr with Apache License 2.0 2 votes vote down vote up
/**
 * Sets the maximum Levenshtein edit distance from which candidate terms are
 * drawn. This value can be 1 or 2. The default is 2.
 * <p>
 * Note: a large share of spelling errors are within an edit distance of 1, so
 * setting this value to 1 can improve both performance and precision at the
 * cost of recall.
 *
 * @param maxEdits must be at least 1 and at most
 *        {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
 * @throws UnsupportedOperationException if {@code maxEdits} is outside that range
 */
public void setMaxEdits(int maxEdits) {
  final boolean withinRange =
      maxEdits >= 1 && maxEdits <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
  if (!withinRange) {
    throw new UnsupportedOperationException("Invalid maxEdits");
  }
  this.maxEdits = maxEdits;
}