org.apache.lucene.util.automaton.LevenshteinAutomata Java Examples

Source File: TestSimpleQueryParser.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Tests parsing of the fuzzy operator ({@code ~}).
 * <p>
 * Asserts that {@code foobar~2} and a bare {@code foobar~} both parse to a
 * {@link FuzzyQuery} with two edits, that a non-numeric distance
 * ({@code ~a}, {@code ~1a}) parses to a plain {@link TermQuery}, and that a
 * requested distance above {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
 * parses to a query using exactly that maximum.
 */
public void testFuzzy() throws Exception {
  Query regular = new TermQuery(new Term("field", "foobar"));
  Query expected = new FuzzyQuery(new Term("field", "foobar"), 2);

  assertEquals(expected, parse("foobar~2"));
  // omitted distance is expected to be equivalent to ~2
  assertEquals(expected, parse("foobar~"));
  // distances that are not plain integers are expected to yield a non-fuzzy term query
  assertEquals(regular, parse("foobar~a"));
  assertEquals(regular, parse("foobar~1a"));

  BooleanQuery.Builder bool = new BooleanQuery.Builder();
  FuzzyQuery fuzzy = new FuzzyQuery(new Term("field", "foo"), LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
  bool.add(fuzzy, Occur.MUST);
  bool.add(new TermQuery(new Term("field", "bar")), Occur.MUST);

  // The parentheses matter: without them "+ 1" is string concatenation, so the
  // query text would carry MAX's digits followed by "1" instead of MAX + 1.
  assertEquals(bool.build(), parse("foo~" + (LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + 1) + " bar"));
}

Source File: FuzzyAutomatonBuilder.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Validates the arguments, splits {@code term} (as code points) into an exact
 * prefix and a fuzzy suffix, and creates the {@link LevenshteinAutomata} for
 * the suffix.
 *
 * @param term the term to build automata for
 * @param maxEdits number of edits; must be between 0 and
 *        {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}, inclusive
 * @param prefixLength requested length of the exact prefix; must not be negative.
 *        Values longer than the term are clamped to the term's code point count.
 * @param transpositions passed through to the {@link LevenshteinAutomata} constructor
 * @throws IllegalArgumentException if {@code maxEdits} is out of range or
 *         {@code prefixLength} is negative
 */
FuzzyAutomatonBuilder(String term, int maxEdits, int prefixLength, boolean transpositions) {
  final boolean editsInRange =
      maxEdits >= 0 && maxEdits <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
  if (!editsInRange) {
    throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got: " + maxEdits);
  }
  if (prefixLength < 0) {
    throw new IllegalArgumentException("prefixLength cannot be less than 0");
  }

  this.term = term;
  this.maxEdits = maxEdits;

  final int[] termCodePoints = stringToUTF32(term);
  this.termLength = termCodePoints.length;

  // The exact prefix can never be longer than the term itself.
  final int exactPrefixLength = Math.min(prefixLength, termCodePoints.length);
  final int fuzzyLength = termCodePoints.length - exactPrefixLength;

  // Everything after the exact prefix is the part the automaton may edit.
  final int[] fuzzyCodePoints = new int[fuzzyLength];
  System.arraycopy(termCodePoints, exactPrefixLength, fuzzyCodePoints, 0, fuzzyLength);

  this.levBuilder = new LevenshteinAutomata(fuzzyCodePoints, Character.MAX_CODE_POINT, transpositions);
  this.prefix = UnicodeUtil.newString(termCodePoints, 0, exactPrefixLength);
}

Source File: FuzzyQuery.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Create a new FuzzyQuery that will match terms with an edit distance 
 * of at most <code>maxEdits</code> to <code>term</code>.
 * If a <code>prefixLength</code> &gt; 0 is specified, a common prefix
 * of that length is also required.
 * 
 * @param term the term to search for
 * @param maxEdits must be {@code >= 0} and {@code <=} {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.
 * @param prefixLength length of common (non-fuzzy) prefix
 * @param maxExpansions the maximum number of terms to match. If this number is
 *  greater than {@link IndexSearcher#getMaxClauseCount} when the query is rewritten,
 *  then the maxClauseCount will be used instead.
 * @param transpositions true if transpositions should be treated as a primitive
 *        edit operation. If this is false, comparisons will implement the classic
 *        Levenshtein algorithm.
 */
public FuzzyQuery(Term term, int maxEdits, int prefixLength, int maxExpansions, boolean transpositions) {
  super(term.field());
  
  if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
  }
  if (prefixLength < 0) {
    throw new IllegalArgumentException("prefixLength cannot be negative.");
  }
  if (maxExpansions <= 0) {
    throw new IllegalArgumentException("maxExpansions must be positive.");
  }
  
  this.term = term;
  this.maxEdits = maxEdits;
  this.prefixLength = prefixLength;
  this.transpositions = transpositions;
  this.maxExpansions = maxExpansions;
  setRewriteMethod(new MultiTermQuery.TopTermsBlendedFreqScoringRewrite(maxExpansions));
}

Source File: TestFuzzyQuery.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Checks that the five-argument {@link FuzzyQuery} constructor rejects
 * out-of-range arguments with an {@link IllegalArgumentException} whose
 * message names the offending parameter.
 */
public void testValidation() {
  // maxEdits below the supported range
  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    new FuzzyQuery(new Term("field", "foo"), -1, 0, 1, false);
  });
  assertTrue(expected.getMessage().contains("maxEdits"));

  // maxEdits above the supported range
  expected = expectThrows(IllegalArgumentException.class, () -> {
    new FuzzyQuery(new Term("field", "foo"), LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + 1, 0, 1, false);
  });
  assertTrue(expected.getMessage().contains("maxEdits must be between"));

  // negative prefixLength
  expected = expectThrows(IllegalArgumentException.class, () -> {
    new FuzzyQuery(new Term("field", "foo"), 1, -1, 1, false);
  });
  assertTrue(expected.getMessage().contains("prefixLength cannot be negative"));

  // negative maxExpansions
  expected = expectThrows(IllegalArgumentException.class, () -> {
    new FuzzyQuery(new Term("field", "foo"), 1, 0, -1, false);
  });
  assertTrue(expected.getMessage().contains("maxExpansions must be positive"));

  // maxExpansions of zero: the boundary value, which must be rejected as well
  // (previously this case repeated the -1 check verbatim)
  expected = expectThrows(IllegalArgumentException.class, () -> {
    new FuzzyQuery(new Term("field", "foo"), 1, 0, 0, false);
  });
  assertTrue(expected.getMessage().contains("maxExpansions must be positive"));
}

Source File: SuggestUtils.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Applies a single direct-spellchecker setting to {@code suggestion}.
 * <p>
 * {@code fieldName} selects which setting is read from the parser's current
 * value. The names {@code "accuracy"} and {@code "sort"} are compared
 * literally; all other settings are matched through {@code parseFieldMatcher}.
 *
 * @param parser parser positioned on the value of the setting
 * @param fieldName name of the setting being parsed
 * @param suggestion settings object that receives the parsed value
 * @param parseFieldMatcher matcher used to recognize the remaining field names
 * @return {@code true} if {@code fieldName} was recognized and applied,
 *         {@code false} otherwise
 * @throws IOException declared for the parser's value accessors
 * @throws IllegalArgumentException if the max-edits value is below 1 or above
 *         {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
 */
public static boolean parseDirectSpellcheckerSettings(XContentParser parser, String fieldName,
        DirectSpellcheckerSettings suggestion, ParseFieldMatcher parseFieldMatcher) throws IOException {
    boolean recognized = true;

    if ("accuracy".equals(fieldName)) {
        suggestion.accuracy(parser.floatValue());
    } else if (parseFieldMatcher.match(fieldName, Fields.SUGGEST_MODE)) {
        suggestion.suggestMode(SuggestUtils.resolveSuggestMode(parser.text()));
    } else if ("sort".equals(fieldName)) {
        suggestion.sort(SuggestUtils.resolveSort(parser.text()));
    } else if (parseFieldMatcher.match(fieldName, Fields.STRING_DISTANCE)) {
        suggestion.stringDistance(SuggestUtils.resolveDistance(parser.text()));
    } else if (parseFieldMatcher.match(fieldName, Fields.MAX_EDITS)) {
        // The value is stored first and then validated by reading it back.
        suggestion.maxEdits(parser.intValue());
        if (!(suggestion.maxEdits() >= 1
                && suggestion.maxEdits() <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)) {
            throw new IllegalArgumentException("Illegal max_edits value " + suggestion.maxEdits());
        }
    } else if (parseFieldMatcher.match(fieldName, Fields.MAX_INSPECTIONS)) {
        suggestion.maxInspections(parser.intValue());
    } else if (parseFieldMatcher.match(fieldName, Fields.MAX_TERM_FREQ)) {
        suggestion.maxTermFreq(parser.floatValue());
    } else if (parseFieldMatcher.match(fieldName, Fields.PREFIX_LENGTH)) {
        suggestion.prefixLength(parser.intValue());
    } else if (parseFieldMatcher.match(fieldName, Fields.MIN_WORD_LENGTH)) {
        suggestion.minQueryLength(parser.intValue());
    } else if (parseFieldMatcher.match(fieldName, Fields.MIN_DOC_FREQ)) {
        suggestion.minDocFreq(parser.floatValue());
    } else {
        // Not a direct-spellchecker setting; the caller gets to handle it.
        recognized = false;
    }

    return recognized;
}

Source File: FuzzyLikeThisQuery.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Adds user input for "fuzzification".
 *
 * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
 * @param fieldName the field name recorded together with the query string
 * @param minSimilarity The minimum similarity of the term variants; must be a whole number between 0 and
 *        {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}, inclusive (see FuzzyTermsEnum)
 * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermsEnum)
 * @throws IllegalArgumentException if {@code minSimilarity} is not a whole number in the supported range
 */
public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength) 
{
  int maxEdits = (int) minSimilarity;
  // maxEdits != minSimilarity catches fractional values (and NaN), which the cast silently truncates.
  if (maxEdits != minSimilarity || maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    throw new IllegalArgumentException("minSimilarity must be an integer value between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got " + minSimilarity);
  }
  fieldVals.add(new FieldVals(fieldName,maxEdits,prefixLength,queryString));
}

Source File: NearestFuzzyQuery.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Adds user input for "fuzzification"
 *
 * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
 * @param fieldName the field name recorded together with the query string
 * @throws IllegalArgumentException if {@code MIN_SIMILARITY} is not a whole number between 0 and
 *         {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}, inclusive
 */
public void addTerms(String queryString, String fieldName) {
  int maxEdits = (int) MIN_SIMILARITY;
  // The message promises a 0..MAX range, so enforce the range as well as integrality.
  if (maxEdits != MIN_SIMILARITY || maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    throw new IllegalArgumentException("MIN_SIMILARITY must be an integer value between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got " + MIN_SIMILARITY);
  }
  fieldVals.add(new FieldVals(fieldName, maxEdits, queryString));
}

Source File: FuzzyQuery.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Converts a "minimumSimilarity" value into a raw edit distance.
 * <ul>
 *   <li>Values {@code >= 1} are treated as an edit count already and are
 *       truncated to an int after being capped at
 *       {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.</li>
 *   <li>Exactly {@code 0} yields {@code 0} edits.</li>
 *   <li>Any other value is treated as a fraction: the result is
 *       {@code (1 - minimumSimilarity) * termLen}, truncated and capped at the
 *       same maximum.</li>
 * </ul>
 *
 * @param minimumSimilarity scaled similarity
 * @param termLen length (in unicode codepoints) of the term.
 * @return equivalent number of maxEdits
 */
public static int floatToEdits(float minimumSimilarity, int termLen) {
  if (minimumSimilarity == 0.0f) {
    // 0 means exact, not infinite # of edits!
    return 0;
  }

  if (minimumSimilarity >= 1f) {
    // Already expressed as a number of edits: cap, then drop any fraction.
    final float capped = Math.min(minimumSimilarity, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
    return (int) capped;
  }

  // Fractional similarity: scale the allowed dissimilarity by the term length.
  final int scaledEdits = (int) ((1D - minimumSimilarity) * termLen);
  return Math.min(scaledEdits, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
}

Source File: FuzzySuggester.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Creates a {@link FuzzySuggester} instance.
 * 
 * @param tempDir passed through to the superclass constructor
 * @param tempFileNamePrefix passed through to the superclass constructor
 * @param indexAnalyzer Analyzer that will be used for
 *        analyzing suggestions while building the index.
 * @param queryAnalyzer Analyzer that will be used for
 *        analyzing query text during lookup
 * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
 * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
 *        surface forms to keep for a single analyzed form.
 *        When there are too many surface forms we discard the
 *        lowest weighted ones.
 * @param maxGraphExpansions Maximum number of graph paths
 *        to expand from the analyzed form.  Set this to -1 for
 *        no limit.
 * @param preservePositionIncrements Whether position holes should appear in the automaton
 * @param maxEdits must be &gt;= 0 and &lt;= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} .
 * @param transpositions <code>true</code> if transpositions should be treated as a primitive 
 *        edit operation. If this is false, comparisons will implement the classic
 *        Levenshtein algorithm.
 * @param nonFuzzyPrefix length of common (non-fuzzy) prefix; must be &gt;= 0
 *        (see default {@link #DEFAULT_NON_FUZZY_PREFIX})
 * @param minFuzzyLength minimum length of lookup key before any edits are allowed; must be &gt;= 0
 *        (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
 * @param unicodeAware operate Unicode code points instead of bytes.
 * @throws IllegalArgumentException if {@code maxEdits} is out of range, or if
 *         {@code nonFuzzyPrefix} or {@code minFuzzyLength} is negative
 */
public FuzzySuggester(Directory tempDir, String tempFileNamePrefix, Analyzer indexAnalyzer, Analyzer queryAnalyzer,
                      int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                      boolean preservePositionIncrements, int maxEdits, boolean transpositions,
                      int nonFuzzyPrefix, int minFuzzyLength, boolean unicodeAware) {
  super(tempDir, tempFileNamePrefix, indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements);
  if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
  }
  // The two messages below used to read "must not be >= 0", the opposite of
  // the condition that actually triggers them (a negative value).
  if (nonFuzzyPrefix < 0) {
    throw new IllegalArgumentException("nonFuzzyPrefix must be >= 0 (got " + nonFuzzyPrefix + ")");
  }
  if (minFuzzyLength < 0) {
    throw new IllegalArgumentException("minFuzzyLength must be >= 0 (got " + minFuzzyLength + ")");
  }
  
  this.maxEdits = maxEdits;
  this.transpositions = transpositions;
  this.nonFuzzyPrefix = nonFuzzyPrefix;
  this.minFuzzyLength = minFuzzyLength;
  this.unicodeAware = unicodeAware;
}

Source File: DirectSpellChecker.java From lucene-solr with Apache License 2.0

2 votes

/**
 * Sets the maximum number of Levenshtein edit-distances to draw
 * candidate terms from. This value can be 1 or 2. The default is 2.
 * <p>
 * Note: a large number of spelling errors occur with an edit distance
 * of 1, by setting this value to 1 you can increase both performance
 * and precision at the cost of recall.
 *
 * @param maxEdits the new maximum; must be at least 1 and at most
 *        {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
 * @throws UnsupportedOperationException if {@code maxEdits} is outside that range
 */
public void setMaxEdits(int maxEdits) {
  final boolean withinSupportedRange =
      maxEdits >= 1 && maxEdits <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
  if (!withinSupportedRange) {
    throw new UnsupportedOperationException("Invalid maxEdits");
  }
  this.maxEdits = maxEdits;
}

org.apache.lucene.util.automaton.LevenshteinAutomata Java Examples