/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;


import java.io.IOException;
import java.util.Objects;

import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

/** Implements the fuzzy search query. The similarity measurement
 * is based on the Damerau-Levenshtein (optimal string alignment) algorithm,
 * though you can explicitly choose classic Levenshtein by passing <code>false</code>
 * to the <code>transpositions</code> parameter.
 * 
 * <p>This query uses {@link MultiTermQuery.TopTermsBlendedFreqScoringRewrite}
 * as default. So terms will be collected and scored according to their
 * edit distance. Only the top terms are used for building the {@link BooleanQuery}.
 * It is not recommended to change the rewrite mode for fuzzy queries.
 * 
 * <p>At most, this query will match terms up to 
 * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} edits. 
 * Higher distances (especially with transpositions enabled), are generally not useful and 
 * will match a significant amount of the term dictionary. If you really want this, consider
 * using an n-gram indexing technique (such as the SpellChecker in the 
 * <a href="{@docRoot}/../suggest/overview-summary.html">suggest module</a>) instead.
 *
 * <p>NOTE: terms of length 1 or 2 will sometimes not match because of how the scaled
 * distance between two terms is computed.  For a term to match, the edit distance between
 * the terms must be less than the minimum length term (either the input term, or
 * the candidate term).  For example, FuzzyQuery on term "abcd" with maxEdits=2 will
 * not match an indexed term "ab", and FuzzyQuery on term "a" with maxEdits=2 will not
 * match an indexed term "abc".
 */
public class FuzzyQuery extends MultiTermQuery {
  
  public final static int defaultMaxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
  public final static int defaultPrefixLength = 0;
  public final static int defaultMaxExpansions = 50;
  public final static boolean defaultTranspositions = true;
  
  private final int maxEdits;
  private final int maxExpansions;
  private final boolean transpositions;
  private final int prefixLength;
  private final Term term;
  
  /**
   * Create a new FuzzyQuery that will match terms with an edit distance 
   * of at most <code>maxEdits</code> to <code>term</code>.
   * If a <code>prefixLength</code> &gt; 0 is specified, a common prefix
   * of that length is also required.
   * 
   * @param term the term to search for
   * @param maxEdits must be {@code >= 0} and {@code <=} {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.
   * @param prefixLength length of common (non-fuzzy) prefix
   * @param maxExpansions the maximum number of terms to match. If this number is
   *  greater than {@link IndexSearcher#getMaxClauseCount} when the query is rewritten,
   *  then the maxClauseCount will be used instead.
   * @param transpositions true if transpositions should be treated as a primitive
   *        edit operation. If this is false, comparisons will implement the classic
   *        Levenshtein algorithm.
   */
  public FuzzyQuery(Term term, int maxEdits, int prefixLength, int maxExpansions, boolean transpositions) {
    super(term.field());
    
    if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
      throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
    }
    if (prefixLength < 0) {
      throw new IllegalArgumentException("prefixLength cannot be negative.");
    }
    if (maxExpansions <= 0) {
      throw new IllegalArgumentException("maxExpansions must be positive.");
    }
    
    this.term = term;
    this.maxEdits = maxEdits;
    this.prefixLength = prefixLength;
    this.transpositions = transpositions;
    this.maxExpansions = maxExpansions;
    setRewriteMethod(new MultiTermQuery.TopTermsBlendedFreqScoringRewrite(maxExpansions));
  }
  
  /**
   * Calls {@link #FuzzyQuery(Term, int, int, int, boolean) 
   * FuzzyQuery(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions)}.
   */
  public FuzzyQuery(Term term, int maxEdits, int prefixLength) {
    this(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions);
  }
  
  /**
   * Calls {@link #FuzzyQuery(Term, int, int) FuzzyQuery(term, maxEdits, defaultPrefixLength)}.
   */
  public FuzzyQuery(Term term, int maxEdits) {
    this(term, maxEdits, defaultPrefixLength);
  }

  /**
   * Calls {@link #FuzzyQuery(Term, int) FuzzyQuery(term, defaultMaxEdits)}.
   */
  public FuzzyQuery(Term term) {
    this(term, defaultMaxEdits);
  }
  
  /**
   * @return the maximum number of edit distances allowed for this query to match.
   */
  public int getMaxEdits() {
    return maxEdits;
  }
    
  /**
   * Returns the non-fuzzy prefix length. This is the number of characters at the start
   * of a term that must be identical (not fuzzy) to the query term if the query
   * is to match that term. 
   */
  public int getPrefixLength() {
    return prefixLength;
  }
  
  /**
   * Returns true if transpositions should be treated as a primitive edit operation. 
   * If this is false, comparisons will implement the classic Levenshtein algorithm.
   */
  public boolean getTranspositions() {
    return transpositions;
  }

  /**
   * Returns the compiled automata used to match terms
   */
  public CompiledAutomaton getAutomata() {
    FuzzyAutomatonBuilder builder = new FuzzyAutomatonBuilder(term.text(), maxEdits, prefixLength, transpositions);
    return builder.buildMaxEditAutomaton();
  }

  @Override
  public void visit(QueryVisitor visitor) {
    if (visitor.acceptField(field)) {
      if (maxEdits == 0 || prefixLength >= term.text().length()) {
        visitor.consumeTerms(this, term);
      } else {
        visitor.consumeTermsMatching(this, term.field(), () -> getAutomata().runAutomaton);
      }
    }
  }

  @Override
  protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
    if (maxEdits == 0) { // can only match if it's exact
      return new SingleTermsEnum(terms.iterator(), term.bytes());
    }
    return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
  }

  /**
   * Returns the pattern term.
   */
  public Term getTerm() {
    return term;
  }
    
  @Override
  public String toString(String field) {
    final StringBuilder buffer = new StringBuilder();
    if (!term.field().equals(field)) {
        buffer.append(term.field());
        buffer.append(":");
    }
    buffer.append(term.text());
    buffer.append('~');
    buffer.append(maxEdits);
    return buffer.toString();
  }

  @Override
  public int hashCode() {
    final int prime = 31;
    int result = super.hashCode();
    result = prime * result + maxEdits;
    result = prime * result + prefixLength;
    result = prime * result + maxExpansions;
    result = prime * result + (transpositions ? 0 : 1);
    result = prime * result + ((term == null) ? 0 : term.hashCode());
    return result;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj)
      return true;
    if (!super.equals(obj))
      return false;
    if (getClass() != obj.getClass())
      return false;
    FuzzyQuery other = (FuzzyQuery) obj;
    return Objects.equals(maxEdits, other.maxEdits) && Objects.equals(prefixLength, other.prefixLength)
        && Objects.equals(maxExpansions, other.maxExpansions) && Objects.equals(transpositions, other.transpositions)
        && Objects.equals(term, other.term);
  }

  /**
   * Helper function to convert from "minimumSimilarity" fractions
   * to raw edit distances.
   * 
   * @param minimumSimilarity scaled similarity
   * @param termLen length (in unicode codepoints) of the term.
   * @return equivalent number of maxEdits
   */
  public static int floatToEdits(float minimumSimilarity, int termLen) {
    if (minimumSimilarity >= 1f) {
      return (int) Math.min(minimumSimilarity, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
    } else if (minimumSimilarity == 0.0f) {
      return 0; // 0 means exact, not infinite # of edits!
    } else {
      return Math.min((int) ((1D-minimumSimilarity) * termLen), 
        LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
    }
  }

}