Java Code Examples for org.apache.commons.lang.StringUtils#getLevenshteinDistance()

The following examples show how to use org.apache.commons.lang.StringUtils#getLevenshteinDistance(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out the right sidebar, which shows the related API usage.
Example 1
Source Project: hop   File: ValueDataUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Computes the Levenshtein distance (LD) between the string forms of two values.
 *
 * <p>LD is a measure of the similarity between a source string (s) and a target string (t): the number of
 * deletions, insertions, or substitutions required to transform s into t.
 *
 * @param metaA metadata of the first value; not read by this method
 * @param dataA first value, compared through its {@code toString()} form
 * @param metaB metadata of the second value; not read by this method
 * @param dataB second value, compared through its {@code toString()} form
 * @return the distance as a {@code Long}, or {@code null} when either value is {@code null}
 */
public static Long getLevenshtein_Distance( IValueMeta metaA, Object dataA, IValueMeta metaB,
                                            Object dataB ) {
  if ( dataA == null || dataB == null ) {
    return null;
  }
  // Long.valueOf replaces the deprecated Long(long) constructor and can reuse cached instances.
  return Long.valueOf( StringUtils.getLevenshteinDistance( dataA.toString(), dataB.toString() ) );
}
 
Example 2
/**
 * Two abstracts are regarded as probably the same if their Levenshtein distance,
 * expressed as a percentage of the shorter text's length, is below {@code MAX_DISTANCE_PERCENT}.
 *
 * @param other the merger to compare with; anything that is not an {@code MCRAbstractMerger} never matches
 * @return {@code true} if both texts are close enough to be considered the same abstract
 */
@Override
public boolean isProbablySameAs(MCRMerger other) {
    if (!(other instanceof MCRAbstractMerger)) {
        return false;
    }

    String textOther = ((MCRAbstractMerger) other).text;
    int length = Math.min(text.length(), textOther.length());
    if (length == 0) {
        // Guard against division by zero: an empty text can only match another empty text.
        return text.length() == textOther.length();
    }
    int distance = StringUtils.getLevenshteinDistance(text, textOther);
    // Multiply as long so that distance * 100 cannot overflow for very long texts.
    return (distance * 100L / length) < MAX_DISTANCE_PERCENT;
}
 
Example 3
/**
 * Tells whether two strings differ by at most one edit (Levenshtein distance of 0 or 1).
 *
 * @param s0 first string
 * @param s1 second string
 * @return {@code false} if either string is rejected by {@code Strings.isEmpty}; otherwise whether the
 *         distance is at most 1
 */
@Override
public boolean isSimilar(String s0, String s1) {
	final boolean bothPresent = !Strings.isEmpty(s0) && !Strings.isEmpty(s1);
	// Short-circuit keeps the distance computation away from empty input.
	return bothPresent && StringUtils.getLevenshteinDistance(s0, s1) <= 1;
}
 
Example 4
Source Project: rya   File: DuplicateDataDetector.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether two strings are approximately equal under the configured tolerance.
 *
 * <p>Strings that are equal ignoring case always match. With a zero tolerance nothing else matches.
 * Otherwise {@code rhs} matches when it is listed as an equivalent term of {@code lhs}, or when the
 * Levenshtein distance satisfies the tolerance: as a fraction of {@code lhs}'s length for
 * {@code PERCENTAGE}, or as an absolute number of edits for any other tolerance type.
 *
 * @param lhs the left-hand string
 * @param rhs the right-hand string
 * @return {@code true} if the strings are considered approximately equal
 */
@Override
public boolean areObjectsApproxEquals(final String lhs, final String rhs) {
    if (isOnlyOneNull(lhs, rhs)) {
        return false;
    }
    if (StringUtils.equalsIgnoreCase(lhs, rhs)) {
        // Equal apart from case: nothing more to check.
        return true;
    }
    if (tolerance.getValue() == 0) {
        // Zero tolerance accepts nothing beyond the case-insensitive match above.
        return false;
    }

    // Equivalence is looked up for lhs only; terms are not bi-directionally
    // equivalent unless specified.
    final List<String> knownEquivalents = equivalentTermsMap.get(lhs);
    final boolean listedAsEquivalent = knownEquivalents != null && knownEquivalents.contains(rhs);
    if (listedAsEquivalent) {
        return true;
    }

    final int editDistance = StringUtils.getLevenshteinDistance(lhs, rhs);
    switch (tolerance.getToleranceType()) {
        case PERCENTAGE:
            if (lhs.isEmpty()) {
                // No length to take a fraction of: only an empty rhs can match.
                return rhs.isEmpty();
            }
            if (tolerance.getValue() >= 1) {
                return true;
            }
            final double ratio = (double) editDistance / lhs.length();
            return ratio <= tolerance.getValue();
        case DIFFERENCE:
        default:
            return editDistance <= tolerance.getValue();
    }
}
 
Example 5
Source Project: rya   File: DuplicateDataDetector.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether two IRIs are approximately equal under the configured tolerance.
 *
 * <p>Equal IRIs, and IRIs whose string values are equal ignoring case, always match. With a zero
 * tolerance nothing else matches. Otherwise the Levenshtein distance between the string values must
 * satisfy the tolerance: as a fraction of the first string's length for {@code PERCENTAGE}, or as an
 * absolute number of edits for any other tolerance type.
 *
 * @param lhs the left-hand IRI
 * @param rhs the right-hand IRI
 * @return {@code true} if the IRIs are considered approximately equal
 */
@Override
public boolean areObjectsApproxEquals(final IRI lhs, final IRI rhs) {
    if (isOnlyOneNull(lhs, rhs)) {
        return false;
    }
    if (Objects.equals(lhs, rhs)) {
        return true;
    }

    final String leftText = lhs.stringValue();
    final String rightText = rhs.stringValue();
    if (StringUtils.equalsIgnoreCase(leftText, rightText)) {
        // Equal apart from case: nothing more to check.
        return true;
    }
    if (tolerance.getValue() == 0) {
        // Zero tolerance accepts nothing beyond the matches above.
        return false;
    }

    final int editDistance = StringUtils.getLevenshteinDistance(leftText, rightText);
    switch (tolerance.getToleranceType()) {
        case PERCENTAGE:
            if (leftText.isEmpty()) {
                // No length to take a fraction of: only an empty right-hand string can match.
                return rightText.isEmpty();
            }
            if (tolerance.getValue() >= 1) {
                return true;
            }
            final double ratio = (double) editDistance / leftText.length();
            return ratio <= tolerance.getValue();
        case DIFFERENCE:
        default:
            return editDistance <= tolerance.getValue();
    }
}
 
Example 6
Source Project: pentaho-kettle   File: ValueDataUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Computes the Levenshtein distance (LD) between the string forms of two values.
 *
 * <p>LD is a measure of the similarity between a source string (s) and a target string (t): the number of
 * deletions, insertions, or substitutions required to transform s into t.
 *
 * @param metaA metadata of the first value; not read by this method
 * @param dataA first value, compared through its {@code toString()} form
 * @param metaB metadata of the second value; not read by this method
 * @param dataB second value, compared through its {@code toString()} form
 * @return the distance as a {@code Long}, or {@code null} when either value is {@code null}
 */
public static Long getLevenshtein_Distance( ValueMetaInterface metaA, Object dataA, ValueMetaInterface metaB,
  Object dataB ) {
  if ( dataA == null || dataB == null ) {
    return null;
  }
  // Long.valueOf replaces the deprecated Long(long) constructor and can reuse cached instances.
  return Long.valueOf( StringUtils.getLevenshteinDistance( dataA.toString(), dataB.toString() ) );
}
 
Example 7
Source Project: hop   File: FuzzyMatch.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Compares the main-field value of the given row with the key of every cached lookup row and fills a
 * result row with the key(s) whose distance lies within
 * {@code [data.minimalDistance, data.maximalDistance]}.
 *
 * <p>The metric is selected by {@code meta.getAlgorithmType()}: Damerau-Levenshtein, Needleman-Wunsch
 * (absolute value of the score cast to int), or plain Levenshtein for any other type. Unless the meta is
 * case sensitive, both strings are lower-cased before being measured.
 *
 * <p>When {@code meta.isGetCloserValue()} is set, only the match with the smallest distance is kept (the
 * first one wins on ties), optionally followed by the distance and the additional cached fields.
 * Otherwise every matching key is appended to slot 0, separated by {@code data.valueSeparator}.
 *
 * @param row the input row holding the value to look up at {@code data.indexOfMainField}
 * @return the row obtained from {@code buildEmptyRow()}, filled as described above
 * @throws HopValueException declared by this method; presumably raised when the main field cannot be
 *                           read as a string (TODO confirm against getString)
 */
private Object[] doDistance( Object[] row ) throws HopValueException {
  // Reserve room
  Object[] rowData = buildEmptyRow();

  Iterator<Object[]> it = data.look.iterator();

  // Best distance found so far; -1 means "no match yet".
  long distance = -1;

  // Object o=row[data.indexOfMainField];
  String lookupvalue = getInputRowMeta().getString( row, data.indexOfMainField );

  while ( it.hasNext() ) {
    // Get cached row data
    Object[] cachedData = it.next();
    // Key value is the first value
    String cacheValue = (String) cachedData[ 0 ];

    int cdistance = -1;
    // Values actually compared; the original cacheValue is what ends up in the result row.
    String usecacheValue = cacheValue;
    String uselookupvalue = lookupvalue;
    if ( !meta.isCaseSensitive() ) {
      usecacheValue = cacheValue.toLowerCase();
      uselookupvalue = lookupvalue.toLowerCase();
    }

    switch ( meta.getAlgorithmType() ) {
      case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN:
        cdistance = Utils.getDamerauLevenshteinDistance( usecacheValue, uselookupvalue );
        break;
      case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH:
        // The score is cast to int and its absolute value is used as the distance.
        cdistance = Math.abs( (int) new NeedlemanWunsch().score( usecacheValue, uselookupvalue ) );
        break;
      default:
        cdistance = StringUtils.getLevenshteinDistance( usecacheValue, uselookupvalue );
        break;
    }

    // Only distances inside the configured inclusive range count as matches.
    if ( data.minimalDistance <= cdistance && cdistance <= data.maximalDistance ) {
      if ( meta.isGetCloserValue() ) {
        // Strictly smaller wins, so the first of several equally close values is kept.
        if ( cdistance < distance || distance == -1 ) {
          // Get closer value
          // minimal distance
          distance = cdistance;
          int index = 0;
          rowData[ index++ ] = cacheValue;
          // Add metric value?
          if ( data.addValueFieldName ) {
            // The long distance is autoboxed when stored in the Object[] row.
            rowData[ index++ ] = distance;
          }
          // Add additional return values?
          if ( data.addAdditionalFields ) {
            // Copy cached columns 1..n to the slots following the key (and the metric, if present).
            for ( int i = 0; i < meta.getValue().length; i++ ) {
              int nr = i + 1;
              int nf = i + index;
              rowData[ nf ] = cachedData[ nr ];
            }
          }
        }
      } else {
        // get all values separated by values separator
        if ( rowData[ 0 ] == null ) {
          rowData[ 0 ] = cacheValue;
        } else {
          rowData[ 0 ] = (String) rowData[ 0 ] + data.valueSeparator + cacheValue;
        }
      }
    }
  }

  return rowData;
}
 
Example 8
/**
 * Searches the given items for the single best match for a pattern.
 *
 * <p>An item equal to the pattern wins immediately. Otherwise the match set is taken from the first
 * non-empty group, in this order: items equal to the pattern ignoring case, items fully matching the
 * pattern built by {@code getPatternForName}, items whose start matches that pattern ignoring case.
 * Items matching neither form of that pattern, but within a small Levenshtein distance of the pattern
 * (ignoring case), are recorded as candidates.
 *
 * @return The match if exactly 1 match found, null if no matches or multiple matches.
 */
public String find(String pattern, Collection<String> items) {
    this.pattern = pattern;
    matches.clear();
    candidates.clear();

    // An exact hit needs no further ranking.
    if (items.contains(pattern)) {
        matches.add(pattern);
        return pattern;
    }
    if (pattern.length() == 0) {
        return null;
    }

    Pattern strictCamelCase = getPatternForName(pattern);
    Pattern looseCamelCase = Pattern.compile(strictCamelCase.pattern(), Pattern.CASE_INSENSITIVE);
    String upperPattern = pattern.toUpperCase();
    // Allowed edit distance for a "did you mean" candidate: at most 3, and at most half the pattern length.
    int maxDistance = Math.min(3, pattern.length() / 2);

    Set<String> equalIgnoringCase = new TreeSet<String>();
    Set<String> strictCamelCaseHits = new TreeSet<String>();
    Set<String> looseCamelCaseHits = new TreeSet<String>();

    for (String item : items) {
        // Recorded independently of the classification below.
        if (item.equalsIgnoreCase(pattern)) {
            equalIgnoringCase.add(item);
        }
        if (strictCamelCase.matcher(item).matches()) {
            strictCamelCaseHits.add(item);
        } else if (looseCamelCase.matcher(item).lookingAt()) {
            looseCamelCaseHits.add(item);
        } else if (StringUtils.getLevenshteinDistance(upperPattern, item.toUpperCase()) <= maxDistance) {
            candidates.add(item);
        }
    }

    // Use the highest-priority group that found anything.
    Set<String> selected = !equalIgnoringCase.isEmpty()
        ? equalIgnoringCase
        : !strictCamelCaseHits.isEmpty() ? strictCamelCaseHits : looseCamelCaseHits;
    matches.addAll(selected);

    return matches.size() == 1 ? matches.first() : null;
}
 
Example 9
/**
 * Searches the given items for the single best match for a pattern.
 *
 * <p>An item equal to the pattern wins immediately. Otherwise the match set is taken from the first
 * non-empty group, in this order: items equal to the pattern ignoring case, items fully matching the
 * pattern built by {@code getPatternForName}, items whose start matches that pattern ignoring case.
 * Items matching neither form of that pattern, but within a small Levenshtein distance of the pattern
 * (ignoring case), are recorded as candidates.
 *
 * @return The match if exactly 1 match found, null if no matches or multiple matches.
 */
public String find(String pattern, Collection<String> items) {
    this.pattern = pattern;
    matches.clear();
    candidates.clear();

    // An exact hit needs no further ranking.
    if (items.contains(pattern)) {
        matches.add(pattern);
        return pattern;
    }
    if (pattern.length() == 0) {
        return null;
    }

    Pattern strictCamelCase = getPatternForName(pattern);
    Pattern looseCamelCase = Pattern.compile(strictCamelCase.pattern(), Pattern.CASE_INSENSITIVE);
    String upperPattern = pattern.toUpperCase();
    // Allowed edit distance for a "did you mean" candidate: at most 3, and at most half the pattern length.
    int maxDistance = Math.min(3, pattern.length() / 2);

    Set<String> equalIgnoringCase = new TreeSet<String>();
    Set<String> strictCamelCaseHits = new TreeSet<String>();
    Set<String> looseCamelCaseHits = new TreeSet<String>();

    for (String item : items) {
        // Recorded independently of the classification below.
        if (item.equalsIgnoreCase(pattern)) {
            equalIgnoringCase.add(item);
        }
        if (strictCamelCase.matcher(item).matches()) {
            strictCamelCaseHits.add(item);
        } else if (looseCamelCase.matcher(item).lookingAt()) {
            looseCamelCaseHits.add(item);
        } else if (StringUtils.getLevenshteinDistance(upperPattern, item.toUpperCase()) <= maxDistance) {
            candidates.add(item);
        }
    }

    // Use the highest-priority group that found anything.
    Set<String> selected = !equalIgnoringCase.isEmpty()
        ? equalIgnoringCase
        : !strictCamelCaseHits.isEmpty() ? strictCamelCaseHits : looseCamelCaseHits;
    matches.addAll(selected);

    return matches.size() == 1 ? matches.first() : null;
}
 
Example 10
/**
 * Computes the Levenshtein distance between two texts, normalised by the length of the longer text.
 *
 * @param text1 first text
 * @param text2 second text
 * @return the edit distance divided by the longer length; {@code 0.0} when both texts are empty
 */
private double getDistance(String text1, String text2) {
  int distance = StringUtils.getLevenshteinDistance(text1, text2);
  int longest = Math.max(text1.length(), text2.length());
  if (longest == 0) {
    // Two empty texts are identical; without this guard 0.0 / 0 would evaluate to NaN.
    return 0.0;
  }
  return (double) distance / longest;
}
 
Example 11
/**
 * Computes the Levenshtein distance between two texts, normalised by the length of the longer text.
 *
 * @param text1 first text
 * @param text2 second text
 * @return the edit distance divided by the longer length; {@code 0.0} when both texts are empty
 */
private double getDistance(String text1, String text2) {
  int distance = StringUtils.getLevenshteinDistance(text1, text2);
  int longest = Math.max(text1.length(), text2.length());
  if (longest == 0) {
    // Two empty texts are identical; without this guard 0.0 / 0 would evaluate to NaN.
    return 0.0;
  }
  return (double) distance / longest;
}
 
Example 12
/**
 * Tells whether two type names are within three edits of each other.
 *
 * @param s0 first type name
 * @param s1 second type name
 * @return {@code true} if the Levenshtein distance between the names is at most 3
 */
protected boolean isSimilarTypeName(String s0, String s1) {
	final int maxEdits = 3;
	return StringUtils.getLevenshteinDistance(s0, s1) <= maxEdits;
}
 
Example 13
/**
 * Searches the given items for the single best match for a pattern.
 *
 * <p>An item equal to the pattern wins immediately. Otherwise the match set is taken from the first
 * non-empty group, in this order: items equal to the pattern ignoring case, items fully matching the
 * pattern built by {@code getPatternForName}, items whose start matches that pattern ignoring case.
 * Items matching neither form of that pattern, but within a small Levenshtein distance of the pattern
 * (ignoring case), are recorded as candidates.
 *
 * @return The match if exactly 1 match found, null if no matches or multiple matches.
 */
public String find(String pattern, Collection<String> items) {
    this.pattern = pattern;
    matches.clear();
    candidates.clear();

    // An exact hit needs no further ranking.
    if (items.contains(pattern)) {
        matches.add(pattern);
        return pattern;
    }
    if (pattern.length() == 0) {
        return null;
    }

    Pattern strictCamelCase = getPatternForName(pattern);
    Pattern looseCamelCase = Pattern.compile(strictCamelCase.pattern(), Pattern.CASE_INSENSITIVE);
    String upperPattern = pattern.toUpperCase();
    // Allowed edit distance for a "did you mean" candidate: at most 3, and at most half the pattern length.
    int maxDistance = Math.min(3, pattern.length() / 2);

    Set<String> equalIgnoringCase = new TreeSet<String>();
    Set<String> strictCamelCaseHits = new TreeSet<String>();
    Set<String> looseCamelCaseHits = new TreeSet<String>();

    for (String item : items) {
        // Recorded independently of the classification below.
        if (item.equalsIgnoreCase(pattern)) {
            equalIgnoringCase.add(item);
        }
        if (strictCamelCase.matcher(item).matches()) {
            strictCamelCaseHits.add(item);
        } else if (looseCamelCase.matcher(item).lookingAt()) {
            looseCamelCaseHits.add(item);
        } else if (StringUtils.getLevenshteinDistance(upperPattern, item.toUpperCase()) <= maxDistance) {
            candidates.add(item);
        }
    }

    // Use the highest-priority group that found anything.
    Set<String> selected = !equalIgnoringCase.isEmpty()
        ? equalIgnoringCase
        : !strictCamelCaseHits.isEmpty() ? strictCamelCaseHits : looseCamelCaseHits;
    matches.addAll(selected);

    return matches.size() == 1 ? matches.first() : null;
}
 
Example 14
/**
 * Searches the given items for the single best match for a pattern.
 *
 * <p>An item equal to the pattern wins immediately. Otherwise the match set is taken from the first
 * non-empty group, in this order: items equal to the pattern ignoring case, items fully matching the
 * pattern built by {@code getPatternForName}, items whose start matches that pattern ignoring case.
 * Items matching neither form of that pattern, but within a small Levenshtein distance of the pattern
 * (ignoring case), are recorded as candidates.
 *
 * @return The match if exactly 1 match found, null if no matches or multiple matches.
 */
public String find(String pattern, Collection<String> items) {
    this.pattern = pattern;
    matches.clear();
    candidates.clear();

    // An exact hit needs no further ranking.
    if (items.contains(pattern)) {
        matches.add(pattern);
        return pattern;
    }
    if (pattern.length() == 0) {
        return null;
    }

    Pattern strictCamelCase = getPatternForName(pattern);
    Pattern looseCamelCase = Pattern.compile(strictCamelCase.pattern(), Pattern.CASE_INSENSITIVE);
    String upperPattern = pattern.toUpperCase();
    // Allowed edit distance for a "did you mean" candidate: at most 3, and at most half the pattern length.
    int maxDistance = Math.min(3, pattern.length() / 2);

    Set<String> equalIgnoringCase = new TreeSet<String>();
    Set<String> strictCamelCaseHits = new TreeSet<String>();
    Set<String> looseCamelCaseHits = new TreeSet<String>();

    for (String item : items) {
        // Recorded independently of the classification below.
        if (item.equalsIgnoreCase(pattern)) {
            equalIgnoringCase.add(item);
        }
        if (strictCamelCase.matcher(item).matches()) {
            strictCamelCaseHits.add(item);
        } else if (looseCamelCase.matcher(item).lookingAt()) {
            looseCamelCaseHits.add(item);
        } else if (StringUtils.getLevenshteinDistance(upperPattern, item.toUpperCase()) <= maxDistance) {
            candidates.add(item);
        }
    }

    // Use the highest-priority group that found anything.
    Set<String> selected = !equalIgnoringCase.isEmpty()
        ? equalIgnoringCase
        : !strictCamelCaseHits.isEmpty() ? strictCamelCaseHits : looseCamelCaseHits;
    matches.addAll(selected);

    return matches.size() == 1 ? matches.first() : null;
}
 
Example 15
/**
 * Computes a similarity score from the Levenshtein distance between two strings.
 *
 * @param s1 first string
 * @param s2 second string
 * @return {@code 1 - distance / longerLength}: 1 for identical strings (including two empty strings),
 *         0 when the distance equals the longer length
 */
private float computeLDscore(String s1, String s2) {
    float maxLength = (float) Math.max(s1.length(), s2.length());
    if (maxLength == 0f) {
        // Two empty strings are identical; without this guard 0 / 0 would make the score NaN.
        return 1f;
    }
    float ld = (float) StringUtils.getLevenshteinDistance(s1, s2);
    return 1 - ld / maxLength;
}
 
Example 16
Source Project: pentaho-kettle   File: FuzzyMatch.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Compares the main-field value of the given row with the key of every cached lookup row and fills a
 * result row with the key(s) whose distance lies within
 * {@code [data.minimalDistance, data.maximalDistance]}.
 *
 * <p>The metric is selected by {@code meta.getAlgorithmType()}: Damerau-Levenshtein, Needleman-Wunsch
 * (absolute value of the score cast to int), or plain Levenshtein for any other type. Unless the meta is
 * case sensitive, both strings are lower-cased before being measured.
 *
 * <p>When {@code meta.isGetCloserValue()} is set, only the match with the smallest distance is kept (the
 * first one wins on ties), optionally followed by the distance and the additional cached fields.
 * Otherwise every matching key is appended to slot 0, separated by {@code data.valueSeparator}.
 *
 * @param row the input row holding the value to look up at {@code data.indexOfMainField}
 * @return the row obtained from {@code buildEmptyRow()}, filled as described above
 * @throws KettleValueException declared by this method; presumably raised when the main field cannot be
 *                              read as a string (TODO confirm against getString)
 */
private Object[] doDistance( Object[] row ) throws KettleValueException {
  // Reserve room
  Object[] rowData = buildEmptyRow();

  Iterator<Object[]> it = data.look.iterator();

  // Best distance found so far; -1 means "no match yet".
  long distance = -1;

  // Object o=row[data.indexOfMainField];
  String lookupvalue = getInputRowMeta().getString( row, data.indexOfMainField );

  while ( it.hasNext() ) {
    // Get cached row data
    Object[] cachedData = it.next();
    // Key value is the first value
    String cacheValue = (String) cachedData[0];

    int cdistance = -1;
    // Values actually compared; the original cacheValue is what ends up in the result row.
    String usecacheValue = cacheValue;
    String uselookupvalue = lookupvalue;
    if ( !meta.isCaseSensitive() ) {
      usecacheValue = cacheValue.toLowerCase();
      uselookupvalue = lookupvalue.toLowerCase();
    }

    switch ( meta.getAlgorithmType() ) {
      case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN:
        cdistance = Utils.getDamerauLevenshteinDistance( usecacheValue, uselookupvalue );
        break;
      case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH:
        // The score is cast to int and its absolute value is used as the distance.
        cdistance = Math.abs( (int) new NeedlemanWunsch().score( usecacheValue, uselookupvalue ) );
        break;
      default:
        cdistance = StringUtils.getLevenshteinDistance( usecacheValue, uselookupvalue );
        break;
    }

    // Only distances inside the configured inclusive range count as matches.
    if ( data.minimalDistance <= cdistance && cdistance <= data.maximalDistance ) {
      if ( meta.isGetCloserValue() ) {
        // Strictly smaller wins, so the first of several equally close values is kept.
        if ( cdistance < distance || distance == -1 ) {
          // Get closer value
          // minimal distance
          distance = cdistance;
          int index = 0;
          rowData[index++] = cacheValue;
          // Add metric value?
          if ( data.addValueFieldName ) {
            // The long distance is autoboxed when stored in the Object[] row.
            rowData[index++] = distance;
          }
          // Add additional return values?
          if ( data.addAdditionalFields ) {
            // Copy cached columns 1..n to the slots following the key (and the metric, if present).
            for ( int i = 0; i < meta.getValue().length; i++ ) {
              int nr = i + 1;
              int nf = i + index;
              rowData[nf] = cachedData[nr];
            }
          }
        }
      } else {
        // get all values separated by values separator
        if ( rowData[0] == null ) {
          rowData[0] = cacheValue;
        } else {
          rowData[0] = (String) rowData[0] + data.valueSeparator + cacheValue;
        }
      }
    }
  }

  return rowData;
}
 
Example 17
Source Project: anthelion   File: SpellCheckedMetadata.java    License: Apache License 2.0 3 votes vote down vote up
/**
 * Resolves a metadata attribute name to its well-known spelling, tolerating small errors.
 * The name is first normalized and looked up directly; if that fails, the first well-known
 * normalized name whose Levenshtein distance to it is below the threshold is used instead.
 * For instance,
 * <ul>
 * <li>content-type gives Content-Type</li>
 * <li>CoNtEntType gives Content-Type</li>
 * <li>ConTnTtYpe gives Content-Type</li>
 * </ul>
 * If no well-known metadata name matches, the original name is returned unchanged.
 *
 * @param name
 *          Name to normalize
 * @return normalized name
 */
public static String getNormalizedName(final String name) {
  final String searched = normalize(name);

  final String direct = NAMES_IDX.get(searched);
  if (direct != null) {
    return direct;
  }
  if (normalized == null) {
    return name;
  }

  // Longer names tolerate more edits: the threshold scales with the searched length.
  final int threshold = searched.length() / TRESHOLD_DIVIDER;
  for (int i = 0; i < normalized.length; i++) {
    if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) {
      final String fuzzy = NAMES_IDX.get(normalized[i]);
      if (fuzzy != null) {
        return fuzzy;
      }
    }
  }
  return name;
}
 
Example 18
/**
 * Resolves a metadata attribute name to its well-known spelling, tolerating small errors.
 * The name is first normalized and looked up directly; if that fails, the first well-known
 * normalized name whose Levenshtein distance to it is below the threshold is used instead.
 * For instance,
 * <ul>
 * <li>content-type gives Content-Type</li>
 * <li>CoNtEntType gives Content-Type</li>
 * <li>ConTnTtYpe gives Content-Type</li>
 * </ul>
 * If no well-known metadata name matches, the original name is returned unchanged.
 *
 * @param name
 *          Name to normalize
 * @return normalized name
 */
public static String getNormalizedName(final String name) {
  final String searched = normalize(name);

  final String direct = NAMES_IDX.get(searched);
  if (direct != null) {
    return direct;
  }
  if (normalized == null) {
    return name;
  }

  // Longer names tolerate more edits: the threshold scales with the searched length.
  final int threshold = searched.length() / TRESHOLD_DIVIDER;
  for (int i = 0; i < normalized.length; i++) {
    if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) {
      final String fuzzy = NAMES_IDX.get(normalized[i]);
      if (fuzzy != null) {
        return fuzzy;
      }
    }
  }
  return name;
}