/** * * APDPlat - Application Product Development Platform * Copyright (c) 2013, 杨尚川, [email protected] * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.apdplat.word.analysis; import org.apdplat.word.segmentation.Word; import java.util.List; /** * 文本相似度计算 * 判定方式:编辑距离(Edit Distance) * 指两个字串之间,由一个转成另一个所需的最少编辑操作次数 * 允许的编辑操作包括将一个字符替换成另一个字符,增加一个字符,删除一个字符 * 例如将kitten一字转成sitting: * sitten (k→s)将一个字符k替换成另一个字符s * sittin (e→i)将一个字符e替换成另一个字符i * sitting (→g)增加一个字符g * 因为这个算法是俄罗斯科学家Vladimir Levenshtein在1965年提出 * 所以编辑距离(Edit Distance)又称Levenshtein距离 * @author 杨尚川 */ public class EditDistanceTextSimilarity extends TextSimilarity { /** * 计算相似度分值 * @param words1 词列表1 * @param words2 词列表2 * @return 相似度分值 */ @Override protected double scoreImpl(List<Word> words1, List<Word> words2){ //文本1 StringBuilder text1 = new StringBuilder(); words1.forEach(word -> text1.append(word.getText())); //文本2 StringBuilder text2 = new StringBuilder(); words2.forEach(word -> text2.append(word.getText())); int maxTextLength = Math.max(text1.length(), text2.length()); if(maxTextLength == 0){ //两个空字符串 return 1.0; } //计算文本1和文本2的编辑距离 int editDistance = editDistance(text1.toString(), text2.toString()); double score = (1 - editDistance / (double)maxTextLength); if(LOGGER.isDebugEnabled()){ LOGGER.debug("文本1:"+text1.toString()); LOGGER.debug("文本2:"+text2.toString()); LOGGER.debug("文本1和文本2的编辑距离:"+editDistance); LOGGER.debug("文本1和文本2的最大长度:"+maxTextLength); LOGGER.debug("文本1和文本2的相似度分值:1 - "+editDistance+" / (double)"+maxTextLength+"="+score); } return score; } private int editDistance(String text1, String text2) { int[] costs = new int[text2.length() + 1]; for (int i = 0; i <= text1.length(); i++) { int previousValue = i; for (int j = 0; j <= text2.length(); j++) { if (i == 0) { costs[j] = j; } else if (j > 0) { int useValue = costs[j - 1]; if (text1.charAt(i - 1) != text2.charAt(j - 1)) { useValue = Math.min(Math.min(useValue, previousValue), costs[j]) + 1; } costs[j - 1] = previousValue; previousValue = useValue; } } if (i > 0) { costs[text2.length()] = previousValue; } } return costs[text2.length()]; } public static void main(String[] args) { String text1 = "我爱购物"; String text2 = "我爱读书"; String text3 = "他是黑客"; TextSimilarity textSimilarity = new EditDistanceTextSimilarity(); double score1pk1 = textSimilarity.similarScore(text1, text1); double score1pk2 = textSimilarity.similarScore(text1, text2); double score1pk3 = textSimilarity.similarScore(text1, text3); double score2pk2 = textSimilarity.similarScore(text2, text2); double score2pk3 = textSimilarity.similarScore(text2, text3); double score3pk3 = textSimilarity.similarScore(text3, text3); System.out.println(text1+" 和 "+text1+" 的相似度分值:"+score1pk1); System.out.println(text1+" 和 "+text2+" 的相似度分值:"+score1pk2); System.out.println(text1+" 和 "+text3+" 的相似度分值:"+score1pk3); System.out.println(text2+" 和 "+text2+" 的相似度分值:"+score2pk2); System.out.println(text2+" 和 "+text3+" 的相似度分值:"+score2pk3); System.out.println(text3+" 和 "+text3+" 的相似度分值:"+score3pk3); } }