/* Copyleft (C) 2005 Hélio Perroni Filho [email protected] ICQ: 2490863 This file is part of ChatterBean. ChatterBean is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. ChatterBean is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with ChatterBean (look at the Documents/ directory); if not, either write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA, or visit (http://www.gnu.org/licenses/gpl.txt). */ package bitoflife.chatterbean.text; import com.util.ChineseSegmenter; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; import static java.util.regex.Pattern.CASE_INSENSITIVE; import static java.util.regex.Pattern.UNICODE_CASE; /** * Provides operations for normalizing a request, before submiting it to the matching operation. */ public class Transformations { /* Inner Classes */ private class Mapper { /* Attributes */ private int charIndex; private int listIndex; private int spaceCount; private final List<Integer> mappings = new LinkedList<Integer>(); private String input; private String find; private String replace; /* Constructor */ public Mapper(String input) { char[] chars = input.toCharArray(); for (int i = 0, n = chars.length; i < n; i++) if (chars[i] == ' ') mappings.add(i); } /* Methods */ private int spaceCount(String string) { return spaceCount(string, 0, string.length()); } private int spaceCount(String string, int beginIndex, int endIndex) { int spaces = 0; char[] chars = string.toCharArray(); for (int i = beginIndex, n = endIndex; i < n; i++) if (chars[i] == ' ') spaces++; return spaces; } public void prepare(String input, String find, String replace) { this.input = input; this.find = find; this.replace = replace; spaceCount = spaceCount(find); listIndex = 0; charIndex = 0; } public void update(int beginIndex) { listIndex += spaceCount(input, charIndex, beginIndex); charIndex = beginIndex; int n = spaceCount; for (int j = 0, m = replace.length(); j < m; j++) if (replace.charAt(j) == ' ' && --n < 0) mappings.add(listIndex++, null); while (n-- > 0 && mappings.size() > listIndex) // 中文支持 mappings.remove(listIndex); } public Integer[] toArray() { return mappings.toArray(INTEGER_ARRAY); } } /* Attribute Section */ private static final Integer[] INTEGER_ARRAY = new Integer[0]; private final Tokenizer tokenizer; // private final Pattern fitting = Pattern.compile("[^A-Z0-9]+"); // 中文支持 private final Pattern fitting = Pattern.compile("[^A-Z0-9\\u4e00-\\u9fa5]+"); private final Pattern wordBreakers = Pattern.compile("([,;:])([A-Za-z]|\\s{2,})"); // The regular expression which will split entries by sentence splitters. private final SentenceSplitter splitter; // The collection of substitutions known to the system. private Map<String, String> correction; private Map<String, String> protection; private List<Substitution> person; private List<Substitution> person2; private List<Substitution> gender; /* Constructor Section */ /** * Constructs a new Transformations out of a list of sentence splitters and several substitution maps. */ public Transformations(List<String> splitters, Map<String, Map<String, String>> substitutions, Tokenizer tokenizer) { this.tokenizer = tokenizer; this.splitter = new SentenceSplitter(substitutions.get("protection"), splitters); correction = substitutions.get("correction"); person = newSubstitutionList(substitutions.get("person")); person2 = newSubstitutionList(substitutions.get("person2")); gender = newSubstitutionList(substitutions.get("gender")); } /* Method Section */ private List<Substitution> newSubstitutionList(Map<String, String> inputs) { List<Substitution> subsitutions = new ArrayList<Substitution>(inputs.size()); for (Entry<String, String> entry : inputs.entrySet()) { Substitution substitution = new Substitution(entry.getKey(), entry.getValue(), tokenizer); subsitutions.add(substitution); } return subsitutions; } /** * 这个breakWords感觉是分词的意思 * @param input * @return */ private String breakWords(String input) { /* See the description of java.util.regex.Matcher.appendReplacement() in the Javadocs to understand this code. */ Matcher matcher = wordBreakers.matcher(input); StringBuffer buffer = new StringBuffer(); while (matcher.find()) { String replace = matcher.group(2); if (replace.charAt(0) != ' ') replace = matcher.group(1) + ' ' + replace; else replace = matcher.group(1) + ' '; matcher.appendReplacement(buffer, replace); } matcher.appendTail(buffer); return buffer.toString(); } private String fit(String input) { input = input.toUpperCase(); Matcher matcher = fitting.matcher(input); return matcher.replaceAll(" "); } /** * 开启大写, * Turns the entry to UPPERCASE, takes sequences of non-alphanumeric characters out of it (replacing them with a single whitespace) and sees that the entry is trimmed off leading and trailing whitespaces. */ private String fit(String input, Mapper mapper) { input = input.toUpperCase(); Matcher matcher = fitting.matcher(input); StringBuffer buffer = new StringBuffer(); while (!matcher.hitEnd() && matcher.find()) { mapper.prepare(input, matcher.group(), " "); mapper.update(matcher.start()); matcher.appendReplacement(buffer, " "); } matcher.appendTail(buffer); return buffer.toString(); } private String substitute(String input) { for (String find : correction.keySet()) { Pattern pattern = Pattern.compile(find, CASE_INSENSITIVE | UNICODE_CASE); Matcher matcher = pattern.matcher(input); String replace = correction.get(find); input = matcher.replaceAll(replace); } return input; } private String substitute(String input, Mapper mapper) { StringBuffer buffer = new StringBuffer(); for (String find : correction.keySet()) { Pattern pattern = Pattern.compile(find, CASE_INSENSITIVE | UNICODE_CASE); Matcher matcher = pattern.matcher(input); String replace = correction.get(find); mapper.prepare(input, find, replace); while (!matcher.hitEnd() && matcher.find()) { mapper.update(matcher.start() + 1); matcher.appendReplacement(buffer, replace); } matcher.appendTail(buffer); input = buffer.toString(); buffer.delete(0, buffer.length()); } return input; } private String transform(String input, List<Substitution> substitutions) { List<String> tokens = tokenizer.tokenize(input); outer: for (int i = 0; i < tokens.size(); ) { int offset = i; for (final Substitution substitution : substitutions) { i = substitution.substitute(offset, tokens); if (i > offset) continue outer; } // Only gets here if no substitution matches. i++; } return tokenizer.toString(tokens); } public void normalization(Request request) { // 处理两边的空格,且将两个以上的空格为一个空格 String original = ' ' + request.getOriginal() + ' '; original = original.replaceAll("\\s{2,}", " "); String input[] = splitter.split(original); Sentence[] sentences = new Sentence[input.length]; for (int i = 0, n = input.length; i < n; i++) { sentences[i] = new Sentence(input[i]); normalization(sentences[i]);// 调用normalization } request.setOriginal(original); request.setSentences(sentences); } /** * 处理 * * @param sentence 句子 */ public void normalization(Sentence sentence) { // 处理两个以上的空格为一个空格 String input = breakWords(sentence.getOriginal()); // 处理分词 input = ChineseSegmenter.analysis(input); // 前后空格追加 input = ' ' + input + ' '; input = input.replaceAll("\\s{2,}", " "); sentence.setOriginal(input); Mapper mapper = new Mapper(input); input = substitute(input, mapper); input = fit(input, mapper); sentence.setMappings(mapper.toArray()); sentence.setNormalized(input); } /** * 被用于查询li * @param input * @return */ public String normalization(String input) { // 处理分词 input = ChineseSegmenter.analysis(input); // 空格处理 input = ' ' + input + ' '; input = input.replaceAll("\\s{2,}", " "); input = substitute(input); input = fit(input); return input; } public String gender(String input) { return transform(input, gender); } public String person(String input) { return transform(input, person); } public String person2(String input) { return transform(input, person2); } }