/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.joshua.decoder.io; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Denormalize a(n English) string in a collection of ways listed below. * <UL> * <LI>Capitalize the first character in the string</LI> * <LI>Detokenize</LI> * <LI>Delete whitespace in front of periods and commas</LI> * <LI>Join contractions</LI> * <LI>Capitalize name titles (Mr Ms Miss Dr etc.)</LI> * <LI>TODO: Handle surrounding characters ([{<"''">}])</LI> * <LI>TODO: Join multi-period abbreviations (e.g. M.Phil. i.e.)</LI> * <LI>TODO: Handle ambiguities like "st.", which can be an abbreviation for both "Saint" and * "street"</LI> * <LI>TODO: Capitalize both the title and the name of a person, e.g. Mr. Morton (named entities * should be demarcated).</LI> * </UL> * <b>N.B.</b> These methods all assume that every translation result that will be * denormalized has the following format: * <UL> * <LI>There is only one space between every pair of tokens</LI> * <LI>There is no whitespace before the first token</LI> * <LI>There is no whitespace after the final token</LI> * <LI>Standard spaces are the only type of whitespace</LI> * </UL> */ public class DeNormalize { /** * Apply all the denormalization methods to the normalized input line. * * @param normalized a normalized input line * @return the denormalized String */ public static String processSingleLine(String normalized) { // The order in which the methods are applied could matter in some situations. E.g., a token to // be matched is "phd", but if it is the first token in the line, it might have already been // capitalized to "Phd" by the capitalizeFirstLetter method, and because the "phd" token won't // match, "Phd" won't be corrected to "PhD". String deNormalized = normalized; deNormalized = capitalizeNameTitleAbbrvs(deNormalized); deNormalized = replaceBracketTokens(deNormalized); deNormalized = joinPunctuationMarks(deNormalized); deNormalized = joinHyphen(deNormalized); deNormalized = joinContractions(deNormalized); deNormalized = capitalizeLineFirstLetter(deNormalized); return deNormalized; } /** * Capitalize the first letter of a line. This should be the last denormalization step applied to * a line. * * @param line The single-line input string * @return The input string modified as described above */ public static String capitalizeLineFirstLetter(String line) { String result = null; Pattern regexp = Pattern.compile("[^\\p{Punct}\\p{Space}¡¿]"); Matcher matcher = regexp.matcher(line); if (matcher.find()) { String match = matcher.group(0); result = line.replaceFirst(match, match.toUpperCase()); } else { result = line; } return result; } /** * Scanning from left-to-right, a comma or period preceded by a space will become just the * comma/period. * * @param line The single-line input string * @return The input string modified as described above */ public static String joinPunctuationMarks(String line) { String result = line; result = result.replace(" ,", ","); result = result.replace(" ;", ";"); result = result.replace(" :", ":"); result = result.replace(" .", "."); result = result.replace(" !", "!"); result = result.replace("¡ ", "¡"); result = result.replace(" ?", "?"); result = result.replace("¿ ", "¿"); result = result.replace(" )", ")"); result = result.replace(" ]", "]"); result = result.replace(" }", "}"); result = result.replace("( ", "("); result = result.replace("[ ", "["); result = result.replace("{ ", "{"); return result; } /** * Scanning from left-to-right, a hyphen surrounded by a space before and after it will become * just the hyphen. * * @param line The single-line input string * @return The input string modified as described above */ public static String joinHyphen(String line) { return line.replace(" - ", "-"); } /** * Scanning the line from left-to-right, a contraction suffix preceded by a space will become just * the contraction suffix. <br> * <br> * I.e., the preceding space will be deleting, joining the prefix to the suffix. <br> * <br> * E.g. * * <pre>wo n't</pre> * * becomes * * <pre>won't</pre> * * @param line The single-line input string * @return The input string modified as described above */ public static String joinContractions(String line) { String result = line; for (String suffix : new String[] {"'d", "'ll", "'m", "n't", "'re", "'s", "'ve",}) { result = result.replace(" " + suffix, suffix); } return result; } /** * Capitalize the first character of the titles of names: Mr Mrs Ms Miss Dr Prof * * @param line The single-line input string * @return The input string modified as described above */ public static String capitalizeNameTitleAbbrvs(String line) { String result = line; // Capitalize only the first character of certain name titles. for (String title : new String[] {"dr", "miss", "mr", "mrs", "ms", "prof"}) { result = result.replaceAll("\\b" + title + "\\b", Character.toUpperCase(title.charAt(0)) + title.substring(1)); } // Capitalize the relevant characters of certain name titles. result = result.replaceAll("\\b" + "phd" + "\\b", "PhD"); result = result.replaceAll("\\b" + "mphil" + "\\b", "MPhil"); return result; } public static String capitalizeI(String line) { // Capitalize only the first character of certain name titles. return line.replaceAll("\\b" + "i" + "\\b", "I"); } /** * Case-insensitively replace all of the character sequences that represent a bracket character. * * Keys are token representations of abbreviations of titles for names that capitalize more than * just the first letter.<br> * Bracket token sequences: -lrb- -rrb- -lsb- -rsb- -lcb- -rcb- <br> * <br> * See http://www.cis.upenn.edu/~treebank/tokenization.html * * @param line The single-line input string * @return The input string modified as described above */ public static String replaceBracketTokens(String line) { String result = line; result = result.replaceAll("(?iu)" + "-lrb-", "("); result = result.replaceAll("(?iu)" + "-rrb-", ")"); result = result.replaceAll("(?iu)" + "-lsb-", "["); result = result.replaceAll("(?iu)" + "-rsb-", "]"); result = result.replaceAll("(?iu)" + "-lcb-", "{"); result = result.replaceAll("(?iu)" + "-rcb-", "}"); return result; } }