package info.ephyra.nlp.semantics;

import info.ephyra.io.MsgPrinter;
import info.ephyra.util.FileCache;
import info.ephyra.util.RegexConverter;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A Wrapper for the ASSERT semantic role labeler.
 *
 * <p>Sentences are written to a temporary input file below <code>$ASSERT/scripts</code>, the
 * <code>assert</code> script is run on that file through a <code>bash</code> subprocess, and the
 * resulting <code>.parses</code> and <code>.log</code> files are read back. Parses can be cached on
 * disk through {@link FileCache}.
 *
 * @author Andy Schlaikjer, Nico Schlaefer
 * @version 2007-04-25
 */
public class ASSERT {
	/** Directory of ASSERT (value of the <code>ASSERT</code> environment variable, may be <code>null</code>). */
	private static final String ASSERT_DIR = System.getenv("ASSERT");

	/** Run ASSERT in client mode. */
	private static final boolean ASSERT_CLIENT_MODE = false;

	/** Enable caching of parses. */
	private static final boolean CACHING = true;

	/** Cache directory where parses are stored. */
	private static final String CACHE_DIR = "cache/assert";

	/** Character encoding used for all files exchanged with ASSERT. */
	private static final String ENCODING = "ISO-8859-1";

	/** Pattern for extracting parses from ASSERT's output file. */
	private static final Pattern PARSE_P = Pattern.compile("(\\d++): (.*+)");

	/** Pattern for extracting the sentence index from the beginning of a log file line. */
	private static final Pattern LOG_INDEX_P = Pattern.compile("^(\\d++): ");

	/** Pattern matching maximal runs of characters that are not line terminators. */
	private static final Pattern LINE_P = Pattern.compile(".++");

	/** Line in the output file that is tolerated although it is not a parse. */
	private static final String EMPTY_INPUT_MSG = "ERROR: Found an empty input file... exiting.";

	/** Substring of a log file line that is taken as evidence that ASSERT crashed. */
	private static final String CRASH_MARKER = " DOMAIN/FRAME/";

	/**
	 * Creates an <code>IOException</code> with the given message and cause. <code>initCause()</code> is used instead
	 * of the two-argument constructor to stay compatible with Java 5.
	 *
	 * @param message message of the new exception
	 * @param cause exception that triggered the failure
	 * @return exception to be thrown by the caller
	 */
	private static IOException ioFailure(String message, Throwable cause) {
		IOException wrapped = new IOException(message);
		wrapped.initCause(cause);
		return wrapped;
	}

	/**
	 * Creates a temporary file containing the sentences to be processed by ASSERT. The file is created in
	 * <code>ASSERT_DIR/scripts</code>, one sentence per line, and is not deleted by this class.
	 *
	 * @param ss sentences to be parsed
	 * @return input file
	 * @throws IOException if the file could not be created or written
	 */
	private static File createInputFile(String[] ss) throws Exception {
		try {
			File input = File.createTempFile("assert", ".input", new File(ASSERT_DIR + "/scripts"));

			PrintWriter pw = new PrintWriter(new BufferedWriter(
					new OutputStreamWriter(new FileOutputStream(input), ENCODING)));
			try {
				for (String sentence : ss) {
					pw.println(sentence);
					// PrintWriter swallows IOExceptions, so the error flag has to be polled
					if (pw.checkError()) throw new IOException("Could not write to " + input);
				}
			} finally {
				// always release the file handle, also if writing failed
				pw.close();
			}
			if (pw.checkError()) throw new IOException("Could not write to " + input);

			return input;
		} catch (IOException e) {
			throw ioFailure("Failed to create input file.", e);
		}
	}

	/**
	 * Instantiates an ASSERT process using the supplied input file and blocks until it has terminated.
	 *
	 * <p>NOTE(review): the paths are inserted into the shell command without quoting, so directories containing
	 * blanks or shell metacharacters are not supported.
	 *
	 * @param input an input file initialized previously with a call to <code>createInputFile()</code>
	 * @return log file
	 * @throws Exception if the process could not be started, was interrupted or returned a non-zero exit code
	 */
	private static File execAssertProcess(File input) throws Exception {
		String inputPath = input.getCanonicalPath();
		String basename = inputPath.substring(0, inputPath.lastIndexOf('.'));
		File logf = new File(basename + ".log");

		// instantiate an annotator process
		Process process;
		try {
			// in order to pipe stdout and stderr from our subprocess, we start
			// an instance of bash and then exec assert from within this shell.
			process = Runtime.getRuntime().exec("bash");
			PrintWriter pw = new PrintWriter(process.getOutputStream());

			// assert generates its intermediate and final output files within
			// the current working directory, so we first must "cd" to the
			// directory we want output to be stored, making sure to test error
			// status of the "cd" operation.
			String cmd = "cd " + input.getParent() + " 2> " + logf.getCanonicalPath()
					+ "; rv=\"$?\"; if [ \"$rv\" -ne \"0\" ]; then exit $rv; fi; "
					+ ASSERT_DIR + "/scripts/assert "
					+ (ASSERT_CLIENT_MODE ? "--mode=client " : "")
					+ input.getCanonicalPath()
					+ " >> " + logf.getCanonicalPath() + " 2>&1; exit";
			pw.println(cmd);

			// close stdin of the subprocess so that bash does not wait for
			// further commands from us.
			pw.close();
		} catch (Exception e) {
			throw new Exception("Failed to execute annotator process.", e);
		}

		// block on the annotator process; no timeout is enforced by this class,
		// so an InterruptedException can only originate from another thread
		// interrupting the caller.
		int rv;
		try {
			rv = process.waitFor();
		} catch (InterruptedException e) {
			// kill the subprocess and restore the interrupt status before
			// reporting the failure.
			process.destroy();
			Thread.currentThread().interrupt();
			throw new Exception("Process timed out.", e);
		}

		// test return value
		if (rv != 0) throw new Exception("Process returned error code: " + rv);

		return logf;
	}

	/**
	 * Reads the annotated sentences from the output file created by ASSERT. The output file is expected next to the
	 * input file, with the extension replaced by <code>.parses</code>. Each line has the format
	 * <code>&lt;sentence index&gt;: &lt;annotation&gt;</code>; consecutive lines with the same index are grouped.
	 * Sentences without any annotation are represented by an empty array.
	 *
	 * @param input an input file initialized previously with a call to <code>createInputFile()</code>
	 * @param sentCount number of sentences that have been passed to ASSERT
	 * @return annotated sentences, one (possibly empty) array per sentence
	 * @throws IOException if the output file cannot be read or contains a malformatted line
	 */
	private static String[][] readOutputFile(File input, int sentCount) throws Exception {
		try {
			String inputPath = input.getCanonicalPath();
			File output = new File(inputPath.substring(0, inputPath.lastIndexOf('.')) + ".parses");

			String[][] ass = new String[sentCount][];
			BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(output), ENCODING));
			try {
				ArrayList<String> as = new ArrayList<String>();
				int previous = -1;
				String line;
				while ((line = br.readLine()) != null) {
					Matcher parseM = PARSE_P.matcher(line);
					if (!parseM.find()) {
						// the only tolerated non-parse line is the empty-input message
						if (!line.equals(EMPTY_INPUT_MSG))
							throw new IOException("Malformatted line: " + line);
						continue;
					}

					int sid;
					try {
						sid = Integer.parseInt(parseM.group(1));
					} catch (NumberFormatException e) {
						throw ioFailure("Malformatted line: " + line, e);
					}
					if (sid >= sentCount)
						throw new IOException("Sentence index out of range: " + line);
					String annotation = parseM.group(2).trim();

					// a new sentence index closes the group of the previous sentence
					if (sid != previous) {
						if (as.size() > 0) ass[previous] = as.toArray(new String[as.size()]);
						as = new ArrayList<String>();
						previous = sid;
					}
					as.add(annotation);
				}
				if (as.size() > 0) ass[previous] = as.toArray(new String[as.size()]);
			} finally {
				// always release the file handle, also if parsing failed
				br.close();
			}

			// create an empty array for sentences that could not be parsed
			for (int i = 0; i < ass.length; i++)
				if (ass[i] == null) ass[i] = new String[0];

			return ass;
		} catch (IOException e) {
			throw ioFailure("Failed to parse output file.", e);
		}
	}

	/**
	 * Checks the log file for ASSERT failures. Returns <code>Integer.MAX_VALUE</code> if the log file does not contain
	 * the crash marker. Otherwise the most recent sentence index that was logged before the crash marker is returned,
	 * or -1 if no index was logged. -1 is also returned if the log file cannot be read.
	 *
	 * @param logf log file
	 * @return <code>Integer.MAX_VALUE</code>, index of the last sentence that was logged, or -1
	 */
	private static int checkLogFile(File logf) {
		try {
			BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(logf), ENCODING));
			try {
				int lastIndex = -1;
				String line;
				while ((line = br.readLine()) != null) {
					Matcher m = LOG_INDEX_P.matcher(line);
					if (m.find()) lastIndex = Integer.parseInt(m.group(1));

					if (line.contains(CRASH_MARKER)) return lastIndex;  // ASSERT crashed
				}
				return Integer.MAX_VALUE;  // log file looks ok
			} finally {
				br.close();
			}
		} catch (IOException e) {
			return -1;  // log file cannot be read
		}
	}

	/**
	 * Annotates the predicates in an array of sentences.
	 *
	 * <p>The sentences are normalized in place: line terminators (the only characters excluded by the pattern
	 * <code>.++</code>) are replaced by single blanks and the result is trimmed, so the elements of <code>ss</code>
	 * are modified. If caching is enabled, parses are read from and written to the cache. If ASSERT crashes, it is
	 * restarted on the sentences following the one that caused the crash. If ASSERT cannot be called at all, an
	 * error message is printed and the JVM is terminated with exit code 1. If the <code>ASSERT</code> environment
	 * variable is not set, only cached parses are returned.
	 *
	 * @param ss sentences to be parsed (modified in place, see above)
	 * @return annotated sentences, an empty array for each sentence that could not be parsed
	 */
	public static String[][] annotatePredicates(String ss[]) {
		// normalize the sentences: each must fit on a single line of the input file
		for (int i = 0; i < ss.length; i++) {
			StringBuilder noSpecChar = new StringBuilder();
			Matcher m = LINE_P.matcher(ss[i]);
			while (m.find()) noSpecChar.append(' ').append(m.group(0));
			ss[i] = noSpecChar.toString().trim();
		}

		// if caching is enabled, try to read parses from cache
		String[][] allParses = new String[ss.length][];  // parses from both cache and ASSERT
		ArrayList<Integer> originalIndices = new ArrayList<Integer>();  // used to merge parses from cache and ASSERT
		if (CACHING) {
			FileCache cache = new FileCache(CACHE_DIR);
			ArrayList<String> notInCache = new ArrayList<String>();  // sentences that are not in the cache
			for (int i = 0; i < ss.length; i++) {
				String[] parses = cache.read(ss[i]);
				if (parses != null) {
					allParses[i] = parses;
				} else {
					notInCache.add(ss[i]);
					originalIndices.add(i);
				}
			}
			// from here on, ss only contains the sentences that still have to be parsed
			ss = notInCache.toArray(new String[notInCache.size()]);
		}

		// get missing parses from ASSERT
		String[][] parses = new String[ss.length][];
		if (ss.length > 0 && ASSERT_DIR != null && ASSERT_DIR.length() > 0) {
			try {
				MsgPrinter.printStatusMsgTimestamp("Parsing " + ss.length + " sentences with ASSERT...");

				int beginIndex = 0;
				while (beginIndex < ss.length) {  // restart ASSERT if it crashed
					// copy sentences that have not been parsed yet
					String[] sentences = new String[ss.length - beginIndex];
					System.arraycopy(ss, beginIndex, sentences, 0, sentences.length);

					// parse these sentences
					File input = createInputFile(sentences);
					File logf = execAssertProcess(input);
					String[][] output = readOutputFile(input, sentences.length);

					// determine the last sentence (relative to this batch) that is covered by this run
					int lastIndex = checkLogFile(logf);
					if (lastIndex == Integer.MAX_VALUE) {
						lastIndex = sentences.length - 1;
					} else {
						// -1 means that no sentence index was found in the log; the first sentence of
						// the batch is skipped in this case to guarantee that the loop makes progress
						if (lastIndex < 0) lastIndex = 0;
						if (lastIndex >= sentences.length)
							throw new IOException("Sentence index in log file out of range: " + lastIndex);

						MsgPrinter.printErrorMsg("ASSERT could not parse sentence:\n" + sentences[lastIndex]);
						output[lastIndex] = null;
					}

					// merge parses in one array and continue after the last covered sentence
					System.arraycopy(output, 0, parses, beginIndex, lastIndex + 1);
					beginIndex += lastIndex + 1;
				}

				MsgPrinter.printStatusMsgTimestamp("...done");
			} catch (Exception e) {
				MsgPrinter.printErrorMsg("\nCould not call ASSERT:\n" + e.getMessage());
				System.exit(1);
			}
		}

		// if caching is enabled, write new parses to cache and merge parses from cache and ASSERT
		if (CACHING) {
			FileCache cache = new FileCache(CACHE_DIR);
			for (int i = 0; i < parses.length; i++) {
				if (parses[i] != null) cache.write(ss[i], parses[i]);  // write to cache
				allParses[originalIndices.get(i)] = parses[i];  // merge with results from cache
			}
		} else {
			allParses = parses;
		}

		// return an empty array for sentences that could not be parsed
		for (int i = 0; i < allParses.length; i++)
			if (allParses[i] == null) allParses[i] = new String[0];

		return allParses;
	}

	/**
	 * Searches the original string for the first match of the given regular expression, in which each blank is
	 * replaced by <code>\W*</code> so that it matches any (possibly empty) sequence of non-word characters.
	 *
	 * @param regex regular expression derived from the tokenized text
	 * @param original string to search
	 * @return first match or <code>null</code> if there is none
	 */
	private static String findInOriginal(String regex, String original) {
		Matcher m = Pattern.compile(regex.replace(" ", "\\W*")).matcher(original);
		return m.find() ? m.group(0) : null;
	}

	/**
	 * This untokenizer is tailored for ASSERT. It does not only remove abundant blanks but it also tries to restore
	 * special characters that have been dropped by ASSERT.
	 *
	 * @param text text to untokenize
	 * @param original string that contains the original text as a subsequence
	 * @return subsequence of the original string or <code>null</code>, iff there is no such subsequence
	 */
	public static String untokenize(String text, String original) {
		// try with boundary matchers
		String match = findInOriginal(RegexConverter.strToRegexWithBounds(text), original);
		if (match != null) return match;

		// try without boundary matchers; null if untokenization failed
		return findInOriginal(RegexConverter.strToRegex(text), original);
	}
}