edu.stanford.nlp.util.StringUtils Java Examples
The following examples show how to use
edu.stanford.nlp.util.StringUtils.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: FrenchPostprocessor.java From phrasal with GNU General Public License v3.0 | 6 votes |
/** * A main method for training and evaluating the postprocessor. * * @param args */ public static void main(String[] args) { // Strips off hyphens Properties options = StringUtils.argsToProperties(args, optionArgDefs()); if (options.containsKey("help") || args.length == 0) { System.err.println(usage(FrenchPostprocessor.class.getName())); System.exit(-1); } int nThreads = PropertiesUtils.getInt(options, "nthreads", 1); FrenchPreprocessor preProcessor = new FrenchPreprocessor(); FrenchPostprocessor postProcessor = new FrenchPostprocessor(options); CRFPostprocessor.setup(postProcessor, preProcessor, options); CRFPostprocessor.execute(nThreads, preProcessor, postProcessor); }
Example #2
Source File: POSTaggerTest.java From JHazm with MIT License | 6 votes |
/**
 * Tags a fixed five-token sentence with {@code POSTagger.batchTag} and checks that the result has
 * the expected number of tagged words and that every predicted tag equals the expected tag.
 */
@Test
public void batchTagTest() throws IOException {
  POSTagger tagger = new POSTagger();

  String[] input = new String[] { "من", "به", "مدرسه", "رفته بودم", "."};
  String[] expectedTags = new String[] { "PR", "PREP", "N", "V", "PUNC" };

  List<TaggedWord> expected = new ArrayList<>();
  for (int i = 0; i < input.length; i++) {
    expected.add(new TaggedWord(input[i], expectedTags[i]));
  }

  List<TaggedWord> actual = tagger.batchTag(Arrays.asList(input));

  // The same failure message is used for the size check and for every per-word check.
  String failure = "Failed to tagged words of '" + StringUtils.join(input, " ") + "' sentence";
  assertEquals(failure, expected.size(), actual.size());

  for (int i = 0; i < expected.size(); i++) {
    TaggedWord got = actual.get(i);
    TaggedWord want = expected.get(i);
    if (got.tag().equals(want.tag())) {
      continue;
    }
    // Only reached on a tag mismatch; comparing the whole words reports both in the failure.
    assertEquals(failure, want, got);
  }
}
Example #3
Source File: SyntheticRules.java From phrasal with GNU General Public License v3.0 | 6 votes |
private static <TK,FV> int tgtCompoundCnt(IString srcToken, IString tgtToken, List<DynamicTranslationModel<FV>> tmList) { int tgtSize = tgtToken.length(); int cnt = 0; // each compound should have at least 4 characters for(int k = 4; k < tgtSize - 3 ; ++k) { IString[] preSuffixes = new IString[3]; preSuffixes[0] = new IString(tgtToken.subSequence(0, k).toString()); // prefix preSuffixes[1] = new IString(tgtToken.subSequence(tgtSize - k , tgtSize).toString()); // suffix preSuffixes[2] = new IString(StringUtils.capitalize(tgtToken.subSequence(tgtSize - k, tgtSize).toString())); // capitalized suffix int mMax = preSuffixes[1] == preSuffixes[2] ? 2 : 3; for(int m = 0; m < mMax; ++m) { IString tgt = preSuffixes[m]; if(!tgtGarbageCollection(tgt, tmList)) { cnt += tmList.stream().mapToInt(tm -> tm.getJointLexCount(srcToken, tgt)).sum(); } } } return cnt; }
Example #4
Source File: SyntheticRules.java From phrasal with GNU General Public License v3.0 | 6 votes |
private static <TK,FV> int srcCompoundCnt(IString srcToken, IString tgtToken, List<DynamicTranslationModel<FV>> tmList) { int srcSize = srcToken.length(); int cnt = 0; // each compound should have at least 4 characters for(int k = 4; k < srcSize - 3 ; ++k) { IString[] preSuffixes = new IString[3]; preSuffixes[0] = new IString(srcToken.subSequence(0, k).toString()); // prefix preSuffixes[1] = new IString(srcToken.subSequence(srcSize - k , srcSize).toString()); // suffix preSuffixes[2] = new IString(StringUtils.capitalize(srcToken.subSequence(srcSize - k, srcSize).toString())); // capitalized suffix int mMax = preSuffixes[1] == preSuffixes[2] ? 2 : 3; for(int m = 0; m < mMax; ++m) { IString src = preSuffixes[m]; if(!srcGarbageCollection(src, tmList)) { cnt += tmList.stream().mapToInt(tm -> tm.getJointLexCount(src, tgtToken)).sum(); } } } return cnt; }
Example #5
Source File: SpanishPostprocessor.java From phrasal with GNU General Public License v3.0 | 6 votes |
/** * A main method for training and evaluating the postprocessor. * * @param args */ public static void main(String[] args) { // Strips off hyphens Properties options = StringUtils.argsToProperties(args, optionArgDefs()); if (options.containsKey("help") || args.length == 0) { System.err.println(usage(SpanishPostprocessor.class.getName())); System.exit(-1); } int nThreads = PropertiesUtils.getInt(options, "nthreads", 1); SpanishPreprocessor preProcessor = new SpanishPreprocessor(); SpanishPostprocessor postProcessor = new SpanishPostprocessor(options); CRFPostprocessor.setup(postProcessor, preProcessor, options); CRFPostprocessor.execute(nThreads, preProcessor, postProcessor); }
Example #6
Source File: EnglishPostprocessor.java From phrasal with GNU General Public License v3.0 | 6 votes |
/** * A main method for training and evaluating the postprocessor. * * @param args */ public static void main(String[] args) { // Strips off hyphens Properties options = StringUtils.argsToProperties(args, optionArgDefs()); if (options.containsKey("help") || args.length == 0) { System.err.println(usage(EnglishPostprocessor.class.getName())); System.exit(-1); } int nThreads = PropertiesUtils.getInt(options, "nthreads", 1); EnglishPreprocessor preProcessor = new EnglishPreprocessor(); EnglishPostprocessor postProcessor = new EnglishPostprocessor(options); CRFPostprocessor.setup(postProcessor, preProcessor, options); CRFPostprocessor.execute(nThreads, preProcessor, postProcessor); }
Example #7
Source File: GermanPostprocessor.java From phrasal with GNU General Public License v3.0 | 6 votes |
/** * A main method for training and evaluating the postprocessor. * * @param args */ public static void main(String[] args) { // Strips off hyphens Properties options = StringUtils.argsToProperties(args, optionArgDefs()); if (options.containsKey("help") || args.length == 0) { System.err.println(usage(GermanPostprocessor.class.getName())); System.exit(-1); } int nThreads = PropertiesUtils.getInt(options, "nthreads", 1); GermanPreprocessor preProcessor = new GermanPreprocessor(); GermanPostprocessor postProcessor = new GermanPostprocessor(options); CRFPostprocessor.setup(postProcessor, preProcessor, options); CRFPostprocessor.execute(nThreads, preProcessor, postProcessor); }
Example #8
Source File: Phrasal.java From phrasal with GNU General Public License v3.0 | 6 votes |
/** * Run Phrasal from the command line. * * @param args * @throws Exception */ public static void main(String[] args) throws Exception { final Properties options = StringUtils.argsToProperties(args); final String configFile = options.containsKey("") ? (String) options.get("") : null; options.remove(""); if ((options.size() == 0 && configFile == null) || options.containsKey("help") || options.containsKey("h")) { System.err.println(usage()); System.exit(-1); } // by default, exit on uncaught exception Thread.setDefaultUncaughtExceptionHandler((t, ex) -> { logger.fatal("Uncaught top-level exception", ex); System.exit(-1); }); final Map<String, List<String>> configuration = getConfigurationFrom(configFile, options); final Phrasal p = Phrasal.loadDecoder(configuration); if (options.containsKey("text")) p.decode(new FileInputStream(new File(options.getProperty("text"))), true); else p.decode(System.in, true); }
Example #9
Source File: InteractiveDriver.java From InformationExtraction with GNU General Public License v3.0 | 6 votes |
/**
 * Interactive console loop: annotates each entered line with a CoreNLP pipeline and prints to
 * standard error the KBP triples whose relation is one of a fixed set of relations.
 *
 * @param args command-line arguments, converted to pipeline properties
 * @throws IOException if reading from the console fails
 */
public static void main(String[] args) throws IOException {
  Properties props = StringUtils.argsToProperties(args);
  // These two settings always override whatever was supplied on the command line.
  props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,regexner,parse,mention,coref,kbp");
  props.setProperty("regexner.mapping", "ignorecase=true,validpospattern=^(NN|JJ).*,edu/stanford/nlp/models/kbp/regexner_caseless.tab;edu/stanford/nlp/models/kbp/regexner_cased.tab");

  // Relations that get printed; every other relation is ignored.
  Set<String> interested = Stream.of("per:title", "per:employee_of", "org:top_members/employees").collect(Collectors.toSet());

  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

  IOUtils.console("sentence> ", line -> {
    Annotation document = new Annotation(line);
    pipeline.annotate(document);
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      sentence.get(CoreAnnotations.KBPTriplesAnnotation.class).forEach(triple -> {
        String relation = triple.relationGloss();
        if (!interested.contains(relation)) {
          return;
        }
        System.err.println(triple);
      });
    }
  });
}
Example #10
Source File: KBPTest.java From InformationExtraction with GNU General Public License v3.0 | 6 votes |
/**
 * Debugging console for relation extraction: annotates each entered line, prints every KBP
 * triple of each sentence to standard error, then prints the sentence to standard output.
 *
 * @param args command-line arguments, converted to pipeline properties
 * @throws IOException if reading from the console fails
 */
public static void main(String[] args) throws IOException {
  Properties props = StringUtils.argsToProperties(args);
  // These two settings always override whatever was supplied on the command line.
  props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,regexner,parse,mention,coref,kbp");
  props.setProperty("regexner.mapping", "ignorecase=true,validpospattern=^(NN|JJ).*,edu/stanford/nlp/models/kbp/regexner_caseless.tab;edu/stanford/nlp/models/kbp/regexner_cased.tab");

  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

  IOUtils.console("sentence> ", line -> {
    Annotation document = new Annotation(line);
    pipeline.annotate(document);
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      sentence.get(CoreAnnotations.KBPTriplesAnnotation.class).forEach(triple -> System.err.println(triple));
      System.out.println(sentence);
    }
  });
}
Example #11
Source File: JavaReExTest.java From InformationExtraction with GNU General Public License v3.0 | 6 votes |
public static void main(String[] args){ try{ Properties props = StringUtils.argsToProperties(args); // props.setProperty("annotators", "tokenize,ssplit,lemma,pos,parse,ner"); StanfordCoreNLP pipeline = new StanfordCoreNLP(); String sentence = "John Gerspach was named Chief Financial Officer of Citi in July 2009."; Annotation doc = new Annotation(sentence); pipeline.annotate(doc); RelationExtractorAnnotator r = new RelationExtractorAnnotator(props); r.annotate(doc); for(CoreMap s: doc.get(CoreAnnotations.SentencesAnnotation.class)){ System.out.println("For sentence " + s.get(CoreAnnotations.TextAnnotation.class)); List<RelationMention> rls = s.get(RelationMentionsAnnotation.class); for(RelationMention rl: rls){ System.out.println(rl.toString()); } } }catch(Exception e){ e.printStackTrace(); } }
Example #12
Source File: CRFPreprocessor.java From phrasal with GNU General Public License v3.0 | 5 votes |
/**
 * Builds a CRF classifier from a whitespace-separated option string and loads the serialized
 * model named by its {@code -loadClassifier} flag.
 *
 * @param options command-line style flags in a single string, e.g. {@code "-loadClassifier path"}
 * @return the classifier with its model and tag index loaded
 * @throws IllegalArgumentException if the options do not set {@code loadClassifier}
 */
public static CRFClassifier<CoreLabel> loadClassifier(String options) throws IllegalArgumentException {
  // Split on runs of whitespace so repeated, leading or trailing blanks do not produce empty
  // tokens that would be handed to the argument parser.
  String[] inputFlags = options.trim().split("\\s+");
  Properties props = StringUtils.argsToProperties(inputFlags);
  SeqClassifierFlags flags = new SeqClassifierFlags(props);

  // Reject a missing model path before constructing the classifier.
  if (flags.loadClassifier == null) {
    throw new IllegalArgumentException("missing -loadClassifier flag for CRF preprocessor.");
  }

  CRFClassifier<CoreLabel> crfSegmenter = new CRFClassifier<>(flags);
  crfSegmenter.loadClassifierNoExceptions(flags.loadClassifier, props);
  crfSegmenter.loadTagIndex();
  return crfSegmenter;
}
Example #13
Source File: SentenceLevelEvaluation.java From phrasal with GNU General Public License v3.0 | 5 votes |
/**
 * Scores each line read from standard input against the reference translations for the same
 * segment and prints one score per line to standard output.
 *
 * <p>The positional arguments are the reference files. Options read here: {@code order},
 * {@code no-nist} and {@code metric} (default {@code bleu}).
 *
 * @param args command-line arguments
 * @throws IOException if reading the references or standard input fails
 */
public static void main(String[] args) throws IOException {
  if (args.length < 1) {
    System.err.print(usage());
    System.exit(-1);
  }
  Properties options = StringUtils.argsToProperties(args, argDefs());
  int ngramOrder = PropertiesUtils.getInt(options, "order", BLEUMetric.DEFAULT_MAX_NGRAM_ORDER);
  boolean disableTokenization = PropertiesUtils.getBool(options, "no-nist", false);
  String metric = options.getProperty("metric", "bleu");

  // Positional arguments are stored under the empty key. When only flags were given the key is
  // absent, so report usage instead of failing with a NullPointerException.
  String refList = options.getProperty("");
  if (refList == null) {
    System.err.print(usage());
    System.exit(-1);
  }
  String[] refs = refList.split("\\s+");
  List<List<Sequence<IString>>> referencesList = MetricUtils.readReferences(refs, ! disableTokenization);
  System.err.printf("Metric: %s with %d references%n", metric, referencesList.get(0).size());

  LineNumberReader reader = new LineNumberReader(new InputStreamReader(System.in));
  int sourceInputId = 0;
  for (String line; (line = reader.readLine()) != null; ++sourceInputId) {
    line = disableTokenization ? line : NISTTokenizer.tokenize(line);
    Sequence<IString> translation = IStrings.tokenize(line);
    // The n-th input line is scored against the n-th entry of the reference list.
    double score = getScore(translation, referencesList.get(sourceInputId), ngramOrder, metric);
    System.out.printf("%.4f%n", score);
  }
  System.err.printf("Scored %d input segments%n", sourceInputId);
}
Example #14
Source File: SerializedDependencyToCoNLL.java From phrasal with GNU General Public License v3.0 | 5 votes |
public static void main(String[] args) { Properties options = StringUtils.argsToProperties(args, optionArgDefs()); String annotations = PropertiesUtils.get(options, "annotations", null, String.class); boolean changepreps = PropertiesUtils.getBool(options, "changepreps", false); int sentenceCount = CoreNLPCache.loadSerialized(annotations); CoreMap sentence; for (int i = 0; i < sentenceCount; i++) { try { sentence = CoreNLPCache.get(i); if (sentence == null) { System.out.println(); System.err.println("Empty sentence #" + i); continue; } printDependencies(sentence, changepreps); //System.err.println("---------------------------"); } catch (Exception e) { System.err.println("SourceSentence #" + i); e.printStackTrace(); return; } } }
Example #15
Source File: BasicNBestEntry.java From phrasal with GNU General Public License v3.0 | 5 votes |
/**
 * Parses one n-best list line whose fields are separated by {@code |||}.
 *
 * <p>Field 0 supplies the sentence number, field 1 the tokens, and field 3 the score.
 *
 * @param input the raw n-best line; it is stored unchanged and parsed after trimming
 */
public BasicNBestEntry(String input) {
  line = input;

  final List<List<String>> parsed = StringUtils.splitFieldsFast(input.trim(), "|||");
  final String idText = parsed.get(0).get(0);
  sentenceNumber = Integer.valueOf(idText);
  tokens = IStrings.toIStringSequence(parsed.get(1));
  final String scoreText = parsed.get(3).get(0);
  score = Float.parseFloat(scoreText);
}
Example #16
Source File: MinimumBayesRisk.java From phrasal with GNU General Public License v3.0 | 5 votes |
/**
 * Rescores every n-best list of the input file with a multi-threaded processor and dumps the
 * rescored lists as they become available.
 *
 * <p>Options read here: {@code s} (scale), {@code o} (orientation, {@code utility} by default;
 * {@code risk} switches to risk) and {@code m} (metric name). The value under the empty key is
 * used as the n-best file name.
 *
 * @param args command-line arguments
 * @throws IOException if reading the n-best lists fails
 */
public static void main(String[] args) throws IOException {
  if (args.length < 1) {
    System.err.print(usage());
    System.exit(-1);
  }

  Properties options = StringUtils.argsToProperties(args, argDefs());
  final double scale = PropertiesUtils.getDouble(options, "s", DEFAULT_SCALE);
  final String orientation = options.getProperty("o", "utility");
  final boolean risk = "risk".equals(orientation);
  final String metricName = options.getProperty("m", DEFAULT_METRIC);
  final String filename = options.getProperty("");

  BasicNBestList nbestlists = new BasicNBestList(filename);
  Processor rescorer = new Processor(metricName, risk, scale);
  MulticoreWrapper<List<BasicNBestEntry>, List<Pair<Double, String>>> wrapper =
      new MulticoreWrapper<List<BasicNBestEntry>, List<Pair<Double, String>>>(0, rescorer, true);

  for (List<BasicNBestEntry> nbestlist : nbestlists) {
    wrapper.put(nbestlist);
    // Drain whatever results are ready after each submission.
    for (; wrapper.peek(); ) {
      DumpRescored(wrapper.poll());
    }
  }

  // Wait for the outstanding work, then drain the remaining results.
  wrapper.join();
  for (; wrapper.peek(); ) {
    DumpRescored(wrapper.poll());
  }
}
Example #17
Source File: DependencyBnBPreorderer.java From phrasal with GNU General Public License v3.0 | 5 votes |
/**
 * Reorders the children of every internal node of {@code tree} in breadth-first order and
 * returns the words of the resulting tree joined into one string.
 *
 * <p>Only nodes with fewer than eight children are searched for a permutation. When the search
 * yields no result a warning is printed and the node keeps its order.
 *
 * @param tree the tree to reorder in place
 * @return the joined yield of the reordered tree
 */
private static String preorder(Tree tree) {
  List<Tree> pending = new LinkedList<>();
  pending.add(tree);

  while (!pending.isEmpty()) {
    Tree node = pending.remove(0);
    if (node.isLeaf()) {
      continue;
    }

    Tree[] children = node.children();
    int childCount = children.length;
    IndexedWord headWord = (IndexedWord) node.label();

    List<FeatureNode> featureNodes = new ArrayList<>(childCount);
    for (Tree child : children) {
      featureNodes.add(new FeatureNode(child, headWord));
      pending.add(child);
    }

    if (childCount >= 8) {
      continue;
    }

    Pair<Double, List<Integer>> result = search(featureNodes, new LinkedList<Integer>(), Double.NEGATIVE_INFINITY);
    if (result == null) {
      System.err.println("Warning: No path found.");
      continue;
    }

    // permutation.get(i) is the new position of the i-th original child.
    List<Integer> permutation = result.second;
    List<Tree> reordered = new ArrayList<>(Arrays.asList(children));
    for (int i = 0; i < childCount; i++) {
      int target = permutation.get(i);
      reordered.set(target, children[i]);
    }
    node.setChildren(reordered);
  }

  return StringUtils.join(tree.yieldWords());
}
Example #18
Source File: MultiLangsStanfordCoreNLPClient.java From blog-codes with Apache License 2.0 | 5 votes |
/** * The main constructor. Create a client from a properties file and a list of backends. * Note that this creates at least one Daemon thread. * * @param properties The properties file, as would be passed to {@link StanfordCoreNLP}. * @param backends The backends to run on. * @param apiKey The key to authenticate with as a username * @param apiSecret The key to authenticate with as a password */ private MultiLangsStanfordCoreNLPClient(Properties properties, List<Backend> backends, String apiKey, String apiSecret, String lang) { this.lang = lang; // Save the constructor variables this.properties = properties; Properties serverProperties = new Properties(); for (String key : properties.stringPropertyNames()) { serverProperties.setProperty(key, properties.getProperty(key)); } Collections.shuffle(backends, new Random(System.currentTimeMillis())); this.scheduler = new BackendScheduler(backends); this.apiKey = apiKey; this.apiSecret = apiSecret; // Set required serverProperties serverProperties.setProperty("inputFormat", "serialized"); serverProperties.setProperty("outputFormat", "serialized"); serverProperties.setProperty("inputSerializer", ProtobufAnnotationSerializer.class.getName()); serverProperties.setProperty("outputSerializer", ProtobufAnnotationSerializer.class.getName()); // Create a list of all the properties, as JSON map elements List<String> jsonProperties = serverProperties.stringPropertyNames().stream().map(key -> '"' + StringUtils.escapeJsonString(key) + "\": \"" + StringUtils.escapeJsonString(serverProperties.getProperty(key)) + '"') .collect(Collectors.toList()); // Create the JSON object this.propsAsJSON = "{ " + StringUtils.join(jsonProperties, ", ") + " }"; // Start 'er up this.scheduler.start(); }
Example #19
Source File: GenericDBReader.java From ambiverse-nlu with Apache License 2.0 | 5 votes |
/**
 * Returns the facts that match the given subject, predicate and object.
 *
 * <p>Each non-null argument adds an equality condition; the conditions are combined with
 * {@code AND}. Values are bound as statement parameters instead of being concatenated into the
 * SQL text. NOTE(review): when all three arguments are null the query ends in a bare
 * {@code WHERE}, exactly as before this change; callers are presumably expected to pass at
 * least one value, but this should be confirmed.
 *
 * @param subject the subject to match, or {@code null} for any
 * @param relation the predicate to match, or {@code null} for any
 * @param object the object to match, or {@code null} for any
 * @return the matching facts, in result-set order
 * @throws SQLException if the query fails
 */
public List<Fact> getFacts(String subject, String relation, String object) throws SQLException {
  List<String> queryCondition = new LinkedList<>();
  List<String> boundValues = new LinkedList<>();
  if (subject != null) {
    queryCondition.add("subject = ?");
    boundValues.add(subject);
  }
  if (relation != null) {
    queryCondition.add("predicate = ?");
    boundValues.add(relation);
  }
  if (object != null) {
    queryCondition.add("object = ?");
    boundValues.add(object);
  }
  String conditionString = StringUtils.join(queryCondition, " AND ");
  String sql = "SELECT id, subject, object, predicate FROM " + tableName + " WHERE " + conditionString;

  List<Fact> triples = new LinkedList<>();
  Connection con = EntityLinkingManager.getConnectionForDatabase(EntityLinkingManager.DB_YAGO);
  try (java.sql.PreparedStatement stmt = con.prepareStatement(sql)) {
    // JDBC parameter indexes are 1-based and follow the order the conditions were added.
    int parameterIndex = 1;
    for (String value : boundValues) {
      stmt.setString(parameterIndex++, value);
    }
    try (ResultSet rs = stmt.executeQuery()) {
      while (rs.next()) {
        triples.add(new Fact(rs.getString("id"), rs.getString("subject"), rs.getString("predicate"), rs.getString("object")));
      }
    }
  } finally {
    // Hand the connection back even when the query throws.
    EntityLinkingManager.releaseConnection(con);
  }
  return triples;
}
Example #20
Source File: MakeWordClasses.java From phrasal with GNU General Public License v3.0 | 5 votes |
/**
 * Builds word classes from the input files given as positional arguments and writes the result
 * to standard output. Prints the usage and exits when no file is given or help is requested.
 *
 * @param args command-line arguments
 */
public static void main(String[] args) {
  final Properties options = StringUtils.argsToProperties(args, optionArgDefs());
  // Positional arguments are stored under the empty key; default to "" when there are none.
  final String[] filenames = options.getProperty("","").split("\\s+");

  final boolean noInput = filenames.length < 1 || filenames[0].length() == 0;
  final boolean helpRequested = options.containsKey("h") || options.containsKey("help");
  if (noInput || helpRequested) {
    System.err.println(usage());
    System.exit(-1);
  }

  final MakeWordClasses wordClasses = new MakeWordClasses(options);
  wordClasses.run(filenames);
  wordClasses.writeResults(System.out);
}
Example #21
Source File: PhraseViewer.java From phrasal with GNU General Public License v3.0 | 5 votes |
private static boolean validateCommandLine(String[] args) { // Command line parsing Properties options = StringUtils.argsToProperties(args, argDefs()); VERBOSE = options.containsKey("v"); SRC_FILE = options.getProperty("s", null); OPTS_FILE = options.getProperty("o", null); XSD_FILE = options.getProperty("x", null); FIRST_ID = PropertiesUtils.getInt(options, "f", Integer.MIN_VALUE); LAST_ID = PropertiesUtils.getInt(options,"l",Integer.MAX_VALUE); return true; }
Example #22
Source File: AskServlet.java From QuestionAnsweringSystem with Apache License 2.0 | 5 votes |
/**
 * Processes requests for both HTTP {@code GET} and {@code POST} methods.
 *
 * <p>Reads the question from parameter {@code q} and an optional count from parameter
 * {@code n}, answers the question when its trimmed length exceeds three characters, and writes
 * the candidate answers as JSON.
 *
 * @param request servlet request
 * @param response servlet response
 * @throws ServletException if a servlet-specific error occurs
 * @throws IOException if an I/O error occurs
 */
protected void processRequest(HttpServletRequest request, HttpServletResponse response)
    throws ServletException, IOException {
  response.setContentType("application/json;charset=UTF-8");
  response.setCharacterEncoding("UTF-8");
  request.setCharacterEncoding("UTF-8");

  String questionStr = request.getParameter("q");
  String n = request.getParameter("n");
  int topN = -1;
  if (n != null && StringUtils.isNumeric(n)) {
    try {
      topN = Integer.parseInt(n);
    } catch (NumberFormatException ignored) {
      // A numeric-looking value can still be rejected by parseInt (for example one too large
      // for an int). Keep the default instead of failing the whole request.
    }
  }

  Question question = null;
  List<CandidateAnswer> candidateAnswers = null;
  if (questionStr != null && questionStr.trim().length() > 3) {
    question = SharedQuestionAnsweringSystem.getInstance().answerQuestion(questionStr);
    if (question != null) {
      candidateAnswers = question.getAllCandidateAnswer();
    }
  }
  LOG.info("问题:"+questionStr);

  try (PrintWriter out = response.getWriter()) {
    String json = JsonGenerator.generate(candidateAnswers, topN);
    out.println(json);
    LOG.info("答案:"+json);
  }
}
Example #23
Source File: TrainNerModel.java From InformationExtraction with GNU General Public License v3.0 | 5 votes |
/**
 * Trains a CRF model from the configured training property file and serializes it to the
 * location named by the {@code serializeTo} property.
 *
 * @param args unused
 */
public static void main(String[] args) {
  final Properties trainingProps = StringUtils.propFileToProperties(IntelConfig.DEPARTMENT_TRAIN_PROPERTY);
  final SeqClassifierFlags flags = new SeqClassifierFlags(trainingProps);

  final CRFClassifier<CoreLabel> classifier = new CRFClassifier<CoreLabel>(flags);
  classifier.train();

  final String modelPath = trainingProps.getProperty("serializeTo");
  classifier.serializeClassifier(modelPath);
  System.out.println("Build model to " + modelPath);
}
Example #24
Source File: YAGO3DBReader.java From ambiverse-nlu with Apache License 2.0 | 5 votes |
/**
 * Returns the facts that match the given subject, predicate and object.
 *
 * <p>Each non-null argument adds an equality condition; the conditions are combined with
 * {@code AND}. Values are bound as statement parameters instead of being concatenated into the
 * SQL text. NOTE(review): when all three arguments are null the query ends in a bare
 * {@code WHERE}, exactly as before this change; callers are presumably expected to pass at
 * least one value, but this should be confirmed.
 *
 * @param subject the subject to match, or {@code null} for any
 * @param relation the predicate to match, or {@code null} for any
 * @param object the object to match, or {@code null} for any
 * @return the matching facts, in result-set order
 * @throws SQLException if the query fails
 */
public List<Fact> getFacts(String subject, String relation, String object) throws SQLException {
  List<String> queryCondition = new LinkedList<>();
  List<String> boundValues = new LinkedList<>();
  if (subject != null) {
    queryCondition.add("subject=?");
    boundValues.add(subject);
  }
  if (relation != null) {
    queryCondition.add("predicate=?");
    boundValues.add(relation);
  }
  if (object != null) {
    queryCondition.add("object=?");
    boundValues.add(object);
  }
  String conditionString = StringUtils.join(queryCondition, " AND ");
  String sql = "SELECT id, subject, object, predicate FROM " + tableName + " WHERE " + conditionString;

  List<Fact> triples = new LinkedList<>();
  Connection con = EntityLinkingManager.getConnectionForDatabase(EntityLinkingManager.DB_YAGO);
  try (java.sql.PreparedStatement stmt = con.prepareStatement(sql)) {
    // JDBC parameter indexes are 1-based and follow the order the conditions were added.
    int parameterIndex = 1;
    for (String value : boundValues) {
      stmt.setString(parameterIndex++, value);
    }
    try (ResultSet rs = stmt.executeQuery()) {
      while (rs.next()) {
        triples.add(new Fact(rs.getString("id"), rs.getString("subject"), rs.getString("predicate"), rs.getString("object")));
      }
    }
  } finally {
    // Hand the connection back even when the query throws.
    EntityLinkingManager.releaseConnection(con);
  }
  return triples;
}
Example #25
Source File: LodCsvTableWriter.java From winter with Apache License 2.0 | 4 votes |
/**
 * Writes the values as one line of double-quoted, comma-separated fields and then empties the
 * list.
 *
 * <p>Values are written as they are; quotes inside a value are not escaped.
 *
 * @param values the fields of the line; cleared after writing
 * @param w the writer that receives the line
 * @throws IOException if writing fails
 */
protected void writeLine(List<String> values, BufferedWriter w) throws IOException {
  // Joining with "," (quotes included) plus the outer quotes wraps every field in quotes.
  final String joined = StringUtils.join(values, "\",\"");
  final String row = "\"" + joined + "\"\n";
  w.write(row);
  values.clear();
}
Example #26
Source File: BuildDependencyLMData.java From phrasal with GNU General Public License v3.0 | 4 votes |
/**
 * Reads a tab-separated, blank-line-delimited dependency file and writes the dependency
 * language-model data to {@code left.deplm}, {@code right.deplm} and {@code head.deplm} in the
 * output directory.
 *
 * <p>Columns used per token line: 0 (1-based id), 1 (token), 6 (1-based head id) and 7 (relation).
 * A token whose relation is {@code frag} gets head {@code -2}.
 *
 * @param args command-line arguments; {@code input} and {@code outdir} are read
 * @throws IOException if a file cannot be created, read or written
 */
public static void main(String[] args) throws IOException {
  Properties options = StringUtils.argsToProperties(args, optionArgDefs());
  String sourceTokens = PropertiesUtils.get(options, "input", null, String.class);
  String outdirPath = PropertiesUtils.get(options, "outdir", ".", String.class);

  String outputPrefix = outdirPath + File.separator;
  File leftFile = new File(outputPrefix + "left.deplm");
  if (!leftFile.exists()) {
    leftFile.createNewFile();
  }
  File rightFile = new File(outputPrefix + "right.deplm");
  if (!rightFile.exists()) {
    rightFile.createNewFile();
  }
  File headFile = new File(outputPrefix + "head.deplm");
  if (!headFile.exists()) {
    headFile.createNewFile();
  }

  FileWriter leftOut = new FileWriter(leftFile.getAbsoluteFile());
  FileWriter rightOut = new FileWriter(rightFile.getAbsoluteFile());
  FileWriter headOut = new FileWriter(headFile.getAbsoluteFile());

  leftDepLMWriter = new BufferedWriter(leftOut);
  rightDepLMWriter = new BufferedWriter(rightOut);
  headDepLMWriter = new BufferedWriter(headOut);

  BufferedReader sourceReader = new BufferedReader(new FileReader(new File(sourceTokens)));

  Map<Integer, NavigableSet<Integer>> dependencies = new HashMap<>();
  List<String> tokens = new LinkedList<>();

  for (;;) {
    String sourceSentence = sourceReader.readLine();
    boolean endOfInput = sourceSentence == null;

    // A blank line or the end of the input closes the current sentence.
    if (endOfInput || sourceSentence.equals("")) {
      printLeftAndRightDependencies(dependencies, new ArraySequence<String>(tokens));
      dependencies = new HashMap<>();
      tokens = new LinkedList<>();
      if (endOfInput) {
        break;
      }
      continue;
    }

    String[] fields = sourceSentence.split("\t");
    int id = Integer.parseInt(fields[0]) - 1;
    int head;
    if (fields[7].equals("frag")) {
      head = -2;
    } else {
      head = Integer.parseInt(fields[6]) - 1;
    }
    String token = fields[1];
    tokens.add(token);

    // Only tokens accepted by isWord are recorded as dependents.
    if (isWord(token)) {
      if (!dependencies.containsKey(head)) {
        dependencies.put(head, new TreeSet<Integer>());
      }
      if (!dependencies.containsKey(id)) {
        dependencies.put(id, new TreeSet<Integer>());
      }
      dependencies.get(head).add(id);
    }
  }

  sourceReader.close();
  leftDepLMWriter.close();
  rightDepLMWriter.close();
  headDepLMWriter.close();
}
Example #27
Source File: PhrasalService.java From phrasal with GNU General Public License v3.0 | 4 votes |
/** * Start the service. * * @param args */ public static void main(String[] args) { Properties options = StringUtils.argsToProperties(args, optionArgDefs()); int port = PropertiesUtils.getInt(options, "p", DEFAULT_HTTP_PORT); boolean loadMockServlet = PropertiesUtils.getBool(options, "m", false); boolean localHost = PropertiesUtils.getBool(options, "l", false); String uiFile = options.getProperty("u", "debug.html"); String resourcePath = options.getProperty("r", "."); // Parse arguments String argList = options.getProperty("",null); String[] parsedArgs = argList == null ? null : argList.split("\\s+"); if (parsedArgs == null || parsedArgs.length != 1) { System.out.println(usage()); System.exit(-1); } String phrasalIniFile = parsedArgs[0]; // Setup the jetty server Server server = new Server(); // Jetty 8 way of configuring the server // Connector connector = new SelectChannelConnector(); // connector.setPort(port); // server.addConnector(connector); // Jetty9 way of configuring the server ServerConnector connector = new ServerConnector(server); connector.setPort(port); server.addConnector(connector); if (localHost) { connector.setHost(DEBUG_URL); } // Setup the servlet context ServletContextHandler context = new ServletContextHandler(ServletContextHandler.SESSIONS); context.setContextPath("/"); // Add Phrasal servlet PhrasalServlet servlet = loadMockServlet ? new PhrasalServlet() : new PhrasalServlet(phrasalIniFile); context.addServlet(new ServletHolder(servlet), SERVLET_ROOT); // TODO(spenceg): gzip compression causes an encoding problem for unicode characters // on the client. Not sure if the compression or decompression is the problem. 
// EnumSet<DispatcherType> dispatches = EnumSet.of(DispatcherType.REQUEST, DispatcherType.ASYNC); // context.addFilter(new FilterHolder(new IncludableGzipFilter()), "/t", dispatches); // Add debugging web-page ResourceHandler resourceHandler = new ResourceHandler(); resourceHandler.setWelcomeFiles(new String[]{ uiFile }); resourceHandler.setResourceBase(resourcePath); HandlerList handlers = new HandlerList(); handlers.setHandlers(new Handler[] { resourceHandler, context }); server.setHandler(handlers); // Start the service try { logger.info("Starting PhrasalService on port: " + String.valueOf(port)); server.start(); server.join(); } catch (Exception e) { logger.error("Servlet crashed. Service shutting down."); e.printStackTrace(); } }
Example #28
Source File: MakePTMPhrasalInput.java From phrasal with GNU General Public License v3.0 | 4 votes |
/** * @param args * @throws IOException */ public static void main(String[] args) throws IOException { if (args.length != 3) { System.err.print(usage()); System.exit(-1); } Properties options = StringUtils.argsToProperties(args, argDefs()); String[] positionalArgs = options.getProperty("").split("\\s+"); String srcLang = positionalArgs[0]; String tgtLang = positionalArgs[1]; String sqlFile = positionalArgs[2]; Preprocessor srcPreproc = ProcessorFactory.getPreprocessor(srcLang); Preprocessor tgtPreproc = ProcessorFactory.getPreprocessor(tgtLang); System.out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s%n", "doc_id", "seg_id", "username", "mt_tok", "user_tok", "s2mt_tok", "src_tok"); // CSVReader reader = new CSVReader(new FileReader(sqlFile)); // Skip header boolean seenHeader = false; // for (String[] fields; (fields = reader.readNext()) != null;) { for (String[] fields = null;;) { if ( ! seenHeader) { seenHeader = true; continue; } // String segId = String.format("%s:%s", fields[0], fields[1]).replace(".src.json", ".tgt"); String tgtLine = fields[3].trim(); String alignStr = extend(fields[5]).trim(); String srcLine = fields[6].trim(); SymmetricalWordAlignment s2t = new SymmetricalWordAlignment(srcLine, tgtLine, alignStr); SymmetricalWordAlignment s2sPrime = srcPreproc.processAndAlign(srcLine); SymmetricalWordAlignment t2tPrime = tgtPreproc.processAndAlign(tgtLine); String userTextTok = tgtPreproc.process(fields[3]).toString(); // Want sprime --> tprime List<String> alignmentList = new LinkedList<>(); for (int i = 0, size = s2sPrime.eSize(); i < size; ++i) { Set<Integer> alignments = s2sPrime.e2f(i); for (int j : alignments) { Set<Integer> alignments2 = s2t.f2e(j); for (int k : alignments2) { Set<Integer> alignments3 = t2tPrime.f2e(k); for (int q : alignments3) { alignmentList.add(String.format("%d-%d",i,q)); } } } } System.out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s%n", fields[0], fields[1], fields[2], t2tPrime.e().toString(), userTextTok, Sentence.listToString(alignmentList), 
s2sPrime.e().toString()); } // reader.close(); }
Example #29
Source File: BLEUGenreEvaluator.java From phrasal with GNU General Public License v3.0 | 4 votes |
/** * * @param args * @throws IOException */ public static void main(String[] args) throws IOException { if (args.length < 2) { System.err.print(usage()); System.exit(-1); } Properties options = StringUtils.argsToProperties(args, argDefs()); int BLEUOrder = PropertiesUtils.getInt(options, "order", BLEUMetric.DEFAULT_MAX_NGRAM_ORDER); boolean doCased = PropertiesUtils.getBool(options, "cased", false); // Setup the metric tokenization scheme. Applies to both the references and // hypotheses if (doCased) NISTTokenizer.lowercase(false); // Load the references String[] parameters = options.getProperty("").split("\\s+"); String[] refs = new String[parameters.length - 1]; System.arraycopy(parameters, 1, refs, 0, refs.length); List<InputProperties> inputProperties = InputProperties.parse(new File(parameters[0])); List<List<Sequence<IString>>> referencesList = MetricUtils.readReferences(refs, true); Map<String,BLEUMetric<IString, String>.BLEUIncrementalMetric> metrics = BLEUGenreEvaluator.run(referencesList, inputProperties, BLEUOrder, System.in); for (Map.Entry<String,BLEUMetric<IString, String>.BLEUIncrementalMetric> entry : metrics.entrySet()) { String genre = entry.getKey(); BLEUMetric<IString, String>.BLEUIncrementalMetric incMetric = entry.getValue(); System.out.printf("Genre: %s%n", genre); double[] ngramPrecisions = incMetric.ngramPrecisions(); System.out.printf("BLEU = %.3f, ", 100 * incMetric.score()); for (int i = 0; i < ngramPrecisions.length; i++) { if (i != 0) { System.out.print("/"); } System.out.printf("%.3f", ngramPrecisions[i] * 100); } System.out.printf(" (BP=%.3f, ratio=%.3f %d/%d)%n", incMetric .brevityPenalty(), ((1.0 * incMetric.candidateLength()) / incMetric .effectiveReferenceLength()), incMetric.candidateLength(), incMetric .effectiveReferenceLength()); System.out.printf("%nPrecision Details:%n"); double[][] precCounts = incMetric.ngramPrecisionCounts(); for (int i = 0; i < ngramPrecisions.length; i++) { System.out.printf("\t%d:%d/%d%n", i, (int) 
precCounts[i][0], (int) precCounts[i][1]); } System.out.println(); } }
Example #30
Source File: Evaluate.java From phrasal with GNU General Public License v3.0 | 4 votes |
/** * * @param args * @throws IOException */ public static void main(String[] args) throws IOException { if (args.length < 2) { System.err.print(usage()); System.exit(-1); } Properties options = StringUtils.argsToProperties(args, argDefs()); boolean disableTokenization = PropertiesUtils.getBool(options, "no-nist", false); boolean doCased = PropertiesUtils.getBool(options, "cased", false); // Setup the metric tokenization scheme. Applies to both the references and // hypotheses if (doCased) NISTTokenizer.lowercase(false); NISTTokenizer.normalize( ! disableTokenization); // Load the references String[] parsedArgs = options.getProperty("").split("\\s+"); final String evalMetric = parsedArgs[0]; String[] refs= Arrays.copyOfRange(parsedArgs, 1, parsedArgs.length); final List<List<Sequence<IString>>> references = MetricUtils.readReferences(refs, true); System.out.printf("Metric: %s with %d references%n", evalMetric, refs.length); EvaluationMetric<IString,String> metric = CorpusLevelMetricFactory.newMetric(evalMetric, references); IncrementalEvaluationMetric<IString,String> incMetric = metric.getIncrementalMetric(); LineNumberReader reader = new LineNumberReader(new InputStreamReader( System.in)); for (String line; (line = reader.readLine()) != null; ) { line = NISTTokenizer.tokenize(line); Sequence<IString> translation = IStrings.tokenize(line); ScoredFeaturizedTranslation<IString, String> tran = new ScoredFeaturizedTranslation<>( translation, null, 0); incMetric.add(tran); } // Check for an incomplete set of translations if (reader.getLineNumber() < references.size()) { System.err.printf("WARNING: Translation candidate file is shorter than references (%d/%d)%n", reader.getLineNumber(), references.size()); } reader.close(); System.out.printf("%s = %.3f%n", evalMetric, 100 * Math.abs(incMetric.score())); System.out.printf("Details:%n%s%n", incMetric.scoreDetails()); }