/* * Copyright 2008 Novamente LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package relex; import java.lang.Math; import java.lang.String; import relex.feature.FeatureNode; import relex.feature.RelationCallback; import relex.stats.Histogram; /** * This class collects a miscellany of statistics about a parsed text. * * Copyright (C) 2008 Linas Vepstas <[email protected]> */ public class ParseStats { private int count; private Histogram parse_count; private int max_parses; private Histogram word_count; private int failed_parses; private Histogram first_parse_confidence; private Histogram second_parse_confidence; private Histogram third_parse_confidence; private Histogram fourth_parse_confidence; private Histogram relations; private int relcnt; public ParseStats() { count = 0; word_count = new Histogram(1,31); max_parses = 10; parse_count = new Histogram(0,max_parses); failed_parses = 0; first_parse_confidence = new Histogram(20, 0.0, 1.0); second_parse_confidence = new Histogram(20, 0.0, 1.0); third_parse_confidence = new Histogram(20, 0.0, 1.0); fourth_parse_confidence = new Histogram(20, 0.0, 1.0); relations = new Histogram(1, 21); } public void bin(Sentence sntc) { if (null == sntc) return; count ++; int nparses = sntc.getParses().size(); parse_count.bin(nparses); if (nparses <= 0) return; ParsedSentence fs = sntc.getParses().get(0); word_count.bin(fs.getNumWords()); // If the first parse has skipped words, the parse is "failed" if (fs.getNumSkippedWords() != 0) failed_parses ++; // Count the first parse only if its "good" if (fs.getNumSkippedWords() == 0) first_parse_confidence.bin(fs.getTruthValue().getConfidence()); if (2 <= nparses) second_parse_confidence.bin(sntc.getParses().get(1).getTruthValue().getConfidence()); if (3 <= nparses) third_parse_confidence.bin(sntc.getParses().get(2).getTruthValue().getConfidence()); if (4 <= nparses) fourth_parse_confidence.bin(sntc.getParses().get(3).getTruthValue().getConfidence()); // Count average number of relations per sentence. // But only for the first, most high-confidence parse. relcnt = 0; RelCount rcnt = new RelCount(); fs.foreach(rcnt); relations.bin(relcnt); } private class RelCount implements RelationCallback { public Boolean UnaryRelationCB(FeatureNode from, String rel) { return false; } public Boolean BinaryHeadCB(FeatureNode from) { return false; } public Boolean BinaryRelationCB(String relation, FeatureNode from, FeatureNode to) { relcnt ++; return false; } } public String toString() { double failed = 100.0 * ((double) failed_parses) / ((double) count); int pf = (int) Math.floor(failed+0.5); double overflow = 100.0 * ((double) parse_count.getOverflow()) / ((double) count); int ovfl = (int) Math.floor(overflow+0.5); String str = ""; str += "\nTotal sentences: " + count; str += "\nFailed parses: " + failed_parses + " Percent failed: " + pf + "%" + " (these are parses with one or more words skipped)"; str += "\nWords per sentence: " + word_count.getMean(); str += "\nParses per sentence, mode: " + parse_count.getMode() + " median: " + parse_count.getMedian() + " mean: " + parse_count.getMean() + " stddev: " + parse_count.getStdDev(); str += "\nsentences with more than " + max_parses + " parses: " + parse_count.getOverflow() + " as percent: " + ovfl + "%"; str += "\nRelations per parse, mode: " + relations.getMode() + " median: " + relations.getMedian() + " mean: " + relations.getMean() + " stddev: " + relations.getStdDev(); str += "\nConfidence of first parse: " + first_parse_confidence.getMean() + " (out of " + first_parse_confidence.getCount() + " parses)"; str += "\nFirst parse hi/lo: " + first_parse_confidence.getAllTimeHigh() + " / " + first_parse_confidence.getAllTimeLow(); str += " stddev: " + first_parse_confidence.getStdDev(); str += "\nConfidence of second parse: " + second_parse_confidence.getMean() + " (out of " + second_parse_confidence.getCount() + " parses)"; str += ", stddev: " + second_parse_confidence.getStdDev(); str += "\nConfidence of third parse: " + third_parse_confidence.getMean() + " (out of " + third_parse_confidence.getCount() + " parses)"; str += ", stddev: " + third_parse_confidence.getStdDev(); str += "\nConfidence of fourth parse: " + fourth_parse_confidence.getMean() + " (out of " + fourth_parse_confidence.getCount() + " parses)"; str += ", stddev: " + fourth_parse_confidence.getStdDev(); str += "\n"; return str; } }