org.grobid.core.utilities.TextUtilities Java Examples
The following examples show how to use
org.grobid.core.utilities.TextUtilities.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: NEDCorpusEvaluation.java From entity-fishing with Apache License 2.0 | 6 votes |
/**
 * Appends a fixed-width metrics table for one corpus to the given report.
 *
 * <p>The table starts with a header row (corpus name followed by the column
 * titles) and then holds one row per result. Each metric is multiplied by 100
 * and rendered through {@code TextUtilities.formatTwoDecimals}.
 *
 * @param report  buffer the table is appended to
 * @param corpus  label printed in the first column of the header row
 * @param results per-method results, written in iteration order
 */
private void reportMetrics(StringBuilder report, String corpus, List<Result> results) {
    // Same column widths for header and rows so the table stays aligned.
    final String headerFormat = "\n%-20s %-12s %-12s %-12s %-7s\n\n";
    final String rowFormat = "%-20s %-12s %-12s %-12s %-7s\n";

    report.append(String.format(headerFormat, corpus, "accuracy", "precision", "recall", "f1"));

    for (Result result : results) {
        Object[] cells = {
            result.method,
            TextUtilities.formatTwoDecimals(result.accuracy * 100),
            TextUtilities.formatTwoDecimals(result.precision * 100),
            TextUtilities.formatTwoDecimals(result.recall * 100),
            TextUtilities.formatTwoDecimals(result.f1 * 100)
        };
        report.append(String.format(rowFormat, cells));
    }
}
Example #2
Source File: NerdEntity.java From entity-fishing with Apache License 2.0 | 5 votes |
private String simpleStringNormalisation(String str) { // dehyphenize String result = TextUtilities.dehyphenize(str); // otherwise clean return result.replace("\n", " ").trim().replaceAll(" +", " "); }
Example #3
Source File: TextSaxHandler.java From grobid-ner with Apache License 2.0 | 5 votes |
public void endElement(String uri, String localName, String qName) throws SAXException { if (qName.equals("txt")) { String token = getText(); token = token.replace("n't", " n't"); StringTokenizer st = new StringTokenizer(token, TextUtilities.delimiters, true); List<String> currentTmpVector = new ArrayList<String>(); while (st.hasMoreTokens()) { currentTmpVector.add(st.nextToken()); } // basic re-tokenization for numbers to match Idilia tokenization //currentTmpVector = retokenize(currentTmpVector); // finally remove spaces for(String tok : currentTmpVector) { if (!tok.equals(" ")) { if (currentVector == null) { currentVector = new ArrayList<String>(); } currentVector.add(tok); } } //textVector.add(currentVector); //currentVector = null; } else if (qName.equals("sent")) { textVector.add(currentVector); currentVector = null; } accumulator.setLength(0); }
Example #4
Source File: ReutersSaxHandler.java From grobid-ner with Apache License 2.0 | 5 votes |
public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName) throws SAXException { if (qName.equals("p") || qName.equals("headline")) { String section = getText(); section = section.replace("n't", " n't"); StringTokenizer st = new StringTokenizer(section, TextUtilities.delimiters, true); //Pattern pattern = Pattern.compile("([,\\.] )|[\\n\\t\\(\\[ :;\\?!/\\)-\\\\\"“”‘’'`\\]\\*\\+]"); //Splitter splitter = new Splitter(pattern, true); //String[] tokens = splitter.split(section); //char[] cs = section.toCharArray(); //TokenizerFactory factory = IndoEuropeanTokenizerFactory.INSTANCE; //Tokenizer tokenizer = factory.tokenizer(cs,0,cs.length); //for (String tok : tokenizer) { List<String> currentTmpVector = new ArrayList<String>(); while (st.hasMoreTokens()) { currentTmpVector.add(st.nextToken()); } // basic re-tokenization for numbers to match Idilia tokenization currentTmpVector = retokenize(currentTmpVector); // finally remove spaces for(String tok : currentTmpVector) { if (!tok.equals(" ")) currentVector.add(tok); } textVector.add(currentVector); currentVector = null; } accumulator.setLength(0); }
Example #5
Source File: EvaluationUtil.java From entity-fishing with Apache License 2.0 | 4 votes |
public static LabelStat evaluate(ArticleTrainingSample testSet, List<LabelStat> stats) throws Exception { //DecimalFormat format = new DecimalFormat("#0.00"); double accumulatedRecall = 0.0; double accumulatedPrecision = 0.0; double accumulatedF1Score = 0.0; double lowerPrecision = 1.0; double lowerRecall = 1.0; double lowerF1Score = 1.0; int perfectRecall = 0; int perfectPrecision = 0; LabelStat globalStats = new LabelStat(); int i = 0; for (Article article : testSet.getSample()) { if (i == testSet.size()) break; LabelStat localStats = stats.get(i); globalStats.incrementObserved(localStats.getObserved()); globalStats.incrementExpected(localStats.getExpected()); globalStats.incrementFalsePositive(localStats.getFalsePositive()); globalStats.incrementFalseNegative(localStats.getFalseNegative()); accumulatedRecall += localStats.getRecall(); accumulatedPrecision += localStats.getPrecision(); accumulatedF1Score += localStats.getF1Score(); /*System.out.println(localStats.toString()); System.out.println("local recall: " + localStats.getRecall()); System.out.println("local precision: " + localStats.getPrecision()); System.out.println("local f1: " + localStats.getF1Score());*/ lowerPrecision = Math.min(lowerPrecision, localStats.getPrecision()); lowerRecall = Math.min(lowerRecall, localStats.getRecall()); if (localStats.getPrecision() == 1.0) perfectPrecision++; if (localStats.getRecall() == 1.0) perfectRecall++; i++; } double microAveragePrecision = 0.0; double microAverageRecall = 0.0; double microAverageF1Score = 0.0; double macroAveragePrecision = 0.0; double macroAverageRecall = 0.0; double macroAverageF1Score = 0.0; StringBuilder builder = new StringBuilder(); builder.append("\nEvaluation on " + testSet.size() + " articles "); builder.append("\n\n-- Macro-average --\n"); builder.append("precision: ").append(TextUtilities.formatFourDecimals(accumulatedPrecision / testSet.size())).append("\n"); builder.append("recall: 
").append(TextUtilities.formatFourDecimals(accumulatedRecall / testSet.size())).append("\n"); builder.append("f1-score: ").append(TextUtilities.formatFourDecimals(accumulatedF1Score / testSet.size())).append("\n\n"); builder.append("-- Micro-average --\n"); builder.append("precision: ").append(TextUtilities.formatFourDecimals(globalStats.getPrecision())).append("\n"); builder.append("recall: ").append(TextUtilities.formatFourDecimals(globalStats.getRecall())).append("\n"); builder.append("f1-score: ").append(TextUtilities.formatFourDecimals(globalStats.getF1Score())).append("\n\n"); builder.append("lower precision in evaluation set: ").append(TextUtilities.formatFourDecimals(lowerPrecision)).append("\n"); builder.append("lower recall in evalution set : ").append(TextUtilities.formatFourDecimals(lowerRecall)).append("\n"); builder .append("perfect precision in evaluation set: ") .append(perfectPrecision) .append(" / ") .append(testSet.size()) .append("\n"); builder .append("perfect recall in evaluation set: ") .append(perfectRecall) .append(" / ") .append(testSet.size()) .append("\n"); System.out.println(builder.toString()); return globalStats; }
Example #6
Source File: WikiTextConverter.java From entity-fishing with Apache License 2.0 | 4 votes |
/**
 * Finishes the current line, emits {@code num} newline characters and clears
 * the pending-newline and pending-space state.
 *
 * @param num number of {@code '\n'} characters to append
 */
private void writeNewlines(int num) {
    // Close the line in progress before adding the line breaks.
    finishLine();

    sb.append(TextUtilities.strrep('\n', num));

    // The requested breaks are written, so nothing remains pending.
    needSpace = false;
    needNewlines = 0;
}