org.grobid.core.utilities.TextUtilities Java Examples

The following examples show how to use org.grobid.core.utilities.TextUtilities. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: NEDCorpusEvaluation.java    From entity-fishing with Apache License 2.0 6 votes vote down vote up
/**
 * Appends a fixed-width metrics table for one corpus to the given report.
 *
 * The first row carries the corpus name followed by the column titles;
 * each subsequent row carries one result's method name and its accuracy,
 * precision, recall and f1 values, each multiplied by 100 and formatted
 * through {@code TextUtilities.formatTwoDecimals}.
 *
 * @param report  buffer the table is appended to
 * @param corpus  label printed in the first column of the header row
 * @param results results to print, one row each, in iteration order
 */
private void reportMetrics(StringBuilder report,
                           String corpus,
                           List<Result> results) {
    // Header row: preceded by a blank line and followed by one.
    final String headerRow = String.format("\n%-20s %-12s %-12s %-12s %-7s\n\n",
            corpus, "accuracy", "precision", "recall", "f1");
    report.append(headerRow);

    for (Result entry : results) {
        // Scores are scaled by 100 before formatting.
        final String accuracy = TextUtilities.formatTwoDecimals(entry.accuracy * 100);
        final String precision = TextUtilities.formatTwoDecimals(entry.precision * 100);
        final String recall = TextUtilities.formatTwoDecimals(entry.recall * 100);
        final String f1 = TextUtilities.formatTwoDecimals(entry.f1 * 100);

        final String row = String.format("%-20s %-12s %-12s %-12s %-7s\n",
                entry.method, accuracy, precision, recall, f1);
        report.append(row);
    }
}
 
Example #2
Source File: NerdEntity.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
/**
 * Normalises a string for simple comparison: the text is passed through
 * {@code TextUtilities.dehyphenize}, newlines become spaces, leading and
 * trailing whitespace is trimmed, and runs of spaces collapse to one space.
 *
 * @param str the raw string to normalise
 * @return the normalised string
 */
private String simpleStringNormalisation(String str) {
	// step 1: dehyphenize
	final String dehyphenized = TextUtilities.dehyphenize(str);

	// step 2: flatten line breaks into spaces
	final String singleLine = dehyphenized.replace("\n", " ");

	// step 3: strip the ends, then squeeze repeated spaces
	final String trimmed = singleLine.trim();
	return trimmed.replaceAll(" +", " ");
}
 
Example #3
Source File: TextSaxHandler.java    From grobid-ner with Apache License 2.0 5 votes vote down vote up
/**
 * Handles the end of an XML element.
 *
 * On the end of a "txt" element the accumulated text is tokenized and the
 * non-space tokens are appended to the current sentence vector (created
 * lazily). On the end of a "sent" element the current sentence vector is
 * pushed to {@code textVector} and reset. In every case the accumulator is
 * cleared afterwards.
 *
 * @param uri       namespace URI of the element (unused)
 * @param localName local name of the element (unused)
 * @param qName     qualified name of the element, used for dispatch
 * @throws SAXException declared by the handler contract
 */
public void endElement(String uri,
                       String localName,
                       String qName) throws SAXException {
	if (qName.equals("sent")) {
		// end of sentence: store what was collected and start afresh
		textVector.add(currentVector);
		currentVector = null;
	}
	else if (qName.equals("txt")) {
		// detach the "n't" clitic so that it ends up as a token of its own
		String content = getText().replace("n't", " n't");

		// delimiters are returned as tokens too (third argument is true)
		StringTokenizer tokenizer = new StringTokenizer(content, TextUtilities.delimiters, true);

		// note: number re-tokenization (retokenize) is not applied in this handler
		while (tokenizer.hasMoreTokens()) {
			String piece = tokenizer.nextToken();
			if (piece.equals(" ")) {
				// single-space tokens are dropped
				continue;
			}
			if (currentVector == null) {
				currentVector = new ArrayList<String>();
			}
			currentVector.add(piece);
		}
	}
	accumulator.setLength(0);
}
 
Example #4
Source File: ReutersSaxHandler.java    From grobid-ner with Apache License 2.0 5 votes vote down vote up
/**
 * Handles the end of an XML element.
 *
 * When a "p" or "headline" element closes, the accumulated text is
 * tokenized, passed through {@code retokenize}, stripped of single-space
 * tokens, appended to the current vector, and that vector is then pushed to
 * {@code textVector} and reset. The accumulator is cleared for every element.
 *
 * @param uri       namespace URI of the element (unused)
 * @param localName local name of the element (unused)
 * @param qName     qualified name of the element, used for dispatch
 * @throws SAXException declared by the handler contract
 */
public void endElement(java.lang.String uri,
                       java.lang.String localName,
                       java.lang.String qName) throws SAXException {
	final boolean closesTextBlock = qName.equals("p") || qName.equals("headline");
	if (closesTextBlock) {
		// detach the "n't" clitic so that it ends up as a token of its own
		String content = getText().replace("n't", " n't");

		// delimiters are returned as tokens too (third argument is true)
		StringTokenizer tokenizer = new StringTokenizer(content, TextUtilities.delimiters, true);
		List<String> rawTokens = new ArrayList<String>();
		while (tokenizer.hasMoreTokens()) {
			rawTokens.add(tokenizer.nextToken());
		}

		// basic re-tokenization for numbers to match Idilia tokenization,
		// then drop the single-space tokens
		// NOTE(review): currentVector is reset to null below; presumably it is
		// re-created when the next element starts - confirm against startElement
		for (String tok : retokenize(rawTokens)) {
			if (tok.equals(" ")) {
				continue;
			}
			currentVector.add(tok);
		}

		textVector.add(currentVector);
		currentVector = null;
	}

	accumulator.setLength(0);
}
 
Example #5
Source File: EvaluationUtil.java    From entity-fishing with Apache License 2.0 4 votes vote down vote up
/**
 * Aggregates per-article statistics into corpus-level figures, prints a
 * report on standard output and returns the aggregated counts.
 *
 * The report contains the macro-average (mean of the per-article precision,
 * recall and f1 over {@code testSet.size()}), the micro-average (precision,
 * recall and f1 of the summed counts), the lowest per-article precision and
 * recall, and the number of articles with a precision or recall of exactly 1.0.
 *
 * @param testSet the evaluated articles; {@code stats.get(i)} is read for the
 *                i-th article of {@code testSet.getSample()}
 * @param stats   per-article statistics, indexed like the sample
 * @return the statistics accumulated over all visited articles
 * @throws Exception declared for callers; nothing in this body throws a
 *                   checked exception directly
 */
public static LabelStat evaluate(ArticleTrainingSample testSet, List<LabelStat> stats) throws Exception {
	// sums of the per-article scores, used for the macro-average
	double accumulatedRecall = 0.0;
	double accumulatedPrecision = 0.0;
	double accumulatedF1Score = 0.0;

	// worst per-article scores seen so far
	double lowerPrecision = 1.0;
	double lowerRecall = 1.0;

	// number of articles with a score of exactly 1.0
	int perfectRecall = 0;
	int perfectPrecision = 0;

	LabelStat globalStats = new LabelStat();
	int i = 0;
	// the article itself is not used: the loop only bounds how many entries
	// of stats are read (at most testSet.size(), at most one per sample item)
	for (Article article : testSet.getSample()) {
		if (i == testSet.size())
			break;
		LabelStat localStats = stats.get(i);

		// summed counts feed the micro-average
		globalStats.incrementObserved(localStats.getObserved());
		globalStats.incrementExpected(localStats.getExpected());
		globalStats.incrementFalsePositive(localStats.getFalsePositive());
		globalStats.incrementFalseNegative(localStats.getFalseNegative());

		accumulatedRecall += localStats.getRecall();
		accumulatedPrecision += localStats.getPrecision();
		accumulatedF1Score += localStats.getF1Score();

		lowerPrecision = Math.min(lowerPrecision, localStats.getPrecision());
		lowerRecall = Math.min(lowerRecall, localStats.getRecall());

		if (localStats.getPrecision() == 1.0)
			perfectPrecision++;
		if (localStats.getRecall() == 1.0)
			perfectRecall++;

		i++;
	}

	StringBuilder builder = new StringBuilder();

	builder.append("\nEvaluation on " + testSet.size() + " articles ");

	builder.append("\n\n-- Macro-average --\n");
	builder.append("precision: ").append(TextUtilities.formatFourDecimals(accumulatedPrecision / testSet.size())).append("\n");
	builder.append("recall: ").append(TextUtilities.formatFourDecimals(accumulatedRecall / testSet.size())).append("\n");
	builder.append("f1-score: ").append(TextUtilities.formatFourDecimals(accumulatedF1Score / testSet.size())).append("\n\n");

	builder.append("-- Micro-average --\n");
	builder.append("precision: ").append(TextUtilities.formatFourDecimals(globalStats.getPrecision())).append("\n");
	builder.append("recall: ").append(TextUtilities.formatFourDecimals(globalStats.getRecall())).append("\n");
	builder.append("f1-score: ").append(TextUtilities.formatFourDecimals(globalStats.getF1Score())).append("\n\n");

	builder.append("lower precision in evaluation set: ").append(TextUtilities.formatFourDecimals(lowerPrecision)).append("\n");
	builder.append("lower recall in evalution set : ").append(TextUtilities.formatFourDecimals(lowerRecall)).append("\n");
	builder
		.append("perfect precision in evaluation set: ")
		.append(perfectPrecision)
		.append(" / ")
		.append(testSet.size())
		.append("\n");
	builder
		.append("perfect recall in evaluation set: ")
		.append(perfectRecall)
		.append(" / ")
		.append(testSet.size())
		.append("\n");

	System.out.println(builder.toString());

	return globalStats;
}
 
Example #6
Source File: WikiTextConverter.java    From entity-fishing with Apache License 2.0 4 votes vote down vote up
/**
 * Finishes the current line, appends the result of
 * {@code TextUtilities.strrep('\n', num)} to {@code sb}
 * (presumably {@code num} newline characters - confirm against strrep),
 * and clears the pending newline and space flags.
 *
 * @param num repetition count handed to {@code strrep}
 */
private void writeNewlines(int num) {
	finishLine();

	final String lineBreaks = TextUtilities.strrep('\n', num);
	sb.append(lineBreaks);

	// nothing is pending any more once the line breaks are written
	needSpace = false;
	needNewlines = 0;
}