// Java source code of NERScorer

package edu.uncc.cs.watsonsim.scorers;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;

import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

//@author Varsha Devadas


/**
 * Experimental scorer that compares the OpenNLP parse chunks of a candidate
 * answer and of a question against those of a supporting passage. While
 * parsing, it also prints the person names the name finder detects.
 *
 * <p>Models are loaded lazily from {@link #modelsPath} the first time a method
 * needs them. State is kept in plain mutable fields with no synchronization.
 */
public class NERScorer {
	/** True only after every model has been loaded successfully by {@link #init()}. */
	private boolean modelsAreInitialized=false;

	/** Directory containing the OpenNLP model files; change it before the first use. */
	public  String modelsPath="/home/varsha/opennlpModels"; //models directory

	public SentenceModel sentenceModel; //sentence detection model 
	public ParserModel parserModel; //parsing model
	public POSTaggerME tagger; //part-of-speech tagger
	public TokenNameFinderModel nerModel; //person name finder model

	//sample candidate answer, question and passage
	public String ca="Jane Austen"; 
	public String q="Jane Austen wrote Emma .";
	public String passage="Jane Austen was very modest about her own genius.[7] She once famously described her work as "+
			"the little bit (two Inches wide) of Ivory, on which I work with so fine a brush, " +
			"as produces little effect after much labor [7]. " +
			"When she was a girl she wrote stories. Her works were printed only after much revision. " +
			"Only four of her novels were printed while she was alive. They were Sense and Sensibility (1811), " +
			"Pride and Prejudice (1813), Mansfield Park (1814) and Emma (1816). " +
			"Two other novels, Northanger Abbey and Persuasion, were printed in 1817 with " +
			"a biographical notice by her brother, Henry Austen. Persuasion was written shortly before her death. " +
			"She also wrote two earlier works, Lady Susan, and an unfinished novel, The Watsons. " +
			"She had been working on a new novel, Sanditon, but she died before she could finish it.";

	/** Creates a scorer that reads its models from the default {@link #modelsPath}. */
	public NERScorer() {
	}

	/**
	 * Creates a scorer that reads its models from the given directory.
	 *
	 * @param modelsPath directory containing the OpenNLP model files
	 */
	public NERScorer(String modelsPath) {
		this.modelsPath = modelsPath;
	}

	/**
	 * Loads the sentence, POS, parser and person-name models from
	 * {@link #modelsPath}. The scorer is marked as initialized only when all
	 * four models loaded, so a failed attempt is retried on the next call.
	 *
	 * @throws InvalidFormatException if a model file is malformed
	 * @throws IllegalStateException if a model file is missing or cannot be read
	 */
	//TODO: allow partial initialization parserInit() and chunkerInit()
	public void init() throws InvalidFormatException{
		File modelsDir = new File(this.modelsPath);

		InputStream sentModelIn = null;
		InputStream posModelIn = null;
		InputStream parserModelIn = null;
		InputStream nerModelIn = null;
		try {
			//for finding sentences
			sentModelIn = new FileInputStream(new File(modelsDir, "en-sent.bin"));
			this.sentenceModel = new SentenceModel(sentModelIn);
			//for finding POS
			posModelIn = new FileInputStream(new File(modelsDir, "en-pos-maxent.bin"));
			this.tagger = new POSTaggerME(new POSModel(posModelIn));
			//for parsing
			parserModelIn = new FileInputStream(new File(modelsDir, "en-parser-chunking.bin"));
			this.parserModel = new ParserModel(parserModelIn);
			//for finding person names
			nerModelIn = new FileInputStream(new File(modelsDir, "en-ner-person.bin"));
			this.nerModel = new TokenNameFinderModel(nerModelIn);
		} catch (InvalidFormatException e) {
			throw e;
		} catch (IOException e) {
			// Fail here with the cause attached instead of continuing with null models.
			throw new IllegalStateException("Could not load OpenNLP models from " + modelsDir, e);
		} finally {
			closeQuietly(sentModelIn);
			closeQuietly(posModelIn);
			closeQuietly(parserModelIn);
			closeQuietly(nerModelIn);
		}
		this.modelsAreInitialized=true;
	}

	/** Closes a model stream, ignoring a failure to close; accepts null. */
	private static void closeQuietly(InputStream in) {
		if (in == null) return;
		try {
			in.close();
		} catch (IOException ignored) {
			// The model has already been read (or loading already failed); nothing useful to do.
		}
	}

	/**
	 * Parses a fixed sample line and the sample question {@link #q}, then shows
	 * both parses and the children of the first child of the top node.
	 *
	 * @throws IOException if the models cannot be initialized
	 */
	public void parserTest1() throws IOException {
		if (!this.modelsAreInitialized) init();
		Parser parser = ParserFactory.create(
				this.parserModel,
				20, // beam size
				0.95); // advance percentage
		// The separating space keeps the final "." and the first word of q as distinct tokens.
		Parse[] results = ParserTool.parseLine("Jane Austen was very modest about her own genius . "+this.q,
				parser, 1);
		Parse[] qResults = ParserTool.parseLine(this.q,parser, 1);
		Parse[] rChn = (results[0].getChildren())[0].getChildren();

		results[0].expandTopNode(results[0]);
		for (int i = 0; i < results.length; i++) {
			results[i].show();
		}
		for (int i = 0; i < qResults.length; i++) {
			qResults[i].show();
		}
		System.out.print("\n\n");
		for (int i = 0; i < rChn.length; i++) {
			rChn[i].show();
			System.out.print("\n");
		}
	}

	/**
	 * Splits a text into sentences and prints each one to standard error.
	 *
	 * @param testSents text to split
	 * @throws InvalidFormatException if a model file is malformed
	 */
	public void testSentDetector(String testSents) throws InvalidFormatException{
		if (!this.modelsAreInitialized) init();
		SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel);
		String[] sentences = sentenceDetector.sentDetect(testSents);
		for (int i=0;i<sentences.length; i++)
			System.err.println("sent: "+sentences[i]);
	}

	/**
	 * Splits a text into sentences, prints the person names found in each
	 * sentence, and parses each sentence.
	 *
	 * @param p text to process
	 * @return the top parse of each detected sentence, in sentence order
	 * @throws InvalidFormatException if a model file is malformed
	 */
	public Parse[] parsePassageText(String p) throws InvalidFormatException{
		if (!modelsAreInitialized)init();
		//initialize 	 
		SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel);
		NameFinderME nameFinder = new NameFinderME(this.nerModel);
		Parser parser = ParserFactory.create(
				this.parserModel,
				20, // beam size
				0.95); // advance percentage
		//There are several tokenizers available. SimpleTokenizer works best
		Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
		//find sentences, tokenize each, parse each, return top parse for each 	 	 
		String[] sentences = sentenceDetector.sentDetect(p);
		Parse[] results = new Parse[sentences.length];
		for (int i=0;i<sentences.length;i++){
			Span[] tokenSpans = tokenizer.tokenizePos(sentences[i]);
			String[] tokens = Span.spansToStrings(tokenSpans, sentences[i]);
			printNames(nameFinder.find(tokens), tokenSpans, sentences[i]);

			// Join the token strings themselves; the parser receives them space-separated.
			String sent= StringUtils.join(tokens, " ");
			System.out.println("Found sentence " + sent);
			Parse[] sentResults = ParserTool.parseLine(sent,parser, 1);
			results[i]=sentResults[0];
		}
		// Reset what the name finder accumulated for this text before the finder is discarded.
		nameFinder.clearAdaptiveData();
		return results;
	}

	/** Prints the text covered by each name span; name spans index into tokenSpans. */
	private static void printNames(Span[] names, Span[] tokenSpans, String sentence) {
		for (int ni = 0; ni < names.length; ni++) {
			int nameStart = tokenSpans[names[ni].getStart()].getStart();
			// A name span's end is exclusive, so the last token is at end - 1.
			int nameEnd = tokenSpans[names[ni].getEnd() - 1].getEnd();
			System.out.println(sentence.substring(nameStart, nameEnd));
		}
	}

	/**
	 * Tags a fixed sample sentence and prints each word with its tag to
	 * standard error.
	 *
	 * @throws IllegalStateException if the models cannot be initialized
	 */
	public void taggerTest(){
		if (!this.modelsAreInitialized) {
			try {
				init();
			} catch (InvalidFormatException e) {
				// This method declares no checked exceptions, so wrap and keep the cause.
				throw new IllegalStateException("Could not initialize the POS tagger", e);
			}
		}
		String[] words = SimpleTokenizer.INSTANCE.tokenize(
				"The quick, red fox jumped over the lazy, brown dogs.");
		String[] result = tagger.tag(words);
		for (int i=0 ; i < words.length; i++) {
			System.err.print(words[i] + "/" + result[i] + " ");
		}
		System.err.println();
	}

	/**
	 * Collects the given parses together with all of their descendants.
	 *
	 * @param parseAr parses to expand
	 * @return the input parses followed by the descendants of each one in turn
	 */
	public Parse[] getAllChildren(Parse[] parseAr){
		Parse[] allChildren = parseAr;
		for (int i=0; i<parseAr.length;i++){
			Parse[] descendants = getAllChildren(parseAr[i].getChildren());
			allChildren = ArrayUtils.addAll(allChildren, descendants);
		}	
		return allChildren;		
	}

	/**
	 * Collects a parse together with all of its descendants.
	 *
	 * @param parse root parse
	 * @return the root followed by all of its descendants
	 */
	public Parse[] getAllChildren(Parse parse){
		Parse[] root = new Parse[] { parse };
		return ArrayUtils.addAll(root, getAllChildren(parse.getChildren()));
	}

	/**
	 * Counts the parses in pa2 that match some parse in pa1. A match means the
	 * same label over the same covered text.
	 *
	 * @param pa1 reference parses
	 * @param pa2 parses to look up among the reference parses
	 * @param verbose when true, prints every match and the final count
	 * @return the number of entries of pa2 that have a match in pa1
	 */
	public double compareParseChunks(Parse[] pa1, Parse[] pa2, boolean verbose){
		// Used as a set: only the presence of a key matters.
		HashMap<String,String> pa1h= new HashMap<String, String>();
		double numMatches=0;
		for (int i=0;i<pa1.length;i++){
			pa1h.put(chunkKey(pa1[i]),"y");
		}
		for (int j=0;j<pa2.length;j++){
			if (pa1h.containsKey(chunkKey(pa2[j]))){ 
				numMatches++;
				if (verbose) {
					System.out.println("\n");
					pa2[j].show();
					System.out.println("span: "+pa2[j].getSpan());
					System.out.println("type: "+pa2[j].getType());
				}
			}
		}
		if (verbose) System.out.println("numMatches "+numMatches);
		return numMatches;
	}

	/** Builds the lookup key of a parse; the separator keeps label and text from running together. */
	private static String chunkKey(Parse parse) {
		return parse.getLabel() + '\u0000' + parse.getCoveredText();
	}

	/**
	 * A simple scorer based on the number of matches; requires the first string
	 * to be in the passage.
	 *
	 * @param ca candidate answer
	 * @param q question
	 * @param passage supporting passage
	 * @param verbose when true, prints the matches
	 * @return question/passage matches multiplied by answer/passage matches
	 * @throws InvalidFormatException if a model file is malformed
	 */
	public double scoreStructure(String ca, String q, String passage, boolean verbose) throws InvalidFormatException{
		Parse[] caParseCh = getAllChildren(this.parsePassageText(ca));
		Parse[] qParseCh = getAllChildren(this.parsePassageText(q));
		Parse[] pasParseCh = getAllChildren(this.parsePassageText(passage));
		double score1=compareParseChunks(qParseCh, pasParseCh,verbose);
		double score2=compareParseChunks(caParseCh, pasParseCh,verbose);
		return score1*score2;
	}

	/**
	 * Normalized scorer: the {@link #scoreStructure} value divided by the
	 * passage length in characters.
	 *
	 * @param ca candidate answer
	 * @param q question
	 * @param passage supporting passage
	 * @param verbose when true, prints the matches
	 * @return the normalized score, or 0 for an empty passage
	 * @throws InvalidFormatException if a model file is malformed
	 */
	//TODO divide by passage length containing the matches, not the full passage length
	public double scoreStructureNorm(String ca, String q, String passage, boolean verbose) throws InvalidFormatException{
		// An empty passage cannot match anything; returning 0 avoids 0/0 = NaN.
		if (passage.length() == 0) return 0;
		return scoreStructure(ca, q, passage, verbose)/passage.length();
	}	

	/**
	 * Demo driver: parses sample texts, shows the parses and prints scores.
	 *
	 * @param args optional; args[0] overrides the models directory
	 * @throws IOException if the models cannot be initialized
	 */
	public static void main(String[] args) throws IOException {
		NERScorer pt= args.length > 0 ? new NERScorer(args[0]) : new NERScorer();
		Parse[] parses = pt.parsePassageText("this is a cat . this is a dog .");
		for (int i=0;i<parses.length;i++){
			System.out.println("parses: ");
			parses[i].show();
		}
		String q="red fox jumped over brown dogs ."; 
		String pas="red fox jumped over brown dogs . The quick , red fox jumped over the lazy , brown dogs . ";
		parses = pt.parsePassageText(pas);
		// Sized by the detected sentence count, which the sentence detector decides.
		Parse[][] parsecs = new Parse[parses.length][];
		for (int i=0;i<parses.length;i++){
			System.out.print("parse["+i+"]: ");
			parses[i].show();
			System.out.println(parses[i].getText());
			Parse[] parsec = pt.getAllChildren(parses[i]);
			parsecs[i]=parsec;
			for (int j=0;j<parsec.length;j++){
				System.out.print("parses child: ");
				parsec[j].show();
			}
			System.out.println("number of children in the parses: "+parsec.length);
		}
		if (parsecs.length >= 2) {
			pt.compareParseChunks(parsecs[0],parsecs[1],true);
		}
		System.out.println();
		// Argument order is (candidate answer, question, passage).
		System.out.println("NormalizedScore: "+pt.scoreStructureNorm("red fox",q,pas ,false)); 
		System.out.println("Raw Score: "+pt.scoreStructure("red fox",q,pas ,false)); 
		System.out.println("\n\n");


		String ca="Jane Austen"; 
		String qq="Jane Austen wrote Emma";
		String passage="Jane Austen was very modest about her own genius.[7] She once famously described her work as "+
				"the little bit (two Inches wide) of Ivory, on which I work with so fine a brush, " +
				"as produces little effect after much labor [7]. " +
				"Jane Austen wrote Emma. "+
				"When she was a girl she wrote stories. Her works were printed only after much revision. " +
				"Only four of her novels were printed while she was alive. They were Sense and Sensibility (1811), " +
				"Pride and Prejudice (1813), Mansfield Park (1814) and Emma (1816). " +
				"Two other novels, Northanger Abbey and Persuasion, were printed in 1817 with " +
				"a biographical notice by her brother, Henry Austen. Persuasion was written shortly before her death. " +
				"She also wrote two earlier works, Lady Susan, and an unfinished novel, The Watsons. " +
				"She had been working on a new novel, Sanditon, but she died before she could finish it.";

		System.out.println();
		System.out.println("NormalizedScore: "+pt.scoreStructureNorm(ca,qq, passage,false));
		System.out.println("Raw Score: "+pt.scoreStructure(ca,qq, passage,false));
	}
}
//ts here