// Java source code of JsonPipeline

package corenlp;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.codehaus.jackson.JsonNode;

import util.JsonUtil;
import util.U;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.EntityTypeAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.MentionsAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NormalizedNamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentenceIndexAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokenBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokenEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
// Package paths as of Stanford CoreNLP 3.2.0; earlier releases used edu.stanford.nlp.trees.semgraph.SemanticGraph.
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.time.TimeAnnotations.TimexAnnotation;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;

/**
 * A wrapper around a CoreNLP pipeline object that knows how to turn output annotations into
 * JSON-friendly structures, using 0-based indexing conventions.
 *
 * <p>Typical use: load the configuration with {@link #setConfigurationFromFile(String)}, build
 * the pipeline with {@link #initializeCorenlpPipeline()}, then call
 * {@link #processTextDocument(String)} once per document.
 *
 * <p>Implementation note: needs to mirror edu/stanford/nlp/pipeline/XMLOutputter.java somewhat.
 */
public class JsonPipeline {

	/** The underlying CoreNLP pipeline; {@code null} until {@link #initializeCorenlpPipeline()} runs. */
	StanfordCoreNLP pipeline;
	/** CoreNLP configuration; the {@code annotators} entry drives which JSON fields are emitted. */
	Properties props = new Properties();

	/** Running total of tokens seen by {@link #processTextDocument(String)}. */
	int numTokens = 0;
	/** Running total of documents passed to {@link #processTextDocument(String)}. */
	int numDocs = 0;
	/** Running total of {@code doctext.length()} over all processed documents. */
	int numChars = 0;
	/** Wall-clock time (ms) of the first {@link #processTextDocument(String)} call, or -1 if none yet. */
	long startMilli = -1;

	public JsonPipeline() {
	}

	/**
	 * Stores the basic token information of a sentence.
	 *
	 * <p>Puts {@code "tokens"} (the {@code value()} of every token) and {@code "char_offsets"}
	 * (one {@code [beginPosition, endPosition]} pair per token) into {@code sent_info}.
	 *
	 * @param sent_info output map for this sentence
	 * @param sentence  the annotated sentence
	 */
	static void addTokenBasics(Map<String,Object> sent_info, CoreMap sentence) {
		List<List<Integer>> tokenSpans = Lists.newArrayList();
		List<String> tokenTexts = Lists.newArrayList();
		for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
			List<Integer> span = Lists.newArrayList(token.beginPosition(), token.endPosition());
			tokenSpans.add(span);
			tokenTexts.add(token.value());
		}
		sent_info.put("tokens", tokenTexts);
		sent_info.put("char_offsets", tokenSpans);
	}

	/**
	 * Stores one string-valued annotation per token under {@code keyname}.
	 *
	 * @param sent_info output map for this sentence
	 * @param sentence  the annotated sentence
	 * @param keyname   key to store the resulting list under
	 * @param annoClass annotation key class whose value is read with {@code CoreLabel.getString}
	 */
	@SuppressWarnings({ "rawtypes", "unchecked" })
	static void addTokenAnno(Map<String,Object> sent_info, CoreMap sentence,
			String keyname, Class annoClass) {
		List<String> tokenAnnos = Lists.newArrayList();
		for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
			tokenAnnos.add(token.getString(annoClass));
		}
		sent_info.put(keyname, tokenAnnos);
	}

	/** Stores the string form of the sentence's parse tree under {@code "parse"}. */
	static void addParseTree(Map<String,Object> sent_info, CoreMap sentence) {
		sent_info.put("parse", sentence.get(TreeCoreAnnotations.TreeAnnotation.class).toString());
	}

	/** Stores the collapsed, CC-processed dependencies under {@code "deps_cc"}. */
	@SuppressWarnings("rawtypes")
	static void addDepsCC(Map<String,Object> sent_info, CoreMap sentence) {
		SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
		List deps = jsonFriendlyDeps(dependencies);
		sent_info.put("deps_cc", deps);
	}

	/** Stores the basic dependencies under {@code "deps_basic"}. */
	@SuppressWarnings("rawtypes")
	static void addDepsBasic(Map<String,Object> sent_info, CoreMap sentence) {
		SemanticGraph dependencies = sentence.get(BasicDependenciesAnnotation.class);
		List deps = jsonFriendlyDeps(dependencies);
		sent_info.put("deps_basic", deps);
	}

	/**
	 * Stores the entity mentions of a sentence under {@code "entitymentions"}.
	 *
	 * <p>Each mention becomes a map with {@code "tokspan"}, {@code "charspan"}, {@code "sentence"}
	 * and {@code "type"}, plus {@code "normalized"} and {@code "timex_xml"} when the mention
	 * carries those annotations. A sentence without a mentions annotation yields an empty list.
	 *
	 * @param sent_info output map for this sentence
	 * @param sentence  the annotated sentence
	 */
	@SuppressWarnings({ "rawtypes", "unchecked" })
	static void addEntityMentions(Map<String,Object> sent_info, CoreMap sentence) {
		List<CoreMap> coreMentions = sentence.get(MentionsAnnotation.class);
		List<Map> jsonMentions = new ArrayList<>();
		/* trying to figure out the keys in each mention. here's a printout from one.
MENTION August 2014
class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation	August 2014
class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation	3
class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation	14
class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation	[August-2, 2014-3]
class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation	1
class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation	3
class edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation	DATE
class edu.stanford.nlp.ling.CoreAnnotations$NormalizedNamedEntityTagAnnotation	2014-08
class edu.stanford.nlp.ling.CoreAnnotations$EntityTypeAnnotation	DATE
class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation	0
class edu.stanford.nlp.time.TimeAnnotations$TimexAnnotation	<TIMEX3 tid="t1" type="DATE" value="2014-08">August 2014</TIMEX3>
MENTION Barack Obama
class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation	Barack Obama
class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation	17
class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation	29
class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation	[Barack-5, Obama-6]
class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation	4
class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation	6
class edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation	PERSON
class edu.stanford.nlp.ling.CoreAnnotations$EntityTypeAnnotation	PERSON
class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation	0
MENTION Paris
class edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation	Paris
class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetBeginAnnotation	66
class edu.stanford.nlp.ling.CoreAnnotations$CharacterOffsetEndAnnotation	71
class edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation	[Paris-5]
class edu.stanford.nlp.ling.CoreAnnotations$TokenBeginAnnotation	14
class edu.stanford.nlp.ling.CoreAnnotations$TokenEndAnnotation	15
class edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation	LOCATION
class edu.stanford.nlp.ling.CoreAnnotations$EntityTypeAnnotation	LOCATION
class edu.stanford.nlp.ling.CoreAnnotations$SentenceIndexAnnotation	1
		 */
		// Debugging snippet that produces a printout like the one above:
//		U.p("MENTION " + mention);
//		for (Class k : mention.keySet()) {
//			U.pf("%s\t%s\n", k, mention.get(k));
//		}
		if (coreMentions != null) {
			for (CoreMap mention : coreMentions) {
				Map<String,Object> m = new HashMap<>();
				m.put("tokspan", Lists.newArrayList(
						mention.get(TokenBeginAnnotation.class).intValue(),
						mention.get(TokenEndAnnotation.class).intValue()));
				m.put("charspan", Lists.newArrayList(
						mention.get(CharacterOffsetBeginAnnotation.class).intValue(),
						mention.get(CharacterOffsetEndAnnotation.class).intValue()));
				m.put("sentence", mention.get(SentenceIndexAnnotation.class).intValue());
				String entityType = mention.get(EntityTypeAnnotation.class);
				m.put("type", entityType);
				// The two entries below are optional: only some mentions carry them
				// (in the sample printout above, only the DATE mention does).
				if (mention.containsKey(NormalizedNamedEntityTagAnnotation.class)) {
					m.put("normalized", mention.get(NormalizedNamedEntityTagAnnotation.class));
				}
				if (mention.containsKey(TimexAnnotation.class)) {
					m.put("timex_xml", mention.get(TimexAnnotation.class).toString());
				}
				jsonMentions.add(m);
			}
		}
		sent_info.put("entitymentions", jsonMentions);
	}

	/**
	 * Converts a dependency graph into a list of {@code [relation, governor, dependent]} triples.
	 *
	 * <p>Governor and dependent are the words' {@code index()} minus one. When the graph has a
	 * first root, the list starts with an extra {@code ["root", -1, rootIndex]} triple.
	 *
	 * @param dependencies the graph to convert; {@code null} yields an empty list
	 * @return the list of triples
	 */
	@SuppressWarnings({ "rawtypes", "unchecked" })
	static List jsonFriendlyDeps(SemanticGraph dependencies) {
		List deps = new ArrayList();
		if (dependencies == null) {
			// No graph for this sentence: nothing to report (previously this ended in an NPE).
			return deps;
		}
		// Since the dependencies are for each sentence, we obtain the root
		// and add it to the list of dependency triples.
		// The method is explained in the following link:
		// http://stackoverflow.com/questions/16300056/stanford-core-nlp-missing-roots
		List deptriple;
		try {
			IndexedWord root = dependencies.getFirstRoot();
			deptriple = Lists.newArrayList(
					"root",
					-1,
					root.index() - 1);
			deps.add(deptriple);
		} catch (RuntimeException ignored) {
			// Deliberately best-effort: a graph may have no root, in which case we just
			// omit the root triple. See https://github.com/stanfordnlp/CoreNLP/issues/55
		}

		for (SemanticGraphEdge e : dependencies.edgeIterable()) {
			deptriple = Lists.newArrayList(
					e.getRelation().toString(),
					e.getGovernor().index() - 1,
					e.getDependent().index() - 1);
			deps.add(deptriple);
		}
		return deps;
	}

	/**
	 * Loads CoreNLP properties from a file into {@link #props}.
	 *
	 * @param iniPropertiesFilename path of a file in {@code java.util.Properties} format
	 * @throws FileNotFoundException if the file cannot be opened
	 * @throws IOException if reading the file fails
	 */
	public void setConfigurationFromFile(String iniPropertiesFilename) throws FileNotFoundException, IOException {
		// try-with-resources so the stream is closed even when load() fails.
		try (FileInputStream in = new FileInputStream(iniPropertiesFilename)) {
			props.load(in);
		}
	}

	/** assume the properties object has been set */
	void initializeCorenlpPipeline() {
		pipeline = new StanfordCoreNLP(props);
	}

	/**
	 * Converts the document's coreference chains into a list of entities.
	 *
	 * <p>Each entity is a map with {@code "entityid"} and {@code "mentions"}; sentence number,
	 * token span and head index of each mention are shifted down by one relative to the values
	 * stored on the CoreNLP mention. The chain's representative mention, if any, is flagged
	 * with {@code "representative": true}.
	 *
	 * @param doc the annotated document
	 * @return one map per coreference chain; empty if the document has no chain annotation
	 */
	@SuppressWarnings({ "rawtypes", "unchecked" })
	List getCorefInfo(Annotation doc) {
		Map<Integer, CorefChain> corefChains = doc.get(CorefChainAnnotation.class);
		List entities = new ArrayList();
		if (corefChains == null) {
			// No chain annotation on this document: report no entities instead of an NPE.
			return entities;
		}
		for (CorefChain chain : corefChains.values()) {
			List mentions = new ArrayList();
			CorefChain.CorefMention representative = chain.getRepresentativeMention();
			for (CorefChain.CorefMention corement : chain.getMentionsInTextualOrder()) {
				Map outment = new HashMap();
				outment.put("sentence", corement.sentNum-1);
				outment.put("tokspan_in_sentence", Lists.newArrayList(
								corement.startIndex-1, corement.endIndex-1));
				outment.put("head",corement.headIndex-1);
				outment.put("gender", corement.gender.toString());
				outment.put("animacy", corement.animacy.toString());
				outment.put("number", corement.number.toString());
				outment.put("mentiontype", corement.mentionType.toString());
				outment.put("mentionid", corement.mentionID);
				if (representative!=null && corement.mentionID==representative.mentionID) {
					outment.put("representative", true);
				}
				mentions.add(outment);
			}
			Map entity = ImmutableMap.builder()
					.put("mentions", mentions)
					.put("entityid", chain.getChainID())
					.build();
			entities.add(entity);
		}
		return entities;
	}

	/**
	 * Adds the sentence-level output of one annotator to {@code sent_info}.
	 * annotator is a stanford corenlp notion.
	 *
	 * @param sent_info output map for this sentence
	 * @param sentence  the annotated sentence
	 * @param annotator CoreNLP annotator name, e.g. {@code "pos"} or {@code "parse"}
	 * @throws RuntimeException if the annotator is unknown or not supported yet
	 */
	void addAnnoToSentenceObject(Map<String,Object> sent_info, CoreMap sentence, String annotator) {
		switch(annotator) {
		case "tokenize":
		case "cleanxml":
		case "ssplit":
			// Nothing to add here: token basics are emitted by addTokenBasics().
			break;
		case "pos":
			addTokenAnno(sent_info,sentence, "pos", PartOfSpeechAnnotation.class);
			break;
		case "lemma":
			addTokenAnno(sent_info,sentence, "lemmas", LemmaAnnotation.class);
			break;
		case "ner":
			addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class);
			addTokenAnno(sent_info, sentence, "normner", NormalizedNamedEntityTagAnnotation.class);
			break;
		case "regexner":
			addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class);
			break;
		case "sentiment": throw new RuntimeException("TODO");
		case "truecase": throw new RuntimeException("TODO");
		case "parse":
			addParseTree(sent_info,sentence);
			addDepsCC(sent_info,sentence);
			addDepsBasic(sent_info,sentence);
			break;
		case "depparse":
			addDepsCC(sent_info,sentence);
			addDepsBasic(sent_info,sentence);
			break;
		case "dcoref":
			// Coreference is document-level; processTextDocument() emits it separately.
			break;
		case "relation": throw new RuntimeException("TODO");
		case "natlog": throw new RuntimeException("TODO");
		case "quote": throw new RuntimeException("TODO");
		case "entitymentions":
			addEntityMentions(sent_info, sentence);
			break;
		default:
			throw new RuntimeException("don't know how to handle annotator " + annotator);
		}
	}

	/**
	 * Returns the annotator names configured in the {@code annotators} property.
	 *
	 * <p>Names may be separated by commas and/or whitespace.
	 *
	 * @return the configured annotator names, in order
	 * @throws RuntimeException if the property is missing or blank
	 */
	String[] annotators() {
		// getProperty (unlike get) also consults the Properties object's defaults.
		String annotatorsAllstr = props.getProperty("annotators");
		if (annotatorsAllstr==null || annotatorsAllstr.trim().isEmpty()) {
			throw new RuntimeException("'annotators' property seems to not be set");
		}
		// Accept "a,b", "a, b", "a , b" and "a b" alike, so that any mix of comma and
		// whitespace separators yields clean annotator names.
		return annotatorsAllstr.trim().split("[,\\s]+");
	}

	/**
	 * runs the corenlp pipeline with all options, and returns all results as a JSON object.
	 *
	 * <p>The result has a {@code "sentences"} list (one map per sentence) and, when the
	 * {@code dcoref} annotator is configured, an {@code "entities"} list. Also updates the
	 * running counters {@link #numDocs}, {@link #numChars} and {@link #numTokens}.
	 *
	 * @param doctext raw text of the document
	 * @return the JSON form of the annotated document
	 * @throws IllegalStateException if the pipeline has not been initialized, or if the
	 *         pipeline produced no sentence annotation
	 * @throws RuntimeException if the {@code annotators} property is unset or names an
	 *         annotator this class cannot serialize
	 */
	@SuppressWarnings({ "rawtypes", "unchecked" })
	JsonNode processTextDocument(String doctext) {
		if (pipeline == null) {
			throw new IllegalStateException(
					"pipeline is not initialized; call initializeCorenlpPipeline() first");
		}
		// Parse the annotator list once per document rather than once per sentence.
		String[] annotatorNames = annotators();

		if (startMilli == -1) {
			startMilli = System.currentTimeMillis();
		}
		numDocs++;
		numChars += doctext.length();

		Annotation document = new Annotation(doctext);
		pipeline.annotate(document);

		List<CoreMap> sentences = document.get(SentencesAnnotation.class);
		if (sentences == null) {
			throw new IllegalStateException(
					"document has no sentence annotation; is the 'ssplit' annotator configured?");
		}
		List<Map> outSentences = Lists.newArrayList();

		for (CoreMap sentence : sentences) {
			Map<String,Object> sent_info = Maps.newHashMap();
			addTokenBasics(sent_info, sentence);
			numTokens += ((List) sent_info.get("tokens")).size();
			for (String annotator : annotatorNames) {
				addAnnoToSentenceObject(sent_info, sentence, annotator);
			}
			outSentences.add(sent_info);
		}

		ImmutableMap.Builder b = new ImmutableMap.Builder();
//		b.put("text", doctext);
		b.put("sentences", outSentences);

		if (Lists.newArrayList(annotatorNames).contains("dcoref")) {
			List outCoref = getCorefInfo(document);
			b.put("entities", outCoref);
		}
		Map outDoc = b.build();
		return JsonUtil.toJson(outDoc);
	}

}