// Java source code of Edges

package edu.uncc.cs.watsonsim.index;

import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentSkipListMap;
import org.apache.log4j.Logger;
import org.iq80.leveldb.*;
import org.junit.Test;

import static org.fusesource.leveldbjni.JniDBFactory.*;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

import java.io.*;

import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
import edu.stanford.nlp.dcoref.Dictionaries;
import edu.stanford.nlp.dcoref.Dictionaries.Animacy;
import edu.stanford.nlp.dcoref.Dictionaries.Gender;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.Triple;
import edu.uncc.cs.watsonsim.Database;
import edu.uncc.cs.watsonsim.Passage;
import edu.uncc.cs.watsonsim.Phrase;
import edu.uncc.cs.watsonsim.nlp.Trees;

public class Edges implements Segment {
	// In-memory tally of edge occurrences gathered by accept(), keyed by the
	// tab-joined "source\ttag\ttarget" string. flush() replaces this map with
	// a fresh empty one after taking the old one as its snapshot.
	// NOTE(review): the field is reassigned inside synchronized flush() but
	// read from the unsynchronized accept(); confirm whether it should be
	// volatile if several threads call accept().
	private ConcurrentSkipListMap<String, Integer> all_edges = new ConcurrentSkipListMap<>();
	// log4j logger named after the runtime class of this instance.
	private final Logger log = Logger.getLogger(getClass());
	// LevelDB handle opened by the constructor; flush() accumulates counts in it
	// and close() iterates over it.
	private final DB ldb;
	// Relational database whose semantic_graph table close() repopulates.
	private final Database sqldb;
	
	/**
	 * A single labelled link of the semantic graph, held as a triple of
	 * strings in the order (source, tag, target).
	 */
	public static final class Edge extends Triple<String, String, String> {
		/**
		 * @param source   stored as the triple's first element
		 * @param relation stored as the triple's second element
		 * @param target   stored as the triple's third element
		 */
		public Edge(String source, String relation, String target) {
			super(source, relation, target);
		}
	}
	
	public Edges(Database sqldb) {
		
		// Setup LevelDB
		Options options = new Options();
		options.createIfMissing(true);
		try {
			ldb = factory.open(new File("data/edges-leveldb-depparse-lemma0"), options);
		} catch (IOException e) {
			// If we can't open the database we're toast.
			e.printStackTrace();
			throw new RuntimeException(e);
		}
		
		// Setup the SQL db
		this.sqldb = sqldb;
		
	}
	
	/**
	 * Moves the in-memory edge tallies into LevelDB.
	 *
	 * The current map is swapped for an empty one, then every tally is added
	 * to the count already stored under the same key (if any) and the sums
	 * are written back in one batch.
	 *
	 * @throws IOException declared for failures while releasing the write batch
	 */
	public synchronized void flush() throws IOException {
		try (WriteBatch batch = ldb.createWriteBatch()) {
			// Detach what has been collected so far and start over empty
			ConcurrentSkipListMap<String, Integer> pending = all_edges;
			all_edges = new ConcurrentSkipListMap<>();
			System.out.println("writing out  " + pending.size() + " edges.");
			for (Map.Entry<String, Integer> tally : pending.entrySet()) {
				byte[] rawKey = bytes(tally.getKey());
				byte[] stored = ldb.get(rawKey);
				int total = tally.getValue();
				if (stored != null) {
					// Counts are persisted as decimal strings
					total += Integer.parseInt(asString(stored));
				}
				batch.put(rawKey, bytes(Integer.toString(total)));
			}
			ldb.write(batch);
		}
	}

	/**
	 * Flushes any pending tallies, then rebuilds the {@code semantic_graph}
	 * table from the complete contents of the LevelDB store.
	 *
	 * Each LevelDB key is split on tabs into (source, tag, dest) and its value
	 * is parsed as the count. Rows whose key has fewer than three fields or
	 * whose value is not an integer are logged and skipped. Inserts are sent
	 * to the SQL database in batches of one million rows, plus one final
	 * batch for the remainder.
	 *
	 * NOTE(review): the LevelDB handle itself is not closed here; confirm
	 * whether the owner of this object expects it to stay open.
	 *
	 * @throws IOException if flushing fails, if the LevelDB iterator cannot be
	 *         closed, or (wrapping the {@link SQLException}) if any SQL
	 *         statement fails
	 */
	@Override
	public synchronized void close() throws IOException {
		flush();
		
		/* Now populate the relational database using the leveldb
		 * This strange two-step process comes because:
		 *  1) Leveldb is about 10x faster for batched writes
		 *  2) Sqlite & Postgresql support concurrent readers
		 *  Otherwise, I would be thrilled to use either all the way.
		 */
		System.out.println("Pushing histograms into the main database.");
		try {
			sqldb.prep("DELETE FROM semantic_graph;").execute();
			sqldb.prep("PRAGMA synchronous=OFF;").execute();
			// source, tag, dest, count
			PreparedStatement graph = sqldb.prep("INSERT INTO semantic_graph VALUES (?, ?, ?, ?);");
			int queue = 0;
			// DBIterator is an Iterator (not an Iterable), so a for-each loop is
			// not available; try-with-resources guarantees it is released.
			try (DBIterator i = ldb.iterator()) {
				i.seekToFirst();
				// next() throws NoSuchElementException when exhausted instead of
				// returning null, so the end must be detected with hasNext().
				while (i.hasNext()) {
					Map.Entry<byte[], byte[]> entry = i.next();
					String key = asString(entry.getKey());
					String[] words = key.split("\t");
					if (words.length < 3) {
						log.warn("Skipping malformed edge key: " + key);
						continue;
					}
					int count;
					try {
						count = Integer.parseInt(asString(entry.getValue()));
					} catch (NumberFormatException e) {
						log.warn("Skipping edge with non-numeric count: " + key, e);
						continue;
					}
					graph.setString(1, words[0]);
					graph.setString(2, words[1]);
					graph.setString(3, words[2]);
					graph.setInt(4, count);
					graph.addBatch();
					if (++queue % 1000000 == 0) {
						System.out.println("Enqueued " + queue + " rows");
						graph.executeBatch();
						//sqldb.commit();
					}
				}
			}
			// Send whatever is left over from the last partial batch
			System.out.println("SQL batch " + graph.executeBatch().length);
		} catch (SQLException e) {
			e.printStackTrace();
			// Call it an IO exception
			throw new IOException(e);
		}
	}
	
	/**
	 * Produces attribute edges learned from the coreference link attached to
	 * a word, if the phrase has one for that word's index.
	 *
	 * The edge source is the concatenated noun at the head of the link's
	 * second mention; the attributes are read from the link's first mention.
	 * Up to three edges are emitted, each only when its value is known:
	 *
	 * _animate(main mention, ___).
	 * _gender(main mention, ___).
	 * _number(main mention, ___).
	 *
	 * In short: whether it is animate, its gender, and its number.
	 *
	 * @param g graph containing the word
	 * @param w word whose index is looked up in the phrase's coreference links
	 * @param t phrase supplying the coreference links
	 * @return the learned edges; empty when the word has no link
	 */
	public static List<Edge> generatePronounEdges(
			SemanticGraph g, IndexedWord w, Phrase t) {
		List<Edge> learned = new ArrayList<>();
		if (!t.getUnpronoun().containsKey(w.index())) {
			// Nothing is known about this word
			return learned;
		}

		Pair<CorefMention, CorefMention> link = t.getUnpronoun().get(w.index());
		String subject = Trees.concatNoun(g, g.getNodeByIndex(link.second.headIndex));
		CorefMention described = link.first;

		Animacy animacy = described.animacy;
		if (animacy != Animacy.UNKNOWN) {
			learned.add(new Edge(subject, "_animate", animacy.toString()));
		}

		Gender gender = described.gender;
		if (gender != Gender.UNKNOWN) {
			learned.add(new Edge(subject, "_gender", gender.toString()));
		}

		Dictionaries.Number count = described.number;
		if (count != Dictionaries.Number.UNKNOWN) {
			learned.add(new Edge(subject, "_number", count.toString()));
		}
		return learned;
	}
	
	/**
	 * Resolves the text that should stand for a word in an edge.
	 *
	 * If the phrase holds a coreference link for the word's index, the span of
	 * the link's second mention is used. Otherwise the word's own segment of
	 * the tree is built with concatNoun().
	 *
	 * @param phrase phrase supplying the coreference links
	 * @param graph  graph the word belongs to
	 * @param word   word to resolve
	 * @return the linked mention span, or the concatenated noun for the word
	 */
	public static String getMainMention(
			Phrase phrase, SemanticGraph graph, IndexedWord word) {
		Pair<CorefMention, CorefMention> coref =
				phrase.getUnpronoun().get(word.index());
		return (coref != null)
				? coref.second.mentionSpan
				: Trees.concatNoun(graph, word);
	}

	
	/**
	 * Take a passage and find relevant semantic edges in it.
	 * 
	 * 1) We know that handling these words one at a time yields very
	 * boring results. We can connect "Donald" with "Duck," which is neat, but
	 * we can't tell anything Donald does that other ducks do not because any
	 * verb will be attached to "duck" but know nothing about Donald.
	 * 
	 * So to solve this, we connect all the [nn, cd] links, and invert
	 * "prep_of" links, prepending them. (This is the Trees.concatNoun method).
	 * How many of these are worth joining is up for debate. But it has to be
	 * consistent for indexing and later querying. Suppose we found "Donald
	 * duck is a cool cartoon character." We'll get a higher level relation
	 * like nsubj("Donald duck", "cartoon character") -> 1
	 * 
	 * 2) We find that a lot of the links are to pronouns. So in the links,
	 * we replace the pronouns with their "representative mentions", using
	 * CoreNLP's dcoref.
	 * 
	 * 3) For good measure, we include a few fake tags to indicate other
	 * tidbits we learned about relations. For example, we know gender
	 * and animacy based on the pronouns used with something.
	 * (The code that emits these tags is currently commented out below.)
	 * 
	 * 	tag      | meaning
	 *  -------------------
	 *  _gender  | he / she, if available
	 *  _animate | (he/she), it
	 *  _number  | how many there are
	 *  _isa     | lexical type
	 * 
	 * 
	 * ## Possible investigation for later
	 * We can also join relations. Where we have relations that look like this:
	 * 
	 *  tagname(words, words)
	 *  
	 * We may want something that looks more like this:
	 * 
	 *  tagname [word tagname]* (words, words)
	 *  
	 * That way we can bridge across common concepts and get to the more
	 * interesting links they bridge. It sounds logical to me but I can't come
	 * up with any convincing examples where it would actually be useful to
	 * know, and I seem to find many transitive connections that are
	 * irrelevant.
	 * 
	 * @param phrase phrase whose dependency graphs are scanned
	 * @return one edge per graph edge whose relation short name is not "nn",
	 *         with source and target replaced by their main mentions
	 */
	public static List<Edge> generateEdges(Phrase phrase) {
		List<Edge> edges = new ArrayList<>();
		phrase.getGraphs().forEach(g -> {
			g.edgeIterable().forEach(e -> {
				// Strings must be compared by value; != only tests reference
				// identity, which let "nn" relations slip through whenever
				// the short name was not the same interned object.
				if (!"nn".equals(e.getRelation().getShortName())) {
					// "nn" is garbled by the concatNoun() anyway
					
					// Dcoref on the source and target
					//edges.addAll(generatePronounEdges(g, e.getSource(), phrase));
					
					// Find the main mention and optionally concat it 
					String source = getMainMention(phrase, g, e.getSource());
					String target = getMainMention(phrase, g, e.getTarget());
					
					edges.add(new Edge(
							source,
							Trees.getSpecificPreps(e.getRelation()),
							target));
				}
			});
		});
		
		// Also extract the types while we are at it.
		/*SupportCandidateType.extract(phrase).forEach(nt -> {
			edges.add(new Edge(nt.first, "_isa", nt.second));
		});*/
		return edges;
	}
	
	/**
	 * Counts the edges produced by generateEdges() for a passage in the
	 * in-memory map. Keys are the three edge fields delimited by tabs, since
	 * spaces are taken by concatNoun(). Once the map holds more than a
	 * million distinct keys it is flushed to LevelDB; a failed flush is only
	 * reported on standard error.
	 *
	 * @param t passage to extract edges from
	 */
	@Override
	public void accept(Passage t) {
		for (Edge edge : generateEdges(t)) {
			String key = String.join("\t", edge.first, edge.second, edge.third);
			all_edges.merge(key, 1, Integer::sum);
		}

		// Try to keep it from absorbing all available memory
		if (all_edges.size() <= 1_000_000) {
			return;
		}
		try {
			flush();
		} catch (IOException flushFailure) {
			// TODO Auto-generated catch block
			flushFailure.printStackTrace();
		}
	}
	

	/**
	 * Placeholder JUnit test for {@link #generateEdges(Phrase)}.
	 * It ends in an unconditional fail(), so it cannot pass as written.
	 */
	@Test
	public void testGenerateEdges() {
		List<Edge> generated = Edges.generateEdges(new Phrase("This is an example."));
		assertEquals(null, generated);
		fail("Not yet implemented");
	}

}