package edu.uncc.cs.watsonsim.index; import java.io.IOException; import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentSkipListMap; import org.apache.log4j.Logger; import org.iq80.leveldb.*; import org.junit.Test; import static org.fusesource.leveldbjni.JniDBFactory.*; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; import java.io.*; import edu.stanford.nlp.dcoref.CorefChain.CorefMention; import edu.stanford.nlp.dcoref.Dictionaries; import edu.stanford.nlp.dcoref.Dictionaries.Animacy; import edu.stanford.nlp.dcoref.Dictionaries.Gender; import edu.stanford.nlp.ling.IndexedWord; import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.util.Pair; import edu.stanford.nlp.util.Triple; import edu.uncc.cs.watsonsim.Database; import edu.uncc.cs.watsonsim.Passage; import edu.uncc.cs.watsonsim.Phrase; import edu.uncc.cs.watsonsim.nlp.Trees; public class Edges implements Segment { private ConcurrentSkipListMap<String, Integer> all_edges = new ConcurrentSkipListMap<>(); private final Logger log = Logger.getLogger(getClass()); private final DB ldb; private final Database sqldb; public static final class Edge extends Triple<String, String, String> { public Edge(String a, String b, String c) { super(a, b, c); } } public Edges(Database sqldb) { // Setup LevelDB Options options = new Options(); options.createIfMissing(true); try { ldb = factory.open(new File("data/edges-leveldb-depparse-lemma0"), options); } catch (IOException e) { // If we can't open the database we're toast. e.printStackTrace(); throw new RuntimeException(e); } // Setup the SQL db this.sqldb = sqldb; } /** * Write the contents to disk * @throws IOException */ public synchronized void flush() throws IOException { try (WriteBatch batch = ldb.createWriteBatch()) { // Take a snapshot ConcurrentSkipListMap<String, Integer> rels = all_edges; all_edges = new ConcurrentSkipListMap<>(); System.out.println("writing out " + rels.size() + " edges."); rels.forEach((key, value) -> { byte[] bkey = bytes(key); byte[] dbval = ldb.get(bkey); if (dbval != null) value += Integer.parseInt(asString(dbval)); batch.put(bkey, bytes(value.toString())); }); ldb.write(batch); } } @Override public synchronized void close() throws IOException { flush(); /* Now populate the relational database using the leveldb * This strange two-step process comes because: * 1) Leveldb is about 10x faster for batched writes * 2) Sqlite & Postgresql support concurrent readers * Otherwise, I would be thrilled to use either all the way. */ System.out.println("Pushing histograms into the main database."); try { sqldb.prep("DELETE FROM semantic_graph;").execute(); sqldb.prep("PRAGMA synchronous=OFF;").execute(); // source, tag, dest, count PreparedStatement graph = sqldb.prep("INSERT INTO semantic_graph VALUES (?, ?, ?, ?);"); DBIterator i = ldb.iterator(); i.seekToFirst(); // for() doesn't work Map.Entry<byte[],byte[]> entry; int queue=0; while ((entry = i.next()) != null) { String[] words = asString(entry.getKey()).split("\t"); try { graph.setString(1, words[0]); graph.setString(2, words[1]); graph.setString(3, words[2]); graph.setInt(4, Integer.parseInt(asString(entry.getValue()))); graph.addBatch(); if (++queue % 1000000 == 0) { System.out.println("Enqueued " + queue + " rows"); graph.executeBatch(); //sqldb.commit(); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } i.close(); System.out.println("SQL batch " + graph.executeBatch().length); } catch (SQLException e) { e.printStackTrace(); // Call it an IO exception throw new IOException(e); } } /** * Returns some new rules learned about a pronoun given its match * context from anaphora resolution. * * Specifically, we fill in the tags * * _animate(main mention, ___). * _gender(main mention, ___). * _number(main mention, ___). * * Basically, we can tell if it is animate, it's gender, and it's count. * @return A list of semantic notes. */ public static List<Edge> generatePronounEdges( SemanticGraph g, IndexedWord w, Phrase t) { List<Edge> edges = new ArrayList<>(); if (t.getUnpronoun().containsKey(w.index())) { // Use what we know about the pronoun Pair<CorefMention, CorefMention> mention_edge = t.getUnpronoun().get(w.index()); String main_noun = Trees.concatNoun(g, g.getNodeByIndex(mention_edge.second.headIndex)); Animacy is_animate = mention_edge.first.animacy; if (is_animate != Animacy.UNKNOWN) { edges.add(new Edge( main_noun, "_animate", is_animate.toString())); } Gender gender = mention_edge.first.gender; if (gender != Gender.UNKNOWN) { edges.add(new Edge( main_noun, "_gender", gender.toString())); } Dictionaries.Number number = mention_edge.first.number; if (number != Dictionaries.Number.UNKNOWN) { edges.add(new Edge( main_noun, "_number", number.toString())); } } return edges; } /** * Get the full text of the main mention of a particular word, if it has a * better mention. Otherwise just get it's segment of the tree using * concatNoun() * * @param phrase * @param w * @return */ public static String getMainMention( Phrase phrase, SemanticGraph graph, IndexedWord word) { Pair<CorefMention, CorefMention> linked_refs = phrase.getUnpronoun().get(word.index()); if (linked_refs == null) { return Trees.concatNoun(graph, word); } else { return linked_refs.second.mentionSpan; } } /** * Take a passage and find relevant semantic edges in it. * * 1) We know that handling these words one at a time yields very * boring results. We can connect "Donald" with "Duck," which is neat, but * we can't tell anything Donald does that other ducks do not because any * verb will be attached to "duck" but know nothing about Donald. * * So to solve this, we connect all the [nn, cd] links, and invert * "prep_of" links, prepending them. (This is the Trees.concatNoun method). * How many of these are worth joining is up for debate. But it has to be * consistent for indexing and later querying. Suppose we found "Donald * duck is a cool cartoon character." We'll get a higher level relation * like nsubj("Donald duck", "cartoon character") -> 1 * * 2) We find that a lot of the links are to pronouns. So in the links, * we replace the pronouns with their "representative mentions", using * CoreNLP's dcoref. * * 3) For good measure, we include a few fake tags to indicate other * tidbits we learned about relations. For example, we know gender * and animation based on the pronouns used with something. * * tag | meaning * ------------------- * _gender | he / she, if available * _animate | (he/she), it * _number | how many there are * _isa | lexical type * * * ## Possible investigation for later * We can also join relations. Where we have relations that look like this: * * tagname(words, words) * * We may want something that looks more like this: * * tagname [word tagname]* (words, words) * * That way we can bridge across common concepts and get to the more * interesting links they bridge. It sounds logical to me but I can't come * up with any convincing examples where it would actually be useful to * know, and I seem to find many transitive connections that are * irrelevant. * */ public static List<Edge> generateEdges(Phrase phrase) { List<Edge> edges = new ArrayList<>(); phrase.getGraphs().forEach(g -> { g.edgeIterable().forEach(e -> { if (e.getRelation().getShortName() != "nn") { // "nn" is garbled by the concatNoun() anyway // Dcoref on the source and target //edges.addAll(generatePronounEdges(g, e.getSource(), phrase)); // Find the main mention and optionally concat it String source = getMainMention(phrase, g, e.getSource()); String target = getMainMention(phrase, g, e.getTarget()); edges.add(new Edge( source, Trees.getSpecificPreps(e.getRelation()), target)); } }); }); // Also extract the types while we are at it. /*SupportCandidateType.extract(phrase).forEach(nt -> { edges.add(new Edge(nt.first, "_isa", nt.second)); });*/ return edges; } @Override /** * Stores the edges resulting from generateEdges into a database, * delimiting the keys by tabs, since spaces are taken by concatNoun(). */ public void accept(Passage t) { generateEdges(t).forEach(edge -> { all_edges.merge( edge.first + "\t" + edge.second + "\t" + edge.third, 1, (a, b) -> a+b); }); // Try to keep it from absorbing all available memory if (all_edges.size() > 1_000_000) try { flush(); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } @Test public void testGenerateEdges() { Phrase p = new Phrase("This is an example."); assertEquals(null, Edges.generateEdges(p)); fail("Not yet implemented"); } }