package de.uni_mannheim.utils.coreNLP; import java.util.List; import java.util.Properties; import java.util.Set; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.IndexedWord; import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphEdge; import edu.stanford.nlp.semgraph.SemanticGraphFactory; import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation; import edu.stanford.nlp.trees.EnglishGrammaticalRelations; import edu.stanford.nlp.trees.EnglishGrammaticalStructure; import edu.stanford.nlp.trees.GrammaticalRelation; import edu.stanford.nlp.trees.TreeGraphNode; import edu.stanford.nlp.trees.TypedDependency; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.Generics; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.objects.ObjectArrayList; import it.unimi.dsi.fastutil.objects.ObjectList; import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; import de.uni_mannheim.constant.CHARACTER; import de.uni_mannheim.constant.NE_TYPE; import de.uni_mannheim.constant.POS_TAG; import de.uni_mannheim.constant.SEPARATOR; import de.uni_mannheim.constant.WORDS; import de.uni_mannheim.utils.fastutils.FastUtil; /** * @author Kiril Gashteovski */ public class CoreNLPUtils { /** * Initializes and returns StanfordCoreNLP pipeline * @return StanfordCoreNLP pipeline */ public static StanfordCoreNLP StanfordDepNNParser(){ Properties props = new Properties(); props.put("language", "english"); props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, depparse"); props.put("depparse.model", "edu/stanford/nlp/models/parser/nndep/english_SD.gz"); props.put("parse.originalDependencies", true); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); return pipeline; } /** * Given a CoreNLP pipeline and an input sentence, generate dependency parse for the sentence and return * the SemanticGraph object as a result * @param pipeline - CoreNLP pipeline * @param snt - input sentence * @return dependency parse in SemanticGraph object */ public static SemanticGraph parse(StanfordCoreNLP pipeline, String snt) { Annotation document = new Annotation(snt); pipeline.annotate(document); //A CoreMap is a sentence with annotations List<CoreMap> sentences = document.get(SentencesAnnotation.class); SemanticGraph semanticGraph = null; for(CoreMap sentence: sentences) { semanticGraph = sentence.get(BasicDependenciesAnnotation.class); } return semanticGraphUniversalEnglishToEnglish(semanticGraph); } /** * Given a sequence of indexed words, return a string in the format "[POS1|NER1] [POS2|NER2] ... [POSn|NERn]" * If a given word has a NER type -> write the type, else -> write the POS tag. * When we have a verb, noun, adverb,...unify them under a "common" POS tag (e.g:VB for all verbs, NN for all nouns,etc.) * @param words: a list of indexed words * @return a string in the format "[POS1|NER1] [POS2|NER2] ... [POSn|NERn]" */ public static String wordsToPosMergedNerSeq(ObjectArrayList<IndexedWord> words){ StringBuffer sbSeq = new StringBuffer(); for (int i = 0; i < words.size(); i++){ if (words.get(i).ner().equals(NE_TYPE.NO_NER)){ if (isAdj(words.get(i).tag())) sbSeq.append(POS_TAG.JJ); else if (isAdverb(words.get(i).tag())) sbSeq.append(POS_TAG.RB); else if (isNoun(words.get(i).tag())) sbSeq.append(POS_TAG.NN); else if (isPronoun(words.get(i).tag())) sbSeq.append(POS_TAG.PR); else if (isVerb(words.get(i).tag())) sbSeq.append(POS_TAG.VB); else if (isWhPronoun(words.get(i).tag())) sbSeq.append(POS_TAG.WP); else sbSeq.append(words.get(i).tag()); sbSeq.append(SEPARATOR.SPACE); } else { sbSeq.append(words.get(i).ner()); sbSeq.append(SEPARATOR.SPACE); } } return sbSeq.toString().trim(); } /** * Given a sequence of indexed words, return a string in the format "[POS1] [POS2] ... [POSn]" * Same as "wordsToPosMergedNerSeq", the difference being that this function returns sequence of POS tags only * (ignores the NER types) * When we have a verb, noun, adverb,...unify them under a "common" POS tag (e.g:VB for all verbs, NN for all nouns,etc.) * @param words: a list of indexed words * @return a string in the format "[POS1] [POS2] ... [POSn]" */ public static String wordsToPosMergedSeq(ObjectArrayList<IndexedWord> words){ StringBuffer sbSeq = new StringBuffer(); for (int i = 0; i < words.size(); i++){ if (isAdj(words.get(i).tag())) sbSeq.append(POS_TAG.JJ); else if (isAdverb(words.get(i).tag())) sbSeq.append(POS_TAG.RB); else if (isNoun(words.get(i).tag())) sbSeq.append(POS_TAG.NN); else if (isPronoun(words.get(i).tag())) sbSeq.append(POS_TAG.PR); else if (isVerb(words.get(i).tag())) sbSeq.append(POS_TAG.VB); else if (isWhPronoun(words.get(i).tag())) sbSeq.append(POS_TAG.WP); else sbSeq.append(words.get(i).tag()); sbSeq.append(SEPARATOR.SPACE); } return sbSeq.toString().trim(); } /** * Given a list of indexed words and a semantic graph, return the root word of the word list. We assume that * all the words from the list can be found in the semantic graph sg, and the words in wordList are connected * within the semantic graph of the sentence - sg, and that they all share a common root. * @param sg: semantic graph of the sentence * @param wordList: the phrase from the sentence, represented as a list of words * @return the root word from the phrase */ public static IndexedWord getRootFromWordList(SemanticGraph sg, ObjectArrayList<IndexedWord> wordList){ // If the word list is consisted of one word - return that word if (wordList.size() == 1) return wordList.get(0); IndexedWord constituentRoot = null; // We only search as high as grandparents // constituentRoot = sg.getCommonAncestor(wordList.get(0), wordList.get(wordList.size()-1)); // If the commonancestor is deeper in the tree, the constituent root is the word with shortest distance // to the root of the sentence int minPathToRoot = Integer.MAX_VALUE; int pathToRoot = -1; for (int i = 0; i < wordList.size(); i++){ // The words with index -2 are the ones that cannot be found in the semantic graph (synthetic words) // This happens in the relations (see in clausie.ClauseDetector.java), and those words are the head words if (wordList.get(i).index() == -2){ return wordList.get(i); } pathToRoot = sg.getShortestDirectedPathNodes(sg.getFirstRoot(), wordList.get(i)).size(); if (pathToRoot < minPathToRoot){ //TODO: throws NPE sometimes minPathToRoot = pathToRoot; constituentRoot = wordList.get(i); } } return constituentRoot; } /** * Given a list of core maps (each core map beeing a word) and a semantic graph of the sentence, return the * root word of the word list (i.e. the one which is closest to the root of the semantic graph). We assume that * all the words from the list can be found in the semantic graph sg, and the words in wordList are connected * within the semantic graph of the sentence - sg, and that they all share a common root. * * @param sg: semantic graph of the sentence * @param wordList: the phrase from the sentence, represented as a list of words * @return the root word from the phrase */ public static IndexedWord getRootFromCoreMapWordList(SemanticGraph sg, List<CoreMap> wordList){ ObjectArrayList<IndexedWord> indWordList = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(wordList); return getRootFromWordList(sg, indWordList); } /** * Given a list of indexed words and a semantic graph, return the root word of the word list. We assume that * all the words from the list can be found in the semantic graph sg, and the words in wordList are connected * within the semantic graph of the sentence - sg. If there are multiple words which have the shortest distance * to the sentence root, then choose the most-left verb. * * @param sg: sentence semantic graph * @param wordsList: list of words from which to choose "root" from * @return */ public static IndexedWord getVerbRootFromWordList(SemanticGraph sg, ObjectArrayList<IndexedWord> wordList){ IndexedWord constituentRoot = null; IntArrayList shortestDirectedPathDistances = new IntArrayList(); int minPathToRoot = Integer.MAX_VALUE; int pathToRoot = -1; for (int i = 0; i < wordList.size(); i++){ // The words with index -2 are the ones that cannot be found in the semantic graph (synthetic words) // This happens in the relations (see in clausie.ClauseDetector.java), and those words are the head words if (wordList.get(i).index() == -2){ return wordList.get(i); } pathToRoot = sg.getShortestDirectedPathNodes(sg.getFirstRoot(), wordList.get(i)).size(); if (pathToRoot < minPathToRoot){ minPathToRoot = pathToRoot; } shortestDirectedPathDistances.add(pathToRoot); } // If the shortest path is one element, return it, else, return the first verb containing that index if (FastUtil.countElement(minPathToRoot, shortestDirectedPathDistances) == 1) return wordList.get(shortestDirectedPathDistances.indexOf(minPathToRoot)); else { for (int i = 0; i < shortestDirectedPathDistances.size(); i++){ if (shortestDirectedPathDistances.getInt(i) == minPathToRoot){ if (isVerb(wordList.get(i).tag())){ constituentRoot = wordList.get(i); break; } } } } return constituentRoot; } /** * Given a semantic graph of a whole sentence (sg) and a "local root" node, get the subgraph from 'sg' which has * 'localRoot' as a root. * @param sg: semantic graph of the whole sentence * @param localRoot: the root of the subgraph * @return semantic graph object which is the subgraph from 'sg' */ public static SemanticGraph getSubgraph(SemanticGraph sg, IndexedWord localRoot){ ObjectArrayList<TypedDependency> subGraphDependencies = getSubgraphTypedDependencies(sg, localRoot, new ObjectArrayList<TypedDependency>()); TreeGraphNode rootTGN = new TreeGraphNode(new CoreLabel(localRoot)); EnglishGrammaticalStructure gs = new EnglishGrammaticalStructure(subGraphDependencies, rootTGN); return SemanticGraphFactory.generateUncollapsedDependencies(gs); } private static ObjectArrayList<TypedDependency> getSubgraphTypedDependencies(SemanticGraph sg, IndexedWord parent, ObjectArrayList<TypedDependency> tds){ Set<IndexedWord> children = sg.getChildren(parent); for (IndexedWord child: children){ GrammaticalRelation gRel = sg.getEdge(parent, child).getRelation(); tds.add(new TypedDependency(gRel, parent, child)); if (sg.hasChildren(child)) getSubgraphTypedDependencies(sg, child, tds); } return tds; } /** * Given the sentence semantic graph and a list of words, get a subgraph containing just the words in the list * 'words'. Each typed dependency has each word from the list as a governor. * @param sg: sentence semantic graph * @param words: list of words which should contain the semantic graph * @return subgraph containing the words from 'words' * TODO: this needs to be double checked! In some cases we have weird graphs, where there are words missing. * E.g. the sentence 120 from NYT "The International ... ". Try this for getting the subgraph when the source is * detected. */ public static SemanticGraph getSubgraphFromWords(SemanticGraph sg, ObjectArrayList<IndexedWord> words){ // Determining the root int minInd = Integer.MAX_VALUE; IndexedWord root = new IndexedWord(); for (IndexedWord w: words){ if (w.index() < minInd){ minInd = w.index(); root = w; } } // Getting the typed dependency ObjectArrayList<TypedDependency> tds = new ObjectArrayList<TypedDependency>(); for (TypedDependency td: sg.typedDependencies()){ if (words.contains(td.gov()) && words.contains(td.dep())) tds.add(td); } // Create the semantic graph TreeGraphNode rootTGN = new TreeGraphNode(new CoreLabel(root)); EnglishGrammaticalStructure gs = new EnglishGrammaticalStructure(tds, rootTGN); SemanticGraph phraseSg = SemanticGraphFactory.generateUncollapsedDependencies(gs); return phraseSg; } /** * Given a semantic graph and a node which is part of the graph, return the constituent subgraph which has as root * constituentRoot * @param sg: the semantic graph from which the constituent sub-graph should be derived * @param constituentRoot: the root node for the constituent * @return the subgraph with constituentRoot as a root */ public static SemanticGraph getSubgraph(SemanticGraph sg, IndexedWord constituentRoot, ObjectArrayList<IndexedWord> words){ int maxPathLength = -1; int pathLength; for (IndexedWord word: words){ pathLength = sg.getShortestDirectedPathEdges(sg.getFirstRoot(), word).size(); if (pathLength > maxPathLength) maxPathLength = pathLength; } ObjectArrayList<TypedDependency> tds = new ObjectArrayList<TypedDependency>(); return getSubgraph(tds, sg, constituentRoot, null, maxPathLength, words); } private static SemanticGraph getSubgraph(ObjectArrayList<TypedDependency> tds, SemanticGraph sg, IndexedWord parent, SemanticGraphEdge e, int maxPathLength, ObjectArrayList<IndexedWord> words){ Set<IndexedWord> children = sg.getChildren(parent); for (IndexedWord child: children){ if (((sg.getShortestDirectedPathEdges(sg.getFirstRoot(), child)).size() <= maxPathLength) && words.contains(child)){ e = sg.getEdge(parent, child); tds.add(new TypedDependency(e.getRelation(), parent, child)); if (sg.hasChildren(child)) getSubgraph(tds, sg, child, e, maxPathLength, words); } // else break; } TreeGraphNode rootTGN = new TreeGraphNode(new CoreLabel(parent)); EnglishGrammaticalStructure gs = new EnglishGrammaticalStructure(tds, rootTGN); return SemanticGraphFactory.generateUncollapsedDependencies(gs); } /** * Given a pivot word and a list of words, return a list of "chained words" (i.e. words with same tags, or NERs * to the left and right of the pivot word in the list). * @param pivot: the pivot word being examined * @param words: list of words from which the pivot word is part of * @return */ public static ObjectArrayList<IndexedWord> getChainedWords(IndexedWord pivot, ObjectArrayList<IndexedWord> words){ // TODO: double check how we generate chained words (considering the NERs) // In case the pivot word is not in the list - return empty list if (words.indexOf(pivot) == -1) return new ObjectArrayList<>(); ObjectArrayList<IndexedWord> chainedWords = new ObjectArrayList<>(); if (!pivot.ner().equals(NE_TYPE.NO_NER)) chainedWords = getChainedNERs(words, words.indexOf(pivot)); else if (CoreNLPUtils.isNoun(pivot.tag())) chainedWords = getChainedNouns(words, words.indexOf(pivot)); else chainedWords = getChainedTagNoNER(words, words.indexOf(pivot)); return chainedWords; } /** * Given a sequence of words and a pivot-word index, return the chained nouns from the left and from the right * of the pivot word. * @param sequence: a sequence of words (list of IndexedWord) * @param wordInd: the index of the pivot word * @return a list of chained nouns to the left and the right of the pivot word (the pivot word is included) */ public static ObjectArrayList<IndexedWord> getChainedNouns(ObjectArrayList<IndexedWord> sequence, int wordInd){ IntArrayList chainedNounsInd = new IntArrayList(); // Get the chained nouns from left and right IntArrayList chainedNounsLeft = getChainedNounsFromLeft(sequence, chainedNounsInd.clone(), wordInd); IntArrayList chainedNounsRight = getChainedNounsFromRight(sequence, chainedNounsInd.clone(), wordInd); // Add all the words to the chained nouns chainedNounsInd.addAll(chainedNounsLeft); chainedNounsInd.add(wordInd); chainedNounsInd.addAll(chainedNounsRight); // IndexedWord chained nouns ObjectArrayList<IndexedWord> iChainedNouns = new ObjectArrayList<IndexedWord>(); for (int i: FastUtil.sort(chainedNounsInd)){ iChainedNouns.add(sequence.get(i)); } return iChainedNouns; } /** * Given a sequence of indexed words and a noun, get all the nouns 'chained' to the word from the left. * @param sequence: a list of words * @param wordInd: the word index from where the search starts * @return a list of nouns which precede 'word' */ private static IntArrayList getChainedNounsFromLeft(ObjectArrayList<IndexedWord> sequence, IntArrayList chainedNouns, int wordInd){ // If the word is the leftiest word or it's not a noun - return if (wordInd > 0 && isNoun(sequence.get(wordInd-1).tag())){ chainedNouns.add(wordInd-1); getChainedNounsFromLeft(sequence, chainedNouns, wordInd-1); } return chainedNouns; } /** * Given a sequence of indexed words and a noun, get all the nouns 'chained' to the word from the right. * @param sequence: a list of words * @param wordInd: the word index from where the search starts * @return a list of nouns which precede 'word' */ private static IntArrayList getChainedNounsFromRight(ObjectArrayList<IndexedWord> sequence, IntArrayList chainedNouns, int wordInd){ // If the word is the rightiest word or it's not a noun - return if (wordInd < sequence.size()-1 && isNoun(sequence.get(wordInd+1).tag())){ chainedNouns.add(wordInd + 1); getChainedNounsFromRight(sequence, chainedNouns, wordInd + 1); } return chainedNouns; } /** * Given a sequence of words and a pivot-word index, return the chained verbs from the left and from the right * of the pivot word. * @param sequence: a sequence of words (list of IndexedWord) * @param wordInd: the index of the pivot word * @return a list of chained verbs to the left and the right of the pivot word (the pivot word is included) */ public static ObjectArrayList<IndexedWord> getChainedVerbs(ObjectArrayList<IndexedWord> sequence, int wordInd){ IntArrayList chainedVerbsInd = new IntArrayList(); // Get the chained verbs from left and right IntArrayList chainedVerbsLeft = getChainedVerbsFromLeft(sequence, chainedVerbsInd.clone(), wordInd); IntArrayList chainedVerbsRight = getChainedVerbsFromRight(sequence, chainedVerbsInd.clone(), wordInd); // Add all the words to the chained verbs chainedVerbsInd.addAll(chainedVerbsLeft); chainedVerbsInd.add(wordInd); chainedVerbsInd.addAll(chainedVerbsRight); // IndexedWord chained verbs ObjectArrayList<IndexedWord> iChainedVerbs = new ObjectArrayList<IndexedWord>(); for (int i: FastUtil.sort(chainedVerbsInd)){ iChainedVerbs.add(sequence.get(i)); } return iChainedVerbs; } /** * Given a sequence of indexed words and a verb, get all the verbs 'chained' to the word from the left. * @param sequence: a list of words * @param wordInd: the word index from where the search starts * @return a list of verbs which precede 'word' */ private static IntArrayList getChainedVerbsFromLeft(ObjectArrayList<IndexedWord> sequence, IntArrayList chainedVerbs, int wordInd){ // If the word is the leftiest word or it's not a verb - return if (wordInd > 0 && isVerb(sequence.get(wordInd - 1).tag())){ chainedVerbs.add(wordInd-1); getChainedVerbsFromLeft(sequence, chainedVerbs, wordInd-1); } return chainedVerbs; } /** * Given a sequence of indexed words and a verb, get all the verbs 'chained' to the word from the right. * @param sequence: a list of words * @param wordInd: the word index from where the search starts * @return a list of verbs which precede 'word' */ private static IntArrayList getChainedVerbsFromRight(ObjectArrayList<IndexedWord> sequence, IntArrayList chainedVerbs, int wordInd){ // If the word is the rightiest word or it's not a verb - return if (wordInd < sequence.size()-1 && isVerb(sequence.get(wordInd + 1).tag())){ chainedVerbs.add(wordInd + 1); getChainedVerbsFromRight(sequence, chainedVerbs, wordInd + 1); } return chainedVerbs; } /** * Given a sequence of words and a pivot-word index, return the "chained words" from the left and from the right * of the pivot word. "Chained words" are a list of words, which all of them share the same POS tag and have no * NE types. * * @param sequence: a sequence of words (list of IndexedWord) * @param wordInd: the index of the pivot word * @return a list of chained words to the left and the right of the pivot word (the pivot word is included) */ public static ObjectArrayList<IndexedWord> getChainedTagNoNER(ObjectArrayList<IndexedWord> sequence, int wordInd){ IntArrayList chainedPosWordsInd = new IntArrayList(); // Get the chained nouns from left and right IntArrayList chainedPosWordsLeft = getChainedTagsFromLeftNoNER(sequence, chainedPosWordsInd.clone(), wordInd); IntArrayList chainedPosWordsRight = getChainedTagsFromRightNoNER(sequence, chainedPosWordsInd.clone(), wordInd); // Add all the words to the chained nouns chainedPosWordsInd.addAll(chainedPosWordsLeft); chainedPosWordsInd.add(wordInd); chainedPosWordsInd.addAll(chainedPosWordsRight); // IndexedWord chained nouns ObjectArrayList<IndexedWord> iChainedNouns = new ObjectArrayList<IndexedWord>(); for (int i: FastUtil.sort(chainedPosWordsInd)){ iChainedNouns.add(sequence.get(i)); } return iChainedNouns; } /** * Given a sequence of indexed words and a pivot, get all the words 'chained' to the word from the left (i.e. having * the same POS tag as the pivot word). Also, the chained words should not have NE types. * * @param sequence: a list of words * @param wordInd: the word index from where the search starts * @return a list of words which precede 'word' */ private static IntArrayList getChainedTagsFromLeftNoNER(ObjectArrayList<IndexedWord> sequence, IntArrayList chainedPosWords, int wordInd){ // If the word is the leftiest word or it's not with the same POS tag - return if (wordInd > 0 && sequence.get(wordInd).tag().equals(sequence.get(wordInd-1).tag()) && sequence.get(wordInd-1).ner().equals(NE_TYPE.NO_NER)){ chainedPosWords.add(wordInd-1); getChainedTagsFromLeftNoNER(sequence, chainedPosWords, wordInd-1); } return chainedPosWords; } /** * Given a sequence of indexed words and a noun, get all the nouns 'chained' to the word from the right. * Also, the chained nouns should not have NE types. * @param sequence: a list of words * @param wordInd: the word index from where the search starts * @return a list of nouns which preced 'word' */ private static IntArrayList getChainedTagsFromRightNoNER(ObjectArrayList<IndexedWord> sequence, IntArrayList chainedNouns, int wordInd){ // If the word is the rightiest word or it's not a noun - return if (wordInd < sequence.size()-1 && sequence.get(wordInd).tag().equals(sequence.get(wordInd+1).tag()) && sequence.get(wordInd+1).ner().equals(NE_TYPE.NO_NER)){ chainedNouns.add(wordInd + 1); getChainedTagsFromRightNoNER(sequence, chainedNouns, wordInd + 1); } return chainedNouns; } /** * Given a sequence of words and a pivot-word index, return the chained words of same NER, both from the left and * from the right of the pivot word (it is assumed that the pivot word is also NER). * @param sequence: a sequence of words (list of IndexedWord) * @param wordInd: the index of the pivot word * @return a list of chained nouns to the left and the right of the pivot word (the pivot word is included) */ public static ObjectArrayList<IndexedWord> getChainedNERs(ObjectArrayList<IndexedWord> sequence, int wordInd){ IntArrayList chainedNounsInd = new IntArrayList(); // Get the chained nouns from left and right IntArrayList chainedNounsLeft = getChainedNERsFromLeft(sequence, chainedNounsInd.clone(), wordInd, sequence.get(wordInd).ner()); IntArrayList chainedNounsRight = getChainedNERsFromRight(sequence, chainedNounsInd.clone(), wordInd, sequence.get(wordInd).ner()); // Add all the words to the chained nouns chainedNounsInd.addAll(chainedNounsLeft); chainedNounsInd.add(wordInd); chainedNounsInd.addAll(chainedNounsRight); // IndexedWord chained nouns ObjectArrayList<IndexedWord> iChainedNouns = new ObjectArrayList<IndexedWord>(); for (int i: FastUtil.sort(chainedNounsInd)){ iChainedNouns.add(sequence.get(i)); } return iChainedNouns; } /** * Given a sequence of indexed words and a NER word, get all the NERs 'chained' to the word from the left (they all * must have the same NER). * @param sequence: a list of words * @param wordInd: the word index from where the search starts (the pivot word) * @param ner: the NE type of the pivot word * @return a list of nouns which preced 'word' */ private static IntArrayList getChainedNERsFromLeft(ObjectArrayList<IndexedWord> sequence, IntArrayList chainedNERs, int wordInd, String ner){ // If the word is the leftiest word or it's not a noun - return if (wordInd > 0 && sequence.get(wordInd-1).ner().equals(ner)){ chainedNERs.add(wordInd-1); getChainedNERsFromLeft(sequence, chainedNERs, wordInd-1, ner); } return chainedNERs; } /** * Given a sequence of indexed words and a NER word, get all the NERs 'chained' to the word from the right (they all * must have the same NER). * @param sequence: a list of words * @param wordInd: the word index from where the search starts (the pivot word) * @param ner: the NE type of the pivot word * @return a list of nouns which preced 'word' */ private static IntArrayList getChainedNERsFromRight(ObjectArrayList<IndexedWord> sequence, IntArrayList chainedNERs, int wordInd, String ner){ // If the word is the rightiest word or it's not a noun - return if (wordInd < sequence.size()-1 && sequence.get(wordInd+1).ner().equals(ner)){ chainedNERs.add(wordInd + 1); getChainedNERsFromRight(sequence, chainedNERs, wordInd + 1, ner); } return chainedNERs; } /** * Checks if a word is some kind of a verb (i.e. if it has POS tag: VB, VBD, VBG, VBN, VBP or VBZ) * @param word: String the POS tag of the word * @return true if it is a verb, false otherwise */ public static boolean isVerb(String pos){ return pos.equals(POS_TAG.VB) || pos.equals(POS_TAG.VBD) || pos.equals(POS_TAG.VBG) || pos.equals(POS_TAG.VBN) || pos.equals(POS_TAG.VBP) || pos.equals(POS_TAG.VBZ); } /** * Checks if a word is some kind of a noun (i.e. if it has POS tag: NN, NNS, NNP or NNPS) * @param word: String the POS tag of the word * @return true if it is a noun, false otherwise */ public static boolean isNoun(String pos){ return pos.equals(POS_TAG.NN) ||pos.equals(POS_TAG.NNS) || pos.equals(POS_TAG.NNP) || pos.equals(POS_TAG.NNPS); } /** * Checks if a word is some kind of an adjective (i.e. if it has POS tag: JJ, JJR or JJS) * @param word: String the POS tag of the word * @return true if it is an adjective, false otherwise */ public static boolean isAdj(String pos){ return pos.equals(POS_TAG.JJ) || pos.equals(POS_TAG.JJR) || pos.equals(POS_TAG.JJS); } /** * Checks if a word is some kind of an adverb (i.e. if it has POS tag: RB, RBR or RBS) * @param word: String the POS tag of the word * @return true if it is an adverb, false otherwise */ public static boolean isAdverb(String pos){ return pos.equals(POS_TAG.RB) || pos.equals(POS_TAG.RBR) || pos.equals(POS_TAG.RBS); } /** * Checks if a word is some kind of a pronoun (i.e. if it has POS tag: PRP or PRP$) * @param word: String the POS tag of the word * @return true if it is a pronoun, false otherwise */ public static boolean isPronoun(String pos){ return pos.equals(POS_TAG.PRP) || pos.equals(POS_TAG.PRP_P); } /** * Checks if a word is some kind of a wh-pronoun (i.e. if it has POS tag: WP or WP$) * @param word: String the POS tag of the word * @return true if it is a wh-pronoun, false otherwise */ public static boolean isWhPronoun(String pos){ return pos.equals(POS_TAG.WP) || pos.equals(POS_TAG.WP_P); } /** * Given a semantic graph, recreate the sentence and return it as a string. * @param sg: the semantic graph of the sentence * @return the sentence (as a string) */ public static String semGraphToSentence(SemanticGraph sg){ StringBuffer sbSentence = new StringBuffer(); for (int i = 0; i < sg.size(); i++){ sbSentence.append(sg.getNodeByIndex(i + 1).word() + SEPARATOR.SPACE); } return sbSentence.toString(); } /** * Given a list of words, check if there is a verb in the list * @param words: list of indexed words * @return true -> if there is a verb in the list of words, false -> otherwise */ public static boolean verbInList(ObjectArrayList<IndexedWord> words){ for (IndexedWord word: words){ if (isVerb(word.tag())) return true; } return false; } /** * Given a list of words, check if there is a noun in the list * @param words: list of indexed words * @return true -> if there is a noun in the list of words, false -> otherwise */ public static boolean nounInList(ObjectArrayList<IndexedWord> words){ for (IndexedWord word: words){ if (isNoun(word.tag())) return true; } return false; } /** * Given a list of words, return the phrase of words as a whole string, separated with empty space * @param words: list of words (e.g. [Kiril, lives, in, Mannheim]) * @return string of the list of words separated by space (e.g. it returns "Kiril lives in Mannheim") */ public static String listOfWordsToWordsString(ObjectArrayList<IndexedWord> words){ StringBuffer sbSentence = new StringBuffer(); for (int i = 0; i < words.size(); i++){ sbSentence.append(words.get(i).word()); sbSentence.append(SEPARATOR.SPACE); } return sbSentence.toString().trim(); } /** * Given a list of indexed words, return a list of strings, which contain the words * @param words: list of indexed words * @return list of strings (the words from 'words') */ public static ObjectArrayList<String> listOfWordsToWordsStringList(ObjectArrayList<IndexedWord> words){ ObjectArrayList<String> stWords = new ObjectArrayList<String>(); for (int i = 0; i < words.size(); i++){ stWords.add(words.get(i).word()); } return stWords; } /** * Given a list of words (as core maps), return the phrase of words as a whole string, separated with empty space * @param words: list of words (e.g. [She, is, pretty]) * @return string of the list of words separated by space (e.g. it returns "She is pretty") */ public static String listOfCoreMapWordsToWordString(List<CoreMap> cmList){ StringBuffer sbSentence = new StringBuffer(); CoreLabel cl; for (CoreMap cm: cmList){ cl = new CoreLabel(cm); sbSentence.append(cl.word().toLowerCase()); sbSentence.append(SEPARATOR.SPACE); } return sbSentence.toString().trim(); } /** * Given a list of words (as core maps), return the phrase of words as a list of indexed word objects * @param words: list of words (e.g. [She, is, pretty]) * @return list of words (as IndexedWord) */ public static ObjectArrayList<IndexedWord> listOfCoreMapWordsToIndexedWordList(List<CoreMap> cmList){ ObjectArrayList<IndexedWord> wordList = new ObjectArrayList<>(); for (CoreMap cm: cmList){ wordList.add(new IndexedWord(new CoreLabel(cm))); } return wordList; } /** * */ public static ObjectArrayList<SemanticGraphEdge> listOfIndexedWordsToParentEdges(SemanticGraph semanticGraph, ObjectArrayList<IndexedWord> wordList) { ObjectArrayList<SemanticGraphEdge> result = new ObjectArrayList<>(); for (IndexedWord word: wordList) { if (!semanticGraph.containsVertex(word)) continue; SemanticGraphEdge edge = semanticGraph.getEdge(semanticGraph.getParent(word), word); result.add(edge); } return result; } /** * */ public static ObjectArrayList<SemanticGraphEdge> listOfIndexedWordsToParentEdges(SemanticGraph semanticGraph, ObjectOpenHashSet<IndexedWord> wordList) { ObjectArrayList<SemanticGraphEdge> result = new ObjectArrayList<>(); for (IndexedWord word: wordList) { SemanticGraphEdge edge = semanticGraph.getEdge(semanticGraph.getParent(word), word); result.add(edge); } return result; } public static ObjectArrayList<SemanticGraphEdge> listOfCoreMapWordsToParentEdges(SemanticGraph semanticGraph, List<CoreMap> cmList) { return listOfIndexedWordsToParentEdges(semanticGraph, listOfCoreMapWordsToIndexedWordList(cmList)); } /** * Given a list of words, return the phrase of words' lemmas as a whole string, separated with empty space * @param words: list of words (e.g. [She, is, pretty]) * @return string of the list of words separated by space (e.g. it returns "She be pretty") */ public static String listOfWordsToLemmaString(ObjectArrayList<IndexedWord> words){ StringBuffer sbSentence = new StringBuffer(); for (int i = 0; i < words.size(); i++){ sbSentence.append(words.get(i).lemma()); sbSentence.append(SEPARATOR.SPACE); } return sbSentence.toString().trim(); } /** * Given a list of indexed words 'words', return an integer list of indices of the words * @param words: list of indexed words * @return list of indices of the words */ public static IntArrayList listOfWordsToIndexList(ObjectArrayList<IndexedWord> words){ IntArrayList indices = new IntArrayList(); for (IndexedWord word: words){ indices.add(word.index()); } return indices; } /** * Given a list of words (as core maps), return the phrase of words' lemmas as a whole string, separated with empty space * @param words: list of words (e.g. [She, is, pretty]) * @return string of the list of words separated by space (e.g. it returns "She be pretty") */ public static String listOfCoreMapWordsToLemmaString(List<CoreMap> cmList){ StringBuffer sbSentence = new StringBuffer(); CoreLabel cl; for (CoreMap cm: cmList){ cl = new CoreLabel(cm); sbSentence.append(cl.lemma().toLowerCase()); sbSentence.append(SEPARATOR.SPACE); } return sbSentence.toString().trim(); } /** * Given a list of words (as core maps), return the phrase of words' lemmas as a list of strings, which are the lemmas * of each word, lowercased. * @param cmList: list of core maps * @return list of strings, the lemmas of each word, lowercased */ public static ObjectArrayList<String> listOfCoreMapWordsToLemmaStringList(List<CoreMap> cmList){ CoreLabel cl; ObjectArrayList<String> lemmaList = new ObjectArrayList<>(); for (CoreMap cm: cmList){ cl = new CoreLabel(cm); lemmaList.add(cl.lemma().toLowerCase()); } return lemmaList; } /** * Given a list of indexed words, return a list of strings, which contain the lemmas of the words * @param words: list of indexed words * @return list of strings (the lemmas of the words from 'words') */ public static ObjectArrayList<String> listOfWordsToLemmasStringList(ObjectArrayList<IndexedWord> words){ ObjectArrayList<String> stWords = new ObjectArrayList<String>(); for (int i = 0; i < words.size(); i++){ stWords.add(words.get(i).lemma()); } return stWords; } /** * Given a starting vertice, grabs the subtree encapsulated by portion of the semantic graph, excluding * a given edge. A tabu list is maintained, in order to deal with cyclical relations (such as between a * rcmod (relative clause) and its nsubj). * * @param vertice: starting vertice from which the sub-tree needs to be returned * @param sg: semantic graph of the sentence * @param excludedEdge: excluded edge * * Copied from: https://github.com/stanfordnlp/CoreNLP/blob/master/src/edu/stanford/nlp/semgraph/SemanticGraphUtils.java */ public static Set<SemanticGraphEdge> getSubTreeEdges(IndexedWord vertice, SemanticGraph sg, SemanticGraphEdge excludedEdge) { Set<SemanticGraphEdge> tabu = Generics.newHashSet(); tabu.add(excludedEdge); getSubTreeEdgesHelper(vertice, sg, tabu); tabu.remove(excludedEdge); // Do not want this in the returned edges return tabu; } private static void getSubTreeEdgesHelper(IndexedWord vertice, SemanticGraph sg, Set<SemanticGraphEdge> tabuEdges) { for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(vertice)) { if (!tabuEdges.contains(edge)) { IndexedWord dep = edge.getDependent(); tabuEdges.add(edge); getSubTreeEdgesHelper(dep, sg, tabuEdges); } } } /** * Given a starting vertice, grabs the subtree encapsulated by portion of the semantic graph, excluding * a given edge. Returns the nodes of the subtree sorted by their indexes in the sentence. * * @param vertice: starting vertice from which the sub-tree needs to be returned * @param sg: semantic graph of the sentence * @param excludedEdge: excluded edge * @return list of IndexedWord objects */ public static ObjectArrayList<IndexedWord> getSubTreeSortedNodes(IndexedWord vertice, SemanticGraph sg, SemanticGraphEdge excludedEdge) { Set<SemanticGraphEdge> subTreeEdges = getSubTreeEdges(vertice, sg, null); return getSortedWordsFromListOfEdges(subTreeEdges); } /** * Get the semgrex pattern for "{} < {idx:word.index()}", where we do the matching with the index of the word * @param word: {} is the dependent of a relation reln with 'word' * @return semgrex pattern string */ public static String getSemgrexDependentOf(IndexedWord word){ StringBuffer sb = new StringBuffer(); sb.append(CHARACTER.LBRACE); sb.append(CHARACTER.RBRACE); sb.append(SEPARATOR.SPACE); sb.append(CHARACTER.LESS); sb.append(SEPARATOR.SPACE); sb.append(CHARACTER.LBRACE); sb.append(WORDS.idx); sb.append(CHARACTER.COLON); sb.append(word.index()); sb.append(CHARACTER.RBRACE); return sb.toString().trim(); } /** * Given a fast util object list of indexed words, return object array list of the same object list * @param: oWordList: list of indexed word (object list) * @return: an object array list object of oWordList */ public static ObjectArrayList<IndexedWord> objectListToObjectArrayList(ObjectList<IndexedWord> oWordList){ ObjectArrayList<IndexedWord> oaWordList = new ObjectArrayList<>(); for (IndexedWord w: oWordList){ oaWordList.add(w); } return oaWordList.clone(); } /** * Given a list of edges, get all the indexed words from them (their nodes) and return them sorted by index * @param edges: list of edges * @return list of indexed words sorted by index */ public static ObjectArrayList<IndexedWord> getSortedWordsFromListOfEdges(Set<SemanticGraphEdge> edges){ ObjectOpenHashSet<IndexedWord> wordsSet = new ObjectOpenHashSet<>(); for (SemanticGraphEdge e: edges){ wordsSet.add(e.getGovernor()); wordsSet.add(e.getDependent()); } return getSortedWordsFromSetOfWords(wordsSet); } /** * Given a set of indexed words, sort them by sentence index, and return them as a list of indexed words * @param wordSet: set of words to be sorted by sentence index * @return list of indexed words (wordSet sorted by sentence index) */ public static ObjectArrayList<IndexedWord> getSortedWordsFromSetOfWords(Set<IndexedWord> wordSet){ ObjectArrayList<IndexedWord> sortedWords = new ObjectArrayList<>(); IntArrayList wordsIndices = new IntArrayList(); for (IndexedWord w: wordSet){ wordsIndices.add(w.index()); } int [] sorted = FastUtil.sort(wordsIndices); for (int x: sorted){ for (IndexedWord w: wordSet){ if (w.index() == x){ sortedWords.add(w); } } } return sortedWords; } /** * Given a list of indexed words, sort them by sentence index, and return them as a list of indexed words * @param wordSet: set of words to be sorted by sentence index * @return list of indexed words (wordSet sorted by sentence index) */ public static ObjectArrayList<IndexedWord> getSortedListOfWords(ObjectArrayList<IndexedWord> wordList){ ObjectArrayList<IndexedWord> sortedWords = new ObjectArrayList<>(); IntArrayList wordsIndices = new IntArrayList(); for (IndexedWord w: wordList){ wordsIndices.add(w.index()); } int [] sorted = FastUtil.sort(wordsIndices); for (int x: sorted){ for (IndexedWord w: wordList){ if (w.index() == x){ sortedWords.add(w); } } } return sortedWords; } /** * Get the number of prepositions in the list of words (TO is also counted) * @param wList: list of words * @return number of prepositions in the list */ public static int countPrepositionsInList(ObjectArrayList<IndexedWord> wList){ int prepCount = 0; for (IndexedWord w: wList){ if (w.tag().equals(POS_TAG.IN) || w.tag().equals(POS_TAG.TO)) prepCount++; } return prepCount; } public static ObjectArrayList<CoreLabel> getCoreLabelListFromCoreMapList(ObjectArrayList<CoreMap> coreMapList){ ObjectArrayList<CoreLabel> coreLabelList = new ObjectArrayList<>(); for (CoreMap cm: coreMapList){ coreLabelList.add(new CoreLabel(cm)); } return coreLabelList; } public static ObjectArrayList<CoreLabel> getCoreLabelListFromIndexedWordList(ObjectArrayList<IndexedWord> words) { ObjectArrayList<CoreLabel> coreLabelList = new ObjectArrayList<>(); for (IndexedWord w: words) { coreLabelList.add(new CoreLabel(w)); } return coreLabelList; } public static ObjectArrayList<IndexedWord> getWordListFromCoreMapList(List<CoreMap> coreMapList){ ObjectArrayList<IndexedWord> coreLabelList = new ObjectArrayList<>(); for (CoreMap cm: coreMapList){ coreLabelList.add(new IndexedWord(new CoreLabel(cm))); } return coreLabelList; } public static ObjectOpenHashSet<IndexedWord> getWordSetFromCoreMapList(List<CoreMap> coreMapList){ ObjectOpenHashSet<IndexedWord> coreLabelSet = new ObjectOpenHashSet<>(); for (CoreMap cm: coreMapList){ coreLabelSet.add(new IndexedWord(new CoreLabel(cm))); } return coreLabelSet; } public static SemanticGraph semanticGraphUniversalEnglishToEnglish(SemanticGraph semanticGraph) { for (SemanticGraphEdge edge: semanticGraph.edgeListSorted()) { GrammaticalRelation oldRel = edge.getRelation(); edge.setRelation(EnglishGrammaticalRelations.shortNameToGRel.get(oldRel.getShortName())); } return semanticGraph; } public static boolean isOneNER(ObjectArrayList<IndexedWord> wordList) { String firstType = wordList.get(0).ner(); if (firstType.equals(NE_TYPE.NO_NER)) { return false; } boolean isOneNER = true; for (IndexedWord w: wordList) { if (!w.ner().equals(firstType)) { isOneNER = false; } } return isOneNER; } /** * Given a list of words, check if there is a verb in the list * @param words: list of indexed words * @return true -> if there is a verb in the list of words, false -> otherwise */ public static boolean hasVerb(ObjectArrayList<IndexedWord> words){ for (IndexedWord word: words){ if (isVerb(word.tag())) { return true; } } return false; } }