#!/usr/bin/python # -*- coding: utf-8 -*- import codecs import re, igraph from pattern.text import parsetree import numpy as np import networkx as nx from random import shuffle from sklearn.metrics.pairwise import linear_kernel from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer from pulp import *#LpAffineExpression, LpConstraint, LpConstraintVar, lpSum import Stemmer from sklearn.metrics.pairwise import cosine_similarity from absummarizer import summarizer english_stemmer = Stemmer.Stemmer('en') class StemmedTfidfVectorizer(TfidfVectorizer): def build_analyzer(self): analyzer = super(TfidfVectorizer, self).build_analyzer() return lambda doc: english_stemmer.stemWords(analyzer(doc)) #import matplotlib.pyplot as plt #~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ # [ Class word_graph #~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ class word_graph: """ The word_graph class constructs a word graph from the set of sentences given as input. The set of sentences is a list of strings, sentences are tokenized and words are POS-tagged (e.g. ``"Saturn/NNP is/VBZ the/DT sixth/JJ planet/NN from/IN the/DT Sun/NNP in/IN the/DT Solar/NNP System/NNP"``). Four optional parameters can be specified: - nb_words is is the minimal number of words for the best compression (default value is 8). - lang is the language parameter and is used for selecting the correct stopwords list (default is "en" for english, stopword lists are localized in /resources/ directory). - punct_tag is the punctuation mark tag used during graph construction (default is PUNCT). """ #-T-----------------------------------------------------------------------T- def __init__(self, sentence_list, lang="en", punct_tag="PUNCT"): self.sentence = list(sentence_list) """ A list of sentences provided by the user. """ self.length = len(sentence_list) """ The number of sentences given for fusion. """ #self.nb_words = nb_words """ The minimal number of words in the compression. """ self.resources = os.path.dirname(__file__)+"/../"+"resources/" #self.resources = os.path.dirname(__file__) + '/resources/' """ The path of the resources folder. """ self.stopword_path = self.resources+'stopwords.'+lang+'.dat' """ The path of the stopword list, e.g. stopwords.[lang].dat. """ self.stopwords = self.load_stopwords(self.stopword_path) """ The set of stopwords loaded from stopwords.[lang].dat. """ self.punct_tag = punct_tag """ The stopword tag used in the graph. """ self.graph = nx.DiGraph() """ The directed graph used for fusion. """ self.start = '-start-' """ The start token in the graph. """ self.stop = '-end-' """ The end token in the graph. """ self.sep = '/-/' """ The separator used between a word and its POS in the graph. """ self.term_freq = {} """ The frequency of a given term. """ self.verbs = set(['VB', 'VBD', 'VBP', 'VBZ', 'VH', 'VHD', 'VHP', 'VBZ', 'VV', 'VVD', 'VVP', 'VVZ']) """ The list of verb POS tags required in the compression. At least *one* verb must occur in the candidate compressions. """ # Replacing default values for French if lang == "fr": self.verbs = set(['V', 'VPP', 'VINF']) # 1. Pre-process the sentences self.pre_process_sentences() # 2. Compute term statistics self.compute_statistics() # 3. Build the word graph self.build_graph() #-B-----------------------------------------------------------------------B- #-T-----------------------------------------------------------------------T- def pre_process_sentences(self): """ Pre-process the list of sentences given as input. Split sentences using whitespaces and convert each sentence to a list of (word, POS) tuples. """ for i in range(self.length): # Normalise extra white spaces self.sentence[i] = re.sub(' +', ' ', self.sentence[i]) # Tokenize the current sentence in word/POS sentence = self.sentence[i].split(' ') #print sentence # Creating an empty container for the cleaned up sentence container = [(self.start, self.start)] #print "======", sentence,'===========' # Looping over the words j=0 p=0 for w in sentence: w=w.strip() #w=w.lstrip("#") p=p+1 # Splitting word, POS if w.startswith("/"): continue if w.startswith("@"): continue if w.startswith("http"): continue if w.startswith(".") or w.startswith("\?"): continue if (w.startswith(":")) and j == 0: continue #if w.startswith(":") and j == len(sentence) - 1: if w[0] in ':-\?' and p == (len(sentence)-1): continue j+=1 #if w.startswith("#"): # w=w[1:] m = re.match("^(.+)/(.+)$", w) # Extract the word information token, POS = m.group(1), m.group(2) if (POS.strip() == ",") and p == 1: continue #print token, POS if "RT" in token: continue token=token.lstrip("#") # Add the token/POS to the sentence container if (sentence.index(w)==0 and (POS[0] not in ('^')) and w!="I"): container.append((token.lower(), POS)) else: container.append((token, POS)) # Add the stop token at the end of the container container.append((self.stop, self.stop)) # Recopy the container into the current sentence self.sentence[i] = container #-B-----------------------------------------------------------------------B- def build_graph(self): """ Constructs a directed word graph from the list of input sentences. Each sentence is iteratively added to the directed graph according to the following algorithm: - Word mapping/creation is done in four steps: 1. non-stopwords for which no candidate exists in the graph or for which an unambiguous mapping is possible or which occur more than once in the sentence 2. non-stopwords for which there are either several possible candidates in the graph 3. stopwords 4. punctuation marks For the last three groups of words where mapping is ambiguous we check the immediate context (the preceding and following words in the sentence and the neighboring nodes in the graph) and select the candidate which has larger overlap in the context, or the one with a greater frequency (i.e. the one which has more words mapped onto it). Stopwords are mapped only if there is some overlap in non-stopwords neighbors, otherwise a new node is created. Punctuation marks are mapped only if the preceding and following words in the sentence and the neighboring nodes are the same. - Edges are then computed and added between mapped words. Each node in the graph is represented as a tuple ('word/POS', id) and possesses an info list containing (sentence_id, position_in_sentence) tuples. """ # Iteratively add each sentence in the graph --------------------------- for i in range(self.length): # Compute the sentence length sentence_len = len(self.sentence[i]) # Create the mapping container mapping = [0] * sentence_len #------------------------------------------------------------------- # 1. non-stopwords for which no candidate exists in the graph or for # which an unambiguous mapping is possible or which occur more # than once in the sentence. #------------------------------------------------------------------- for j in range(sentence_len): # Get the word and tag token, POS = self.sentence[i][j] # If stopword or punctuation mark, continues if token in self.stopwords or re.search('(?u)^\W$', token): continue # Create the node identifier node = token + self.sep + POS # Find the number of ambiguous nodes in the graph k = self.ambiguous_nodes(node) # If there is no node in the graph, create one with id = 0 if k == 0: # Add the node in the graph self.graph.add_node( (node, 0), info=[(i, j)], label=token ) # Mark the word as mapped to k mapping[j] = (node, 0) # If there is only one matching node in the graph (id is 0) elif k == 1: # Get the sentences id of this node ids = [] for sid, pos_s in self.graph.node[(node, 0)]['info']: ids.append(sid) # Update the node in the graph if not same sentence if not i in ids: self.graph.node[(node, 0)]['info'].append((i, j)) mapping[j] = (node, 0) # Else Create new node for redundant word else: self.graph.add_node( (node, 1), info=[(i, j)], label=token ) mapping[j] = (node, 1) #------------------------------------------------------------------- # 2. non-stopwords for which there are either several possible # candidates in the graph. #------------------------------------------------------------------- for j in range(sentence_len): # Get the word and tag token, POS = self.sentence[i][j] # If stopword or punctuation mark, continues if token in self.stopwords or re.search('(?u)^\W$', token): continue # If word is not already mapped to a node if mapping[j] == 0: # Create the node identifier node = token + self.sep + POS # Create the neighboring nodes identifiers prev_token, prev_POS = self.sentence[i][j-1] next_token, next_POS = self.sentence[i][j+1] prev_node = prev_token + self.sep + prev_POS next_node = next_token + self.sep + next_POS # Find the number of ambiguous nodes in the graph k = self.ambiguous_nodes(node) # Search for the ambiguous node with the larger overlap in # context or the greater frequency. ambinode_overlap = [] ambinode_frequency = [] # For each ambiguous node for l in range(k): # Get the immediate context words of the nodes l_context = self.get_directed_context(node, l, 'left') r_context = self.get_directed_context(node, l, 'right') # Compute the (directed) context sum val = l_context.count(prev_node) val += r_context.count(next_node) # Add the count of the overlapping words ambinode_overlap.append(val) # Add the frequency of the ambiguous node ambinode_frequency.append( len( self.graph.node[(node, l)]['info'] ) ) # Search for the best candidate while avoiding a loop found = False selected = 0 while not found: # Select the ambiguous node selected = self.max_index(ambinode_overlap) if ambinode_overlap[selected] == 0: selected = self.max_index(ambinode_frequency) # Get the sentences id of this node ids = [] for sid, p in self.graph.node[(node, selected)]['info']: ids.append(sid) # Test if there is no loop if i not in ids: found = True break # Remove the candidate from the lists else: del ambinode_overlap[selected] del ambinode_frequency[selected] # Avoid endless loops if len(ambinode_overlap) == 0: break # Update the node in the graph if not same sentence if found: self.graph.node[(node, selected)]['info'].append((i, j)) mapping[j] = (node, selected) # Else create new node for redundant word else: self.graph.add_node( (node, k), info=[(i, j)], label=token ) mapping[j] = (node, k) #------------------------------------------------------------------- # 3. map the stopwords to the nodes #------------------------------------------------------------------- for j in range(sentence_len): # Get the word and tag token, POS = self.sentence[i][j] # If *NOT* stopword, continues if not token in self.stopwords : continue # Create the node identifier node = token + self.sep + POS # Find the number of ambiguous nodes in the graph k = self.ambiguous_nodes(node) # If there is no node in the graph, create one with id = 0 if k == 0: # Add the node in the graph self.graph.add_node( (node, 0), info=[(i, j)], label=token ) # Mark the word as mapped to k mapping[j] = (node, 0) # Else find the node with overlap in context or create one else: # Create the neighboring nodes identifiers prev_token, prev_POS = self.sentence[i][j-1] next_token, next_POS = self.sentence[i][j+1] prev_node = prev_token + self.sep + prev_POS next_node = next_token + self.sep + next_POS ambinode_overlap = [] # For each ambiguous node for l in range(k): # Get the immediate context words of the nodes, the # boolean indicates to consider only non stopwords l_context = self.get_directed_context(node, l, 'left',\ True) r_context = self.get_directed_context(node, l, 'right',\ True) # Compute the (directed) context sum val = l_context.count(prev_node) val += r_context.count(next_node) # Add the count of the overlapping words ambinode_overlap.append(val) # Get best overlap candidate selected = self.max_index(ambinode_overlap) # Get the sentences id of the best candidate node ids = [] for sid, pos_s in self.graph.node[(node, selected)]['info']: ids.append(sid) # Update the node in the graph if not same sentence and # there is at least one overlap in context if i not in ids and ambinode_overlap[selected] > 0: # if i not in ids and \ # (ambinode_overlap[selected] > 1 and POS==self.punct_tag) or\ # (ambinode_overlap[selected] > 0 and POS!=self.punct_tag) : # Update the node in the graph self.graph.node[(node, selected)]['info'].append((i, j)) # Mark the word as mapped to k mapping[j] = (node, selected) # Else create a new node else: # Add the node in the graph self.graph.add_node( (node, k) , info=[(i, j)], label=token ) # Mark the word as mapped to k mapping[j] = (node, k) #------------------------------------------------------------------- # 4. lasty map the punctuation marks to the nodes #------------------------------------------------------------------- for j in range(sentence_len): # Get the word and tag token, POS = self.sentence[i][j] # If *NOT* punctuation mark, continues if not re.search('(?u)^\W$', token): continue # Create the node identifier node = token + self.sep + POS # Find the number of ambiguous nodes in the graph k = self.ambiguous_nodes(node) # If there is no node in the graph, create one with id = 0 if k == 0: # Add the node in the graph self.graph.add_node( (node, 0), info=[(i, j)], label=token ) # Mark the word as mapped to k mapping[j] = (node, 0) # Else find the node with overlap in context or create one else: # Create the neighboring nodes identifiers prev_token, prev_POS = self.sentence[i][j-1] next_token, next_POS = self.sentence[i][j+1] prev_node = prev_token + self.sep + prev_POS next_node = next_token + self.sep + next_POS ambinode_overlap = [] # For each ambiguous node for l in range(k): # Get the immediate context words of the nodes l_context = self.get_directed_context(node, l, 'left') r_context = self.get_directed_context(node, l, 'right') # Compute the (directed) context sum val = l_context.count(prev_node) val += r_context.count(next_node) # Add the count of the overlapping words ambinode_overlap.append(val) # Get best overlap candidate selected = self.max_index(ambinode_overlap) # Get the sentences id of the best candidate node ids = [] for sid, pos_s in self.graph.node[(node, selected)]['info']: ids.append(sid) # Update the node in the graph if not same sentence and # there is at least one overlap in context if i not in ids and ambinode_overlap[selected] > 1: # Update the node in the graph self.graph.node[(node, selected)]['info'].append((i, j)) # Mark the word as mapped to k mapping[j] = (node, selected) # Else create a new node else: # Add the node in the graph self.graph.add_node( (node, k), info=[(i, j)], label=token ) # Mark the word as mapped to k mapping[j] = (node, k) #------------------------------------------------------------------- # 4. Connects the mapped words with directed edges #------------------------------------------------------------------- for j in range(1, len(mapping)): #print 'Mapping j-1', mapping[j-1], self.graph.node[mapping[j-1]]['info'] self.graph.add_edge(mapping[j-1], mapping[j], marker=1) # Assigns a weight to each node in the graph --------------------------- for node1, node2 in self.graph.edges_iter(): edge_weight = self.get_edge_weight(node1, node2) self.graph.add_edge(node1, node2, weight=edge_weight) #-B-----------------------------------------------------------------------B- #-T-----------------------------------------------------------------------T- def ambiguous_nodes(self, node): """ Takes a node in parameter and returns the number of possible candidate (ambiguous) nodes in the graph. """ k = 0 #if node == "," + "/" + "," : # return k while(self.graph.has_node((node, k))): k += 1 return k #-B-----------------------------------------------------------------------B- #-T-----------------------------------------------------------------------T- def get_directed_context(self, node, k, dir='all', non_pos=False): """ Returns the directed context of a given node, i.e. a list of word/POS of the left or right neighboring nodes in the graph. The function takes four parameters : - node is the word/POS tuple - k is the node identifier used when multiple nodes refer to the same word/POS (e.g. k=0 for (the/DET, 0), k=1 for (the/DET, 1), etc.) - dir is the parameter that controls the directed context calculation, it can be set to left, right or all (default) - non_pos is a boolean allowing to remove stopwords from the context (default is false) """ # Define the context containers l_context = [] r_context = [] # For all the sentence/position tuples for sid, off in self.graph.node[(node, k)]['info']: prev = self.sentence[sid][off-1][0] + self.sep +\ self.sentence[sid][off-1][1] next = self.sentence[sid][off+1][0] + self.sep +\ self.sentence[sid][off+1][1] if non_pos: if self.sentence[sid][off-1][0] not in self.stopwords: l_context.append(prev) if self.sentence[sid][off+1][0] not in self.stopwords: r_context.append(next) else: l_context.append(prev) r_context.append(next) # Returns the left (previous) context if dir == 'left': return l_context # Returns the right (next) context elif dir == 'right': return r_context # Returns the whole context else: l_context.extend(r_context) return l_context #-B-----------------------------------------------------------------------B- #-B-----------------------------------------------------------------------B- #-T-----------------------------------------------------------------------T- def get_edge_weight(self, node1, node2): """ Compute the weight of an edge *e* between nodes *node1* and *node2*. It is computed as e_ij = (A / B) / C with: - A = freq(i) + freq(j), - B = Sum (s in S) 1 / diff(s, i, j) - C = freq(i) * freq(j) A node is a tuple of ('word/POS', unique_id). """ # Get the list of (sentence_id, pos_in_sentence) for node1 info1 = self.graph.node[node1]['info'] # Get the list of (sentence_id, pos_in_sentence) for node2 info2 = self.graph.node[node2]['info'] # Get the frequency of node1 in the graph # freq1 = self.graph.degree(node1) freq1 = len(info1) # Get the frequency of node2 in cluster # freq2 = self.graph.degree(node2) freq2 = len(info2) # Initializing the diff function list container diff = [] # For each sentence of the cluster (for s in S) for s in range(self.length): # Compute diff(s, i, j) which is calculated as # pos(s, i) - pos(s, j) if pos(s, i) < pos(s, j) # O otherwise # Get the positions of i and j in s, named pos(s, i) and pos(s, j) # As a word can appear at multiple positions in a sentence, a list # of positions is used pos_i_in_s = [] pos_j_in_s = [] # For each (sentence_id, pos_in_sentence) of node1 for sentence_id, pos_in_sentence in info1: # If the sentence_id is s if sentence_id == s: # Add the position in s pos_i_in_s.append(pos_in_sentence) # For each (sentence_id, pos_in_sentence) of node2 for sentence_id, pos_in_sentence in info2: # If the sentence_id is s if sentence_id == s: # Add the position in s pos_j_in_s.append(pos_in_sentence) # Container for all the diff(s, i, j) for i and j all_diff_pos_i_j = [] # Loop over all the i, j couples for x in range(len(pos_i_in_s)): for y in range(len(pos_j_in_s)): diff_i_j = pos_i_in_s[x] - pos_j_in_s[y] # Test if word i appears *BEFORE* word j in s if diff_i_j < 0: all_diff_pos_i_j.append(-1.0*diff_i_j) # Add the mininum distance to diff (i.e. in case of multiple # occurrencies of i or/and j in sentence s), 0 otherwise. if len(all_diff_pos_i_j) > 0: diff.append(1.0/min(all_diff_pos_i_j)) else: diff.append(0.0) weight1 = freq1 weight2 = freq2 return ( (freq1 + freq2) / sum(diff) ) / (weight1 * weight2) #-B-----------------------------------------------------------------------B- #-T-----------------------------------------------------------------------T- #-B-----------------------------------------------------------------------B- #-T-----------------------------------------------------------------------T- def max_index(self, l): """ Returns the index of the maximum value of a given list. """ ll = len(l) if ll < 0: return None elif ll == 1: return 0 max_val = l[0] max_ind = 0 for z in range(1, ll): if l[z] > max_val: max_val = l[z] max_ind = z return max_ind #-B-----------------------------------------------------------------------B- #-T-----------------------------------------------------------------------T- def compute_statistics(self): """ This function iterates over the cluster's sentences and computes the following statistics about each word: - term frequency (self.term_freq) """ # Structure for containing the list of sentences in which a term occurs terms = {} # Loop over the sentences for i in range(self.length): #print self.sentence[i] # For each tuple (token, POS) of sentence i for token, POS in self.sentence[i]: # generate the word/POS token #node = token + self.sep + POS node = token + self.sep + POS # Add the token to the terms list if not terms.has_key(node): terms[node] = [i] else: terms[node].append(i) # Loop over the terms for w in terms: # Compute the term frequency self.term_freq[w] = len(terms[w]) #-B-----------------------------------------------------------------------B- #-T-----------------------------------------------------------------------T- def load_stopwords(self, path): """ This function loads a stopword list from the *path* file and returns a set of words. Lines begining by '#' are ignored. """ # Set of stopwords stopwords = set([]) # For each line in the file for line in codecs.open(path, 'r', 'utf-8'): if not re.search('^#', line) and len(line.strip()) > 0: stopwords.add(line.strip()) # Return the set of stopwords return stopwords #-B-----------------------------------------------------------------------B- #-T-----------------------------------------------------------------------T- def write_dot(self, dotfile): """ Outputs the word graph in dot format in the specified file. """ nx.write_dot(self.graph, dotfile) #-B-----------------------------------------------------------------------B- def normalizedWords(self, path): """ This function loads a stopword list from the *path* file and returns a set of words. Lines begining by '#' are ignored. """ # Set of stopwords words = dict() # For each line in the file for line in codecs.open(path, 'r', 'utf-8'): ww=line.split("==>") words[ww[0].strip()]=ww[1].strip() # Return the set of stopwords return words #~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ # ] Ending word_graph class #~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ #-B-----------------------------------------------------------------------B- #-T-----------------------------------------------------------------------T- def wordpos_to_tuple(self, word, delim='/'): """ This function converts a word/POS to a (word, POS) tuple. The character used for separating word and POS can be specified (default is /). """ # Splitting word, POS using regex m = re.match("^(.+)"+delim+"(.+)$", word) # Extract the word information token, POS = m.group(1), m.group(2) # Return the tuple return (token, POS) #-B-----------------------------------------------------------------------B- #-T-----------------------------------------------------------------------T- def tuple_to_wordpos(self, wordpos_tuple, delim='/'): """ This function converts a (word, POS) tuple to word/POS. The character used for separating word and POS can be specified (default is /). """ # Return the word +delim+ POS return wordpos_tuple[0]+delim+wordpos_tuple[1] #-B-----------------------------------------------------------------------B- def sentenceTuple(sentence): modSentence = [] position=0 for w,t in sentence: if t in ('NNP', 'NNPS','^'): modSentence.append((w.title(),t)) else: modSentence.append((w,t)) position +=1 #print modSentence return modSentence def load_stopwords(path): """ This function loads a stopword list from the *path* file and returns a set of words. Lines begining by '#' are ignored. """ # Set of stopwords stopwords = set([]) # For each line in the file for line in codecs.open(path, 'r', 'utf-8'): if not re.search('^#', line) and len(line.strip()) > 0: stopwords.add(line.strip()) # Return the set of stopwords return stopwords #-B------------------- def getVertex(graph, string): for vertex in graph.vs: if string in vertex['name']: return vertex def find_all_paths_igraph(graph, start, end, path=[]): path = path + [start] if start == end: return [path] paths = [] for node in set(graph.neighbors(start)) - set(path): paths.extend(find_all_paths_igraph(graph, node, end, path)) return paths def findPaths2(G,u,n,excludeSet = None): if excludeSet == None: excludeSet = set([u]) else: excludeSet.add(u) if n==0: return [[u]] paths = [[u]+path for neighbor in G.neighbors(u) if neighbor not in excludeSet for path in findPaths2(G,neighbor,n-1,excludeSet)] excludeSet.remove(u) return paths def find_all_paths_nx(graph, start, end): path = [] paths = [] queue = [(start, end, path)] while queue: start, end, path = queue.pop() #print 'PATH', path path = path + [start] if start == end and len(path)>= 12 and len(path)<= 15: paths.append(path) if len(paths)>50000: return paths for node in set(graph[start]).difference(path): queue.append((node, end, path)) return paths def find_all_paths_igraph_adj(graph, start, end): def find_all_paths_aux(adjlist, start, end, path): path = path + [start] if start == end and (len(path) > 10 and len(path) < 25): return [path] paths = [] for node in adjlist[start] - set(path): paths.extend(find_all_paths_aux(adjlist, node, end, path)) if len(paths) >=10000: return paths return paths adjlist = [set(graph.neighbors(node)) for node in xrange(graph.vcount())] return find_all_paths_aux(adjlist, start, end, []) def find_all_paths_tamas(graph, start, end): def find_all_paths_aux_tamas(adjlist, start, end, path): path = path + [start] if start == end: return [path] paths = [] #if len(path) < 22:# and len(path) < 25: for node in adjlist[start] - set(path): paths.extend(find_all_paths_aux_tamas(adjlist, node, end, path)) #if len(paths) > 20000: # return paths #if len(paths) > 10000: # return paths return paths adjlist = [set(graph.neighbors(node)) for node in xrange(graph.vcount())] return find_all_paths_aux_tamas(adjlist, start, end, []) def adjlist_find_paths(a, n, m, path=[]): "Find paths from node index n to m using adjacency list a." path = path + [n] if n == m: return [path] paths = [] #successors=a.successors(n) #shuffle(successors) for child in a[n]: if child not in path: child_paths = adjlist_find_paths(a, child, m, path) for child_path in child_paths: if len(child_path)<12: #<=25: continue if len(child_path)>25: #<=25: continue #print "Appending path" paths.append(child_path) #print len(paths) if (len(paths)>=1000): return paths return paths def paths_from_to_old(graph, source, dest): "Find paths in graph from vertex source to vertex dest." a = graph.get_adjlist() n = source.index m = dest.index return adjlist_find_paths(a, n, m) #import shortestPath as sp def paths_from_to(graph, source, dest): "Find paths in graph from vertex source to vertex dest." a = graph.get_adjlist() n = source.index m = dest.index #return graph.get_shortest_paths(source, dest) #return sp.yen_igraph(graph, source, dest, 1000) #return graph.get_shortest_paths(n, to=m) #r#eturn yenksp.algorithms.ksp_yen(graph, source, dest, max_k=10000) #return sp.yen_igraph(graph, source, dest, 2000, weights=None) return adjlist_find_paths(a, n, m) def getWordFromVertexName(nameString): word=nameString.split('/')[0] if word not in ['-start-','-end-']: return word else: return '' def generateTempRewrittenSentences(taggedSentences): final_tagged_Sentences=[] for ensent in taggedSentences: tagged_sentence='' for w, t in ensent: tagged_sentence=tagged_sentence+w+'/'+t+' ' final_tagged_Sentences.append(tagged_sentence.strip()) return final_tagged_Sentences def retrieveNewSentences(sentences, stopwords, mode=None): taggedTweets=list(sentences) shuffle(taggedTweets) simMatrix=simCalcMatrix(taggedTweets) compresser = word_graph(taggedTweets, lang = 'en', punct_tag = "PUNCT" ) print "Number of nodes", nx.number_of_nodes(compresser.graph) #candidates = compresser.get_compression(100) #print compresser.graph.nodes(data='True') ''' #NOT USING THIS NOW: THIS is for IGRAPH matrix=nx.to_scipy_sparse_matrix(compresser.graph) nodelist=compresser.graph.nodes() g = scipy_to_igraph(matrix,nodelist) startvertex = getVertex(g, '-start-/-/-start-') endvertex = getVertex(g, '-end-/-/-end-') vertexList = g.vs() startvertex=startvertex.index endvertex=endvertex.index #elements=g.get_all_shortest_paths(startvertex, endvertex)[0] print "WG done" #allpaths=elements allpaths = find_all_paths_igraph_adj(g, startvertex, endvertex) ''' #nlist= nx.nodes(compresser.graph) #for n in nlist: # print 'Info', compresser.graph.node[n]['info'] g=nx.convert_node_labels_to_integers(compresser.graph) nodelist=g.nodes(data=True) for node in nodelist: n, r = node #print 'Labels--', r['label'] if r['label']=='-start-': startnode=n if r['label']=='-end-': endnode=n label_list=[] ''' Labellist contains info and label : info has info on which sentences resulted in that node ''' for node in nodelist: #print node r, d = node if d['label'] not in ['-start-','-end-']: label_list.append((d['info'],d['label'])) else: label_list.append(('','')) #if mode in 'EXTRACT': cutoff_threshold=18 import timeit starttime=timeit.default_timer() allpaths=[] for path in nx.all_simple_paths(g, startnode, endnode, cutoff=cutoff_threshold): allpaths.append(path) if len(allpaths) >=100000: break print "Total time for getting all paths in seconds: ", (timeit.default_timer()-starttime), " s" print "Total paths, ", len(allpaths) shuffle(allpaths) #allpaths=allpaths[0:10000] generatedSentences = [] sentence_container = {} for path in allpaths: #print 'Path', path if len(path)<12: continue paired_parentheses = 0 quotation_mark_number = 0 sentence='' i=0 total_in_path=len(path) #print total_in_path for element in path: label= label_list[element][1] #print label, i if i == (total_in_path - 2): sentence = sentence + ' ' + label.split("||")[0]+" "+ label.split("||")[1] else: sentence = sentence + ' ' + label_list[element][1].split("||")[0] i+=1 avgSim = avgPairwiseSimilarity(simMatrix, getSentIndices(path, label_list)) for word in sentence.split(): #print word if word == '(': paired_parentheses -= 1 elif word == ')': paired_parentheses += 1 elif (word == '\"' or word == '\'\'' or word == '``' or word == '"'): quotation_mark_number += 1 if (paired_parentheses == 0 and quotation_mark_number%2 == 0 and not sentence_container.has_key(sentence.strip())): generatedSentences.append((sentence.strip(), avgSim)) sentence_container[sentence.strip()]=1 shuffle(generatedSentences) #if mode in 'EXTRACT': generatedSentences= sorted(generatedSentences, key=lambda tup: tup[1], reverse=True) #else: generatedSentences=generatedSentences[0:300] print "Num variables -->", len(generatedSentences) svolist=[] generatedSentences=removeSimilarSentences(generatedSentences, sentences, stopwords) return generatedSentences, svolist # print generatedSentences def avgPairwiseSimilarity(simMatrix, indices): num_elements=len(indices) sum_sim=0.0 num=0.0 if num_elements==1: return 0.00001 for i in xrange(0, num_elements-1): for j in xrange(i+1, num_elements): sum_sim+=simMatrix[i,j] num+=1 #print num return (sum_sim/num) def getSentIndices(path, labellist): sentenceSet=set() path_length=len(path) for i in xrange(1,path_length): keys1=labellist[path[i-1]][0] sentindices_1=[sentnum for sentnum, wordnum in keys1] keys2=labellist[path[i]][0] sentindices_2=[sentnum for sentnum, wordnum in keys2] sentence_indices=set(sentindices_1).intersection(set(sentindices_2)) for ind in sentence_indices: sentenceSet.add(ind) return sentenceSet def removeSimilarSentences(generatedSentences, originalSentences, stopwords,threshold=0.80,): docs=[] for sent, sim in generatedSentences: docs.append(sent) docs.extend(originalSentences) bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs) normalized = TfidfTransformer().fit_transform(bow_matrix) #simMatrix = (normalized[0:] * normalized[0:].T).A simindices=[] #print 'Num original, ', len(originalSentences) for i in xrange(len(generatedSentences)): simGeneratedScores = linear_kernel(normalized[i], normalized[len(generatedSentences):]).flatten() if(max(simGeneratedScores) >= threshold): simindices.append(i) #print simindices finalGen=[sentence for k,sentence in enumerate(generatedSentences) if k not in simindices] #print len(generatedSentences), len(finalGen) return finalGen def getSVO(sentence): sbjstring=' ' objstring=' ' vpstring=' ' s = parsetree(sentence, tokenize = True, relations=True, lemmata = False) relationList=s.sentences[0].relations if 'SBJ' in relationList: for chunk in relationList['SBJ'].values(): #print chunk.words sbjstring =sbjstring+' '+ ' '.join(word.string for word in chunk.words) if 'OBJ' in relationList: for chunk in relationList['OBJ'].values(): #print chunk.words objstring =objstring+' '+ ' '.join(word.string for word in chunk.words) if 'VP' in relationList: for chunk in relationList['VP'].values(): #print chunk.words vpstring =vpstring+' '+ ' '.join(word.string for word in chunk.words) #print sbjstring.strip() #print objstring.strip() #print vpstring.strip() return sbjstring.strip(), vpstring.strip(), objstring.strip() def simCalcMatrix(docs): tfidf_vectorizer = TfidfVectorizer(min_df=0, stop_words=None) tfidf_matrix_train = tfidf_vectorizer.fit_transform(docs) #finds the tfidf score with normalization cosineSimilarities=cosine_similarity(tfidf_matrix_train, tfidf_matrix_train) return cosineSimilarities def generateSimMatrix(phraseList): #print 'Num elements', len(phraseList), phraseList all_elements=[] #for elementlist in phraseList: for element in phraseList: if len(element.strip())==0: all_elements.append(' ') else: all_elements.append(element.strip()) tfidf_vectorizer = TfidfVectorizer(min_df=0, stop_words=None) tfidf_matrix_train = tfidf_vectorizer.fit_transform(all_elements) #finds the tfidf score with normalization cosineSimilarities=cosine_similarity(tfidf_matrix_train, tfidf_matrix_train) return cosineSimilarities from nltk.tokenize import WordPunctTokenizer def getredundantComponents(sentences): window_size=4 introList=[] midlist=[] endlist=[] for sent in sentences: words = WordPunctTokenizer().tokenize(sent) length_sent=len(words) f_point = (length_sent)//3 m_point=(length_sent)//2 index_span=window_size//2 intro=' '.join(word for word in words[0:window_size]) mid=' '.join(word for word in words[m_point-index_span:m_point+index_span]) end=' '.join(word for word in words[-window_size:]) introList.append(intro) midlist.append(mid) endlist.append(end) return introList, midlist, endlist def solveILPFactBased(groupedList, languageModel, stopwords,ranker, intraGenSimThreshold=0.5,l_max=100, mode="Extractive" ): if len(groupedList) == 0: return # Create a new model print "Starting to solve ILP...." m = LpProblem("mip1", LpMaximize) sbjthreshold=0.3 objthreshold=0.4 docs=[] intraSentRelatedNessScores=[] if mode == "Extractive": for element in groupedList: docs.append(element) else: for element, sim in groupedList: docs.append(element) intraSentRelatedNessScores.append(sim) ''' Full sentence cosine sim comparison ''' bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs) normalized = TfidfTransformer().fit_transform(bow_matrix) cosine_similarity_matrix = (normalized * normalized.T).A ''' splitting a sentence into three parts ''' ilist, mlist, elist=getredundantComponents(docs) bow_matrix_i = StemmedTfidfVectorizer(stop_words=None).fit_transform(ilist) normalized_i = TfidfTransformer().fit_transform(bow_matrix_i) cosine_similarity_matrix_i = (normalized_i * normalized_i.T).A bow_matrix_m = StemmedTfidfVectorizer(stop_words=None).fit_transform(mlist) normalized_m = TfidfTransformer().fit_transform(bow_matrix_m) cosine_similarity_matrix_m = (normalized_m * normalized_m.T).A #elist=factContent(elist, contentWordScores) bow_matrix_e = StemmedTfidfVectorizer(stop_words=None).fit_transform(elist) normalized_e = TfidfTransformer().fit_transform(bow_matrix_e) cosine_similarity_matrix_e = (normalized_e * normalized_e.T).A txtRankScores=[] if ranker == "Centroid": txtRankScores=summarizer.getScores(docs) if ranker == "textrank": sources, targets = cosine_similarity_matrix.nonzero() similarity_igraph = igraph.Graph(zip(sources, targets), directed=True) txtRankScores = igraph.Graph.pagerank(similarity_igraph) lingQualityScores=[] if mode=="Abstractive": for sent in docs: normalizer=len(sent.split(" ")) lm_score=(1/normalizer)*languageModel.score(sent) lm_score=1/(1-lm_score) lingQualityScores.append(lm_score) varlist=[] for i in xrange(len(docs)): var=LpVariable("var_"+str(i),cat=LpBinary) varlist.append(var) if mode=="Abstractive": m += lpSum([txtRankScores[i]*lingQualityScores[i]*varlist[i] for i in xrange(len(txtRankScores))]), "Obj function" else: m += lpSum([txtRankScores[i]*varlist[i] for i in xrange(len(txtRankScores))]), "Obj function" visitedlist=[] for i in xrange(len(varlist)): i_indices=np.where(cosine_similarity_matrix_i[i,:]>= objthreshold)[0] m_indices=np.where(cosine_similarity_matrix_m[i,:]>= objthreshold)[0] e_indices=np.where(cosine_similarity_matrix_e[i,:]>= objthreshold)[0] s_indices=np.where(cosine_similarity_matrix[i,:]>= sbjthreshold)[0] all_indices=np.concatenate((i_indices, m_indices)) all_indices=np.concatenate((all_indices, e_indices)) all_indices=np.unique(all_indices) all_indices=np.concatenate((all_indices, s_indices)) all_indices=np.unique(all_indices).tolist() for j in all_indices: if i==j: continue if j==len(varlist): continue #print j if (i, j) not in visitedlist: visitedlist.append((i,j)) m+=varlist[i] + varlist[j] <=1.0, "constraint_facts_svo_"+str(i)+"_"+varlist[i].name+"_"+varlist[j].name completelist=[] completelist.extend(mlist) completelist.append(ilist[i]) lastelement=len(completelist)-1 bow_matrix_ilist = StemmedTfidfVectorizer(stop_words=None).fit_transform(completelist) normalized_ilist = TfidfTransformer().fit_transform(bow_matrix_ilist) cosine_similarity_matrix_ilist = (normalized_ilist * normalized_ilist.T).A r_indices=np.where(cosine_similarity_matrix_ilist[lastelement,:]>= objthreshold)[0] #print len(completelist) #print "R_indices", r_indices oth_indices=r_indices completelist=[] completelist.extend(elist) completelist.append(ilist[i]) bow_matrix_ilist = StemmedTfidfVectorizer(stop_words=None).fit_transform(completelist) normalized_ilist = TfidfTransformer().fit_transform(bow_matrix_ilist) cosine_similarity_matrix_ilist = (normalized_ilist * normalized_ilist.T).A r_indices=np.where(cosine_similarity_matrix_ilist[lastelement,:]>= objthreshold)[0] #print "P_indices", p_indices oth_indices=np.concatenate((oth_indices, r_indices)) completelist=[] completelist.extend(mlist) completelist.append(elist[i]) bow_matrix_ilist = StemmedTfidfVectorizer(stop_words=None).fit_transform(completelist) normalized_ilist = TfidfTransformer().fit_transform(bow_matrix_ilist) cosine_similarity_matrix_ilist = (normalized_ilist * normalized_ilist.T).A r_indices=np.where(cosine_similarity_matrix_ilist[lastelement,:]>= objthreshold)[0] #print "Q_indices", q_indices oth_indices=np.concatenate((oth_indices, r_indices)) oth_indices=np.unique(oth_indices).tolist() for j in oth_indices: if i==j: continue if j==len(varlist): continue #print j if (i, j) not in visitedlist: visitedlist.append((i,j)) m+=varlist[i] + varlist[j] <=1.0, "constraint_facts_comps_"+str(i)+"_"+varlist[i].name+"_"+varlist[j].name gen_lengths=[] for i in xrange(len(txtRankScores)): words=docs[i].split(" ") count=0 for word in words: if word[0].isalpha() or word[0].isdigit(): count+=1 gen_lengths.append(count) #print "Gen Lengths" , gen_lengths m += lpSum([varlist[i]*gen_lengths[i] for i in xrange(len(txtRankScores))]) <= l_max, "length of summary" m.solve() solutionList=[] for v in m.variables(): if v.varValue == 1.0: indexVar=v.name.split("_")[1] solutionList.append(docs[int(indexVar)]) return solutionList