""" Copyright 2018 IBM Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ from gensim.utils import simple_preprocess from gensim.models import Word2Vec from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import numpy as np class CustomParVec(): ''' Custom Paragraph Vector. Each paragraph(or sentence) is the sum of each of its word's Word2Vec vector representation scaled by that word's tf-idf. ''' def __init__(self, word_sentence_list, workers = 2, dimensions = 100, min_word_count = 2, context = 5, downsampling = 0, tfidf = True, pre_trained = None, supervised_docs = None): ''' Args: word_sentence_list (list[list[str]]) : List of lists of words that make up a paragraph(or sentence). workers (int) : Number of threads to run in parallel (Default = 2). dimensions (int) : Vector dimensionality (Default = 100). min_word_count (int) : Minimum word count (Default = 2). context (int) : Context window size (Default = 5). downsampling (int) : Downsample setting for frequent words (Default = 0). tfidf (bool) : Specify whether or not to use idf in scaling (Default = True). pre_trained (Word2Vec) : Use a pre-trained Word2Vec model (Default = None). supervised_docs (list[str]) : List of sentences from some "ground truth" (Default = None). ''' self.dimensions = dimensions # Set the number of dimension if not pre_trained: self.word2vec_model = Word2Vec(word_sentence_list, workers=workers, \ size=self.dimensions, min_count = min_word_count, \ window = context, sample = downsampling) self.word2vec_model.init_sims(replace=True) # used for memory efficiency else: self.word2vec_model = pre_trained self.sentences = [' '.join(words) for words in word_sentence_list] # Keep a list of the full sentences themselves. self.tf_idf_obj = TfidfVectorizer(use_idf = tfidf) # Create TfidfVectorizer object self.tf_idf_obj.fit(self.sentences) # Transform and fit tf-idf to all sentences(could be paragraphs) self.tf_idf = self.tf_idf_obj.transform(self.sentences) self.word_index = self.tf_idf_obj.get_feature_names() # Keep track of words by index for lookups if supervised_docs: self.extra_tf_idf_obj = TfidfVectorizer(use_idf = tfidf) # Create TfidfVectorizer object self.extra_tf_idf_obj.fit(supervised_docs) # Transform and fit tf-idf to all sentences(could be paragraphs) self.extra_tf_idf = self.extra_tf_idf_obj.transform(supervised_docs) self.extra_word_index = self.extra_tf_idf_obj.get_feature_names() # Keep track of words by index for lookups else: self.extra_tf_idf_obj = None def learnVectors(self): ''' Create a vector representation of every paragraph(or sentence) in the initial data provided. Yields: numpy.ndarray: Next numpy array representing the paragraph (or sentence). ''' rows, cols = self.tf_idf.nonzero() # Get the rows and column indices of non zero tf-idf values curr_line = 0 curr_vec = np.zeros(self.dimensions) for row, col in zip(rows, cols): if curr_line == row: # Check that the current word belongs to the same paragraph (or sentence). try: # Infer the vector of the current word by scaling the word's word2vec vector by its tf-idf value. # Add that inferred vector to the current vector representing the current paragraph. curr_vec += (self.word2vec_model[(self.word_index[col])] * self.tf_idf[row, col]) except: continue else: # If we are on the next paragraph, yield the current vector and reset it. yield(curr_vec) curr_line = row curr_vec = np.zeros(self.dimensions) try: curr_vec = self.word2vec_model[(self.word_index[col])] * self.tf_idf[row, col] except: continue def train(self): self.vectors = list(self.learnVectors()) def getMostSimilar(self, sentence, top_n = 10, threshold = 0.5, sentences = None, vectors = None): ''' Given a new sentence, find the closest top_n elements Args: sentence(string) : Text we want to find most similar to. top_n (int) : Total number of most similar tuples we want returned (Default value is 5). threshold (float) : Minimum Cosine Distance to be returned sentences (list[string]) : List of sentences to be compared to vectors[list[numpy nd array]] : Vector embedding of sentences Returns: list[(float, string)]: A list of (cosine similarity, sentence) tuples of size top_n closest to the input sentence. ''' inferred_vector = self.inferVector(sentence) if sentences and vectors: corpus = sentences vecs = vectors else: corpus = self.sentences vecs = self.vectors cos_similarities = np.ravel(cosine_similarity(inferred_vector.reshape(1,-1), vecs)) most_similar = np.argpartition(-cos_similarities, top_n)[:top_n] return [(cos_similarities[sentence_index], corpus[sentence_index]) for sentence_index in most_similar if cos_similarities[sentence_index] >= threshold] def inferVector(self, line): if self.extra_tf_idf_obj: return self.inferVector2(line) return self.inferVector1(line) def inferVector1(self, line): ''' Given a new line, infer a custom vector representation using the corpus tfidf. Args: line : new sentence to be inferred Returns: numpy.ndarray : vector representation of the line ''' line = ' '.join(simple_preprocess(line)) # pre-process the line line_tf_idf = self.tf_idf_obj.transform([line]) # infer the tf-idf values for the words in the line rows, cols = line_tf_idf.nonzero() new_vec = np.zeros(self.dimensions) # Apply the same sentence to vector conversion as above. for col in cols: try: new_vec += (self.word2vec_model[(self.word_index[col])] * line_tf_idf[0, col]) except: continue return np.asarray(new_vec) def inferVector2(self, line): ''' Given a new line, infer a custom vector representation using the ground truth tfidf. Args: line : new sentence to be inferred Returns: numpy.ndarray : vector representation of the line ''' line = ' '.join(simple_preprocess(line)) # pre-process the line replacement_words = [] for word in line.split(): if word not in self.extra_tf_idf_obj.vocabulary_: try: similar_words = self.word2vec_model.similar_by_word(word, topn=10, restrict_vocab=None) for sim in similar_words: if sim[0] in self.extra_tf_idf_obj.vocabulary_: replacement_words.append((word, sim[0])) break except: continue for old, new in replacement_words: line = line.replace(old, new) line_tf_idf = self.extra_tf_idf_obj.transform([line]) # infer the tf-idf values for the words in the line rows, cols = line_tf_idf.nonzero() new_vec = np.zeros(self.dimensions) # Apply the same sentence to vector conversion as above. for col in cols: try: new_vec += (self.word2vec_model[(self.extra_word_index[col])] * line_tf_idf[0, col]) except: continue return np.asarray(new_vec)