## Quora Question-pairs testing script

'''
Accepts two questions/sentences as command-line arguments and outputs a binary
(0/1 - no/yes) answer indicating whether the two questions are semantically the
same, i.e. duplicates, or not.
'''

import sys

q1 = sys.argv[1]
q2 = sys.argv[2]

# Importing standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re, nltk, gensim

## Importing required NLTK libraries
#nltk.download('stopwords')
#nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

## Importing required keras libraries
import keras
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import LSTM, Embedding, Input, Merge
from keras.optimizers import Adadelta
import keras.backend as K

#print("\nAll required libraries imported")

## Helper functions

# Pre-process and convert text to a list of words
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words
              (punctuation, question marks, tabs etc. removed)
    Input   : Takes a text corpus, 'corpus', to be cleaned, along with a list of
              words, 'keep_list', which have to be retained even after the
              cleaning process
    Output  : Returns the cleaned text corpus
    '''
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs_list = []
        for word in row.split():
            word = word.lower()
            word = re.sub(r"[^a-zA-Z0-9^.']", " ", word)
            word = re.sub(r"what's", "what is ", word)
            word = re.sub(r"\'ve", " have ", word)
            word = re.sub(r"can't", "cannot ", word)
            word = re.sub(r"n't", " not ", word)
            word = re.sub(r"i'm", "i am ", word)
            word = re.sub(r"\'re", " are ", word)
            word = re.sub(r"\'d", " would ", word)
            word = re.sub(r"\'ll", " will ", word)
            # If the word contains numbers with decimals, this will preserve it
            if bool(re.search(r'\d', word) and re.search(r'\.', word)) and word not in keep_list:
                keep_list.append(word)
            # Preserves certain frequently occurring dot words
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]', repl=' ', string=word)
                qs_list.append(p1)
            else:
                qs_list.append(word)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs_list)))
    return cleaned_corpus
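# Illustrative example (not executed; the input sentence is hypothetical): with
# keep_list = ['u.s.'], a call such as
#   text_clean(["What's the U.S. GDP in 2019?"], ['u.s.'])
# lower-cases each token, expands contractions ("what's" -> "what is"), strips
# punctuation, and retains whitelisted dot-words, returning roughly
# "what is the u.s. gdp in 2019" (modulo extra whitespace).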
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None,
               lemmatization = True, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming,
              lemmatization, stopwords removal etc.)
    Input   : 'corpus' - Text corpus on which the pre-processing tasks will be performed
              'keep_list' - List of words to be retained during the cleaning process
              'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean
              variables indicating whether a particular task should be performed or not
              'stem_type' - Choose between the Porter stemmer and the Snowball (Porter2)
              stemmer. The default is None, which corresponds to the Porter stemmer;
              'snowball' corresponds to the Snowball stemmer.
    Note    : Either stemming or lemmatization should be used; there is no benefit
              in using both of them together.
    Output  : Returns the processed text corpus
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)

    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else:
        corpus = [[x for x in x.split()] for x in corpus]

    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]

    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language = 'english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else:
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]

    return corpus


def exponent_neg_manhattan_distance(left, right):
    '''
    Purpose : Helper function for the similarity estimate of the LSTM outputs
    Inputs  : Two n-dimensional vectors
    Output  : Exponent of the negative Manhattan distance between the input
              vectors, i.e. a similarity score in (0, 1]
    '''
    return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True))

#print("\n Helper functions loaded")

# Based on the training set, a keep list of common dot words was prepared
common_dot_words = ['u.s.', 'b.tech', 'm.tech', 'st.', 'e.g.', 'rs.', 'vs.', 'mr.', 'dr.', 'u.s', 'i.e.', 'node.js']

qs = [q1, q2]
qs = preprocess(qs, keep_list = common_dot_words, remove_stopwords = False)

# Separating the processed questions
q1 = [qs[0]]
q2 = [qs[1]]

#print("\n Text pre-processing done")

# Loading pre-trained word vectors
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin'
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary = True)
w2v = dict(zip(word2vec_model.wv.index2word, word2vec_model.wv.syn0))

#print("\n Pre-trained word vectors loaded")

# Prepare word-to-index mapping
vocabulary = dict()
inverse_vocabulary = ['<unk>']  # '<unk>' will never be used; it is only a placeholder for the [0, 0, ..., 0] embedding

qs = pd.DataFrame({'q1': q1, 'q2': q2})
questions_cols = ['q1', 'q2']

# Iterate through the text of both questions
for index, row in qs.iterrows():
    for question in questions_cols:
        q2n = []  # q2n -> numerical vector representation of each question
        for word in row[question]:
            # Ignore stopwords that do not have a word2vec mapping
            if word in set(stopwords.words('english')) and word not in word2vec_model.vocab:
                continue
            if word not in vocabulary:
                vocabulary[word] = len(inverse_vocabulary)
                q2n.append(len(inverse_vocabulary))
                inverse_vocabulary.append(word)
            else:
                q2n.append(vocabulary[word])
        # Replace the question with its equivalent numerical vector of word indices
        qs.set_value(index, question, q2n)

# Prepare embedding layer
embedding_dim = 300
embeddings = np.random.randn(len(vocabulary) + 1, embedding_dim)  # Embedding matrix
embeddings[0] = 0  # This is to ignore the zero padding at the beginning of the sequence

# Build the embedding matrix
for word, index in vocabulary.items():
    if word in word2vec_model.vocab:
        embeddings[index] = w2v[word]

del word2vec_model, w2v

#print("\n Embedding matrix prepared")

# Feature space of the two questions
X_test = {'left': qs['q1'], 'right': qs['q2']}

## Truncating and padding sequences to a length of 50
max_seq_length = 50
X_test['left'] = sequence.pad_sequences(X_test['left'], maxlen = max_seq_length)
X_test['right'] = sequence.pad_sequences(X_test['right'], maxlen = max_seq_length)
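# Illustrative note (hypothetical indices): pad_sequences left-pads by default, so a
# question mapped to the index list [12, 7, 345] becomes a length-50 row
# [0, 0, ..., 0, 12, 7, 345]; the padding index 0 points at the all-zero row of the
# embedding matrix prepared above.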
# Checking shapes and sizes to ensure no errors occur
assert X_test['left'].shape == X_test['right'].shape

#print("\n Begin model building")

## Define model architecture
# Model variables
n_hidden = 30
batch_size = 64
n_epoch = 1

# The visible layer
left_input = Input(shape=(max_seq_length,), dtype='int32', name = 'input_1')
right_input = Input(shape=(max_seq_length,), dtype='int32', name = 'input_2')

embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings],
                            input_length=max_seq_length, trainable=False, name = 'embed_new')

# Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(n_hidden, activation = 'relu', name = 'lstm_1_2')

left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Calculates the distance as defined by the MaLSTM model
malstm_distance = Merge(mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
                        output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Combine all of the above in a Model
model = Model([left_input, right_input], [malstm_distance])

#print("\nModel built")

## Loading weights from a pre-trained model
model.load_weights("model30_relu_epoch_3.h5", by_name = True)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

#print("\n Weights loaded and compiled")

#print("\n Making prediction")

## Predict using the pre-trained model
pred = model.predict([X_test['left'], X_test['right']])

print("\n")
if pred > 0.5:
    print(1)
else:
    print(0)
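## Usage note
# Example invocation (the script filename is illustrative; substitute the actual file name):
#   python quora_test.py "How do I learn Python?" "What is the best way to learn Python?"
# The prediction is exp(-||h_left - h_right||_1), so printing 1 for pred > 0.5 is the
# same as requiring the Manhattan distance between the two LSTM encodings to be
# below ln(2) ~= 0.693.

## Compatibility note
# A minimal sketch, assuming a newer Keras release in which the legacy Merge layer
# has been removed: the MaLSTM distance above can instead be wired up with a Lambda
# layer, e.g.
#   from keras.layers import Lambda
#   malstm_distance = Lambda(lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
#                            output_shape=lambda shapes: (shapes[0][0], 1),
#                            name='malstm_distance')([left_output, right_output])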