"""Helpers for a question-pair similarity model.

Contains text tokenisation, word2vec-based embedding-matrix construction,
sequence padding, a Manhattan-distance Keras layer and a bootstrap
confidence-interval routine.
"""

import itertools
import pdb
import re

from tensorflow.python.keras import backend as K
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
import gensim
import numpy as np

# Default location / size of the pretrained word2vec vectors. Both can be
# overridden per call through ``make_w2v_embeddings``.
DEFAULT_W2V_PATH = "/scratchd/home/satwik/embeddings/GoogleNews-vectors-negative300.bin.gz"
DEFAULT_W2V_LIMIT = 80000

# Ordered (pattern, replacement) pairs applied by ``text_to_word_list``.
# Order matters: later rules operate on the output of earlier ones.
# Compiled once at import time instead of on every call.
_TEXT_SUBSTITUTIONS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # NOTE: "+-=" inside the class is a character *range* ('+' .. '='),
        # so ':', ';' and '<' also survive this first filter.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # NOTE(review): "\0" is an octal escape for NUL, so this matches
        # NUL followed by "s". NUL is already replaced by the first rule,
        # hence this rule never fires. Possibly "0s" was intended; kept
        # unchanged so tokenisation stays identical.
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
    )
)


def generate_confidence_intervals(y_test, preds, estimator, nsample=20000, conf=95.0):
    """Bootstrap a confidence interval for ``estimator(y_test, preds)``.

    The (label, prediction) pairs are resampled with replacement ``nsample``
    times; the interval is built from the percentiles of the differences
    between each resampled score and the score on the full data.

    Args:
        y_test: Sequence of reference values.
        preds: Sequence of predictions, same length as ``y_test``.
        estimator: Callable ``estimator(y_true, y_pred)`` returning a score.
        nsample: Number of bootstrap resamples.
        conf: Confidence level in percent (e.g. ``95.0``).

    Returns:
        Tuple ``(lower_score, upper_score)``.

    Raises:
        ValueError: If ``y_test`` and ``preds`` differ in length.
    """
    print('y_test.shape: {}, preds.shape: {}'.format(len(y_test), len(preds)))

    n = len(preds)
    if len(y_test) != n:
        raise ValueError(
            'y_test and preds must have the same length, got {} and {}'.format(len(y_test), n)
        )

    # Keep labels and predictions in separate arrays so that neither is
    # coerced to the other's dtype and non-scalar predictions still work.
    labels = np.asarray(y_test)
    predictions = np.asarray(preds)

    tail = (100 - conf) / 2.0
    lower_pct, upper_pct = tail, 100 - tail

    x_bar = estimator(y_test, preds)
    print('x_bar: {}'.format(x_bar))

    deltas = []
    for _ in range(nsample):
        # Same indices for both arrays keeps each pair together.
        idx = np.random.choice(n, n)
        deltas.append(estimator(labels[idx], predictions[idx]) - x_bar)
    deltas = np.array(deltas)

    lscore = x_bar + np.percentile(deltas, lower_pct)
    uscore = x_bar + np.percentile(deltas, upper_pct)
    return (lscore, uscore)


def text_to_word_list(text):
    """Lower-case ``text``, normalise it and split it into a list of words.

    ``text`` is converted with ``str()`` first, so non-string input (for
    example ``nan``) is tokenised by its string form. The substitutions in
    ``_TEXT_SUBSTITUTIONS`` are applied in order before splitting on
    whitespace.
    """
    text = str(text).lower()
    for pattern, replacement in _TEXT_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)
    return text.split()


def make_w2v_embeddings(df, embedding_dim=300, empty_w2v=False,
                        w2v_path=DEFAULT_W2V_PATH, w2v_limit=DEFAULT_W2V_LIMIT):
    """Index the words of both question columns and build an embedding matrix.

    For each row of ``df`` the columns ``question1`` and ``question2`` are
    tokenised, English stop words are dropped and every remaining word is
    mapped to an integer id (ids start at 1; 0 is reserved for padding).
    The id lists are written to ``question1_n`` / ``question2_n``.

    Args:
        df: DataFrame with ``question1`` and ``question2`` columns. It is
            modified in place.
        embedding_dim: Width of the embedding matrix.
        empty_w2v: If true, use ``EmptyWord2Vec`` instead of loading vectors,
            so every row of the matrix stays randomly initialised.
        w2v_path: Path of the binary word2vec file to load.
        w2v_limit: Maximum number of vectors read from ``w2v_path``.

    Returns:
        Tuple ``(df, embeddings)`` where ``embeddings`` has shape
        ``(number_of_words + 1, embedding_dim)``.
    """
    vocabs = {}  # word -> integer id, assigned in order of first appearance

    stops = set(stopwords.words('english'))

    print("Loading word2vec model(it may takes 2-3 mins) ...")
    if empty_w2v:
        word2vec = EmptyWord2Vec
    else:
        word2vec = KeyedVectors.load_word2vec_format(w2v_path, binary=True, limit=w2v_limit)

    for index, row in df.iterrows():
        # Progress report; relies on the index labels being integers.
        if index != 0 and index % 1000 == 0:
            print("{:,} sentences embedded.".format(index), flush=True)

        for question in ['question1', 'question2']:
            q2n = []  # the question as a list of word ids
            for word in text_to_word_list(row[question]):
                if word in stops:
                    continue
                if word not in vocabs:
                    vocabs[word] = len(vocabs) + 1
                q2n.append(vocabs[word])

            # NOTE(review): storing a list in a single cell presumably needs
            # the '<question>_n' column to already exist with object dtype;
            # confirm against the caller.
            df.at[index, question + '_n'] = q2n

    # Words without a pretrained vector keep their random initialisation.
    embeddings = np.random.randn(len(vocabs) + 1, embedding_dim)
    embeddings[0] = 0  # row 0 is the padding id

    # NOTE(review): `.vocab` / `.word_vec` are the gensim 3.x KeyedVectors
    # API; confirm the pinned gensim version before upgrading.
    for word, index in vocabs.items():
        if word in word2vec.vocab:
            embeddings[index] = word2vec.word_vec(word)

    return df, embeddings


def split_and_zero_padding(df, max_seq_length):
    """Return the two id-sequence columns as zero-padded arrays.

    Args:
        df: DataFrame with ``question1_n`` and ``question2_n`` columns.
        max_seq_length: Length every sequence is padded / truncated to.

    Returns:
        Dict with keys ``'left'`` and ``'right'`` holding the output of
        ``pad_sequences`` for ``question1_n`` and ``question2_n``. Padding is
        added in front, truncation removes from the end.
    """
    columns = {'left': 'question1_n', 'right': 'question2_n'}
    return {
        side: pad_sequences(df[column], padding='pre', truncating='post',
                            maxlen=max_seq_length)
        for side, column in columns.items()
    }


class ManDist(Layer):
    """Keras layer computing ``exp(-L1 distance)`` between two inputs."""

    def __init__(self, **kwargs):
        # Output tensor of the most recent `call`; None until first use.
        self.result = None
        super(ManDist, self).__init__(**kwargs)

    def build(self, input_shape):
        """No weights to create; defer to the base class."""
        super(ManDist, self).build(input_shape)

    def call(self, x, **kwargs):
        """Return ``exp(-sum(|x[0] - x[1]|))`` summed over axis 1.

        ``x`` is indexed as a pair of tensors; the summed axis is kept with
        size 1.
        """
        self.result = K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True))
        return self.result

    def compute_output_shape(self, input_shape):
        """Return the static shape of the layer output.

        Uses the tensor produced by the last `call` when available. Before
        the first call it derives the shape from the first input shape with
        axis 1 reduced to size 1 (assumes both inputs share that shape).
        """
        if self.result is not None:
            return K.int_shape(self.result)
        shape = list(input_shape[0])
        shape[1] = 1
        return tuple(shape)


class EmptyWord2Vec:
    """Stand-in for a word2vec model with an empty vocabulary (test use)."""

    vocab = {}
    word_vec = {}