# Python source code of util

import re
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
import gensim
import pdb
import numpy as np
import itertools



def generate_confidence_intervals(y_test, preds, estimator, nsample=20000, conf= 95.0):
	"""Bootstrap a confidence interval for ``estimator(y_test, preds)``.

	The (target, prediction) pairs are resampled with replacement ``nsample``
	times. For every resample the difference between the resampled score and
	the score on the full data is recorded; the lower and upper percentiles of
	those differences are then added back to the full-data score.

	Args:
		y_test: Sequence of reference values.
		preds: Sequence of predictions, same length as ``y_test``.
		estimator: Callable ``estimator(y, p)`` returning a numeric score.
		nsample: Number of bootstrap resamples.
		conf: Confidence level in percent (e.g. ``95.0``).

	Returns:
		Tuple ``(lower_score, upper_score)``.

	Raises:
		ValueError: If ``y_test`` and ``preds`` differ in length.
	"""
	print('y_test.shape: {}, preds.shape: {}'.format(len(y_test), len(preds) ))
	n = len(preds)
	if len(y_test) != n:
		raise ValueError(
			'y_test and preds must have the same length, got {} and {}'.format(len(y_test), n))

	# Keep targets and predictions in separate arrays. Packing them into one
	# 2-D array would force a common dtype (e.g. int labels become floats)
	# and would break for predictions that have more than one column.
	y_arr = np.asarray(y_test)
	pred_arr = np.asarray(preds)

	x_bar = estimator(y_test, preds)
	print('x_bar: {}'.format(x_bar))

	delta_vals = []
	for _ in range(nsample):
		# The same row indices are used for both arrays so pairs stay aligned.
		bootstrap_idx = np.random.choice(n, n)
		nscore = estimator(y_arr[bootstrap_idx], pred_arr[bootstrap_idx])
		delta_vals.append(nscore - x_bar)

	# Split the excluded probability mass evenly between both tails.
	tail = (100 - conf) / 2.0
	lower_bound, upper_bound = np.percentile(np.array(delta_vals), [tail, 100 - tail])
	return (x_bar + lower_bound, x_bar + upper_bound)

def text_to_word_list(text):
	"""Normalise ``text`` and return it as a list of tokens.

	The input is converted with ``str()``, lower-cased, passed through a fixed
	sequence of regex substitutions (applied strictly in the order listed
	below, since later rules rely on earlier ones) and finally split on
	whitespace.
	"""
	# (pattern, replacement) pairs; the order is significant.
	rewrites = (
		# Blank out everything outside the allowed set. Inside the class,
		# "+-=" is a character range, so ':', ';' and '<' are kept as well.
		(r"[^A-Za-z0-9^,!.\/'+-=]", " "),
		# Contractions.
		(r"what's", "what is "),
		(r"\'s", " "),
		(r"\'ve", " have "),
		(r"can't", "cannot "),
		(r"n't", " not "),
		(r"i'm", "i am "),
		(r"\'re", " are "),
		(r"\'d", " would "),
		(r"\'ll", " will "),
		# Punctuation and operators.
		(r",", " "),
		(r"\.", " "),
		(r"!", " ! "),
		(r"\/", " "),
		(r"\^", " ^ "),
		(r"\+", " + "),
		(r"\-", " - "),
		(r"\=", " = "),
		(r"'", " "),
		# "<digits>k" -> "<digits>000".
		(r"(\d+)(k)", r"\g<1>000"),
		(r":", " : "),
		# Abbreviations that the rules above have split apart.
		(r" e g ", " eg "),
		(r" b g ", " bg "),
		(r" u s ", " american "),
		# NOTE(review): in Python's re syntax "\0" is the NUL character, so
		# this matches NUL + "s", not "0s" - confirm the intent.
		(r"\0s", "0"),
		(r" 9 11 ", "911"),
		(r"e - mail", "email"),
		(r"j k", "jk"),
		# Collapse runs of whitespace.
		(r"\s{2,}", " "),
	)

	cleaned = str(text).lower()
	for pattern, replacement in rewrites:
		cleaned = re.sub(pattern, replacement, cleaned)

	return cleaned.split()


def make_w2v_embeddings(df, embedding_dim=300, empty_w2v=False,
		w2v_path="/scratchd/home/satwik/embeddings/GoogleNews-vectors-negative300.bin.gz",
		w2v_limit=80000):
	"""Encode both question columns as word-id lists and build an embedding matrix.

	Every non-stopword token produced by ``text_to_word_list`` is given an
	integer id (starting at 1, in order of first appearance). The id lists are
	written to the new columns ``question1_n`` and ``question2_n`` of ``df``
	(``df`` is modified in place). The embedding matrix is initialised with
	standard-normal noise; rows of words found in the word2vec model are
	overwritten with the pretrained vector and row 0 is zeroed for padding.

	Args:
		df: DataFrame with ``question1`` and ``question2`` columns.
		embedding_dim: Width of the embedding matrix.
		empty_w2v: If true, use the ``EmptyWord2Vec`` stub instead of loading
			a model, so every row keeps its random initialisation.
		w2v_path: Path of the binary word2vec file to load.
		w2v_limit: Value passed as ``limit`` to ``load_word2vec_format``.

	Returns:
		Tuple ``(df, embeddings)`` where ``embeddings`` has shape
		``(number_of_distinct_words + 1, embedding_dim)``.
	"""
	question_cols = ('question1', 'question2')
	vocabs = {}

	# Stopwords
	stops = set(stopwords.words('english'))

	# Load word2vec
	print("Loading word2vec model(it may takes 2-3 mins) ...")

	if empty_w2v:
		word2vec = EmptyWord2Vec
	else:
		word2vec = KeyedVectors.load_word2vec_format(w2v_path, binary=True, limit=w2v_limit)

	# gensim >= 4 exposes the vocabulary as `key_to_index`; older releases and
	# the EmptyWord2Vec stub expose it as `vocab`.
	known_words = getattr(word2vec, 'key_to_index', None)
	if known_words is None:
		known_words = word2vec.vocab

	# Id lists are collected per column and assigned in one go afterwards:
	# writing a list into a single cell of a not-yet-existing column is not
	# reliable across pandas versions.
	encoded = {question: [] for question in question_cols}

	for position, (_, row) in enumerate(df.iterrows()):
		# Report progress by row position so that non-integer or shuffled
		# index labels do not matter.
		if position != 0 and position % 1000 == 0:
			print("{:,} sentences embedded.".format(position), flush=True)

		for question in question_cols:
			q2n = []  # q2n -> question numbers representation
			for word in text_to_word_list(row[question]):
				# Skip unwanted words
				if word in stops:
					continue

				# First occurrence: hand out the next free id (0 is padding).
				if word not in vocabs:
					vocabs[word] = len(vocabs) + 1
				q2n.append(vocabs[word])

			encoded[question].append(q2n)

	for question in question_cols:
		df[question + '_n'] = encoded[question]

	embeddings = np.random.randn(len(vocabs) + 1, embedding_dim)  # This will be the embedding matrix
	embeddings[0] = 0  # So that the padding will be ignored

	# Overwrite the random rows of all words the model knows.
	for word, index in vocabs.items():
		if word in known_words:
			embeddings[index] = word2vec.word_vec(word)
	del word2vec

	return df, embeddings


def split_and_zero_padding(df, max_seq_length):
	"""Return the two encoded question columns as padded arrays.

	``df['question1_n']`` and ``df['question2_n']`` are each passed through
	``pad_sequences`` with ``padding='pre'``, ``truncating='post'`` and
	``maxlen=max_seq_length``; the results are returned in a dict under the
	keys ``'left'`` and ``'right'``.
	"""
	def _pad(sequences):
		return pad_sequences(sequences, padding='pre', truncating='post', maxlen=max_seq_length)

	# Look up both columns before padding either of them.
	left_sequences = df['question1_n']
	right_sequences = df['question2_n']

	return {'left': _pad(left_sequences), 'right': _pad(right_sequences)}


#  --

class ManDist(Layer):
	"""
	Keras custom layer computing ``exp(-sum(|a - b|))`` for a pair of inputs,
	i.e. the exponential of the negated Manhattan (L1) distance.
	"""

	def __init__(self, **kwargs):
		# Holds the tensor produced by the most recent call().
		self.result = None
		super().__init__(**kwargs)

	def build(self, input_shape):
		# Nothing to create here; just defer to the base class.
		super().build(input_shape)

	def call(self, x, **kwargs):
		# x[0] and x[1] are the two inputs that get compared.
		abs_diff = K.abs(x[0] - x[1])
		l1_distance = K.sum(abs_diff, axis=1, keepdims=True)
		self.result = K.exp(-l1_distance)
		return self.result

	def compute_output_shape(self, input_shape):
		# The shape is read from the stored result, so this relies on call()
		# having run before.
		return K.int_shape(self.result)


class EmptyWord2Vec:
	"""
	Placeholder with the attribute names of a word2vec model but no content.
	Just for test use.
	"""
	# Empty vocabulary: every membership test against it is False.
	vocab = dict()
	# Empty stand-in for the vector lookup.
	word_vec = dict()