'''
Skip-thought vectors
https://github.com/ryankiros/skip-thoughts
'''
import os
import warnings

import theano
import theano.tensor as tensor

import _pickle as pkl
import numpy
import copy
import nltk

from collections import OrderedDict, defaultdict
from scipy.linalg import norm
from nltk.tokenize import word_tokenize

profile = False

#-----------------------------------------------------------------------------#
# Specify model and table locations here
#-----------------------------------------------------------------------------#
path_to_models = 'Data/skipthoughts/'
path_to_tables = 'Data/skipthoughts/'
#-----------------------------------------------------------------------------#

path_to_umodel = path_to_models + 'uni_skip.npz'
path_to_bmodel = path_to_models + 'bi_skip.npz'


def load_model():
    """
    Load the model with saved tables
    """
    # Load model options
    print('Loading model parameters...')
    with open('%s.pkl'%path_to_umodel, 'rb') as f:
        uoptions = pkl.load(f)
    with open('%s.pkl'%path_to_bmodel, 'rb') as f:
        boptions = pkl.load(f)

    # Load parameters
    uparams = init_params(uoptions)
    uparams = load_params(path_to_umodel, uparams)
    utparams = init_tparams(uparams)
    bparams = init_params_bi(boptions)
    bparams = load_params(path_to_bmodel, bparams)
    btparams = init_tparams(bparams)

    # Extractor functions
    print('Compiling encoders...')
    embedding, x_mask, ctxw2v = build_encoder(utparams, uoptions)
    f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v')
    embedding, x_mask, ctxw2v = build_encoder_bi(btparams, boptions)
    f_w2v2 = theano.function([embedding, x_mask], ctxw2v, name='f_w2v2')

    # Tables
    print('Loading tables...')
    utable, btable = load_tables()

    # Store everything we need in a dictionary
    print('Packing up...')
    model = {}
    model['uoptions'] = uoptions
    model['boptions'] = boptions
    model['utable'] = utable
    model['btable'] = btable
    model['f_w2v'] = f_w2v
    model['f_w2v2'] = f_w2v2

    return model


def load_tables():
    """
    Load the tables
    """
    words = []
    # allow_pickle is required on NumPy >= 1.16.3: the tables are pickled
    # Python 2 objects, which is also why encoding='bytes' is needed
    utable = numpy.load(path_to_tables + 'utable.npy', encoding='bytes', allow_pickle=True)
    btable = numpy.load(path_to_tables + 'btable.npy', encoding='bytes', allow_pickle=True)
    f = open(path_to_tables + 'dictionary.txt', 'rb')
    for line in f:
        words.append(line.decode('utf-8').strip())
    f.close()
    utable = OrderedDict(zip(words, utable))
    btable = OrderedDict(zip(words, btable))
    return utable, btable
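
# Sanity-check sketch (added here for convenience; not part of the original
# module): load_model() and load_tables() assume all of the files below exist,
# so an explicit existence check gives a clearer error than a failed numpy.load.
def _check_model_files():
    required = [path_to_umodel, path_to_umodel + '.pkl',
                path_to_bmodel, path_to_bmodel + '.pkl',
                path_to_tables + 'utable.npy',
                path_to_tables + 'btable.npy',
                path_to_tables + 'dictionary.txt']
    missing = [p for p in required if not os.path.exists(p)]
    if missing:
        raise FileNotFoundError('Missing skip-thoughts files: ' + ', '.join(missing))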

def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False):
    """
    Encode sentences in the list X. Each entry will return a vector
    """
    # first, do preprocessing
    X = preprocess(X)

    # word dictionary and init
    d = defaultdict(lambda: 0)
    for w in model['utable'].keys():
        d[w] = 1
    ufeatures = numpy.zeros((len(X), model['uoptions']['dim']), dtype='float32')
    bfeatures = numpy.zeros((len(X), 2 * model['boptions']['dim']), dtype='float32')

    # length dictionary: group sentence indices by length
    ds = defaultdict(list)
    captions = [s.split() for s in X]
    for i, s in enumerate(captions):
        ds[len(s)].append(i)

    # Get features. This encodes sentences bucketed by length,
    # in order to avoid wasting computation on padding
    for k in ds.keys():
        if verbose:
            print(k)
        numbatches = len(ds[k]) // batch_size + 1
        for minibatch in range(numbatches):
            caps = ds[k][minibatch::numbatches]

            if use_eos:
                uembedding = numpy.zeros((k+1, len(caps), model['uoptions']['dim_word']), dtype='float32')
                bembedding = numpy.zeros((k+1, len(caps), model['boptions']['dim_word']), dtype='float32')
            else:
                uembedding = numpy.zeros((k, len(caps), model['uoptions']['dim_word']), dtype='float32')
                bembedding = numpy.zeros((k, len(caps), model['boptions']['dim_word']), dtype='float32')
            for ind, c in enumerate(caps):
                caption = captions[c]
                for j in range(len(caption)):
                    # out-of-vocabulary words map to the UNK embedding
                    if d[caption[j]] > 0:
                        uembedding[j, ind] = model['utable'][caption[j]]
                        bembedding[j, ind] = model['btable'][caption[j]]
                    else:
                        uembedding[j, ind] = model['utable']['UNK']
                        bembedding[j, ind] = model['btable']['UNK']
                if use_eos:
                    uembedding[-1, ind] = model['utable']['<eos>']
                    bembedding[-1, ind] = model['btable']['<eos>']
            # every caption in this bucket has length k, so the mask is all ones
            if use_eos:
                uff = model['f_w2v'](uembedding, numpy.ones((k+1, len(caps)), dtype='float32'))
                bff = model['f_w2v2'](bembedding, numpy.ones((k+1, len(caps)), dtype='float32'))
            else:
                uff = model['f_w2v'](uembedding, numpy.ones((k, len(caps)), dtype='float32'))
                bff = model['f_w2v2'](bembedding, numpy.ones((k, len(caps)), dtype='float32'))
            if use_norm:
                for j in range(len(uff)):
                    uff[j] /= norm(uff[j])
                    bff[j] /= norm(bff[j])
            for ind, c in enumerate(caps):
                ufeatures[c] = uff[ind]
                bfeatures[c] = bff[ind]

    features = numpy.c_[ufeatures, bfeatures]
    return features


def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X


def nn(model, text, vectors, query, k=5):
    """
    Return the nearest neighbour sentences to query
    text: list of sentences
    vectors: the corresponding representations for text
    query: a string to search
    """
    qf = encode(model, [query])
    qf /= norm(qf)
    scores = numpy.dot(qf, vectors.T).flatten()
    sorted_args = numpy.argsort(scores)[::-1]
    sentences = [text[a] for a in sorted_args[:k]]
    print('QUERY: ' + query)
    print('NEAREST: ')
    for i, s in enumerate(sentences):
        print(s, sorted_args[i])
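
# Usage sketch (added for illustration; the sentences are made up): with the
# released models, encode() returns one 4800-d row per sentence -- the first
# 2400 dimensions from the uni-skip encoder, the rest from the bi-skip encoder.
def _demo_encode():
    model = load_model()
    sentences = ['A cat sits on the mat.', 'A dog runs in the park.']
    vectors = encode(model, sentences, verbose=False)
    print(vectors.shape)  # (2, 4800) with the released models
    nn(model, sentences, vectors, 'A kitten rests on a rug.', k=1)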

def word_features(table):
    """
    Extract word features into a normalized matrix
    """
    features = numpy.zeros((len(table), 620), dtype='float32')
    keys = list(table.keys())  # list() so the keys can be indexed in Python 3
    for i in range(len(table)):
        f = table[keys[i]]
        features[i] = f / norm(f)
    return features


def nn_words(table, wordvecs, query, k=10):
    """
    Get the nearest neighbour words
    """
    keys = list(table.keys())
    qf = table[query]
    scores = numpy.dot(qf, wordvecs.T).flatten()
    sorted_args = numpy.argsort(scores)[::-1]
    words = [keys[a] for a in sorted_args[:k]]
    print('QUERY: ' + query)
    print('NEAREST: ')
    for i, w in enumerate(words):
        print(w)


def _p(pp, name):
    """
    make prefix-appended name
    """
    return '%s_%s'%(pp, name)


def init_tparams(params):
    """
    initialize Theano shared variables according to the initial parameters
    """
    tparams = OrderedDict()
    for kk, pp in params.items():
        tparams[kk] = theano.shared(params[kk], name=kk)
    return tparams


def load_params(path, params):
    """
    load parameters
    """
    pp = numpy.load(path)
    for kk, vv in params.items():
        if kk not in pp:
            warnings.warn('%s is not in the archive'%kk)
            continue
        params[kk] = pp[kk]
    return params


# layers: 'name': ('parameter initializer', 'feedforward')
layers = {'gru': ('param_init_gru', 'gru_layer')}

def get_layer(name):
    fns = layers[name]
    return (eval(fns[0]), eval(fns[1]))


def init_params(options):
    """
    initialize all parameters needed for the encoder
    """
    params = OrderedDict()

    # embedding
    params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word'])

    # encoder: GRU
    params = get_layer(options['encoder'])[0](options, params, prefix='encoder',
                                              nin=options['dim_word'], dim=options['dim'])
    return params


def init_params_bi(options):
    """
    initialize all parameters needed for bidirectional encoder
    """
    params = OrderedDict()

    # embedding
    params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word'])

    # encoder: forward and reverse GRUs
    params = get_layer(options['encoder'])[0](options, params, prefix='encoder',
                                              nin=options['dim_word'], dim=options['dim'])
    params = get_layer(options['encoder'])[0](options, params, prefix='encoder_r',
                                              nin=options['dim_word'], dim=options['dim'])
    return params


def build_encoder(tparams, options):
    """
    build an encoder, given pre-computed word embeddings
    """
    # word embedding (source)
    embedding = tensor.tensor3('embedding', dtype='float32')
    x_mask = tensor.matrix('x_mask', dtype='float32')

    # encoder
    proj = get_layer(options['encoder'])[1](tparams, embedding, options,
                                            prefix='encoder', mask=x_mask)
    ctx = proj[0][-1]

    return embedding, x_mask, ctx


def build_encoder_bi(tparams, options):
    """
    build bidirectional encoder, given pre-computed word embeddings
    """
    # word embedding (source); the reverse encoder reads the sequence backwards
    embedding = tensor.tensor3('embedding', dtype='float32')
    embeddingr = embedding[::-1]
    x_mask = tensor.matrix('x_mask', dtype='float32')
    xr_mask = x_mask[::-1]

    # encoder
    proj = get_layer(options['encoder'])[1](tparams, embedding, options,
                                            prefix='encoder', mask=x_mask)
    projr = get_layer(options['encoder'])[1](tparams, embeddingr, options,
                                             prefix='encoder_r', mask=xr_mask)
    ctx = tensor.concatenate([proj[0][-1], projr[0][-1]], axis=1)

    return embedding, x_mask, ctx


# some utilities
def ortho_weight(ndim):
    W = numpy.random.randn(ndim, ndim)
    u, s, v = numpy.linalg.svd(W)
    return u.astype('float32')


def norm_weight(nin, nout=None, scale=0.1, ortho=True):
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        W = ortho_weight(nin)
    else:
        W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout))
    return W.astype('float32')


def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
    """
    parameter init for GRU
    """
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']
    # W and U hold the reset and update gate weights, concatenated
    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W
    params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U
    # Wx/Ux/bx parameterize the candidate hidden state
    Wx = norm_weight(nin, dim)
    params[_p(prefix, 'Wx')] = Wx
    Ux = ortho_weight(dim)
    params[_p(prefix, 'Ux')] = Ux
    params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')
    return params
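
# Shape-check sketch (added for illustration; the 'toy' prefix and sizes are
# made up): param_init_gru stacks the two gates along the second axis, so for
# nin=3, dim=4 it should produce W (3, 8), b (8,), U (4, 8), Wx (3, 4),
# Ux (4, 4) and bx (4,).
def _demo_gru_param_shapes():
    params = param_init_gru({'dim_proj': 4}, OrderedDict(), prefix='toy', nin=3, dim=4)
    for name, value in params.items():
        print(name, value.shape)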

def gru_layer(tparams, state_below, options, prefix='gru', mask=None, **kwargs):
    """
    Forward pass through GRU layer
    """
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    dim = tparams[_p(prefix, 'Ux')].shape[1]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    # precompute the input projections for all timesteps at once
    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]
    state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')]
    U = tparams[_p(prefix, 'U')]
    Ux = tparams[_p(prefix, 'Ux')]

    def _step_slice(m_, x_, xx_, h_, U, Ux):
        preact = tensor.dot(h_, U)
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, dim))  # reset gate
        u = tensor.nnet.sigmoid(_slice(preact, 1, dim))  # update gate

        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        h = tensor.tanh(preactx)  # candidate hidden state

        h = u * h_ + (1. - u) * h
        # where the mask is zero (padding), carry the previous state forward
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h

    seqs = [mask, state_below_, state_belowx]
    _step = _step_slice

    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[tensor.alloc(0., n_samples, dim)],
                                non_sequences=[tparams[_p(prefix, 'U')],
                                               tparams[_p(prefix, 'Ux')]],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps,
                                profile=profile,
                                strict=True)
    rval = [rval]
    return rval
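
# Reference sketch (added for illustration; not part of the original module):
# one GRU step in plain numpy, mirroring _step_slice above. m_ is the mask
# column, x_/xx_ are the precomputed gate and candidate input projections.
def _gru_step_numpy(m_, x_, xx_, h_, U, Ux, dim):
    sigmoid = lambda z: 1. / (1. + numpy.exp(-z))
    preact = numpy.dot(h_, U) + x_
    r = sigmoid(preact[:, :dim])        # reset gate
    u = sigmoid(preact[:, dim:2*dim])   # update gate
    h = numpy.tanh(numpy.dot(h_, Ux) * r + xx_)  # candidate state
    h = u * h_ + (1. - u) * h
    return m_[:, None] * h + (1. - m_)[:, None] * h_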