"""Train a hierarchical attention BLSTM document classifier."""

import json
import sys
from collections import Counter

import numpy as np
import tensorflow as tf
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, SpatialDropout1D, TimeDistributed
from keras.layers.normalization import BatchNormalization
from keras.layers.wrappers import Bidirectional
from keras.models import Model, model_from_json
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from attention import AttentionWithContext
from data_gen import hierarchicalCorpus as Corpus

# Modify these paths as well.
DATA_DIR = '/home/alex/Documents/git_projects/Document-Classifier-LSTM/data/'
TRAIN_FILE = 'train_set.csv'
TRAIN_LABS = 'train_set_labels.csv'
EMBEDDING_FILE = '/home/alex/Documents/Python/glove.6B/glove.6B.200d.txt'

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 80000
# Max number of words in each abstract.
MAX_SEQUENCE_LENGTH = 100  # MAYBE BIGGER
# Max words per sentence and max sentences per document.
MAX_SENT_LEN = 25
MAX_SEQ_LEN = 5
# This is fixed (must match the GloVe file above).
EMBEDDING_DIM = 200
# The name of the model.
STAMP = 'doc_hatt_blstm'


def f1_score(y_true, y_pred):
    """
    Compute the micro f(b) score with b=1.
    """
    y_true = tf.cast(y_true, "float32")
    y_pred = tf.cast(tf.round(y_pred), "float32")  # implicit 0.5 threshold via tf.round

    y_correct = y_true * y_pred

    sum_true = tf.reduce_sum(y_true, axis=1)
    sum_pred = tf.reduce_sum(y_pred, axis=1)
    sum_correct = tf.reduce_sum(y_correct, axis=1)

    precision = sum_correct / sum_pred
    recall = sum_correct / sum_true
    f_score = 2 * precision * recall / (precision + recall)
    # Examples with no predicted or no true labels produce NaNs; score them as 0.
    f_score = tf.where(tf.is_nan(f_score), tf.zeros_like(f_score), f_score)

    return tf.reduce_mean(f_score)


def load_data(train_set, multilabel=True):
    """
    Tokenize the corpus, build the (documents x sentences x words) integer
    tensor and the label matrix, and split into train/validation sets.
    """
    X_data = []
    y_data = []
    for c, (vector, target) in enumerate(train_set):
        X_data.append(vector)
        y_data.append(target)
        if c % 10000 == 0:
            print(c)
    num_texts = len(X_data)
    print(num_texts, 'training examples')

    # The OOV token must be a string for out-of-vocabulary words to map to it.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token='<UNK>')

    # Flatten each document's sentences into one string so the tokenizer
    # can fit its vocabulary over the whole corpus.
    X_data_flat = []
    for raw_txt in X_data:
        flat_txt = ''
        for sent in raw_txt:
            flat_txt += sent
        X_data_flat.append(flat_txt)
    tokenizer.fit_on_texts(X_data_flat)

    X_data_int = np.zeros((num_texts, MAX_SEQ_LEN, MAX_SENT_LEN))
    for idx, raw_txt in enumerate(X_data):
        sentences_batch = np.zeros((MAX_SEQ_LEN, MAX_SENT_LEN))
        tokens = tokenizer.texts_to_sequences(raw_txt)
        sentences = pad_sequences(tokens,
                                  maxlen=MAX_SENT_LEN,
                                  padding='post',
                                  truncating='post',
                                  dtype='int32')
        for j, sent in enumerate(sentences):
            if j >= MAX_SEQ_LEN:
                break
            sentences_batch[j, :] = sent
        X_data_int[idx, :, :] = sentences_batch
    X_data = X_data_int

    print('Shape of data tensor:', X_data.shape)

    word_index = tokenizer.word_index
    print('Found %s unique tokens' % len(word_index))
    with open('word_index.json', 'w') as fp:
        json.dump(word_index, fp)
    print('Exported word dictionary')

    class_freqs = Counter([y for y_seq in y_data for y in y_seq]).most_common()
    class_list = [y[0] for y in class_freqs]
    nb_classes = len(class_list)
    print(nb_classes, 'classes')
    class_dict = dict(zip(class_list, np.arange(len(class_list))))
    with open('class_dict.json', 'w') as fp:
        json.dump(class_dict, fp)
    print('Exported class dictionary')

    y_data_int = []
    for y_seq in y_data:
        y_data_int.append([class_dict[y] for y in y_seq])

    if multilabel:
        mlb = MultiLabelBinarizer()
        mlb.fit([list(class_dict.values())])
        y_data = mlb.transform(y_data_int)
    else:
        y_data = to_categorical(y_data_int)

    print('Shape of label tensor:', y_data.shape)

    X_train, X_val, y_train, y_val = train_test_split(X_data, y_data,
                                                      train_size=0.8,
                                                      test_size=0.2,
                                                      random_state=42)

    return X_train, X_val, y_train, y_val, nb_classes, word_index


def prepare_embeddings(wrd2id):
    """
    Load the GloVe vectors and build the embedding matrix for the
    MAX_NB_WORDS most frequent tokens.
    """
    vocab_size = MAX_NB_WORDS
    print("Found %s words in the vocabulary." % vocab_size)

    embedding_idx = {}
    with open(EMBEDDING_FILE) as glove_f:
        for line in glove_f:
            values = line.split()
            wrd = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_idx[wrd] = coefs
    print("Found %s word vectors." % len(embedding_idx))

    embedding_mat = np.random.rand(vocab_size + 1, EMBEDDING_DIM)
    wrds_with_embeddings = 0
    # Keep the MAX_NB_WORDS most frequent tokens.
    for wrd, i in wrd2id.items():
        if i > vocab_size:
            continue
        embedding_vec = embedding_idx.get(wrd)
        # Words without embeddings will be left with random values.
        if embedding_vec is not None:
            wrds_with_embeddings += 1
            embedding_mat[i] = embedding_vec

    print(embedding_mat.shape)
    print('Words with embeddings:', wrds_with_embeddings)

    return embedding_mat, vocab_size


def build_model(nb_classes, word_index, embedding_dim, seq_length, stamp, multilabel=True):
    """
    Build the hierarchical attention model: a sentence encoder (BLSTM +
    attention) applied to each sentence, followed by a document-level
    BLSTM with attention over the sentence vectors.
    """
    embedding_matrix, nb_words = prepare_embeddings(word_index)

    input_layer = Input(shape=(MAX_SEQ_LEN, MAX_SENT_LEN), dtype='int32')

    # Sentence-level encoder.
    sentence_input = Input(shape=(MAX_SENT_LEN,), dtype='int32')
    embedding_layer = Embedding(input_dim=nb_words + 1,
                                output_dim=embedding_dim,
                                input_length=MAX_SENT_LEN,
                                weights=[embedding_matrix],
                                embeddings_regularizer=regularizers.l2(0.00),
                                trainable=True)(sentence_input)
    drop1 = SpatialDropout1D(0.3)(embedding_layer)
    sent_lstm = Bidirectional(LSTM(100,
                                   name='blstm_1',
                                   activation='tanh',
                                   recurrent_activation='hard_sigmoid',
                                   recurrent_dropout=0.0,
                                   dropout=0.4,
                                   kernel_initializer='glorot_uniform',
                                   return_sequences=True),
                              merge_mode='concat')(drop1)
    sent_att_layer = AttentionWithContext()(sent_lstm)
    sentEncoder = Model(sentence_input, sent_att_layer)
    sentEncoder.summary()

    # Document-level encoder over the per-sentence vectors.
    textEncoder = TimeDistributed(sentEncoder)(input_layer)
    drop2 = Dropout(0.4)(textEncoder)
    lstm_1 = Bidirectional(LSTM(100,
                                name='blstm_2',
                                activation='tanh',
                                recurrent_activation='hard_sigmoid',
                                recurrent_dropout=0.0,
                                dropout=0.4,
                                kernel_initializer='glorot_uniform',
                                return_sequences=True),
                           merge_mode='concat')(drop2)
    lstm_1 = BatchNormalization()(lstm_1)
    att_layer = AttentionWithContext()(lstm_1)
    drop3 = Dropout(0.5)(att_layer)

    if multilabel:
        predictions = Dense(nb_classes, activation='sigmoid')(drop3)
        model = Model(inputs=input_layer, outputs=predictions)
        adam = Adam(lr=0.001, decay=0.0)
        model.compile(loss='binary_crossentropy',
                      optimizer=adam,
                      metrics=[f1_score])
    else:
        predictions = Dense(nb_classes, activation='softmax')(drop3)
        model = Model(inputs=input_layer, outputs=predictions)
        adam = Adam(lr=0.001, decay=0.0)
        model.compile(loss='categorical_crossentropy',
                      optimizer=adam,
                      metrics=['accuracy'])

    model.summary()
    print(stamp)

    # Save the model.
    model_json = model.to_json()
    with open(stamp + ".json", "w") as json_file:
        json_file.write(model_json)

    return model


def load_model(stamp, multilabel=True):
    """
    Reload the architecture and weights of a previously trained model.
    """
    with open(stamp + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    # The custom attention layer must be passed in explicitly,
    # otherwise deserialization fails.
    model = model_from_json(loaded_model_json,
                            custom_objects={'AttentionWithContext': AttentionWithContext})
    model.load_weights(stamp + '.h5')
    print("Loaded model from disk")

    model.summary()

    adam = Adam(lr=0.001)
    if multilabel:
        model.compile(loss='binary_crossentropy',
                      optimizer=adam,
                      metrics=[f1_score])
    else:
        model.compile(loss='categorical_crossentropy',
                      optimizer=adam,
                      metrics=['accuracy'])

    return model


if __name__ == '__main__':
    # Expects exactly two command-line arguments.
    multilabel, load_previous = sys.argv[1:]
    print(multilabel, load_previous)
    multilabel = (multilabel == 'multi')
    load_previous = (load_previous == 'load')

    train_set = Corpus(DATA_DIR + TRAIN_FILE, DATA_DIR + TRAIN_LABS)

    X_train, X_val, y_train, y_val, nb_classes, word_index = load_data(train_set, multilabel)

    if load_previous:
        model = load_model(STAMP, multilabel)
    else:
        model = build_model(nb_classes, word_index, EMBEDDING_DIM,
                            MAX_SEQUENCE_LENGTH, STAMP, multilabel)

    # val_f1_score should be maximized, val_loss minimized.
    if multilabel:
        monitor_metric = 'val_f1_score'
        monitor_mode = 'max'
    else:
        monitor_metric = 'val_loss'
        monitor_mode = 'min'

    early_stopping = EarlyStopping(monitor=monitor_metric,
                                   patience=5,
                                   mode=monitor_mode)
    bst_model_path = STAMP + '.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path,
                                       monitor=monitor_metric,
                                       verbose=1,
                                       save_best_only=True,
                                       mode=monitor_mode,
                                       save_weights_only=True)

    hist = model.fit(X_train, y_train,
                     validation_data=(X_val, y_val),
                     epochs=100,
                     batch_size=128,
                     shuffle=True,
                     callbacks=[early_stopping, model_checkpoint])

    print(hist.history)
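
# Usage sketch (assuming this file is saved as doc_hatt_blstm.py; the script
# does not fix its own filename). The first argument enables multilabel mode
# when it is exactly 'multi', the second resumes from a saved model when it is
# exactly 'load'; any other token (e.g. the hypothetical 'new' below) simply
# disables the corresponding option:
#
#   python doc_hatt_blstm.py multi new     # train a fresh multilabel model
#   python doc_hatt_blstm.py multi load    # resume from doc_hatt_blstm.{json,h5}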