import numpy as np
import re
import itertools
from collections import Counter
import cPickle as pickle
import os


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    """
    string = re.sub(r"[^A-Za-z0-9:(),!?\'\`]", " ", string)
    string = re.sub(r" : ", ":", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def clean_str_vn(string):
    """
    Tokenization/string cleaning for the Vietnamese dataset.
    """
    string = re.sub(r"[~`@#$%^&*-+]", " ", string)

    def sharp(s):
        # Drop stray single-letter tokens before a full stop, collapse runs of
        # '. . ', and mark the remaining sentence-final periods with ' # '.
        b = re.sub(r'\s[A-Za-z]\s\.', ' .', ' ' + s)
        while b.find('. . ') >= 0:
            b = re.sub(r'\.\s\.\s', '. ', b)
        b = re.sub(r'\s\.\s', ' # ', b)
        return b

    string = sharp(string)
    string = re.sub(r" : ", ":", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

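
# A minimal sanity check of the tokenization above (illustrative only, not part
# of the original pipeline). The expected string shows how clean_str splits
# contractions and pads punctuation with spaces so that a plain split(" ")
# yields one token per word.
def _demo_clean_str():
    assert clean_str("I'll see you, OK!") == "i 'll see you , ok !"
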
def load_data_and_labels(vn):
    """
    Loads data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    folder_prefix = 'data' + int(vn) * 'vn' + '/'
    x_train = list(open(folder_prefix + "train").readlines())
    x_test = list(open(folder_prefix + "test").readlines())
    test_size = len(x_test)
    x_text = x_train + x_test
    # Split by words
    if not vn:
        clean_func = clean_str
    else:
        clean_func = clean_str_vn
    x_text = [clean_func(sent) for sent in x_text]
    # The first token of each line carries the label; only the part before ':'
    # is kept as the class, the remaining tokens form the sentence.
    y = [s.split(' ')[0].split(':')[0] for s in x_text]
    x_text = [s.split(" ")[1:] for s in x_text]
    # Generate labels
    all_label = dict()
    for label in y:
        if label not in all_label:
            all_label[label] = len(all_label) + 1
    one_hot = np.identity(len(all_label))
    y = [one_hot[all_label[label] - 1] for label in y]
    return [x_text, y, test_size]


def load_trained_vecs(vn, vn_file, en_file, vocabulary):
    """
    Loads pre-trained word vectors for the given vocabulary, caching them in a
    pickle file so the binary word2vec file only has to be scanned once.
    """
    folder_prefix = 'data' + int(vn) * 'vn' + '/'
    if not os.path.exists(folder_prefix + 'trained_vecs.PICKLE'):
        binfile = int(vn) * vn_file + (1 - int(vn)) * en_file
        trained_vecs = load_bin_vec(folder_prefix + binfile, vocabulary)
        with open(folder_prefix + 'trained_vecs.PICKLE', 'wb') as f:
            pickle.dump([trained_vecs], f, protocol=-1)
    else:
        with open(folder_prefix + 'trained_vecs.PICKLE', 'rb') as f:
            trained_vecs = pickle.load(f)[0]
    return trained_vecs


def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest
    sentence. Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences


def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word, most frequent first
    # vocabulary_inv = ['<PAD/>', 'the', ...]
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    # vocabulary = {'<PAD/>': 0, 'the': 1, ',': 2, 'a': 3, 'and': 4, ...}
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]


def load_data(vn):
    """
    Loads and preprocesses the data.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels, test_size = load_data_and_labels(vn)
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv, test_size]


def load_bin_vec(fname, vocab):
    """
    Loads 300-dimensional word vectors from the Google (Mikolov) word2vec binary
    format, keeping only the words present in vocab.
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs


def add_unknown_words(word_vecs, vocab, min_df=0, k=300):
    """
    Adds a random vector for every vocabulary word without a pre-trained one.
    0.25 is chosen so the unknown vectors have (approximately) the same variance
    as the pre-trained ones. Returns the number of words that already had a
    vector (with the default min_df=0).
    """
    count = 0
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
        else:
            count += 1
    return count


def batch_iter(data, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = (batch_num + 1) * batch_size
            if end_index > data_size:
                # Keep the last batch full-sized by letting it overlap the
                # previous one.
                end_index = data_size
                start_index = end_index - batch_size
            yield shuffled_data[start_index:end_index]
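

# Illustrative usage sketch (not part of the original code): wires the helpers
# above together for the English setting (vn=0). It assumes data/train and
# data/test exist with one labelled sentence per line, as load_data_and_labels
# expects; the batch size of 64 is an arbitrary example value.
if __name__ == '__main__':
    x, y, vocabulary, vocabulary_inv, test_size = load_data(0)
    print 'vocabulary size: %d' % len(vocabulary)
    print 'train / test sentences: %d / %d' % (len(x) - test_size, test_size)
    # Batch over shuffled training indices for one epoch.
    train_idx = np.arange(len(x) - test_size)
    for batch in batch_iter(train_idx, batch_size=64, num_epochs=1):
        x_batch, y_batch = x[batch], y[batch]
        # ... feed x_batch / y_batch to the model here ...
        break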