""" Data processing for VisualWordLSTM happens here; this creates a class that acts as a data generator/feed for model training. """ from __future__ import print_function from collections import defaultdict import cPickle import h5py import logging import numpy as np np.set_printoptions(threshold='nan') import os import sys import random # Set up logger logging.basicConfig(level=logging.INFO, stream=sys.stdout) logger = logging.getLogger(__name__) # Strings for beginning, end of sentence, padding # These get specified indices in word2index BOS = "<S>" # index 1 EOS = "<E>" # index 2 PAD = "<P>" # index 0 # Dimensionality of image feature vector IMG_FEATS = 4096 class VisualWordDataGenerator(object): """ Creates input arrays for VisualWordLSTM and deals with input dataset in general. Input dataset must now be in HTF5 format. Important methods: random_generator() yields random batches from the training data split fixed_generator() yields batches in the order it is stored on disk generation_generator() yields batches with empty word sequences """ def __init__(self, args_dict, input_dataset=None): """ Initialise data generator: this involves loading the dataset and generating vocabulary sizes. If dataset is not given, use flickr8k.h5. """ logger.info("Initialising data generator") self.args = args_dict # Number of descriptions to return per image. self.num_sents = args_dict.num_sents # default 5 (for flickr8k) self.unk = args_dict.unk # default 5 self.run_string = args_dict.run_string # self.datasets holds 1+ datasets, where additional datasets will # be used for supertraining the model self.datasets = [] self.openmode = "r+" if self.args.h5_writeable else "r" if not input_dataset: logger.warn("No dataset given, using flickr8k") self.dataset = h5py.File("flickr8k/dataset.h5", self.openmode) else: self.dataset = h5py.File("%s/dataset.h5" % input_dataset, self.openmode) logger.info("Train/val dataset: %s", input_dataset) if args_dict.supertrain_datasets is not None: for path in args_dict.supertrain_datasets: logger.info("Adding supertrain datasets: %s", path) self.datasets.append(h5py.File("%s/dataset.h5" % path, "r")) self.datasets.append(self.dataset) # hsn doesn't have to be a class variable. # what happens if self.hsn is false but hsn_size is not zero? 
        self.use_source = False
        if self.args.source_vectors is not None:
            self.source_dataset = h5py.File("%s/dataset.h5"
                                            % self.args.source_vectors, "r")
            self.source_encoder = args_dict.source_enc
            self.source_type = args_dict.source_type
            h5_dataset_keys = self.source_dataset['train']['000000'].keys()
            self.h5_dataset_str = next((z for z in h5_dataset_keys
                                        if z.startswith("%s-hidden_feats-%s"
                                                        % (self.source_type,
                                                           self.source_encoder))),
                                       None)
            #self.h5_dataset_str = "%s-hidden_feats-%s-%d" % (self.source_type,
            #                                                 self.source_encoder,
            #                                                 self.source_dim)
            assert self.h5_dataset_str is not None
            self.hsn_size = len(self.source_dataset['train']['000000']
                                [self.h5_dataset_str][0])
            self.source_dim = self.hsn_size
            self.num_hsn = len(self.source_dataset['train']['000000']
                               [self.h5_dataset_str])
            self.use_source = True
            logger.info("Reading %d source vectors from %s with %d dims",
                        self.num_hsn, self.h5_dataset_str, self.hsn_size)

        self.use_image = False if self.args.no_image else True

        # These variables are filled by extract_vocabulary
        self.word2index = dict()
        self.index2word = dict()
        # This is set to include BOS & EOS padding
        self.max_seq_len = 0
        # Can check after extract_vocabulary what the actual max seq length
        # is (including padding)
        self.actual_max_seq_len = 0

        # This counts the number of descriptions per split
        # Ignores test for now (change in extract_vocabulary)
        self.split_sizes = {'train': 0, 'val': 0, 'test': 0}

        # These are used to speed up the validation process
        self._cached_val_input = None
        self._cached_val_targets = None
        self._cached_references = None

        if self.args.use_predicted_tokens and self.args.no_image:
            logger.info("Input predicted descriptions")
            self.ds_type = 'predicted_description'
        else:
            logger.info("Input gold descriptions")
            self.ds_type = 'descriptions'

    def random_generator(self, split):
        """
        Generator that produces input/output tuples for a given dataset and
        split. Typically used to produce random batches for training a model.

        The data is yielded by first shuffling the description indices and
        then shuffling the image instances within the split.
        """
        # For randomization, we use an independent Random instance.
        random_instance = random.Random()

        # Make sure that the desired split is actually in the dataset.
        assert split in self.dataset

        # Get a list of the keys. We will use this list to shuffle and
        # iterate over.
        identifiers = self.dataset[split].keys()

        # Get the number of descriptions.
        first_id = identifiers[0]
        num_descriptions = len(self.dataset[split][first_id]['descriptions'])
        description_indices = list(range(num_descriptions))

        arrays = self.get_batch_arrays(self.args.batch_size)
        batch_indices = []
        j = 0
        # Shuffle the description indices.
        random_instance.shuffle(description_indices)
        while j <= len(identifiers):
            # And loop over them.
            i = 0
            for desc_idx in description_indices:
                # For each iteration over the description indices, also
                # shuffle the identifiers.
                random_instance.shuffle(identifiers)
                # And loop over them.
                for ident in identifiers:
                    if i == self.args.batch_size:
                        targets = self.get_target_descriptions(arrays[0])
                        yield_data = self.create_yield_dict(arrays, targets,
                                                            batch_indices)
                        #logger.debug(yield_data['img'][0,0,:])
                        #logger.debug(' '.join([self.index2word[np.argmax(x)]
                        #                       for x in yield_data['text'][0,:,:]]))
                        #logger.debug(' '.join([self.index2word[np.argmax(x)]
                        #                       for x in yield_data['output'][0,:,:]]))
                        yield yield_data
                        i = 0
                        arrays = self.get_batch_arrays(self.args.batch_size)
                        batch_indices = []

                    description = self.dataset[split][ident]['descriptions'][desc_idx]
                    img_feats = self.get_image_features(self.dataset,
                                                        split, ident)
                    try:
                        description_array = self.format_sequence(description.split(),
                                                                 train=True)
                        arrays[0][i] = description_array
                        if self.use_image and self.use_source:
                            if self.args.peeking_source:
                                arrays[1][i, :] = \
                                    self.get_source_features(split, ident)
                            else:
                                arrays[1][i, 0] = \
                                    self.get_source_features(split, ident)
                            if self.args.mrnn:
                                arrays[2][i, :] = img_feats
                            else:
                                arrays[2][i, 0] = img_feats
                        elif self.use_image:
                            if self.args.mrnn:
                                arrays[1][i, :] = img_feats
                            else:
                                arrays[1][i, 0] = img_feats
                        elif self.use_source:
                            if self.args.peeking_source:
                                arrays[1][i, :] = \
                                    self.get_source_features(split, ident)
                            else:
                                arrays[1][i, 0] = \
                                    self.get_source_features(split, ident)
                        batch_indices.append([ident, desc_idx])
                        i += 1
                    except AssertionError:
                        # The description doesn't share any words with the
                        # vocabulary.
                        pass

            if i != 0:
                self.resize_arrays(i, arrays)
                targets = self.get_target_descriptions(arrays[0])
                #logger.info(' '.join([self.index2word[np.argmax(x)]
                #                      for x in arrays[0][0,:,:]]))
                yield_data = self.create_yield_dict(arrays, targets,
                                                    batch_indices)
                yield yield_data
                i = 0
                j = 0
                arrays = self.get_batch_arrays(self.args.batch_size)
                batch_indices = []

    def fixed_generator(self, split='val'):
        """Generator that returns the instances in a split in the fixed order
        defined in the underlying data. Useful for calculating perplexity,
        etc. No randomization."""
        arrays = self.get_batch_arrays(self.args.batch_size)
        batch_indices = []
        i = 0
        j = 0
        # Get the number of descriptions.
        identifiers = self.dataset[split].keys()
        first_id = identifiers[0]
        num_descriptions = len(self.dataset[split][first_id]['descriptions'])
        description_indices = list(range(num_descriptions))

        while j <= len(identifiers):
            i = 0
            for ident in identifiers:
                for desc_idx in description_indices:
                    if i == self.args.batch_size:
                        targets = self.get_target_descriptions(arrays[0])
                        yield_data = self.create_yield_dict(arrays, targets,
                                                            batch_indices)
                        yield yield_data
                        i = 0
                        arrays = self.get_batch_arrays(self.args.batch_size)
                        batch_indices = []

                    description = self.dataset[split][ident]['descriptions'][desc_idx]
                    img_feats = self.get_image_features(self.dataset,
                                                        split, ident)
                    try:
                        description_array = self.format_sequence(description.split())
                        arrays[0][i] = description_array
                        if self.use_image and self.use_source:
                            if self.args.peeking_source:
                                arrays[1][i, :] = \
                                    self.get_source_features(split, ident)
                            else:
                                arrays[1][i, 0] = \
                                    self.get_source_features(split, ident)
                            if self.args.mrnn:
                                arrays[2][i, :] = img_feats
                            else:
                                arrays[2][i, 0] = img_feats
                        elif self.use_image:
                            if self.args.mrnn:
                                arrays[1][i, :] = img_feats
                            else:
                                arrays[1][i, 0] = img_feats
                        elif self.use_source:
                            if self.args.peeking_source:
                                arrays[1][i, :] = \
                                    self.get_source_features(split, ident)
                            else:
                                arrays[1][i, 0] = \
                                    self.get_source_features(split, ident)
                        batch_indices.append([ident, desc_idx])
                        i += 1
                    except AssertionError:
                        # The description doesn't share any words with the
                        # vocabulary.
                        logger.info('Could not encode %s', description)
                        pass

            if i != 0:
                logger.debug("Outside for loop")
                self.resize_arrays(i, arrays)
                targets = self.get_target_descriptions(arrays[0])
                logger.debug(' '.join([self.index2word[np.argmax(x)]
                                       for x in arrays[0][0,:,:]
                                       if self.index2word[np.argmax(x)] != "<P>"]))
                yield_data = self.create_yield_dict(arrays, targets,
                                                    batch_indices)
                yield yield_data
                i = 0
                j = 0
                arrays = self.get_batch_arrays(self.args.batch_size)
                batch_indices = []

    def generation_generator(self, split='val', batch_size=-1,
                             in_callbacks=False):
        """Generator for generating descriptions.
        This will only return one array per instance in the data.
        No randomization.

        batch_size=1 will return minibatches of one.
        Use this for beam search decoding.
        """
        identifiers = self.dataset[split].keys()
        i = 0  # used to control the enumerator
        batch_size = self.args.batch_size \
            if batch_size == -1 \
            else batch_size
        arrays = self.get_batch_arrays(batch_size,
                                       generation=not in_callbacks)
        batch_indices = []

        desc_idx = 0
        for ident in identifiers:
            if i == batch_size:
                targets = self.get_target_descriptions(arrays[0])
                logger.debug(arrays[0].shape)
                logger.debug(' '.join([self.index2word[np.argmax(x)]
                                       for x in arrays[0][0,:,:]
                                       if self.index2word[np.argmax(x)] != "<P>"]))
                yield_data = self.create_yield_dict(arrays, targets,
                                                    batch_indices)
                yield yield_data
                i = 0
                arrays = self.get_batch_arrays(batch_size,
                                               generation=not in_callbacks)
                batch_indices = []

            description = self.dataset[split][ident]['descriptions'][desc_idx]
            img_feats = self.get_image_features(self.dataset, split, ident)
            try:
                description_array = self.format_sequence(description.split(),
                                                         generation=not in_callbacks,
                                                         in_callbacks=in_callbacks)
                arrays[0][i] = description_array
                if self.use_image and self.use_source:
                    if self.args.peeking_source:
                        arrays[1][i, :] = \
                            self.get_source_features(split, ident)
                    else:
                        arrays[1][i, 0] = \
                            self.get_source_features(split, ident)
                    if self.args.mrnn:
                        arrays[2][i, :] = img_feats
                    else:
                        arrays[2][i, 0] = img_feats
                elif self.use_image:
                    if self.args.mrnn:
                        arrays[1][i, :] = img_feats
                    else:
                        arrays[1][i, 0] = img_feats
                elif self.use_source:
                    if self.args.peeking_source:
                        arrays[1][i, :] = \
                            self.get_source_features(split, ident)
                    else:
                        arrays[1][i, 0] = \
                            self.get_source_features(split, ident)
                batch_indices.append([ident, desc_idx])
                i += 1
            except AssertionError:
                # The description doesn't share any words with the
                # vocabulary.
                pass

        if i != 0:
            logger.debug("Outside for loop")
            self.resize_arrays(i, arrays)
            targets = self.get_target_descriptions(arrays[0])
            logger.debug(' '.join([self.index2word[np.argmax(x)]
                                   for x in arrays[0][0,:,:]
                                   if self.index2word[np.argmax(x)] != "<P>"]))
            yield_data = self.create_yield_dict(arrays, targets,
                                                batch_indices)
            yield yield_data
            i = 0
            arrays = self.get_batch_arrays(batch_size,
                                           generation=not in_callbacks)
            batch_indices = []
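    # Illustrative sketch of what the generators above yield (assuming
    # batch size B, timesteps T = max_seq_len or generation_timesteps, and
    # vocabulary size V; 'src' and 'img' are only present when source
    # features and/or image features are enabled):
    #   [{'text': (B, T, V) one-hot word sequences,
    #     'src':  (B, T, hsn_size) source-language vectors,
    #     'img':  (B, T, IMG_FEATS) image features,
    #     'indices': [[ident, desc_idx], ...]},
    #    {'output': (B, T, V) next-word targets}]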
    def get_batch_arrays(self, batch_size, generation=False):
        """
        Get empty arrays for yield_training_batch.

        Helper function for {random,fixed,generation}_generator()
        """
        t = self.args.generation_timesteps if generation else self.max_seq_len
        arrays = []
        # dscrp_array at arrays[0]
        arrays.append(np.zeros((batch_size, t, len(self.word2index))))
        if self.use_source:
            # hsn_array at arrays[1] (if used)
            arrays.append(np.zeros((batch_size, t, self.hsn_size)))
        if self.use_image:
            # img_array at arrays[2] or arrays[1]
            arrays.append(np.zeros((batch_size, t, IMG_FEATS)))
        return arrays

    def create_yield_dict(self, array, targets, indices):
        '''
        Returns a dictionary object of the array, the targets, and the
        image / description indices in the batch.

        Helper function for {random,fixed,generation}_generator().
        '''
        if self.use_source and self.use_image:
            return [{'text': array[0],
                     'src': array[1],
                     'img': array[2],
                     'indices': indices},
                    {'output': targets}]
        elif self.use_image:
            return [{'text': array[0],
                     'img': array[1],
                     'indices': indices},
                    {'output': targets}]
        elif self.use_source:
            return [{'text': array[0],
                     'src': array[1],
                     'indices': indices},
                    {'output': targets}]

    def resize_arrays(self, new_size, arrays):
        """
        Resize all the arrays to new_size along dimension 0.
        Sometimes we need to initialise a np.zeros() to an arbitrary size
        and then cut it down to our intended new_size.
        """
        logger.debug("Resizing batch_size in structures from %d -> %d",
                     arrays[0].shape[0], new_size)

        for i, array in enumerate(arrays):
            arrays[i] = np.resize(array, (new_size, array.shape[1],
                                          array.shape[2]))
        return arrays

    def format_sequence(self, sequence, generation=False, train=False,
                        in_callbacks=False):
        """
        Transforms a list of words (sequence) into an input matrix seq_array
        of shape (timesteps, vocab-onehot).

        generation == True will return an input matrix of length
        self.args.generation_timesteps. The first timestep will be set to
        <S> (BOS); everything else is left as zeros, which is equal to
        padding.
        """
        if generation:
            timesteps = self.max_seq_len if in_callbacks \
                else self.args.generation_timesteps
            seq_array = np.zeros((timesteps, len(self.word2index)))
            seq_array[0, self.word2index[BOS]] = 1  # BOS token at t=0
            return seq_array

        seq_array = np.zeros((self.max_seq_len, len(self.word2index)))
        w_indices = [self.word2index[w] for w in sequence
                     if w in self.word2index]

        if train and self.is_too_long(w_indices):
            # We don't process training sequences that are too long
            logger.debug("Skipping '%s' because it is too long"
                         % ' '.join([x for x in sequence]))
            raise AssertionError

        if len(w_indices) > self.actual_max_seq_len:
            self.actual_max_seq_len = len(w_indices)

        seq_array[0, self.word2index[BOS]] = 1  # BOS token at zero timestep
        time = 0
        for time, vocab in enumerate(w_indices):
            seq_array[time + 1, vocab] += 1

        # add the EOS token at the end of the sentence
        try:
            assert time + 1 == len(w_indices),\
                "time %d sequence %s len w_indices %d seq_array %s" % (
                    time, " ".join([x for x in sequence]), len(w_indices),
                    seq_array)
        except AssertionError:
            if len(w_indices) == 0 and time == 0:
                # None of the words in this description appeared in the
                # vocabulary. This is most likely caused by the --unk
                # threshold.
                #
                # We don't encode this sentence because [BOS, EOS] doesn't
                # make sense.
                logger.debug("Skipping '%s' because none of its words appear"
                             " in the vocabulary"
                             % ' '.join([x for x in sequence]))
                raise AssertionError
        seq_array[len(w_indices) + 1, self.word2index[EOS]] += 1
        return seq_array
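    # Illustrative example (hypothetical description "a dog runs"):
    # format_sequence() produces one-hot rows
    #   t=0: <S>, t=1: 'a', t=2: 'dog', t=3: 'runs', t=4: <E>, rest zeros,
    # and get_target_descriptions() below shifts the whole array left by one
    # timestep, so the target at time t is the input word at time t+1.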
    def get_target_descriptions(self, input_array):
        """
        The target is always the _next_ word, so we move input_array over by
        -1 timesteps (the target at t=1 is the input at t=2).

        Helper function used by {random,fixed,generation}_generator()
        """
        target_array = np.zeros(input_array.shape)
        target_array[:, :-1, :] = input_array[:, 1:, :]
        return target_array

    def get_refs_by_split_as_list(self, split):
        """
        Returns a list of lists of gold standard sentences. Useful for
        automatic evaluation (BLEU, Meteor, etc.)

        Helper function for callbacks.py and generate.py
        """
        # Not needed for train.
        assert split in ['test', 'val'], "Not possible for split %s" % split
        references = []
        for data_key in self.dataset[split]:
            this_image = []
            for descr in self.dataset[split][data_key]['descriptions']:
                this_image.append(descr)
            references.append(this_image)
        return references

    def get_source_features(self, split, data_key):
        '''
        Return the source feature vector from self.source_dataset.

        Relies on self.source_encoder, self.source_dim, self.source_type.

        The type of the returned vector depends on self.args.source_merge:
          'sum': adds all the vectors into the same vector
          'avg': does 'sum' and then divides by the number of vectors

        TODO: support a 'concat' mode for merging the source features
        '''
        mode = self.args.source_merge
        try:
            source = self.source_dataset[split][data_key][self.h5_dataset_str]
            if mode == 'sum' or mode == 'avg':
                return_feats = np.zeros(self.source_dim)
                for feats in source:
                    return_feats = np.add(return_feats, feats)
                if mode == 'avg':
                    return_feats = return_feats/len(source)
            #elif mode == 'concat':
            #    return_feats = np.zeros(self.source_dim*self.args.num_sents)
            #    marker = 0
            #    for feats in source:
            #        return_feats[marker:marker+len(feats)] = feats
            #        marker += len(feats)
            return return_feats
        except KeyError:
            # This image--description pair doesn't have a source-language
            # vector. Raise a KeyError so the requester can deal with the
            # missing data.
            logger.info("Skipping '%s' because it doesn't have a source"
                        " vector", data_key)
            raise KeyError

    def get_image_features(self, dataset, split, data_key):
        """ Return the image feature vector for split[data_key]. """
        return dataset[split][data_key]['img_feats'][:]

    def set_predicted_description(self, split, data_key, sentence):
        '''
        Set the predicted sentence tokens in the data_key group,
        creating the group if necessary, or erasing the current value if
        necessary.
        '''
        if self.openmode != "r+":
            # forcefully quit when trying to write to a read-only file
            raise RuntimeError("Dataset is read-only, "
                               "try again with --h5_writeable")

        dataset_key = 'predicted_description'

        try:
            predicted_text = self.dataset[split][data_key].create_dataset(
                dataset_key, (1,), dtype=h5py.special_dtype(vlen=unicode))
        except RuntimeError:
            # the dataset already exists, erase it and create an empty space
            del self.dataset[split][data_key][dataset_key]
            predicted_text = self.dataset[split][data_key].create_dataset(
                dataset_key, (1,), dtype=h5py.special_dtype(vlen=unicode))

        predicted_text[0] = " ".join([x for x in sentence])
    def set_source_features(self, split, data_key, dataset_key, feats, dims,
                            desc_idx=0):
        '''
        Set the source feature vector stored in the dataset_key group,
        creating the group if necessary, or erasing the current value if
        necessary.
        '''
        if self.openmode != "r+":
            # forcefully quit when trying to write to a read-only file
            raise RuntimeError("Dataset is read-only, "
                               "try again with --h5_writeable")

        try:
            source_data = self.dataset[split][data_key].create_dataset(
                dataset_key, ((self.args.num_sents, dims)), dtype='float32')
        except RuntimeError:
            # the dataset already exists so we just need to fill in the
            # relevant element, given the dataset key
            source_data = self.dataset[split][data_key][dataset_key]

        source_data[desc_idx] = feats

    def set_vocabulary(self, path):
        '''
        Initialise the vocabulary from a checkpointed model.

        TODO: some duplication from extract_vocabulary
        '''
        self.extract_complete_vocab()
        logger.info("Initialising vocabulary from pre-defined model")
        try:
            v = cPickle.load(open("%s/../vocabulary.pk" % path, "rb"))
        except:
            v = cPickle.load(open("%s/vocabulary.pk" % path, "rb"))
        self.index2word = dict((v, k) for k, v in v.iteritems())
        self.word2index = dict((k, v) for k, v in v.iteritems())

        longest_sentence = 0
        # set the length of the longest sentence
        train_longest = self.find_longest_sentence('train')
        val_longest = self.find_longest_sentence('val')
        self.longest_sentence = max(longest_sentence, train_longest,
                                    val_longest)

        self.calculate_split_sizes()
        self.corpus_statistics()
#        self.max_seq_len = longest_sentence + 2
#        logger.info("Max seq length %d, setting max_seq_len to %d",
#                    longest_sentence, self.max_seq_len)
#
#        logger.info("Split sizes %s", self.split_sizes)
#
#        logger.info("Number of words in vocabulary %d", len(self.word2index))
#        #logger.debug("word2index %s", self.word2index.items())
#        logger.debug("Number of indices %d", len(self.index2word))
#        #logger.debug("index2word: %s", self.index2word.items())

    def find_longest_sentence(self, split):
        '''
        Calculates the length of the longest sentence in a given split of
        a dataset.

        TODO: can we get split_sizes from H5 dataset indices directly?
        '''
        local_ds_type = "descriptions" if split == 'train' else self.ds_type
        longest_sentence = 0
        for dataset in self.datasets:
            for data_key in dataset[split]:
                for description in dataset[split][data_key][local_ds_type][0:self.args.num_sents]:
                    d = description.split()
                    if len(d) > longest_sentence:
                        longest_sentence = len(d)

        return longest_sentence
    def extract_vocabulary(self):
        '''
        Collect word frequency counts over the train / val inputs and use
        these to create a model vocabulary. Words that appear fewer than
        self.unk times will be ignored.

        Also finds the longest sentence, since it's already iterating over
        the whole dataset. HOWEVER this is the longest sentence *including*
        UNK words, which are removed from the data and shouldn't really be
        included in max_seq_len. But max_seq_len/longest_sentence is just
        supposed to be a safe upper bound, so we're good (except for some
        redundant cycles.)
        '''
        logger.info("Extracting vocabulary")
        self.extract_complete_vocab()

        longest_sentence = 0
        # set the length of the longest sentence
        train_longest = self.find_longest_sentence('train')
        val_longest = self.find_longest_sentence('val')
        self.longest_sentence = max(longest_sentence, train_longest,
                                    val_longest)

        # vocabulary is a word:id dict (superseded by/identical to word2index?)
        # <S>, <E> are special first indices
        vocabulary = {PAD: 0, BOS: 1, EOS: 2}
        for v in self.unk_dict:
            if self.unk_dict[v] > self.unk:
                vocabulary[v] = len(vocabulary)

        assert vocabulary[BOS] == 1
        assert vocabulary[EOS] == 2

        logger.info("Pickling dictionary to checkpoints/%s/vocabulary.pk",
                    self.run_string)
        try:
            os.mkdir("checkpoints/%s" % self.run_string)
        except OSError:
            pass
        cPickle.dump(vocabulary,
                     open("checkpoints/%s/vocabulary.pk"
                          % self.run_string, "wb"))

        self.index2word = dict((v, k) for k, v in vocabulary.iteritems())
        self.word2index = vocabulary

        self.calculate_split_sizes()
        self.corpus_statistics()
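    # Illustrative result (hypothetical words): after extract_vocabulary(),
    # self.word2index might look like
    #   {'<P>': 0, '<S>': 1, '<E>': 2, 'dog': 3, 'runs': 4, ...}
    # and the same dict is pickled to checkpoints/<run_string>/vocabulary.pk,
    # which set_vocabulary() can later reload.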
    def extract_complete_vocab(self):
        """
        Extract the complete vocabulary over the training data.

        Stores the result as a dictionary of word:count pairs in
        self.unk_dict
        """
        self.unk_dict = defaultdict(int)
        for dataset in self.datasets:
            for data_key in dataset['train']:
                for description in dataset['train'][data_key]['descriptions'][0:self.args.num_sents]:
                    for token in description.split():
                        self.unk_dict[token] += 1

    def calculate_split_sizes(self):
        '''
        Calculates the expected number of instances in a data split.
        Does not include sentences that cannot be encoded in the vocabulary.

        TODO: handle splits for which we don't yet have the test data.
        '''
        for split in ["train", "val", "test"]:
            for dataset in self.datasets:
                for data_key in dataset[split]:
                    for idx, description in enumerate(dataset[split][data_key]['descriptions'][0:self.args.num_sents]):
                        w_indices = [self.word2index[w]
                                     for w in description.split()
                                     if w in self.word2index]
                        if split == "train" and self.is_too_long(w_indices):
                            logger.debug("Skipping [%s][%s] ('%s') because"
                                         " it contains too many words",
                                         data_key, idx, description)
                            continue
                        if split == "train":
                            if len(w_indices) != 0:
                                self.split_sizes[split] += 1
                            else:
                                logger.debug("Skipping [%s][%s] ('%s')"
                                             " because none of its words"
                                             " appear in the vocabulary",
                                             data_key, idx, description)
                        else:
                            self.split_sizes[split] += 1

    def corpus_statistics(self):
        """
        Logs some possibly useful information about the dataset.
        """
        self.max_seq_len = self.longest_sentence + 2
        logger.info("Max seq length %d, setting max_seq_len to %d",
                    self.longest_sentence, self.max_seq_len)
        logger.info("Split sizes %s", self.split_sizes)
        logger.info("Number of words %d -> %d", len(self.unk_dict),
                    len(self.word2index))
        actual_len, true_len = self.discard_percentage()
        logger.info("Retained / Original Tokens: %d / %d (%.2f pc)",
                    actual_len, true_len, 100 * float(actual_len)/true_len)
        avg_len = self.avg_len()
        logger.info("Average train sentence length: %.2f tokens" % avg_len)

    def get_vocab_size(self):
        """
        Return the training data vocabulary size.
        """
        return len(self.word2index)

    def discard_percentage(self):
        '''
        One-off calculation of how many words are thrown out of the training
        sequences using the defined UNK threshold.
        '''
        true_len = 0
        actual_len = 0
        split = 'train'
        for data_key in self.dataset[split]:
            for description in self.dataset[split][data_key]['descriptions'][0:self.args.num_sents]:
                d = description.split()
                true_len += len(d)
                unk_d = [self.word2index[w] for w in d
                         if w in self.word2index]
                actual_len += len(unk_d)

        return (actual_len, true_len)

    def avg_len(self):
        '''
        One-off calculation of the average length of sentences in the
        training data before UNKing.
        '''
        true_len = 0
        num_sents = 0.0
        split = 'train'
        for data_key in self.dataset[split]:
            for description in self.dataset[split][data_key][self.ds_type][0:self.args.num_sents]:
                d = description.split()
                true_len += len(d)
                num_sents += 1

        return (true_len/num_sents)

    def is_too_long(self, sequence):
        """
        Determine if a sequence is too long to be included in the training
        data. Sentences that are too long (--maximum_length) are not
        processed in the training data. The validation and test data are
        always processed, regardless of --maximum_length.
        """
        if len(sequence) > self.args.maximum_length:
            return True
        else:
            return False
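
# A minimal usage sketch (not part of the module): assumes an argparse-style
# `args` namespace carrying the attributes read above (batch_size, num_sents,
# unk, run_string, h5_writeable, source_vectors, no_image,
# supertrain_datasets, mrnn, peeking_source, maximum_length,
# generation_timesteps, use_predicted_tokens, ...) and a compiled Keras-style
# `model` (hypothetical) that accepts the yielded input/output dictionaries.
# Note that random_generator() loops over the split indefinitely.
#
#   data_gen = VisualWordDataGenerator(args, input_dataset="flickr8k")
#   data_gen.extract_vocabulary()
#   for inputs, outputs in data_gen.random_generator('train'):
#       model.train_on_batch(inputs, outputs)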