import os
import numpy as np
from utils.preprocessing import *
from pickle import load, dump
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import random

'''
*We have Flickr_8k.trainImages.txt and Flickr_8k.devImages.txt files, which consist of
 unique image identifiers (ids) that can be used to filter the images and their descriptions.
*Load a pre-defined list of image identifiers (ids).
*Glimpse of file:
    2513260012_03d33305cf.jpg
    2903617548_d3e38d7f88.jpg
    3338291921_fe7ae0c8f8.jpg
    488416045_1c6d903fe0.jpg
    2644326817_8f45080b87.jpg
'''
def load_set(filename):
    file = open(filename, 'r')
    doc = file.read()
    file.close()
    ids = list()
    # Process line by line
    for line in doc.split('\n'):
        # Skip empty lines
        if len(line) < 1:
            continue
        # Get the image identifier (id) by stripping the file extension
        _id = line.split('.')[0]
        ids.append(_id)
    return set(ids)

'''
*The model we'll develop generates a caption for a given image one word at a time.
*The sequence of previously generated words is provided as input, so we need a 'first word'
 to kick off the generation process and a 'last word' to signal the end of the caption.
*We use the strings 'startseq' and 'endseq' for this purpose. These tokens are added to
 the captions as they are loaded.
*It is important to do this now, before we encode the text, so that the tokens are also
 encoded correctly.
*Load captions into memory.
*Glimpse of file:
    1000268201_693b08cb0e child in pink dress is climbing up set of stairs in an entry way
    1000268201_693b08cb0e girl going into wooden building
    1000268201_693b08cb0e little girl climbing into wooden playhouse
    1000268201_693b08cb0e little girl climbing the stairs to her playhouse
    1000268201_693b08cb0e little girl in pink dress going into wooden cabin
'''
def load_cleaned_captions(filename, ids):
    file = open(filename, 'r')
    doc = file.read()
    file.close()
    captions = dict()
    _count = 0
    # Process line by line
    for line in doc.split('\n'):
        # Split line on white space
        tokens = line.split()
        # Skip empty or malformed lines
        if len(tokens) < 2:
            continue
        # Split id from caption
        image_id, image_caption = tokens[0], tokens[1:]
        # Skip images not in the ids set
        if image_id in ids:
            # Create list
            if image_id not in captions:
                captions[image_id] = list()
            # Wrap caption in start & end tokens
            caption = 'startseq ' + ' '.join(image_caption) + ' endseq'
            # Store
            captions[image_id].append(caption)
            _count = _count + 1
    return captions, _count

# Load image features
def load_image_features(filename, ids):
    # Load all features
    all_features = load(open(filename, 'rb'))
    # Keep only the features for the given ids
    features = {_id: all_features[_id] for _id in ids}
    return features

# Flatten the captions dictionary into a list of caption strings
def to_lines(captions):
    all_captions = list()
    for image_id in captions.keys():
        for caption in captions[image_id]:
            all_captions.append(caption)
    return all_captions

'''
*The captions need to be encoded to numbers before they can be presented to the model.
*The first step in encoding the captions is to create a consistent mapping from words to
 unique integer values. Keras provides the Tokenizer class that can learn this mapping
 from the loaded captions.
*Fit a tokenizer on the given captions.
'''
def create_tokenizer(captions):
    lines = to_lines(captions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# Calculate the length of the caption with the most words
def calc_max_length(captions):
    lines = to_lines(captions)
    return max(len(line.split()) for line in lines)
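
# A minimal sketch (not called anywhere in this module) of what create_tokenizer()
# produces, using a hypothetical toy captions dict; the image ids and captions below
# are made up for illustration only.
def _tokenizer_demo():
    captions = {'img1': ['startseq little girl running in field endseq'],
                'img2': ['startseq black dog chasing ball endseq']}
    tokenizer = create_tokenizer(captions)
    # word_index maps each word to a unique integer, ordered by frequency,
    # e.g. {'startseq': 1, 'endseq': 2, 'little': 3, ...}
    print(tokenizer.word_index)
    # Keras reserves index 0 for padding, hence the +1 when sizing the one-hot
    # output vectors in create_sequences() below
    print('vocab size:', len(tokenizer.word_index) + 1)
    # Each caption is encoded as one list of integers
    print(tokenizer.texts_to_sequences(['startseq little girl endseq']))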
'''
*Each caption will be split into words. The model is provided one word and the image,
 and it generates the next word.
*Then the first two words of the caption are provided to the model as input, along with
 the image, to generate the next word.
*This is how the model will be trained.
*For example, the input sequence "little girl running in field" would be split into
 6 input-output pairs to train the model:

    X1      X2 (text sequence)                              y (word)
    -----------------------------------------------------------------
    image   startseq,                                       little
    image   startseq, little,                               girl
    image   startseq, little, girl,                         running
    image   startseq, little, girl, running,                in
    image   startseq, little, girl, running, in,            field
    image   startseq, little, girl, running, in, field,     endseq
'''
# Create sequences of image features, input sequences and output words for an image
def create_sequences(tokenizer, max_length, captions_list, image):
    # X1 : input for image features
    # X2 : input for text features
    # y  : output word
    X1, X2, y = list(), list(), list()
    vocab_size = len(tokenizer.word_index) + 1
    # Walk through each caption for the image
    for caption in captions_list:
        # Encode the sequence
        seq = tokenizer.texts_to_sequences([caption])[0]
        # Split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # Split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # Pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # One-hot encode output word
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # Store
            X1.append(image)
            X2.append(in_seq)
            y.append(out_seq)
    return X1, X2, y

# Data generator, intended to be used in a call to model.fit_generator()
def data_generator(images, captions, tokenizer, max_length, batch_size, random_seed):
    # Set random seed for reproducibility of results
    random.seed(random_seed)
    # Image ids
    image_ids = list(captions.keys())
    _count = 0
    assert batch_size <= len(image_ids), 'Batch size must be less than or equal to {}'.format(len(image_ids))
    while True:
        if _count >= len(image_ids):
            # Generator exceeded or reached the end, so restart it
            _count = 0
        # Batch lists to store data
        input_img_batch, input_sequence_batch, output_word_batch = list(), list(), list()
        for i in range(_count, min(len(image_ids), _count + batch_size)):
            # Retrieve the image id
            image_id = image_ids[i]
            # Retrieve the image features
            image = images[image_id][0]
            # Retrieve the captions list
            captions_list = captions[image_id]
            # Shuffle captions list
            random.shuffle(captions_list)
            input_img, input_sequence, output_word = create_sequences(tokenizer, max_length, captions_list, image)
            # Add to batch
            for j in range(len(input_img)):
                input_img_batch.append(input_img[j])
                input_sequence_batch.append(input_sequence[j])
                output_word_batch.append(output_word[j])
        _count = _count + batch_size
        yield [[np.array(input_img_batch), np.array(input_sequence_batch)], np.array(output_word_batch)]
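
# A minimal sketch (not called anywhere in this module) of how create_sequences()
# expands a single caption into (image, partial-sequence, next-word) pairs; the
# caption is made up and the 2048-dim feature vector is only an assumption about
# the size of the extracted image features.
def _sequences_demo():
    captions = {'img1': ['startseq little girl running in field endseq']}
    tokenizer = create_tokenizer(captions)
    max_length = calc_max_length(captions)   # 7 words for this caption
    image = np.zeros(2048)                   # stand-in for real image features
    X1, X2, y = create_sequences(tokenizer, max_length, captions['img1'], image)
    # A 7-word caption yields 6 pairs: each X2[i] is the zero-padded prefix of
    # the caption and y[i] is the one-hot encoding of the word that follows it
    print(len(X1), len(X2), len(y))          # -> 6 6 6
    print(X2[0])                             # e.g. [0 0 0 0 0 0 1] ('startseq')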
def loadTrainData(config):
    train_image_ids = load_set(config['train_data_path'])
    # Check if we already have preprocessed data saved and if not, preprocess the data.
    # Creates and saves 'captions.txt' & the features pickle
    preprocessData(config)
    # Load captions
    train_captions, _count = load_cleaned_captions(config['model_data_path'] + 'captions.txt', train_image_ids)
    # Load image features
    train_image_features = load_image_features(config['model_data_path'] + 'features_' + str(config['model_type']) + '.pkl', train_image_ids)
    print('{}: Available images for training: {}'.format(mytime(), len(train_image_features)))
    print('{}: Available captions for training: {}'.format(mytime(), _count))
    if not os.path.exists(config['model_data_path'] + 'tokenizer.pkl'):
        # Prepare tokenizer
        tokenizer = create_tokenizer(train_captions)
        # Save the tokenizer
        dump(tokenizer, open(config['model_data_path'] + 'tokenizer.pkl', 'wb'))
    # Determine the maximum sequence length
    max_length = calc_max_length(train_captions)
    return train_image_features, train_captions, max_length

def loadValData(config):
    val_image_ids = load_set(config['val_data_path'])
    # Load captions
    val_captions, _count = load_cleaned_captions(config['model_data_path'] + 'captions.txt', val_image_ids)
    # Load image features
    val_features = load_image_features(config['model_data_path'] + 'features_' + str(config['model_type']) + '.pkl', val_image_ids)
    print('{}: Available images for validation: {}'.format(mytime(), len(val_features)))
    print('{}: Available captions for validation: {}'.format(mytime(), _count))
    return val_features, val_captions
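
# A hedged sketch (not called anywhere in this module) of how these loaders and
# data_generator() are meant to fit together in a training script; `config`,
# `define_model` and the epoch/batch settings are hypothetical placeholders,
# not part of this module.
def _training_loop_demo(config, define_model):
    train_features, train_captions, max_length = loadTrainData(config)
    # loadTrainData() guarantees the tokenizer pickle exists at this point
    tokenizer = load(open(config['model_data_path'] + 'tokenizer.pkl', 'rb'))
    vocab_size = len(tokenizer.word_index) + 1
    model = define_model(vocab_size, max_length)   # assumed to build the captioning model
    batch_size = 32
    generator = data_generator(train_features, train_captions, tokenizer,
                               max_length, batch_size, random_seed=42)
    # One epoch consumes every image id once, batch_size ids per step
    steps = len(train_captions) // batch_size
    model.fit_generator(generator, steps_per_epoch=steps, epochs=20)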