from multiprocessing.pool import Pool
import io
import math
import json
from zipfile import ZipFile

import numpy as np
import tensorflow as tf
from tqdm import tqdm

from python.download.util.content_dir import ContentDir
from python.operator import make_sequence_example

random_seed = 450849059  # From random.org


def char_vocabulary():
    # Map each character to an integer id; the first two ids are reserved
    # for the <eos> and <unknown> special tokens.
    vocabulary_index = np.concatenate([
        np.array([
            '<eos>', '<unknown>', ' ',
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
            'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
            'y', 'z'
        ])
    ])
    return dict((v, i) for (i, v) in enumerate(vocabulary_index))


def make_source_target_alignment(words, char_map, max_length, verbose=False):
    # Pack words into character sequences of at most max_length codes. The
    # target is the source shifted one character ahead, so every position
    # predicts the next character.
    space_char_code = char_map[' ']

    source = []
    source_current = []
    target = []
    target_current = []
    length = []
    length_current = 0

    for word in tqdm(words, disable=not verbose):
        if length_current + len(word) + 1 > max_length:
            # concatenate current data and move it to storage
            source.append(np.concatenate(source_current))
            target.append(np.concatenate(target_current))
            length.append(length_current)

            # prepare for new source and target
            source_current = []
            target_current = []
            length_current = 0

        # add source and target, while maintaining the current total length
        source_current.append(
            np.array([space_char_code] + [char_map[char] for char in word],
                     dtype='int32')
        )
        target_current.append(
            np.array([char_map[char] for char in word] + [space_char_code],
                     dtype='int32')
        )
        length_current += 1 + len(word)

    # concatenate remaining data and move it to storage
    if length_current > 0:
        source.append(np.concatenate(source_current))
        target.append(np.concatenate(target_current))
        length.append(length_current)

    return (length, source, target)


def export_map(vocab_map):
    # Invert the vocabulary: position i of the returned array holds the
    # character whose id is i.
    items_sorted = sorted(vocab_map.items(), key=lambda item: item[1])
    values_sorted = map(lambda item: item[0], items_sorted)
    return np.asarray(list(values_sorted), dtype='str')


def build_dataset(text, max_length=200, verbose=False, **kwargs):
    if verbose:
        print('tokenizing file ...')
    words = np.array(text.strip().split(' '))

    if verbose:
        print('building vocabulary ...')
    char_map = char_vocabulary()

    if verbose:
        print('making dataset ...')
    length, source, target = make_source_target_alignment(
        words, char_map,
        max_length=max_length, verbose=verbose
    )

    return {
        'char_map': export_map(char_map),
        'length': length,
        'source': source,
        'target': target
    }


def split_dataset(dataset, train_ratio=0.9, valid_ratio=0.05, **kwargs):
    # Create indices permutation array
    observations = len(dataset['length'])
    shuffle_indices = np.random.RandomState(random_seed).permutation(observations)

    # Compute number of observations in each dataset
    train_size = math.floor(observations * train_ratio)
    valid_size = math.floor(observations * valid_ratio)

    # Make train split
    train_indices = shuffle_indices[:train_size]
    train = {
        'length': np.take(dataset['length'], train_indices, axis=0),
        'source': np.take(dataset['source'], train_indices, axis=0),
        'target': np.take(dataset['target'], train_indices, axis=0)
    }

    # Make validation split
    valid_indices = shuffle_indices[train_size:train_size + valid_size]
    valid = {
        'length': np.take(dataset['length'], valid_indices, axis=0),
        'source': np.take(dataset['source'], valid_indices, axis=0),
        'target': np.take(dataset['target'], valid_indices, axis=0)
    }

    # Make test split
    test_indices = shuffle_indices[train_size + valid_size:]
    test = {
        'length': np.take(dataset['length'], test_indices, axis=0),
        'source': np.take(dataset['source'], test_indices, axis=0),
        'target': np.take(dataset['target'], test_indices, axis=0)
    }

    return [train, valid, test]


# Serialize dataset in parallel (because it takes forever)
def tfrecord_serializer(item):
    length, source, target = item
    return make_sequence_example(length, source, target).SerializeToString()


def save_tfrecord(filename, dataset, verbose=False):
    observations = len(dataset['length'])

    serialized = []
    with Pool(processes=4) as pool:
        for serialized_string in tqdm(pool.imap(
            tfrecord_serializer,
            zip(dataset['length'], dataset['source'], dataset['target']),
            chunksize=10
        ), total=observations, disable=not verbose):
            serialized.append(serialized_string)

    # Save serialized dataset
    writer = tf.python_io.TFRecordWriter(
        filename,
        options=tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.ZLIB
        )
    )

    for serialized_string in tqdm(serialized, disable=not verbose):
        writer.write(serialized_string)

    writer.close()


def preprocess_generate(**kwargs):
    with ContentDir() as content:
        content.download('text8.zip', 'http://mattmahoney.net/dc/text8.zip')

        with ZipFile(content.filepath('text8.zip')) as zip_reader:
            with zip_reader.open('text8') as text8_file:
                text = io.TextIOWrapper(text8_file).read()

        dataset = build_dataset(text, **kwargs)
        train, valid, test = split_dataset(dataset, **kwargs)

        print('saving train data ...')
        save_tfrecord(content.filepath('generate.train.tfrecord'),
                      train, verbose=True)

        print('saving valid data ...')
        save_tfrecord(content.filepath('generate.valid.tfrecord'),
                      valid, verbose=True)

        print('saving test data ...')
        save_tfrecord(content.filepath('generate.test.tfrecord'),
                      test, verbose=True)

        print('saving maps ...')
        np.savez(content.filepath('generate.map.npz'),
                 char_map=dataset['char_map'])

        print('saving metadata ...')
        metadata = {
            'observations': {
                'train': len(train['length']),
                'valid': len(valid['length']),
                'test': len(test['length'])
            }
        }
        with open(content.filepath('generate.meta.json'), 'w') as fp:
            json.dump(metadata, fp)
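

# What follows is a sketch, not part of the pipeline itself. Reading the
# records back requires knowing the layout produced by make_sequence_example
# (imported from python.operator and not shown here); the feature keys below
# ('length', 'source', 'target') are assumptions made for illustration and
# may not match that helper exactly.
def _read_tfrecord_sketch(filename):
    def parse(serialized):
        context, sequence = tf.parse_single_sequence_example(
            serialized,
            context_features={
                # assumed: total length stored as a scalar context feature
                'length': tf.FixedLenFeature([], tf.int64)
            },
            sequence_features={
                # assumed: character codes stored as int64 sequence features
                'source': tf.FixedLenSequenceFeature([], tf.int64),
                'target': tf.FixedLenSequenceFeature([], tf.int64)
            }
        )
        return context['length'], sequence['source'], sequence['target']

    # save_tfrecord writes with ZLIB compression, so the reader must match
    return tf.data.TFRecordDataset(filename, compression_type='ZLIB').map(parse)


if __name__ == '__main__':
    # Example invocation: keyword arguments such as max_length are forwarded
    # to build_dataset and split_dataset via **kwargs.
    preprocess_generate(max_length=200)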