from __future__ import division from collections import OrderedDict from functools import partial import gzip import io import os import logging import os.path import h5py import numpy from picklable_itertools.extras import equizip from PIL import Image from scipy.io.matlab import loadmat from six.moves import zip, xrange import zmq from fuel.converters.base import check_exists, progress_bar from fuel.datasets import H5PYDataset from fuel.utils.formats import tar_open from fuel.utils.parallel import producer_consumer from fuel import config log = logging.getLogger(__name__) DEVKIT_ARCHIVE = 'ILSVRC2010_devkit-1.0.tar.gz' DEVKIT_META_PATH = 'devkit-1.0/data/meta.mat' DEVKIT_VALID_GROUNDTRUTH_PATH = ('devkit-1.0/data/' 'ILSVRC2010_validation_ground_truth.txt') PATCH_IMAGES_TAR = 'patch_images.tar' TEST_GROUNDTRUTH = 'ILSVRC2010_test_ground_truth.txt' TRAIN_IMAGES_TAR = 'ILSVRC2010_images_train.tar' VALID_IMAGES_TAR = 'ILSVRC2010_images_val.tar' TEST_IMAGES_TAR = 'ILSVRC2010_images_test.tar' IMAGE_TARS = (TRAIN_IMAGES_TAR, VALID_IMAGES_TAR, TEST_IMAGES_TAR, PATCH_IMAGES_TAR) PUBLIC_FILES = TEST_GROUNDTRUTH, DEVKIT_ARCHIVE ALL_FILES = PUBLIC_FILES + IMAGE_TARS @check_exists(required_files=ALL_FILES) def convert_ilsvrc2010(directory, output_directory, output_filename='ilsvrc2010.hdf5', shuffle_seed=config.default_seed): """Converter for data from the ILSVRC 2010 competition. Source files for this dataset can be obtained by registering at [ILSVRC2010WEB]. Parameters ---------- input_directory : str Path from which to read raw data files. output_directory : str Path to which to save the HDF5 file. output_filename : str, optional The output filename for the HDF5 file. Default: 'ilsvrc2010.hdf5'. shuffle_seed : int or sequence, optional Seed for a random number generator used to shuffle the order of the training set on disk, so that sequential reads will not be ordered by class. .. [ILSVRC2010WEB] http://image-net.org/challenges/LSVRC/2010/index """ devkit_path = os.path.join(directory, DEVKIT_ARCHIVE) test_groundtruth_path = os.path.join(directory, TEST_GROUNDTRUTH) train, valid, test, patch = [os.path.join(directory, fn) for fn in IMAGE_TARS] n_train, valid_groundtruth, test_groundtruth, wnid_map = \ prepare_metadata(devkit_path, test_groundtruth_path) n_valid, n_test = len(valid_groundtruth), len(test_groundtruth) output_path = os.path.join(output_directory, output_filename) with h5py.File(output_path, 'w') as f: log.info('Creating HDF5 datasets...') prepare_hdf5_file(f, n_train, n_valid, n_test) log.info('Processing training set...') process_train_set(f, train, patch, n_train, wnid_map, shuffle_seed) log.info('Processing validation set...') process_other_set(f, 'valid', valid, patch, valid_groundtruth, n_train) log.info('Processing test set...') process_other_set(f, 'test', test, patch, test_groundtruth, n_train + n_valid) log.info('Done.') return (output_path,) def fill_subparser(subparser): """Sets up a subparser to convert the ILSVRC2010 dataset files. Parameters ---------- subparser : :class:`argparse.ArgumentParser` Subparser handling the `ilsvrc2010` command. """ subparser.add_argument( "--shuffle-seed", help="Seed to use for randomizing order of the " "training set on disk.", default=config.default_seed, type=int, required=False) return convert_ilsvrc2010 def prepare_metadata(devkit_archive, test_groundtruth_path): """Extract dataset metadata required for HDF5 file setup. Parameters ---------- devkit_archive : str or file-like object The filename or file-handle for the gzipped TAR archive containing the ILSVRC2010 development kit. test_groundtruth_path : str or file-like object The filename or file-handle for the text file containing the ILSVRC2010 test set ground truth. Returns ------- n_train : int The number of examples in the training set. valid_groundtruth : ndarray, 1-dimensional An ndarray containing the validation set groundtruth in terms of 0-based class indices. test_groundtruth : ndarray, 1-dimensional An ndarray containing the test groundtruth in terms of 0-based class indices. wnid_map : dict A dictionary that maps WordNet IDs to 0-based class indices. """ # Read what's necessary from the development kit. synsets, cost_matrix, raw_valid_groundtruth = read_devkit(devkit_archive) # Mapping to take WordNet IDs to our internal 0-999 encoding. wnid_map = dict(zip((s.decode('utf8') for s in synsets['WNID']), xrange(1000))) # Map the 'ILSVRC2010 ID' to our zero-based ID. ilsvrc_id_to_zero_based = dict(zip(synsets['ILSVRC2010_ID'], xrange(len(synsets)))) # Map the validation set groundtruth to 0-999 labels. valid_groundtruth = [ilsvrc_id_to_zero_based[id_] for id_ in raw_valid_groundtruth] # Raw test data groundtruth, ILSVRC2010 IDs. raw_test_groundtruth = numpy.loadtxt(test_groundtruth_path, dtype=numpy.int16) # Map the test set groundtruth to 0-999 labels. test_groundtruth = [ilsvrc_id_to_zero_based[id_] for id_ in raw_test_groundtruth] # Ascertain the number of filenames to prepare appropriate sized # arrays. n_train = int(synsets['num_train_images'].sum()) log.info('Training set: {} images'.format(n_train)) log.info('Validation set: {} images'.format(len(valid_groundtruth))) log.info('Test set: {} images'.format(len(test_groundtruth))) n_total = n_train + len(valid_groundtruth) + len(test_groundtruth) log.info('Total (train/valid/test): {} images'.format(n_total)) return n_train, valid_groundtruth, test_groundtruth, wnid_map def create_splits(n_train, n_valid, n_test): n_total = n_train + n_valid + n_test tuples = {} tuples['train'] = (0, n_train) tuples['valid'] = (n_train, n_train + n_valid) tuples['test'] = (n_train + n_valid, n_total) sources = ['encoded_images', 'targets', 'filenames'] return OrderedDict( (split, OrderedDict((source, tuples[split]) for source in sources)) for split in ('train', 'valid', 'test') ) def prepare_hdf5_file(hdf5_file, n_train, n_valid, n_test): """Create datasets within a given HDF5 file. Parameters ---------- hdf5_file : :class:`h5py.File` instance HDF5 file handle to which to write. n_train : int The number of training set examples. n_valid : int The number of validation set examples. n_test : int The number of test set examples. """ n_total = n_train + n_valid + n_test splits = create_splits(n_train, n_valid, n_test) hdf5_file.attrs['split'] = H5PYDataset.create_split_array(splits) vlen_dtype = h5py.special_dtype(vlen=numpy.dtype('uint8')) hdf5_file.create_dataset('encoded_images', shape=(n_total,), dtype=vlen_dtype) hdf5_file.create_dataset('targets', shape=(n_total, 1), dtype=numpy.int16) hdf5_file.create_dataset('filenames', shape=(n_total, 1), dtype='S32') def process_train_set(hdf5_file, train_archive, patch_archive, n_train, wnid_map, shuffle_seed=None): """Process the ILSVRC2010 training set. Parameters ---------- hdf5_file : :class:`h5py.File` instance HDF5 file handle to which to write. Assumes `features`, `targets` and `filenames` already exist and have first dimension larger than `n_train`. train_archive : str or file-like object Filename or file handle for the TAR archive of training images. patch_archive : str or file-like object Filename or file handle for the TAR archive of patch images. n_train : int The number of items in the training set. wnid_map : dict A dictionary mapping WordNet IDs to class indices. shuffle_seed : int or sequence, optional Seed for a NumPy random number generator that permutes the training set on disk. If `None`, no permutation is performed (this is the default). """ producer = partial(train_set_producer, train_archive=train_archive, patch_archive=patch_archive, wnid_map=wnid_map) consumer = partial(image_consumer, hdf5_file=hdf5_file, num_expected=n_train, shuffle_seed=shuffle_seed) producer_consumer(producer, consumer) def _write_to_hdf5(hdf5_file, index, image_filename, image_data, class_index): hdf5_file['filenames'][index] = image_filename.encode('ascii') hdf5_file['encoded_images'][index] = image_data if class_index is not None: hdf5_file['targets'][index] = class_index def train_set_producer(socket, train_archive, patch_archive, wnid_map): """Load/send images from the training set TAR file or patch images. Parameters ---------- socket : :class:`zmq.Socket` PUSH socket on which to send loaded images. train_archive : str or file-like object Filename or file handle for the TAR archive of training images. patch_archive : str or file-like object Filename or file handle for the TAR archive of patch images. wnid_map : dict A dictionary that maps WordNet IDs to 0-based class indices. Used to decode the filenames of the inner TAR files. """ patch_images = extract_patch_images(patch_archive, 'train') num_patched = 0 with tar_open(train_archive) as tar: for inner_tar_info in tar: with tar_open(tar.extractfile(inner_tar_info.name)) as inner: wnid = inner_tar_info.name.split('.')[0] class_index = wnid_map[wnid] filenames = sorted(info.name for info in inner if info.isfile()) images_gen = (load_from_tar_or_patch(inner, filename, patch_images) for filename in filenames) pathless_filenames = (os.path.split(fn)[-1] for fn in filenames) stream = equizip(pathless_filenames, images_gen) for image_fn, (image_data, patched) in stream: if patched: num_patched += 1 socket.send_pyobj((image_fn, class_index), zmq.SNDMORE) socket.send(image_data) if num_patched != len(patch_images): raise ValueError('not all patch images were used') def image_consumer(socket, hdf5_file, num_expected, shuffle_seed=None, offset=0): """Fill an HDF5 file with incoming images from a socket. Parameters ---------- socket : :class:`zmq.Socket` PULL socket on which to receive images. hdf5_file : :class:`h5py.File` instance HDF5 file handle to which to write. Assumes `features`, `targets` and `filenames` already exist and have first dimension larger than `sum(images_per_class)`. num_expected : int The number of items we expect to be sent over the socket. shuffle_seed : int or sequence, optional Seed for a NumPy random number generator that permutes the images on disk. offset : int, optional The offset in the HDF5 datasets at which to start writing received examples. Defaults to 0. """ with progress_bar('images', maxval=num_expected) as pb: if shuffle_seed is None: index_gen = iter(xrange(num_expected)) else: rng = numpy.random.RandomState(shuffle_seed) index_gen = iter(rng.permutation(num_expected)) for i, num in enumerate(index_gen): image_filename, class_index = socket.recv_pyobj(zmq.SNDMORE) image_data = numpy.fromstring(socket.recv(), dtype='uint8') _write_to_hdf5(hdf5_file, num + offset, image_filename, image_data, class_index) pb.update(i + 1) def process_other_set(hdf5_file, which_set, image_archive, patch_archive, groundtruth, offset): """Process the validation or test set. Parameters ---------- hdf5_file : :class:`h5py.File` instance HDF5 file handle to which to write. Assumes `features`, `targets` and `filenames` already exist and have first dimension larger than `sum(images_per_class)`. which_set : str Which set of images is being processed. One of 'train', 'valid', 'test'. Used for extracting the appropriate images from the patch archive. image_archive : str or file-like object The filename or file-handle for the TAR archive containing images. patch_archive : str or file-like object Filename or file handle for the TAR archive of patch images. groundtruth : iterable Iterable container containing scalar 0-based class index for each image, sorted by filename. offset : int The offset in the HDF5 datasets at which to start writing. """ producer = partial(other_set_producer, image_archive=image_archive, patch_archive=patch_archive, groundtruth=groundtruth, which_set=which_set) consumer = partial(image_consumer, hdf5_file=hdf5_file, num_expected=len(groundtruth), offset=offset) producer_consumer(producer, consumer) def other_set_producer(socket, which_set, image_archive, patch_archive, groundtruth): """Push image files read from the valid/test set TAR to a socket. Parameters ---------- socket : :class:`zmq.Socket` PUSH socket on which to send images. which_set : str Which set of images is being processed. One of 'train', 'valid', 'test'. Used for extracting the appropriate images from the patch archive. image_archive : str or file-like object The filename or file-handle for the TAR archive containing images. patch_archive : str or file-like object Filename or file handle for the TAR archive of patch images. groundtruth : iterable Iterable container containing scalar 0-based class index for each image, sorted by filename. """ patch_images = extract_patch_images(patch_archive, which_set) num_patched = 0 with tar_open(image_archive) as tar: filenames = sorted(info.name for info in tar if info.isfile()) images = (load_from_tar_or_patch(tar, filename, patch_images) for filename in filenames) pathless_filenames = (os.path.split(fn)[-1] for fn in filenames) image_iterator = equizip(images, pathless_filenames, groundtruth) for (image_data, patched), filename, class_index in image_iterator: if patched: num_patched += 1 socket.send_pyobj((filename, class_index), zmq.SNDMORE) socket.send(image_data, copy=False) if num_patched != len(patch_images): raise Exception def load_from_tar_or_patch(tar, image_filename, patch_images): """Do everything necessary to process an image inside a TAR. Parameters ---------- tar : `TarFile` instance The tar from which to read `image_filename`. image_filename : str Fully-qualified path inside of `tar` from which to read an image file. patch_images : dict A dictionary containing filenames (without path) of replacements to be substituted in place of the version of the same file found in `tar`. Returns ------- image_data : bytes The JPEG bytes representing either the image from the TAR archive or its replacement from the patch dictionary. patched : bool True if the image was retrieved from the patch dictionary. False if it was retrieved from the TAR file. """ patched = True image_bytes = patch_images.get(os.path.basename(image_filename), None) if image_bytes is None: patched = False try: image_bytes = tar.extractfile(image_filename).read() numpy.array(Image.open(io.BytesIO(image_bytes))) except (IOError, OSError): with gzip.GzipFile(fileobj=tar.extractfile(image_filename)) as gz: image_bytes = gz.read() numpy.array(Image.open(io.BytesIO(image_bytes))) return image_bytes, patched def read_devkit(f): """Read relevant information from the development kit archive. Parameters ---------- f : str or file-like object The filename or file-handle for the gzipped TAR archive containing the ILSVRC2010 development kit. Returns ------- synsets : ndarray, 1-dimensional, compound dtype See :func:`read_metadata_mat_file` for details. cost_matrix : ndarray, 2-dimensional, uint8 See :func:`read_metadata_mat_file` for details. raw_valid_groundtruth : ndarray, 1-dimensional, int16 The labels for the ILSVRC2010 validation set, distributed with the development kit code. """ with tar_open(f) as tar: # Metadata table containing class hierarchy, textual descriptions, etc. meta_mat = tar.extractfile(DEVKIT_META_PATH) synsets, cost_matrix = read_metadata_mat_file(meta_mat) # Raw validation data groundtruth, ILSVRC2010 IDs. Confusingly # distributed inside the development kit archive. raw_valid_groundtruth = numpy.loadtxt(tar.extractfile( DEVKIT_VALID_GROUNDTRUTH_PATH), dtype=numpy.int16) return synsets, cost_matrix, raw_valid_groundtruth def read_metadata_mat_file(meta_mat): """Read ILSVRC2010 metadata from the distributed MAT file. Parameters ---------- meta_mat : str or file-like object The filename or file-handle for `meta.mat` from the ILSVRC2010 development kit. Returns ------- synsets : ndarray, 1-dimensional, compound dtype A table containing ILSVRC2010 metadata for the "synonym sets" or "synsets" that comprise the classes and superclasses, including the following fields: * `ILSVRC2010_ID`: the integer ID used in the original competition data. * `WNID`: A string identifier that uniquely identifies a synset in ImageNet and WordNet. * `wordnet_height`: The length of the longest path to a leaf node in the FULL ImageNet/WordNet hierarchy (leaf nodes in the FULL ImageNet/WordNet hierarchy have `wordnet_height` 0). * `gloss`: A string representation of an English textual description of the concept represented by this synset. * `num_children`: The number of children in the hierarchy for this synset. * `words`: A string representation, comma separated, of different synoym words or phrases for the concept represented by this synset. * `children`: A vector of `ILSVRC2010_ID`s of children of this synset, padded with -1. Note that these refer to `ILSVRC2010_ID`s from the original data and *not* the zero-based index in the table. * `num_train_images`: The number of training images for this synset. cost_matrix : ndarray, 2-dimensional, uint8 A 1000x1000 matrix containing the precomputed pairwise cost (based on distance in the hierarchy) for all low-level synsets (i.e. the thousand possible output classes with training data associated). """ mat = loadmat(meta_mat, squeeze_me=True) synsets = mat['synsets'] cost_matrix = mat['cost_matrix'] new_dtype = numpy.dtype([ ('ILSVRC2010_ID', numpy.int16), ('WNID', ('S', max(map(len, synsets['WNID'])))), ('wordnet_height', numpy.int8), ('gloss', ('S', max(map(len, synsets['gloss'])))), ('num_children', numpy.int8), ('words', ('S', max(map(len, synsets['words'])))), ('children', (numpy.int8, max(synsets['num_children']))), ('num_train_images', numpy.uint16) ]) new_synsets = numpy.empty(synsets.shape, dtype=new_dtype) for attr in ['ILSVRC2010_ID', 'WNID', 'wordnet_height', 'gloss', 'num_children', 'words', 'num_train_images']: new_synsets[attr] = synsets[attr] children = [numpy.atleast_1d(ch) for ch in synsets['children']] padded_children = [ numpy.concatenate((c, -numpy.ones(new_dtype['children'].shape[0] - len(c), dtype=numpy.int16))) for c in children ] new_synsets['children'] = padded_children return new_synsets, cost_matrix def extract_patch_images(f, which_set): """Extracts a dict of the "patch images" for ILSVRC2010. Parameters ---------- f : str or file-like object The filename or file-handle to the patch images TAR file. which_set : str Which set of images to extract. One of 'train', 'valid', 'test'. Returns ------- dict A dictionary contains a mapping of filenames (without path) to a bytes object containing the replacement image. Notes ----- Certain images in the distributed archives are blank, or display an "image not available" banner. A separate TAR file of "patch images" is distributed with the corrected versions of these. It is this archive that this function is intended to read. """ if which_set not in ('train', 'valid', 'test'): raise ValueError('which_set must be one of train, valid, or test') which_set = 'val' if which_set == 'valid' else which_set patch_images = {} with tar_open(f) as tar: for info_obj in tar: if not info_obj.name.endswith('.JPEG'): continue # Pretty sure that '/' is used for tarfile regardless of # os.path.sep, but I officially don't care about Windows. tokens = info_obj.name.split('/') file_which_set = tokens[-2] if file_which_set != which_set: continue filename = tokens[-1] patch_images[filename] = tar.extractfile(info_obj.name).read() return patch_images