""" This module contains utility functions to process and load various datasets. Most of the datasets are public, but are not included in the package; MNIST dataset will be automatically downloaded. There are also some classes to represent datasets. `ExampleVisiting` is an helper class that implements the stochastic sampling of data and is optimized to work with `Reverse/ForwardHyperGradient` (has helper funcitons to create training and validation `feed_dict` suppliers). """ from collections import OrderedDict import numpy as np from functools import reduce import tensorflow as tf from tensorflow.examples.tutorials.mnist.input_data import read_data_sets import os from rfho.utils import as_list, np_normalize_data, merge_dicts import sys try: import pandas as pd except ImportError: pd = None print(sys.exc_info()) print('pandas not found. Some load function might not work') try: import scipy.io as scio from scipy import linalg import scipy.sparse as sc_sp import scipy as sp SPARSE_SCIPY_MATRICES = (sc_sp.csr.csr_matrix, sc_sp.coo.coo_matrix) except ImportError: scio, linalg, sp, sc_sp = None, None, None, None SPARSE_SCIPY_MATRICES = () print(sys.exc_info()) print('scipy not found. Some load function might not work') try: import sklearn.datasets as sk_dt from sklearn.utils import shuffle as sk_shuffle except ImportError: sk_dt, sk_shuffle = None, None print('sklearn not found. Some load function might not work') try: import intervaltree as it except ImportError: it = None print(sys.exc_info()) print('intervaltree not found. WindowedData will not work. (You can get intervaltree with pip!)') import _pickle as cpickle from_env = os.getenv('RFHO_DATA_FOLDER') if from_env: DATA_FOLDER = from_env # print('Congratulations, RFHO_DATA_FOLDER found!') else: print('Environment variable RFHO_DATA_FOLDER not found. Variables HELP_WIN and HELP_UBUNTU contain info.') DATA_FOLDER = os.getcwd() _COMMON_BEGIN = "You can set environment variable RFHO_DATA_FOLDER to" \ "specify root folder in which you store various datasets. \n" _COMMON_END = """\n You can also skip this step... \n In this case all load_* methods take a FOLDER path as first argument. \n Bye.""" HELP_UBUNTU = _COMMON_BEGIN + """ Bash command is: export RFHO_DATA_FOLDER='absolute/path/to/dataset/folder \n Remember! To add the global variable kinda permanently in your system you should add export command in bash.bashrc file located in etc folder. """ + _COMMON_END HELP_WIN = _COMMON_BEGIN + """ Cmd command is: Set RFHO_DATA_FOLDER absolute/path/to/dataset/folder for one session. \n To set it permanently use SetX instead of Set (and probably reboot system) """ + _COMMON_END print('Data folder is', DATA_FOLDER) # kind of private TIMIT_DIR = os.path.join(DATA_FOLDER, 'timit4python') XRMB_DIR = os.path.join(DATA_FOLDER, 'XRMB') IROS15_BASE_FOLDER = os.path.join(DATA_FOLDER, os.path.join('dls_collaboration', 'Learning')) # easy to find! IRIS_TRAINING = os.path.join(DATA_FOLDER, 'iris', "training.csv") IRIS_TEST = os.path.join(DATA_FOLDER, 'iris', "test.csv") MNIST_DIR = os.path.join(DATA_FOLDER, "mnist_data") CALTECH101_30_DIR = os.path.join(DATA_FOLDER, "caltech101-30") CALTECH101_DIR = os.path.join(DATA_FOLDER, "caltech") CENSUS_TRAIN = os.path.join(DATA_FOLDER, 'census', "train.csv") CENSUS_TEST = os.path.join(DATA_FOLDER, 'census', "test.csv") CIFAR10_DIR = os.path.join(DATA_FOLDER, "CIFAR-10") CIFAR100_DIR = os.path.join(DATA_FOLDER, "CIFAR-100") REALSIM = os.path.join(DATA_FOLDER, "realsim") # scikit learn datasets SCIKIT_LEARN_DATA = os.path.join(DATA_FOLDER, 'scikit_learn_data') class Datasets: """ Simple object for standard datasets. Has the field `train` `validation` and `test` and support indexing """ def __init__(self, train=None, validation=None, test=None): self.train = train self.validation = validation self.test = test self._lst = [train, validation, test] def setting(self): return {k: v.setting() if hasattr(v, 'setting') else None for k, v in vars(self).items()} def __getitem__(self, item): return self._lst[item] def __len__(self): return len([_ for _ in self._lst if _ is not None]) @staticmethod def from_list(list_of_datasets): """ Generates a `Datasets` object from a list. :param list_of_datasets: list containing from one to three dataset :return: """ train, valid, test = None, None, None train = list_of_datasets[0] if len(list_of_datasets) > 3: print('There are more then 3 Datasets here...') return list_of_datasets if len(list_of_datasets) > 1: test = list_of_datasets[-1] if len(list_of_datasets) == 3: valid = list_of_datasets[1] return Datasets(train, valid, test) @staticmethod def stack(*datasets_s): """ Stack some datasets calling stack for each dataset. :param datasets_s: :return: a new dataset """ return Datasets.from_list([Dataset.stack(*[d[k] for d in datasets_s if d[k] is not None]) for k in range(3)]) def _maybe_cast_to_scalar(what): return what[0] if len(what) == 1 else what def convert_sparse_matrix_to_sparse_tensor(X): if isinstance(X, sc_sp.csr.csr_matrix): coo = X.tocoo() indices = np.mat([coo.row, coo.col]).transpose() else: coo, indices = X, [X.row, X.col] # data = np.array(coo.data, dtype=) return tf.SparseTensor(indices, tf.constant(coo.data, dtype=tf.float32), coo.shape) class Dataset: """ Class for managing a single dataset, includes data and target fields and has some utility functions. It allows also to convert the dataset into tensors and to store additional information both on a per-example basis and general infos. """ def __init__(self, data, target, sample_info=None, info=None): """ :param data: Numpy array containing data :param target: Numpy array containing targets :param sample_info: either an array of dicts or a single dict, in which case it is cast to array of dicts. :param info: (optional) dictionary with further info about the dataset """ self._tensor_mode = False self._data = data self._target = target if sample_info is None: sample_info = {} self.sample_info = np.array([sample_info] * self.num_examples) \ if isinstance(sample_info, dict) else sample_info assert self.num_examples == len(self.sample_info) assert self.num_examples == self._shape(self._target)[0] self.info = info or {} def _shape(self, what): return what.get_shape().as_list() if self._tensor_mode else what.shape def setting(self): """ for save setting purposes, does not save the actual data :return: """ return { 'num_examples': self.num_examples, 'dim_data': self.dim_data, 'dim_target': self.dim_target, 'info': self.info } @property def data(self): return self._data @property def target(self): return self._target @property def num_examples(self): """ :return: Number of examples in this dataset """ return self._shape(self.data)[0] @property def dim_data(self): """ :return: The data dimensionality as an integer, if input are vectors, or a tuple in the general case """ return _maybe_cast_to_scalar(self._shape(self.data)[1:]) @property def dim_target(self): """ :return: The target dimensionality as an integer, if targets are vectors, or a tuple in the general case """ shape = self._shape(self.target) return 1 if len(shape) == 1 else _maybe_cast_to_scalar(shape[1:]) def convert_to_tensor(self, keep_sparse=True): matrices = ['_data', '_target'] for att in matrices: if keep_sparse and isinstance(self.__getattribute__(att), SPARSE_SCIPY_MATRICES): self.__setattr__(att, convert_sparse_matrix_to_sparse_tensor(self.__getattribute__(att))) else: self.__setattr__(att, tf.convert_to_tensor(self.__getattribute__(att), dtype=tf.float32)) self._tensor_mode = True def create_supplier(self, x, y, other_feeds=None): """ Return a standard feed dictionary for this dataset. :param x: placeholder for data :param y: placeholder for target :param other_feeds: optional other feeds :return: a callable. """ if not other_feeds: other_feeds = {} # noinspection PyUnusedLocal def _supplier(step=None): """ :param step: unused, just for making it compatible with `HG` and `Saver` :return: the feed dictionary """ if isinstance(self.data, WindowedData): data = self.data.generate_all() return {**{x: self.data, y: self.target}, **other_feeds} return _supplier @staticmethod def stack(*datasets): """ Assuming that the datasets have same structure, stucks data and targets :param datasets: :return: stacked dataset """ return Dataset(data=vstack([d.data for d in datasets]), target=stack_or_concat([d.target for d in datasets]), sample_info=stack_or_concat([d.sample_info for d in datasets]), info={k: [d.info.get(k, None) for d in datasets] for k in merge_dicts(*[d.info for d in datasets])}) def to_one_hot_enc(seq, dimension=None): da_max = dimension or np.max(seq) + 1 def create_and_set(_p): _tmp = np.zeros(da_max) _tmp[_p] = 1 return _tmp return np.array([create_and_set(_v) for _v in seq]) def load_census(): COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status", "occupation", "relationship", "race", "gender", "capital_gain", "capital_loss", "hours_per_week", "native_country", "income_bracket"] df_train = pd.read_csv(CENSUS_TRAIN, names=COLUMNS, skipinitialspace=True) df_test = pd.read_csv(CENSUS_TEST, names=COLUMNS, skipinitialspace=True, skiprows=1) LABEL_COLUMN = "label" df_train[LABEL_COLUMN] = (df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) df_test[LABEL_COLUMN] = (df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) def load_iris(partitions_proportions=None, classes=3): """Loads Iris dataset divided as training and test set (by default)""" training_set = tf.contrib.learn.datasets.base.load_csv_with_header( filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32) test_set = tf.contrib.learn.datasets.base.load_csv_with_header( filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32) tr_set = training_set.data tr_targets = to_one_hot_enc(training_set.target) tr_dst = Dataset(data=tr_set, target=tr_targets) tst_set = test_set.data tst_targets = to_one_hot_enc(test_set.target) tst_dst = Dataset(data=tst_set, target=tst_targets) if partitions_proportions: if classes == 2: # noinspection PyUnusedLocal def filter_class(x, y, info, i): return np.argmax(y) != 0 # discard first class filter_list = [filter_class] # noinspection PyUnusedLocal def project_map(x, y, info, i): return x, y[1:], info else: filter_list, project_map = (None, None) res = redivide_data([tr_dst, tst_dst], partitions_proportions, filters=filter_list, maps=project_map) res += [None] * (3 - len(res)) return Datasets(train=res[0], validation=res[1], test=res[2]) return Datasets(train=tr_dst, test=tst_dst, validation=None) def stack_or_concat(list_of_arays): func = np.concatenate if list_of_arays[0].ndim == 1 else np.vstack return func(list_of_arays) def vstack(lst): """ Vstack that considers sparse matrices :param lst: :return: """ return sp.vstack(lst) if sp and isinstance(lst[0], sp.sparse.csr.csr_matrix) else np.vstack(lst) def redivide_data(datasets, partition_proportions=None, shuffle=False, filters=None, maps=None, balance_classes=False): """ Function that redivides datasets. Can be use also to shuffle or filter or map examples. :param datasets: original datasets, instances of class Dataset (works with get_data and get_targets for compatibility with mnist datasets :param partition_proportions: (optional, default None) list of fractions that can either sum up to 1 or less then one, in which case one additional partition is created with proportion 1 - sum(partition proportions). If None it will retain the same proportion of samples found in datasets :param shuffle: (optional, default False) if True shuffles the examples :param filters: (optional, default None) filter or list of filters: functions with signature (data, target, index) -> boolean (accept or reject the sample) :param maps: (optional, default None) map or list of maps: functions with signature (data, target, index) -> (new_data, new_target) (maps the old sample to a new one, possibly also to more than one sample, for data augmentation) :return: a list of datasets of length equal to the (possibly augmented) partition_proportion """ all_data = vstack([get_data(d) for d in datasets]) all_labels = stack_or_concat([get_targets(d) for d in datasets]) all_infos = np.concatenate([d.sample_info for d in datasets]) N = all_data.shape[0] if partition_proportions: # argument check partition_proportions = list([partition_proportions] if isinstance(partition_proportions, float) else partition_proportions) sum_proportions = sum(partition_proportions) assert sum_proportions <= 1, "partition proportions must sum up to at most one: %d" % sum_proportions if sum_proportions < 1.: partition_proportions += [1. - sum_proportions] else: partition_proportions = [1. * get_data(d).shape[0] / N for d in datasets] if shuffle: if sp and isinstance(all_data, sp.sparse.csr.csr_matrix): raise NotImplementedError() # if sk_shuffle: # TODO this does not work!!! find a way to shuffle these matrices while # keeping compatibility with tensorflow! # all_data, all_labels, all_infos = sk_shuffle(all_data, all_labels, all_infos) # else: permutation = np.arange(all_data.shape[0]) np.random.shuffle(permutation) all_data = all_data[permutation] all_labels = np.array(all_labels[permutation]) all_infos = np.array(all_infos[permutation]) if filters: if sp and isinstance(all_data, sp.sparse.csr.csr_matrix): raise NotImplementedError() filters = as_list(filters) data_triple = [(x, y, d) for x, y, d in zip(all_data, all_labels, all_infos)] for fiat in filters: data_triple = [xy for i, xy in enumerate(data_triple) if fiat(xy[0], xy[1], xy[2], i)] all_data = np.vstack([e[0] for e in data_triple]) all_labels = np.vstack([e[1] for e in data_triple]) all_infos = np.vstack([e[2] for e in data_triple]) if maps: if sp and isinstance(all_data, sp.sparse.csr.csr_matrix): raise NotImplementedError() maps = as_list(maps) data_triple = [(x, y, d) for x, y, d in zip(all_data, all_labels, all_infos)] for _map in maps: data_triple = [_map(xy[0], xy[1], xy[2], i) for i, xy in enumerate(data_triple)] all_data = np.vstack([e[0] for e in data_triple]) all_labels = np.vstack([e[1] for e in data_triple]) all_infos = np.vstack([e[2] for e in data_triple]) N = all_data.shape[0] assert N == all_labels.shape[0] calculated_partitions = reduce( lambda v1, v2: v1 + [sum(v1) + v2], [int(N * prp) for prp in partition_proportions], [0] ) calculated_partitions[-1] = N print('datasets.redivide_data:, computed partitions numbers -', calculated_partitions, 'len all', N, end=' ') new_general_info_dict = {} for data in datasets: new_general_info_dict = {**new_general_info_dict, **data.info} if balance_classes: new_datasets = [] forbidden_indices = np.empty(0, dtype=np.int64) for d1, d2 in zip(calculated_partitions[:-1], calculated_partitions[1:-1]): indices = np.array(get_indices_balanced_classes(d2 - d1, all_labels, forbidden_indices)) dataset = Dataset(data=all_data[indices], target=all_labels[indices], sample_info=all_infos[indices], info=new_general_info_dict) new_datasets.append(dataset) forbidden_indices = np.append(forbidden_indices, indices) test_if_balanced(dataset) remaining_indices = np.array(list(set(list(range(N))) - set(forbidden_indices))) new_datasets.append(Dataset(data=all_data[remaining_indices], target=all_labels[remaining_indices], sample_info=all_infos[remaining_indices], info=new_general_info_dict)) else: new_datasets = [ Dataset(data=all_data[d1:d2], target=all_labels[d1:d2], sample_info=all_infos[d1:d2], info=new_general_info_dict) for d1, d2 in zip(calculated_partitions, calculated_partitions[1:]) ] print('DONE') return new_datasets def get_indices_balanced_classes(n_examples, labels, forbidden_indices): N = len(labels) n_classes = len(labels[0]) indices = [] current_class = 0 for i in range(n_examples): index = np.random.random_integers(0, N - 1, 1)[0] while index in indices or index in forbidden_indices or np.argmax(labels[index]) != current_class: index = np.random.random_integers(0, N - 1, 1)[0] indices.append(index) current_class = (current_class + 1) % n_classes return indices def test_if_balanced(dataset): labels = dataset.target n_classes = len(labels[0]) class_counter = [0] * n_classes for l in labels: class_counter[np.argmax(l)] += 1 print('exemple by class: ', class_counter) def load_20newsgroup_vectorized(folder=SCIKIT_LEARN_DATA, one_hot=True, partitions_proportions=None, shuffle=False, binary_problem=False, as_tensor=True, minus_value=-1.): data_train = sk_dt.fetch_20newsgroups_vectorized(data_home=folder, subset='train') data_test = sk_dt.fetch_20newsgroups_vectorized(data_home=folder, subset='test') X_train = data_train.data X_test = data_test.data y_train = data_train.target y_test = data_test.target if binary_problem: y_train[data_train.target < 10] = minus_value y_train[data_train.target >= 10] = 1. y_test[data_test.target < 10] = minus_value y_test[data_test.target >= 10] = 1. if one_hot: y_train = to_one_hot_enc(y_train) y_test = to_one_hot_enc(y_test) # if shuffle and sk_shuffle: # xtr = X_train.tocoo() # xts = X_test.tocoo() d_train = Dataset(data=X_train, target=y_train, info={'target names': data_train.target_names}) d_test = Dataset(data=X_test, target=y_test, info={'target names': data_train.target_names}) res = [d_train, d_test] if partitions_proportions: res = redivide_data([d_train, d_test], partition_proportions=partitions_proportions, shuffle=False) if as_tensor: [dat.convert_to_tensor() for dat in res] return Datasets.from_list(res) def load_realsim(folder=REALSIM, one_hot=True, partitions_proportions=None, shuffle=False, as_tensor=True): X, y = sk_dt.load_svmlight_file(folder + "/real-sim") y = np.array([int(yy) for yy in y]) if one_hot: y = to_one_hot_enc(y) res = [Dataset(data=X, target=y)] if partitions_proportions: res = redivide_data(res, shuffle=shuffle, partition_proportions=partitions_proportions) res = Datasets.from_list(res) if as_tensor: [dat.convert_to_tensor() for dat in res] return res # noinspection PyPep8Naming def load_XRMB(folder=XRMB_DIR, half_window=2, max_speakers=100, only_independent=False, normalize_single_speaker=False): """ Loads XRMB data. :param max_speakers: :param folder: path for root directory. :param half_window: half window size for the data. :param only_independent: if False returns speaker datasets that do not keep track of the speaker. :param normalize_single_speaker: if True normalizes each dataset independently :return: A Datasets class containing speaker independent data for training, validation and test, or a list a triplet of lists of Dataset if speaker_dependent is True. """ prefix = folder + "/xrbm_spk_" set_types = ['train', 'val', 'test'] def load_speaker(speaker_number, set_type): assert set_type in set_types files = (prefix + str(speaker_number).zfill(3) + "_%s%s.csv" % (set_type, data_type) for data_type in ('audio', 'motor', 'sentences')) arrays = [pd.read_csv(fl, header=None).values for fl in files] return arrays[0], arrays[1], arrays[2] - 1 # sentence bounds are with MATLAB convetions def load_all_in(_range=range(1)): datasets = {n: [] for n in set_types} m, mo, sd, sto = None, None, None, None k = 0 for set_type in set_types: for k in _range: try: general_info_dict = {'speaker': k, 'original set': set_type} data, targets, sentence_bounds = load_speaker(k, set_type) if normalize_single_speaker and k != 0: # with k = 0 use mean and sd from training set data, m_sd, sd_sd = np_normalize_data(data, return_mean_and_sd=True) targets, mo_sd, sto_sd = np_normalize_data(targets, return_mean_and_sd=True) general_info_dict['normalizing stats'] = (m_sd, sd_sd, mo_sd, sto_sd) else: data, m, sd = np_normalize_data(data, m, sd, return_mean_and_sd=True) targets, mo, sto = np_normalize_data(targets, mo, sto, return_mean_and_sd=True) general_info_dict['normalizing stats'] = (m, sd, mo, sto) data = WindowedData(data, sentence_bounds, window=half_window, process_all=True) datasets[set_type].append(Dataset(data, targets, sample_info={'speaker': k} if k != 0 else None, info=general_info_dict)) except OSError or FileNotFoundError: k -= 1 break print('loaded %d speakers for %s' % (k, set_type)) return datasets if not only_independent: res = load_all_in(range(0, max_speakers)) for _set_type in set_types: # sample-wise speaker info to the general datasets res[_set_type][0].sample_info_dicts = np.concatenate([ np.array([{'speaker': k + 1}] * ds.num_examples) for k, ds in enumerate(res[_set_type][1:]) ]) return Datasets(train=res['train'], validation=res['val'], test=res['test']) else: res = load_all_in() return Datasets(train=res['train'][0], validation=res['val'][0], test=res['test'][0]) def load_timit_for_joint_training(folder, small=False, one_hot=True, only_gender=False): """ :param folder: source folder... :param small: if `True` loads a smaller version of the dataset :param one_hot: whether to use one hot encoding for output :return: A list of `Datasets` where the first one is for the speaker dependent net and the subsequent are for group dependent nets. The first dataset should include validation and test data, while for the others (at the moment) is not needed """ # # example # X, Y = np.array(), np.array() # group_id = 0 # gender = 'M' # train = Dataset(X, Y, general_info_dict={'group': group_id, 'gender': gender}) # datasets = Datasets(train=train) if small: set_names = ['train_small', 'validation_small', 'coretest_small'] else: set_names = ['train', 'validation', 'coretest'] Xall = {} Yall = {} datasets = [None] for gender in ['F', 'M']: _temp_gender = [] for dr in range(1, 9): sets = [] for s in set_names: # Loading data fname = '{}_DR{}_{}.npy'.format(s, dr, gender) data = np.load(os.path.join(folder, fname)) # Creating dataset X = data[:, :-1] Y = data[:, -1] if one_hot: Y = to_one_hot_enc(np.array(Y, dtype=np.int32), dimension=183) info = {'group': dr, 'gender': gender} sets.append(Dataset(X, Y, info=info)) # Stacking data for full dataset Xall[s] = np.vstack((Xall[s], X)) if s in Xall else X if one_hot: Yall[s] = np.vstack((Yall[s], Y)) if s in Yall else Y else: Yall[s] = np.hstack((Yall[s], Y)) if s in Yall else Y ds = Datasets(train=sets[0], validation=sets[1], test=sets[2]) if not only_gender: datasets.append(ds) else: _temp_gender.append(ds) if only_gender: datasets.append(Datasets.stack(*_temp_gender)) # Building full dataset # sets = [] # for s in set_names: # sets.append(Dataset(Xall[s], Yall[s])) # ds = Datasets(train=sets[0], validation=sets[1], test=sets[2]) # datasets[0] = ds datasets[0] = Datasets.stack(*datasets[1:]) return datasets # noinspection PyUnusedLocal def load_timit(folder=TIMIT_DIR, only_primary=False, filters=None, maps=None, small=False, context=None, fake=False, process_all=False): def load_timit_sentence_bound(): def sentence_bound_reader(name): bnd = pd.read_csv(folder + '/timit_%sSentenceBound.csv' % name, header=None).values return bnd - 1 return [sentence_bound_reader(n) for n in ['train', 'val', 'test']] folder = folder or TIMIT_DIR if isinstance(process_all, bool): process_all = [process_all] * 3 if fake: def generate_dataset(secondary=False): target = np.random.randn(2000, 183) if secondary: target = np.hstack([target, np.random.randn(2000, 300)]) return np.random.randn(2000, 123), target training_data, training_target = generate_dataset(not only_primary) validation_data, validation_target = generate_dataset() test_data, test_target = generate_dataset() training_info_dict = None else: split_number = '00' if small else '' training_target = pd.read_csv(folder + '/timit_trainTargets%s.csv' % split_number, header=None).values training_data = pd.read_csv(folder + '/timit-preproc_traindata_norm_noctx%s.csv' % split_number, header=None).values training_info_dict = {'dim_primary_target': training_target.shape[1]} print('loaded primary training data') if not only_primary: training_secondary_target = pd.read_csv(folder + '/timit_trainTargetsPE%s.csv' % split_number, header=None).values training_target = np.hstack([training_target, training_secondary_target]) training_info_dict['dim_secondary_target'] = training_secondary_target.shape[1] print('loaded secondary task targets') validation_data = pd.read_csv(folder + '/timit-preproc_valdata_norm_noctx%s.csv' % split_number, header=None).values validation_target = pd.read_csv(folder + '/timit_valTargets%s.csv' % split_number, header=None).values print('loaded validation data') test_data = pd.read_csv(folder + '/timit-preproc_testdata_norm_noctx.csv', header=None).values test_target = pd.read_csv(folder + '/timit_testTargets.csv', header=None).values print('loaded test data') if context: sbs = load_timit_sentence_bound() training_data, validation_data, test_data = (WindowedData(d, s, context, process_all=pa) for d, s, pa in zip([training_data, validation_data, test_data], sbs, process_all)) test_dataset = Dataset(data=test_data, target=test_target) validation_dataset = Dataset(data=validation_data, target=validation_target) training_dataset = Dataset(data=training_data, target=training_target, info=training_info_dict) res = Datasets(train=training_dataset, validation=validation_dataset, test=test_dataset) return res def load_mnist(folder=None, one_hot=True, partitions=None, filters=None, maps=None, shuffle=False): if not folder: folder = MNIST_DIR datasets = read_data_sets(folder, one_hot=one_hot) train = Dataset(datasets.train.images, datasets.train.labels) validation = Dataset(datasets.validation.images, datasets.validation.labels) test = Dataset(datasets.test.images, datasets.test.labels) res = [train, validation, test] if partitions: res = redivide_data(res, partition_proportions=partitions, filters=filters, maps=maps, shuffle=shuffle) res += [None] * (3 - len(res)) return Datasets.from_list(res) def load_caltech101_30(folder=CALTECH101_30_DIR, tiny_problem=False): caltech = scio.loadmat(folder + '/caltech101-30.matlab') k_train, k_test = caltech['Ktrain'], caltech['Ktest'] label_tr, label_te = caltech['tr_label'], caltech['te_label'] file_tr, file_te = caltech['tr_files'], caltech['te_files'] if tiny_problem: pattern_step = 5 fraction_limit = 0.2 k_train = k_train[:int(len(label_tr) * fraction_limit):pattern_step, :int(len(label_tr) * fraction_limit):pattern_step] label_tr = label_tr[:int(len(label_tr) * fraction_limit):pattern_step] U, s, Vh = linalg.svd(k_train) S_sqrt = linalg.diagsvd(s ** 0.5, len(s), len(s)) X = np.dot(U, S_sqrt) # examples in rows train_x, val_x, test_x = X[0:len(X):3, :], X[1:len(X):3, :], X[2:len(X):3, :] label_tr_enc = to_one_hot_enc(np.array(label_tr) - 1) train_y, val_y, test_y = label_tr_enc[0:len(X):3, :], label_tr_enc[1:len(X):3, :], label_tr_enc[2:len(X):3, :] train_file, val_file, test_file = file_tr[0:len(X):3], file_tr[1:len(X):3], file_tr[2:len(X):3] test_dataset = Dataset(data=test_x, target=test_y, info={'files': test_file}) validation_dataset = Dataset(data=val_x, target=val_y, info={'files': val_file}) training_dataset = Dataset(data=train_x, target=train_y, info={'files': train_file}) return Datasets(train=training_dataset, validation=validation_dataset, test=test_dataset) def load_iros15(folder=IROS15_BASE_FOLDER, resolution=15, legs='all', part_proportions=(.7, .2), one_hot=True, shuffle=True): resolutions = (5, 11, 15) legs_names = ('LF', 'LH', 'RF', 'RH') assert resolution in resolutions folder += str(resolution) if legs == 'all': legs = legs_names base_name_by_leg = lambda leg: os.path.join(folder, 'trainingSet%sx%sFromSensor%s.mat' % (resolution, resolution, leg)) datasets = {} for _leg in legs: dat = scio.loadmat(base_name_by_leg(_leg)) data, target = dat['X'], to_one_hot_enc(dat['Y']) if one_hot else dat['Y'] # maybe pre-processing??? or it is already done? ask... datasets[_leg] = Datasets.from_list( redivide_data([Dataset(data, target, info={'leg': _leg})], partition_proportions=part_proportions, shuffle=shuffle)) return datasets def load_caltech101(folder=CALTECH101_DIR, one_hot=True, partitions=None, filters=None, maps=None): path = folder + "/caltech101.pickle" with open(path, "rb") as input_file: X, target_name, files = cpickle.load(input_file) dict_name_ID = {} i = 0 list_of_targets = sorted(list(set(target_name))) for k in list_of_targets: dict_name_ID[k] = i i += 1 dict_ID_name = {v: k for k, v in dict_name_ID.items()} Y = [] for name_y in target_name: Y.append(dict_name_ID[name_y]) if one_hot: Y = to_one_hot_enc(Y) dataset = Dataset(data=X, target=Y, info={'dict_name_ID': dict_name_ID, 'dict_ID_name': dict_ID_name}, sample_info=[{'target_name': t, 'files': f} for t, f in zip(target_name, files)]) if partitions: res = redivide_data([dataset], partitions, filters=filters, maps=maps, shuffle=True) res += [None] * (3 - len(res)) return Datasets(train=res[0], validation=res[1], test=res[2]) return dataset def load_cifar10(folder=CIFAR10_DIR, one_hot=True, partitions=None, filters=None, maps=None, balance_classes=False): path = folder + "/cifar-10.pickle" with open(path, "rb") as input_file: X, target_name, files = cpickle.load(input_file) X = np.array(X) dict_name_ID = {} i = 0 list_of_targets = sorted(list(set(target_name))) for k in list_of_targets: dict_name_ID[k] = i i += 1 dict_ID_name = {v: k for k, v in dict_name_ID.items()} Y = [] for name_y in target_name: Y.append(dict_name_ID[name_y]) if one_hot: Y = to_one_hot_enc(Y) dataset = Dataset(data=X, target=Y, info={'dict_name_ID': dict_name_ID, 'dict_ID_name': dict_ID_name}, sample_info=[{'target_name': t, 'files': f} for t, f in zip(target_name, files)]) if partitions: res = redivide_data([dataset], partitions, filters=filters, maps=maps, shuffle=True, balance_classes=True) res += [None] * (3 - len(res)) return Datasets(train=res[0], validation=res[1], test=res[2]) return dataset def load_cifar100(folder=CIFAR100_DIR, one_hot=True, partitions=None, filters=None, maps=None): path = folder + "/cifar-100.pickle" with open(path, "rb") as input_file: X, target_ID_fine, target_ID_coarse, fine_ID_corr, coarse_ID_corr, files = cpickle.load(input_file) X = np.array(X); target_ID_fine = target_ID_fine[:len(X)] target_ID_coarse = target_ID_coarse[:len(X)] fine_ID_corr = {v: k for v, k in zip(range(len(fine_ID_corr)), fine_ID_corr)} coarse_ID_corr = {v: k for v, k in zip(range(len(coarse_ID_corr)), coarse_ID_corr)} fine_label_corr = {v: k for k, v in fine_ID_corr.items()} coarse_label_corr = {v: k for k, v in coarse_ID_corr.items()} Y = [] for name_y in target_ID_fine: Y.append(name_y) Y = np.array(Y) if one_hot: Y = to_one_hot_enc(Y) superY = [] for name_y in target_ID_coarse: superY.append(name_y) superY = np.array(superY) if one_hot: superY = to_one_hot_enc(superY) print(len(X)) print(len(Y)) dataset = Dataset(data=X, target=Y, info={'dict_name_ID_fine': fine_label_corr, 'dict_name_ID_coarse': coarse_label_corr, 'dict_ID_name_fine': fine_ID_corr, 'dict_ID_name_coarse': coarse_ID_corr}, sample_info=[{'Y_coarse': yc, 'files': f} for yc, f in zip(superY, files)]) if partitions: res = redivide_data([dataset], partitions, filters=filters, maps=maps, shuffle=True) res += [None] * (3 - len(res)) return Datasets(train=res[0], validation=res[1], test=res[2]) return dataset def generate_multiclass_dataset(n_samples=100, n_features=10, n_informative=5, n_redundant=3, n_repeated=2, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None, hot_encoded=True, partitions_proportions=None, negative_labels=-1.): X, y = sk_dt.make_classification(n_samples=n_samples, n_features=n_features, n_informative=n_informative, n_redundant=n_redundant, n_repeated=n_repeated, n_classes=n_classes, n_clusters_per_class=n_clusters_per_class, weights=weights, flip_y=flip_y, class_sep=class_sep, hypercube=hypercube, shift=shift, scale=scale, shuffle=True, random_state=random_state) if hot_encoded: y = to_one_hot_enc(y) else: y[y == 0] = negative_labels res = Dataset(data=np.array(X, dtype=np.float32), target=np.array(y, dtype=np.float32), info={'n_informative': n_informative, 'n_redundant': n_redundant, 'n_repeated': n_repeated, 'n_classes': n_classes, 'n_clusters_per_class': n_clusters_per_class, 'weights': weights, 'flip_y': flip_y, 'class_sep': class_sep, 'hypercube': hypercube, 'shift': shift, 'scale': scale, 'shuffle': True, 'random_state': random_state}) np.random.seed(random_state) if partitions_proportions: res = redivide_data([res], shuffle=shuffle, partition_proportions=partitions_proportions) res = Datasets.from_list(res) return res def get_data(d_set): if hasattr(d_set, 'images'): data = d_set.images elif hasattr(d_set, 'data'): data = d_set.data else: raise ValueError("something wrong with the dataset %s" % d_set) return data def get_targets(d_set): if hasattr(d_set, 'labels'): return d_set.labels elif hasattr(d_set, 'target'): return d_set.target else: raise ValueError("something wrong with the dataset %s" % d_set) # class ExampleVisiting: def __init__(self, dataset, batch_size, epochs=None): """ Class for stochastic sampling of data points. It is most useful for feeding examples for the the training ops of `ReverseHG` or `ForwardHG`. Most notably, if the number of epochs is specified, the class takes track of the examples per mini-batches which is important for the backward pass of `ReverseHG` method. :param dataset: instance of `Dataset` class :param batch_size: :param epochs: number of epochs (can be None, in which case examples are fed continuously) """ self.dataset = dataset self.batch_size = batch_size self.epochs = epochs self.T = int(np.ceil(dataset.num_examples / batch_size)) if self.epochs: self.T *= self.epochs self.training_schedule = None self.iter_per_epoch = int(dataset.num_examples / batch_size) def setting(self): excluded = ['training_schedule', 'datasets'] dictionary = {k: v for k, v in vars(self).items() if k not in excluded} if hasattr(self.dataset, 'setting'): dictionary['dataset'] = self.dataset.setting() return dictionary def generate_visiting_scheme(self): """ Generates and stores example visiting scheme, as a numpy array of integers. :return: self """ def all_indices_shuffled(): _res = list(range(self.dataset.num_examples)) np.random.shuffle(_res) return _res # noinspection PyUnusedLocal self.training_schedule = np.concatenate([all_indices_shuffled() for _ in range(self.epochs or 1)]) return self def create_supplier(self, x, y, other_feeds=None, lambda_feeds=None): return self.create_feed_dict_supplier(x, y, other_feeds=other_feeds, lambda_feeds=lambda_feeds) def create_feed_dict_supplier(self, x, y, other_feeds=None, lambda_feeds=None): """ :param x: placeholder for independent variable :param y: placeholder for dependent variable :param lambda_feeds: dictionary of placeholders: number_of_example -> substitution :param other_feeds: dictionary of other feeds (e.g. dropout factor, ...) to add to the input output feed_dict :return: a function that generates a feed_dict with the right signature for Reverse and Forward HyperGradient classes """ if not lambda_feeds: lambda_processed_feeds = {} if not other_feeds: other_feeds = {} def _training_supplier(step=None): nonlocal lambda_processed_feeds, other_feeds if step >= self.T: if step % self.T == 0: if self.epochs: print('WARNING: End of the training scheme reached.' 'Generating another scheme.') self.generate_visiting_scheme() step %= self.T if self.training_schedule is None: # print('visiting scheme not yet generated!') self.generate_visiting_scheme() # noinspection PyTypeChecker nb = self.training_schedule[step * self.batch_size: min( (step + 1) * self.batch_size, len(self.training_schedule))] bx = self.dataset.data[nb, :] by = self.dataset.target[nb, :] if lambda_feeds: lambda_processed_feeds = {k: v(nb) for k, v in lambda_feeds.items()} else: lambda_processed_feeds = {} return {**{x: bx, y: by}, **other_feeds, **lambda_processed_feeds} return _training_supplier def pad(_example, _size): return np.concatenate([_example] * _size) class WindowedData(object): def __init__(self, data, row_sentence_bounds, window=5, process_all=False): """ Class for managing windowed input data (like TIMIT). :param data: Numpy matrix. Each row should be an example data :param row_sentence_bounds: Numpy matrix with bounds for padding. TODO add default NONE :param window: half-window size :param process_all: (default False) if True adds context to all data at object initialization. Otherwise the windowed data is created in runtime. """ self.window = window self.data = data base_shape = self.data.shape self.shape = (base_shape[0], (2 * self.window + 1) * base_shape[1]) self.tree = it.IntervalTree([it.Interval(int(e[0]), int(e[1]) + 1) for e in row_sentence_bounds]) if process_all: print('adding context to all the dataset', end='- ') self.data = self.generate_all() print('DONE') self.process_all = process_all def generate_all(self): return self[:] def __getitem__(self, item): # TODO should be right for all the common use... But better write down a TestCase if hasattr(self, 'process_all') and self.process_all: # keep attr check! return self.data[item] if isinstance(item, int): return self.get_context(item=item) if isinstance(item, tuple): if len(item) == 2: rows, columns = item if isinstance(rows, int) and isinstance(columns, int): # TODO check here # do you want the particular element? return self.get_context(item=rows)[columns] else: raise TypeError('NOT IMPLEMENTED <|>') if isinstance(rows, slice): rows = range(*rows.indices(self.shape[0])) return np.vstack([self.get_context(r) for r in rows])[:, columns] else: if isinstance(item, slice): item = range(*item.indices(self.shape[0])) return np.vstack([self.get_context(r) for r in item]) def __len__(self): return self.shape[0] def get_context(self, item): interval = list(self.tree[item])[0] # print(interval) left, right = interval[0], interval[1] left_pad = max(self.window + left - item, 0) right_pad = max(0, self.window - min(right, len(self) - 1) + item) # this is to cope with reduce datasets # print(left, right, item) # print(left_pad, right_pad) base = np.concatenate(self.data[item - self.window + left_pad: item + self.window + 1 - right_pad]) if left_pad: base = np.concatenate([pad(self.data[item], left_pad), base]) if right_pad: base = np.concatenate([base, pad(self.data[item], right_pad)]) return base # # # if __name__ == '__main__': # # _datasets = load_20newsgroup_feed_vectorized(one_hot=False, binary_problem=True) # # print(_datasets.train.dim_data) # # print(_datasets.train.dim_target) # # mnist = load_mnist(partitions=[0.1, .2], filters=lambda x, y, d, k: True) # # print(len(_datasets.train))\ # # load_20newsgroup_vectorized(one_hot=False, shuffle=True, partitions_proportions=(1 / 3, 1 / 3)) # # mnist = load_mnist(partitions=(.1, .1), shuffle=True) # # print(mnist.train.data) # print(type(mnist.train.data)) # # # dt = load_20newsgroup_vectorized() # # print(dt.train.num_examples) # # print(dt.train.num_examples)