from time import time
import os
import numpy as np
import scipy.io as sio
import argparse
import random
from config import cfg, get_data_dir, get_output_dir
from sklearn.preprocessing import scale as skscale
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA


def load_data(filename, n_samples):
    # cPickle only exists on Python 2; fall back to the standard pickle module on Python 3
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    fo = open(filename, 'rb')
    data = pickle.load(fo)
    fo.close()

    labels = data['labels'][0:n_samples]
    labels = np.squeeze(labels)
    features = data['data'][0:n_samples]
    features = features.astype(np.float32, copy=False)
    features = features.reshape((n_samples, -1))
    return labels, features


def load_matdata(filename, n_samples):
    # TODO switch other loading to also use new X,Y convention instead of labels,data?
    data = sio.loadmat(filename)
    labels = data['Y'][0:n_samples]
    labels = np.squeeze(labels)
    features = data['X'][0:n_samples]
    features = features.astype(np.float32, copy=False)
    # TODO figure out why we need to reshape this...
    # features = features.reshape((n_samples, -1))
    return labels, features


def load_data_h5py(filename, n_samples):
    import h5py
    data = h5py.File(filename, 'r')
    labels = data['labels'][0:n_samples]
    labels = np.squeeze(labels)
    features = data['data'][0:n_samples]
    features = features.astype(np.float32, copy=False)
    features = features.reshape((n_samples, -1))
    data.close()
    return labels, features


def load_train_and_validation(loader, datadir, n_samples):
    td = os.path.join(datadir, 'traindata.mat')
    # TODO n_samples doesn't really make sense as a single parameter anymore since the data set is split in 2
    lt, ft = loader(td, n_samples)

    tv = os.path.join(datadir, 'testdata.mat')
    lv, fv = loader(tv, n_samples)

    return np.concatenate((lt, lv)), np.concatenate((ft, fv))


def feature_transformation(features, preprocessing='normalization'):
    n_samples, n_features = features.shape
    if preprocessing == 'scale':
        features = skscale(features, copy=False)
    elif preprocessing == 'minmax':
        minmax_scale = MinMaxScaler().fit(features)
        features = minmax_scale.transform(features)
    elif preprocessing == 'normalization':
        features = np.sqrt(n_features) * normalize(features, copy=False)
    else:
        print('No preprocessing is applied')
    return features


def kNN(X, k, measure='euclidean'):
    """
    Construct pairwise weights by finding the k nearest neighbors to each point
    and assigning a Gaussian-based distance.

    Parameters
    ----------
    X : [n_samples, n_dim] array
    k : int
        number of neighbors for each sample in X
    """
    from scipy.spatial import distance

    weights = []
    w = distance.cdist(X, X, measure)
    y = np.argsort(w, axis=1)

    for i, x in enumerate(X):
        distances, indices = w[i, y[i, 1:k + 1]], y[i, 1:k + 1]
        for (d, j) in zip(distances, indices):
            # store each undirected edge once, with the smaller index first
            if i < j:
                weights.append((i, j, d * d))
            else:
                weights.append((j, i, d * d))
    weights = sorted(weights, key=lambda r: (r[0], r[1]))
    return np.unique(np.asarray(weights), axis=0)
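

# --- Illustrative sketch (not part of the original pipeline) -----------------------
# Shows one way the (i, j, d^2) edge list returned by kNN() could be turned into a
# sparse symmetric affinity matrix. The helper name `_knn_edges_to_affinity` and the
# Gaussian bandwidth `sigma` are assumptions made for this example only.
def _knn_edges_to_affinity(edges, n_samples, sigma=1.0):
    from scipy.sparse import coo_matrix
    i = edges[:, 0].astype(int)
    j = edges[:, 1].astype(int)
    # convert the stored squared distances into Gaussian weights
    vals = np.exp(-edges[:, 2] / (2.0 * sigma ** 2))
    A = coo_matrix((vals, (i, j)), shape=(n_samples, n_samples))
    # each edge is stored once with i < j, so symmetrize explicitly
    return A + A.T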


def mkNN(X, k, measure='euclidean'):
    """
    Construct a mutual kNN graph for a large-scale dataset.

    If j is one of i's k closest neighbors and i is also one of j's k closest
    neighbors, the edge will appear once as (i, j) with i < j.

    Parameters
    ----------
    X : [n_samples, n_dim] array
    k : int
        number of neighbors for each sample in X
    """
    from scipy.spatial import distance
    from scipy.sparse import csr_matrix, triu, find
    from scipy.sparse.csgraph import minimum_spanning_tree

    samples = X.shape[0]
    batchsize = 10000
    b = np.arange(k + 1)
    b = tuple(b[1:].ravel())
    z = np.zeros((samples, k))
    weigh = np.zeros_like(z)
    # This loop speeds up the computation by operating in batches
    # This can be parallelized to further utilize CPU/GPU resources
    for x in np.arange(0, samples, batchsize):
        start = x
        end = min(x + batchsize, samples)

        w = distance.cdist(X[start:end], X, measure)
        y = np.argpartition(w, b, axis=1)

        z[start:end, :] = y[:, 1:k + 1]
        weigh[start:end, :] = np.reshape(
            w[tuple(np.repeat(np.arange(end - start), k)), tuple(y[:, 1:k + 1].ravel())],
            (end - start, k))
        del w

    ind = np.repeat(np.arange(samples), k)

    P = csr_matrix((np.ones((samples * k)), (ind.ravel(), z.ravel())), shape=(samples, samples))
    Q = csr_matrix((weigh.ravel(), (ind.ravel(), z.ravel())), shape=(samples, samples))

    # the minimum spanning tree of the distance graph keeps the final edge set connected
    Tcsr = minimum_spanning_tree(Q)
    P = P.minimum(P.transpose()) + Tcsr.maximum(Tcsr.transpose())
    P = triu(P, k=1)

    return np.asarray(find(P)).T


def compressed_data(dataset, n_samples, k, preprocess=None, algo='mknn', isPCA=None, format='mat'):
    datadir = get_data_dir(dataset)

    if format == 'pkl':
        labels, features = load_train_and_validation(load_data, datadir, n_samples)
    elif format == 'h5':
        labels, features = load_train_and_validation(load_data_h5py, datadir, n_samples)
    else:
        labels, features = load_train_and_validation(load_matdata, datadir, n_samples)

    features = feature_transformation(features, preprocessing=preprocess)

    # PCA is computed for the Text dataset. Please refer to the RCC paper for exact details.
    features1 = features.copy()
    if isPCA is not None:
        pca = PCA(n_components=isPCA, svd_solver='full').fit(features)
        features1 = pca.transform(features)

    t0 = time()
    if algo == 'knn':
        weights = kNN(features1, k=k, measure='euclidean')
    else:
        weights = mkNN(features1, k=k, measure='cosine')
    print('The time taken for edge set computation is {}'.format(time() - t0))

    filepath = os.path.join(datadir, 'pretrained')
    if format == 'h5':
        import h5py
        fo = h5py.File(filepath + '.h5', 'w')
        fo.create_dataset('X', data=features)
        fo.create_dataset('w', data=weights[:, :2])
        fo.create_dataset('gtlabels', data=labels)
        fo.close()
    else:
        sio.savemat(filepath + '.mat', mdict={'X': features, 'w': weights[:, :2], 'gtlabels': labels})


def parse_args():
    """ Parse input arguments """
    parser = argparse.ArgumentParser(description='Feature extraction for RCC algorithm')

    parser.add_argument('--dataset', default=None, type=str,
                        help='The entered dataset file must be in the Data folder')
    parser.add_argument('--prep', dest='prep', default='none', type=str,
                        help='preprocessing of data: scale, minmax, normalization, none')
    parser.add_argument('--algo', dest='algo', default='mknn', type=str,
                        help='algorithm to use: knn, mknn')
    parser.add_argument('--k', dest='k', default=10, type=int,
                        help='number of nearest neighbors to consider')
    parser.add_argument('--pca', dest='pca', default=None, type=int,
                        help='dimension of PCA processing before kNN graph construction')
    parser.add_argument('--samples', dest='nsamples', default=0, type=int,
                        help='total number of samples to consider')
    parser.add_argument('--format', choices=['mat', 'pkl', 'h5'], default='mat',
                        help='dataset format')

    args = parser.parse_args()
    return args
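

# --- Illustrative sketch (assumption, not part of the original pipeline) -----------
# Shows how the 'pretrained.mat' file written by compressed_data() could be read back
# by downstream code. The keys 'X', 'w' and 'gtlabels' mirror what is saved above;
# the helper name `_load_pretrained` is introduced only for this example.
def _load_pretrained(datadir):
    data = sio.loadmat(os.path.join(datadir, 'pretrained.mat'))
    X = data['X']                           # preprocessed feature matrix, one row per sample
    w = data['w'].astype(int)               # edge list, one (i, j) pair per row with i < j
    labels = np.squeeze(data['gtlabels'])   # ground-truth labels, kept only for evaluation
    return X, w, labels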


if __name__ == '__main__':
    """
    -------------------------------
    Dataset  | samples | dimension
    -------------------------------
    Mnist    | 70000   | [28,28,1]
    YaleB    | 2414    | [168,192,1]
    Coil100  | 7200    | [128,128,3]
    YTF      | 10056   | [55,55,3]
    Reuters  | 9082    | 2000
    RCV1     | 10000   | 2000
    -------------------------------
    """
    random.seed(50)

    args = parse_args()
    print('Called with args:')
    print(args)

    # storing compressed data
    compressed_data(args.dataset, args.nsamples, args.k, preprocess=args.prep, algo=args.algo, isPCA=args.pca,
                    format=args.format)
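
# Example invocation (a sketch: 'mnist' is an assumed dataset folder name under the
# data directory resolved by get_data_dir, and the script filename may differ):
#   python edgeConstruction.py --dataset mnist --samples 70000 --prep minmax --algo mknn --k 10 --format mat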