Python sklearn.utils.shuffle() Examples
The following are 30 code examples of sklearn.utils.shuffle(), drawn from open-source projects. The source file, project, and license are noted above each example.
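Before the project examples, here is a minimal sketch of the core API (an illustration for this page, not taken from any of the projects below): sklearn.utils.shuffle accepts any number of equal-length indexable arrays and returns copies shuffled with a single shared permutation; random_state makes that permutation reproducible.

import numpy as np
from sklearn.utils import shuffle

X = np.arange(10).reshape(5, 2)
y = np.array([0, 1, 2, 3, 4])

# Both arrays are permuted identically, so each row of X
# stays paired with its label in y.
X_shuffled, y_shuffled = shuffle(X, y, random_state=0)

Note that unlike random.shuffle, sklearn.utils.shuffle returns shuffled copies rather than shuffling in place, which is why the examples below typically assign its return value.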

Example #1
Source File: iterators.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0
def reset(self):
    """Resets the iterator to the beginning of the data."""
    self.curr_idx = 0
    # shuffle data in each bucket
    random.shuffle(self.idx)
    for i, buck in enumerate(self.sentences):
        self.indices[i], self.sentences[i], self.characters[i], self.label[i] = shuffle(
            self.indices[i], self.sentences[i], self.characters[i], self.label[i])

    self.ndindex = []
    self.ndsent = []
    self.ndchar = []
    self.ndlabel = []
    # for each bucket of data, append the lists with an array
    for i, buck in enumerate(self.sentences):
        self.ndindex.append(ndarray.array(self.indices[i], dtype=self.dtype))
        self.ndsent.append(ndarray.array(self.sentences[i], dtype=self.dtype))
        self.ndchar.append(ndarray.array(self.characters[i], dtype=self.dtype))
        self.ndlabel.append(ndarray.array(self.label[i], dtype=self.dtype))
Example #2
Source File: test_combination.py From pyod with BSD 2-Clause "Simplified" License
def test_aom_static_norepeat(self):
    score = aom(self.scores, 3, method='static',
                bootstrap_estimators=False, random_state=42)
    assert_equal(score.shape, (4,))

    shuffled_list = shuffle(list(range(0, 6, 1)), random_state=42)
    manual_scores = np.zeros([4, 3])
    manual_scores[:, 0] = np.max(self.scores[:, shuffled_list[0:2]], axis=1)
    manual_scores[:, 1] = np.max(self.scores[:, shuffled_list[2:4]], axis=1)
    manual_scores[:, 2] = np.max(self.scores[:, shuffled_list[4:6]], axis=1)

    manual_score = np.mean(manual_scores, axis=1)
    assert_array_equal(score, manual_score)
Example #3
Source File: test_combination.py From pyod with BSD 2-Clause "Simplified" License
def test_moa_static_norepeat(self):
    score = moa(self.scores, 3, method='static',
                bootstrap_estimators=False, random_state=42)
    assert_equal(score.shape, (4,))

    shuffled_list = shuffle(list(range(0, 6, 1)), random_state=42)
    manual_scores = np.zeros([4, 3])
    manual_scores[:, 0] = np.mean(self.scores[:, shuffled_list[0:2]], axis=1)
    manual_scores[:, 1] = np.mean(self.scores[:, shuffled_list[2:4]], axis=1)
    manual_scores[:, 2] = np.mean(self.scores[:, shuffled_list[4:6]], axis=1)

    manual_score = np.max(manual_scores, axis=1)
    assert_array_equal(score, manual_score)
Example #4
Source File: lda_model.py From redshells with MIT License
def fit(self,
        texts: List[List[str]],
        adjust_passes=True,
        test_size=0.1,
        random_state=123,
        dictionary: Optional[gensim.corpora.Dictionary] = None) -> None:
    texts = shuffle(texts)
    dictionary = dictionary or self._make_dictionary(texts)
    corpus = self._make_corpus(texts=texts, dictionary=dictionary)
    train, test = train_test_split(corpus, test_size=test_size, random_state=random_state)
    passes = np.clip(int(round(100000 / (len(corpus) + 1))), 1, 20) if adjust_passes else 1
    self._lda = gensim.models.LdaModel(
        alpha='auto',
        corpus=train,
        num_topics=self.n_topics,
        id2word=dictionary,
        iterations=self.iterations,
        passes=passes)
    self.log_perplexity = self._lda.log_perplexity(test)
    logger.info('log_perplexity=%s', self.log_perplexity)
Example #5
Source File: test_weight_boosting.py From Mastering-Elasticsearch-7.0 with MIT License
def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(X, y)
        importances = clf.feature_importances_

        assert_equal(importances.shape[0], 10)
        assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(), True)
Example #6
Source File: test_validation.py From Mastering-Elasticsearch-7.0 with MIT License
def test_learning_curve_batch_and_incremental_learning_are_equal():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    train_sizes = np.linspace(0.2, 1.0, 5)
    estimator = PassiveAggressiveClassifier(max_iter=1, tol=None,
                                            shuffle=False)

    train_sizes_inc, train_scores_inc, test_scores_inc = \
        learning_curve(estimator, X, y, train_sizes=train_sizes, cv=3,
                       exploit_incremental_learning=True)
    train_sizes_batch, train_scores_batch, test_scores_batch = \
        learning_curve(estimator, X, y, cv=3, train_sizes=train_sizes,
                       exploit_incremental_learning=False)

    assert_array_equal(train_sizes_inc, train_sizes_batch)
    assert_array_almost_equal(train_scores_inc.mean(axis=1),
                              train_scores_batch.mean(axis=1))
    assert_array_almost_equal(test_scores_inc.mean(axis=1),
                              test_scores_batch.mean(axis=1))
Example #7
Source File: test_validation.py From Mastering-Elasticsearch-7.0 with MIT License
def check_cross_val_predict_multiclass(est, X, y, method):
    """Helper for tests of cross_val_predict with multiclass classification"""
    cv = KFold(n_splits=3, shuffle=False)

    # Generate expected outputs
    float_min = np.finfo(np.float64).min
    default_values = {'decision_function': float_min,
                      'predict_log_proba': float_min,
                      'predict_proba': 0}
    expected_predictions = np.full((len(X), len(set(y))),
                                   default_values[method],
                                   dtype=np.float64)
    _, y_enc = np.unique(y, return_inverse=True)
    for train, test in cv.split(X, y_enc):
        est = clone(est).fit(X[train], y_enc[train])
        fold_preds = getattr(est, method)(X[test])
        i_cols_fit = np.unique(y_enc[train])
        expected_predictions[np.ix_(test, i_cols_fit)] = fold_preds

    # Check actual outputs for several representations of y
    for tg in [y, y + 1, y - 2, y.astype('str')]:
        assert_allclose(cross_val_predict(est, X, tg, method=method, cv=cv),
                        expected_predictions)
Example #8
Source File: test_data.py From Mastering-Elasticsearch-7.0 with MIT License
def test_power_transformer_nans(method):
    # Make sure lambda estimation is not influenced by NaN values
    # and that transform() supports NaN silently
    X = np.abs(X_1col)
    pt = PowerTransformer(method=method)
    pt.fit(X)
    lmbda_no_nans = pt.lambdas_[0]

    # concat nans at the end and check lambda stays the same
    X = np.concatenate([X, np.full_like(X, np.nan)])
    X = shuffle(X, random_state=0)

    pt.fit(X)
    lmbda_nans = pt.lambdas_[0]

    assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5)

    X_trans = pt.transform(X)
    assert_array_equal(np.isnan(X_trans), np.isnan(X))
Example #9
Source File: dataset.py From cv-tricks.com with MIT License
def read_train_sets(train_path, image_size, classes, validation_size):
    class DataSets(object):
        pass
    data_sets = DataSets()

    images, labels, img_names, cls = load_train(train_path, image_size, classes)
    images, labels, img_names, cls = shuffle(images, labels, img_names, cls)

    if isinstance(validation_size, float):
        validation_size = int(validation_size * images.shape[0])

    validation_images = images[:validation_size]
    validation_labels = labels[:validation_size]
    validation_img_names = img_names[:validation_size]
    validation_cls = cls[:validation_size]

    train_images = images[validation_size:]
    train_labels = labels[validation_size:]
    train_img_names = img_names[validation_size:]
    train_cls = cls[validation_size:]

    data_sets.train = DataSet(train_images, train_labels, train_img_names, train_cls)
    data_sets.valid = DataSet(validation_images, validation_labels, validation_img_names, validation_cls)

    return data_sets
Example #10
Source File: dataset.py From Neural-Network-Programming-with-TensorFlow with MIT License
def next_batch(self, batch_size):
    """Return the next `batch_size` examples from this data set."""
    start = self._index_in_epoch
    self._index_in_epoch += batch_size

    if self._index_in_epoch > self._num_examples:
        # Finished epoch
        self._epochs_completed += 1

        # Shuffle the data (maybe)
        # perm = np.arange(self._num_examples)
        # np.random.shuffle(perm)
        # self._images = self._images[perm]
        # self._labels = self._labels[perm]

        # Start next epoch
        start = 0
        self._index_in_epoch = batch_size
        assert batch_size <= self._num_examples
    end = self._index_in_epoch

    return self._images[start:end], self._labels[start:end], self._ids[start:end], self._cls[start:end]
Example #11
Source File: dataset.py From Neural-Network-Programming-with-TensorFlow with MIT License
def read_train_sets(train_path, image_size, classes, validation_size=0):
    class DataSets(object):
        pass
    data_sets = DataSets()

    images, labels, ids, cls = load_train(train_path, image_size, classes)
    images, labels, ids, cls = shuffle(images, labels, ids, cls)  # shuffle the data

    if isinstance(validation_size, float):
        validation_size = int(validation_size * images.shape[0])

    validation_images = images[:validation_size]
    validation_labels = labels[:validation_size]
    validation_ids = ids[:validation_size]
    validation_cls = cls[:validation_size]

    train_images = images[validation_size:]
    train_labels = labels[validation_size:]
    train_ids = ids[validation_size:]
    train_cls = cls[validation_size:]

    data_sets.train = DataSet(train_images, train_labels, train_ids, train_cls)
    data_sets.valid = DataSet(validation_images, validation_labels, validation_ids, validation_cls)

    return data_sets
Example #12
Source File: utils.py From adversarial-autoencoder with MIT License
def load_mnist():
    with open('mnist/train-images-idx3-ubyte', 'rb') as f:
        data = np.fromfile(file=f, dtype=np.uint8)
    X_train = data[16:].reshape(60000, 28 * 28).astype(np.float32)

    with open('mnist/train-labels-idx1-ubyte', 'rb') as f:
        data = np.fromfile(file=f, dtype=np.uint8)
    y_train = data[8:].reshape(60000).astype(np.uint8)

    with open('mnist/t10k-images-idx3-ubyte', 'rb') as f:
        data = np.fromfile(file=f, dtype=np.uint8)
    X_test = data[16:].reshape(10000, 28 * 28).astype(np.float32)

    with open('mnist/t10k-labels-idx1-ubyte', 'rb') as f:
        data = np.fromfile(file=f, dtype=np.uint8)
    y_test = data[8:].reshape(10000).astype(np.uint8)

    X_train, y_train = shuffle(X_train, y_train)
    X_test, y_test = shuffle(X_test, y_test)

    X_train /= 255.
    X_test /= 255.

    return X_train, y_train, X_test, y_test
Example #13
Source File: iterators.py From training_results_v0.6 with Apache License 2.0
def reset(self):
    """Resets the iterator to the beginning of the data."""
    self.curr_idx = 0
    # shuffle data in each bucket
    random.shuffle(self.idx)
    for i, buck in enumerate(self.sentences):
        self.indices[i], self.sentences[i], self.characters[i], self.label[i] = shuffle(
            self.indices[i], self.sentences[i], self.characters[i], self.label[i])

    self.ndindex = []
    self.ndsent = []
    self.ndchar = []
    self.ndlabel = []
    # for each bucket of data, append the lists with an array
    for i, buck in enumerate(self.sentences):
        self.ndindex.append(ndarray.array(self.indices[i], dtype=self.dtype))
        self.ndsent.append(ndarray.array(self.sentences[i], dtype=self.dtype))
        self.ndchar.append(ndarray.array(self.characters[i], dtype=self.dtype))
        self.ndlabel.append(ndarray.array(self.label[i], dtype=self.dtype))
Example #14
Source File: helper.py From Kitchen2D with MIT License
def gen_biased_data(func, pos_ratio, N):
    '''
    Generate N data points on function func, with pos_ratio percentage
    of the data points to have a positive label.
    '''
    pos = []
    neg = []
    while len(pos) < pos_ratio * N or len(neg) < N - pos_ratio * N:
        x = np.random.uniform(func.x_range[0], func.x_range[1])
        y = func(x)
        if y > 0:
            if len(pos) < pos_ratio * N:
                pos.append(np.hstack((x, y)))
        elif len(neg) < N - pos_ratio * N:
            neg.append(np.hstack((x, y)))
    xy = np.vstack((pos, neg))
    xy = shuffle(xy)
    return xy[:, :-1], xy[:, -1]
Example #15
Source File: functional_autoencoder_test.py From FATE with Apache License 2.0
def getKaggleMNIST(file_path):
    # MNIST data:
    # column 0 is labels
    # column 1-785 is data, with values 0 .. 255
    # total size of CSV: (42000, 1, 28, 28)
    train = pd.read_csv(file_path)
    train = train.as_matrix()  # note: removed in pandas 1.0+; newer code would use .to_numpy()
    train = shuffle(train)

    Xtrain = train[:-1000, 1:] / 255
    Ytrain = train[:-1000, 0].astype(np.int32)

    Xtest = train[-1000:, 1:] / 255
    Ytest = train[-1000:, 0].astype(np.int32)

    return Xtrain, Ytrain, Xtest, Ytest
Example #16
Source File: spec.py From BirdCLEF-Baseline with MIT License
def getSpecs(path):
    specs = []
    noise = []

    # Get mel-specs for file
    for spec in audio.specsFromFile(path,
                                    rate=cfg.SAMPLE_RATE,
                                    seconds=cfg.SPEC_LENGTH,
                                    overlap=cfg.SPEC_OVERLAP,
                                    minlen=cfg.SPEC_MINLEN,
                                    fmin=cfg.SPEC_FMIN,
                                    fmax=cfg.SPEC_FMAX,
                                    spec_type=cfg.SPEC_TYPE,
                                    shape=(cfg.IM_SIZE[1], cfg.IM_SIZE[0])):

        # Determine signal to noise ratio
        s2n = audio.signal2noise(spec)
        specs.append(spec)
        noise.append(s2n)

    # Shuffle arrays (we want to select randomly later)
    specs, noise = shuffle(specs, noise, random_state=RANDOM)

    return specs, noise
Example #17
Source File: vqc.py From qiskit-aqua with Apache License 2.0
def batch_data(self, data, labels=None, minibatch_size=-1):
    """ batch data """
    label_batches = None

    if 0 < minibatch_size < len(data):
        batch_size = min(minibatch_size, len(data))
        if labels is not None:
            shuffled_samples, shuffled_labels = shuffle(
                data, labels, random_state=aqua_globals.random_seed)
            label_batches = np.array_split(shuffled_labels, batch_size)
        else:
            shuffled_samples = shuffle(data, random_state=aqua_globals.random_seed)
        batches = np.array_split(shuffled_samples, batch_size)
    else:
        batches = np.asarray([data])
        label_batches = np.asarray([labels])
    return batches, label_batches
Example #18
Source File: train.py From models with MIT License
def run_epoch():
    for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, trYt, random_state=np.random),
                                   n_batch=n_batch_train, truncate=True, verbose=True):
        global n_updates
        XMB = model.xp.asarray(xmb)
        YMB = model.xp.asarray(ymb)
        MMB = model.xp.asarray(mmb)
        h = model(XMB)
        lm_logits = lm_head(h)
        clf_logits = clf_head(h, XMB)
        compute_loss_fct(XMB, YMB, MMB, clf_logits, lm_logits)
        n_updates += 1
        if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
            log()
Example #19
Source File: classify.py From blow with Apache License 2.0
def batch_loop(e, r, x, y, eval):
    if eval:
        model.eval()
    else:
        model.train()
        r = shuffle(r)
    losses = []
    predictions = []
    for b in range(0, len(r), sbatch):
        if b + sbatch > len(r):
            rr = r[b:]
        else:
            rr = r[b:b + sbatch]
        rr = torch.LongTensor(rr)
        xb = x[rr, :].to(args.device)
        yb = y[rr].to(args.device)
        ybhat = model.forward(xb)
        loss = loss_function(ybhat, yb)
        losses += list(loss.data.cpu().numpy())
        predictions += list(ybhat.data.max(1)[1].cpu().numpy())
        if not eval:
            loss = loss.mean()
            optim.zero_grad()
            loss.backward()
            optim.step()
        print('\rEpoch {:03d}/{:03d} - {:5.1f}% : loss = {:7.3f}'.format(
            e + 1, nepochs, 100 * len(losses) / len(x), np.mean(losses)), end='')
    return losses, predictions
Example #20
Source File: data.py From malss with MIT License
def fit_transform(self, X, y=None):
    if isinstance(X, np.ndarray):
        self.X = pd.DataFrame(X)
        if y is not None:
            self.y = pd.Series(y)
    else:
        self.X = X.copy(deep=True)
        if y is not None:
            if isinstance(y, pd.Series):
                self.y = y.copy(deep=True)
            else:
                self.y = y.iloc[:, 0]  # Convert Dataframe to Series

    if not isinstance(self.X, pd.DataFrame):
        raise ValueError(f'{type(X)} is not supported')
    if y is not None and len(X) != len(y):
        raise ValueError(('Found input variables with inconsistent '
                          f'numbers of samples: [{len(X)}, {len(y)}]'))

    self.shape_before = self.X.shape

    self.X, self.col_was_null = self.__impute(self.X)

    self._label_encoder = None
    self._onehot_encoder = None
    self.X, self.del_columns = self.__encode(self.X)

    self._standardizer = None
    if self.standardize:
        self.X = self.__standardize(self.X)

    if self.shuffle:
        if self.y is not None:
            self.X, self.y = sk_shuffle(self.X, self.y,
                                        random_state=self.random_state)
        else:
            self.X = sk_shuffle(self.X, random_state=self.random_state)
Example #21
Source File: prepare_data.py From cloudless with Apache License 2.0
def _split_data_sets(details):
    """
    Shuffles and splits our datasets into training and validation sets.
    """
    # Note: this project uses Python 2 print statements.
    image_paths = details["image_paths"]
    targets = details["targets"]

    print "\tShuffling data..."
    (image_paths, targets) = shuffle(image_paths, targets, random_state=0)

    print "\tSplitting data 80% training, 20% validation..."
    return train_test_split(image_paths, targets, train_size=0.8,
                            test_size=0.2, random_state=0)
Example #22
Source File: test_hierarchical_sampling.py From libact with BSD 2-Clause "Simplified" License
def setUp(self):
    iris = datasets.load_iris()
    X, y = shuffle(iris.data, iris.target, random_state=1126)
    self.X = X.tolist()
    self.y = y.tolist()
    self.classes = list(set(self.y))
Example #23
Source File: data_utils.py From EvolutionaryGAN with MIT License
def shuffle(*arrays, **options):
    # basestring is Python 2-only (covers str and unicode)
    if isinstance(arrays[0][0], basestring):
        return list_shuffle(*arrays)
    else:
        return skutils.shuffle(*arrays, random_state=np_rng)
Example #24
Source File: keras-theano.py From DeepLearning-IDS with MIT License
def loadData(fileName):
    dataFile = os.path.join(dataPath, fileName)
    pickleDump = '{}.pickle'.format(dataFile)
    if os.path.exists(pickleDump):
        df = pd.read_pickle(pickleDump)
    else:
        df = pd.read_csv(dataFile)
        df = df.dropna()
        df = shuffle(df)
        df.to_pickle(pickleDump)
    return df
Example #25
Source File: keras-tensorflow.py From DeepLearning-IDS with MIT License
def loadData(fileName):
    dataFile = os.path.join(dataPath, fileName)
    pickleDump = '{}.pickle'.format(dataFile)
    if os.path.exists(pickleDump):
        df = pd.read_pickle(pickleDump)
    else:
        df = pd.read_csv(dataFile)
        df = df.dropna()
        df = shuffle(df)
        df.to_pickle(pickleDump)
    return df
Example #26
Source File: fastai-expriments.py From DeepLearning-IDS with MIT License
def loadData(fileName):
    dataFile = os.path.join(dataPath, fileName)
    pickleDump = '{}.pickle'.format(dataFile)
    if os.path.exists(pickleDump):
        df = pd.read_pickle(pickleDump)
    else:
        df = pd.read_csv(dataFile)
        df = df.dropna()
        df = shuffle(df)
        df.to_pickle(pickleDump)
    return df
Example #27
Source File: fastai-expriments.py From DeepLearning-IDS with MIT License
def experimentIndividual(dataFile, epochs=5, normalize=False):
    # procs = [FillMissing, Categorify, Normalize]
    procs = [FillMissing, Categorify]
    if normalize:
        procs.append(Normalize)
    seed = 7
    np.random.seed(seed)

    # load data
    data = loadData(dataFile)

    # define 5-fold cross-validation test harness
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    cvscores = []
    fold = 1
    for train_idx, test_idx in kfold.split(data.index, data[dep_var]):
        print('running fold = ', fold)
        fold += 1
        # create the data bunch for this fold
        data_fold = (TabularList.from_df(data, path=dataPath, cat_names=cat_names,
                                         cont_names=cont_names, procs=procs)
                     .split_by_idxs(train_idx, test_idx)
                     .label_from_df(cols=dep_var)
                     .databunch())
        # create model and learn
        model = tabular_learner(data_fold, layers=[200, 100],
                                metrics=accuracy, callback_fns=ShowGraph)
        model.fit(epochs, 1e-2)
        model.save('{}.model'.format(os.path.basename(dataFile)))
        # evaluate the model
        loss, acc = model.validate()
        print('loss {}: accuracy: {:.2f}%'.format(loss, acc * 100))
        cvscores.append(acc * 100)

    resultFile = os.path.join(resultPath, dataFile)
    with open('{}.result'.format(resultFile), 'a') as fout:
        fout.write('accuracy: {:.2f} std-dev: {:.2f}\n'.format(
            np.mean(cvscores), np.std(cvscores)))
Example #28
Source File: test_common.py From Mastering-Elasticsearch-7.0 with MIT License
def test_sample_order_invariance(name):
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 2, size=(20, ))
    y_pred = random_state.randint(0, 2, size=(20, ))
    y_true_shuffle, y_pred_shuffle = shuffle(y_true, y_pred, random_state=0)

    with ignore_warnings():
        metric = ALL_METRICS[name]
        assert_allclose(metric(y_true, y_pred),
                        metric(y_true_shuffle, y_pred_shuffle),
                        err_msg="%s is not sample order invariant" % name)
Example #29
Source File: test_common.py From Mastering-Elasticsearch-7.0 with MIT License
def test_sample_order_invariance_multilabel_and_multioutput():
    random_state = check_random_state(0)

    # Generate some data
    y_true = random_state.randint(0, 2, size=(20, 25))
    y_pred = random_state.randint(0, 2, size=(20, 25))
    y_score = random_state.normal(size=y_true.shape)

    y_true_shuffle, y_pred_shuffle, y_score_shuffle = shuffle(y_true,
                                                              y_pred,
                                                              y_score,
                                                              random_state=0)

    for name in MULTILABELS_METRICS:
        metric = ALL_METRICS[name]
        assert_allclose(metric(y_true, y_pred),
                        metric(y_true_shuffle, y_pred_shuffle),
                        err_msg="%s is not sample order invariant" % name)

    for name in THRESHOLDED_MULTILABEL_METRICS:
        metric = ALL_METRICS[name]
        assert_allclose(metric(y_true, y_score),
                        metric(y_true_shuffle, y_score_shuffle),
                        err_msg="%s is not sample order invariant" % name)

    for name in MULTIOUTPUT_METRICS:
        metric = ALL_METRICS[name]
        assert_allclose(metric(y_true, y_score),
                        metric(y_true_shuffle, y_score_shuffle),
                        err_msg="%s is not sample order invariant" % name)
        assert_allclose(metric(y_true, y_pred),
                        metric(y_true_shuffle, y_pred_shuffle),
                        err_msg="%s is not sample order invariant" % name)
Example #30
Source File: test_utils.py From Mastering-Elasticsearch-7.0 with MIT License
def test_shuffle_on_ndim_equals_three():
    def to_tuple(A):  # to make the inner arrays hashable
        return tuple(tuple(tuple(C) for C in B) for B in A)

    A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])  # A.shape = (2, 2, 2)
    S = set(to_tuple(A))
    shuffle(A)  # shouldn't raise a ValueError for dim = 3
    assert_equal(set(to_tuple(A)), S)