Python sklearn.utils.shuffle() Examples

The following are 29 code examples of sklearn.utils.shuffle(), drawn from open-source projects. The source file, originating project, and license are listed above each example. You may also want to check out all available functions/classes of the module sklearn.utils, or try the search function.
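Before the examples, a minimal usage sketch (the arrays and seed below are illustrative, not taken from any project on this page): sklearn.utils.shuffle() permutes any number of array-likes in unison along their first axis and, unlike random.shuffle() or np.random.shuffle(), returns new collections instead of shuffling in place.

import numpy as np
from sklearn.utils import shuffle

X = np.arange(10).reshape(5, 2)  # five samples, two features
y = np.array([0, 1, 0, 1, 0])    # labels aligned with the rows of X

# X and y receive the same row permutation; random_state makes it repeatable
X_shuffled, y_shuffled = shuffle(X, y, random_state=0)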
Example #1
Source File: test_data.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_power_transformer_nans(method):
    # Make sure lambda estimation is not influenced by NaN values
    # and that transform() supports NaN silently

    X = np.abs(X_1col)  # X_1col is a module-level fixture in scikit-learn's test_data.py
    pt = PowerTransformer(method=method)
    pt.fit(X)
    lmbda_no_nans = pt.lambdas_[0]

    # concat nans at the end and check lambda stays the same
    X = np.concatenate([X, np.full_like(X, np.nan)])
    X = shuffle(X, random_state=0)

    pt.fit(X)
    lmbda_nans = pt.lambdas_[0]

    assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5)

    X_trans = pt.transform(X)
    assert_array_equal(np.isnan(X_trans), np.isnan(X)) 
Example #2
Source File: vqc.py    From qiskit-aqua with Apache License 2.0
def batch_data(self, data, labels=None, minibatch_size=-1):
        """ batch data """
        label_batches = None

        if 0 < minibatch_size < len(data):
            batch_size = min(minibatch_size, len(data))
            if labels is not None:
                shuffled_samples, shuffled_labels = shuffle(data, labels,
                                                            random_state=aqua_globals.random_seed)
                label_batches = np.array_split(shuffled_labels, batch_size)
            else:
                shuffled_samples = shuffle(data, random_state=aqua_globals.random_seed)
            batches = np.array_split(shuffled_samples, batch_size)
        else:
            batches = np.asarray([data])
            label_batches = np.asarray([labels])
        return batches, label_batches 
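One detail worth noting in the snippet above: np.array_split(a, n) splits a into n sub-arrays, so the variable named batch_size actually controls the number of batches, not the number of samples per batch. A quick illustration (input values are made up):

import numpy as np

print(np.array_split(np.arange(7), 3))
# [array([0, 1, 2]), array([3, 4]), array([5, 6])]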
Example #3
Source File: helper.py    From Kitchen2D with MIT License
def gen_biased_data(func, pos_ratio, N):
    '''
    Generate N data points on function func, where a pos_ratio fraction of
    the points has a positive label (func(x) > 0).
    '''
    pos = []
    neg = []
    while len(pos) < pos_ratio * N or len(neg) < N - pos_ratio * N:
        x = np.random.uniform(func.x_range[0], func.x_range[1])
        y = func(x)
        if y > 0:
            if len(pos) < pos_ratio * N:
                pos.append(np.hstack((x, y)))
        elif len(neg) < N - pos_ratio * N:
            neg.append(np.hstack((x, y)))
    xy = np.vstack((pos, neg))
    xy = shuffle(xy)
    return xy[:, :-1], xy[:, -1] 
Example #4
Source File: train.py    From models with MIT License
def run_epoch():
    for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, trYt, random_state=np.random),
                                   n_batch=n_batch_train, truncate=True, verbose=True):
        global n_updates
        XMB = model.xp.asarray(xmb)
        YMB = model.xp.asarray(ymb)
        MMB = model.xp.asarray(mmb)
        h = model(XMB)
        lm_logits = lm_head(h)
        clf_logits = clf_head(h, XMB)
        compute_loss_fct(XMB, YMB, MMB, clf_logits, lm_logits)
        n_updates += 1
        if n_updates in [
                1000,
                2000,
                4000,
                8000,
                16000,
                32000] and n_epochs == 0:
            log() 
Example #5
Source File: spec.py    From BirdCLEF-Baseline with MIT License
def getSpecs(path):
    
    specs = []
    noise = []

    # Get mel-specs for file
    for spec in audio.specsFromFile(path,
                                    rate=cfg.SAMPLE_RATE,
                                    seconds=cfg.SPEC_LENGTH,
                                    overlap=cfg.SPEC_OVERLAP,
                                    minlen=cfg.SPEC_MINLEN,
                                    fmin=cfg.SPEC_FMIN,
                                    fmax=cfg.SPEC_FMAX,
                                    spec_type=cfg.SPEC_TYPE,
                                    shape=(cfg.IM_SIZE[1], cfg.IM_SIZE[0])):

        # Determine signal to noise ratio
        s2n = audio.signal2noise(spec)
        specs.append(spec)
        noise.append(s2n)

    # Shuffle arrays (we want to select randomly later)
    specs, noise = shuffle(specs, noise, random_state=RANDOM)

    return specs, noise 
Example #6
Source File: functional_autoencoder_test.py    From FATE with Apache License 2.0
def getKaggleMNIST(file_path):

    # MNIST data (Kaggle train.csv):
    # column 0 is the label
    # the remaining 784 columns are pixel values in 0..255
    # CSV shape: (42000, 785); each row is a flattened 28x28 image

    train = pd.read_csv(file_path)
    train = train.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
    train = shuffle(train)

    Xtrain = train[:-1000, 1:] / 255
    Ytrain = train[:-1000, 0].astype(np.int32)
    Xtest  = train[-1000:, 1:] / 255
    Ytest  = train[-1000:, 0].astype(np.int32)

    return Xtrain, Ytrain, Xtest, Ytest 
Example #7
Source File: iterators.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0
def reset(self):
        """Resets the iterator to the beginning of the data."""
        self.curr_idx = 0
        # shuffle data in each bucket
        random.shuffle(self.idx)
        for i, buck in enumerate(self.sentences):
            self.indices[i], self.sentences[i], self.characters[i], self.label[i] = shuffle(self.indices[i],
                                                                                            self.sentences[i],
                                                                                            self.characters[i],
                                                                                            self.label[i])

        self.ndindex = []
        self.ndsent = []
        self.ndchar = []
        self.ndlabel = []

        # for each bucket of data
        for i, buck in enumerate(self.sentences):
            # append the lists with an array
            self.ndindex.append(ndarray.array(self.indices[i], dtype=self.dtype))
            self.ndsent.append(ndarray.array(self.sentences[i], dtype=self.dtype))
            self.ndchar.append(ndarray.array(self.characters[i], dtype=self.dtype))
            self.ndlabel.append(ndarray.array(self.label[i], dtype=self.dtype)) 
Example #8
Source File: test_combination.py    From pyod with BSD 2-Clause "Simplified" License
def test_aom_static_norepeat(self):
        score = aom(self.scores, 3, method='static',
                    bootstrap_estimators=False,
                    random_state=42)

        assert_equal(score.shape, (4,))

        shuffled_list = shuffle(list(range(0, 6, 1)), random_state=42)
        manual_scores = np.zeros([4, 3])
        manual_scores[:, 0] = np.max(self.scores[:, shuffled_list[0:2]],
                                     axis=1)
        manual_scores[:, 1] = np.max(self.scores[:, shuffled_list[2:4]],
                                     axis=1)
        manual_scores[:, 2] = np.max(self.scores[:, shuffled_list[4:6]],
                                     axis=1)

        manual_score = np.mean(manual_scores, axis=1)
        assert_array_equal(score, manual_score) 
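The manual reconstruction above works because shuffle() is deterministic for a fixed random_state: every call with the same seed yields the same permutation, so the test can re-derive exactly how aom() grouped the detector columns. A minimal sketch of that property:

from sklearn.utils import shuffle

a = shuffle(list(range(6)), random_state=42)
b = shuffle(list(range(6)), random_state=42)
assert a == b  # the same permutation on every call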
Example #9
Source File: test_combination.py    From pyod with BSD 2-Clause "Simplified" License
def test_moa_static_norepeat(self):
        score = moa(self.scores, 3, method='static',
                    bootstrap_estimators=False, random_state=42)

        assert_equal(score.shape, (4,))

        shuffled_list = shuffle(list(range(0, 6, 1)), random_state=42)
        manual_scores = np.zeros([4, 3])
        manual_scores[:, 0] = np.mean(self.scores[:, shuffled_list[0:2]],
                                      axis=1)
        manual_scores[:, 1] = np.mean(self.scores[:, shuffled_list[2:4]],
                                      axis=1)
        manual_scores[:, 2] = np.mean(self.scores[:, shuffled_list[4:6]],
                                      axis=1)

        manual_score = np.max(manual_scores, axis=1)
        assert_array_equal(score, manual_score) 
Example #10
Source File: lda_model.py    From redshells with MIT License
def fit(self,
            texts: List[List[str]],
            adjust_passes=True,
            test_size=0.1,
            random_state=123,
            dictionary: Optional[gensim.corpora.Dictionary] = None) -> None:
        texts = shuffle(texts)
        dictionary = dictionary or self._make_dictionary(texts)
        corpus = self._make_corpus(texts=texts, dictionary=dictionary)
        train, test = train_test_split(corpus, test_size=test_size, random_state=random_state)
        passes = np.clip(int(round(100000 / (len(corpus) + 1))), 1, 20) if adjust_passes else 1
        self._lda = gensim.models.LdaModel(
            alpha='auto',
            corpus=train,
            num_topics=self.n_topics,
            id2word=dictionary,
            iterations=self.iterations,
            passes=passes)
        self.log_perplexity = self._lda.log_perplexity(test)
        logger.info('log_perplexity=%s', self.log_perplexity) 
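The adjust_passes heuristic above scales the number of LDA training passes inversely with corpus size, clamped to the range [1, 20]. A quick check of the arithmetic (corpus sizes chosen purely for illustration):

import numpy as np

for n in (1000, 10000, 1000000):
    print(n, np.clip(int(round(100000 / (n + 1))), 1, 20))
# 1000 -> 20 (capped), 10000 -> 10, 1000000 -> 1 (floor)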
Example #11
Source File: test_weight_boosting.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert_equal(importances.shape[0], 10)
        assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(),
                     True) 
Example #12
Source File: iterators.py    From training_results_v0.6 with Apache License 2.0
def reset(self):
        """Resets the iterator to the beginning of the data."""
        self.curr_idx = 0
        # shuffle data in each bucket
        random.shuffle(self.idx)
        for i, buck in enumerate(self.sentences):
            self.indices[i], self.sentences[i], self.characters[i], self.label[i] = shuffle(self.indices[i],
                                                                                            self.sentences[i],
                                                                                            self.characters[i],
                                                                                            self.label[i])

        self.ndindex = []
        self.ndsent = []
        self.ndchar = []
        self.ndlabel = []

        # for each bucket of data
        for i, buck in enumerate(self.sentences):
            # append the lists with an array
            self.ndindex.append(ndarray.array(self.indices[i], dtype=self.dtype))
            self.ndsent.append(ndarray.array(self.sentences[i], dtype=self.dtype))
            self.ndchar.append(ndarray.array(self.characters[i], dtype=self.dtype))
            self.ndlabel.append(ndarray.array(self.label[i], dtype=self.dtype)) 
Example #13
Source File: utils.py    From adversarial-autoencoder with MIT License
def load_mnist():
    with open('mnist/train-images-idx3-ubyte', 'rb') as f:
        data = np.fromfile(file=f, dtype=np.uint8)
    X_train = data[16:].reshape(60000, 28 * 28).astype(np.float32)
    with open('mnist/train-labels-idx1-ubyte', 'rb') as f:
        data = np.fromfile(file=f, dtype=np.uint8)
    y_train = data[8:].reshape(60000).astype(np.uint8)

    with open('mnist/t10k-images-idx3-ubyte', 'rb') as f:
        data = np.fromfile(file=f, dtype=np.uint8)
    X_test = data[16:].reshape(10000, 28 * 28).astype(np.float32)
    with open('mnist/t10k-labels-idx1-ubyte', 'rb') as f:
        data = np.fromfile(file=f, dtype=np.uint8)
    y_test = data[8:].reshape(10000).astype(np.uint8)

    X_train, y_train = shuffle(X_train, y_train)
    X_test, y_test = shuffle(X_test, y_test)

    X_train /= 255.
    X_test /= 255.

    return X_train, y_train, X_test, y_test 
Example #14
Source File: test_validation.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_learning_curve_batch_and_incremental_learning_are_equal():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    train_sizes = np.linspace(0.2, 1.0, 5)
    estimator = PassiveAggressiveClassifier(max_iter=1, tol=None,
                                            shuffle=False)

    train_sizes_inc, train_scores_inc, test_scores_inc = \
        learning_curve(
            estimator, X, y, train_sizes=train_sizes,
            cv=3, exploit_incremental_learning=True)
    train_sizes_batch, train_scores_batch, test_scores_batch = \
        learning_curve(
            estimator, X, y, cv=3, train_sizes=train_sizes,
            exploit_incremental_learning=False)

    assert_array_equal(train_sizes_inc, train_sizes_batch)
    assert_array_almost_equal(train_scores_inc.mean(axis=1),
                              train_scores_batch.mean(axis=1))
    assert_array_almost_equal(test_scores_inc.mean(axis=1),
                              test_scores_batch.mean(axis=1)) 
Example #15
Source File: test_validation.py    From Mastering-Elasticsearch-7.0 with MIT License
def check_cross_val_predict_multiclass(est, X, y, method):
    """Helper for tests of cross_val_predict with multiclass classification"""
    cv = KFold(n_splits=3, shuffle=False)

    # Generate expected outputs
    float_min = np.finfo(np.float64).min
    default_values = {'decision_function': float_min,
                      'predict_log_proba': float_min,
                      'predict_proba': 0}
    expected_predictions = np.full((len(X), len(set(y))),
                                   default_values[method],
                                   dtype=np.float64)
    _, y_enc = np.unique(y, return_inverse=True)
    for train, test in cv.split(X, y_enc):
        est = clone(est).fit(X[train], y_enc[train])
        fold_preds = getattr(est, method)(X[test])
        i_cols_fit = np.unique(y_enc[train])
        expected_predictions[np.ix_(test, i_cols_fit)] = fold_preds

    # Check actual outputs for several representations of y
    for tg in [y, y + 1, y - 2, y.astype('str')]:
        assert_allclose(cross_val_predict(est, X, tg, method=method, cv=cv),
                        expected_predictions) 
Example #16
Source File: dataset.py    From Neural-Network-Programming-with-TensorFlow with MIT License
def read_train_sets(train_path, image_size, classes, validation_size=0):
  class DataSets(object):
    pass
  data_sets = DataSets()

  images, labels, ids, cls = load_train(train_path, image_size, classes)
  images, labels, ids, cls = shuffle(images, labels, ids, cls)  # shuffle the data

  if isinstance(validation_size, float):
    validation_size = int(validation_size * images.shape[0])

  validation_images = images[:validation_size]
  validation_labels = labels[:validation_size]
  validation_ids = ids[:validation_size]
  validation_cls = cls[:validation_size]

  train_images = images[validation_size:]
  train_labels = labels[validation_size:]
  train_ids = ids[validation_size:]
  train_cls = cls[validation_size:]

  data_sets.train = DataSet(train_images, train_labels, train_ids, train_cls)
  data_sets.valid = DataSet(validation_images, validation_labels, validation_ids, validation_cls)

  return data_sets 
Example #17
Source File: dataset.py    From Neural-Network-Programming-with-TensorFlow with MIT License
def next_batch(self, batch_size):
    """Return the next `batch_size` examples from this data set."""
    start = self._index_in_epoch
    self._index_in_epoch += batch_size

    if self._index_in_epoch > self._num_examples:
      # Finished epoch
      self._epochs_completed += 1

      # # Shuffle the data (maybe)
      # perm = np.arange(self._num_examples)
      # np.random.shuffle(perm)
      # self._images = self._images[perm]
      # self._labels = self._labels[perm]
      # Start next epoch

      start = 0
      self._index_in_epoch = batch_size
      assert batch_size <= self._num_examples
    end = self._index_in_epoch

    return self._images[start:end], self._labels[start:end], self._ids[start:end], self._cls[start:end] 
Example #18
Source File: dataset.py    From cv-tricks.com with MIT License
def read_train_sets(train_path, image_size, classes, validation_size):
  class DataSets(object):
    pass
  data_sets = DataSets()

  images, labels, img_names, cls = load_train(train_path, image_size, classes)
  images, labels, img_names, cls = shuffle(images, labels, img_names, cls)  

  if isinstance(validation_size, float):
    validation_size = int(validation_size * images.shape[0])

  validation_images = images[:validation_size]
  validation_labels = labels[:validation_size]
  validation_img_names = img_names[:validation_size]
  validation_cls = cls[:validation_size]

  train_images = images[validation_size:]
  train_labels = labels[validation_size:]
  train_img_names = img_names[validation_size:]
  train_cls = cls[validation_size:]

  data_sets.train = DataSet(train_images, train_labels, train_img_names, train_cls)
  data_sets.valid = DataSet(validation_images, validation_labels, validation_img_names, validation_cls)

  return data_sets 
Example #19
Source File: train_model.py    From laughter-detection with MIT License
def evaluate_on_parts(data_parts, label_parts, name):
    #train_data_parts, train_label_parts = shuffle(train_data_parts, train_label_parts, random_state=0)
    i = 0
    accs = []
    while i < len(data_parts):
        #if i % 10000 == 0: print(i)
        X_subset, y_subset = get_data_subset(data_parts, label_parts, i, i+100)
        #model.fit(X_subset,y_subset,shuffle=True,batch_size = 2000, epochs=1,verbose=False)
        acc = model.evaluate(X_subset, y_subset, verbose=False)[1]
        accs.append(acc)
        i += 100
    print("%s accuracy %f" % (name, np.mean(accs)))
    return np.mean(accs)
Example #20
Source File: value_function.py    From PPO-Stein-Control-Variate with MIT License
def fit(self, x, y):
        """ Fit model to current data batch + previous data batch

        Args:
            x: features
            y: target
        """
        num_batches = max(x.shape[0] // 256, 1)
        batch_size = x.shape[0] // num_batches
        y_hat = self.predict(x)  # check explained variance prior to update
        old_exp_var = 1 - np.var(y - y_hat) / np.var(y)
        if self.replay_buffer_x is None:
            x_train, y_train = x, y
        else:
            x_train = np.concatenate([x, self.replay_buffer_x])
            y_train = np.concatenate([y, self.replay_buffer_y])
        self.replay_buffer_x = x
        self.replay_buffer_y = y
        for e in range(self.epochs):
            x_train, y_train = shuffle(x_train, y_train)
            for j in range(num_batches):
                start = j * batch_size
                end = (j + 1) * batch_size
                feed_dict = {self.obs_ph: x_train[start:end, :],
                             self.val_ph: y_train[start:end]}
                _, l = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
        y_hat = self.predict(x)
        loss = np.mean(np.square(y_hat - y))         # MSE of the value function after update
        exp_var = 1 - np.var(y - y_hat) / np.var(y)  # diagnose over-fitting of val func

        logger.record_dicts({
            'VarFuncLoss': loss,
            'ExplainedVarNew': exp_var,
            'ExplainedVarOld': old_exp_var}) 
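The diagnostic logged above is explained variance, 1 - Var(y - y_hat) / Var(y): 1.0 means a perfect fit, while 0.0 means the value function predicts no better than the mean of y. A standalone sketch of the same computation (the function name is ours, not from the project):

import numpy as np

def explained_variance(y, y_hat):
    # 1.0 = perfect fit; 0.0 = no better than predicting the mean of y
    return 1 - np.var(y - y_hat) / np.var(y)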
Example #21
Source File: train_model.py    From laughter-detection with MIT License
def train_on_parts(train_data_parts, train_label_parts, name):
    train_data_parts, train_label_parts = shuffle(train_data_parts, train_label_parts, random_state=0)
    i = 0
    accs = []
    while i < len(train_data_parts):
        #print(i)
        X_subset, y_subset = get_data_subset(train_data_parts, train_label_parts, i, i+2000)
        model.fit(X_subset, y_subset, shuffle=True, batch_size=500, epochs=1, verbose=False)
        acc = model.evaluate(X_subset, y_subset, verbose=False)[1]
        accs.append(acc)
        #print(np.mean(accs))
        i += 2000
    print("%s accuracy %f" % (name, np.mean(accs))) 
Example #22
Source File: image_loading.py    From classifying-cancer with GNU General Public License v3.0
def read_img_sets(image_dir, image_size, validation_size=0):
    class DataSets:
        pass

    data_sets = DataSets()

    images, labels, ids, cls, cls_map = load_data(image_dir, image_size)
    images, labels, ids, cls = shuffle(images, labels, ids, cls)

    if isinstance(validation_size, float):
        validation_size = int(validation_size * images.shape[0])

    test_images = images[:validation_size]
    test_labels = labels[:validation_size]
    test_ids = ids[:validation_size]
    test_cls = cls[:validation_size]

    train_images = images[validation_size:]
    train_labels = labels[validation_size:]
    train_ids = ids[validation_size:]
    train_cls = cls[validation_size:]

    data_sets.train = DataSet(train_images, train_labels, train_ids, train_cls)
    data_sets.test = DataSet(test_images, test_labels, test_ids, test_cls)

    return data_sets, cls_map 
Example #23
Source File: wordvec_regressor.py    From Wordbatch with GNU General Public License v2.0
def fit_batch(self, texts, labels, rcount):
		texts, labels = shuffle(texts, labels)
		print("Transforming", rcount)
		#texts= self.wb.fit_transform(texts, tn__batcher=self.batcher, dct__reset= False, dct__batcher= self.batcher)
		texts = self.wb.fit_transform(texts)
		print("Training", rcount)
		self.clf.fit(texts, labels, reset= False) 
Example #24
Source File: generator.py    From KerasDeepSpeech with GNU Affero General Public License v3.0
def genshuffle(self):
        self.wavpath, self.transcript, self.finish = shuffle(self.wavpath,
                                                             self.transcript,
                                                             self.finish) 
Example #25
Source File: generator.py    From KerasDeepSpeech with GNU Affero General Public License v3.0
def shuffle_data(self):
    self.wavpath, self.transcript, self.finish = shuffle(self.wavpath,
                                                         self.transcript,
                                                         self.finish)
    return 
Example #26
Source File: shrink.py    From CarND-Transfer-Learning-Lab with MIT License
def main(_):
    # load bottleneck data
    with open(FLAGS.training_file, 'rb') as f:
        train_data = pickle.load(f)
    X_train = train_data['features']
    y_train = train_data['labels']

    print(X_train.shape, y_train.shape)

    X_train, y_train = shuffle(X_train, y_train, random_state=0)
    keep_indices = []
    keep_counter = Counter()

    for i, label in enumerate(y_train.reshape(-1)):
        if keep_counter[label] < FLAGS.size:
            keep_counter[label] += 1
            keep_indices.append(i)

    X_train_small = X_train[keep_indices]
    y_train_small = y_train[keep_indices]

    print(X_train_small.shape, y_train_small.shape)

    print("Writing to {}".format(FLAGS.output_file))
    data = {'features': X_train_small, 'labels': y_train_small}
    pickle.dump(data, open(FLAGS.output_file, 'wb'))

# parses flags and calls the `main` function above 
Example #27
Source File: test_multiclass.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_ovo_ties2():
    # test that ties can not only be won by the first two labels
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y_ref = np.array([2, 0, 1, 2])

    # cycle through labels so that each label wins once
    for i in range(3):
        y = (y_ref + i) % 3
        multi_clf = OneVsOneClassifier(Perceptron(shuffle=False, max_iter=4,
                                                  tol=None))
        ovo_prediction = multi_clf.fit(X, y).predict(X)
        assert_equal(ovo_prediction[0], i % 3) 
Example #28
Source File: test_multiclass.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_ovo_ties():
    # Test that ties are broken using the decision function,
    # not defaulting to the smallest label
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])
    multi_clf = OneVsOneClassifier(Perceptron(shuffle=False, max_iter=4,
                                              tol=None))
    ovo_prediction = multi_clf.fit(X, y).predict(X)
    ovo_decision = multi_clf.decision_function(X)

    # Classifiers are in order 0-1, 0-2, 1-2
    # Use decision_function to compute the votes and the normalized
    # sum_of_confidences, which is used to disambiguate when there is a tie in
    # votes.
    votes = np.round(ovo_decision)
    normalized_confidences = ovo_decision - votes

    # For the first point, there is one vote per class
    assert_array_equal(votes[0, :], 1)
    # For the rest, there is no tie and the prediction is the argmax
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # For the tie, the prediction is the class with the highest score
    assert_equal(ovo_prediction[0], normalized_confidences[0].argmax())


Example #29
Source File: mnist.py    From skorch with BSD 3-Clause "New" or "Revised" License
def get_data(num_samples):
    # as_frame=False asks for NumPy arrays; on newer scikit-learn (>= 0.24)
    # fetch_openml would otherwise return a DataFrame and .reshape would fail
    mnist = fetch_openml('mnist_784', as_frame=False)
    torch.manual_seed(0)
    X = mnist.data.astype('float32').reshape(-1, 1, 28, 28)
    y = mnist.target.astype('int64')
    X, y = shuffle(X, y)
    X, y = X[:num_samples], y[:num_samples]
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    X_train /= 255
    X_test /= 255
    return X_train, X_test, y_train, y_test