Python sklearn.model_selection.StratifiedShuffleSplit() Examples

The following are code examples showing how to use sklearn.model_selection.StratifiedShuffleSplit(). They are extracted from open source Python projects.
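
Before the project examples, here is a minimal, self-contained sketch of the basic pattern most of them follow. It is not taken from any of the projects below; the toy X and y arrays and the parameter values are purely illustrative.

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

X = np.arange(20).reshape(10, 2)   # 10 samples, 2 features
y = np.array([0] * 6 + [1] * 4)    # imbalanced binary labels

# Each split is an independent random shuffle that preserves the 6:4 class
# ratio of y in both the train and test index arrays.
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]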

Example 1
Project: Neural_Temporality_Adaptation   Author: xiaoleihuang   File: data_helper.py    Apache License 2.0
def stratified_split(dataset, test=0.2, label_idx=-1, cv_num=1):
    """
    random stratified split by label, will update for year later
    :param dataset: list of dataset
    :param test:
    :param label_idx:
    :param cv_num: the number of cross_validation
    :return:
    """
    sss = StratifiedShuffleSplit(n_splits=cv_num, test_size=test, random_state=0)
    y = [item[label_idx] for item in dataset]

    train_idx, test_idx = [], []

    for train_index, test_index in sss.split(dataset, y):  # avoid shadowing the `test` parameter
        train_idx.append(train_index)
        test_idx.append(test_index)
    return train_idx, test_idx 
Example 2
Project: cwcf   Author: jaromiru   File: hpc_svm.py    MIT License
def get_full_rbf_svm_clf(train_x, train_y, c_range=None, gamma_range=None):
		param_grid = dict(gamma=gamma_range, C=c_range)
		cv = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
		grid = GridSearchCV(SVC(cache_size=1024), param_grid=param_grid, cv=cv, n_jobs=14, verbose=10)
		grid.fit(train_x, train_y)
		
		print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))
		
		scores = grid.cv_results_['mean_test_score'].reshape(len(c_range), len(gamma_range))
		print("Scores:")
		print(scores)
		
		print("c_range:", c_range)
		print("gamma_range:", gamma_range)

		c_best = grid.best_params_['C']
		gamma_best = grid.best_params_['gamma']

		clf = SVC(C=c_best, gamma=gamma_best, verbose=True)
		return clf

#---------------- 
Example 3
Project: brain-score   Author: brain-score   File: transformations.py    MIT License
def __init__(self,
                 splits=Defaults.splits, train_size=None, test_size=None,
                 split_coord=Defaults.split_coord, stratification_coord=Defaults.stratification_coord,
                 unique_split_values=Defaults.unique_split_values, random_state=Defaults.random_state):
        super().__init__()
        if train_size is None and test_size is None:
            train_size = self.Defaults.train_size
        self._stratified_split = StratifiedShuffleSplit(
            n_splits=splits, train_size=train_size, test_size=test_size, random_state=random_state)
        self._shuffle_split = ShuffleSplit(
            n_splits=splits, train_size=train_size, test_size=test_size, random_state=random_state)
        self._split_coord = split_coord
        self._stratification_coord = stratification_coord
        self._unique_split_values = unique_split_values

        self._logger = logging.getLogger(fullname(self)) 
Example 4
Project: BioCompoundML   Author: sandialabs   File: cross_validate.py    BSD 3-Clause "New" or "Revised" License
def cross_validate(self, n_iter=10):
        '''Actually run the cross validation'''
        cv = StratifiedShuffleSplit(n_splits=n_iter, test_size=0.5)
        (acc, prec, recall, roc) = fit_and_score_CV(self.clf, self.train,
                                                    self.y, cv, self.weights)
        print_line = "For " + str(n_iter) + " resamples at 50%"
        verbose_print(self.verbose, print_line)
        print_line = ("\tAccuracy: %0.4f +/- %0.4f" % (np.mean(acc),
                                                       np.std(acc) * 2))
        verbose_print(self.verbose, print_line)
        print_line = ("\tPrecision: %0.4f +/- %0.4f" % (np.mean(prec),
                                                        np.std(prec) * 2))
        verbose_print(self.verbose, print_line)
        print_line = ("\tRecall: %0.4f +/- %0.4f" % (np.mean(recall),
                                                     np.std(recall) * 2))
        verbose_print(self.verbose, print_line)
        print_line = ("\tReceiver Operator, AUC: %0.4f +/- %0.4f" %
                      (np.mean(roc),
                       np.std(roc) * 2))
        verbose_print(self.verbose, print_line)
        self.acc = np.mean(acc)
        self.prec = np.mean(prec)
        self.recall = np.mean(recall)
        self.roc = np.mean(roc) 
Example 5
Project: hominid   Author: blekhmanlab   File: stability_selection.py    MIT License
def select_features(aligned_snp_df, aligned_taxa_df, lo_alpha_coef):
    X = aligned_taxa_df.values
    y = aligned_snp_df.values.flatten()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        #lars_cv = LassoLarsCV(cv=StratifiedShuffleSplit(y, n_iter=100, test_size=0.2)).fit(X, y)
        lars_cv = LassoLarsCV(cv=StratifiedShuffleSplit(n_splits=100, test_size=0.2)).fit(X, y)

    print('lars_cv.alphas_: {}'.format(lars_cv.alphas_))
    alphas = np.linspace(lars_cv.alphas_[0], lo_alpha_coef * lars_cv.alphas_[0], 10)
    print('alphas: {}'.format(alphas))
    clf = RandomizedLasso(
        alpha=alphas,
        sample_fraction=0.8,
        n_resampling=1000
        #random_state=13
    ).fit(X, y)

    feature_scores_df = pd.DataFrame(clf.scores_, index=aligned_taxa_df.columns)

    return feature_scores_df, lars_cv.alphas_ 
Example 6
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_stratified_shuffle_split_overlap_train_test_bug():
    # See https://github.com/scikit-learn/scikit-learn/issues/6121 for
    # the original bug report
    y = [0, 1, 2, 3] * 3 + [4, 5] * 5
    X = np.ones_like(y)

    sss = StratifiedShuffleSplit(n_splits=1,
                                 test_size=0.5, random_state=0)

    train, test = next(sss.split(X=X, y=y))

    # no overlap
    assert_array_equal(np.intersect1d(train, test), [])

    # complete partition
    assert_array_equal(np.union1d(train, test), np.arange(len(y))) 
Example 7
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_search.py    Apache License 2.0
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y) 
Example 8
Project: pydl   Author: rafaeltg   File: methods.py    MIT License
def get_cv_method(method, **kwargs):

    if method == 'kfold':
        return KFold(**kwargs)
    elif method == 'skfold':
        return StratifiedKFold(**kwargs)
    elif method == 'loo':
        return LeaveOneOut()
    elif method == 'shuffle_split':
        return ShuffleSplit(**kwargs)
    elif method == 'split':
        return TrainTestSplit(**kwargs)
    elif method == 's_shuffle_split':
        return StratifiedShuffleSplit(**kwargs)
    elif method == 'time_series':
        return TimeSeriesSplit(**kwargs)
    else:
        raise AttributeError('Invalid CV method - %s!' % method) 
Example 9
Project: spae   Author: arranger1044   File: enc_dec_classify_repr.py    GNU General Public License v3.0
def unlabel_instances(X, y, n_labelled=100, unlabel=-1, rand_gen=None, max_labelled_prop=0.9):

    if rand_gen is None:
        rand_gen = numpy.random.RandomState(1337)

    n_instances = X.shape[0]
    assert y.shape[0] == n_instances, "Non-matching number of instances {} {}".format(y.shape[0], n_instances)

    processed_y = numpy.zeros(y.shape, dtype=numpy.int32)
    processed_y[:] = unlabel

    labelled_prop = min(max_labelled_prop, n_labelled / n_instances)
    if labelled_prop > 0.0:
        sss = StratifiedShuffleSplit(n_splits=1, test_size=labelled_prop, random_state=rand_gen)
        unlabelled_ids, labelled_ids = list(sss.split(X, y))[0]

        processed_y[labelled_ids] = y[labelled_ids]

    logging.info('Number of labelled instances {}'.format((processed_y != unlabel).sum()))

    return processed_y 
Example 10
Project: dac   Author: KBNLresearch   File: models.py    GNU General Public License v3.0
def validate(self):
        '''
        Ten-fold cross-validation with stratified sampling.
        '''
        print('Validating new model: {}()'.format(self.__class__.__name__))

        accuracy_scores = []
        precision_scores = []
        recall_scores = []
        f1_scores = []

        sss = StratifiedShuffleSplit(n_splits=10)
        for train_index, test_index in sss.split(self.data, self.labels):
            x_train, x_test = self.data[train_index], self.data[test_index]
            y_train, y_test = self.labels[train_index], self.labels[test_index]
            self.model.fit(x_train, y_train)

            y_pred = self.model.predict(x_test)
            accuracy_scores.append(accuracy_score(y_test, y_pred))
            precision_scores.append(precision_score(y_test, y_pred))
            recall_scores.append(recall_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred))

        print('Accuracy: {}'.format(np.mean(accuracy_scores)))
        print('Precision: {}'.format(np.mean(precision_scores)))
        print('Recall: {}'.format(np.mean(recall_scores)))
        print('F1-measure: {}'.format(np.mean(f1_scores))) 
Example 11
Project: dac   Author: KBNLresearch   File: models.py    GNU General Public License v3.0
def validate(self):
        '''
        Ten-fold cross-validation with stratified sampling.
        '''
        print('Validating new model: {}()'.format(self.__class__.__name__))

        accuracy_scores = []
        precision_scores = []
        recall_scores = []
        f1_scores = []

        sss = StratifiedShuffleSplit(n_splits=10)
        for train_index, test_index in sss.split(self.data, self.labels):
            x_train, x_test = self.data[train_index], self.data[test_index]
            y_train, y_test = self.labels[train_index], self.labels[test_index]

            model = self.create_model()
            model.fit(x_train, y_train, epochs=100, batch_size=128,
                      class_weight=self.class_weight)
            y_pred = model.predict_classes(x_test, batch_size=128)

            accuracy_scores.append(accuracy_score(y_test, y_pred))
            precision_scores.append(precision_score(y_test, y_pred))
            recall_scores.append(recall_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred))

        print('')
        print('Accuracy: {}'.format(np.mean(accuracy_scores)))
        print('Precision: {}'.format(np.mean(precision_scores)))
        print('Recall: {}'.format(np.mean(recall_scores)))
        print('F1-measure: {}'.format(np.mean(f1_scores))) 
Example 12
Project: dac   Author: KBNLresearch   File: models.py    GNU General Public License v3.0
def validate(self):
        '''
        Ten-fold cross-validation with stratified sampling.
        '''
        print('Validating new model: {}()'.format(self.__class__.__name__))

        accuracy_scores = []
        precision_scores = []
        recall_scores = []
        f1_scores = []

        sss = StratifiedShuffleSplit(n_splits=10)

        for train_index, test_index in sss.split(self.data[0], self.labels):
            x_train_0, x_test_0 = (self.data[0][train_index],
                                   self.data[0][test_index])
            x_train_1, x_test_1 = (self.data[1][train_index],
                                   self.data[1][test_index])
            x_train_2, x_test_2 = (self.data[2][train_index],
                                   self.data[2][test_index])

            y_train, y_test = self.labels[train_index], self.labels[test_index]

            model = self.create_model()
            model.fit([x_train_0, x_train_1, x_train_2], y_train, epochs=10,
                      batch_size=128, class_weight=self.class_weight)

            y_pred = model.predict([x_test_0, x_test_1, x_test_2],
                                   batch_size=128)
            y_pred = [1 if y[0] > 0.5 else 0 for y in y_pred]

            accuracy_scores.append(accuracy_score(y_test, y_pred))
            precision_scores.append(precision_score(y_test, y_pred))
            recall_scores.append(recall_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred))

        print('')
        print('Accuracy: {}'.format(np.mean(accuracy_scores)))
        print('Precision: {}'.format(np.mean(precision_scores)))
        print('Recall: {}'.format(np.mean(recall_scores)))
        print('F1-measure: {}'.format(np.mean(f1_scores))) 
Example 13
Project: tensorflow-nlp   Author: alsora   File: train_text_classification.py    MIT License
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    files_list = FLAGS.data.split(",")
    x_text, y_text = load_utils.load_data_and_labels(files_list)

    word_dict, reversed_dict = vocab_utils.build_dict_words(x_text, "text_classification", FLAGS.model_dir)
    labels_dict, _ = vocab_utils.build_dict_labels(y_text, FLAGS.model_dir)

    x = vocab_utils.transform_text(x_text, word_dict)
    y = vocab_utils.transform_labels(y_text, labels_dict)

    x = np.array(x)
    y = np.array(y)

    # Randomly shuffle data
    sss = StratifiedShuffleSplit(n_splits=1, test_size=FLAGS.dev_sample_percentage, random_state=None)
    for train_index, valid_index in sss.split(x, y):
        x_train, x_valid =  x[train_index], x[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

    del x, y

    print("Vocabulary Size: {:d}".format(len(word_dict)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_valid)))
    
    return x_train, y_train, word_dict, reversed_dict, x_valid, y_valid 
Example 14
Project: aws-ml-experimenter   Author: rikturr   File: helpers.py    MIT License
def monte_carlo(pipeline, x, y, n_runs, random_state, df=True):
    sss = StratifiedShuffleSplit(n_splits=n_runs, test_size=0.3, random_state=random_state)
    out = []
    for train_index, test_index in sss.split(x, y):
        if df:
            x_train, y_train = x.iloc[train_index], y.iloc[train_index]
            x_test, y_test = x.iloc[test_index], y.iloc[test_index]
        else:
            x_train, y_train = x[train_index, :], y[train_index]
            x_test, y_test = x[test_index, :], y[test_index]
        pipeline.fit(x_train, y_train)
        predicted = pipeline.predict_proba(x_test)
        predicted = predicted[:, 1] if len(predicted.shape) > 1 else predicted
        out.append(pd.DataFrame({'predicted': predicted, 'actual': y_test, 'run': [len(out)] * x_test.shape[0]}))
    return pd.concat(out) 
Example 15
Project: iglovikov_helper_functions   Author: ternaus   File: generate_splits.py    MIT License
def stratified_sampling(Y, split, random_state=2016):
    train_inx = []
    valid_inx = []

    n_classes = Y.shape[1]
    inx = np.arange(Y.shape[0])
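    # Y is a binary indicator matrix with one column per class; stratify on each column in turn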

    for i in range(0, n_classes):
        sss = StratifiedShuffleSplit(n_splits=1, test_size=split, random_state=random_state + i)
        b_train_inx, b_valid_inx = next(sss.split(inx, Y[:, i]))
        # to ensure there is no repetition within each split and between the splits
        train_inx = train_inx + list(set(list(b_train_inx)) - set(train_inx) - set(valid_inx))
        valid_inx = valid_inx + list(set(list(b_valid_inx)) - set(train_inx) - set(valid_inx))

    return np.array(train_inx), np.array(valid_inx) 
Example 16
Project: skorch   Author: skorch-dev   File: dataset.py    BSD 3-Clause "New" or "Revised" License
def _is_stratified(self, cv):
        return isinstance(cv, (StratifiedKFold, StratifiedShuffleSplit)) 
Example 17
Project: skorch   Author: skorch-dev   File: dataset.py    BSD 3-Clause "New" or "Revised" License
def _check_cv_float(self):
        cv_cls = StratifiedShuffleSplit if self.stratified else ShuffleSplit
        return cv_cls(test_size=self.cv, random_state=self.random_state) 
Example 18
Project: AdaptApprox   Author: fnan   File: adaptive_sparse_helpers.py    MIT License
def get_full_rbf_svm_clf(dataset, X_train, y_train, retrain=False, C_range=None, gamma_range=None):
    if retrain:
#        if C_range is None:
#            C_range = np.logspace(-2, 10, 13)
#        if gamma_range is None:
#            gamma_range = np.logspace(-9, 3, 13)
        if C_range is None:
            c_best = 1
            gamma_best = 1
        else:
            param_grid = dict(gamma=gamma_range, C=C_range)
            cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
            grid = GridSearchCV(SVC(verbose=True), param_grid=param_grid, cv=cv, n_jobs=40,verbose=10)
            grid.fit(X_train, y_train)
            
            print("The best parameters are %s with a score of %0.2f"
                  % (grid.best_params_, grid.best_score_))
            c_best = grid.best_params_['C']
            gamma_best = grid.best_params_['gamma']

        clf = SVC(C=c_best, gamma=gamma_best, probability=True)
        clf.fit(X_train, y_train)    
        joblib.dump(clf, '%s_rbf_svm.pkl'%dataset)
        
        return clf
    else:
        clf = joblib.load('%s_rbf_svm.pkl'%dataset)
        return clf 
Example 19
Project: Baidu_XJTU_2018_company_logo_recognition   Author: JingangLang   File: densenet.py    MIT License
def train_model_on_k_fold(get_model, train_model, X, y, n_folds=5):
    # sss = StratifiedShuffleSplit(n_splits=n_folds,test_size=0.16,random_state=42)
    # sss = StratifiedKFold(n_splits=n_folds,random_state=42,shuffle=True)
    n_models = 0
    models = []
    for k in range(n_folds):
        print("Building model...")
        models.append(get_model())
        print("Trainning model...")
        train_model(models[len(models) - 1], X, y, model_sn=n_models)

        n_models += 1
    # for k,(train_index, test_index) in enumerate(sss.split(X, y)):
    #     print(k)
    #     print("Using {} for training and {} for validation".format(len(train_index), len(test_index)))
    #     x_train, x_valid = X[train_index], X[test_index]
    #     y_train, y_valid = y[train_index], y[test_index]
    #
    #     print("Building model...")
    #     models.append(get_model())
    #     print("Trainning model...")
    #     train_model(models[len(models) - 1], x_train, y_train, x_valid, y_valid, model_sn=n_models)
    #
    #     n_models += 1

    return models 
Example 20
Project: autoreject   Author: autoreject   File: autoreject.py    BSD 3-Clause "New" or "Revised" License
def _compute_thresholds(epochs, method='bayesian_optimization',
                        random_state=None, picks=None, augment=True,
                        dots=None, verbose='progressbar', n_jobs=1):
    if method not in ['bayesian_optimization', 'random_search']:
        raise ValueError('`method` param not recognized')
    picks = _handle_picks(info=epochs.info, picks=picks)
    _check_data(epochs, picks, verbose=verbose,
                ch_constraint='data_channels')
    picks_by_type = _get_picks_by_type(picks=picks, info=epochs.info)
    picks_by_type = None if len(picks_by_type) == 1 else picks_by_type  # XXX
    if picks_by_type is not None:
        threshes = dict()
        for ch_type, this_picks in picks_by_type:
            threshes.update(_compute_thresholds(
                epochs=epochs, method=method, random_state=random_state,
                picks=this_picks, augment=augment, dots=dots,
                verbose=verbose, n_jobs=n_jobs))
    else:
        n_epochs = len(epochs)
        data, y = epochs.get_data(), np.ones((n_epochs, ))
        if augment:
            epochs_interp = _clean_by_interp(epochs, picks=picks,
                                             dots=dots, verbose=verbose)
            # non-data channels will be duplicate
            data = np.concatenate((epochs.get_data(),
                                   epochs_interp.get_data()), axis=0)
            y = np.r_[np.zeros((n_epochs, )), np.ones((n_epochs, ))]
        cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2,
                                    random_state=random_state)

        ch_names = epochs.ch_names

        my_thresh = delayed(_compute_thresh)
        parallel = Parallel(n_jobs=n_jobs, verbose=0)
        desc = 'Computing thresholds ...'
        threshes = parallel(
            my_thresh(data[:, pick], cv=cv, method=method, y=y,
                      random_state=random_state)
            for pick in _pbar(picks, desc=desc, verbose=verbose))
        threshes = {ch_names[p]: thresh for p, thresh in zip(picks, threshes)}
    return threshes 
Example 21
Project: torchetl   Author: amajidsinar   File: etl.py    MIT License
def _stratify_sampling(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
		"""Sklearn stratified sampling uses a whole array so we must build it first

		Parameters
		----------
		None

		Returns
		-------
		Tuple of train(X, y), validation(X, y), and test(X, y)	
		"""
		x, y = self._create_dataset_array()
		sss = StratifiedShuffleSplit(n_splits=1, train_size=self.training_size, test_size=self.test_size, random_state=self.random_state)

		for train_index, validation_test_index in sss.split(x, y):
			x_train, x_validation_test = x[train_index], x[validation_test_index]
			y_train, y_validation_test = y[train_index], y[validation_test_index]

		sss = StratifiedShuffleSplit(n_splits=1, train_size=0.5, test_size=0.5,random_state=self.random_state)
		for validation_index, test_index in sss.split(x_validation_test, y_validation_test):
			x_validation, x_test = x_validation_test[validation_index], x_validation_test[test_index]
			y_validation, y_test = y_validation_test[validation_index], y_validation_test[test_index]

		train = np.c_[x_train, y_train]
		validation = np.c_[x_validation, y_validation]
		test = np.c_[x_test, y_test]

		if self.verbose:
			print("Finished splitting dataset into train, validation, and test")
		return train, validation, test 
Example 22
Project: torchetl   Author: amajidsinar   File: etl.py    MIT License
def stratify(x, y, train_size, random_state=69):
        sss = StratifiedShuffleSplit(n_splits=1, train_size=train_size, test_size=1-train_size, random_state=random_state)
        Partition = namedtuple('Partition', ['x_train', 'y_train', 'x_test', 'y_test'])
        for train_index, test_index in sss.split(x, y):
            partition = Partition
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]

        partition = Partition(x_train, y_train, x_test, y_test)
        return partition 
Example 23
Project: torchetl   Author: amajidsinar   File: etl.py    MIT License
def stratify(x, y, train_size, random_state=69):
        sss = StratifiedShuffleSplit(n_splits=1, train_size=train_size, test_size=1-train_size, random_state=random_state)
        Partition = namedtuple('Partition', ['x_train', 'y_train', 'x_test', 'y_test'])
        for train_index, test_index in sss.split(x, y):
            partition = Partition
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]

        partition = Partition(x_train, y_train, x_test, y_test)
        return partition 
Example 24
Project: torchetl   Author: amajidsinar   File: etl-checkpoint.py    MIT License
def _stratify_sampling(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
		"""Sklearn stratified sampling uses a whole array so we must build it first

		Parameters
		----------
		None

		Returns
		-------
		Tuple of train(X, y), validation(X, y), and test(X, y)	
		"""
		x, y = self._create_dataset_array()
		sss = StratifiedShuffleSplit(n_splits=1, train_size=self.training_size, test_size=self.test_size, random_state=self.random_state)

		for train_index, validation_test_index in sss.split(x, y):
			x_train, x_validation_test = x[train_index], x[validation_test_index]
			y_train, y_validation_test = y[train_index], y[validation_test_index]

		sss = StratifiedShuffleSplit(n_splits=1, train_size=0.5, test_size=0.5,random_state=self.random_state)
		for validation_index, test_index in sss.split(x_validation_test, y_validation_test):
			x_validation, x_test = x_validation_test[validation_index], x_validation_test[test_index]
			y_validation, y_test = y_validation_test[validation_index], y_validation_test[test_index]

		train = np.c_[x_train, y_train]
		validation = np.c_[x_validation, y_validation]
		test = np.c_[x_test, y_test]

		if self.verbose:
			print("Finished splitting dataset into train, validation, and test")
		return train, validation, test 
Example 25
Project: torchetl   Author: amajidsinar   File: mark_and_recall-checkpoint.py    MIT License
def _stratify_sampling(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
		"""Sklearn stratified sampling uses a whole array so we must build it first

		Parameters
		----------
		None

		Returns
		-------
		Tuple of train(X, y), validation(X, y), and test(X, y)	
		"""
		x, y = self._create_dataset_array()
		sss = StratifiedShuffleSplit(n_splits=1, train_size=self.training_size, test_size=self.test_size, random_state=self.random_state)

		for train_index, validation_test_index in sss.split(x, y):
			x_train, x_validation_test = x[train_index], x[validation_test_index]
			y_train, y_validation_test = y[train_index], y[validation_test_index]

		sss = StratifiedShuffleSplit(n_splits=1, train_size=0.5, test_size=0.5,random_state=self.random_state)
		for validation_index, test_index in sss.split(x_validation_test, y_validation_test):
			x_validation, x_test = x_validation_test[validation_index], x_validation_test[test_index]
			y_validation, y_test = y_validation_test[validation_index], y_validation_test[test_index]

		train = np.c_[x_train, y_train]
		validation = np.c_[x_validation, y_validation]
		test = np.c_[x_test, y_test]

		if self.verbose:
			print("Finished splitting dataset into train, validation, and test")
		return train, validation, test 
Example 26
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_stratified_shuffle_split_init():
    X = np.arange(7)
    y = np.asarray([0, 1, 1, 1, 2, 2, 2])
    # Check that error is raised if there is a class with only one sample
    assert_raises(ValueError, next,
                  StratifiedShuffleSplit(3, 0.2).split(X, y))

    # Check that error is raised if the test set size is smaller than n_classes
    assert_raises(ValueError, next, StratifiedShuffleSplit(3, 2).split(X, y))
    # Check that error is raised if the train set size is smaller than
    # n_classes
    assert_raises(ValueError, next,
                  StratifiedShuffleSplit(3, 3, 2).split(X, y))

    X = np.arange(9)
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Check that errors are raised if there is not enough samples
    assert_raises(ValueError, StratifiedShuffleSplit, 3, 0.5, 0.6)
    assert_raises(ValueError, next,
                  StratifiedShuffleSplit(3, 8, 0.6).split(X, y))
    assert_raises(ValueError, next,
                  StratifiedShuffleSplit(3, 0.6, 8).split(X, y))

    # Train size or test size too small
    assert_raises(ValueError, next,
                  StratifiedShuffleSplit(train_size=2).split(X, y))
    assert_raises(ValueError, next,
                  StratifiedShuffleSplit(test_size=2).split(X, y)) 
Example 27
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_stratified_shuffle_split_respects_test_size():
    y = np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2])
    test_size = 5
    train_size = 10
    sss = StratifiedShuffleSplit(6, test_size=test_size, train_size=train_size,
                                 random_state=0).split(np.ones(len(y)), y)
    for train, test in sss:
        assert_equal(len(train), train_size)
        assert_equal(len(test), test_size) 
Example 28
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_stratified_shuffle_split_iter():
    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
          np.array([-1] * 800 + [1] * 50),
          np.concatenate([[i] * (100 + i) for i in range(11)]),
          [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3],
          ['1', '1', '1', '1', '2', '2', '2', '3', '3', '3', '3', '3'],
          ]

    for y in ys:
        sss = StratifiedShuffleSplit(6, test_size=0.33,
                                     random_state=0).split(np.ones(len(y)), y)
        y = np.asanyarray(y)  # To make it indexable for y[train]
        # this is how test-size is computed internally
        # in _validate_shuffle_split
        test_size = np.ceil(0.33 * len(y))
        train_size = len(y) - test_size
        for train, test in sss:
            assert_array_equal(np.unique(y[train]), np.unique(y[test]))
            # Checks if folds keep classes proportions
            p_train = (np.bincount(np.unique(y[train],
                                   return_inverse=True)[1]) /
                       float(len(y[train])))
            p_test = (np.bincount(np.unique(y[test],
                                  return_inverse=True)[1]) /
                      float(len(y[test])))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(len(train) + len(test), y.size)
            assert_equal(len(train), train_size)
            assert_equal(len(test), test_size)
            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) 
Example 29
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_stratifiedshufflesplit_list_input():
    # Check that when y is a list / list of string labels, it works.
    sss = StratifiedShuffleSplit(test_size=2, random_state=42)
    X = np.ones(7)
    y1 = ['1'] * 4 + ['0'] * 3
    y2 = np.hstack((np.ones(4), np.zeros(3)))
    y3 = y2.tolist()

    np.testing.assert_equal(list(sss.split(X, y1)),
                            list(sss.split(X, y2)))
    np.testing.assert_equal(list(sss.split(X, y3)),
                            list(sss.split(X, y2))) 
Example 30
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_nested_cv():
    # Test if nested cross validation works with different combinations of cv
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 5, 15)

    cvs = [LeaveOneGroupOut(), LeaveOneOut(), GroupKFold(), StratifiedKFold(),
           StratifiedShuffleSplit(n_splits=3, random_state=0)]

    for inner_cv, outer_cv in combinations_with_replacement(cvs, 2):
        gs = GridSearchCV(Ridge(), param_grid={'alpha': [1, .1]},
                          cv=inner_cv)
        cross_val_score(gs, X=X, y=y, groups=groups, cv=outer_cv,
                        fit_params={'groups': groups}) 
Example 31
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_train_test_default_warning():
    assert_warns(FutureWarning, ShuffleSplit, train_size=0.75)
    assert_warns(FutureWarning, GroupShuffleSplit, train_size=0.75)
    assert_warns(FutureWarning, StratifiedShuffleSplit, train_size=0.75)
    assert_warns(FutureWarning, train_test_split, range(3),
                 train_size=0.75) 
Example 32
Project: AutoTableGBDT   Author: SpongebBob   File: model.py    MIT License
def train_test_split(self, X, y, test_size, random_state=2018):
        sss = list(StratifiedShuffleSplit(
            n_splits=1, test_size=test_size, random_state=random_state).split(X, y))
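        # with n_splits=1, sss[0] holds the single (train indices, test indices) pair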
        print(sss[0])
        X_train = np.take(X, sss[0][0], axis=0)
        X_test = np.take(X, sss[0][1], axis=0)
        y_train = np.take(y, sss[0][0], axis=0)
        y_test = np.take(y, sss[0][1], axis=0)
        return [X_train, X_test, y_train, y_test] 
Example 33
Project: ai-starthon-2019   Author: ildoonet   File: data_local_loader.py    MIT License
def data_loader_with_split(root, cv_ratio=0.2, cv=0, batch_size=256, val_label_file='./val_label'):
    dataset_tr = CustomDataset(root, transform=get_transform(random_crop=True))
    dataset_vl = CustomDataset(root, transform=get_transform(random_crop=False))

    sss = StratifiedShuffleSplit(n_splits=5, test_size=cv_ratio, random_state=0)
    sss = sss.split(list(range(len(dataset_tr))), dataset_tr.targets)
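    # sss is now a generator of (train_idx, valid_idx) pairs; the loop below advances it to the cv-th split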
    for _ in range(cv + 1):
        train_idx, valid_idx = next(sss)
    print('datasize=', len(train_idx), len(valid_idx))

    train_sampler = StratifiedSampler([x for i, x in enumerate(dataset_tr.targets) if i in train_idx])
    train_set = Subset(dataset_tr, train_idx)
    valid_set = Subset(dataset_vl, valid_idx)

    dataloader = FixedSizeDataLoader(train_set, steps=None, batch_size=batch_size, num_workers=6, drop_last=True,
                                     sampler=train_sampler)
    dataloader = PrefetchDataLoader(dataloader, device=torch.device('cuda', 0))

    # tr_loader = data.DataLoader(dataset=train_set, batch_size=batch_size, num_workers=4, pin_memory=True,
    #                             sampler=train_sampler, drop_last=True)
    tr_loader = dataloader
    val_loader = data.DataLoader(dataset=valid_set, batch_size=batch_size, num_workers=6, pin_memory=True,
                                 shuffle=False, drop_last=False)

    print('generate val labels+')
    gt_labels = {}
    for idx in valid_idx:
        path, target = dataset_tr.samples[idx]
        image_id = path.split('/')[-1]
        gt_labels[image_id] = target
    gt_labels_string = [' '.join([str(s) for s in l]) for l in tqdm(list(gt_labels.items()))]
    with open(val_label_file, 'w') as file_writer:
        file_writer.write("\n".join(gt_labels_string))
    print('generate val labels-')

    return tr_loader, val_loader, val_label_file


# ----------------------------------------------------- for ret -------------------------------------------------------- 
Example 34
Project: torchskeleton   Author: wbaek   File: utils.py    Apache License 2.0
def split(dataset, labels, cv_ratio, cv_index=0, seed=0xC0FFEE):
    sss = StratifiedShuffleSplit(n_splits=cv_index + 1, test_size=cv_ratio, random_state=seed)
    sss = sss.split(list(range(len(labels))), labels)
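    # the loop below advances the split generator to the cv_index-th (train_idx, valid_idx) pair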

    for _ in range(cv_index + 1):
        train_idx, valid_idx = next(sss)

    train_dataset = torch.utils.data.Subset(dataset, train_idx)
    valid_dataset = torch.utils.data.Subset(dataset, valid_idx)
    return train_dataset, valid_dataset 
Example 35
Project: genre-classifier   Author: KBNLresearch   File: train.py    GNU General Public License v3.0
def validate():
    # Load an existing training set
    X_train, y_train = dataset.load_training('data/training.txt')

    # Ten-fold cross-validation with stratified sampling
    cv = StratifiedShuffleSplit(n_splits=10)
    scores = cross_val_score(clf, X_train, y_train, cv=cv)
    print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) 
Example 36
Project: gumpy   Author: gumpy-bci   File: split.py    MIT License
def stratified_shuffle_Split(features, labels, n_splits, test_size, random_state):

    """Stratified ShuffleSplit cross-validator
    """
    cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
    for train_index, test_index in cv.split(features, labels):
        X_train = features[train_index]
        X_test = features[test_index]
        Y_train = labels[train_index]
        Y_test = labels[test_index]
    return X_train, X_test, Y_train, Y_test


#Random permutation cross-validator 
Example 37
Project: Forecast-Loan-Default   Author: Perry961002   File: train_use_dtc.py    MIT License
def main():
    # 1. Load the data (training and test) and preprocess it
    # Replace the 96 and 98 flags in NumberTime30-59, 60-89, 90 with NaN
    # Replace 0 in Age with NaN
    colnames = ['ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', 
                'DebtRatio', 'Income', 'NOCredit', 'NOTimes90', 
                'NORealEstate', 'NOTime60-89', 'NODependents']
    col_nas = ['', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', [98, 96], 'NA', [98, 96], 'NA']
    col_na_values = creatDictKV(colnames, col_nas)
    dftrain = pd.read_csv("./data/cs-training.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    dftrain.pop("NOCredit")
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x)for x in dftrain.pop("label")])
    x_train = dftrain.as_matrix()

    dftest = pd.read_csv("./data/cs-test.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    dftest.pop("NOCredit")
    test_id = [int(x) for x in dftest.pop("ID")]
    y_test = np.asarray(dftest.pop("label"))
    x_test = dftest.as_matrix()
    # Use StratifiedShuffleSplit to split the training data into training_new and test_new (used to validate the model)
    sss = StratifiedShuffleSplit(n_splits=1,test_size=0.33333,random_state=0)
    for train_index, test_index in sss.split(x_train, y_train):
        x_train_new, x_test_new = x_train[train_index], x_train[test_index]
        y_train_new, y_test_new = y_train[train_index], y_train[test_index]

    y_train = y_train_new
    x_train = x_train_new
    # Use Imputer to replace NaN with the mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    x_test = imp.transform(x_test)
    if not os.path.isfile("dtc_model.m"):
        clf = DecisionTreeClassifier(max_depth=12, min_samples_leaf=200 ,class_weight="balanced")
        clf.fit(x_train, y_train)
        joblib.dump(clf,"dtc_model.m")
        predicted_probs_train =clf.predict_proba(x_train)
        predicted_probs_train = [x[1] for  x in predicted_probs_train]
        computeAUC(y_train, predicted_probs_train)
        # Export a visualization of the decision tree; requires the Graphviz software
        dot_data = export_graphviz(clf, filled = True,
                                rounded = True,
                                class_names = ['0', '1'],
                                feature_names = ['RUUnsecuredL', 'age', 'NOTime30-59', 'DebtRatio', 'Income', 'NOTimes90', 
                                                    'NORealEstate', 'NOTime60-89', 'NODependents'],
                                out_file = None)
        graph = graph_from_dot_data(dot_data)
        graph.write_png('tree.png')
    else:
        clf = joblib.load("dtc_model.m")
        predicted_probs_test_new = clf.predict_proba(x_test_new)
        predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
        computeAUC(y_test_new, predicted_probs_test_new) 
Example 38
Project: nips2017   Author: c-hofer   File: experiments.py    MIT License
def train_test_from_dataset(dataset,
                            test_size=0.2,
                            batch_size=64,
                            wanted_views=None):

    sample_labels = list(dataset.sample_labels)
    label_encoder = LabelEncoder().fit(sample_labels)
    sample_labels = label_encoder.transform(sample_labels)

    label_map = lambda l: int(label_encoder.transform([l])[0])
    collate_fn = PersistenceDiagramProviderCollate(dataset, label_map=label_map, wanted_views=wanted_views)

    sp = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
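    # the first argument only needs the right length; the stratification is driven by sample_labels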
    train_i, test_i = list(sp.split([0]*len(sample_labels), sample_labels))[0]

    data_train = DataLoader(dataset,
                            batch_size=batch_size,
                            collate_fn=collate_fn,
                            shuffle=False,
                            sampler=SubsetRandomSampler(train_i.tolist()))

    data_test = DataLoader(dataset,
                           batch_size=batch_size,
                           collate_fn=collate_fn,
                           shuffle=False,
                           sampler=SubsetRandomSampler(test_i.tolist()))

    return data_train, data_test 
Example 39
Project: Bella   Author: apmoore1   File: base.py    MIT License
def train_val_split(train: 'TargetCollection',
                        split_size: float = 0.2, seed: Union[None, int] = 42
                        ) -> Tuple[Tuple[np.ndarray, np.ndarray],
                                   Tuple[np.ndarray, np.ndarray]]:
        '''
        Splits the training dataset into a train and validation set in a
        stratified split.

        :param train: The training dataset that needs to be split into
        :param split_size: Fraction of the dataset to assign to the
                           validation set.
        :param seed: Seed value to give to the stratified splitter. If
                     None then it uses the random state of numpy.
        :return: Two tuples of length two where each tuple is the train
                 and validation splits respectively, and each tuple contains
                 the data (X) and class labels (y) respectively. Returns
                 ((X_train, y_train), (X_val, y_val))
        '''
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=split_size,
                                          random_state=seed)
        data = np.asarray(train.data_dict())
        sentiment = np.asarray(train.sentiment_data())
        for train_indexs, test_indexs in splitter.split(data, sentiment):
            train_data = data[train_indexs]
            test_data = data[test_indexs]

        train = TargetCollection(ModelMixin._convert_to_targets(train_data))
        val = TargetCollection(ModelMixin._convert_to_targets(test_data))

        X_train = np.array(train.data_dict())
        y_train = np.array(train.sentiment_data())
        X_val = np.array(val.data_dict())
        y_val = np.array(val.sentiment_data())
        return (X_train, y_train), (X_val, y_val) 
Example 40
Project: graph2gauss   Author: abojchevski   File: utils.py    MIT License
def score_node_classification(features, z, p_labeled=0.1, n_repeat=10, norm=False):
    """
    Train a classifier using the node embeddings as features and reports the performance.

    Parameters
    ----------
    features : array-like, shape [N, L]
        The features used to train the classifier, i.e. the node embeddings
    z : array-like, shape [N]
        The ground truth labels
    p_labeled : float
        Percentage of nodes to use for training the classifier
    n_repeat : int
        Number of times to repeat the experiment
    norm

    Returns
    -------
    f1_micro : float
        F_1 score (micro) averaged over n_repeat trials.
    f1_macro : float
        F_1 score (macro) averaged over n_repeat trials.
    """
    lrcv = LogisticRegressionCV()

    if norm:
        features = normalize(features)

    trace = []
    for seed in range(n_repeat):
        sss = StratifiedShuffleSplit(n_splits=1, test_size=1 - p_labeled, random_state=seed)
        split_train, split_test = next(sss.split(features, z))

        lrcv.fit(features[split_train], z[split_train])
        predicted = lrcv.predict(features[split_test])

        f1_micro = f1_score(z[split_test], predicted, average='micro')
        f1_macro = f1_score(z[split_test], predicted, average='macro')

        trace.append((f1_micro, f1_macro))

    return np.array(trace).mean(0) 
Example 41
Project: RPGOne   Author: RTHMaK   File: helpers.py    Apache License 2.0
def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999):
    f = open(os.path.join('datasets', 'titanic', 'titanic3.csv'))
    # Remove . from home.dest, split on quotes because some fields have commas
    keys = f.readline().strip().replace('.', '').split('","')
    lines = f.readlines()
    f.close()
    string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
                   'homedest']
    string_keys = [s for s in string_keys if s not in feature_skip_tuple]
    numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare']
    numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple]
    train_vectorizer_list = []
    test_vectorizer_list = []

    n_samples = len(lines)
    numeric_data = np.zeros((n_samples, len(numeric_keys)))
    numeric_labels = np.zeros((n_samples,), dtype=int)

    # Doing this twice is horribly inefficient but the file is small...
    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        strings = {k: line_dict[k] for k in string_keys}
        numeric_labels[n] = line_dict["survived"]

    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=12)
    # This is a weird way to get the indices but it works
    train_idx = None
    test_idx = None
    for train_idx, test_idx in sss.split(numeric_data, numeric_labels):
        pass

    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        strings = {k: line_dict[k] for k in string_keys}
        if n in train_idx:
            train_vectorizer_list.append(strings)
        else:
            test_vectorizer_list.append(strings)
        numeric_data[n] = np.asarray([line_dict[k]
                                      for k in numeric_keys])

    train_numeric = numeric_data[train_idx]
    test_numeric = numeric_data[test_idx]
    train_labels = numeric_labels[train_idx]
    test_labels = numeric_labels[test_idx]

    vec = DictVectorizer()
    # .toarray() due to returning a scipy sparse array
    train_categorical = vec.fit_transform(train_vectorizer_list).toarray()
    test_categorical = vec.transform(test_vectorizer_list).toarray()
    train_data = np.concatenate([train_numeric, train_categorical], axis=1)
    test_data = np.concatenate([test_numeric, test_categorical], axis=1)
    keys = numeric_keys + string_keys
    return keys, train_data, test_data, train_labels, test_labels 
Example 42
Project: prosodeep   Author: gerazov   File: prosodeep_corpus.py    GNU General Public License v3.0
def split_corpus(corpus, test_size, stratify=True, random_state=None):
    """
    Split corpus into train and test. Can also be used to split train into
    train and validation for analysis by synthesis.
    Two ways to do it:
        - a split aware of contour content, stratifying the data so that both
        sets keep the original percentage of each contour type.
        - using files as groups in GroupShuffleSplit
    """
    files = corpus.file.tolist()
    filenames = corpus.file.unique()
    file_ind = np.arange(len(filenames))

    if stratify:
        # loop through files and find their contour content
        all_contours = corpus.contourtype.unique().tolist()
        #    file_contours = {}
        file_contours = []
        for file in filenames:
            mask = corpus.file == file
            contours = sorted(corpus[mask].contourtype.unique().tolist())
            contour_code = 0
            for contour in contours:
                ind = all_contours.index(contour)
                # encode in an integer:
                contour_code += 2**ind
            file_contours.append(contour_code)
        file_contours = np.array(file_contours)

        sss = StratifiedShuffleSplit(test_size=test_size,
                                     random_state=random_state)
        train_ind, test_ind = sss.split(file_ind, file_contours).__next__()
        train_filenames = filenames[train_ind]
        test_filenames = filenames[test_ind]

        mask_train = corpus.file.isin(train_filenames)
        mask_test = corpus.file.isin(test_filenames)
        #%
        corpus_train = corpus[mask_train]
        corpus_test = corpus[mask_test]
        assert len(corpus) == len(corpus_train) + len(corpus_test)
        # TODO: make our own algorithm

    else:  # use files as groups
        groups = [np.where(filenames == file)[0] for file in files]
        gss = GroupShuffleSplit(test_size=test_size,
                                random_state=random_state)
        train_ind, test_ind = gss.split(groups, groups, groups).__next__()

    return train_ind, test_ind 
Example 43
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0
def test_stratified_shuffle_split_even():
    # Test the StratifiedShuffleSplit, indices are drawn with an
    # equal chance
    n_folds = 5
    n_splits = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for count in idx_counts:
            prob = bf.pmf(count)
            assert_true(prob > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        groups = np.array((n_samples // 2) * [0, 1])
        splits = StratifiedShuffleSplit(n_splits=n_splits,
                                        test_size=1. / n_folds,
                                        random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits_actual = 0
        for train, test in splits.split(X=np.ones(n_samples), y=groups):
            n_splits_actual += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for id in ids:
                    counter[id] += 1
        assert_equal(n_splits_actual, n_splits)

        n_train, n_test = _validate_shuffle_split(
            n_samples, test_size=1. / n_folds, train_size=1. - (1. / n_folds))

        assert_equal(len(train), n_train)
        assert_equal(len(test), n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        group_counts = np.unique(groups)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(n_train + n_test, len(groups))
        assert_equal(len(group_counts), 2)
        ex_test_p = float(n_test) / n_samples
        ex_train_p = float(n_train) / n_samples

        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p) 
Example 44
Project: pypastry   Author: datapastry   File: evaluation_test.py    MIT License
def test_simple_evaluation():
    # Given
    dataset = DataFrame({
        'a': [1, 1, 0, 0],
        'b': [1, 1, 0, 0],
    })

    cross_validation = StratifiedShuffleSplit(n_splits=1, test_size=0.5)
    predictor = DecisionTreeClassifier()
    scorer = make_scorer(accuracy_score)
    experiment = Experiment(dataset, 'b', predictor, cross_validation, scorer)

    git_mock = Mock()
    results_repo_mock = Mock()
    new_results_files = ['results/abc.json']
    results_repo_mock.save_results.return_value = new_results_files
    results_display_mock = Mock()
    runner = ExperimentRunner(git_mock, results_repo_mock, results_display_mock)
    commit_message = "Test commit message"

    # When
    runner.run_experiment(experiment, False, commit_message)

    # Then
    call_args_list = results_repo_mock.save_results.call_args_list
    assert 1 == len(call_args_list)
    run_infos, dataset_info = call_args_list[0][0]

    assert len(run_infos) == 1

    results = run_infos[0]['results']
    assert 1.0 == results['test_score']
    assert ['a', 'b'] == dataset_info['columns']

    # TODO: check the hash. Need to find a way to make this consistent between python versions etc.
    # assert '28ea628a50a47c726a9b0ec437c88fc4742d81fd' == dataset_info['hash']

    git_mock.git.add.assert_called_once_with(update=True)
    git_mock.index.add.assert_called_once_with(new_results_files)
    git_mock.index.commit.assert_called_once_with(commit_message)

    assert 1 == len(results_display_mock.cache_display.call_args_list)
    print(results_display_mock.cache_display.call_args[0])
    assert len(results_display_mock.cache_display.call_args[0]) > 0
    assert 1 == len(results_display_mock.print_cache_file.call_args_list) 
Example 45
Project: node_embedding_attack   Author: abojchevski   File: utils.py    MIT License
def evaluate_embedding_node_classification(embedding_matrix, labels, train_ratio=0.1, norm=True, seed=0, n_repeats=10):
    """Evaluate the node embeddings on the node classification task..

    :param embedding_matrix: np.ndarray, shape [n_nodes, embedding_dim]
        Embedding matrix
    :param labels: np.ndarray, shape [n_nodes]
        The ground truth labels
    :param train_ratio: float
        The fraction of labels to use for training
    :param norm: bool
        Whether to normalize the embeddings
    :param seed: int
        Random seed
    :param n_repeats: int
        Number of times to repeat the experiment
    :return: [float, float], [float, float]
        The mean and standard deviation of the f1_scores
    """
    if norm:
        embedding_matrix = normalize(embedding_matrix)

    results = []
    for it_seed in range(n_repeats):
        sss = StratifiedShuffleSplit(n_splits=1, test_size=1 - train_ratio, random_state=seed + it_seed)
        split_train, split_test = next(sss.split(embedding_matrix, labels))

        features_train = embedding_matrix[split_train]
        features_test = embedding_matrix[split_test]
        labels_train = labels[split_train]
        labels_test = labels[split_test]

        lr = LogisticRegression(solver='lbfgs', max_iter=1000, multi_class='auto')
        lr.fit(features_train, labels_train)

        lr_z_predict = lr.predict(features_test)
        f1_micro = f1_score(labels_test, lr_z_predict, average='micro')
        f1_macro = f1_score(labels_test, lr_z_predict, average='macro')

        results.append([f1_micro, f1_macro])

    results = np.array(results)

    return results.mean(0), results.std(0) 
Example 46
Project: palladio   Author: slipguru   File: model_assessment.py    GNU General Public License v3.0
def _check_cv(cv=3, y=None, classifier=False, **kwargs):
    """Input checker utility for building a cross-validator.

    Parameters
    ----------
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if classifier is True and ``y`` is either
        binary or multiclass, :class:`StratifiedKFold` is used. In all other
        cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    y : array-like, optional
        The target variable for supervised learning problems.

    classifier : boolean, optional, default False
        Whether the task is a classification task, in which case
        stratified KFold will be used.

    kwargs : dict
        Other parameters for StratifiedShuffleSplit or ShuffleSplit.

    Returns
    -------
    checked_cv : a cross-validator instance.
        The return value is a cross-validator which generates the train/test
        splits via the ``split`` method.
    """
    if cv is None:
        cv = kwargs.pop('n_splits', 0) or 10

    if isinstance(cv, numbers.Integral):
        if (classifier and (y is not None) and
                (type_of_target(y) in ('binary', 'multiclass'))):
            return StratifiedShuffleSplit(cv, **kwargs)
        else:
            return ShuffleSplit(cv, **kwargs)

    if not hasattr(cv, 'split') or isinstance(cv, str):
        if not isinstance(cv, Iterable) or isinstance(cv, str):
            raise ValueError("Expected cv as an integer, cross-validation "
                             "object (from sklearn.model_selection) "
                             "or an iterable. Got %s." % cv)
        return _CVIterableWrapper(cv)

    return cv  # New style cv objects are passed without any modification 
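A short sketch of how this checker dispatches, assuming the imports the snippet relies on (numbers, type_of_target and the shuffle splitters from sklearn.model_selection):

import numbers                                           # used inside _check_cv
import numpy as np
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.utils.multiclass import type_of_target      # used inside _check_cv

y = np.array([0, 1, 0, 1, 0, 1, 0, 1])

# Integer cv, classifier=True and a binary/multiclass y -> StratifiedShuffleSplit
cv = _check_cv(cv=5, y=y, classifier=True, test_size=0.25, random_state=0)
print(type(cv).__name__)    # StratifiedShuffleSplit

# Any other case (here classifier=False) -> plain ShuffleSplit
cv = _check_cv(cv=5, y=y, classifier=False)
print(type(cv).__name__)    # ShuffleSplit

# cv=None falls back to 10 splits unless n_splits is passed via kwargs
cv = _check_cv(cv=None, y=y, classifier=True)
print(cv.n_splits)          # 10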
Example 47
Project: Forecast-Loan-Default   Author: Perry961002   File: train_use_boosting.py    MIT License 4 votes vote down vote up
def main():
    # 1. Load the training and test data and preprocess them
    # Replace the 96 and 98 markers in NumberTime30-59, 60-89 and 90 with NaN
    # Replace 0 in Age with NaN
    colnames = ['ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', 
                'DebtRatio', 'Income', 'NOCredit', 'NOTimes90', 
                'NORealEstate', 'NOTime60-89', 'NODependents']
    col_nas = ['', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', [98, 96], 'NA', [98, 96], 'NA']
    col_na_values = creatDictKV(colnames, col_nas)
    dftrain = pd.read_csv("./data/cs-training.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    #print(dftrain)
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    x_train = dftrain.as_matrix()

    dftest = pd.read_csv("./data/cs-test.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    test_id = [int(x) for x in dftest.pop("ID")]
    y_test = np.asarray(dftest.pop("label"))
    x_test = dftest.as_matrix()
    # 2. Use StratifiedShuffleSplit to split the training data into training_new and test_new (used to validate the model)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    for train_index, test_index in sss.split(x_train, y_train):
        x_train_new, x_test_new = x_train[train_index], x_train[test_index]
        y_train_new, y_test_new = y_train[train_index], y_train[test_index]

    y_train = y_train_new
    x_train = x_train_new
    # 3. Use Imputer to replace NaN with the column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    x_test = imp.transform(x_test)
    x_train = np.delete(x_train, 5, axis=1)
    x_test_new = np.delete(x_test_new, 5, axis=1)
    if not os.path.isfile("boost_model.m"):
        clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5, min_samples_leaf=50, class_weight="balanced"),
                                 n_estimators=10,
                                 algorithm='SAMME.R',
                                 learning_rate=0.4)
        clf.fit(x_train, y_train)
        joblib.dump(clf, "boost_model.m")
        predicted_probs_train = clf.predict_proba(x_train)
        predicted_probs_train = [x[1] for x in predicted_probs_train]
        computeAUC(y_train, predicted_probs_train)
    else:
        clf = joblib.load("boost_model.m")
        predicted_probs_test_new = clf.predict_proba(x_test_new)
        predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
        computeAUC(y_test_new, predicted_probs_test_new) 
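Examples 47 to 49 rely on two helpers, creatDictKV and computeAUC, that are defined elsewhere in the project. A minimal sketch of what they presumably do (the originals may differ):

from sklearn.metrics import roc_auc_score

def creatDictKV(keys, values):
    # Presumably builds the {column name: NA marker(s)} mapping that
    # pandas.read_csv consumes via its na_values argument.
    return dict(zip(keys, values))

def computeAUC(y_true, y_scores):
    # Presumably reports the ROC AUC of the predicted probabilities.
    auc = roc_auc_score(y_true, y_scores)
    print("AUC: %.4f" % auc)
    return auc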
Example 48
Project: Forecast-Loan-Default   Author: Perry961002   File: train_use_lr.py    MIT License 4 votes vote down vote up
def main():
    # 1. Load the training and test data and preprocess them
    # Replace the 96 and 98 markers in NumberTime30-59, 60-89 and 90 with NaN
    # Replace 0 in Age with NaN
    colnames = ['ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', 
                'DebtRatio', 'Income', 'NOCredit', 'NOTimes90', 
                'NORealEstate', 'NOTime60-89', 'NODependents']
    col_nas = ['', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', [98, 96], 'NA', [98, 96], 'NA']
    col_na_values = creatDictKV(colnames, col_nas)
    dftrain = pd.read_csv("./data/cs-training.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    x_train = dftrain.as_matrix()

    dftest = pd.read_csv("./data/cs-test.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    test_id = [int(x) for x in dftest.pop("ID")]
    y_test = np.asarray(dftest.pop("label"))
    x_test = dftest.as_matrix()
    # 2. Use StratifiedShuffleSplit to split the training data into training_new and test_new (used to validate the model)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    for train_index, test_index in sss.split(x_train, y_train):
        print("TRAIN:", train_index, "TEST:", test_index)
        x_train_new, x_test_new = x_train[train_index], x_train[test_index]
        y_train_new, y_test_new = y_train[train_index], y_train[test_index]

    y_train = y_train_new
    x_train = x_train_new
    # 3. Use Imputer to replace NaN with the column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    x_test = imp.transform(x_test)
    x_train = np.delete(x_train, 5, axis=1)
    x_test_new = np.delete(x_test_new, 5, axis=1)
    if not os.path.isfile("lr_model.m"):
        clf = LogisticRegression(class_weight="balanced")
        clf.fit(x_train, y_train)
        joblib.dump(clf,"lr_model.m")
        predicted_probs_train =clf.predict_proba(x_train)
        predicted_probs_train = [x[1] for  x in predicted_probs_train]
        computeAUC(y_train, predicted_probs_train)
    else:
        clf = joblib.load("lr_model.m")
        predicted_probs_test_new = clf.predict_proba(x_test_new)
        predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
        computeAUC(y_test_new, predicted_probs_test_new) 
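With n_splits=1 the split loop in Examples 47 and 48 runs exactly once, so the same stratified holdout can be taken more directly with next(); a small sketch (the arrays below are synthetic stand-ins for x_train and y_train):

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Synthetic stand-ins for the x_train / y_train built above.
x_train = np.random.RandomState(0).randn(30, 10)
y_train = np.array([0, 1] * 15)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
train_index, test_index = next(sss.split(x_train, y_train))
x_train_new, x_test_new = x_train[train_index], x_train[test_index]
y_train_new, y_test_new = y_train[train_index], y_train[test_index]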
Example 49
Project: Forecast-Loan-Default   Author: Perry961002   File: dtc_learn_curve.py    MIT License 4 votes vote down vote up
def main():
    # 1. Load the training and test data and preprocess them
    # Replace the 96 and 98 markers in NumberTime30-59, 60-89 and 90 with NaN
    # Replace 0 in Age with NaN
    colnames = ['ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', 
                'DebtRatio', 'Income', 'NOCredit', 'NOTimes90', 
                'NORealEstate', 'NOTime60-89', 'NODependents']
    col_nas = ['', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', [98, 96], 'NA', [98, 96], 'NA']
    col_na_values = creatDictKV(colnames, col_nas)
    dftrain = pd.read_csv("./data/cs-training.csv", names=colnames, na_values=col_na_values, skiprows=[0])
    dftrain.pop("NOCredit")
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    x_train = dftrain.as_matrix()
    # Use StratifiedShuffleSplit to split the training data into training_new and test_new (used to validate the model)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    for train_index, test_index in sss.split(x_train, y_train):
        x_train_new, x_test_new = x_train[train_index], x_train[test_index]
        y_train_new, y_test_new = y_train[train_index], y_train[test_index]

    y_train = y_train_new
    x_train = x_train_new
    # Use Imputer to replace NaN with the column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    
    train_errors, test_errors = [], []
    for i in range(1, 50):
        print('max_depth: ', i)
        clf = DecisionTreeClassifier(max_depth=i, class_weight="balanced")
        clf.fit(x_train, y_train)
        y_train_predicted = clf.predict(x_train)
        y_test_predicted = clf.predict(x_test_new)
        train_errors.append(mean_squared_error(y_train_predicted, y_train))
        test_errors.append(mean_squared_error(y_test_predicted, y_test_new))
    plt.plot(range(1, 50), np.sqrt(train_errors), "r-x", label="Train Set", linewidth=2)
    plt.plot(range(1, 50), np.sqrt(test_errors), "b-o", label="Test Set", linewidth=2)
    plt.legend(loc='upper right')
    plt.xlabel("Max Depth")
    plt.ylabel("RMSE")
    plt.show()
    
    train_errors, test_errors = [], []
    for i in range(1, 1000, 20):
        print('min_samples_leaf: ', i)
        clf = DecisionTreeClassifier(min_samples_leaf=i, class_weight="balanced")
        clf.fit(x_train, y_train)
        y_train_predicted = clf.predict(x_train)
        y_test_predicted = clf.predict(x_test_new)
        train_errors.append(mean_squared_error(y_train_predicted, y_train))
        test_errors.append(mean_squared_error(y_test_predicted, y_test_new))
    plt.plot(range(1, 1000, 20), np.sqrt(train_errors), "r-x", label="Train Set", linewidth=2)
    plt.plot(range(1, 1000, 20), np.sqrt(test_errors), "b-o", label="Test Set", linewidth=2)
    plt.legend(loc='upper right')
    plt.xlabel("Min Samples Leaf")
    plt.ylabel("RMSE")
    plt.show()
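The Imputer class used in Examples 47 to 49 was removed in scikit-learn 0.22; a minimal sketch of the same mean imputation with sklearn.impute.SimpleImputer:

import numpy as np
from sklearn.impute import SimpleImputer

# Equivalent of Imputer(missing_values='NaN', strategy='mean', axis=0)
# on a tiny stand-in array: each NaN becomes its column mean.
x_demo = np.array([[1.0, np.nan], [3.0, 4.0], [np.nan, 6.0]])
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(x_demo)
print(imp.transform(x_demo))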