Python sklearn.model_selection.GroupKFold() Examples

The following are 24 code examples showing how to use sklearn.model_selection.GroupKFold(), extracted from open source projects. The originating project, author, file, and license are listed above each example.

You may also want to check out all available functions and classes of the module sklearn.model_selection.
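
GroupKFold guarantees that the same group never appears in both the training and test indices of any fold. As a quick orientation, here is a minimal sketch of the API (not drawn from any of the projects below):

import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(12).reshape(6, 2)
y = np.array([0, 1, 0, 1, 0, 1])
groups = np.array([0, 0, 1, 1, 2, 2])  # two samples per group

gkf = GroupKFold(n_splits=3)
for train_idx, test_idx in gkf.split(X, y, groups=groups):
    # No group index appears in both train and test.
    assert not set(groups[train_idx]) & set(groups[test_idx])
    print(train_idx, test_idx)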

Example 1
Project: category_encoders   Author: scikit-learn-contrib   File: test_wrapper.py    License: BSD 3-Clause "New" or "Revised" License
def test_custom_cv(self):
        x = np.array([
            ['a', 'b', 'c'],
            ['a', 'b', 'c'],
            ['a', 'b', 'c'],
            ['a', 'b', 'c'],
            ['b', 'b', 'c'],
            ['b', 'b', 'c'],
            ['b', 'b', 'b'],
            ['b', 'b', 'b'],
            ['b', 'b', 'b'],
            ['b', 'b', 'b'],
            ['a', 'b', 'a'],
            ['a', 'b', 'a'],
        ])
        groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
        y = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]
        gkfold = GroupKFold(n_splits=3)
        wrapper = NestedCVWrapper(encoders.TargetEncoder(), cv=gkfold)
        result_train, result_valid = wrapper.fit_transform(x, y, X_test=x, groups=groups)

        # We would expect result_train != result_valid since result_train has been generated using nested
        # folds and result_valid is generated by fitting the encoder on all of the x & y data
        self.assertFalse(np.allclose(result_train, result_valid)) 
Example 2
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_rfe.py    License: MIT License
def test_rfe_cv_groups():
    generator = check_random_state(0)
    iris = load_iris()
    number_groups = 4
    groups = np.floor(np.linspace(0, number_groups, len(iris.target)))
    X = iris.data
    y = (iris.target > 0).astype(int)

    est_groups = RFECV(
        estimator=RandomForestClassifier(random_state=generator),
        step=1,
        scoring='accuracy',
        cv=GroupKFold(n_splits=2)
    )
    est_groups.fit(X, y, groups=groups)
    assert est_groups.n_features_ > 0 
Example 3
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
                 PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e) 
Example 4
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_validation.py    License: MIT License
def test_cross_val_score_predict_groups():
    # Check if ValueError (when groups is None) propagates to cross_val_score
    # and cross_val_predict
    # And also check if groups is correctly passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_score, estimator=clf, X=X, y=y, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_predict, estimator=clf, X=X, y=y, cv=cv) 
Example 5
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_search.py    License: MIT License
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y) 
Example 6
Project: tpot   Author: EpistasisLab   File: tpot_tests.py    License: GNU Lesser General Public License v3.0
def test_fit_GroupKFold():
    """Assert that TPOT properly handles the group parameter when using GroupKFold."""
    # This check tests whether the darker digit images generalize to the lighter ones.
    means = np.mean(training_features, axis=1)
    groups = means >= np.median(means)

    tpot_obj = TPOTClassifier(
        random_state=42,
        population_size=2,
        offspring_size=4,
        generations=1,
        verbosity=0,
        config_dict='TPOT light',
        cv=model_selection.GroupKFold(n_splits=2),
    )
    tpot_obj.fit(training_features, training_target, groups=groups)

    assert_greater_equal(tpot_obj.score(testing_features, testing_target), 0.97) 
Example 7
Project: CDSS   Author: HealthRex   File: SupervisedClassifier.py    License: GNU General Public License v3.0
def __init__(self, classes, hyperparams=None, groups=None):
        self._classes = classes

        # Initialize params.
        self._params = {}
        self._model = None

        '''
        Used by GroupKFold for splitting train/validation. 
        '''
        self._groups = groups

        # Initialize hyperparams.
        self._hyperparams = {} if hyperparams is None else hyperparams
        self._hyperparam_search_space = {}
        # Set algorithm.
        self._get_or_set_hyperparam('algorithm')
        # Set random state.
        self._get_or_set_hyperparam('random_state')
        # Set CV strategy.
        self._get_or_set_hyperparam('hyperparam_strategy') 
Example 8
Project: keras-tuner   Author: keras-team   File: sklearn_test.py    License: Apache License 2.0
def test_sklearn_cv_with_groups(tmp_dir):
    tuner = sklearn_tuner.Sklearn(
        oracle=kt.oracles.BayesianOptimization(
            objective=kt.Objective('score', 'max'),
            max_trials=10),
        hypermodel=build_model,
        cv=model_selection.GroupKFold(5),
        directory=tmp_dir)

    x = np.random.uniform(size=(50, 10))
    y = np.random.randint(0, 2, size=(50,))
    groups = np.random.randint(0, 5, size=(50,))
    tuner.search(x, y, groups=groups)

    assert len(tuner.oracle.trials) == 10

    best_trial = tuner.oracle.get_best_trials()[0]
    assert best_trial.status == 'COMPLETED'
    assert best_trial.score is not None
    assert best_trial.best_step == 0
    assert best_trial.metrics.exists('score')

    # Make sure best model can be reloaded.
    best_model = tuner.get_best_models()[0]
    best_model.score(x, y) 
Example 9
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_split.py    License: MIT License
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
                 PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e) 
Example 10
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_validation.py    License: MIT License
def test_cross_val_score_predict_groups():
    # Check if ValueError (when groups is None) propagates to cross_val_score
    # and cross_val_predict
    # And also check if groups is correctly passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_score, estimator=clf, X=X, y=y, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_predict, estimator=clf, X=X, y=y, cv=cv) 
Example 11
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_search.py    License: MIT License
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y) 
Example 12
Project: nyaggle   Author: nyanp   File: test_run.py    License: MIT License
def test_experiment_manual_cv_group(tmpdir_name):
    df1 = pd.DataFrame()
    df1['x'] = np.random.randint(0, 10, size=1000)
    df1['y'] = df1['x'] > 5
    df1['grp'] = 0

    df2 = pd.DataFrame()
    df2['x'] = np.random.randint(0, 10, size=100)
    df2['y'] = df2['x'] <= 5
    df2['grp'] = 1

    X = pd.concat([df1, df2]).reset_index(drop=True)
    y = X['y']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    grp = X_train['grp']
    X_train = X_train.drop(['y', 'grp'], axis=1)
    X_test = X_test.drop(['y', 'grp'], axis=1)

    params = {
        'objective': 'binary',
        'max_depth': 8
    }

    result = run_experiment(params, X_train, y_train, X_test, tmpdir_name, cv=GroupKFold(2), groups=grp)
    assert result.metrics[-1] < 0.7 
Example 13
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License
def test_nested_cv():
    # Test if nested cross validation works with different combinations of cv
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 5, 15)

    cvs = [LeaveOneGroupOut(), LeaveOneOut(), GroupKFold(), StratifiedKFold(),
           StratifiedShuffleSplit(n_splits=3, random_state=0)]

    for inner_cv, outer_cv in combinations_with_replacement(cvs, 2):
        gs = GridSearchCV(Ridge(), param_grid={'alpha': [1, .1]},
                          cv=inner_cv, error_score='raise', iid=False)
        cross_val_score(gs, X=X, y=y, groups=groups, cv=outer_cv,
                        fit_params={'groups': groups}) 
Example 14
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License
def test_nsplit_default_warn():
    # Test that warnings are raised. Will be removed in 0.22
    assert_warns_message(FutureWarning, NSPLIT_WARNING, KFold)
    assert_warns_message(FutureWarning, NSPLIT_WARNING, GroupKFold)
    assert_warns_message(FutureWarning, NSPLIT_WARNING, StratifiedKFold)
    assert_warns_message(FutureWarning, NSPLIT_WARNING, TimeSeriesSplit)

    assert_no_warnings(KFold, n_splits=5)
    assert_no_warnings(GroupKFold, n_splits=5)
    assert_no_warnings(StratifiedKFold, n_splits=5)
    assert_no_warnings(TimeSeriesSplit, n_splits=5) 
Example 15
Project: pandas-ml   Author: pandas-ml   File: test_model_selection.py    License: BSD 3-Clause "New" or "Revised" License
def test_objectmapper(self):
        df = pdml.ModelFrame([])

        # Splitter Classes
        self.assertIs(df.model_selection.KFold, ms.KFold)
        self.assertIs(df.model_selection.GroupKFold, ms.GroupKFold)
        self.assertIs(df.model_selection.StratifiedKFold, ms.StratifiedKFold)

        self.assertIs(df.model_selection.LeaveOneGroupOut, ms.LeaveOneGroupOut)
        self.assertIs(df.model_selection.LeavePGroupsOut, ms.LeavePGroupsOut)
        self.assertIs(df.model_selection.LeaveOneOut, ms.LeaveOneOut)
        self.assertIs(df.model_selection.LeavePOut, ms.LeavePOut)

        self.assertIs(df.model_selection.ShuffleSplit, ms.ShuffleSplit)
        self.assertIs(df.model_selection.GroupShuffleSplit,
                      ms.GroupShuffleSplit)
        # self.assertIs(df.model_selection.StratifiedShuffleSplit,
        #               ms.StratifiedShuffleSplit)
        self.assertIs(df.model_selection.PredefinedSplit, ms.PredefinedSplit)
        self.assertIs(df.model_selection.TimeSeriesSplit, ms.TimeSeriesSplit)

        # Splitter Functions

        # Hyper-parameter optimizers
        self.assertIs(df.model_selection.GridSearchCV, ms.GridSearchCV)
        self.assertIs(df.model_selection.RandomizedSearchCV, ms.RandomizedSearchCV)
        self.assertIs(df.model_selection.ParameterGrid, ms.ParameterGrid)
        self.assertIs(df.model_selection.ParameterSampler, ms.ParameterSampler)

        # Model validation 
Example 16
Project: pandas-ml   Author: pandas-ml   File: test_model_selection.py    License: BSD 3-Clause "New" or "Revised" License
def test_objectmapper_abbr(self):
        df = pdml.ModelFrame([])

        # Splitter Classes
        self.assertIs(df.ms.KFold, ms.KFold)
        self.assertIs(df.ms.GroupKFold, ms.GroupKFold)
        self.assertIs(df.ms.StratifiedKFold, ms.StratifiedKFold)

        self.assertIs(df.ms.LeaveOneGroupOut, ms.LeaveOneGroupOut)
        self.assertIs(df.ms.LeavePGroupsOut, ms.LeavePGroupsOut)
        self.assertIs(df.ms.LeaveOneOut, ms.LeaveOneOut)
        self.assertIs(df.ms.LeavePOut, ms.LeavePOut)

        self.assertIs(df.ms.ShuffleSplit, ms.ShuffleSplit)
        self.assertIs(df.ms.GroupShuffleSplit,
                      ms.GroupShuffleSplit)
        # self.assertIs(df.ms.StratifiedShuffleSplit,
        #               ms.StratifiedShuffleSplit)
        self.assertIs(df.ms.PredefinedSplit, ms.PredefinedSplit)
        self.assertIs(df.ms.TimeSeriesSplit, ms.TimeSeriesSplit)

        # Splitter Functions

        # Hyper-parameter optimizers
        self.assertIs(df.ms.GridSearchCV, ms.GridSearchCV)
        self.assertIs(df.ms.RandomizedSearchCV, ms.RandomizedSearchCV)
        self.assertIs(df.ms.ParameterGrid, ms.ParameterGrid)
        self.assertIs(df.ms.ParameterSampler, ms.ParameterSampler)

        # Model validation 
Example 17
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_split.py    License: MIT License
def test_nested_cv():
    # Test if nested cross validation works with different combinations of cv
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 5, 15)

    cvs = [LeaveOneGroupOut(), LeaveOneOut(), GroupKFold(), StratifiedKFold(),
           StratifiedShuffleSplit(n_splits=3, random_state=0)]

    for inner_cv, outer_cv in combinations_with_replacement(cvs, 2):
        gs = GridSearchCV(Ridge(), param_grid={'alpha': [1, .1]},
                          cv=inner_cv)
        cross_val_score(gs, X=X, y=y, groups=groups, cv=outer_cv,
                        fit_params={'groups': groups}) 
Example 18
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_validation.py    License: MIT License
def test_learning_curve_with_shuffle():
    # Following test case was designed this way to verify the code
    # changes made in pull request: #7506.
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [11, 12], [13, 14], [15, 16],
                 [17, 18], [19, 20], [7, 8], [9, 10], [11, 12], [13, 14],
                 [15, 16], [17, 18]])
    y = np.array([1, 1, 1, 2, 3, 4, 1, 1, 2, 3, 4, 1, 2, 3, 4])
    groups = np.array([1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 4, 4, 4, 4])
    # Splits on these groups fail without shuffle as the first iteration
    # of the learning curve doesn't contain label 4 in the training set.
    estimator = PassiveAggressiveClassifier(max_iter=5, tol=None,
                                            shuffle=False)

    cv = GroupKFold(n_splits=2)
    train_sizes_batch, train_scores_batch, test_scores_batch = learning_curve(
        estimator, X, y, cv=cv, n_jobs=1, train_sizes=np.linspace(0.3, 1.0, 3),
        groups=groups, shuffle=True, random_state=2)
    assert_array_almost_equal(train_scores_batch.mean(axis=1),
                              np.array([0.75, 0.3, 0.36111111]))
    assert_array_almost_equal(test_scores_batch.mean(axis=1),
                              np.array([0.36111111, 0.25, 0.25]))
    assert_raises(ValueError, learning_curve, estimator, X, y, cv=cv, n_jobs=1,
                  train_sizes=np.linspace(0.3, 1.0, 3), groups=groups)

    train_sizes_inc, train_scores_inc, test_scores_inc = learning_curve(
        estimator, X, y, cv=cv, n_jobs=1, train_sizes=np.linspace(0.3, 1.0, 3),
        groups=groups, shuffle=True, random_state=2,
        exploit_incremental_learning=True)
    assert_array_almost_equal(train_scores_inc.mean(axis=1),
                              train_scores_batch.mean(axis=1))
    assert_array_almost_equal(test_scores_inc.mean(axis=1),
                              test_scores_batch.mean(axis=1)) 
Example 19
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_ridge.py    License: MIT License
def test_ridge_gcv_sample_weights(
        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)

    sample_weight = 3 * rng.randn(len(X))
    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
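    # Emulate integer sample weights by repeating each row weight-many times;
    # grouping the repeats keeps all copies of a sample in the same CV fold,
    # so the repeated, grouped fit should match a weighted fit.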
    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
    sample_weight = sample_weight.astype(float)
    X_tiled, y_tiled = X[indices], y[indices]

    cv = GroupKFold(n_splits=X.shape[0])
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(
        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
        fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
    kfold_errors = (y_tiled - predictions)**2
    kfold_errors = [
        np.sum(kfold_errors[indices == i], axis=0) for
        i in np.arange(X.shape[0])]
    kfold_errors = np.asarray(kfold_errors)

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(
        alphas=alphas, store_cv_values=True,
        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3) 
Example 20
Project: ibeis   Author: Erotemic   File: sklearn_utils.py    License: Apache License 2.0
def temp(samples):
    from sklearn import model_selection
    from ibeis.algo.verif import sklearn_utils
    def check_balance(idxs):
        print('-------')
        for count, (train, test) in enumerate(idxs):
            print('split %r' % (count))
            groups_train = set(groups.take(train))
            groups_test = set(groups.take(test))
            n_group_isect = len(groups_train.intersection(groups_test))
            y_train_freq = np.bincount(y.take(train))
            y_test_freq = np.bincount(y.take(test))
            y_test_ratio = y_test_freq / y_test_freq.sum()
            y_train_ratio = y_train_freq / y_train_freq.sum()
            balance_error = np.sum((y_test_ratio - y_train_ratio) ** 2)
            print('n_group_isect = %r' % (n_group_isect,))
            print('y_test_ratio = %r' % (y_test_ratio,))
            print('y_train_ratio = %r' % (y_train_ratio,))
            print('balance_error = %r' % (balance_error,))

    X = np.empty((len(samples), 0))
    y = samples.encoded_1d().values
    groups = samples.group_ids

    n_splits = 3

    splitter = model_selection.GroupShuffleSplit(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = model_selection.GroupKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = model_selection.StratifiedKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = sklearn_utils.StratifiedGroupKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs) 
Example 21
Project: nltools   Author: cosanlab   File: cross_validation.py    License: MIT License
def set_cv(Y=None, cv_dict=None, return_generator=True):
    """ Helper function to create a sci-kit learn compatible cv object using
    common parameters for prediction analyses.

    Args:
        Y:  (pd.DataFrame) Pandas Dataframe of Y labels
        cv_dict: (dict) Type of cross_validation to use. A dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout}
        return_generator (bool): return a cv generator instead of an instance; default True
    Returns:
        cv: a scikit-learn model-selection generator

     """

    if isinstance(cv_dict, dict):
        if cv_dict['type'] == 'kfolds':
            if 'subject_id' in cv_dict:  # Hold out subjects within each fold
                from sklearn.model_selection import GroupKFold
                cv_inst = GroupKFold(n_splits=cv_dict['n_folds'])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y, groups=cv_dict['subject_id'])
            elif 'stratified' in cv_dict:  # Stratified K-Folds Continuous
                from nltools.cross_validation import KFoldStratified
                cv_inst = KFoldStratified(n_splits=cv_dict['n_folds'])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y)
            else:  # Normal K-Folds
                from sklearn.model_selection import KFold
                cv_inst = KFold(n_splits=cv_dict['n_folds'])
                cv = cv_inst.split(X=np.zeros(len(Y)), y=Y)
        elif cv_dict['type'] == 'loso':  # Leave One Subject Out
            from sklearn.model_selection import LeaveOneGroupOut
            cv_inst = LeaveOneGroupOut()
            cv = cv_inst.split(X=np.zeros(len(Y)), y=Y, groups=cv_dict['subject_id'])
        else:
            raise ValueError("""Make sure you specify a dictionary of
                            {'type': 'kfolds', 'n_folds': n},
                            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
                            {'type': 'kfolds', 'n_folds': n,
                            'subject_id': holdout}, or {'type': 'loso',
                            'subject_id': holdout}, where n = number of folds,
                            and subject = vector of subject ids that
                            corresponds to self.Y""")
    else:
        raise ValueError("Make sure 'cv_dict' is a dictionary.")
    if return_generator:
        return cv
    else:
        return cv_inst 
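
A hedged usage sketch (not part of the nltools source; subject_ids is a hypothetical vector of subject labels) showing the 'subject_id' form of cv_dict, which routes to GroupKFold:

import numpy as np
import pandas as pd

Y = pd.DataFrame(np.random.randn(20))
subject_ids = np.repeat(np.arange(5), 4)  # hypothetical: 5 subjects, 4 rows each

# Hold out whole subjects within each fold (GroupKFold under the hood).
cv = set_cv(Y=Y, cv_dict={'type': 'kfolds', 'n_folds': 5,
                          'subject_id': subject_ids})
for train, test in cv:
    pass  # train/test are index arrays with disjoint subjects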
Example 22
Project: mlcomp   Author: lightforever   File: split.py    License: Apache License 2.0
def file_group_kfold(
        n_splits: int,
        output: str = None,
        get_group=None,
        sort=False,
        must_equal=(),
        **files
):
    assert len(files) > 0, 'at least 1 type of files is required'
    fold = GroupKFold(n_splits)
    keys = sorted(list(files))

    def get_name(file):
        return splitext(basename(file))[0]

    if sort:
        for k, v in files.items():
            files[k] = sorted(files[k], key=get_name)

    file_first = sorted(files[keys[0]])

    assert len(file_first) > n_splits, \
        f'at least {n_splits} files are required. Provided: {len(file_first)}'

    for k, v in files.items():
        assert len(files[k]) == len(file_first), \
            f'count of files in key = {k} is not the same as in {keys[0]}'

    for k, v in files.items():
        if k not in must_equal:
            continue
        for i in range(len(file_first)):
            names_equal = get_name(v[i]) == get_name(file_first[i])
            assert names_equal, \
                f'file name in {k} does not match {keys[0]}, ' \
                f'file name = {basename(v[i])}'

    df = pd.DataFrame(files)[keys]
    df['fold'] = 0

    groups = [
        i if not get_group else get_group(file)
        for i, file in enumerate(file_first)
    ]

    for i, (train_index,
            test_index) in enumerate(fold.split(groups, groups=groups)):
        df.loc[test_index, 'fold'] = i

    df = df.sample(frac=1)
    if output:
        df.to_csv(output, index=False)
    return df 
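
A hedged usage sketch (hypothetical file names; assumes the imports used by the listing above, e.g. basename from os.path). Here get_group maps each file to a patient id so that all slices of a patient land in the same fold:

from os.path import basename

images = ['data/p%d_slice%d.png' % (i // 2, i % 2) for i in range(6)]
masks = ['data/p%d_slice%d_mask.png' % (i // 2, i % 2) for i in range(6)]

df = file_group_kfold(n_splits=3, get_group=lambda f: basename(f).split('_')[0],
                      image=images, mask=masks)
# df has one (shuffled) row per image/mask pair plus a 'fold' column in [0, n_splits).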
Example 23
Project: fklearn   Author: nubank   File: splitters.py    License: Apache License 2.0
def out_of_time_and_space_splitter(train_data: pd.DataFrame,
                                   n_splits: int,
                                   in_time_limit: DateType,
                                   time_column: str,
                                   space_column: str,
                                   holdout_gap: timedelta = timedelta(days=0)) -> SplitterReturnType:
    """
    Makes K grouped train/test split folds for cross validation.
    The folds are made so that every ID is used at least once for
    evaluating and K-1 times for training. Also, for each fold, evaluation
    will always be out-of-ID and out-of-time.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame that will be split into K out-of-time and ID
        folds for cross validation.

    n_splits : int
        The number of folds K for the K-Fold cross validation strategy.

    in_time_limit : str or datetime.datetime
        A String representing the end time of the training data.
        It should be in the same format as the Date column in `train_data`.

    time_column : str
        The name of the Date column of `train_data`.

    space_column : str
        The name of the ID column of `train_data`.

    holdout_gap: datetime.timedelta
        Timedelta of the gap between the end of the training period and the start of the validation period.
    """

    # first generate folds by space, using GroupKFold
    # GroupKFold is not supposed to be randomized, that's why there's no random_state here
    train_data = train_data.reset_index()
    space_folds = GroupKFold(n_splits).split(train_data, groups=train_data[space_column])

    if isinstance(in_time_limit, str):
        in_time_limit = datetime.strptime(in_time_limit, "%Y-%m-%d")

    # train_indexes have time_column <= in_time_limit
    # test_indexes have time_column > in_time_limit
    folds = pipe(space_folds,
                 partial(starmap, lambda f_train, f_test: [train_data.iloc[f_train][time_column],
                                                           train_data.iloc[f_test][time_column]]),
                 partial(starmap, lambda train, test: (train[train <= in_time_limit],  # filter train time
                                                       test[test > (in_time_limit + holdout_gap)])),  # filter test time
                 list)

    logs = list(map(_log_time_fold, folds))  # get fold logs
    folds_indexes = _lc_fold_to_indexes(folds)  # final formatting with idx
    return folds_indexes, logs 
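
A hedged usage sketch of the splitter above, on toy data with hypothetical column names (assumes fklearn's splitters module provides the helpers it uses):

import pandas as pd

train_data = pd.DataFrame({
    'id': ['a', 'a', 'b', 'b', 'c', 'c'],
    'date': pd.to_datetime(['2020-01-01', '2020-03-01'] * 3),
    'x': range(6),
})
folds_indexes, logs = out_of_time_and_space_splitter(
    train_data, n_splits=3, in_time_limit='2020-02-01',
    time_column='date', space_column='id')
# Each fold trains on rows dated on or before the limit and evaluates on the
# held-out ids dated after it.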
Example 24
Project: Benchmarks   Author: ECP-CANDLE   File: skwrapper.py    License: MIT License
def split_data(df, ycol='0', classify=False, cv=5, bins=0, cutoffs=None, groupcols=None, ignore_categoricals=False, verbose=True):
    if groupcols is not None:
        groups = make_group_from_columns(df, groupcols)

    cat_cols = df.select_dtypes(['object']).columns
    if ignore_categoricals:
        df[cat_cols] = 0
    else:
        df[cat_cols] = df[cat_cols].apply(lambda x: x.astype('category').cat.codes)

    if ycol.isdigit():
        ycol = df.columns[int(ycol)]

    y = df.loc[:, ycol].values
    x = df.drop(ycol, axis=1).values
    features = df.drop(ycol, axis=1).columns.tolist()

    if verbose:
        print('Target column: {}'.format(ycol))
        print('  count = {}, uniq = {}, mean = {:.3g}, std = {:.3g}'.format(len(y), len(np.unique(y)), np.mean(y), np.std(y)))
        print('  min = {:.3g}, q1 = {:.3g}, median = {:.3g}, q3 = {:.3g}, max = {:.3g}'.format(np.min(y), np.percentile(y, 25), np.median(y), np.percentile(y, 75), np.max(y)))

    if not classify:
        y_even = discretize(y, bins=5, verbose=False)
    elif bins >= 2:
        y = discretize(y, bins=bins, min_count=cv, verbose=verbose)
    elif cutoffs:
        y = discretize(y, cutoffs=cutoffs, min_count=cv, verbose=verbose)
    elif df[ycol].dtype in [np.dtype('float64'), np.dtype('float32')]:
        warnings.warn('Warning: classification target is float; consider using --bins or --cutoffs')
        y = y.astype(int)

    if classify:
        mask = np.ones(len(y), dtype=bool)
        unique, counts = np.unique(y, return_counts=True)
        for v, c in zip(unique, counts):
            if c < cv:
                mask[y == v] = False
        x = x[mask]
        y = y[mask]
        removed = len(mask) - np.sum(mask)
        if removed and verbose:
            print('Removed {} rows in small classes: count < {}'.format(removed, cv))

    if groupcols is None:
        if classify:
            y_even = y
        skf = StratifiedKFold(n_splits=cv, shuffle=True)
        splits = skf.split(x, y_even)
    else:
        if classify:
            groups = groups[mask]
        gkf = GroupKFold(n_splits=cv)
        splits = gkf.split(x, y, groups)

    if verbose:
        print()

    return x, y, list(splits), features