Python sklearn.model_selection.RepeatedStratifiedKFold() Examples

The following are 16 code examples for showing how to use sklearn.model_selection.RepeatedStratifiedKFold(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module sklearn.model_selection , or try the search function .

Example 1
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License 6 votes vote down vote up
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
                 PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e) 
Example 2
Project: hyperparameter_hunter   Author: HunterMcGushion   File: lambda_callback_example.py    License: MIT License 6 votes vote down vote up
def execute():
    env = Environment(
        train_dataset=get_toy_classification_data(),
        results_path="HyperparameterHunterAssets",
        metrics=["roc_auc_score"],
        cv_type=RepeatedStratifiedKFold,
        cv_params=dict(n_splits=5, n_repeats=2, random_state=32),
        runs=2,
        # Just instantiate `Environment` with your list of callbacks, and go about business as usual
        experiment_callbacks=[printer_callback(), confusion_matrix_oof()],
        # In addition to `printer_callback` made above, we're also adding the `confusion_matrix_oof` callback
        # This, and other callbacks, can be found in `hyperparameter_hunter.callbacks.recipes`
    )

    experiment = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params={},
        model_extra_params=dict(fit=dict(verbose=False)),
    ) 
Example 3
Project: pyrsa   Author: Charestlab   File: create.py    License: GNU Lesser General Public License v3.0 6 votes vote down vote up
def rdm_lda_kfold(x, labels):
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.model_selection import RepeatedStratifiedKFold
    from sklearn.model_selection import cross_val_score
    lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
    folding = RepeatedStratifiedKFold(n_splits=3, n_repeats=3)

    objects = numpy.unique(labels)
    pairs = list(itertools.combinations(objects, 2))
    npairs = len(pairs)
    utv = numpy.full([npairs,], numpy.nan)
    for p in trange(npairs, desc='pairs', leave=False, ascii=True):
        pair = pairs[p]
        pair_mask = numpy.isin(labels, pair)
        x_pair = x[pair_mask, :]
        labels_pair = labels[pair_mask]
        scores = cross_val_score(lda, x_pair, labels_pair, cv=folding)
        utv[p] = scores.mean()
    return utv 
Example 4
Project: autogluon   Author: awslabs   File: utils.py    License: Apache License 2.0 6 votes vote down vote up
def generate_kfold(X, y=None, n_splits=5, random_state=0, stratified=False, n_repeats=1):
    if stratified and (y is not None):
        if n_repeats > 1:
            kf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
        else:
            kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

        kf.get_n_splits(X, y)
        return [[train_index, test_index] for train_index, test_index in kf.split(X, y)]
    else:
        if n_repeats > 1:
            kf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
        else:
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

        kf.get_n_splits(X)
        return [[train_index, test_index] for train_index, test_index in kf.split(X)] 
Example 5
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_split.py    License: MIT License 6 votes vote down vote up
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
                 PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e) 
Example 6
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License 5 votes vote down vote up
def test_repeated_cv_value_errors():
    # n_repeats is not integer or <= 0
    for cv in (RepeatedKFold, RepeatedStratifiedKFold):
        assert_raises(ValueError, cv, n_repeats=0)
        assert_raises(ValueError, cv, n_repeats=1.5) 
Example 7
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License 5 votes vote down vote up
def test_get_n_splits_for_repeated_stratified_kfold():
    n_splits = 3
    n_repeats = 4
    rskf = RepeatedStratifiedKFold(n_splits, n_repeats)
    expected_n_splits = n_splits * n_repeats
    assert_equal(expected_n_splits, rskf.get_n_splits()) 
Example 8
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License 5 votes vote down vote up
def test_repeated_stratified_kfold_determinstic_split():
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    y = [1, 1, 1, 0, 0]
    random_state = 1944695409
    rskf = RepeatedStratifiedKFold(
        n_splits=2,
        n_repeats=2,
        random_state=random_state)

    # split should produce same and deterministic splits on
    # each call
    for _ in range(3):
        splits = rskf.split(X, y)
        train, test = next(splits)
        assert_array_equal(train, [1, 4])
        assert_array_equal(test, [0, 2, 3])

        train, test = next(splits)
        assert_array_equal(train, [0, 2, 3])
        assert_array_equal(test, [1, 4])

        train, test = next(splits)
        assert_array_equal(train, [2, 3])
        assert_array_equal(test, [0, 1, 4])

        train, test = next(splits)
        assert_array_equal(train, [0, 1, 4])
        assert_array_equal(test, [2, 3])

        assert_raises(StopIteration, next, splits) 
Example 9
Project: hyperparameter_hunter   Author: HunterMcGushion   File: do_full_save_example.py    License: MIT License 5 votes vote down vote up
def execute():
    env = Environment(
        train_dataset=get_toy_classification_data(),
        results_path="HyperparameterHunterAssets",
        metrics=["roc_auc_score"],
        cv_type=RepeatedStratifiedKFold,
        cv_params=dict(n_splits=3, n_repeats=2, random_state=32),
        do_full_save=do_full_save,
    )

    experiment_0 = CVExperiment(
        model_initializer=XGBClassifier, model_init_params=dict(subsample=0.01)
    )
    # Pro Tip: By setting XGBoost's subsample ridiculously low, we can get bad scores on purpose

    # Upon completion of this Experiment, we see a warning that not all result files will be saved
    # This is because the final score of the Experiment was below our threshold of 0.75
    # Specifically, we skipped saving prediction files (OOF, holdout, test, or in-fold), and the heartbeat file

    # What still got saved is the Experiment's: key information, leaderboard position, and description file
    # These are saved to allow us to use the information for future hyperparameter optimization, and detect repeated Experiments
    # Additionally, the Experiment's script backup is saved, but that's because its one of the first things that happens
    # For even finer control over what gets saved, use `do_full_save` together with `file_blacklist`

    # Now, lets perform another Experiment that does a bit better than our intentionally miserable one
    experiment_1 = CVExperiment(
        model_initializer=XGBClassifier, model_init_params=dict(subsample=0.5)
    )
    # Our second Experiment was executed in the same Environment, so it was still subject to the `do_full_save` constraint
    # However, because it scored above 0.75 (hopefully), all of the result files were saved 
Example 10
Project: hyperparameter_hunter   Author: HunterMcGushion   File: test_general.py    License: MIT License 5 votes vote down vote up
def env_0():
    def do_full_save(experiment_result):
        return experiment_result["final_evaluations"]["oof"]["roc_auc_score"] > 0.75

    return Environment(
        train_dataset=get_toy_classification_data(),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        cv_type=RepeatedStratifiedKFold,
        cv_params=dict(n_splits=3, n_repeats=2, random_state=32),
        do_full_save=do_full_save,
    ) 
Example 11
Project: hyperparameter_hunter   Author: HunterMcGushion   File: test_general.py    License: MIT License 5 votes vote down vote up
def env_3():
    def printer_callback():
        def printer_helper(_rep, _fold, _run, last_evaluation_results):
            print(f"{_rep}.{_fold}.{_run}   {last_evaluation_results}")

        return lambda_callback(
            on_exp_start=printer_helper,
            on_exp_end=printer_helper,
            on_rep_start=printer_helper,
            on_rep_end=printer_helper,
            on_fold_start=printer_helper,
            on_fold_end=printer_helper,
            on_run_start=printer_helper,
            on_run_end=printer_helper,
        )

    return Environment(
        train_dataset=get_toy_classification_data(),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        holdout_dataset=get_toy_classification_data(),
        cv_type=RepeatedStratifiedKFold,
        cv_params=dict(n_splits=3, n_repeats=2, random_state=32),
        runs=2,
        experiment_callbacks=[
            printer_callback(),
            confusion_matrix_oof(),
            confusion_matrix_holdout(),
        ],
    ) 
Example 12
Project: hyperparameter_hunter   Author: HunterMcGushion   File: test_environment.py    License: MIT License 5 votes vote down vote up
def test_experiment_callbacks_setter_value_error(env_fixture_0):
    with pytest.raises(ValueError, match="experiment_callbacks must be LambdaCallback instances.*"):
        env_fixture_0.experiment_callbacks = [RepeatedStratifiedKFold]


##################################################
# `define_holdout_set` Scenarios
################################################## 
Example 13
Project: catalyst   Author: catalyst-team   File: find_thresholds.py    License: Apache License 2.0 5 votes vote down vote up
def find_best_threshold(
    y_pred: np.ndarray,
    y_true: np.ndarray,
    metric_fn: Callable = metrics.roc_auc_score,
    num_splits: int = 5,
    num_repeats: int = 1,
    random_state: int = 42,
):
    """@TODO: Docs. Contribution is welcome."""
    rkf = RepeatedStratifiedKFold(
        n_splits=num_splits, n_repeats=num_repeats, random_state=random_state
    )
    fold_thresholds = []
    fold_metrics = {k: [] for k in _BINARY_PER_CLASS_METRICS.copy()}

    for train_index, test_index in rkf.split(y_true, y_true):
        y_pred_train, y_pred_test = y_pred[train_index], y_pred[test_index]
        y_true_train, y_true_test = y_true[train_index], y_true[test_index]

        best_threshold = find_best_split_threshold(
            y_pred_train, y_true_train, metric=metric_fn
        )
        best_predictions = (y_pred_test >= best_threshold).astype(int)

        for metric_name in fold_metrics.keys():
            try:
                metric_value = metrics.__dict__[metric_name](
                    y_true_test, best_predictions
                )
            except ValueError:
                metric_value = 0.0

            fold_metrics[metric_name].append(metric_value)
        fold_thresholds.append(best_threshold)

    fold_best_threshold = np.mean(fold_thresholds)
    for metric_name in fold_metrics:
        fold_metrics[metric_name] = np.mean(fold_metrics[metric_name])

    return fold_best_threshold, fold_metrics 
Example 14
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_split.py    License: MIT License 5 votes vote down vote up
def test_repeated_cv_value_errors():
    # n_repeats is not integer or <= 0
    for cv in (RepeatedKFold, RepeatedStratifiedKFold):
        assert_raises(ValueError, cv, n_repeats=0)
        assert_raises(ValueError, cv, n_repeats=1.5) 
Example 15
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_split.py    License: MIT License 5 votes vote down vote up
def test_get_n_splits_for_repeated_stratified_kfold():
    n_splits = 3
    n_repeats = 4
    rskf = RepeatedStratifiedKFold(n_splits, n_repeats)
    expected_n_splits = n_splits * n_repeats
    assert_equal(expected_n_splits, rskf.get_n_splits()) 
Example 16
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_split.py    License: MIT License 5 votes vote down vote up
def test_repeated_stratified_kfold_determinstic_split():
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    y = [1, 1, 1, 0, 0]
    random_state = 1944695409
    rskf = RepeatedStratifiedKFold(
        n_splits=2,
        n_repeats=2,
        random_state=random_state)

    # split should produce same and deterministic splits on
    # each call
    for _ in range(3):
        splits = rskf.split(X, y)
        train, test = next(splits)
        assert_array_equal(train, [1, 4])
        assert_array_equal(test, [0, 2, 3])

        train, test = next(splits)
        assert_array_equal(train, [0, 2, 3])
        assert_array_equal(test, [1, 4])

        train, test = next(splits)
        assert_array_equal(train, [2, 3])
        assert_array_equal(test, [0, 1, 4])

        train, test = next(splits)
        assert_array_equal(train, [0, 1, 4])
        assert_array_equal(test, [2, 3])

        assert_raises(StopIteration, next, splits)