Python sklearn.model_selection.TimeSeriesSplit() Examples

The following are 18 code examples showing how to use sklearn.model_selection.TimeSeriesSplit(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module sklearn.model_selection, or try the search function.

Example 1
Project: fight-churn   Author: carl24k   File: listing_9_3_backtest.py    License: MIT License 8 votes vote down vote up
def backtest(data_set_path,n_test_split):
    """Back-test an L1 logistic regression on time-ordered folds and save the scores.

    The data is loaded with ``prepare_data`` and a single-point grid search
    (``C=1``) is run over ``n_test_split`` ``TimeSeriesSplit`` folds, scoring
    each fold by lift and by ROC AUC.  The resulting ``cv_results_`` table is
    written next to the input file, with ``.csv`` replaced by
    ``_backtest.csv``.
    """
    features, outcome = prepare_data(data_set_path, as_retention=False)

    time_folds = TimeSeriesSplit(n_splits=n_test_split)

    scoring = {
        'lift': make_scorer(calc_lift, needs_proba=True),
        'AUC': 'roc_auc',
    }
    model = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=True)

    # Only one candidate is searched; the grid search is used here purely for
    # its cross-validated scoring.
    search = GridSearchCV(
        estimator=model,
        param_grid={'C': [1]},
        scoring=scoring,
        cv=time_folds,
        refit='AUC',
        return_train_score=False,
        verbose=1,
    )
    search.fit(features, outcome)

    scores = pd.DataFrame(search.cv_results_)
    out_path = data_set_path.replace('.csv', '_backtest.csv')
    scores.to_csv(out_path, index=False)
    print('Saved test scores to ' + out_path)
Example 2
Project: gordo   Author: equinor   File: test_anomaly_detectors.py    License: GNU Affero General Public License v3.0 6 votes vote down vote up
def test_diff_detector_cross_validate(return_estimator: bool):
    """
    Calling DiffBasedAnomalyDetector.cross_validate should yield the same
    result keys as sklearn.model_selection.cross_validate with the same
    arguments.

    The detector forces `return_estimator` to True regardless of what the
    caller passes, because it needs the intermediate models to calculate
    the thresholds; hence the sklearn call is made with it set to True.
    """
    features = np.random.random((100, 10))
    targets = np.random.random((100, 1))

    detector = DiffBasedAnomalyDetector(base_estimator=LinearRegression())
    splitter = TimeSeriesSplit(n_splits=3)

    detector_results = detector.cross_validate(
        X=features, y=targets, cv=splitter, return_estimator=return_estimator
    )
    sklearn_results = cross_validate(
        detector, X=features, y=targets, cv=splitter, return_estimator=True
    )

    assert detector_results.keys() == sklearn_results.keys()
Example 3
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License 6 votes vote down vote up
def test_2d_y():
    # Smoke test: every splitter must accept 1-d and column-vector targets;
    # for multi-label targets a splitter may either work or raise the
    # "supported target types" ValueError.
    n_samples = 30
    rng = np.random.RandomState(1)
    features = rng.randint(0, 3, size=(n_samples, 2))
    labels = rng.randint(0, 3, size=(n_samples,))
    labels_column = labels.reshape(-1, 1)
    labels_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    group_ids = rng.randint(0, 3, size=(n_samples,))

    splitters = [
        LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
        RepeatedKFold(), RepeatedStratifiedKFold(),
        ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
        GroupShuffleSplit(), LeaveOneGroupOut(),
        LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
        PredefinedSplit(test_fold=group_ids),
    ]

    # The only error message tolerated for multi-label targets.
    expected_fragment = "Supported target types are: {}. Got 'multilabel".format(
        ('binary', 'multiclass'))

    for cv in splitters:
        list(cv.split(features, labels, group_ids))
        list(cv.split(features, labels_column, group_ids))
        try:
            list(cv.split(features, labels_multilabel, group_ids))
        except ValueError as err:
            assert expected_fragment in str(err)
Example 4
Project: fight-churn   Author: carl24k   File: listing_9_6_crossvalidate_xgb.py    License: MIT License 6 votes vote down vote up
def crossvalidate_xgb(data_set_path,n_test_split):
    """Grid-search an XGBoost classifier on time-ordered folds and save all outputs.

    Steps, in order:
      1. load the data with ``prepare_data``;
      2. grid-search depth, learning rate, tree count and min child weight
         over ``n_test_split`` ``TimeSeriesSplit`` folds, scoring by lift
         and ROC AUC and refitting on the best AUC;
      3. write the CV results (best AUC first) to ``*_crossval_xgb.csv``;
      4. pickle the refitted best estimator to ``*_xgb_model.pkl``;
      5. write its predicted probabilities to ``*_xgb_predictions.csv``;
      6. hand the predictions to ``forecast_histogram``.

    All output paths are derived from ``data_set_path`` by replacing ``.csv``.
    """
    X, y = prepare_data(data_set_path, ext='', as_retention=False)

    time_folds = TimeSeriesSplit(n_splits=n_test_split)
    scoring = {
        'lift': make_scorer(calc_lift, needs_proba=True),
        'AUC': 'roc_auc',
    }

    classifier = xgb.XGBClassifier(objective='binary:logistic')
    param_grid = {
        'max_depth': [1, 2, 4, 6],
        'learning_rate': [0.1, 0.2, 0.3, 0.4],
        'n_estimators': [20, 40, 80, 120],
        'min_child_weight': [3, 6, 9, 12],
    }
    search = GridSearchCV(
        estimator=classifier,
        param_grid=param_grid,
        scoring=scoring,
        cv=time_folds,
        refit='AUC',
        return_train_score=False,
        n_jobs=-1,
        verbose=1,
    )
    # The estimator is fitted on the raw array rather than the DataFrame.
    search.fit(X.values, y)

    cv_table = pd.DataFrame(search.cv_results_).sort_values(
        'mean_test_AUC', ascending=False)
    cv_path = data_set_path.replace('.csv', '_crossval_xgb.csv')
    cv_table.to_csv(cv_path, index=False)
    print('Saved test scores to ' + cv_path)

    best_model = search.best_estimator_
    model_path = data_set_path.replace('.csv', '_xgb_model.pkl')
    with open(model_path, 'wb') as model_file:
        pickle.dump(best_model, model_file)
    print('Saved model pickle to ' + model_path)

    probabilities = best_model.predict_proba(X.values)
    predict_df = pd.DataFrame(
        probabilities, index=X.index, columns=['retain_prob', 'churn_prob'])
    predictions_path = data_set_path.replace('.csv', '_xgb_predictions.csv')
    print('Saving results to %s' % predictions_path)
    predict_df.to_csv(predictions_path, header=True)

    forecast_histogram(data_set_path, predict_df, ext='xgb')
Example 5
Project: fight-churn   Author: carl24k   File: churn_calc.py    License: MIT License 6 votes vote down vote up
def prepare_xy(self,groups=True):
        """Build the feature frame and churn target, ordered by observation date.

        With ``groups`` truthy the behaviour grouping is applied and the
        reduced data / grouped columns are used; otherwise the data is
        normalised and the scores / metric columns are used.

        Returns a ``(X, y)`` pair where ``X`` holds the selected columns and
        ``y`` is the ``is_churn`` column, both in date-sorted row order.
        """
        if not groups:
            self.normalize_skewscale()
            frame = pd.DataFrame(self.data_scores)
            feature_columns = self.metric_columns
        else:
            self.apply_behavior_grouping()
            frame = pd.DataFrame(self.churn_data_reduced)
            feature_columns = self.grouped_columns

        # TimeSeriesSplit only works properly on chronologically ordered
        # rows, so attach the observation dates and sort on them in place.
        frame['temp_obs_date'] = self.observe_dates.values
        frame.sort_values('temp_obs_date', inplace=True)

        return frame[feature_columns], frame['is_churn']
Example 6
Project: fight-churn   Author: carl24k   File: churn_calc.py    License: MIT License 6 votes vote down vote up
def crossvalidate_churn_model(self,model_code,groups=True):
        """Cross-validate the model identified by ``model_code`` and save the results.

        The data comes from ``self.prepare_xy(groups)``; the parameter grid
        and estimator come from ``self.cv_params`` / ``self.model_instance``.
        A grid search over three ``TimeSeriesSplit`` folds is scored by
        top-decile lift and ROC AUC, refitting on AUC.  The ``cv_results_``
        table is written to the path given by ``self.save_path`` and is
        also returned.
        """
        X, y = self.prepare_xy(groups)
        param_grid = self.cv_params(model_code)
        estimator = self.model_instance(model_code)

        scoring = {
            'lift_scorer': make_scorer(top_decile_lift, needs_proba=True),
            'AUC': 'roc_auc',
        }
        search = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            scoring=scoring,
            cv=TimeSeriesSplit(n_splits=3),
            refit='AUC',
            return_train_score=True,
            n_jobs=8,
            verbose=5,
        )
        search.fit(X, y)

        result_df = pd.DataFrame(search.cv_results_)
        # Ranking is only applied when the grid has more than one parameter.
        if len(param_grid) > 1:
            result_df = result_df.sort_values('mean_test_AUC', ascending=False)

        save_path = self.save_path(
            model_code + '_CV',
            subdir=self.grouping_correlation_subdir(groups),
        )
        result_df.to_csv(save_path)
        print('Saved result to ' + save_path)
        return result_df
Example 7
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_split.py    License: MIT License 6 votes vote down vote up
def test_2d_y():
    # Smoke test: each splitter has to cope with flat and column-shaped
    # targets; multi-label targets may be rejected, but only with the
    # "supported target types" ValueError.
    n_samples = 30
    rng = np.random.RandomState(1)
    data = rng.randint(0, 3, size=(n_samples, 2))
    target = rng.randint(0, 3, size=(n_samples,))
    target_2d = target.reshape(-1, 1)
    target_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    group_labels = rng.randint(0, 3, size=(n_samples,))

    all_splitters = [
        LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
        RepeatedKFold(), RepeatedStratifiedKFold(),
        ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
        GroupShuffleSplit(), LeaveOneGroupOut(),
        LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
        PredefinedSplit(test_fold=group_labels),
    ]

    # Message fragment expected when a splitter refuses multi-label input.
    allowed_types = ('binary', 'multiclass')
    rejection = "Supported target types are: {}. Got 'multilabel".format(
        allowed_types)

    for splitter in all_splitters:
        for accepted_target in (target, target_2d):
            list(splitter.split(data, accepted_target, group_labels))
        try:
            list(splitter.split(data, target_multilabel, group_labels))
        except ValueError as exc:
            assert rejection in str(exc)
Example 8
Project: gordo   Author: equinor   File: test_model.py    License: GNU Affero General Public License v3.0 5 votes vote down vote up
def test_keras_autoencoder_crossval(model, kind):
    """
    Check that the named gordo model, wrapped in a Pipeline, can be
    cross-validated with a TimeSeriesSplit and yields an array of scores.
    """
    # Resolve the model class from its name inside gordo's models module.
    model_cls = pydoc.locate(f"gordo.machine.model.models.{model}")
    pipeline = Pipeline([("model", model_cls(kind=kind))])

    # Autoencoder setup: the target is a copy of the input.
    inputs = np.random.random(size=(15, 2))
    targets = inputs.copy()

    splitter = TimeSeriesSplit(n_splits=2, max_train_size=2)
    scores = cross_val_score(pipeline, inputs, targets, cv=splitter)

    assert isinstance(scores, np.ndarray)
    logger.info(f"Mean score: {scores.mean():.4f} - Std score: {scores.std():.4f}")
Example 9
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License 5 votes vote down vote up
def test_time_series_cv():
    samples = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]

    # Asking for as many splits as there are samples must be rejected
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next,
                         TimeSeriesSplit(n_splits=7).split(samples))

    tscv = TimeSeriesSplit(2)

    # Each case pairs a fold iterator with the (train, test) indices it is
    # expected to produce, in order: first on six samples, then on all seven.
    # The expected values show that the original ordering is preserved.
    cases = [
        (tscv.split(samples[:-1]),
         [([0, 1], [2, 3]),
          ([0, 1, 2, 3], [4, 5])]),
        (TimeSeriesSplit(2).split(samples),
         [([0, 1, 2], [3, 4]),
          ([0, 1, 2, 3, 4], [5, 6])]),
    ]
    for fold_iter, expected_folds in cases:
        for expected_train, expected_test in expected_folds:
            train, test = next(fold_iter)
            assert_array_equal(train, expected_train)
            assert_array_equal(test, expected_test)

    # get_n_splits must agree with the number of folds actually generated
    n_generated = len(list(TimeSeriesSplit(2).split(samples)))
    assert_equal(n_generated, tscv.get_n_splits())
    assert_equal(n_generated, 2)
Example 10
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License 5 votes vote down vote up
def test_time_series_max_train_size():
    """Check that ``max_train_size`` caps the training window of each fold.

    The uncapped folds serve as the reference and are compared against
    capped folds for three cap values via
    ``_check_time_series_max_train_size``.
    """
    X = np.zeros((6, 1))

    # split() returns a one-shot generator.  Materialise the reference folds
    # so that every check below compares against the full set; reusing the
    # generator would leave the second and third checks with nothing to
    # iterate over, making them pass vacuously.
    splits = list(TimeSeriesSplit(n_splits=3).split(X))

    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=3).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=3)

    # Test for the case where the size of a fold is greater than max_train_size
    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=2).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=2)

    # Test for the case where the size of each fold is less than max_train_size
    # NOTE(review): the helper is told the same cap the splitter was built
    # with (5); the value 2 used previously did not match this case --
    # confirm against the helper's definition.
    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=5).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=5)
Example 11
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License 5 votes vote down vote up
def test_nsplit_default_warn():
    # Constructing a splitter without n_splits must emit the FutureWarning
    # about the changing default; passing n_splits explicitly must be
    # silent.  Will be removed in 0.22
    splitter_classes = (KFold, GroupKFold, StratifiedKFold, TimeSeriesSplit)

    for splitter_cls in splitter_classes:
        assert_warns_message(FutureWarning, NSPLIT_WARNING, splitter_cls)

    for splitter_cls in splitter_classes:
        assert_no_warnings(splitter_cls, n_splits=5)
Example 12
Project: timeserio   Author: octoenergy   File: time_series_split.py    License: MIT License 5 votes vote down vote up
def split(self, df, y=None, groups=None):
        """Yield time-series folds computed per group and merged across groups.

        The rows of ``df`` are grouped by ``self.groupby``.  Each group gets
        its own ``TimeSeriesSplit`` (configured from ``self.n_splits`` and
        ``self.max_train_size``), and the i-th fold of every group is
        concatenated into the i-th fold that is yielded.

        Parameters
        ----------
        df : DataFrame to split; validated with ``self._validate_df``.
        y : optional target, indexed with each group's row positions and
            passed through to the per-group splitter.
        groups : ignored; grouping is taken from ``self.groupby``.

        Yields
        ------
        (train, test) : pair of integer arrays holding row *positions* in
            ``df``.  Generation stops once no group has folds left.
        """
        self._validate_df(df)
        # Maps each group key to the positions of its rows in ``df``.
        group_positions = df.groupby(self.groupby).indices
        fold_iters = {}
        while True:
            train_parts, test_parts = [], []
            for key, sub_idx in group_positions.items():
                if key not in fold_iters:
                    sub_y = y[sub_idx] if y is not None else None
                    # Keyword arguments: recent scikit-learn releases accept
                    # max_train_size by keyword only.
                    splitter = TimeSeriesSplit(
                        n_splits=self.n_splits,
                        max_train_size=self.max_train_size,
                    )
                    fold_iters[key] = splitter.split(df.iloc[sub_idx], sub_y)

                fold = next(fold_iters[key], None)
                if fold is None:
                    # This group has no more folds to contribute.
                    continue

                # The splitter returns positions within the group; map them
                # straight to positions in ``df``.  This avoids a label
                # lookup per row and also works when the index has
                # duplicate labels.
                positions = np.asarray(sub_idx)
                local_train, local_test = fold
                train_parts.append(positions[local_train])
                test_parts.append(positions[local_test])

            if not train_parts:
                break

            yield np.concatenate(train_parts), np.concatenate(test_parts)
Example 13
Project: fight-churn   Author: carl24k   File: listing_9_5_crossvalidate.py    License: MIT License 5 votes vote down vote up
def crossvalidate(data_set_path,n_test_split):
    """Cross-validate L1 logistic regression over a range of C values.

    Data is loaded with ``prepare_data``; a grid search over nine values of
    ``C`` runs on ``n_test_split`` ``TimeSeriesSplit`` folds, scored by lift
    and ROC AUC, without refitting a final model.  The results table gets an
    ``n_weights`` column from ``test_n_weights``, is written to
    ``*_crossval.csv`` and is then passed to ``plot_regression_test``.
    """
    X, y = prepare_data(data_set_path, as_retention=False)

    time_folds = TimeSeriesSplit(n_splits=n_test_split)
    scoring = {
        'lift': make_scorer(calc_lift, needs_proba=True),
        'AUC': 'roc_auc',
    }
    regression = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=True)
    test_params = {'C': [0.64, 0.32, 0.16, 0.08, 0.04, 0.02, 0.01, 0.005, 0.0025]}

    search = GridSearchCV(
        estimator=regression,
        param_grid=test_params,
        scoring=scoring,
        cv=time_folds,
        refit=False,
        return_train_score=False,
        verbose=1,
    )
    search.fit(X, y)

    result_df = pd.DataFrame(search.cv_results_)
    result_df['n_weights'] = test_n_weights(X, y, test_params)

    output_path = data_set_path.replace('.csv', '_crossval.csv')
    result_df.to_csv(output_path, index=False)
    plot_regression_test(data_set_path, result_df)
Example 14
Project: pandas-ml   Author: pandas-ml   File: test_model_selection.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_objectmapper(self):
        """Every name exposed on ``df.model_selection`` is the ``ms`` object itself."""
        df = pdml.ModelFrame([])

        # Splitter Classes
        # (StratifiedShuffleSplit is deliberately left out of this check.)
        splitter_names = (
            'KFold', 'GroupKFold', 'StratifiedKFold',
            'LeaveOneGroupOut', 'LeavePGroupsOut', 'LeaveOneOut', 'LeavePOut',
            'ShuffleSplit', 'GroupShuffleSplit',
            'PredefinedSplit', 'TimeSeriesSplit',
        )

        # Splitter Functions

        # Hyper-parameter optimizers
        optimizer_names = (
            'GridSearchCV', 'RandomizedSearchCV',
            'ParameterGrid', 'ParameterSampler',
        )

        for name in splitter_names + optimizer_names:
            self.assertIs(getattr(df.model_selection, name), getattr(ms, name))

        # Model validation
Example 15
Project: pandas-ml   Author: pandas-ml   File: test_model_selection.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_objectmapper_abbr(self):
        """Every name exposed on the abbreviated ``df.ms`` accessor is the ``ms`` object itself."""
        df = pdml.ModelFrame([])

        # Splitter Classes
        # (StratifiedShuffleSplit is deliberately left out of this check.)
        splitter_names = (
            'KFold', 'GroupKFold', 'StratifiedKFold',
            'LeaveOneGroupOut', 'LeavePGroupsOut', 'LeaveOneOut', 'LeavePOut',
            'ShuffleSplit', 'GroupShuffleSplit',
            'PredefinedSplit', 'TimeSeriesSplit',
        )

        # Splitter Functions

        # Hyper-parameter optimizers
        optimizer_names = (
            'GridSearchCV', 'RandomizedSearchCV',
            'ParameterGrid', 'ParameterSampler',
        )

        for name in splitter_names + optimizer_names:
            self.assertIs(getattr(df.ms, name), getattr(ms, name))

        # Model validation
Example 16
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_split.py    License: MIT License 5 votes vote down vote up
def test_time_series_cv():
    rows = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]

    # n_splits equal to the sample count leaves too few samples per fold,
    # so pulling the first fold has to raise
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next,
                         TimeSeriesSplit(n_splits=7).split(rows))

    tscv = TimeSeriesSplit(2)

    def check_folds(fold_iter, expected_folds):
        # Pull one fold per expected entry and compare its indices exactly.
        for want_train, want_test in expected_folds:
            got_train, got_test = next(fold_iter)
            assert_array_equal(got_train, want_train)
            assert_array_equal(got_test, want_test)

    # Ordering must be preserved on an even number of samples ...
    check_folds(tscv.split(rows[:-1]),
                [([0, 1], [2, 3]),
                 ([0, 1, 2, 3], [4, 5])])

    # ... and on an odd number, where the first training set is larger
    check_folds(TimeSeriesSplit(2).split(rows),
                [([0, 1, 2], [3, 4]),
                 ([0, 1, 2, 3, 4], [5, 6])])

    # The reported number of splits must match what is actually generated
    fold_count = len(list(TimeSeriesSplit(2).split(rows)))
    assert_equal(fold_count, tscv.get_n_splits())
    assert_equal(fold_count, 2)
Example 17
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_split.py    License: MIT License 5 votes vote down vote up
def test_time_series_max_train_size():
    """Verify that ``max_train_size`` limits the training indices of each fold.

    Folds produced without a cap are the reference; folds produced with
    three different caps are checked against them by
    ``_check_time_series_max_train_size``.
    """
    X = np.zeros((6, 1))

    # The reference folds are stored in a list because split() yields from a
    # generator that can be consumed only once; handing the bare generator
    # to several checks would leave all but the first with an empty
    # reference and nothing to verify.
    splits = list(TimeSeriesSplit(n_splits=3).split(X))

    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=3).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=3)

    # Test for the case where the size of a fold is greater than max_train_size
    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=2).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=2)

    # Test for the case where the size of each fold is less than max_train_size
    # NOTE(review): the cap given to the helper now equals the splitter's
    # cap (5) instead of the mismatched 2 -- confirm against the helper's
    # definition.
    check_splits = TimeSeriesSplit(n_splits=3, max_train_size=5).split(X)
    _check_time_series_max_train_size(splits, check_splits, max_train_size=5)
Example 18
Project: Persimmon   Author: AlvarBer   File: tssplitblock.py    License: MIT License 5 votes vote down vote up
def function(self):
        """Store a default-constructed TimeSeriesSplit in ``self.out_1.val``."""
        splitter = TimeSeriesSplit()
        self.out_1.val = splitter