Python sklearn.model_selection.StratifiedKFold() Examples

The following are 29 code examples of sklearn.model_selection.StratifiedKFold(), collected from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.model_selection.
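
As a quick orientation before the project examples: StratifiedKFold.split(X, y) yields (train_index, test_index) pairs in which every fold preserves the class proportions of y. Here is a minimal, self-contained sketch using made-up toy data:

import numpy as np
from sklearn.model_selection import StratifiedKFold

# toy data: six samples with a 2:1 class ratio
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([0, 0, 0, 0, 1, 1])

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    # each test fold keeps the 2:1 class ratio (two 0s, one 1)
    print("fold %d: train=%s test=%s" % (fold, train_index, test_index))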
Example #1
Source File: train_eval.py    From pytorch_geometric with MIT License
def k_fold(dataset, folds):
    skf = StratifiedKFold(folds, shuffle=True, random_state=12345)

    test_indices, train_indices = [], []
    for _, idx in skf.split(torch.zeros(len(dataset)), dataset.data.y):
        test_indices.append(torch.from_numpy(idx).to(torch.long))

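    # the validation fold for split i is the test fold of split i - 1 (wrapping around at i = 0)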
    val_indices = [test_indices[i - 1] for i in range(folds)]

    for i in range(folds):
        train_mask = torch.ones(len(dataset), dtype=torch.bool)
        train_mask[test_indices[i]] = 0
        train_mask[val_indices[i]] = 0
        train_indices.append(train_mask.nonzero().view(-1))

    return train_indices, test_indices, val_indices 
Example #2
Source File: test_search.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y) 
Example #3
Source File: keras_models.py    From gentun with Apache License 2.0
def cross_validate(self):
        """Train model using k-fold cross validation and
        return mean value of the validation accuracy.
        """
        acc = .0
        kfold = StratifiedKFold(n_splits=self.kfold, shuffle=True)
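        # np.where(self.y_train == 1)[1] recovers class indices from one-hot labels for stratification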
        for fold, (train, validation) in enumerate(kfold.split(self.x_train, np.where(self.y_train == 1)[1])):
            print("KFold {}/{}".format(fold + 1, self.kfold))
            self.reset_weights()
            for epochs, learning_rate in zip(self.epochs, self.learning_rate):
                print("Training {} epochs with learning rate {}".format(epochs, learning_rate))
                self.model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
                self.model.fit(
                    self.x_train[train], self.y_train[train], epochs=epochs, batch_size=self.batch_size, verbose=1
                )
            acc += self.model.evaluate(self.x_train[validation], self.y_train[validation], verbose=0)[1] / self.kfold
        return acc 
Example #4
Source File: stratifiedKfold.py    From RecommenderSystems with MIT License
def main():

    train_x, train_y = _load_data()
    print('loading data done!')

    folds = list(StratifiedKFold(n_splits=10, shuffle=True,
                             random_state=config.RANDOM_SEED).split(train_x, train_y))

    fold_index = []
    for i,(train_id, valid_id) in enumerate(folds):
        fold_index.append(valid_id)

    print("fold num: %d" % (len(fold_index)))

    fold_index = np.array(fold_index)
    np.save(config.DATA_PATH +  "fold_index.npy", fold_index)

    save_x_y(fold_index, train_x, train_y)
    print("save train_x_y done!")

    fold_index = np.load(config.DATA_PATH +  "fold_index.npy")
    save_i(fold_index)
    print("save index done!") 
Example #5
Source File: dataloader.py    From dgl with Apache License 2.0
def _split_fold10(self, labels, fold_idx=0, seed=0, shuffle=True):
        ''' 10-fold split '''
        assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."

        skf = StratifiedKFold(n_splits=10, shuffle=shuffle, random_state=seed)
        idx_list = []
        for idx in skf.split(np.zeros(len(labels)), labels):    # split(x, y)
            idx_list.append(idx)
        train_idx, valid_idx = idx_list[fold_idx]

        print("train_set : test_set = %d : %d" %
              (len(train_idx), len(valid_idx)))

        return train_idx, valid_idx 
Example #6
Source File: test.py    From rasa_nlu with Apache License 2.0
def generate_folds(n, td):
    """Generates n cross validation folds for training data td."""

    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=n, shuffle=True)
    x = td.intent_examples
    y = [example.get("intent") for example in x]
    for i_fold, (train_index, test_index) in enumerate(skf.split(x, y)):
        logger.debug("Fold: {}".format(i_fold))
        train = [x[i] for i in train_index]
        test = [x[i] for i in test_index]
        yield (TrainingData(training_examples=train,
                            entity_synonyms=td.entity_synonyms,
                            regex_features=td.regex_features),
               TrainingData(training_examples=test,
                            entity_synonyms=td.entity_synonyms,
                            regex_features=td.regex_features)) 
Example #7
Source File: dataset.py    From heamy with MIT License
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
        """K-Folds cross validation iterator.

        Parameters
        ----------
        k : int, default 5
        stratify : bool, default False
        shuffle : bool, default True
        seed : int, default 33

        Yields
        -------
        X_train, y_train, X_test, y_test, train_index, test_index
        """
        if stratify:
            kf = StratifiedKFold(n_splits=k, random_state=seed, shuffle=shuffle)
        else:
            kf = KFold(n_splits=k, random_state=seed, shuffle=shuffle)

        for train_index, test_index in kf.split(self.X_train, self.y_train):
            X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
            X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
            yield X_train, y_train, X_test, y_test, train_index, test_index 
Example #8
Source File: stratifiedKfold.py    From AutoInt with MIT License
def main():

    train_x, train_y = _load_data()
    print('loading data done!')

    folds = list(StratifiedKFold(n_splits=10, shuffle=True,
                             random_state=config.RANDOM_SEED).split(train_x, train_y))

    fold_index = []
    for i,(train_id, valid_id) in enumerate(folds):
        fold_index.append(valid_id)

    print("fold num: %d" % (len(fold_index)))

    fold_index = np.array(fold_index)
    np.save(config.DATA_PATH +  "fold_index.npy", fold_index)

    save_x_y(fold_index, train_x, train_y)
    print("save train_x_y done!")

    fold_index = np.load(config.DATA_PATH +  "fold_index.npy")
    save_i(fold_index)
    print("save index done!") 
Example #9
Source File: tests.py    From scikit-mdr with MIT License
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    features = np.array([[2,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [1,    1],
                         [1,    1]])

    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True))
    assert np.mean(cv_scores) > 0. 
Example #10
Source File: 2_AssignCVFolds.py    From kaggle-rsna18 with MIT License
def assign_folds(orig_df, num_folds, val_frac=0.10, seed=88):
    # Stratified splits
    np.random.seed(seed) 
    df = orig_df.copy() 
    df["fold"] = None  
    skf = StratifiedKFold(n_splits=num_folds, random_state=0, shuffle=True) 
    fold_counter = 0 
    for train_index, test_index in skf.split(df.patientId, df.combined_cat):
        df.iloc[test_index, df.columns.get_loc("fold")] = fold_counter  # positional write without chained indexing
        fold_counter += 1 
    # for each_fold in np.unique(df.fold): 
    #     train_df = df[df.fold != each_fold] 
    #     val_counter = 0
    #     train_df["val{}".format(each_fold)] = None 
    #     for train_index, test_index in skf.split(train_df.patientId, train_df.combined_cat): 
    #         train_df["val{}".format(each_fold)].iloc[test_index] = val_counter
    #         val_counter += 1
    #     df = df.merge(train_df[["patientId", "val{}".format(each_fold)]], on="patientId", how="left")
    return df

##########
# SCRIPT #
########## 
Example #11
Source File: filter_datasets.py    From pysaliency with MIT License
def _get_stratified_crossval_split(stimuli, fixations, split_count, included_splits, random=True, stratified_attributes=None):
    from sklearn.model_selection import StratifiedKFold
    labels = []
    for attribute_name in stratified_attributes:
        attribute_data = np.array(stimuli.attributes[attribute_name])
        if attribute_data.ndim == 1:
            attribute_data = attribute_data[:, np.newaxis]
        labels.append(attribute_data)
    labels = np.vstack(labels)
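    # StratifiedKFold splits on the labels alone, so a dummy one-column X suffices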
    X = np.ones((len(stimuli), 1))

    rst = np.random.RandomState(42)

    inds = []
    k_fold = StratifiedKFold(n_splits=split_count, shuffle=random, random_state=rst)
    for i, (train_index, test_index) in enumerate(k_fold.split(X, labels)):
        if i in included_splits:
            inds.extend(test_index)

    stimuli, fixations = create_subset(stimuli, fixations, inds)
    return stimuli, fixations 
Example #12
Source File: mvpa_voxelselector.py    From brainiak with Apache License 2.0
def _sfn(data, mask, myrad, bcast_var):
    """Score classifier on searchlight data using cross-validation.

    The classifier is in `bcast_var[2]`. The labels are in `bcast_var[0]`. The
    number of cross-validation folds is in `bcast_var[1]`.
    """
    clf = bcast_var[2]
    masked_data = data[0][mask, :].T
    # print(l[0].shape, mask.shape, data.shape)
    skf = model_selection.StratifiedKFold(n_splits=bcast_var[1],
                                          shuffle=False)
    accuracy = np.mean(model_selection.cross_val_score(clf, masked_data,
                                                       y=bcast_var[0],
                                                       cv=skf,
                                                       n_jobs=1))
    return accuracy 
Example #13
Source File: inbreast.py    From deep-mil-for-whole-mammogram-classification with MIT License
def cvsplit(fold, totalfold, mydict):
  '''Get the train/test split for one fold.
  fold is the index of the fold to return, from 0 to totalfold-1;
  totalfold is the number of folds in the cross validation;
  mydict is the dict returned by readlabel.'''
  skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is False, which is fine here
  #readdicom(mydict)
  y = list(mydict.values())  # wrap in list() for Python 3, where values()/keys() are views
  x = list(mydict.keys())
  count = 0
  for train, test in skf.split(x,y):
    print(len(train), len(test))
    if count == fold:
      #print test
      return train, test
    count += 1 
Example #14
Source File: classification.py    From brainiak with Apache License 2.0
def example_of_cross_validation_using_model_selection(raw_data, labels, num_subjects, num_epochs_per_subj):
    # NOTE: this method does not work for sklearn.svm.SVC with precomputed kernel
    # when the kernel matrix is computed in portions; also, this method only works
    # for self-correlation, i.e. correlation between the same data matrix.

    # no shrinking, set C=1
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1, gamma='auto')
    #logit_clf = LogisticRegression()
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    # doing leave-one-subject-out cross validation
    # no shuffling in cv
    skf = model_selection.StratifiedKFold(n_splits=num_subjects,
                                          shuffle=False)
    scores = model_selection.cross_val_score(clf, list(zip(raw_data, raw_data)),
                                             y=labels,
                                             cv=skf)
    print(scores)
    logger.info(
        'the overall cross validation accuracy is %.2f' %
        np.mean(scores)
    ) 
Example #15
Source File: eval_train_test.py    From fanci with GNU General Public License v3.0
def kfold_cv(clf_type, data_sets: [DataSet], fold_count=5, repetitions=5, n_jobs=-1, parallel_verbose=1, persist=True):
    """
    Do a k-fold cross validation with the given classifier type.
    :param clf_type: type of the classifiers to be evaluated
    :param data_sets: list of data sets
    :param fold_count: number of folds to be made, and hence also runs per repetition
    :param repetitions: how often the cross validation is repeated per data set
    :return: a Statistics object
    """
    log.info('Starting {!s}-fold cv. Set count: {!s}'.format(fold_count, len(data_sets)))
    parallel = Parallel(n_jobs=n_jobs, verbose=parallel_verbose)

    skf = StratifiedKFold(n_splits=fold_count, shuffle=True)
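    # one _fit_and_score task per (data set, repetition, fold) combination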
    stats_list = parallel(delayed(_fit_and_score)(clf, domains, labels, train_index, test_index, i, data_set_id, fold_count)
                          for domains, labels, data_set_id, clf in _data_sets_generator(data_sets, clf_type)
                          for i in range(repetitions)
                          for train_index, test_index in skf.split(domains, labels)
                          )
    where = settings.EVAL_FOLDER + '/' + '{!s}fold_cv_{!s}_{!s}rep_{!s}sets_{!s}.pkl'.format(fold_count, clf_type, repetitions, len(data_sets),
                                                                                                settings.NOW_STR)
    return _serialize_cv_results(stats_list, persist, where) 
Example #16
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
                 PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e) 
Example #17
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_stratified_kfold_ratios():
    # Check that stratified kfold preserves class ratios in individual splits
    # Repeat with shuffling turned off and on
    n_samples = 1000
    X = np.ones(n_samples)
    y = np.array([4] * int(0.10 * n_samples) +
                 [0] * int(0.89 * n_samples) +
                 [1] * int(0.01 * n_samples))

    for shuffle in (False, True):
        for train, test in StratifiedKFold(5, shuffle=shuffle).split(X, y):
            assert_almost_equal(np.sum(y[train] == 4) / len(train), 0.10, 2)
            assert_almost_equal(np.sum(y[train] == 0) / len(train), 0.89, 2)
            assert_almost_equal(np.sum(y[train] == 1) / len(train), 0.01, 2)
            assert_almost_equal(np.sum(y[test] == 4) / len(test), 0.10, 2)
            assert_almost_equal(np.sum(y[test] == 0) / len(test), 0.89, 2)
            assert_almost_equal(np.sum(y[test] == 1) / len(test), 0.01, 2) 
Example #18
Source File: clf_helpers.py    From ibeis with Apache License 2.0
def setup(pblm):
        import sklearn.datasets
        iris = sklearn.datasets.load_iris()

        pblm.primary_task_key = 'iris'
        pblm.default_data_key = 'learn(all)'
        pblm.default_clf_key = 'RF'

        X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
        samples = MultiTaskSamples(X_df.index)
        samples.apply_indicators(
            {'iris': {name: iris.target == idx
                      for idx, name in enumerate(iris.target_names)}})
        samples.X_dict = {'learn(all)': X_df}

        pblm.samples = samples
        pblm.xval_kw['type'] = 'StratifiedKFold' 
Example #19
Source File: test_split.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_shuffle_stratifiedkfold():
    # Check that shuffling is happening when requested, and for proper
    # sample coverage
    X_40 = np.ones(40)
    y = [0] * 20 + [1] * 20
    kf0 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf1 = StratifiedKFold(5, shuffle=True, random_state=1)
    for (_, test0), (_, test1) in zip(kf0.split(X_40, y),
                                      kf1.split(X_40, y)):
        assert_not_equal(set(test0), set(test1))
    check_cv_coverage(kf0, X_40, y, groups=None, expected_n_splits=5)

    # Ensure that we shuffle each class's samples with different
    # random_state in StratifiedKFold
    # See https://github.com/scikit-learn/scikit-learn/pull/13124
    X = np.arange(10)
    y = [0] * 5 + [1] * 5
    kf1 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf2 = StratifiedKFold(5, shuffle=True, random_state=1)
    test_set1 = sorted([tuple(s[1]) for s in kf1.split(X, y)])
    test_set2 = sorted([tuple(s[1]) for s in kf2.split(X, y)])
    assert test_set1 != test_set2 
Example #20
Source File: clf_helpers.py    From ibeis with Apache License 2.0
def stratified_kfold_indices(samples, **xval_kw):
        """
        TODO: check xval label frequency
        """
        from sklearn import model_selection

        X = np.empty((len(samples), 0))
        y = samples.encoded_1d().values
        groups = samples.group_ids

        type_ = xval_kw.pop('type', 'StratifiedGroupKFold')
        if type_ == 'StratifiedGroupKFold':
            assert groups is not None
            # FIXME: The StratifiedGroupKFold could be implemented better.
            splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw)
            skf_list = list(splitter.split(X=X, y=y, groups=groups))
        elif type_ == 'StratifiedKFold':
            splitter = model_selection.StratifiedKFold(**xval_kw)
            skf_list = list(splitter.split(X=X, y=y))
        return skf_list 
Example #21
Source File: test_validation.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_cross_val_predict_unbalanced():
    X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                               n_informative=2, n_clusters_per_class=1,
                               random_state=1)
    # Change the first sample to a new class
    y[0] = 2
    clf = LogisticRegression(random_state=1)
    cv = StratifiedKFold(n_splits=2, random_state=1)
    train, test = list(cv.split(X, y))
    yhat_proba = cross_val_predict(clf, X, y, cv=cv, method="predict_proba")
    assert y[test[0]][0] == 2  # sanity check for further assertions
    assert np.all(yhat_proba[test[0]][:, 2] == 0)
    assert np.all(yhat_proba[test[0]][:, 0:1] > 0)
    assert np.all(yhat_proba[test[1]] > 0)
    assert_array_almost_equal(yhat_proba.sum(axis=1), np.ones(y.shape),
                              decimal=12) 
Example #22
Source File: stacking.py    From xam with MIT License
def __init__(self, models, meta_model, cv=model_selection.StratifiedKFold(n_splits=3),
                 metric=metrics.roc_auc_score, use_base_features=False, use_probas=True):
        super().__init__(
            models=models,
            meta_model=meta_model,
            cv=cv,
            metric=metric,
            use_base_features=use_base_features,
            use_probas=use_probas,
        ) 
Example #23
Source File: test_stacker.py    From xcessiv with Apache License 2.0
def setUp(self):
        bl1 = RandomForestClassifier(random_state=8)
        bl2 = LogisticRegression()
        bl3 = RandomForestClassifier(max_depth=10, random_state=10)

        meta_est = LogisticRegression()

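        # note: the bound .split method itself, not the splitter object, is handed to the ensemble below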
        skf = StratifiedKFold(random_state=8).split

        self.stacked_ensemble = stacker.XcessivStackedEnsemble(
            [bl1, bl2, bl3],
            ['predict', 'predict_proba', 'predict_proba'],
            meta_est,
            skf
        ) 
Example #24
Source File: test_logistic.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_LogisticRegressionCV_GridSearchCV_elastic_net_ovr():
    # make sure LogisticRegressionCV gives same best params (l1 and C) as
    # GridSearchCV when penalty is elasticnet and multiclass is ovr. We can't
    # compare best_params like in the previous test because
    # LogisticRegressionCV with multi_class='ovr' will have one C and one
    # l1_param for each class, while LogisticRegression will share the
    # parameters over the *n_classes* classifiers.

    X, y = make_classification(n_samples=200, n_classes=3, n_informative=3,
                               random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    cv = StratifiedKFold(5, random_state=0)

    l1_ratios = np.linspace(0, 1, 5)
    Cs = np.logspace(-4, 4, 5)

    lrcv = LogisticRegressionCV(penalty='elasticnet', Cs=Cs, solver='saga',
                                cv=cv, l1_ratios=l1_ratios, random_state=0,
                                multi_class='ovr')
    lrcv.fit(X_train, y_train)

    param_grid = {'C': Cs, 'l1_ratio': l1_ratios}
    lr = LogisticRegression(penalty='elasticnet', solver='saga',
                            random_state=0, multi_class='ovr')
    gs = GridSearchCV(lr, param_grid, cv=cv, iid=False)
    gs.fit(X_train, y_train)

    # Check that predictions are 80% the same
    assert (lrcv.predict(X_train) == gs.predict(X_train)).mean() >= .8
    assert (lrcv.predict(X_test) == gs.predict(X_test)).mean() >= .8 
Example #25
Source File: test_logistic.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_LogisticRegressionCV_GridSearchCV_elastic_net(multi_class):
    # make sure LogisticRegressionCV gives same best params (l1 and C) as
    # GridSearchCV when penalty is elasticnet

    if multi_class == 'ovr':
        # This is actually binary classification, ovr multiclass is treated in
        # test_LogisticRegressionCV_GridSearchCV_elastic_net_ovr
        X, y = make_classification(random_state=0)
    else:
        X, y = make_classification(n_samples=200, n_classes=3, n_informative=3,
                                   random_state=0)

    cv = StratifiedKFold(5, random_state=0)

    l1_ratios = np.linspace(0, 1, 5)
    Cs = np.logspace(-4, 4, 5)

    lrcv = LogisticRegressionCV(penalty='elasticnet', Cs=Cs, solver='saga',
                                cv=cv, l1_ratios=l1_ratios, random_state=0,
                                multi_class=multi_class)
    lrcv.fit(X, y)

    param_grid = {'C': Cs, 'l1_ratio': l1_ratios}
    lr = LogisticRegression(penalty='elasticnet', solver='saga',
                            random_state=0, multi_class=multi_class)
    gs = GridSearchCV(lr, param_grid, cv=cv)
    gs.fit(X, y)

    assert gs.best_params_['l1_ratio'] == lrcv.l1_ratio_[0]
    assert gs.best_params_['C'] == lrcv.C_[0] 
Example #26
Source File: Stacking.py    From MachineLearning with Apache License 2.0
def train(self, train_data, train_label):
        if self.norm_type == "Standardization":
            train_data = preProcess.Standardization(train_data)
        else:
            train_data = preProcess.Normalization(train_data)

        skf = StratifiedKFold(self.n_folds)
        prediction_feature = np.zeros((train_data.shape[0], len(self.classifier_set)))
        trained_model = []

        # the first layer in Stacking
        for j, clf in enumerate(self.classifier_set):
            # train each submodel
            subtrained_model = []
            # cross validation
            for (train_index, test_index) in skf.split(train_data, train_label):
                X_train, X_test = train_data[train_index], train_data[test_index]
                y_train, y_test = train_label[train_index], train_label[test_index]
                # train on S \ s_i, i.e. every fold except the held-out one
                clf.train(X_train, y_train)
                # deep-copy the fitted classifier so each fold keeps its own model
                # rather than K references to the last-trained one (needs `import copy`)
                subtrained_model.append(copy.deepcopy(clf))
                # get the prediction feature for each sub model
                prediction_feature[test_index, j] = clf.predict(X_test)[:, 0]
            # save the models
            trained_model.append(subtrained_model)

        self.trained_classifier_set = trained_model
        return self 
Example #27
Source File: clf_helpers.py    From ibeis with Apache License 2.0
def subsplit_indices(samples, subset_idx, **xval_kw):
        """ split an existing set """
        from sklearn import model_selection

        X = np.empty((len(subset_idx), 0))
        y = samples.encoded_1d().values[subset_idx]
        groups = samples.group_ids[subset_idx]

        xval_kw_ = xval_kw.copy()
        if 'n_splits' not in xval_kw_:
            xval_kw_['n_splits'] = 3
        type_ = xval_kw_.pop('type', 'StratifiedGroupKFold')
        if type_ == 'StratifiedGroupKFold':
            assert groups is not None
            # FIXME: The StratifiedGroupKFold could be implemented better.
            splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw_)
            rel_skf_list = list(splitter.split(X=X, y=y, groups=groups))
        elif type_ == 'StratifiedKFold':
            splitter = model_selection.StratifiedKFold(**xval_kw_)
            rel_skf_list = list(splitter.split(X=X, y=y))

        # map back into original coords
        skf_list = [(subset_idx[rel_idx1], subset_idx[rel_idx2])
                    for rel_idx1, rel_idx2 in rel_skf_list]

        for idx1, idx2 in skf_list:
            assert len(np.intersect1d(subset_idx, idx1)) == len(idx1)
            assert len(np.intersect1d(subset_idx, idx2)) == len(idx2)
        return skf_list 
Example #28
Source File: utils.py    From ICIAR2018 with MIT License
def make_folds():
    """Creates stratified splits based on train directory listing

    # Dumps
        folds: list of split dicts {
                   "train": {"x": train files list,
                             "y": train labels},
                   "test":  {"x": test files list,
                             "y": test labels}}
    """
    files = np.array([basename(f) for f in glob.glob("data/preprocessed/train/ResNet-0.5-400/*.npy")])
    labels = []
    classes = np.array([0, 1, 2, 3])
    for f in files:
        lb = np.array([f.startswith("n"),
                       f.startswith("b"),
                       f.startswith("is"),
                       f.startswith("iv")])
        labels.append(classes[np.argmax(lb)])
    labels = np.array(labels)

    folds = []
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    for train_index, test_index in skf.split(files, labels):
        f_train, f_test = files[train_index], files[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        folds.append({"train": {"x": f_train, "y": y_train}, "test": {"x": f_test, "y": y_test}})

    with open("data/folds-10.pkl", "wb") as f:
        pickle.dump(folds, f) 
Example #29
Source File: linear_svm.py    From driverlessai-recipes with Apache License 2.0
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        X = dt.Frame(X)

        orig_cols = list(X.names)

        if self.num_classes >= 2:
            mod = linsvc(random_state=self.random_state, C=self.params["C"], penalty=self.params["penalty"],
                         loss=self.params["loss"], dual=self.params["dual"])
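            # CalibratedClassifierCV wraps the linear SVM to provide predict_proba,
            # fitting and calibrating on the stratified folds internally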
            kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=self.random_state)
            model = CalibratedClassifierCV(base_estimator=mod, method='isotonic', cv=kf)
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
        else:
            model = LinearSVR(epsilon=self.params["epsilon"], C=self.params["C"], loss=self.params["loss"],
                              dual=self.params["dual"], random_state=self.random_state)
        self.means = dict()
        self.standard_scaler = StandardScaler()
        for col in X.names:
            XX = X[:, col]
            self.means[col] = XX.mean1()
            if self.means[col] is None:
                self.means[col] = 0
            XX.replace(None, self.means[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()
        X = self.standard_scaler.fit_transform(X)
        model.fit(X, y, sample_weight=sample_weight)
        importances = np.array([0.0 for k in range(len(orig_cols))])
        if self.num_classes >= 2:
            for classifier in model.calibrated_classifiers_:
                importances += np.array(abs(classifier.base_estimator.get_coeff()))
        else:
            importances += np.array(abs(model.coef_[0]))

        self.set_model_properties(model=model,
                                  features=orig_cols,
                                  importances=importances.tolist(),  # abs(model.coef_[0])
                                  iterations=0)