Python sklearn.model_selection.StratifiedKFold() Examples

The following are 29 code examples showing how to use sklearn.model_selection.StratifiedKFold(). They are extracted from open source projects; each example lists its original project, author, source file, and license.

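Before the project-specific examples, here is a minimal, self-contained sketch of the basic API (not taken from any of the projects below). StratifiedKFold(n_splits=k) yields (train_index, test_index) pairs whose per-fold class proportions approximately match those of the full label array:

import numpy as np
from sklearn.model_selection import StratifiedKFold

# Toy data: 12 samples with imbalanced binary labels (8 zeros, 4 ones).
X = np.arange(24).reshape(12, 2)
y = np.array([0] * 8 + [1] * 4)

skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Every test fold keeps the 2:1 class ratio of y (2 zeros, 1 one).
    print(fold, np.bincount(y[test_index]))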

Example 1
Project: pytorch_geometric   Author: rusty1s   File: train_eval.py    License: MIT License
def k_fold(dataset, folds):
    skf = StratifiedKFold(folds, shuffle=True, random_state=12345)

    test_indices, train_indices = [], []
    for _, idx in skf.split(torch.zeros(len(dataset)), dataset.data.y):
        test_indices.append(torch.from_numpy(idx).to(torch.long))

    # Each fold's validation split is the previous fold's test split.
    val_indices = [test_indices[i - 1] for i in range(folds)]

    for i in range(folds):
        # Train on everything outside this fold's test and validation splits.
        train_mask = torch.ones(len(dataset), dtype=torch.bool)
        train_mask[test_indices[i]] = 0
        train_mask[val_indices[i]] = 0
        train_indices.append(train_mask.nonzero().view(-1))

    return train_indices, test_indices, val_indices 
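A sketch of how the three index lists returned above might be consumed, iterating the folds in lockstep (a usage pattern assumed from the returned (train, test, val) structure, not quoted from the project):

for fold, (train_idx, test_idx, val_idx) in enumerate(
        zip(*k_fold(dataset, folds=10))):
    train_dataset = dataset[train_idx]
    test_dataset = dataset[test_idx]
    val_dataset = dataset[val_idx]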
Example 2
Project: dgl   Author: dmlc   File: dataloader.py    License: Apache License 2.0
def _split_fold10(self, labels, fold_idx=0, seed=0, shuffle=True):
        ''' 10 fold '''
        assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."

        skf = StratifiedKFold(n_splits=10, shuffle=shuffle, random_state=seed)
        idx_list = []
        for idx in skf.split(np.zeros(len(labels)), labels):    # split(x, y)
            idx_list.append(idx)
        train_idx, valid_idx = idx_list[fold_idx]

        print("train_set : test_set = %d : %d" %
              (len(train_idx), len(valid_idx)))

        return train_idx, valid_idx
Example 3
Project: heamy   Author: rushter   File: dataset.py    License: MIT License
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
        """K-Folds cross validation iterator.

        Parameters
        ----------
        k : int, default 5
        stratify : bool, default False
        shuffle : bool, default True
        seed : int, default 33

        Yields
        -------
        X_train, y_train, X_test, y_test, train_index, test_index
        """
        if stratify:
            kf = StratifiedKFold(n_splits=k, random_state=seed, shuffle=shuffle)
        else:
            kf = KFold(n_splits=k, random_state=seed, shuffle=shuffle)

        for train_index, test_index in kf.split(self.X_train, self.y_train):
            X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
            X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
            yield X_train, y_train, X_test, y_test, train_index, test_index 
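A hypothetical usage sketch of this generator, assuming heamy's Dataset can be built from in-memory arrays (the constructor arguments shown here are assumptions, not quoted from the project):

from heamy.dataset import Dataset
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, random_state=0)
dataset = Dataset(X_train=X, y_train=y)
# stratify=True switches the iterator from KFold to StratifiedKFold.
for X_tr, y_tr, X_te, y_te, tr_idx, te_idx in dataset.kfold(k=5, stratify=True):
    print(len(tr_idx), len(te_idx))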
Example 4
Project: deep-mil-for-whole-mammogram-classification   Author: wentaozhu   File: inbreast.py    License: MIT License
def cvsplit(fold, totalfold, mydict):
  '''get the split of train and test
  fold is the index of the returned fold, from 0 to totalfold-1
  totalfold is the number of folds used for the cross validation
  mydict is the dict returned by readlabel'''
  skf = StratifiedKFold(n_splits=totalfold)  # default shuffle is False, okay!
  #readdicom(mydict)
  y = list(mydict.values())  # wrap in list() for Python 3, where these are views
  x = list(mydict.keys())
  count = 0
  for train, test in skf.split(x, y):
    print(len(train), len(test))
    if count == fold:
      #print(test)
      return train, test
    count += 1
Example 5
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
                 PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e) 
Example 6
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License
def test_stratified_kfold_ratios():
    # Check that stratified kfold preserves class ratios in individual splits
    # Repeat with shuffling turned off and on
    n_samples = 1000
    X = np.ones(n_samples)
    y = np.array([4] * int(0.10 * n_samples) +
                 [0] * int(0.89 * n_samples) +
                 [1] * int(0.01 * n_samples))

    for shuffle in (False, True):
        for train, test in StratifiedKFold(5, shuffle=shuffle).split(X, y):
            assert_almost_equal(np.sum(y[train] == 4) / len(train), 0.10, 2)
            assert_almost_equal(np.sum(y[train] == 0) / len(train), 0.89, 2)
            assert_almost_equal(np.sum(y[train] == 1) / len(train), 0.01, 2)
            assert_almost_equal(np.sum(y[test] == 4) / len(test), 0.10, 2)
            assert_almost_equal(np.sum(y[test] == 0) / len(test), 0.89, 2)
            assert_almost_equal(np.sum(y[test] == 1) / len(test), 0.01, 2) 
Example 7
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License
def test_shuffle_stratifiedkfold():
    # Check that shuffling is happening when requested, and for proper
    # sample coverage
    X_40 = np.ones(40)
    y = [0] * 20 + [1] * 20
    kf0 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf1 = StratifiedKFold(5, shuffle=True, random_state=1)
    for (_, test0), (_, test1) in zip(kf0.split(X_40, y),
                                      kf1.split(X_40, y)):
        assert_not_equal(set(test0), set(test1))
    check_cv_coverage(kf0, X_40, y, groups=None, expected_n_splits=5)

    # Ensure that we shuffle each class's samples with different
    # random_state in StratifiedKFold
    # See https://github.com/scikit-learn/scikit-learn/pull/13124
    X = np.arange(10)
    y = [0] * 5 + [1] * 5
    kf1 = StratifiedKFold(5, shuffle=True, random_state=0)
    kf2 = StratifiedKFold(5, shuffle=True, random_state=1)
    test_set1 = sorted([tuple(s[1]) for s in kf1.split(X, y)])
    test_set2 = sorted([tuple(s[1]) for s in kf2.split(X, y)])
    assert test_set1 != test_set2 
Example 8
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_validation.py    License: MIT License
def test_cross_val_predict_unbalanced():
    X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                               n_informative=2, n_clusters_per_class=1,
                               random_state=1)
    # Change the first sample to a new class
    y[0] = 2
    clf = LogisticRegression(random_state=1)
    cv = StratifiedKFold(n_splits=2, random_state=1)
    train, test = list(cv.split(X, y))
    yhat_proba = cross_val_predict(clf, X, y, cv=cv, method="predict_proba")
    assert y[test[0]][0] == 2  # sanity check for further assertions
    assert np.all(yhat_proba[test[0]][:, 2] == 0)
    assert np.all(yhat_proba[test[0]][:, 0:1] > 0)
    assert np.all(yhat_proba[test[1]] > 0)
    assert_array_almost_equal(yhat_proba.sum(axis=1), np.ones(y.shape),
                              decimal=12) 
Example 9
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_search.py    License: MIT License
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y) 
Example 10
Project: ibeis   Author: Erotemic   File: clf_helpers.py    License: Apache License 2.0
def stratified_kfold_indices(samples, **xval_kw):
        """
        TODO: check xval label frequency


        """
        from sklearn import model_selection

        X = np.empty((len(samples), 0))
        y = samples.encoded_1d().values
        groups = samples.group_ids

        type_ = xval_kw.pop('type', 'StratifiedGroupKFold')
        if type_ == 'StratifiedGroupKFold':
            assert groups is not None
            # FIXME: The StratifiedGroupKFold could be implemented better.
            splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw)
            skf_list = list(splitter.split(X=X, y=y, groups=groups))
        elif type_ == 'StratifiedKFold':
            splitter = model_selection.StratifiedKFold(**xval_kw)
            skf_list = list(splitter.split(X=X, y=y))
        return skf_list 
Example 11
Project: ibeis   Author: Erotemic   File: clf_helpers.py    License: Apache License 2.0
def setup(pblm):
        import sklearn.datasets
        iris = sklearn.datasets.load_iris()

        pblm.primary_task_key = 'iris'
        pblm.default_data_key = 'learn(all)'
        pblm.default_clf_key = 'RF'

        X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
        samples = MultiTaskSamples(X_df.index)
        samples.apply_indicators(
            {'iris': {name: iris.target == idx
                      for idx, name in enumerate(iris.target_names)}})
        samples.X_dict = {'learn(all)': X_df}

        pblm.samples = samples
        pblm.xval_kw['type'] = 'StratifiedKFold' 
Example 12
Project: fanci   Author: fanci-dga-detection   File: eval_train_test.py    License: GNU General Public License v3.0
def kfold_cv(clf_type, data_sets: [DataSet], fold_count=5, repetitions=5, n_jobs=-1, parallel_verbose=1, persist=True):
    """
    Do a kfold cross validation with a SVM classifier.
    :param data_sets: list of data sets
    :param fold_count: count of folds to be made and hence also runs
    :return: a Statistics object
    """
    log.info('Starting {!s}-fold cv. Set count: {!s}'.format(fold_count, len(data_sets)))
    parallel = Parallel(n_jobs=n_jobs, verbose=parallel_verbose)

    skf = StratifiedKFold(n_splits=fold_count, shuffle=True)
    stats_list = parallel(delayed(_fit_and_score)(clf, domains, labels, train_index, test_index, i, data_set_id, fold_count)
                          for domains, labels, data_set_id, clf in _data_sets_generator(data_sets, clf_type)
                          for i in range(repetitions)
                          for train_index, test_index in skf.split(domains, labels)
                          )
    where = settings.EVAL_FOLDER + '/' + '{!s}fold_cv_{!s}_{!s}rep_{!s}sets_{!s}.pkl'.format(fold_count, clf_type, repetitions, len(data_sets),
                                                                                                settings.NOW_STR)
    return _serialize_cv_results(stats_list, persist, where) 
Example 13
Project: brainiak   Author: brainiak   File: classification.py    License: Apache License 2.0
def example_of_cross_validation_using_model_selection(raw_data, labels, num_subjects, num_epochs_per_subj):
    # NOTE: this method does not work for sklearn.svm.SVC with precomputed kernel
    # when the kernel matrix is computed in portions; also, this method only works
    # for self-correlation, i.e. correlation between the same data matrix.

    # no shrinking, set C=1
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1, gamma='auto')
    #logit_clf = LogisticRegression()
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    # doing leave-one-subject-out cross validation
    # no shuffling in cv
    skf = model_selection.StratifiedKFold(n_splits=num_subjects,
                                          shuffle=False)
    scores = model_selection.cross_val_score(clf, list(zip(raw_data, raw_data)),
                                             y=labels,
                                             cv=skf)
    print(scores)
    logger.info(
        'the overall cross validation accuracy is %.2f' %
        np.mean(scores)
    ) 
Example 14
Project: brainiak   Author: brainiak   File: mvpa_voxelselector.py    License: Apache License 2.0
def _sfn(data, mask, myrad, bcast_var):
    """Score classifier on searchlight data using cross-validation.

    The classifier is in `bcast_var[2]`. The labels are in `bcast_var[0]`. The
    number of cross-validation folds is in `bcast_var[1]`.
    """
    clf = bcast_var[2]
    masked_data = data[0][mask, :].T
    # print(l[0].shape, mask.shape, data.shape)
    skf = model_selection.StratifiedKFold(n_splits=bcast_var[1],
                                          shuffle=False)
    accuracy = np.mean(model_selection.cross_val_score(clf, masked_data,
                                                       y=bcast_var[0],
                                                       cv=skf,
                                                       n_jobs=1))
    return accuracy 
Example 15
Project: pysaliency   Author: matthias-k   File: filter_datasets.py    License: MIT License
def _get_stratified_crossval_split(stimuli, fixations, split_count, included_splits, random=True, stratified_attributes=None):
    from sklearn.model_selection import StratifiedKFold
    labels = []
    for attribute_name in stratified_attributes:
        attribute_data = np.array(stimuli.attributes[attribute_name])
        if attribute_data.ndim == 1:
            attribute_data = attribute_data[:, np.newaxis]
        labels.append(attribute_data)
    labels = np.vstack(labels)
    X = np.ones((len(stimuli), 1))

    rst = np.random.RandomState(42)

    inds = []
    k_fold = StratifiedKFold(n_splits=split_count, shuffle=random, random_state=rst)
    for i, (train_index, test_index) in enumerate(k_fold.split(X, labels)):
        if i in included_splits:
            inds.extend(test_index)

    stimuli, fixations = create_subset(stimuli, fixations, inds)
    return stimuli, fixations 
Example 16
Project: kaggle-rsna18   Author: i-pan   File: 2_AssignCVFolds.py    License: MIT License
def assign_folds(orig_df, num_folds, val_frac=0.10, seed=88):
    # Stratified splits
    np.random.seed(seed) 
    df = orig_df.copy() 
    df["fold"] = None  
    skf = StratifiedKFold(n_splits=num_folds, random_state=0, shuffle=True) 
    fold_counter = 0 
    for train_index, test_index in skf.split(df.patientId, df.combined_cat):
        df["fold"].iloc[test_index] = fold_counter
        fold_counter += 1 
    # for each_fold in np.unique(df.fold): 
    #     train_df = df[df.fold != each_fold] 
    #     val_counter = 0
    #     train_df["val{}".format(each_fold)] = None 
    #     for train_index, test_index in skf.split(train_df.patientId, train_df.combined_cat): 
    #         train_df["val{}".format(each_fold)].iloc[test_index] = val_counter
    #         val_counter += 1
    #     df = df.merge(train_df[["patientId", "val{}".format(each_fold)]], on="patientId", how="left")
    return df

##########
# SCRIPT #
########## 
Example 17
Project: AutoInt   Author: shichence   File: stratifiedKfold.py    License: MIT License
def main():

    train_x, train_y = _load_data()
    print('loading data done!')

    folds = list(StratifiedKFold(n_splits=10, shuffle=True,
                             random_state=config.RANDOM_SEED).split(train_x, train_y))

    fold_index = []
    for i,(train_id, valid_id) in enumerate(folds):
        fold_index.append(valid_id)

    print("fold num: %d" % (len(fold_index)))

    fold_index = np.array(fold_index)
    np.save(config.DATA_PATH +  "fold_index.npy", fold_index)

    save_x_y(fold_index, train_x, train_y)
    print("save train_x_y done!")

    fold_index = np.load(config.DATA_PATH +  "fold_index.npy")
    save_i(fold_index)
    print("save index done!") 
Example 18
Project: rasa_nlu   Author: weizhenzhao   File: test.py    License: Apache License 2.0
def generate_folds(n, td):
    """Generates n cross validation folds for training data td."""

    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=n, shuffle=True)
    x = td.intent_examples
    y = [example.get("intent") for example in x]
    for i_fold, (train_index, test_index) in enumerate(skf.split(x, y)):
        logger.debug("Fold: {}".format(i_fold))
        train = [x[i] for i in train_index]
        test = [x[i] for i in test_index]
        yield (TrainingData(training_examples=train,
                            entity_synonyms=td.entity_synonyms,
                            regex_features=td.regex_features),
               TrainingData(training_examples=test,
                            entity_synonyms=td.entity_synonyms,
                            regex_features=td.regex_features)) 
Example 19
Project: gentun   Author: gmontamat   File: keras_models.py    License: Apache License 2.0
def cross_validate(self):
        """Train model using k-fold cross validation and
        return mean value of the validation accuracy.
        """
        acc = .0
        kfold = StratifiedKFold(n_splits=self.kfold, shuffle=True)
        for fold, (train, validation) in enumerate(kfold.split(self.x_train, np.where(self.y_train == 1)[1])):
            print("KFold {}/{}".format(fold + 1, self.kfold))
            self.reset_weights()
            for epochs, learning_rate in zip(self.epochs, self.learning_rate):
                print("Training {} epochs with learning rate {}".format(epochs, learning_rate))
                self.model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
                self.model.fit(
                    self.x_train[train], self.y_train[train], epochs=epochs, batch_size=self.batch_size, verbose=1
                )
            acc += self.model.evaluate(self.x_train[validation], self.y_train[validation], verbose=0)[1] / self.kfold
        return acc 
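The split above stratifies on np.where(self.y_train == 1)[1], which recovers integer class labels from one-hot encoded targets; a minimal illustration of that conversion (toy data, not from the gentun source):

import numpy as np

y_onehot = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
# For strictly one-hot rows this equals np.argmax(y_onehot, axis=1).
labels = np.where(y_onehot == 1)[1]  # array([0, 1, 1, 0])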
Example 20
Project: RecommenderSystems   Author: DeepGraphLearning   File: stratifiedKfold.py    License: MIT License
def main():

    train_x, train_y = _load_data()
    print('loading data done!')

    folds = list(StratifiedKFold(n_splits=10, shuffle=True,
                             random_state=config.RANDOM_SEED).split(train_x, train_y))

    fold_index = []
    for i,(train_id, valid_id) in enumerate(folds):
        fold_index.append(valid_id)

    print("fold num: %d" % (len(fold_index)))

    fold_index = np.array(fold_index)
    np.save(config.DATA_PATH +  "fold_index.npy", fold_index)

    save_x_y(fold_index, train_x, train_y)
    print("save train_x_y done!")

    fold_index = np.load(config.DATA_PATH +  "fold_index.npy")
    save_i(fold_index)
    print("save index done!") 
Example 21
Project: scikit-mdr   Author: EpistasisLab   File: tests.py    License: MIT License
def test_mdr_sklearn_pipeline():
    """Ensure that MDR can be used as a transformer in a scikit-learn pipeline"""
    features = np.array([[2,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    1],
                         [0,    0],
                         [0,    0],
                         [0,    0],
                         [1,    1],
                         [1,    1]])

    classes = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    clf = make_pipeline(MDR(), LogisticRegression())
    cv_scores = cross_val_score(clf, features, classes, cv=StratifiedKFold(n_splits=5, shuffle=True))
    assert np.mean(cv_scores) > 0. 
Example 22
Project: xcessiv   Author: reiinakano   File: functions.py    License: Apache License 2.0
def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties of the dataset
            used to verify the estimator and metric generators.

    Returns:
        X (array-like): Features array

        y (array-like): Labels array

        splits (iterator): This is an iterator that returns train test splits for
            cross-validation purposes on ``X`` and ``y``.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'boston':
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits 
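A hypothetical call hitting the 'iris' branch above; the returned splits iterator yields (train_index, test_index) pairs from the 2-fold StratifiedKFold:

X, y, splits = get_sample_dataset({'type': 'iris'})
for train_index, test_index in splits:
    print(len(train_index), len(test_index))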
Example 23
Project: xcessiv   Author: reiinakano   File: test_stacker.py    License: Apache License 2.0
def setUp(self):
        bl1 = RandomForestClassifier(random_state=8)
        bl2 = LogisticRegression()
        bl3 = RandomForestClassifier(max_depth=10, random_state=10)

        meta_est = LogisticRegression()

        skf = StratifiedKFold(random_state=8).split

        self.stacked_ensemble = stacker.XcessivStackedEnsemble(
            [bl1, bl2, bl3],
            ['predict', 'predict_proba', 'predict_proba'],
            meta_est,
            skf
        ) 
Example 24
Project: MKLpy   Author: IvanoLauriola   File: model_selection.py    License: GNU General Public License v3.0
def cross_val_score(KL, Y, estimator, cv=None, n_folds=3, scoring='accuracy', random_state=None, shuffle=True):
    '''Performs cross validation over a list of kernel matrices KL.'''

    scorer, f = __def_score__(scoring)
    f = getattr(estimator, f)
    n = len(Y)
    cv = cv or KFold(n_folds, random_state=random_state, shuffle=shuffle)
    results = []
    for train, test in cv.split(Y, Y):
        KLtr = [K[train][:,train] for K in KL]
        KLte = [K[test ][:,train] for K in KL]
        clf = estimator.fit(KLtr,Y[train])
        y = f(KLte)
        results.append(scorer(Y[test],y))
    return results 
Example 25
Project: pytorch_geometric   Author: rusty1s   File: dna.py    License: MIT License
def gen_uniform_20_20_60_split(data):
    # Five stratified folds of 20% each: fold 0 -> train, fold 1 -> val,
    # folds 2-4 (60%) -> test.
    skf = StratifiedKFold(5, shuffle=True, random_state=55)
    idx = [torch.from_numpy(i) for _, i in skf.split(data.y, data.y)]
    data.train_idx = idx[0].to(torch.long)
    data.val_idx = idx[1].to(torch.long)
    data.test_idx = torch.cat(idx[2:], dim=0).to(torch.long)
    return data 
Example 26
Project: wsdm19cup   Author: lampts   File: train_predict_trees_batch2.py    License: MIT License
def validate_predict(model,X,y,X_test,n_splits=10,seed=42,model_type='lgb',verbose=0, sample_weights=sample_weights):
        
    preds = np.zeros((X.shape[0],3))
    preds_test = np.zeros((X_test.shape[0],3))
    cv_scores = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for idx_train, idx_val in skf.split(X, y):
        X_train, X_val = X[idx_train,:], X[idx_val,:]
        y_train, y_val = y[idx_train], y[idx_val]
        
        if model_type == 'lgb':
            model.fit(X_train, y_train,
                        eval_set=[(X_train, y_train), (X_val, y_val)],
                        verbose=verbose)
        else:
            model.fit(X_train, y_train)
            
        if hasattr(model, 'predict_proba'):
            yhat_val = model.predict_proba(X_val)
            preds_test = preds_test + model.predict_proba(X_test)
            preds[idx_val] = yhat_val
        else:
            yhat_val = model.predict(X_val)
            preds_test = preds_test + model.predict(X_test)
            preds[idx_val] = yhat_val
        cv_scores.append(accuracy_score(y_val, np.array(['agreed', 'disagreed', 'unrelated'])[np.argmax(yhat_val,axis=1)]))
    print("local cv", np.mean(cv_scores), np.std(cv_scores))
    print(f"Val accuracy: {accuracy_score(y, np.array(['agreed', 'disagreed', 'unrelated'])[np.argmax(preds,axis=1)], sample_weight=sample_weights):.5f}")
    preds_test /= n_splits       
    return preds, preds_test

### TRAIN - PREDICT ### 
Example 27
Project: wsdm19cup   Author: lampts   File: train_predict_trees_batch3.py    License: MIT License
def validate_predict(model,X,y,X_test,n_splits=10,seed=42,model_type='lgb',verbose=0, sample_weights=sample_weights):
        
    preds = np.zeros((X.shape[0],3))
    preds_test = np.zeros((X_test.shape[0],3))
    cv_scores = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for idx_train, idx_val in skf.split(X, y):
        X_train, X_val = X[idx_train,:], X[idx_val,:]
        y_train, y_val = y[idx_train], y[idx_val]
        
        if model_type == 'lgb':
            model.fit(X_train, y_train,
                        eval_set=[(X_train, y_train), (X_val, y_val)],
                        verbose=verbose)
        else:
            model.fit(X_train, y_train)
            
        if hasattr(model, 'predict_proba'):
            yhat_val = model.predict_proba(X_val)
            preds_test = preds_test + model.predict_proba(X_test)
            preds[idx_val] = yhat_val
        else:
            yhat_val = model.predict(X_val)
            preds_test = preds_test + model.predict(X_test)
            preds[idx_val] = yhat_val
        cv_scores.append(accuracy_score(y_val, np.array(['agreed', 'disagreed', 'unrelated'])[np.argmax(yhat_val,axis=1)]))
    print("local cv", np.mean(cv_scores), np.std(cv_scores))
    print(f"Val accuracy: {accuracy_score(y, np.array(['agreed', 'disagreed', 'unrelated'])[np.argmax(preds,axis=1)], sample_weight=sample_weights):.5f}")
    preds_test /= n_splits       
    return preds, preds_test 
Example 28
Project: category_encoders   Author: scikit-learn-contrib   File: wrapper.py    License: BSD 3-Clause "New" or "Revised" License
def __init__(self, feature_encoder, cv=5, shuffle=True, random_state=None):
        self.feature_encoder = feature_encoder
        self.__name__ = feature_encoder.__class__.__name__

        if isinstance(cv, int):
            self.cv = StratifiedKFold(n_splits=cv, shuffle=shuffle, random_state=random_state)
        else:
            self.cv = cv 
Example 29
Project: LogClass   Author: NetManAIOps   File: train_binary.py    License: MIT License
def train(params, x_data, y_data, target_names):
    # KFold Cross Validation
    kfold = StratifiedKFold(n_splits=params['kfold']).split(x_data, y_data)
    best_pu_fs = 0.
    for train_index, test_index in tqdm(kfold):
        x_train, x_test = x_data[train_index], x_data[test_index]
        y_train, y_test = y_data[train_index], y_data[test_index]
        x_train, _ = extract_features(x_train, params)
        with TestingParameters(params):
            x_test, _ = extract_features(x_test, params)
        # Binary training features
        y_test_pu = binary_train_gtruth(y_test)
        y_train_pu = binary_train_gtruth(y_train)
        # Binary PULearning with RF
        binary_clf_getter =\
            binary_classifier_registry.get_binary_model(
                params['binary_classifier'])
        binary_clf = binary_clf_getter(params)
        binary_clf.fit(x_train, y_train_pu)
        y_pred_pu = binary_clf.predict(x_test)
        get_accuracy = black_box_report_registry.get_bb_report('acc')
        binary_acc = get_accuracy(y_test_pu, y_pred_pu)
        if binary_acc > best_pu_fs:
            best_pu_fs = binary_acc
            save_params(params)
            binary_clf.save()
            print(binary_acc)

        for report in params['report']:
            try:
                get_bb_report = black_box_report_registry.get_bb_report(report)
                result = get_bb_report(y_test_pu, y_pred_pu)
            except Exception:
                pass
            else:
                print(f'Binary classification {report} report:')
                print(result)