Python sklearn.cross_validation.StratifiedKFold() Examples

The following code examples show how to use sklearn.cross_validation.StratifiedKFold(). They are taken from open source Python projects. Note that the sklearn.cross_validation module was deprecated in scikit-learn 0.18 and removed in 0.20; the replacement class, sklearn.model_selection.StratifiedKFold, takes n_splits instead of the labels and yields folds through its split(X, y) method.
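
By way of orientation, here is a minimal sketch of the two call styles on illustrative toy data (the arrays are not taken from any of the projects below): the old splitter is constructed directly from the labels and iterated, while the current one is constructed from n_splits and iterated via split(X, y). Several of the examples below (for instance Examples 1, 4 and 41) support both APIs by falling back when the old signature raises a TypeError.

import numpy as np

X = np.arange(40).reshape(20, 2)   # illustrative features
y = np.array([0, 1] * 10)          # illustrative binary labels

# Deprecated API (scikit-learn < 0.18): the splitter is built from the labels
# and is itself iterable.
#   from sklearn.cross_validation import StratifiedKFold
#   for train_idx, test_idx in StratifiedKFold(y, n_folds=5, shuffle=True, random_state=0):
#       ...

# Current API (scikit-learn >= 0.18): the splitter is built from n_splits and
# the folds come from .split(X, y).
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, test_idx in skf.split(X, y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]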

Example 1
Project: dynamicgem   Author: Sujit-O   File: embutils.py    MIT License 7 votes
def _validate_link_reconstruction(self, samples, lbs):
        # cache = utils.KeyDefaultDict(lambda x: self.embeddings_at(x))
        # feat = []
        # for v in samples:
        #     emb = cache[v[0] - 1]
        #     # feat.append(np.concatenate((emb[v[1]], emb[v[2]]), axis=0))
        #     feat.append(np.abs(emb[v[1]] - emb[v[2]]))
        # feat = np.vstack(feat)
        feat = self.make_features(samples)
        feat = np.abs(feat[:, 0] - feat[:, 1])

        clf = LogisticRegression()
        try:
            cv = StratifiedKFold(lbs, n_folds=2, shuffle=True)
            parts = cv
        except TypeError:
            cv = StratifiedKFold(n_splits=2, shuffle=True)
            parts = cv.split(feat, lbs)

        val_score = []
        for tr, te in parts:
            model = clf.fit(feat[tr], lbs[tr])
            p = model.predict(feat[te])
            val_score.append(f1_score(lbs[te], p))
        return np.mean(val_score) 
Example 2
Project: RotationForest   Author: joshloyal   File: simple_benchmark.py    MIT License 7 votes
def test_toy_data(name, clf):
    X, y = classification_data()
    k_folds = 5
    cv = StratifiedKFold(y, k_folds, random_state=1234)

    acc, auc = [], []
    for train, test in cv:
        xt, xv, yt, yv = X[train, :], X[test, :], y[train], y[test]
        clf.fit(xt, yt)
        yhat = clf.predict(xv)
        proba = clf.predict_proba(xv)[:, 1]
        acc.append(np.mean(yhat == yv))
        auc.append(roc_auc_score(yv, proba))

    acc_mean, acc_std = np.mean(acc), np.std(acc)
    auc_mean, auc_std = np.mean(auc), np.std(auc)
    print name
    print 'accuracy: {0:.3f} +/- {1:.3f}'.format(acc_mean, acc_std)
    print 'auc: {0:.3f} +/- {1:.3f}'.format(auc_mean, auc_std)
    print '-'*80
    return {'name': name,
            'acc_mean': acc_mean,
            'acc_std': acc_std,
            'auc_mean': auc_mean,
            'auc_std': auc_std} 
Example 3
Project: 2016CCF-sougou   Author: prozhuchen   File: class_w2v.py    Apache License 2.0 6 votes
def validation(self,X,Y,kind):
        """

        使用2-fold进行验证
        """
        print 'validating...'
        fold_n=2
        folds = list(StratifiedKFold(Y, n_folds=fold_n, random_state=0))
        score=np.zeros(fold_n)
        for j, (train_idx, test_idx) in enumerate(folds):
            print j + 1, '-fold'
            X_train = X[train_idx]
            y_train = Y[train_idx]
            X_test = X[test_idx]
            y_test = Y[test_idx]

            res = self.fit(X_train, y_train, X_test)
            cur = sum(y_test == res) * 1.0 / len(res)
            score[j] = cur
        print score, score.mean()
        return score.mean() 
Example 4
Project: dynamicgem   Author: Sujit-O   File: embutils.py    MIT License 6 votes
def _validate_node_classify(self, samples, lbs):
        # note that the first dimension of feat is for each node in each sample (time, node1, node2, ...)
        feat = self.make_features(samples)[:, 0]
        assert len(feat) == len(lbs)

        clf = LogisticRegression(class_weight='balanced')
        try:
            cv = StratifiedKFold(lbs, n_folds=2, shuffle=True)
            parts = cv
        except TypeError as e:
            cv = StratifiedKFold(n_splits=2, shuffle=True)
            parts = cv.split(feat, lbs)

        val_score = []
        for tr, te in parts:
            model = clf.fit(feat[tr], lbs[tr])
            p = model.predict(feat[te])
            val_score.append(f1_score(lbs[te], p))
        return np.mean(val_score) 
Example 5
Project: dsw-ont-ijcai   Author: anonymous-ijcai   File: topics_music.py    GNU General Public License v3.0 6 votes
def evaluate_depth_based_classifier_cross_validated(depth):
    np.random.seed(0)
    categories, classes, inner_cross_validation,\
        outer_cross_validation, model_selection_measure,\
        evaluation_measures = default_classifier_evaluation_params()
    def smaller_cross_validation(outputs):
        return cross_validation.StratifiedKFold(outputs, n_folds=2)
    tuned_clf = depth_based_selection(precompute_full_selection(), 6)
    return topics.train_evaluate_topic_classifier_cv(
        tuned_clf, categories, classes,
        inner_cross_validation,
#       smaller_cross_validation,
        outer_cross_validation,
        model_selection_measure,
        evaluation_measures,
        param_grid=None,
        learning=False)

# metrics = evaluate_depth_based_classifier_cross_validated(6)
# metric_names = ['accuracy', 'f1_pos', 'f1_neg', 'weighted_f1']
# for metric_name, metric in zip(metric_names, metrics):
#     print("{:<11s} : {:.3f} +- {:.3f}".format(metric_name, metric.mean(), metric.std()))

# <codecell> 
Example 6
Project: pySpatialTools   Author: tgquintela   File: check_features.py    MIT License 6 votes
def fit_model(model, X, y):
    "Function to fit the model we want."
    n_folds = 3
    skf = StratifiedKFold(y, n_folds=n_folds)
    models, measures = [], []
    for train_index, test_index in skf:
        ## Extract Kfold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        ## Fit models
        model_i = model()
        models.append(model_i.fit(X_train, y_train))
        ## Compute measure
        proba_m = model_i.predict_proba(X_test)
        measures.append(compute_measure(y_test, proba_m))

    i = np.argmax(measures)
    model, measure = models[i], measures[i]
    return model, measure 
Example 7
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 6 votes
def test_stratified_kfold_no_shuffle():
    # Manually check that StratifiedKFold preserves the data ordering as much
    # as possible on toy datasets in order to avoid hiding sample dependencies
    # when possible
    splits = iter(cval.StratifiedKFold([1, 1, 0, 0], 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 2])
    assert_array_equal(train, [1, 3])

    train, test = next(splits)
    assert_array_equal(test, [1, 3])
    assert_array_equal(train, [0, 2])

    splits = iter(cval.StratifiedKFold([1, 1, 1, 0, 0, 0, 0], 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 1, 3, 4])
    assert_array_equal(train, [2, 5, 6])

    train, test = next(splits)
    assert_array_equal(test, [2, 5, 6])
    assert_array_equal(train, [0, 1, 3, 4]) 
Example 8
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 6 votes
def test_stratified_kfold_ratios():
    # Check that stratified kfold preserves label ratios in individual splits
    # Repeat with shuffling turned off and on
    n_samples = 1000
    labels = np.array([4] * int(0.10 * n_samples) +
                      [0] * int(0.89 * n_samples) +
                      [1] * int(0.01 * n_samples))
    for shuffle in [False, True]:
        for train, test in cval.StratifiedKFold(labels, 5, shuffle=shuffle):
            assert_almost_equal(np.sum(labels[train] == 4) / len(train), 0.10,
                                2)
            assert_almost_equal(np.sum(labels[train] == 0) / len(train), 0.89,
                                2)
            assert_almost_equal(np.sum(labels[train] == 1) / len(train), 0.01,
                                2)
            assert_almost_equal(np.sum(labels[test] == 4) / len(test), 0.10, 2)
            assert_almost_equal(np.sum(labels[test] == 0) / len(test), 0.89, 2)
            assert_almost_equal(np.sum(labels[test] == 1) / len(test), 0.01, 2) 
Example 9
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 6 votes
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 10
Project: linear_neuron   Author: uglyboxer   File: test_grid_search.py    MIT License 6 votes
def test_grid_search_score_consistency():
    # test that correct scores are used
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score)
        grid_search.fit(X, y)
        cv = StratifiedKFold(n_folds=3, y=y)
        for C, scores in zip(Cs, grid_search.grid_scores_):
            clf.set_params(C=C)
            scores = scores[2]  # get the separate runs from grid scores
            i = 0
            for train, test in cv:
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, scores[i])
                i += 1 
Example 11
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0 6 votes
def test_stratified_kfold_no_shuffle():
    # Manually check that StratifiedKFold preserves the data ordering as much
    # as possible on toy datasets in order to avoid hiding sample dependencies
    # when possible
    splits = iter(cval.StratifiedKFold([1, 1, 0, 0], 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 2])
    assert_array_equal(train, [1, 3])

    train, test = next(splits)
    assert_array_equal(test, [1, 3])
    assert_array_equal(train, [0, 2])

    splits = iter(cval.StratifiedKFold([1, 1, 1, 0, 0, 0, 0], 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 1, 3, 4])
    assert_array_equal(train, [2, 5, 6])

    train, test = next(splits)
    assert_array_equal(test, [2, 5, 6])
    assert_array_equal(train, [0, 1, 3, 4]) 
Example 12
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0 6 votes
def test_stratified_kfold_ratios():
    # Check that stratified kfold preserves label ratios in individual splits
    # Repeat with shuffling turned off and on
    n_samples = 1000
    labels = np.array([4] * int(0.10 * n_samples) +
                      [0] * int(0.89 * n_samples) +
                      [1] * int(0.01 * n_samples))
    for shuffle in [False, True]:
        for train, test in cval.StratifiedKFold(labels, 5, shuffle=shuffle):
            assert_almost_equal(np.sum(labels[train] == 4) / len(train), 0.10,
                                2)
            assert_almost_equal(np.sum(labels[train] == 0) / len(train), 0.89,
                                2)
            assert_almost_equal(np.sum(labels[train] == 1) / len(train), 0.01,
                                2)
            assert_almost_equal(np.sum(labels[test] == 4) / len(test), 0.10, 2)
            assert_almost_equal(np.sum(labels[test] == 0) / len(test), 0.89, 2)
            assert_almost_equal(np.sum(labels[test] == 1) / len(test), 0.01, 2) 
Example 13
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0 6 votes
def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 14
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0 6 votes
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 15
Project: Weiss   Author: WangWenjun559   File: test_grid_search.py    Apache License 2.0 6 votes
def test_grid_search_score_consistency():
    # test that correct scores are used
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score)
        grid_search.fit(X, y)
        cv = StratifiedKFold(n_folds=3, y=y)
        for C, scores in zip(Cs, grid_search.grid_scores_):
            clf.set_params(C=C)
            scores = scores[2]  # get the separate runs from grid scores
            i = 0
            for train, test in cv:
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, scores[i])
                i += 1 
Example 16
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0 6 votes
def test_stratified_kfold_no_shuffle():
    # Manually check that StratifiedKFold preserves the data ordering as much
    # as possible on toy datasets in order to avoid hiding sample dependencies
    # when possible
    splits = iter(cval.StratifiedKFold([1, 1, 0, 0], 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 2])
    assert_array_equal(train, [1, 3])

    train, test = next(splits)
    assert_array_equal(test, [1, 3])
    assert_array_equal(train, [0, 2])

    splits = iter(cval.StratifiedKFold([1, 1, 1, 0, 0, 0, 0], 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 1, 3, 4])
    assert_array_equal(train, [2, 5, 6])

    train, test = next(splits)
    assert_array_equal(test, [2, 5, 6])
    assert_array_equal(train, [0, 1, 3, 4]) 
Example 17
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0 6 votes
def test_stratified_kfold_ratios():
    # Check that stratified kfold preserves label ratios in individual splits
    # Repeat with shuffling turned off and on
    n_samples = 1000
    labels = np.array([4] * int(0.10 * n_samples) +
                      [0] * int(0.89 * n_samples) +
                      [1] * int(0.01 * n_samples))
    for shuffle in [False, True]:
        for train, test in cval.StratifiedKFold(labels, 5, shuffle=shuffle):
            assert_almost_equal(np.sum(labels[train] == 4) / len(train), 0.10,
                                2)
            assert_almost_equal(np.sum(labels[train] == 0) / len(train), 0.89,
                                2)
            assert_almost_equal(np.sum(labels[train] == 1) / len(train), 0.01,
                                2)
            assert_almost_equal(np.sum(labels[test] == 4) / len(test), 0.10, 2)
            assert_almost_equal(np.sum(labels[test] == 0) / len(test), 0.89, 2)
            assert_almost_equal(np.sum(labels[test] == 1) / len(test), 0.01, 2) 
Example 18
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0 6 votes
def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 19
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0 6 votes
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 20
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_grid_search.py    Apache License 2.0 6 votes
def test_grid_search_score_consistency():
    # test that correct scores are used
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score)
        grid_search.fit(X, y)
        cv = StratifiedKFold(n_folds=3, y=y)
        for C, scores in zip(Cs, grid_search.grid_scores_):
            clf.set_params(C=C)
            scores = scores[2]  # get the separate runs from grid scores
            i = 0
            for train, test in cv:
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, scores[i])
                i += 1 
Example 21
Project: kaggle_otto   Author: ahara   File: blender.py    BSD 3-Clause "New" or "Revised" License 6 votes
def get_weights():
    # Read validation labels
    _, labels, _, _, _ = utils.load_data()
    skf = StratifiedKFold(labels, n_folds=5, random_state=23)
    test_index = None
    for _, test_idx in skf:
        test_index = np.append(test_index, test_idx) if test_index is not None else test_idx
    val_labels = labels[test_index]
    # Read predictions on validation set
    val_predictions = []
    prediction_files = utils.get_prediction_files()
    for preds_file in prediction_files:
        vp = np.genfromtxt(os.path.join(consts.BLEND_PATH, preds_file), delimiter=',')
        val_predictions.append(vp)
    # Minimize blending function
    p0 = [1.] * len(prediction_files)
    p = fmin_cobyla(error, p0, args=(val_predictions, val_labels), cons=[constraint], rhoend=1e-5)

    return p 
Example 22
Project: kaggle_otto   Author: ahara   File: utils.py    BSD 3-Clause "New" or "Revised" License 6 votes
def make_blender_cv(classifier, x, y, calibrate=False):
    skf = StratifiedKFold(y, n_folds=5, random_state=23)
    scores, predictions = [], None
    for train_index, test_index in skf:
        if calibrate:
            # Make training and calibration
            calibrated_classifier = CalibratedClassifierCV(classifier, method='isotonic', cv=get_cv(y[train_index]))
            fitted_classifier = calibrated_classifier.fit(x[train_index, :], y[train_index])
        else:
            fitted_classifier = classifier.fit(x[train_index, :], y[train_index])
        preds = fitted_classifier.predict_proba(x[test_index, :])

        # Free memory
        calibrated_classifier, fitted_classifier = None, None
        gc.collect()

        scores.append(log_loss(y[test_index], preds))
        predictions = np.append(predictions, preds, axis=0) if predictions is not None else preds
    return scores, predictions 
Example 23
Project: macos-re   Author: tbarabosch   File: classify_macho.py    GNU General Public License v2.0 5 votes
def classify(X, y):
    # TODO: split learning and plotting
    X = numpy.array(X)
    y = numpy.array(y)

    X = StandardScaler().fit_transform(X)
    skf = StratifiedKFold(y, n_folds=10)

    for classifier in range(0,len(classifiers)):
        print "Training classifier %s..." % (names[classifier])

        mean_tpr = 0.0
        mean_fpr = numpy.linspace(0, 1, 100)

        for train_index, test_index in skf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf = classifiers[classifier]
            probas_ = clf.fit(X_train, y_train).predict_proba(X_test)
            fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
            mean_tpr += interp(mean_fpr, fpr, tpr)
            mean_tpr[0] = 0.0
        mean_tpr /= len(skf)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        plt.plot(mean_fpr, mean_tpr, 'k--',
             label= names[classifier] + ' (AUC = %0.3f)' % mean_auc, lw=2, c=colors[classifier])

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right", fancybox=True, shadow=True, fontsize=10)
    plt.tight_layout()
    plt.savefig("osx_malware_goodware_roc.svg") 
Example 24
Project: kaggle-seizure-prediction   Author: sics-lm   File: seizure_modeling.py    MIT License 5 votes
def get_cv_generator(training_data, do_segment_split=True, random_state=None):
    """
    Returns a cross validation generator.
    :param training_data: The training data to create the folds from.
    :param do_segment_split: If True, the folds will be generated based on the segment names.
    :param random_state: A constant to use as a random seed.
    :return: A generator which can be used by the grid search to generate cross validation folds.
    """
    k_fold_kwargs = dict(n_folds=10, random_state=random_state)
    if do_segment_split:
        cv = dataset.SegmentCrossValidator(training_data, cross_validation.StratifiedKFold, **k_fold_kwargs)
    else:
        cv = sklearn.cross_validation.StratifiedKFold(training_data['Preictal'], **k_fold_kwargs)
    return cv 
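
A hedged usage sketch for the generator above (the toy data frame and the LogisticRegression grid below are illustrative stand-ins, not part of the project): the returned object is passed straight through as the cv argument of the pre-0.18 grid search, as the docstring describes.

# Hypothetical usage; the data frame and estimator are illustrative only.
import numpy as np
import pandas as pd
from sklearn.grid_search import GridSearchCV          # pre-0.18 location
from sklearn.linear_model import LogisticRegression

training_data = pd.DataFrame({'Preictal': np.tile([0, 1], 20)})
X = np.random.rand(len(training_data), 4)

cv = get_cv_generator(training_data, do_segment_split=False, random_state=42)
search = GridSearchCV(LogisticRegression(), {'C': [0.1, 1.0, 10.0]},
                      scoring='roc_auc', cv=cv)
search.fit(X, training_data['Preictal'].values)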
Example 25
Project: kaggle-seizure-prediction   Author: sics-lm   File: dataset.py    MIT License 5 votes
def __init__(self, dataframe, base_cv=None, **cv_kwargs):
        # We create a copy of the dataframe with a new last level
        # index which is an enumeration of the rows (like proper indices)
        self.all_segments = pd.DataFrame({'Preictal': dataframe['Preictal'], 'i': np.arange(len(dataframe))})
        self.all_segments.set_index('i', append=True, inplace=True)

        # Now create a series with only the segments as rows. This is what we will pass into the wrapped cross
        # validation generator
        self.segments = self.all_segments['Preictal'].groupby(level='segment').first()
        self.segments.sort(inplace=True)

        if base_cv is None:
            self.cv = cross_validation.StratifiedKFold(self.segments, **cv_kwargs)
        else:
            self.cv = base_cv(self.segments, **cv_kwargs) 
Example 26
Project: kaggle-seizure-prediction   Author: sics-lm   File: dataset.py    MIT License 5 votes
def split_dataset(dataframe, training_ratio=.8, do_segment_split=True, shuffle=False, random_state=None):
    """
    Splits the dataset into a training and test partition.
    :param dataframe: A data frame to split. Should have a 'Preictal' column.
    :param training_ratio: The ratio of the data to use for the first part.
    :param do_segment_split: If True, the split will be done on whole segments.
    :param shuffle: If true, the split will shuffle the data before splitting.
    :param random_state: Seed
    :return: A pair of disjoint data frames, where the first frame contains *training_ratio* of all the data.
    """

    # We'll make the splits based on the sklearn cross validators,
    # We calculate the number of folds which correspond to the
    # desired training ratio. If *r* is the training ratio and *k*
    # the number of folds, we'd like *r* = (*k* - 1)/*k*, that is,
    # the ratio should be the same as all the included folds divided
    # by the total number of folds. This gives us *k* = 1/(1-*r*)
    k = int(np.floor(1/(1 - training_ratio)))

    if do_segment_split:
        # We use the segment based cross validator to get a stratified split.
        cv = SegmentCrossValidator(dataframe,
                                   n_folds=k,
                                   shuffle=shuffle,
                                   random_state=random_state)
    else:
        # Don't split by segment, but still do a stratified split
        cv = cross_validation.StratifiedKFold(dataframe['Preictal'],
                                              n_folds=k,
                                              shuffle=shuffle,
                                              random_state=random_state)

    training_indices, test_indices = first(cv)
    return dataframe.iloc[training_indices], dataframe.iloc[test_indices] 
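
As a quick numeric check of the fold arithmetic in the comments above (values are illustrative): with training ratio r, k = floor(1 / (1 - r)) folds leave (k - 1)/k of the rows on the training side, which is r again.

import numpy as np

for r in (0.5, 0.75, 0.8, 0.9):
    k = int(np.floor(1 / (1 - r)))
    # one fold becomes the test partition, the remaining k - 1 folds the training partition
    print("r={}: k={} folds, training fraction={}".format(r, k, (k - 1) / float(k)))
# r=0.5 -> 2 folds, r=0.75 -> 4 folds, r=0.8 -> 5 folds, r=0.9 -> 10 folds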
Example 27
Project: jr-tools   Author: kingjr   File: base.py    BSD 2-Clause "Simplified" License 5 votes
def fit(self, epochs, y=None):
        from sklearn.cross_validation import check_cv, StratifiedKFold
        from mne.decoding.time_gen import _check_epochs_input
        X, y, self.gat.picks_ = _check_epochs_input(epochs, y, self.gat.picks)
        gat_list = list()

        cv = self.cv
        if isinstance(cv, (int, np.int)):
            cv = StratifiedKFold(y, cv)
        cv = check_cv(cv, X, y, classifier=True)
        # Construct meta epoch and fit gat with a single fold
        for ii, (train, test) in enumerate(cv):
            # meta trial
            epochs_ = make_meta_epochs(epochs[train], y[train], n_bin=self.n)
            # fit gat
            gat_ = deepcopy(self.gat)
            cv_one_fold = [(range(len(epochs_)), [])]
            gat_.cv = cv_one_fold
            gat_.fit(epochs_, epochs_.events[:, 2])
            gat_list.append(gat_)

        # gather
        self.gat = gat_
        self.gat.train_times_ = gat_.train_times_
        self.gat.estimators_ = np.squeeze(
            [gat.estimators_ for gat in gat_list]).T.tolist()
        self.gat.cv_ = cv
        self.gat.y_train_ = y 
Example 28
Project: astronet   Author: CasvandenBogaard   File: split.py    GNU General Public License v2.0 5 votes
def __call__(self, X, y, net):
        
        if not self.cutoff:
            if self.eval_size:
                
                if net.regression or not self.stratify:
                    kf = KFold(y.shape[0], round(1. / self.eval_size))
                else:
                    kf = StratifiedKFold(y, round(1. / self.eval_size))
      
                train_indices, valid_indices = next(iter(kf))
                X_train, y_train = _sldict(X, train_indices), y[train_indices]
                X_valid, y_valid = _sldict(X, valid_indices), y[valid_indices]
                
            else:
                
                X_train, y_train = X, y
                X_valid, y_valid = _sldict(X, slice(len(y), None)), y[len(y):]
  
            return X_train, X_valid, y_train, y_valid
    
        else:
            
            train_indices, valid_indices = range(self.cutoff), range(self.cutoff, len(y))
            X_train, y_train = _sldict(X, train_indices), y[train_indices]
            X_valid, y_valid = _sldict(X, valid_indices), y[valid_indices]
            
            return X_train, X_valid, y_train, y_valid 
Example 29
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0 5 votes
def fit(self, X, s):
        if self.trad_clf is None:
            self.trad_clf = GridSearchCV(SGDClassifier(loss="log", penalty="l2"), param_grid={"alpha": np.logspace(-4, 0, 10)})

        c = np.zeros(self.n_folds)
        for i, (itr, ite) in enumerate(StratifiedKFold(s, n_folds=self.n_folds, shuffle=True)):
            self.trad_clf.fit(X[itr], s[itr])
            c[i] = self.trad_clf.predict_proba(X[ite][s[ite]==1])[:,1].mean()
        self.c = c.mean()
        return self 
Example 30
Project: WeightedTags-MF   Author: andreuvall   File: TMF.py    BSD 2-Clause "Simplified" License 5 votes
def get_data(full_R, num_folds, alpha):
    # full_R: ratings in coo format
    # num_folds: data splits
    # alpha: weight for the binary ratings

    # Make data splits balancing users in each fold
    splits = cv.StratifiedKFold(full_R.row, n_folds=num_folds, random_state=1)
    data = []
    test_indices = open('test_' + dataset + '_TMF.txt', 'wa')
    for train, test in splits:
        # Train data (recall that R is in coo format)
        R = sparse.csr_matrix((full_R.data[train], (full_R.row[train],
                                                    full_R.col[train])),
                              shape=full_R.shape)

        # P = R > 0 is really not needed through the code

        # Weight data
        weights = 1. + alpha * np.log(1. + R.data)
        W = sparse.csr_matrix((weights, R.nonzero()), shape=full_R.shape)

        # Test data
        Rt = sparse.coo_matrix((full_R.data[test], (full_R.row[test],
                                                    full_R.col[test])),
                               shape=full_R.shape)

        fold_data = {'W': W, 'Rt': Rt}
        data.append(fold_data)

        # Store test indices for further mpr calculation
        np.savetxt(test_indices, test, fmt='%i')

    test_indices.close()
    return data


# RMSE function 
Example 31
Project: WeightedTags-MF   Author: andreuvall   File: WTMF.py    BSD 2-Clause "Simplified" License 5 votes
def get_data(full_R, num_folds, alpha):
    # full_R: ratings in coo format
    # num_folds: data splits
    # alpha: weight for the binary ratings

    # Make data splits balancing users in each fold
    splits = cv.StratifiedKFold(full_R.row, n_folds=num_folds, random_state=1)
    data = []
    test_indices = open('test_' + dataset + '_WTMF.txt', 'wa')
    for train, test in splits:
        # Train data (recall that R is in coo format)
        R = sparse.csr_matrix((full_R.data[train], (full_R.row[train],
                                                    full_R.col[train])),
                              shape=full_R.shape)

        # P = R > 0 is really not needed through the code

        # Weight data
        weights = 1. + alpha * np.log(1. + R.data)
        W = sparse.csr_matrix((weights, R.nonzero()), shape=full_R.shape)

        # Test data
        Rt = sparse.coo_matrix((full_R.data[test], (full_R.row[test],
                                                    full_R.col[test])),
                               shape=full_R.shape)

        fold_data = {'WR': W, 'Rt': Rt}
        data.append(fold_data)

        # Store test indices for further mpr calculation
        np.savetxt(test_indices, test, fmt='%i')

    test_indices.close()
    return data


# Weighting for the tags 
Example 32
Project: 2016CCF-sougou   Author: prozhuchen   File: classify.py    Apache License 2.0 5 votes
def validation(self, X, Y, wv_X, kind):
        """
        2-fold validation
        :param X: train text
        :param Y: train label
        :param wv_X: train wv_vec
        :param kind: age/gender/education
        :return: mean score of 2-fold validation
        """
        print 'vectorizing...'
        X=np.array(X)
        fold_n=2
        folds = list(StratifiedKFold(Y, n_folds=fold_n, shuffle=False,random_state=0))
        score = np.zeros(fold_n)
        for j, (train_idx, test_idx) in enumerate(folds):
            print j+1,'-fold'

            X_train = X[train_idx]
            y_train = Y[train_idx]
            X_test = X[test_idx]
            y_test = Y[test_idx]

            wv_X_train =wv_X[train_idx]
            wv_X_test = wv_X[test_idx]

            vec = TfidfVectorizer(use_idf=True,sublinear_tf=False, max_features=50000, binary=True)
            vec.fit(X_train, y_train)
            X_train = vec.transform(X_train)
            X_test = vec.transform(X_test)

            print 'shape',X_train.shape

            ypre = self.stacking(X_train,y_train,X_test,wv_X_train,wv_X_test,kind)
            cur = sum(y_test == ypre) * 1.0 / len(ypre)
            score[j] = cur

        print score
        print score.mean(),kind
        return score.mean() 
Example 33
Project: color-features   Author: skearnes   File: models.py    BSD 3-Clause "New" or "Revised" License 5 votes
def get_cv(labels):
    """Get a cross-validation iterator (NOT generator)."""
    cv = cross_validation.StratifiedKFold(labels, n_folds=5, shuffle=True,
                                          random_state=20160416)
    return list(cv) 
Example 34
Project: DCASE2017-task1   Author: ronggong   File: xgb_classification.py    GNU Affero General Public License v3.0 5 votes
def train_evaluate_stratified(clf, X, y, labels):
    skf = StratifiedKFold(y, n_folds=10)
    for fold_number, (train_index, test_index) in enumerate(skf):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        save_results(y_test, y_pred, labels, fold_number) 
Example 35
Project: cage   Author: bm2-lab   File: logit_selector.py    MIT License 5 votes
def LogitSelector(x, y, cv, njob):

    lb = prep.LabelBinarizer()
    y = lb.fit_transform(y).ravel()

    cls = LogisticRegression()
    def __Auc(xte, yte):
        ypo = cls.predict_proba(xte)
        flt_auc = roc_auc_score(yte, ypo[:,1])
        return flt_auc
    
    skf = StratifiedKFold(y, n_folds=cv)
    model = LogisticRegressionCV(penalty='l1', solver='liblinear', fit_intercept=False, cv=cv, n_jobs=njob)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        model.fit(x, y)
    columns = np.arange(x.shape[1])[model.coef_.ravel() != 0]
    
    mdl_eval = lambda func: lambda idx_tr, idx_te: func(y[idx_te], cls.fit(x[idx_tr][:,columns], y[idx_tr]).predict(x[idx_te][:,columns]))
    auc_eval = lambda idx_tr, idx_te: roc_auc_score(y[idx_te], cls.fit(x[idx_tr][:,columns], y[idx_tr]).predict_proba(x[idx_te][:,columns])[:,1])
    res_eval = lambda func: np.average(map(mdl_eval(func), *zip(*[(idx_tr, idx_te) for idx_tr, idx_te in skf])))

    accu = res_eval(accuracy_score)
    prec = res_eval(precision_score)
    rec = res_eval(recall_score)
    f1 = res_eval(f1_score)
    au = np.average(map(auc_eval, *zip(*[(idx_tr, idx_te) for idx_tr, idx_te in skf])))

    cls.fit(x[:,columns], y)
    return Mdc(model=cls, idx=columns, accu=accu, prec=prec, rec=rec, f1=f1, au=au) 
Example 36
Project: egk   Author: mlds-lab   File: random_mvegk.py    MIT License 5 votes
def grid_search_cv(means, covs, label, gamma_grid, c_grid, n_sample=50,
                   normalize=True):
    best_score = -np.inf

    for gamma in gamma_grid:
        rp = MultiEGKSampler(gamma, n_sample=n_sample, normalize=normalize)
        scores = defaultdict(float)
        for idx_train, idx_test in StratifiedKFold(label):
            X_train = rp.fit_transform(means[idx_train], covs[idx_train])
            X_test = rp.transform(means[idx_test], covs[idx_test])
            l_train = label[idx_train]
            l_test = label[idx_test]
            for C in c_grid:
                clf = LinearSVC(C=C)
                clf.fit(X_train, l_train)
                l_predict = clf.predict(X_test)
                accuracy = np.mean(l_predict == l_test)
                scores[C] += accuracy

        best_C_score, best_C = max((score, C)
                                   for (C, score) in scores.iteritems())
        #print gamma, scores
        if best_C_score > best_score:
            best_score = best_C_score
            best_parms = {'gamma': gamma, 'C': best_C}

    return best_parms 
Example 37
Project: Kaggler   Author: qqgeogor   File: mf_qe_nn_clf.py    GNU General Public License v3.0 5 votes
def make_mf_lr(X ,y, clf, X_test, n_round=3):
    n = X.shape[0]
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- regressor
    '''
    print clf
    mf_tr = np.zeros(X.shape[0])
    mf_te = np.zeros(X_test.shape[0])
    for i in range(n_round):
        skf = StratifiedKFold(y, n_folds=2, shuffle=True, random_state=42+i*1000)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            
            # print('X_tr shape',X_tr.shape)
            # print('X_te shape',X_te.shape)
            
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict_proba(X_te)[:,1]
            mf_te += clf.predict_proba(X_test)[:,1]*0.5
            y_pred = clf.predict_proba(X_te)[:,1]
            score = roc_auc_score(y_te, y_pred)
            print 'pred[{}] score:{}'.format(i, score)
    return (mf_tr / n_round, mf_te / n_round) 
Example 38
Project: Kaggler   Author: qqgeogor   File: mf_qe_nn_clf.py    GNU General Public License v3.0 5 votes
def make_mf_lsvc(X ,y, clf, X_test, n_round=3):
    n = X.shape[0]
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- regressor
    '''
    print clf
    mf_tr = np.zeros(X.shape[0])
    mf_te = np.zeros(X_test.shape[0])
    for i in range(n_round):
        skf = StratifiedKFold(y, n_folds=2, shuffle=True, random_state=42+i*1000)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            
            # print('X_tr shape',X_tr.shape)
            # print('X_te shape',X_te.shape)
            
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.decision_function(X_te)
            mf_te += clf.decision_function(X_test)*0.5
            y_pred = clf.decision_function(X_te)
            score = roc_auc_score(y_te, y_pred)
            print 'pred[{}] score:{}'.format(i, score)
    return (mf_tr / n_round, mf_te / n_round) 
Example 39
Project: Kaggler   Author: qqgeogor   File: mf_qe_nn_clf.py    GNU General Public License v3.0 5 votes
def make_mf_nn(X ,y, X_test, n_round=3):
    n = X.shape[0]
    '''
    Fit metafeature by @clf and get prediction for test. Assumed that @clf -- regressor
    '''
    from kaggler.online_model.ftrl import FTRL
    mf_tr = np.zeros(X.shape[0])
    mf_te = np.zeros(X_test.shape[0])
    for i in range(n_round):
        skf = StratifiedKFold(y, n_folds=2, shuffle=True, random_state=42+i*1000)
        for ind_tr, ind_te in skf:
            clf = build_model(X)
            X_tr = [X[:,0][ind_tr],X[:,1][ind_tr]]
            X_te = [X[:,0][ind_te],X[:,1][ind_te]]

            # print('X_tr shape',X_tr.shape)
            # print('X_te shape',X_te.shape)
            
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            
            clf.fit(X_tr, y_tr,nb_epoch=2,batch_size=128,validation_data=[X_te,y_te])
            mf_tr[ind_te] += clf.predict(X_te).ravel()
            mf_te += clf.predict([X_test[:,0],X_test[:,1]]).ravel()*0.5
            y_pred = clf.predict(X_te).ravel()
            score = roc_auc_score(y_te, y_pred)
            print 'pred[{}] score:{}'.format(i, score)
    return (mf_tr / n_round, mf_te / n_round) 
Example 40
Project: res   Author: bustios   File: cross_val_scores.py    BSD 3-Clause "New" or "Revised" License 5 votes
def cross_val_scores(clf, X, y, n_iters=10, n_folds=10, n_jobs=1):
    scores = np.zeros((n_iters, 2))
    for iter in range(n_iters):
        # random_state=iter to control the randomness for reproducibility
        cv = StratifiedKFold(y, n_folds, shuffle=True, random_state=iter)
        y_pred = cross_val_predict(clf, X, y, cv, n_jobs=n_jobs)
        scores[iter, 0] = accuracy_score(y, y_pred)
        scores[iter, 1] = cohen_kappa_score(y, y_pred)

    return (scores[:,0].mean(), scores[:,0].std(),
            scores[:,1].mean(), scores[:,1].std()) 
Example 41
Project: dynamicgem   Author: Sujit-O   File: dynamicTriad.py    MIT License 5 votes
def __classify(self, feat, lbs):
        sm = None

        poscnt, negcnt = np.sum(lbs == 1), np.sum(lbs == -1)
        print("classifying with pos:neg = {}:{}".format(poscnt, negcnt))

        try:
            cv = StratifiedKFold(n_splits=5, shuffle=True)
            parts = cv.split(feat, lbs)
        except TypeError:
            cv = StratifiedKFold(lbs, n_folds=5, shuffle=True)
            parts = cv

        f1, prec, rec, acc = [], [], [], []
        for tr, te in parts:
            if sm is not None:
                x, y = sm.fit_sample(feat[tr], lbs[tr])
                # x, y = feat[tr], lbs[tr]
            else:
                x, y = feat[tr], lbs[tr]
            model = self.clf.fit(x, y)
            p = model.predict(feat[te])
            # self._model=model
            # if self.debug:
            #     print("results:", p, lbs[te])
            # print(p,np.shape(p))
            f1.append(f1_score(lbs[te], p))
            prec.append(precision_score(lbs[te], p))
            rec.append(recall_score(lbs[te], p))
            acc.append(accuracy_score(lbs[te], p))
        # idx = np.random.permutation(len(lbs))
        # x,y = feat[idx], lbs[idx]
        # self._model=self.clf.fit(x, y)    
        return prec, rec, f1, acc 
Example 42
Project: dsw-ont-ijcai   Author: anonymous-ijcai   File: relation_type.py    GNU General Public License v3.0 5 votes
def train_classifier(predictors, response, feature_names=relevant_feature_names, tuned_clf=Clf.LINEAR_SVC,
                     param_grid=None, test_size=0.5, scoring=weighted_f1, random_state=0):
    param_grid = param_grid or default_param_grid(tuned_clf)
    kf_cv = cross_validation.StratifiedKFold(response, n_folds=10, shuffle=True, random_state=random_state)
    cv_clf = GridSearchCV(estimator=tuned_clf, param_grid=param_grid, cv=kf_cv, scoring=scoring)
    cv_clf.fit(predictors, response)
    
    return cv_clf 
Example 43
Project: dsw-ont-ijcai   Author: anonymous-ijcai   File: topics.py    GNU General Public License v3.0 5 votes
def default_cross_validation(outputs):
    return cross_validation.StratifiedKFold(outputs, n_folds=5) 
Example 44
Project: arrc   Author: chop-dbhi   File: sklearn_extensions.py    MIT License 5 votes
def grid_analysis(pipeline, parameters, train_input, train_labels, cv = None):
    if not cv:
        cv = StratifiedKFold(train_labels, n_folds=5, shuffle=False)
    grid_search = GridSearchCV(pipeline, parameters, scoring='accuracy', cv=cv, refit=True, verbose = 1, n_jobs=2)
    print("Performing grid search...")
    tic = time.clock()
    grid_search.fit(train_input,train_labels)
    toc = time.clock()
    print("Grid search complete in {0} sec".format(toc-tic))
    return grid_search 
Example 45
Project: jingjuSingingPhraseMatching   Author: ronggong   File: xgb_classification.py    GNU Affero General Public License v3.0 5 votes
def train_evaluate_stratified(clf, X, y, labels):
    skf = StratifiedKFold(y, n_folds=10)
    for fold_number, (train_index, test_index) in enumerate(skf):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        save_results(y_test, y_pred, labels, fold_number) 
Example 46
Project: kaggle-right-whale   Author: felixlaumon   File: __init__.py    MIT License 5 votes
def __call__(self, X, y, net):
        if self.eval_size is not None:
            if net.regression or not self.stratify:
                # test_size = self.eval_size
                # kf = ShuffleSplit(
                #     y.shape[0], test_size=test_size,
                #     random_state=self.random_state
                # )
                # train_indices, valid_indices = next(iter(kf))
                # valid_indices = shuffle(valid_indices)
                test_size = 1 - self.eval_size
                kf = ShuffleSplit(
                    y.shape[0], test_size=test_size,
                    random_state=self.random_state
                )
                valid_indices, train_indices = next(iter(kf))
            else:
                n_folds = int(round(1 / self.eval_size))
                kf = StratifiedKFold(y, n_folds=n_folds, random_state=self.random_state)
                train_indices, valid_indices = next(iter(kf))

            X_train, y_train = X[train_indices], y[train_indices]
            X_valid, y_valid = X[valid_indices], y[valid_indices]
        else:
            X_train, y_train = X, y
            X_valid, y_valid = X[len(X):], y[len(y):]

        return X_train, X_valid, y_train, y_valid 
Example 47
Project: kaggle-right-whale   Author: felixlaumon   File: train_model.py    MIT License 5 votes
def train_test_split(X, y, test_size=0.25, random_state=42, stratify=True):
    if stratify:
        n_folds = int(round(1 / test_size))
        sss = StratifiedKFold(y, n_folds=n_folds, random_state=random_state)
    else:
        sss = ShuffleSplit(len(y), test_size=test_size, random_state=random_state)
    train_idx, test_idx = iter(sss).next()
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx] 
Example 48
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes
def test_kfold_valueerrors():
    # Check that errors are raised if there is not enough samples
    assert_raises(ValueError, cval.KFold, 3, 4)

    # Check that a warning is raised if the least populated class has too few
    # members.
    y = [3, 3, -1, -1, 2]

    cv = assert_warns_message(Warning, "The least populated class",
                              cval.StratifiedKFold, y, 3)

    # Check that despite the warning the folds are still computed even
    # though all the classes are not necessarily represented at on each
    # side of the split at each split
    check_cv_coverage(cv, expected_n_iter=3, n_samples=len(y))

    # Error when number of folds is <= 1
    assert_raises(ValueError, cval.KFold, 2, 0)
    assert_raises(ValueError, cval.KFold, 2, 1)
    assert_raises(ValueError, cval.StratifiedKFold, y, 0)
    assert_raises(ValueError, cval.StratifiedKFold, y, 1)

    # When n is not integer:
    assert_raises(ValueError, cval.KFold, 2.5, 2)

    # When n_folds is not integer:
    assert_raises(ValueError, cval.KFold, 5, 1.5)
    assert_raises(ValueError, cval.StratifiedKFold, y, 1.5) 
Example 49
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes
def test_stratifiedkfold_balance():
    # Check that KFold returns folds with balanced sizes (only when
    # stratification is possible)
    # Repeat with shuffling turned off and on
    labels = [0] * 3 + [1] * 14
    for shuffle in [False, True]:
        for skf in [cval.StratifiedKFold(labels[:i], 3, shuffle=shuffle)
                    for i in range(11, 17)]:
            sizes = []
            for _, test in skf:
                sizes.append(len(test))

            assert_true((np.max(sizes) - np.min(sizes)) <= 1)
            assert_equal(np.sum(sizes), skf.n) 
Example 50
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes
def test_shuffle_stratifiedkfold():
    # Check that shuffling is happening when requested, and for proper
    # sample coverage
    labels = [0] * 20 + [1] * 20
    kf0 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=0))
    kf1 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=1))
    for (_, test0), (_, test1) in zip(kf0, kf1):
        assert_true(set(test0) != set(test1))
    check_cv_coverage(kf0, expected_n_iter=5, n_samples=40) 
Example 51
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes
def test_cross_val_generator_with_mask():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = assert_warns(DeprecationWarning, cval.LeaveOneOut,
                       4, indices=False)
    lpo = assert_warns(DeprecationWarning, cval.LeavePOut,
                       4, 2, indices=False)
    kf = assert_warns(DeprecationWarning, cval.KFold,
                      4, 2, indices=False)
    skf = assert_warns(DeprecationWarning, cval.StratifiedKFold,
                       y, 2, indices=False)
    lolo = assert_warns(DeprecationWarning, cval.LeaveOneLabelOut,
                        labels, indices=False)
    lopo = assert_warns(DeprecationWarning, cval.LeavePLabelOut,
                        labels, 2, indices=False)
    ss = assert_warns(DeprecationWarning, cval.ShuffleSplit,
                      4, indices=False)
    ps = assert_warns(DeprecationWarning, cval.PredefinedSplit, [1, 1, 2, 2],
                      indices=False)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_equal(np.asarray(train).dtype.kind, 'b')
            assert_equal(np.asarray(train).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 52
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes
def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = assert_warns(DeprecationWarning, cval.LeaveOneOut,
                       4, indices=True)
    lpo = assert_warns(DeprecationWarning, cval.LeavePOut,
                       4, 2, indices=True)
    kf = assert_warns(DeprecationWarning, cval.KFold,
                      4, 2, indices=True)
    skf = assert_warns(DeprecationWarning, cval.StratifiedKFold,
                       y, 2, indices=True)
    lolo = assert_warns(DeprecationWarning, cval.LeaveOneLabelOut,
                        labels, indices=True)
    lopo = assert_warns(DeprecationWarning, cval.LeavePLabelOut,
                        labels, 2, indices=True)
    ps = assert_warns(DeprecationWarning, cval.PredefinedSplit,
                      [1, 1, 2, 2], indices=True)
    # Bootstrap as a cross-validation is deprecated
    b = assert_warns(DeprecationWarning, cval.Bootstrap, 2)
    ss = assert_warns(DeprecationWarning, cval.ShuffleSplit,
                      2, indices=True)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 53
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes
def test_cross_val_generator_mask_indices_same():
    # Test that the cross validation generators return the same results when
    # indices=True and when indices=False
    y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])
    labels = np.array([1, 1, 2, 3, 3, 3, 4])

    loo_mask = cval.LeaveOneOut(5, indices=False)
    loo_ind = cval.LeaveOneOut(5, indices=True)
    lpo_mask = cval.LeavePOut(10, 2, indices=False)
    lpo_ind = cval.LeavePOut(10, 2, indices=True)
    kf_mask = cval.KFold(10, 5, indices=False, shuffle=True, random_state=1)
    kf_ind = cval.KFold(10, 5, indices=True, shuffle=True, random_state=1)
    skf_mask = cval.StratifiedKFold(y, 3, indices=False)
    skf_ind = cval.StratifiedKFold(y, 3, indices=True)
    lolo_mask = cval.LeaveOneLabelOut(labels, indices=False)
    lolo_ind = cval.LeaveOneLabelOut(labels, indices=True)
    lopo_mask = cval.LeavePLabelOut(labels, 2, indices=False)
    lopo_ind = cval.LeavePLabelOut(labels, 2, indices=True)
    ps_mask = cval.PredefinedSplit([1, 1, 2, 2], indices=False)
    ps_ind = cval.PredefinedSplit([1, 1, 2, 2], indices=True)

    for cv_mask, cv_ind in [(loo_mask, loo_ind), (lpo_mask, lpo_ind),
                            (kf_mask, kf_ind), (skf_mask, skf_ind),
                            (lolo_mask, lolo_ind), (lopo_mask, lopo_ind),
                            (ps_mask, ps_ind)]:
        for (train_mask, test_mask), (train_ind, test_ind) in \
                zip(cv_mask, cv_ind):
            assert_array_equal(np.where(train_mask)[0], train_ind)
            assert_array_equal(np.where(test_mask)[0], test_ind) 
Example 54
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes
def test_check_cv_return_types():
    X = np.ones((9, 2))
    cv = cval._check_cv(3, X, classifier=False)
    assert_true(isinstance(cv, cval.KFold))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = cval._check_cv(3, X, y_binary, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = cval._check_cv(3, X, y_multiclass, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    X = np.ones((5, 2))
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]

    with warnings.catch_warnings(record=True):
        # deprecated sequence of sequence format
        cv = cval._check_cv(3, X, y_seq_of_seqs, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_indicator_matrix = LabelBinarizer().fit_transform(y_seq_of_seqs)
    cv = cval._check_cv(3, X, y_indicator_matrix, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = cval._check_cv(3, X, y_multioutput, classifier=True)
    assert_true(isinstance(cv, cval.KFold)) 
Example 55
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0 5 votes
def test_kfold_valueerrors():
    # Check that errors are raised if there is not enough samples
    assert_raises(ValueError, cval.KFold, 3, 4)

    # Check that a warning is raised if the least populated class has too few
    # members.
    y = [3, 3, -1, -1, 2]

    cv = assert_warns_message(Warning, "The least populated class",
                              cval.StratifiedKFold, y, 3)

    # Check that despite the warning the folds are still computed, even
    # though not every class is necessarily represented on each side
    # of every split
    check_cv_coverage(cv, expected_n_iter=3, n_samples=len(y))

    # Error when number of folds is <= 1
    assert_raises(ValueError, cval.KFold, 2, 0)
    assert_raises(ValueError, cval.KFold, 2, 1)
    assert_raises(ValueError, cval.StratifiedKFold, y, 0)
    assert_raises(ValueError, cval.StratifiedKFold, y, 1)

    # When n is not integer:
    assert_raises(ValueError, cval.KFold, 2.5, 2)

    # When n_folds is not integer:
    assert_raises(ValueError, cval.KFold, 5, 1.5)
    assert_raises(ValueError, cval.StratifiedKFold, y, 1.5) 
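
Under the current sklearn.model_selection API the corresponding checks look roughly like this (a sketch; the exact warning/error conditions have shifted slightly between versions): a class with fewer members than n_splits typically triggers a warning rather than an error, and n_splits below 2 is rejected when the splitter is constructed.

import warnings

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.zeros((6, 1))
y = np.array([0, 0, 1, 1, 1, 1])        # class 0 has only two members

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    folds = list(StratifiedKFold(n_splits=3).split(X, y))

print(len(folds))                        # the 3 folds are still produced
print(any("least populated class" in str(w.message) for w in caught))

try:
    StratifiedKFold(n_splits=1)          # fewer than 2 splits is an error
except ValueError as exc:
    print(exc)
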
Example 56
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0 5 votes vote down vote up
def test_stratifiedkfold_balance():
    # Check that StratifiedKFold returns folds with balanced sizes
    # (only when stratification is possible)
    # Repeat with shuffling turned off and on
    labels = [0] * 3 + [1] * 14
    for shuffle in [False, True]:
        for skf in [cval.StratifiedKFold(labels[:i], 3, shuffle=shuffle)
                    for i in range(11, 17)]:
            sizes = []
            for _, test in skf:
                sizes.append(len(test))

            assert_true((np.max(sizes) - np.min(sizes)) <= 1)
            assert_equal(np.sum(sizes), skf.n) 
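
The same balance property can be checked against the current splitter interface; a sketch (not from the project), assuming sklearn.model_selection is available:

import numpy as np
from sklearn.model_selection import StratifiedKFold

y = np.array([0] * 3 + [1] * 11)
X = np.zeros((len(y), 1))

sizes = [len(test) for _, test in StratifiedKFold(n_splits=3).split(X, y)]
assert max(sizes) - min(sizes) <= 1      # folds differ in size by at most one
assert sum(sizes) == len(y)              # every sample lands in exactly one test fold
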
Example 57
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0 5 votes vote down vote up
def test_shuffle_stratifiedkfold():
    # Check that shuffling is happening when requested, and for proper
    # sample coverage
    labels = [0] * 20 + [1] * 20
    kf0 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=0))
    kf1 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=1))
    for (_, test0), (_, test1) in zip(kf0, kf1):
        assert_true(set(test0) != set(test1))
    check_cv_coverage(kf0, expected_n_iter=5, n_samples=40) 
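
For reference, the equivalent check with the current splitter interface might look like this (a sketch; the data is synthetic):

import numpy as np
from sklearn.model_selection import StratifiedKFold

y = np.array([0] * 20 + [1] * 20)
X = np.zeros((len(y), 1))

kf0 = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=0).split(X, y))
kf1 = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=1).split(X, y))

# With different seeds the shuffled test folds are expected to differ.
print([set(t0) != set(t1) for (_, t0), (_, t1) in zip(kf0, kf1)])
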
Example 58
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0 5 votes vote down vote up
def test_check_cv_return_types():
    X = np.ones((9, 2))
    cv = cval._check_cv(3, X, classifier=False)
    assert_true(isinstance(cv, cval.KFold))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = cval._check_cv(3, X, y_binary, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = cval._check_cv(3, X, y_multiclass, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    X = np.ones((5, 2))
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]

    with warnings.catch_warnings(record=True):
        # deprecated sequence of sequence format
        cv = cval._check_cv(3, X, y_seq_of_seqs, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_indicator_matrix = LabelBinarizer().fit_transform(y_seq_of_seqs)
    cv = cval._check_cv(3, X, y_indicator_matrix, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = cval._check_cv(3, X, y_multioutput, classifier=True)
    assert_true(isinstance(cv, cval.KFold)) 
Example 59
Project: dr.b   Author: taoddiao   File: classify_nodes.py    Apache License 2.0 5 votes vote down vote up
def classifyData():
    X = np.load("dataX.npy")
    Y = np.load("dataY.npy")

    kf = KFold(Y, n_folds=3)
    y_pred = Y * 0
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train,:], X[test,:], Y[train], Y[test]
        clf = RF(n_estimators=100, n_jobs=3)
        clf.fit(X_train, y_train)
        y_pred[test] = clf.predict(X_test)
    print classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"])
    print("logloss",logloss(Y, y_pred))

    # All Cancer
    print "Predicting all positive"
    y_pred = np.ones(Y.shape)
    print classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"])
    print("logloss",logloss(Y, y_pred))

    # No Cancer
    print "Predicting all negative"
    y_pred = Y*0
    print classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"])
    print("logloss",logloss(Y, y_pred))

    # try XGBoost
    print ("XGBoost")
    kf = KFold(Y, n_folds=3)
    y_pred = Y * 0
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train,:], X[test,:], Y[train], Y[test]
        clf = xgb.XGBClassifier(objective="binary:logistic")
        clf.fit(X_train, y_train)
        y_pred[test] = clf.predict(X_test)
    print classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"])
    print("logloss",logloss(Y, y_pred)) 
Example 60
Project: Lyssandra   Author: ektormak   File: classify.py    BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __call__(self, X, y):
        """
        given a dataset X,y we split it, in order to do cross validation,
        according to the procedure explained below:
        if n_folds is not None, then we do cross validation
        based on stratified folds
        if n_class_samples is not None, then we do cross validation
        using only <n_class_samples> training samples per class
        if n_test_samples is not None, then we do cross validation
        using only <n_test_samples> cross validaition samples per class
        assumes that each datapoint is in a column of X
        """
        n_classes = len(set(y))
        if self.n_folds is not None:
            # generate the folds
            self.folds = StratifiedKFold(y, n_folds=self.n_folds,
                                         shuffle=False, random_state=None)

        elif self.n_class_samples is not None:

            self.folds = []
            for i in range(self.n_tests):

                if type(self.n_class_samples) is not list:
                    self.n_class_samples = (np.ones(n_classes) * self.n_class_samples).astype(int)
                if self.n_test_samples is not None:
                    self.n_test_samples = (np.ones(n_classes) * self.n_test_samples).astype(int)

                data_idx = split_dataset(self.n_class_samples, self.n_test_samples, y)
                train_idx = data_idx[0]
                test_idx = data_idx[1]
                self.folds.append((train_idx, test_idx))

        self.cross_validate(X, y) 
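
The n_class_samples branch above delegates to a split_dataset helper that is not shown in this snippet. Purely as an illustration of that kind of per-class split, a hypothetical helper could look like the following (names and behaviour are assumptions, not the project's code):

import numpy as np

def per_class_split(n_class_samples, n_test_samples, y, seed=0):
    # Hypothetical stand-in for a split_dataset-style helper: draw a fixed
    # number of train and test samples from every class, without replacement.
    rng = np.random.default_rng(seed)
    train_idx, test_idx = [], []
    for c in np.unique(y):
        members = rng.permutation(np.flatnonzero(y == c))
        train_idx.extend(members[:n_class_samples])
        test_idx.extend(members[n_class_samples:n_class_samples + n_test_samples])
    return np.asarray(train_idx), np.asarray(test_idx)

y = np.array([0] * 10 + [1] * 10)
train_idx, test_idx = per_class_split(4, 3, y)
print(len(train_idx), len(test_idx))     # 8 6
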
Example 61
Project: twitter-adr-convnet   Author: chop-dbhi   File: sklearn_extensions.py    GNU General Public License v2.0 5 votes vote down vote up
def grid_analysis(pipeline, parameters, train_input, train_labels, cv = None, scoring='accuracy'):
    if not cv:
        cv = StratifiedKFold(train_labels, n_folds=5, shuffle=False)
    grid_search = GridSearchCV(pipeline, parameters, scoring=scoring, cv=cv, refit=True, verbose = 1, n_jobs=2)
    print("Performing grid search...")
    tic = time.clock()
    grid_search.fit(train_input,train_labels)
    toc = time.clock()
    print "Grid search complete in {0} sec".format(toc-tic)
    return grid_search 
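
Two small notes on this helper: time.clock() was removed in Python 3.8 (time.perf_counter() is the usual replacement), and under sklearn.model_selection the splitter is built from n_splits rather than from the labels. A sketch of the same idea with the current API, using the iris data purely as a placeholder:

import time

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

train_input, train_labels = load_iris(return_X_y=True)

cv = StratifiedKFold(n_splits=5, shuffle=False)
grid_search = GridSearchCV(LogisticRegression(max_iter=1000),
                           {"C": [0.1, 1.0, 10.0]},
                           scoring="accuracy", cv=cv, refit=True, n_jobs=2)
print("Performing grid search...")
tic = time.perf_counter()
grid_search.fit(train_input, train_labels)
print("Grid search complete in {0:.2f} sec".format(time.perf_counter() - tic))
print(grid_search.best_params_)
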
Example 62
Project: egk   Author: steveli   File: random_mvegk.py    MIT License 5 votes vote down vote up
def grid_search_cv(means, covs, label, gamma_grid, c_grid, n_sample=50,
                   normalize=True):
    best_score = -np.inf

    for gamma in gamma_grid:
        rp = MultiEGKSampler(gamma, n_sample=n_sample, normalize=normalize)
        scores = defaultdict(float)
        for idx_train, idx_test in StratifiedKFold(label):
            X_train = rp.fit_transform(means[idx_train], covs[idx_train])
            X_test = rp.transform(means[idx_test], covs[idx_test])
            l_train = label[idx_train]
            l_test = label[idx_test]
            for C in c_grid:
                clf = LinearSVC(C=C)
                clf.fit(X_train, l_train)
                l_predict = clf.predict(X_test)
                accuracy = np.mean(l_predict == l_test)
                scores[C] += accuracy

        best_C_score, best_C = max((score, C)
                                   for (C, score) in scores.iteritems())
        #print gamma, scores
        if best_C_score > best_score:
            best_score = best_C_score
            best_parms = {'gamma': gamma, 'C': best_C}

    return best_parms 
Example 63
Project: EasyOverHard   Author: WeiFoo   File: experiment.py    MIT License 5 votes vote down vote up
def run_tuning_SVM(word2vec_src, repeats=1,
                   fold=10,
                   tuning=True):
  """
  :param word2vec_src:str, path of word2vec model
  :param repeats:int, number of repeats
  :param fold: int,number of folds
  :param tuning: boolean, tuning or not.
  :return: None
  """
  print("# word2vec:", word2vec_src)
  word2vec_model = gensim.models.Word2Vec.load(word2vec_src)
  data = PaperData(word2vec=word2vec_model)
  train_pd = load_vec(data, data.train_data, file_name=False)
  test_pd = load_vec(data, data.test_data, file_name=False)
  learner = [SK_SVM][0]
  goal = {0: "PD", 1: "PF", 2: "PREC", 3: "ACC", 4: "F", 5: "G", 6: "Macro_F",
          7: "Micro_F"}[6]
  F = {}
  clfs = []
  for i in xrange(repeats):  # repeat n times here
    kf = StratifiedKFold(train_pd.loc[:, "LinkTypeId"].values, fold,
                         shuffle=True)
    for train_index, tune_index in kf:
      train_data = train_pd.ix[train_index]
      tune_data = train_pd.ix[tune_index]
      train_X = train_data.loc[:, "Output"].values
      train_Y = train_data.loc[:, "LinkTypeId"].values
      tune_X = tune_data.loc[:, "Output"].values
      tune_Y = tune_data.loc[:, "LinkTypeId"].values
      test_X = test_pd.loc[:, "Output"].values
      test_Y = test_pd.loc[:, "LinkTypeId"].values
      params, evaluation = tune_learner(learner, train_X, train_Y, tune_X,
                                        tune_Y, goal) if tuning else ({}, 0)
      clf = learner(train_X, train_Y, test_X, test_Y, goal)
      F = clf.learn(F, **params)
      clfs.append(clf)
  print_results(clfs) 
Example 64
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0 5 votes vote down vote up
def test_check_cv():
    X = np.ones(9)
    cv = check_cv(3, classifier=False)
    # Use numpy.testing.assert_equal which recursively compares
    # lists of lists
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = check_cv(3, y_binary, classifier=True)
    np.testing.assert_equal(list(StratifiedKFold(3).split(X, y_binary)),
                            list(cv.split(X, y_binary)))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = check_cv(3, y_multiclass, classifier=True)
    np.testing.assert_equal(list(StratifiedKFold(3).split(X, y_multiclass)),
                            list(cv.split(X, y_multiclass)))

    X = np.ones(5)
    y_multilabel = np.array([[0, 0, 0, 0], [0, 1, 1, 0], [0, 0, 0, 1],
                             [1, 1, 0, 1], [0, 0, 1, 0]])
    cv = check_cv(3, y_multilabel, classifier=True)
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = check_cv(3, y_multioutput, classifier=True)
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    # Check if the old style classes are wrapped to have a split method
    X = np.ones(9)
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv1 = check_cv(3, y_multiclass, classifier=True)

    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    cv2 = check_cv(OldSKF(y_multiclass, n_folds=3))
    np.testing.assert_equal(list(cv1.split(X, y_multiclass)),
                            list(cv2.split()))

    assert_raises(ValueError, check_cv, cv="lolo") 
Example 65
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_split.py    Apache License 2.0 5 votes vote down vote up
def test_cv_iterable_wrapper():
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])

    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    cv = OldSKF(y_multiclass, n_folds=3)
    wrapped_old_skf = _CVIterableWrapper(cv)

    # Check if split works correctly
    np.testing.assert_equal(list(cv), list(wrapped_old_skf.split()))

    # Check if get_n_splits works correctly
    assert_equal(len(cv), wrapped_old_skf.get_n_splits())

    kf_iter = KFold(n_splits=5).split(X, y)
    kf_iter_wrapped = check_cv(kf_iter)
    # Since the wrapped iterable is enlisted and stored,
    # split can be called any number of times to produce
    # consistent results.
    np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                            list(kf_iter_wrapped.split(X, y)))
    # If the splits are randomized, successive calls to split yields different
    # results
    kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
    kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
    # numpy's assert_array_equal properly compares nested lists
    np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)),
                            list(kf_randomized_iter_wrapped.split(X, y)))

    try:
        np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                                list(kf_randomized_iter_wrapped.split(X, y)))
        splits_are_equal = True
    except AssertionError:
        splits_are_equal = False
    assert_false(splits_are_equal, "If the splits are randomized, "
                 "successive calls to split should yield different results") 
Example 66
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0 5 votes vote down vote up
def test_kfold_valueerrors():
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, cval.KFold, 3, 4)

    # Check that a warning is raised if the least populated class has too few
    # members.
    y = [3, 3, -1, -1, 3]

    cv = assert_warns_message(Warning, "The least populated class",
                              cval.StratifiedKFold, y, 3)

    # Check that despite the warning the folds are still computed, even
    # though not every class is necessarily represented on each side
    # of every split
    check_cv_coverage(cv, expected_n_iter=3, n_samples=len(y))

    # Check that an error is raised if every individual class has
    # fewer members than n_folds.
    y = [3, 3, -1, -1, 2]

    assert_raises(ValueError, cval.StratifiedKFold, y, 3)

    # Error when number of folds is <= 1
    assert_raises(ValueError, cval.KFold, 2, 0)
    assert_raises(ValueError, cval.KFold, 2, 1)
    error_string = ("k-fold cross validation requires at least one"
                    " train / test split")
    assert_raise_message(ValueError, error_string,
                         cval.StratifiedKFold, y, 0)
    assert_raise_message(ValueError, error_string,
                         cval.StratifiedKFold, y, 1)

    # When n is not integer:
    assert_raises(ValueError, cval.KFold, 2.5, 2)

    # When n_folds is not integer:
    assert_raises(ValueError, cval.KFold, 5, 1.5)
    assert_raises(ValueError, cval.StratifiedKFold, y, 1.5) 
Example 67
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0 5 votes vote down vote up
def test_shuffle_stratifiedkfold():
    # Check that shuffling is happening when requested, and for proper
    # sample coverage
    labels = [0] * 20 + [1] * 20
    kf0 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=0))
    kf1 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=1))
    for (_, test0), (_, test1) in zip(kf0, kf1):
        assert_true(set(test0) != set(test1))
    check_cv_coverage(kf0, expected_n_iter=5, n_samples=40) 
Example 68
Project: kuaa   Author: rafaelwerneck   File: plugin_stratified_k_fold.py    GNU General Public License v3.0 4 votes vote down vote up
def train_test(images, classes, parameters):
    """
    Divides the dictionary keys of the images into folds, according to the
    parameters:
        - Number of Folds: number of folds that will be created.

    The first n % n_folds folds have size n // n_folds + 1; the remaining
    folds have size n // n_folds.
    """
    
    print "Train and Test: Stratified K-Fold"
    
    print parameters
    
    #Get parameters
    param_folds = parameters['Number of Folds']
    
    list_train_test = []
    
    #Split the dataset into train and test
    print "\tSpliting the dataset into train and test."
    
    #Train and Test Split
    #--------------------------------------------------------------------------
    len_images = len(images)
    images_keys = images.keys()
    
    k_fold = StratifiedKFold([str(images[key][ZERO][ZERO]) for key in images.iterkeys()], param_folds)
    
    #Transform the index of the KFold function into the keys of the images
    #dictionary
    for train_index, test_index in k_fold:
        train = []
        test = []
        for index in train_index:
            train.append(images_keys[index])
        for index in test_index:
            test.append(images_keys[index])
        list_train_test.append([train, test])
    #--------------------------------------------------------------------------
    
    return list_train_test 
Example 69
Project: WeightedTags-MF   Author: andreuvall   File: MF.py    BSD 2-Clause "Simplified" License 4 votes vote down vote up
def get_data(collection, dataset, num_folds, alpha):
    # collection: data collection folder
    # dataset: dataset folder
    # num_folds: data splits
    # alpha: weight for the binary ratings

    # Load ratings data
    full_R = np.loadtxt('../data/' + collection + '/' + dataset + '/playcounts.txt', delimiter=",")
    full_R = sparse.coo_matrix((full_R[:, 2], (full_R[:, 0], full_R[:, 1])))
    num_users, num_items = full_R.shape

    # Make data splits balancing users in each fold and prepare data
    splits = cv.StratifiedKFold(full_R.row, n_folds=num_folds, random_state=1)
    data = []
    test_indices = open('test_' + dataset + '_MF.txt', 'wa')
    for train, test in splits:
        # Train data
        R = sparse.csr_matrix((full_R.data[train], (full_R.row[train],
                                                    full_R.col[train])),
                              shape=(num_users, num_items))

        # P = R > 0 is really not needed throughout the code

        # Weight data
        weights = 1. + alpha * np.log(1. + R.data)
        C = sparse.csr_matrix((weights, R.nonzero()),
                              shape=(num_users, num_items))

        # Test data
        Rt = sparse.coo_matrix((full_R.data[test], (full_R.row[test],
                                                    full_R.col[test])),
                               shape=(num_users, num_items))

        fold_data = {'C': C, 'Rt': Rt}
        data.append(fold_data)

        # Store test indices for further mpr calculation
        np.savetxt(test_indices, test, fmt='%i')

    test_indices.close()
    return data


# RMSE function 
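
Worth noting: here the folds are stratified on full_R.row, i.e. on the user id of each rating, so every user's interactions are spread roughly evenly over the folds. A small synthetic sketch of that idea (not the project's data):

import numpy as np
from sklearn.model_selection import StratifiedKFold

user_ids = np.repeat(np.arange(4), 6)          # 4 users, 6 interactions each
interactions = np.arange(len(user_ids)).reshape(-1, 1)

# Stratifying on the user id balances each user's interactions across folds.
for train, test in StratifiedKFold(n_splits=3).split(interactions, user_ids):
    print(np.bincount(user_ids[test]))         # roughly 2 interactions per user
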
Example 70
Project: dsw-ont-ijcai   Author: anonymous-ijcai   File: topics_music.py    GNU General Public License v3.0 4 votes vote down vote up
def evaluate_learning_based_classifier_cross_validated(training_size=None):
    np.random.seed(0)
    categories, classes, inner_cross_validation,\
        outer_cross_validation, model_selection_measure,\
        evaluation_measures = default_classifier_evaluation_params()
    def smaller_cross_validation(outputs):
        return cross_validation.StratifiedKFold(outputs, n_folds=2)
    inner_cross_validation = lambda outputs: cross_validation.StratifiedKFold(outputs, n_folds=3)
    param_grid = topics.new_training_params_cv()['param_grid']
    param_grid[0]['C'] = [0.25, 0.5, 1, 3, 7, 15]
    full_selection = precompute_full_selection()
    features = topics.default_features.copy()
    classifier_params = topics.default_classifier_params.copy()
    classifier_params['C'] = 0.25
    tuned_clf = topics.CategorySelectionClassifier(
        full_selection=full_selection,
        features=features,
        classifier_fn=topics.default_classifier,
        max_depth=full_selection._max_depth,
        instance_weight=lambda x: 1,
        **classifier_params)
    print(classes.dtype)
    return topics.train_evaluate_topic_classifier_cv(
        tuned_clf, categories, classes,
        inner_cross_validation,
        outer_cross_validation,
#         smaller_cross_validation,                                              
        model_selection_measure,
        evaluation_measures,
        param_grid=param_grid,
        learning=True,
        training_size=training_size)

# <codecell>

# metrics = evaluate_learning_based_classifier_cross_validated()

# <codecell>

# metric_names = ['accuracy', 'f1_pos', 'f1_neg', 'weighted_f1']
# for metric_name, metric in zip(metric_names, metrics):
#     print("{:<11s} : {:.3f} +- {:.3f}".format(metric_name, metric.mean(), metric.std()))

# <codecell> 
Example 71
Project: kddcup2015   Author: its-fun   File: modeling.py    GNU General Public License v2.0 4 votes vote down vote up
def sgd():
    """
    Submission: sgd_0620_03.csv
    E_val: 0.863628
    E_in: 0.854373
    E_out:
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))

    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    sgd = SGDClassifier(n_iter=50, n_jobs=-1)
    params = {
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge',
                 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive',
                 'squared_epsilon_insensitive']
    }
    grid = GridSearchCV(sgd, param_grid=params, cv=StratifiedKFold(y, 5),
                        scoring='roc_auc', n_jobs=-1)
    grid.fit(X_new, y)

    logger.debug('Best score (E_val): %f', grid.best_score_)

    sgd = grid.best_estimator_

    logger.debug('E_in: %f', auc_score(sgd, X_new, y))
    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('sgd', sgd)]), 'sgd_0620_03') 
Example 72
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 4 votes vote down vote up
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy",
        labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)

    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        return (((y_true == y_pred).sum() - (y_true != y_pred).sum())
                / y_true.shape[0])

    scorer = make_scorer(custom_score)
    score, _, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2) 
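
permutation_test_score still exists in sklearn.model_selection with essentially the same semantics; a condensed sketch of the non-sparse part of the test above:

from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold, permutation_test_score
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)

score, perm_scores, pvalue = permutation_test_score(
    SVC(kernel='linear'), X, y, cv=StratifiedKFold(n_splits=2),
    n_permutations=30, scoring="accuracy", random_state=0)
print(score, pvalue)                  # high score, small p-value on real labels
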
Example 73
Project: linear_neuron   Author: uglyboxer   File: test_logistic.py    MIT License 4 votes vote down vote up
def test_ovr_multinomial_iris():
    # Test that OvR and multinomial are correct using the iris dataset.
    train, target = iris.data, iris.target
    n_samples, n_features = train.shape

    # Use pre-defined fold as folds generated for different y
    cv = StratifiedKFold(target, 3)
    clf = LogisticRegressionCV(cv=cv)
    clf.fit(train, target)

    clf1 = LogisticRegressionCV(cv=cv)
    target_copy = target.copy()
    target_copy[target_copy == 0] = 1
    clf1.fit(train, target_copy)

    assert_array_almost_equal(clf.scores_[2], clf1.scores_[2])
    assert_array_almost_equal(clf.intercept_[2:], clf1.intercept_)
    assert_array_almost_equal(clf.coef_[2][np.newaxis, :], clf1.coef_)

    # Test the shape of various attributes.
    assert_equal(clf.coef_.shape, (3, n_features))
    assert_array_equal(clf.classes_, [0, 1, 2])
    coefs_paths = np.asarray(list(clf.coefs_paths_.values()))
    assert_array_almost_equal(coefs_paths.shape, (3, 3, 10, n_features + 1))
    assert_equal(clf.Cs_.shape, (10, ))
    scores = np.asarray(list(clf.scores_.values()))
    assert_equal(scores.shape, (3, 3, 10))

    # Test that for the iris data multinomial gives a better accuracy than OvR
    for solver in ['lbfgs', 'newton-cg']:
        clf_multi = LogisticRegressionCV(
            solver=solver, multi_class='multinomial', max_iter=15
            )
        clf_multi.fit(train, target)
        multi_score = clf_multi.score(train, target)
        ovr_score = clf.score(train, target)
        assert_greater(multi_score, ovr_score)

        # Test attributes of LogisticRegressionCV
        assert_equal(clf.coef_.shape, clf_multi.coef_.shape)
        assert_array_equal(clf_multi.classes_, [0, 1, 2])
        coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values()))
        assert_array_almost_equal(coefs_paths.shape, (3, 3, 10,
                                                      n_features + 1))
        assert_equal(clf_multi.Cs_.shape, (10, ))
        scores = np.asarray(list(clf_multi.scores_.values()))
        assert_equal(scores.shape, (3, 3, 10)) 
Example 74
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0 4 votes vote down vote up
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy",
        labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)

    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        return (((y_true == y_pred).sum() - (y_true != y_pred).sum())
                / y_true.shape[0])

    scorer = make_scorer(custom_score)
    score, _, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2) 
Example 75
Project: 2020plus   Author: KarchinLab   File: generic_classifier.py    Apache License 2.0 4 votes vote down vote up
def train_cv(self, k=10):
        """Train classifier on entire data set provided, but done in cross-validation."""
        # generate indices for kfold cross validation
        self.num_pred = 0  # number of predictions
        self.test_fold_df = pd.DataFrame({l+1: 0 for l in range(self.total_iter)}, index=self.x.index)

        for i in range(self.total_iter):
            # randomize for another round
            self.x, self.y = futils.randomize(self.x, self.prng)
            futils.check_num_classes(self.y) # warn user if not 3 classes

            # set up stratified kfold iterator
            k_fold = cross_validation.StratifiedKFold(self.y,
                                                      n_folds=k)

            # obtain predictions from single round of kfold validation
            for nfold, (train_ix, test_ix) in enumerate(k_fold):
                # retrieve indices from pandas dataframe using row number
                tmp_train_ix = self.x.iloc[train_ix].index

                # save which genes are in the test fold
                tmp_test_ix = self.x.iloc[test_ix].index
                self.test_fold_df.loc[tmp_test_ix, i+1] = nfold + 1

                if self.is_weighted_sample:
                    # figure out sample weights
                    num_train = len(train_ix)
                    sample_weight = np.zeros(num_train)
                    onco_ix = np.nonzero(self.y.ix[tmp_train_ix]==self.onco_num)[0]
                    tsg_ix = np.nonzero(self.y.ix[tmp_train_ix]==self.tsg_num)[0]
                    other_ix = np.nonzero(self.y.ix[tmp_train_ix]==self.other_num)[0]
                    sample_weight[onco_ix] = 1. / len(onco_ix)
                    sample_weight[tsg_ix] = 1. / len(tsg_ix)
                    sample_weight[other_ix] = 1. / len(other_ix)

                    # do training with sample weighting
                    self.clf.fit(self.x.ix[tmp_train_ix].copy(),
                                 self.y.ix[tmp_train_ix].copy(),
                                 sample_weight=sample_weight)
                else:
                    # do training without weighting
                    self.clf.fit(self.x.ix[tmp_train_ix].copy(),
                                 self.y.ix[tmp_train_ix].copy())
                self.clf.append_fold_result()  # add the training result from each fold
            self.clf.append_cv_result()  # add the training result for a single CV to the R variable

            self.num_pred += 1
        self.clf.set_cv_fold(self.test_fold_df) 
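
One detail that is easy to miss: the splitter yields positional indices, which the code maps back to DataFrame index labels via .iloc[...].index before indexing with .ix / .loc. A minimal sketch of that mapping (synthetic frame, current API):

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

x = pd.DataFrame(np.random.RandomState(0).rand(6, 2),
                 index=["g1", "g2", "g3", "g4", "g5", "g6"])
y = pd.Series([0, 1, 0, 1, 0, 1], index=x.index)

for train_ix, test_ix in StratifiedKFold(n_splits=2).split(x, y):
    # positional indices -> index labels, usable with .loc
    print(list(x.iloc[train_ix].index), list(x.iloc[test_ix].index))
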
Example 76
Project: 2020plus   Author: KarchinLab   File: generic_classifier.py    Apache License 2.0 4 votes vote down vote up
def kfold_prediction(self, k=10):
        # generate indices for kfold cross validation
        self.num_pred = 0  # number of predictions

        prediction = pd.Series(index=self.y.index)  # predicted class
        onco_prob = pd.Series(index=self.y.index).fillna(0)
        tsg_prob = pd.Series(index=self.y.index).fillna(0)

        for i in range(self.total_iter):
            # randomize for another round
            self.x, self.y = futils.randomize(self.x, self.prng)
            futils.check_num_classes(self.y) # warn user if not 3 classes

            # set up stratified kfold iterator
            k_fold = cross_validation.StratifiedKFold(self.y,
                                                      n_folds=k)

            # obtain predictions from single round of kfold validation
            for train_ix, test_ix in k_fold:
                # retrieve indices from pandas dataframe using row number
                tmp_train_ix = self.x.iloc[train_ix].index
                tmp_test_ix = self.x.iloc[test_ix].index

                if self.is_weighted_sample:
                    # figure out sample weights
                    num_train = len(train_ix)
                    sample_weight = np.zeros(num_train)
                    onco_ix = np.nonzero(self.y.ix[tmp_train_ix]==self.onco_num)[0]
                    tsg_ix = np.nonzero(self.y.ix[tmp_train_ix]==self.tsg_num)[0]
                    other_ix = np.nonzero(self.y.ix[tmp_train_ix]==self.other_num)[0]
                    sample_weight[onco_ix] = 1. / len(onco_ix)
                    sample_weight[tsg_ix] = 1. / len(tsg_ix)
                    sample_weight[other_ix] = 1. / len(other_ix)

                    # do training with sample weighting
                    self.clf.fit(self.x.ix[tmp_train_ix].copy(),
                                 self.y.ix[tmp_train_ix].copy(),
                                 sample_weight=sample_weight)
                else:
                    # do training without weighting
                    self.clf.fit(self.x.ix[tmp_train_ix].copy(),
                                 self.y.ix[tmp_train_ix].copy())

                # predict test data in kfold validation
                tmp_prob = self.clf.predict_proba(self.x.ix[tmp_test_ix])
                onco_prob.ix[tmp_test_ix] += tmp_prob[:, self.onco_num]
                tsg_prob.ix[tmp_test_ix] += tmp_prob[:, self.tsg_num]

            self.num_pred += 1

        # convert number of trees to fraction of trees
        onco_prob /= self.num_pred
        tsg_prob /= self.num_pred
        other_prob = 1 - (onco_prob + tsg_prob)

        # return prediction.astype(int), prob
        return onco_prob, tsg_prob, other_prob 
Example 77
Project: allen-ai-science-qa   Author: arranger1044   File: ensemble.py    GNU General Public License v3.0 4 votes vote down vote up
def meta_classifier_cv_score(train_preds,
                             train_labels,
                             seeds=[1337, 6666, 7777, 5555],
                             n_folds=10,
                             meta_classifier=OneVsRestClassifier,
                             model_dict={
                                 "base_model": linear_model.LogisticRegression,
                                 "base_model_params": {
                                     "fit_intercept": True,
                                     "class_weight": "balanced",
                                     "penalty": "l2",
                                     "C": 10.0,
                                     "max_iter": 200}
                             }):

    seed_valid_acc_list = []

    for r, seed in enumerate(seeds):

        kf = StratifiedKFold(train_labels, n_folds=n_folds, shuffle=True, random_state=seed)

        # rand_gen = random.Random(seed)
        numpy_rand_gen = numpy.random.RandomState(seed)

        cv_valid_accs = []
        cv_train_accs = []

        for k, (train_ids, test_ids) in enumerate(kf):

            # print('Fold', k)

            train_x = train_preds[train_ids]
            train_y = train_labels[train_ids]
            test_x = train_preds[test_ids]
            test_y = train_labels[test_ids]

            model = meta_classifier(model_dict['base_model'],
                                    feature_selector=None,  # base_feature_sel_dict,
                                    **model_dict['base_model_params'])

            #
            # fitting
            # print('Fitting')
            model.fit(train_x, train_y)

            #
            # predicting
            # print('Predicting on test')
            test_pred_probs = model.predict(test_x)
            hard_test_preds = hard_preds(test_pred_probs)

            test_acc = compute_accuracy(test_y, hard_test_preds)
            cv_valid_accs.append(test_acc)

        avg_valid_acc = sum(cv_valid_accs) / n_folds
        # print('\tAVG on TEST', avg_valid_acc, end='   \r')
        seed_valid_acc_list.append(avg_valid_acc)

    avg_seed_valid_acc = sum(seed_valid_acc_list) / len(seeds)
    return avg_seed_valid_acc 
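
Repeating a stratified split under several seeds and averaging, as done here, is what sklearn.model_selection.RepeatedStratifiedKFold packages up directly; a compact sketch with placeholder data:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

X, y = load_iris(return_X_y=True)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=4, random_state=1337)
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=cv)
print(scores.mean(), scores.std())     # average over 40 stratified folds
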
Example 78
Project: allen-ai-science-qa   Author: arranger1044   File: ensemble.py    GNU General Public License v3.0 4 votes vote down vote up
def feature_importances_meta_clf_cv(train_preds,
                                    train_labels,
                                    seeds=[1337, 6666, 7777, 5555],
                                    n_folds=5,
                                    meta_classifier=OneVsRestClassifier,
                                    model_dict={
                                        "base_model": linear_model.LogisticRegression,
                                        "base_model_params": {
                                            "fit_intercept": True,
                                            "class_weight": "balanced",
                                            "penalty": "l2",
                                            "C": 10.0,
                                            "max_iter": 200}
                                    }):

    n_predictors = train_preds.shape[2]

    feature_importances = numpy.zeros(n_predictors)

    for r, seed in enumerate(seeds):

        kf = StratifiedKFold(train_labels, n_folds=n_folds, shuffle=True, random_state=seed)

        # rand_gen = random.Random(seed)
        numpy_rand_gen = numpy.random.RandomState(seed)

        for k, (train_ids, test_ids) in enumerate(kf):

            # print('Fold', k)

            train_x = train_preds[train_ids]
            train_y = train_labels[train_ids]

            model_feature_importances = feature_importances_meta_clf(train_x,
                                                                     train_y,
                                                                     meta_classifier,
                                                                     model_dict)

            #
            # always getting the max importance, being conservative
            for p in range(n_predictors):
                if abs(feature_importances[p]) < abs(model_feature_importances[p]):
                    feature_importances[p] = abs(model_feature_importances[p])

    #
    # min, max scaling
    scaled_feature_importances = ((feature_importances - numpy.min(feature_importances)) /
                                  (numpy.max(feature_importances) - numpy.min(feature_importances)))

    return scaled_feature_importances 
Example 79
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0 4 votes vote down vote up
def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
    # The digits samples are dependent: they are apparently grouped by authors
    # although we don't have any information on the groups segment locations
    # for this data. We can highlight this fact by computing k-fold cross-
    # validation with and without shuffling: we observe that the shuffling case
    # wrongly makes the IID assumption and is therefore too optimistic: it
    # estimates a much higher accuracy (around 0.96) than the non
    # shuffling variant (around 0.86).

    digits = load_digits()
    X, y = digits.data[:800], digits.target[:800]
    model = SVC(C=10, gamma=0.005)
    n = len(y)

    cv = cval.KFold(n, 5, shuffle=False)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85)

    # Shuffling the data artificially breaks the dependency and hides the
    # overfitting of the model with regards to the writing style of the authors
    # by yielding a seriously overestimated score:

    cv = cval.KFold(n, 5, shuffle=True, random_state=0)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    cv = cval.KFold(n, 5, shuffle=True, random_state=1)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    # Similarly, StratifiedKFold should try to shuffle the data as little
    # as possible (while respecting the balanced class constraints)
    # and thus be able to detect the dependency by not overestimating
    # the CV score either. As the digits dataset is approximately balanced
    # the estimated mean score is close to the score measured with
    # non-shuffled KFold

    cv = cval.StratifiedKFold(y, 5)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85) 
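
When the grouping behind the dependency is actually known, the usual tool in the current API is GroupKFold, which keeps all samples of a group on one side of each split; a small illustrative sketch:

import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(12).reshape(-1, 1)
y = np.tile([0, 1], 6)
groups = np.repeat([0, 1, 2, 3], 3)    # e.g. one group per author

for train, test in GroupKFold(n_splits=4).split(X, y, groups):
    # no group ever appears in both the train and the test side
    assert set(groups[train]).isdisjoint(set(groups[test]))
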
Example 80
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0 4 votes vote down vote up
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy",
        labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)

    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        return (((y_true == y_pred).sum() - (y_true != y_pred).sum())
                / y_true.shape[0])

    scorer = make_scorer(custom_score)
    score, _, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, n_permutations=30, cv=cv, scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)