Python sklearn.cross_validation.cross_val_score() Examples

The following are code examples showing how to use sklearn.cross_validation.cross_val_score(), collected from open source Python projects. Note that the sklearn.cross_validation module was deprecated in scikit-learn 0.18 in favor of sklearn.model_selection and removed entirely in 0.20, so these examples target older releases.
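As a minimal sketch of the basic call (assuming a scikit-learn release older than 0.20, where sklearn.cross_validation still exists):

from sklearn import cross_validation  # replaced by sklearn.model_selection in 0.18+
from sklearn.datasets import load_iris
from sklearn.svm import SVC

iris = load_iris()
clf = SVC(kernel='linear')

# 5-fold cross-validation: returns one score per fold (accuracy for classifiers by default)
scores = cross_validation.cross_val_score(clf, iris.data, iris.target, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))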

Example 1
Project: JAABF   Author: drr3d   File: estimator.py    GNU General Public License v3.0
def crossValScore(self, X, y, X_Test, y_test):
        # Vectorize the test documents and report the resulting matrix shape
        x_test = self.__vectorizers.transform(X_Test)
        print("num of test_samples: %d, num of test_features: %d \n" % x_test.shape)

        # KFold must be built over the number of samples (shape[0]), not the number of features
        kfold = KFold(n=x_test.shape[0], n_folds=4, shuffle=True, random_state=337)

        X_shuf, Y_shuf = shuffle(X_Test, y_test)
        scores = cross_validation.cross_val_score(self.models_, X_shuf, Y_shuf,
                                                  cv=kfold,
                                                  scoring='accuracy',
                                                  n_jobs=-1)  # -1 = use all cores = faster
        print("\ncross_validation:\n", scores, "\n")
        print("Baseline: %.2f%% (%.2f%%)" % (scores.mean() * 100, scores.std() * 100)) 
Example 2
Project: Edge-Analytics   Author: IoT-Expedition   File: train_test.py    Apache License 2.0
def crossValidation(sensor):
    meta = readMeta(sensor)

    # Generate a training set
    labels = generateLabels(meta)
    data = generateData(meta, labels, dataDir(sensor))

    # Prescaling
    scaler = preprocessing.Scaler().fit(data.features)
    scaledFeatures = scaler.transform(data.features)

    # Feature selection
    selector = feature_selection.SelectKBest(feature_selection.f_regression).fit(scaledFeatures, data.labels)
    selectedFeatures = selector.transform(scaledFeatures)

    # Build a classifier; cross_val_score fits it on each fold
    clf = SVC(kernel='linear', C=1)

    scores = cross_val_score(clf, selectedFeatures, data.labels, cv=5)
    return scores 
Example 3
Project: Edge-Analytics   Author: IoT-Expedition   File: train.py    Apache License 2.0
def crossValidation(sensor):
    meta = readMeta(sensor)

    # Generate a training set
    labels = generateLabels(meta)
    data = generateData(meta, labels, dataDir(sensor))

    # Prescaling
    scaler = preprocessing.Scaler().fit(data.features)
    scaledFeatures = scaler.transform(data.features)

    # Feature selection
    selector = feature_selection.SelectKBest(feature_selection.f_regression).fit(scaledFeatures, data.labels)
    selectedFeatures = selector.transform(scaledFeatures)

    # Build a classifier; cross_val_score fits it on each fold
    clf = SVC(kernel='linear', C=1)

    scores = cross_val_score(clf, selectedFeatures, data.labels, cv=5)
    return scores


Example 4
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_cross_val_score_with_score_func_classification():
    iris = load_iris()
    clf = SVC(kernel='linear')

    # Default score (should be the accuracy score)
    scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5)
    assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2)

    # Correct classification score (aka. zero / one score) - should be the
    # same as the default estimator score
    zo_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="accuracy", cv=5)
    assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2)

    # F1 score (classes are balanced, so f1_score should equal the zero/one
    # score)
    f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="f1_weighted", cv=5)
    assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2) 
Example 5
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()

    # Default score of the Ridge regression estimator
    scores = cval.cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    mse_scores = cval.cross_val_score(reg, X, y, cv=5,
                                      scoring="mean_squared_error")
    expected_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(mse_scores, expected_mse, 2)

    # Explained variance
    scoring = make_scorer(explained_variance_score)
    ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=scoring)
    assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) 
Example 6
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cval.cross_val_score(clf, X, y,
                                         scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) 
Example 7
Project: Weiss   Author: WangWenjun559   File: test_score_objects.py    Apache License 2.0
def test_check_scoring_gridsearchcv():
    # test that check_scoring works on GridSearchCV and pipeline.
    # slightly redundant non-regression test.

    grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]})
    scorer = check_scoring(grid, "f1")
    assert_true(isinstance(scorer, _PredictScorer))

    pipe = make_pipeline(LinearSVC())
    scorer = check_scoring(pipe, "f1")
    assert_true(isinstance(scorer, _PredictScorer))

    # check that cross_val_score definitely calls the scorer
    # and doesn't make any assumptions about the estimator apart from having a
    # fit.
    scores = cross_val_score(EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1],
                             scoring=DummyScorer())
    assert_array_equal(scores, 1) 
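The comment above is the key point: with an explicit scoring callable, cross_val_score only requires the estimator to implement fit. EstimatorWithFit and DummyScorer are fixtures from the scikit-learn test suite; as a rough sketch of the same idea on the pre-0.18 API, with FitOnly and constant_scorer invented here as hypothetical stand-ins:

import numpy as np
from sklearn.cross_validation import cross_val_score  # pre-0.20 module, as in these examples

class FitOnly(object):
    # hypothetical minimal estimator: fit() plus get_params() (needed for cloning)
    def fit(self, X, y):
        return self
    def get_params(self, deep=False):
        return {}

def constant_scorer(estimator, X, y):
    # a scorer is any callable with signature (estimator, X_test, y_test) -> float
    return 1.0

scores = cross_val_score(FitOnly(), np.array([[1], [2], [3]]), np.array([1, 0, 1]),
                         scoring=constant_scorer)
print(scores)  # one 1.0 per fold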
Example 8
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_mask():
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target
    cv_indices = cval.KFold(len(y), 5)
    scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices)
    cv_indices = cval.KFold(len(y), 5)
    cv_masks = []
    for train, test in cv_indices:
        mask_train = np.zeros(len(y), dtype=np.bool)
        mask_test = np.zeros(len(y), dtype=np.bool)
        mask_train[train] = 1
        mask_test[test] = 1
        cv_masks.append((mask_train, mask_test))  # append the boolean masks, not the index arrays
    scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks)
    assert_array_equal(scores_indices, scores_masks) 
Example 9
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_fit_params():
    clf = MockClassifier()
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    DUMMY_INT = 42
    DUMMY_STR = '42'
    DUMMY_OBJ = object()

    def assert_fit_params(clf):
        # Function to test that the values are passed correctly to the
        # classifier arguments for non-array type

        assert_equal(clf.dummy_int, DUMMY_INT)
        assert_equal(clf.dummy_str, DUMMY_STR)
        assert_equal(clf.dummy_obj, DUMMY_OBJ)

    fit_params = {'sample_weight': np.ones(n_samples),
                  'class_prior': np.ones(n_classes) / n_classes,
                  'sparse_sample_weight': W_sparse,
                  'sparse_param': P_sparse,
                  'dummy_int': DUMMY_INT,
                  'dummy_str': DUMMY_STR,
                  'dummy_obj': DUMMY_OBJ,
                  'callback': assert_fit_params}
    cval.cross_val_score(clf, X, y, fit_params=fit_params) 
Example 10
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_with_score_func_classification():
    iris = load_iris()
    clf = SVC(kernel='linear')

    # Default score (should be the accuracy score)
    scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5)
    assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2)

    # Correct classification score (aka. zero / one score) - should be the
    # same as the default estimator score
    zo_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="accuracy", cv=5)
    assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2)

    # F1 score (classes are balanced, so f1_score should equal the zero/one
    # score)
    f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="f1_weighted", cv=5)
    assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2) 
Example 11
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_with_score_func_classification():
    iris = load_iris()
    clf = SVC(kernel='linear')

    # Default score (should be the accuracy score)
    scores = cval.cross_val_score(clf, iris.data, iris.target, cv=5)
    assert_array_almost_equal(scores, [0.97, 1., 0.97, 0.97, 1.], 2)

    # Correct classification score (aka. zero / one score) - should be the
    # same as the default estimator score
    zo_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="accuracy", cv=5)
    assert_array_almost_equal(zo_scores, [0.97, 1., 0.97, 0.97, 1.], 2)

    # F1 score (classes are balanced, so f1_score should equal the zero/one
    # score)
    f1_scores = cval.cross_val_score(clf, iris.data, iris.target,
                                     scoring="f1_weighted", cv=5)
    assert_array_almost_equal(f1_scores, [0.97, 1., 0.97, 0.97, 1.], 2) 
Example 12
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()

    # Default score of the Ridge regression estimator
    scores = cval.cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cval.cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    neg_mse_scores = cval.cross_val_score(reg, X, y, cv=5,
                                          scoring="neg_mean_squared_error")
    expected_neg_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(neg_mse_scores, expected_neg_mse, 2)

    # Explained variance
    scoring = make_scorer(explained_variance_score)
    ev_scores = cval.cross_val_score(reg, X, y, cv=5, scoring=scoring)
    assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) 
Example 13
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cval.cross_val_score(clf, X, y,
                                         scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) 
Example 14
Project: Mussy-Robot   Author: arnomoonens   File: training.py    MIT License
def evaluate_cross_validation(clf, X, y, K):
    
    # create a k-fold cross validation iterator
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # by default the score used is the one returned by score method of the estimator (accuracy)
    scores = cross_val_score(clf, X, y, cv=cv)
    print (scores)
    print ("Mean score: {0:.3f} (+/-{1:.3f})".format(numpy.mean(scores), sem(scores))) 
Example 15
Project: twitter-svm   Author: josh-byster   File: calculations.py    MIT License
def crossValidate(X, Y, folds=10, c=1):
    svm = LinearSVC(C=c)
    cv = cross_validation.KFold(len(X), n_folds=folds, shuffle=True, random_state=None)
    # print one accuracy score per fold
    for score in cross_validation.cross_val_score(svm, X, Y, cv=cv):
        print(score) 
Example 16
Project: r2-learner   Author: kudkudak   File: fit_models.py    MIT License
def nk_folds(model, params, data, n=50, n_folds=10, n_jobs=4):

    model.set_params(**params)  # set_params expects keyword arguments, not a dict
    scores = []
    for _ in xrange(n):
        data = shuffle_data(data)
        scores.append(cross_val_score(estimator=model, X=data.data, y=data.target,
                                      scoring='accuracy', cv=n_folds, n_jobs=n_jobs).mean())

    return np.mean(scores), np.std(scores) 
Example 17
Project: fc-aaai18   Author: thanhan   File: utils.py    MIT License
def run_cv(self, n_folds=10):
        if self.display:
            print('Running {0:d}-fold cross-validation'.format(n_folds))

        measures = []
        cms = []

        def scorer(estimator, XX, yy):
            if self.display:
                print('\n>> Fold: {0} <<'.format(self.fold))
            score = estimator.score(XX, yy)
            if self.display:
                print(str(score))
            self.fold += 1
            measures.append(score.measures)
            cms.append(score.cm)
            return score.accuracy

        skf = ClaimKFold(self.X, n_folds)
        scr = cross_val_score(self.predictor, self.X, self.y.values, cv=skf, scoring=scorer)

        if self.display:
            print('\n>> Averages across all folds <<')
        av_measures = (sum([m.unstack() for m in measures]).unstack() / len(measures)).T
        av_cms = (sum([m.unstack() for m in cms]).unstack() / len(cms)).T
        score = Score(av_cms, np.mean(scr), av_measures)
        if self.display:
            print(score)
            print('')
        return score 
Example 18
Project: BuildingMachineLearning   Author: ademyanchuk   File: chapter.py    MIT License
def accuracy(features, labels):
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn import cross_validation
    
    clf = Pipeline([('preproc', StandardScaler()),
                    ('classifier', LogisticRegression())])
    cv = cross_validation.LeaveOneOut(len(features))
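    # LeaveOneOut builds one fold per sample, so the model is fit len(features) times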
    scores = cross_validation.cross_val_score(
        clf, features, labels, cv=cv)
    return scores.mean() 
Example 19
Project: BuildingMachineLearning   Author: ademyanchuk   File: image-classification.py    MIT License
def accuracy(features, labels):
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn import cross_validation
    # We use logistic regression because it is very fast.
    # Feel free to experiment with other classifiers
    clf = Pipeline([('preproc', StandardScaler()),
                    ('classifier', LogisticRegression())])
    cv = cross_validation.LeaveOneOut(len(features))
    scores = cross_validation.cross_val_score(
        clf, features, labels, cv=cv)
    return scores.mean() 
Example 20
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition   Author: PacktPublishing   File: image-classification.py    MIT License
def accuracy(features, labels):
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn import cross_validation
    # We use logistic regression because it is very fast.
    # Feel free to experiment with other classifiers
    clf = Pipeline([('preproc', StandardScaler()),
                    ('classifier', LogisticRegression())])
    cv = cross_validation.LeaveOneOut(len(features))
    scores = cross_validation.cross_val_score(
        clf, features, labels, cv=cv)
    return scores.mean() 
Example 21
Project: TGIF-Release   Author: raingo   File: rank_tags.py    BSD 3-Clause "New" or "Revised" License
def stump(X, y):
    score = cross_val_score(LinearSVC(), X, y, cv=5, n_jobs=5, scoring='average_precision')
    clf = LinearSVC()
    clf.fit(X, y)
    coef = clf.coef_[0,0]
    inter = clf.intercept_[0]
    return np.mean(score), np.sign(coef), inter / np.abs(coef) 
Example 22
Project: user-agent-ml   Author: 904labs   File: train.py    Apache License 2.0
def train(samples, vocabulary):
    logger.debug("Extracting features")
    
    X = []
    for s in samples:
        X.append(extract_features(s[0], vocabulary))
    X = sp.vstack(X, format='csr')

    y = np.array([s[1] for s in samples])

    clf = RandomForestClassifier(n_estimators=30)
    
    if args["-c"]:
        logger.debug("Performing N-fold cross-validation (N=%s)" % args["-f"])
        scores = cross_validation.cross_val_score(clf, X.toarray(), y,
                                                    n_jobs=int(args["-j"]),
                                                    cv=int(args["-f"]),
                                                    scoring=args["-s"])
        print("F1: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))
    
    logger.debug("Training model on all data")
    clf.fit(X.toarray(), y)
    
    logger.debug("Done, returning model and vocabulary")
    
    return (clf, vocabulary) 
Example 23
Project: NepClassifier   Author: kad4   File: svm.py    MIT License
def contruct_cost_function(self, input_matrix, output_matrix):
        def eval(c):

            logger.info("Training using c={}".format(c))

            # Construct classifier
            classifier = svm.SVC(c, kernel="linear")

            scores = cross_validation.cross_val_score(
                classifier,
                input_matrix,
                output_matrix,
                cv=self.cross_validation_folds,
                scoring=calculate_scores
            )

            score = scores.mean()
            logger.info("Mean score={}".format(score))

            # Return the loss (1 - mean cross-validated score)
            return 1 - score

        return eval 
Example 24
Project: automl-phase-2   Author: jamesrobertlloyd   File: sandpit.py    MIT License
def _f(x):
        # iris = load_iris()
        X, y = make_hastie_10_2(random_state=0)
        x = np.ravel(x)
        f = np.zeros(x.shape)
        for i in range(f.size):
            clf = RandomForestClassifier(n_estimators=1, min_samples_leaf=int(np.round(x[i])), random_state=0)
            # scores = cross_val_score(clf, iris.data, iris.target)
            scores = cross_val_score(clf, X, y, cv=5)
            f[i] = -scores.mean()
        return f.ravel() 
Example 25
Project: linear_neuron   Author: uglyboxer   File: test_score_objects.py    MIT License
def test_check_scoring_gridsearchcv():
    # test that check_scoring works on GridSearchCV and pipeline.
    # slightly redundant non-regression test.

    grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]})
    scorer = check_scoring(grid, "f1")
    assert_true(isinstance(scorer, _PredictScorer))

    pipe = make_pipeline(LinearSVC())
    scorer = check_scoring(pipe, "f1")
    assert_true(isinstance(scorer, _PredictScorer))

    # check that cross_val_score definitely calls the scorer
    # and doesn't make any assumptions about the estimator apart from having a
    # fit.
    scores = cross_val_score(EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1],
                             scoring=DummyScorer())
    assert_array_equal(scores, 1) 
Example 26
Project: linear_neuron   Author: uglyboxer   File: test_score_objects.py    MIT License
def test_raises_on_score_list():
    # Test that when a list of scores is returned, we raise proper errors.
    X, y = make_blobs(random_state=0)
    f1_scorer_no_average = make_scorer(f1_score, average=None)
    clf = DecisionTreeClassifier()
    assert_raises(ValueError, cross_val_score, clf, X, y,
                  scoring=f1_scorer_no_average)
    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
                               param_grid={'max_depth': [1, 2]})
    assert_raises(ValueError, grid_search.fit, X, y) 
Example 27
Project: linear_neuron   Author: uglyboxer   File: test_text.py    MIT License
def test_vectorizer_pipeline_cross_validation():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # label junk food as -1, the others as +1
    target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)

    pipeline = Pipeline([('vect', TfidfVectorizer()),
                         ('svc', LinearSVC())])

    cv_scores = cross_val_score(pipeline, data, target, cv=3)
    assert_array_equal(cv_scores, [1., 1., 1.]) 
Example 28
Project: linear_neuron   Author: uglyboxer   File: test_naive_bayes.py    MIT License
def test_check_accuracy_on_digits():
    # Non regression test to make sure that any further refactoring / optim
    # of the NB models do not harm the performance on a slightly non-linearly
    # separable dataset
    digits = load_digits()
    X, y = digits.data, digits.target
    binary_3v8 = np.logical_or(digits.target == 3, digits.target == 8)
    X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8]

    # Multinomial NB
    scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
    assert_greater(scores.mean(), 0.86)

    scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.94)

    # Bernoulli NB
    scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
    assert_greater(scores.mean(), 0.83)

    scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.92)

    # Gaussian NB
    scores = cross_val_score(GaussianNB(), X, y, cv=10)
    assert_greater(scores.mean(), 0.77)

    scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.86) 
Example 29
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_cross_val_score_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((Series, DataFrame))
    except ImportError:
        pass
    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cval.cross_val_score(clf, X_df, y_ser) 
Example 30
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_cross_val_score_mask():
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target
    with warnings.catch_warnings(record=True):
        cv_indices = cval.KFold(len(y), 5, indices=True)
    scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices)
    with warnings.catch_warnings(record=True):
        cv_masks = cval.KFold(len(y), 5, indices=False)
    scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks)
    assert_array_equal(scores_indices, scores_masks) 
Example 31
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_cross_val_score_precomputed():
    # test for svm with precomputed kernel
    svm = SVC(kernel="precomputed")
    iris = load_iris()
    X, y = iris.data, iris.target
    linear_kernel = np.dot(X, X.T)
    score_precomputed = cval.cross_val_score(svm, linear_kernel, y)
    svm = SVC(kernel="linear")
    score_linear = cval.cross_val_score(svm, X, y)
    assert_array_equal(score_precomputed, score_linear)

    # Error raised for non-square X
    svm = SVC(kernel="precomputed")
    assert_raises(ValueError, cval.cross_val_score, svm, X, y)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cval.cross_val_score, svm,
                  linear_kernel.tolist(), y) 
Example 32
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_cross_val_score_fit_params():
    clf = MockClassifier()
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    DUMMY_INT = 42
    DUMMY_STR = '42'
    DUMMY_OBJ = object()

    def assert_fit_params(clf):
        # Function to test that the values are passed correctly to the
        # classifier arguments for non-array type

        assert_equal(clf.dummy_int, DUMMY_INT)
        assert_equal(clf.dummy_str, DUMMY_STR)
        assert_equal(clf.dummy_obj, DUMMY_OBJ)

    fit_params = {'sample_weight': np.ones(n_samples),
                  'class_prior': np.ones(n_classes) / n_classes,
                  'sparse_sample_weight': W_sparse,
                  'sparse_param': P_sparse,
                  'dummy_int': DUMMY_INT,
                  'dummy_str': DUMMY_STR,
                  'dummy_obj': DUMMY_OBJ,
                  'callback': assert_fit_params}
    cval.cross_val_score(clf, X, y, fit_params=fit_params) 
Example 33
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_cross_val_score_score_func():
    clf = MockClassifier()
    _score_func_args = []

    def score_func(y_test, y_predict):
        _score_func_args.append((y_test, y_predict))
        return 1.0

    with warnings.catch_warnings(record=True):
        scoring = make_scorer(score_func)
        score = cval.cross_val_score(clf, X, y, scoring=scoring)
    assert_array_equal(score, [1.0, 1.0, 1.0])
    assert len(_score_func_args) == 3 
Example 34
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def train_test_split_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [MockDataFrame]
    try:
        from pandas import DataFrame
        types.append(DataFrame)
    except ImportError:
        pass
    for InputFeatureType in types:
        # X dataframe
        X_df = InputFeatureType(X)
        X_train, X_test = cval.train_test_split(X_df)
        assert_true(isinstance(X_train, InputFeatureType))
        assert_true(isinstance(X_test, InputFeatureType)) 
Example 35
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_cross_val_score_allow_nans():
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] // 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cval.cross_val_score(p, X, y, cv=5) 
Example 36
Project: Weiss   Author: WangWenjun559   File: test_voting_classifier.py    Apache License 2.0
def test_majority_label_iris():
    """Check classification by majority label on dataset iris."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf = VotingClassifier(estimators=[
                ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                voting='hard')
    scores = cross_validation.cross_val_score(eclf,
                                              X,
                                              y,
                                              cv=5,
                                              scoring='accuracy')
    assert_almost_equal(scores.mean(), 0.95, decimal=2) 
Example 37
Project: Weiss   Author: WangWenjun559   File: test_voting_classifier.py    Apache License 2.0
def test_weights_iris():
    """Check classification by average probabilities on dataset iris."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf = VotingClassifier(estimators=[
                            ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                            voting='soft',
                            weights=[1, 2, 10])
    scores = cross_validation.cross_val_score(eclf,
                                              X,
                                              y,
                                              cv=5,
                                              scoring='accuracy')
    assert_almost_equal(scores.mean(), 0.93, decimal=2) 
Example 38
Project: Weiss   Author: WangWenjun559   File: test_score_objects.py    Apache License 2.0
def test_raises_on_score_list():
    # Test that when a list of scores is returned, we raise proper errors.
    X, y = make_blobs(random_state=0)
    f1_scorer_no_average = make_scorer(f1_score, average=None)
    clf = DecisionTreeClassifier()
    assert_raises(ValueError, cross_val_score, clf, X, y,
                  scoring=f1_scorer_no_average)
    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
                               param_grid={'max_depth': [1, 2]})
    assert_raises(ValueError, grid_search.fit, X, y) 
Example 39
Project: Weiss   Author: WangWenjun559   File: test_text.py    Apache License 2.0
def test_vectorizer_pipeline_cross_validation():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # label junk food as -1, the others as +1
    target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)

    pipeline = Pipeline([('vect', TfidfVectorizer()),
                         ('svc', LinearSVC())])

    cv_scores = cross_val_score(pipeline, data, target, cv=3)
    assert_array_equal(cv_scores, [1., 1., 1.]) 
Example 40
Project: Weiss   Author: WangWenjun559   File: test_naive_bayes.py    Apache License 2.0
def test_check_accuracy_on_digits():
    # Non regression test to make sure that any further refactoring / optim
    # of the NB models do not harm the performance on a slightly non-linearly
    # separable dataset
    digits = load_digits()
    X, y = digits.data, digits.target
    binary_3v8 = np.logical_or(digits.target == 3, digits.target == 8)
    X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8]

    # Multinomial NB
    scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
    assert_greater(scores.mean(), 0.86)

    scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.94)

    # Bernoulli NB
    scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
    assert_greater(scores.mean(), 0.83)

    scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.92)

    # Gaussian NB
    scores = cross_val_score(GaussianNB(), X, y, cv=10)
    assert_greater(scores.mean(), 0.77)

    scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.86) 
Example 41
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score():
    clf = MockClassifier()
    for a in range(-10, 10):
        clf.a = a
        # Smoke test
        scores = cval.cross_val_score(clf, X, y)
        assert_array_equal(scores, clf.score(X, y))

        # test with multioutput y
        scores = cval.cross_val_score(clf, X_sparse, X)
        assert_array_equal(scores, clf.score(X_sparse, X))

        scores = cval.cross_val_score(clf, X_sparse, y)
        assert_array_equal(scores, clf.score(X_sparse, y))

        # test with multioutput y
        scores = cval.cross_val_score(clf, X_sparse, X)
        assert_array_equal(scores, clf.score(X_sparse, X))

    # test with X and y as list
    list_check = lambda x: isinstance(x, list)
    clf = CheckingClassifier(check_X=list_check)
    scores = cval.cross_val_score(clf, X.tolist(), y.tolist())

    clf = CheckingClassifier(check_y=list_check)
    scores = cval.cross_val_score(clf, X, y.tolist())

    assert_raises(ValueError, cval.cross_val_score, clf, X, y,
                  scoring="sklearn")

    # test with 3d X
    X_3d = X[:, :, np.newaxis]
    clf = MockClassifier(allow_nd=True)
    scores = cval.cross_val_score(clf, X_3d, y)

    clf = MockClassifier(allow_nd=False)
    assert_raises(ValueError, cval.cross_val_score, clf, X_3d, y) 
Example 42
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((Series, DataFrame))
    except ImportError:
        pass
    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cval.cross_val_score(clf, X_df, y_ser) 
Example 43
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_score_func():
    clf = MockClassifier()
    _score_func_args = []

    def score_func(y_test, y_predict):
        _score_func_args.append((y_test, y_predict))
        return 1.0

    with warnings.catch_warnings(record=True):
        scoring = make_scorer(score_func)
        score = cval.cross_val_score(clf, X, y, scoring=scoring)
    assert_array_equal(score, [1.0, 1.0, 1.0])
    assert len(_score_func_args) == 3 
Example 44
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_errors():
    class BrokenEstimator:
        pass

    assert_raises(TypeError, cval.cross_val_score, BrokenEstimator(), X) 
Example 45
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def train_test_split_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [MockDataFrame]
    try:
        from pandas import DataFrame
        types.append(DataFrame)
    except ImportError:
        pass
    for InputFeatureType in types:
        # X dataframe
        X_df = InputFeatureType(X)
        X_train, X_test = cval.train_test_split(X_df)
        assert_true(isinstance(X_train, InputFeatureType))
        assert_true(isinstance(X_test, InputFeatureType)) 
Example 46
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_allow_nans():
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] // 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cval.cross_val_score(p, X, y, cv=5) 
Example 47
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average='micro')
    scoring_macro = make_scorer(precision_score, average='macro')
    scoring_samples = make_scorer(precision_score, average='samples')
    score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cval.cross_val_score(clf, X, y,
                                         scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]) 
Example 48
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_predict_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((Series, DataFrame))
    except ImportError:
        pass
    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cval.cross_val_predict(clf, X_df, y_ser) 
Example 49
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_sparse_fit_params():
    iris = load_iris()
    X, y = iris.data, iris.target
    clf = MockClassifier()
    fit_params = {'sparse_sample_weight': coo_matrix(np.eye(X.shape[0]))}
    a = cval.cross_val_score(clf, X, y, fit_params=fit_params)
    assert_array_equal(a, np.ones(3)) 
Example 50
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score():
    clf = MockClassifier()
    for a in range(-10, 10):
        clf.a = a
        # Smoke test
        scores = cval.cross_val_score(clf, X, y)
        assert_array_equal(scores, clf.score(X, y))

        # test with multioutput y
        scores = cval.cross_val_score(clf, X_sparse, X)
        assert_array_equal(scores, clf.score(X_sparse, X))

        scores = cval.cross_val_score(clf, X_sparse, y)
        assert_array_equal(scores, clf.score(X_sparse, y))

        # test with multioutput y
        scores = cval.cross_val_score(clf, X_sparse, X)
        assert_array_equal(scores, clf.score(X_sparse, X))

    # test with X and y as list
    list_check = lambda x: isinstance(x, list)
    clf = CheckingClassifier(check_X=list_check)
    scores = cval.cross_val_score(clf, X.tolist(), y.tolist())

    clf = CheckingClassifier(check_y=list_check)
    scores = cval.cross_val_score(clf, X, y.tolist())

    assert_raises(ValueError, cval.cross_val_score, clf, X, y,
                  scoring="sklearn")

    # test with 3d X
    X_3d = X[:, :, np.newaxis]
    clf = MockClassifier(allow_nd=True)
    scores = cval.cross_val_score(clf, X_3d, y)

    clf = MockClassifier(allow_nd=False)
    assert_raises(ValueError, cval.cross_val_score, clf, X_3d, y) 
Example 51
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((Series, DataFrame))
    except ImportError:
        pass
    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cval.cross_val_score(clf, X_df, y_ser) 
Example 52
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_mask():
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target
    cv_indices = cval.KFold(len(y), 5)
    scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices)
    cv_indices = cval.KFold(len(y), 5)
    cv_masks = []
    for train, test in cv_indices:
        mask_train = np.zeros(len(y), dtype=np.bool)
        mask_test = np.zeros(len(y), dtype=np.bool)
        mask_train[train] = 1
        mask_test[test] = 1
        cv_masks.append((mask_train, mask_test))  # append the boolean masks, not the index arrays
    scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks)
    assert_array_equal(scores_indices, scores_masks) 
Example 53
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_precomputed():
    # test for svm with precomputed kernel
    svm = SVC(kernel="precomputed")
    iris = load_iris()
    X, y = iris.data, iris.target
    linear_kernel = np.dot(X, X.T)
    score_precomputed = cval.cross_val_score(svm, linear_kernel, y)
    svm = SVC(kernel="linear")
    score_linear = cval.cross_val_score(svm, X, y)
    assert_array_equal(score_precomputed, score_linear)

    # Error raised for non-square X
    svm = SVC(kernel="precomputed")
    assert_raises(ValueError, cval.cross_val_score, svm, X, y)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cval.cross_val_score, svm,
                  linear_kernel.tolist(), y) 
Example 54
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_score_func():
    clf = MockClassifier()
    _score_func_args = []

    def score_func(y_test, y_predict):
        _score_func_args.append((y_test, y_predict))
        return 1.0

    with warnings.catch_warnings(record=True):
        scoring = make_scorer(score_func)
        score = cval.cross_val_score(clf, X, y, scoring=scoring)
    assert_array_equal(score, [1.0, 1.0, 1.0])
    assert len(_score_func_args) == 3 
Example 55
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_score_errors():
    class BrokenEstimator:
        pass

    assert_raises(TypeError, cval.cross_val_score, BrokenEstimator(), X) 
Example 56
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def train_test_split_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [MockDataFrame]
    try:
        from pandas import DataFrame
        types.append(DataFrame)
    except ImportError:
        pass
    for InputFeatureType in types:
        # X dataframe
        X_df = InputFeatureType(X)
        X_train, X_test = cval.train_test_split(X_df)
        assert_true(isinstance(X_train, InputFeatureType))
        assert_true(isinstance(X_test, InputFeatureType)) 
Example 57
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_cross_val_predict_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((Series, DataFrame))
    except ImportError:
        pass
    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cval.cross_val_predict(clf, X_df, y_ser) 
Example 58
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_sparse_fit_params():
    iris = load_iris()
    X, y = iris.data, iris.target
    clf = MockClassifier()
    fit_params = {'sparse_sample_weight': coo_matrix(np.eye(X.shape[0]))}
    a = cval.cross_val_score(clf, X, y, fit_params=fit_params)
    assert_array_equal(a, np.ones(3)) 
Example 59
Project: Supply-demand-forecasting   Author: LevinJ   File: sklearnbasemodel.py    MIT License
def run_croos_validation(self):
        features,labels,cv = self.getFeaturesLabel()
        scores = cross_validation.cross_val_score(self.clf, features, labels, cv=cv, scoring=mean_absolute_percentage_error_scoring, n_jobs=-1)
        print "cross validation scores: means, {}, std, {}, details, {}".format(np.absolute(scores.mean()), scores.std(), np.absolute(scores))
        return -np.absolute(scores.mean()) 
Example 60
Project: digit-ocr   Author: Nozdi   File: train.py    MIT License
def cv(model, X, y, n_iter=5, test_size=0.3):
    split = cross_validation.ShuffleSplit(
        len(X), n_iter=n_iter, test_size=test_size,
    )
    return cross_validation.cross_val_score(model, X, y, cv=split,
                                            scoring='accuracy', n_jobs=-1) 
Example 61
Project: Kaggle   Author: lawlite19   File: solution.py    MIT License
def baseline_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    #print u"Data info:\n",train_data.info()
    #print u'Data description:\n',train_data.describe()
    #display_data(train_data)  # simple display of the data info
    #display_with_process(train_data) # display with light processing, based on an understanding of the data, to verify hunches
    process_data = pre_processData(train_data,'process_train_data')  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # use a regex to extract the desired columns
    train_np = train_data.as_matrix()  # convert to a matrix
    '''Train the model'''
    X = train_np[:,1:]
    y = train_np[:,0]
    #=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X,y)
    print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})
    #=prediction = model.predict(X_test)
    #=cv_error = pd.DataFrame(data=list(X_test[np.where(prediction!=y_test)]),columns=list(train_data.columns)[1:])
    #=cv_error.to_csv(r'error.csv',index=True)
    #=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])

    '''Predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = pre_processData(test_data,'process_test_data')  # preprocess the data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
    result.to_csv(r'baseline_logisticRegression_result/prediction.csv',index=False)
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print cross_validation.cross_val_score(clf, X,y,cv=5)


# baseline: SVM model scores 0.78947 
Example 62
Project: Kaggle   Author: lawlite19   File: solution.py    MIT License
def baseline_logisticRegression_crossValidate():
    origin_train_data = pd.read_csv(r"data/train.csv")
    process_data = fe_preprocessData(origin_train_data,'process_train_data')  # preprocess the training data
    process_data_train,process_data_cv = train_test_split(process_data,test_size=0.2)
    train_data = process_data_train.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # use a regex to extract the desired columns
    train_np = train_data.as_matrix()  # convert to a matrix
    '''Train the model'''
    X_train = train_np[:,1:]
    y_train = train_np[:,0]
    model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    print pd.DataFrame({'columns':list(train_data.columns[1:]),'coef_':list(model.coef_.T)})

    cv_data = process_data_cv.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    cv_np = cv_data.as_matrix()
    X_cv = cv_np[:,1:]
    y_cv = cv_np[:,0]
    predictions = model.predict(X_cv)
    print np.float32(np.sum(predictions == y_cv))/np.float32(predictions.shape[0])

    '''Find the original rows that were mispredicted and save them to a file'''
    error_items = origin_train_data.loc[origin_train_data['PassengerId'].isin(process_data_cv[predictions != y_cv]['PassengerId'].values)]
    predictions_item = pd.DataFrame(data=process_data_cv[predictions != y_cv]['PassengerId'])
    predictions_item.columns=['error_PassengerId']
    error_result = pd.concat([error_items,predictions_item],axis=1)
    error_result.to_csv(r'error.csv',index=False)

    #=print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})
    #=prediction = model.predict(X_test)
    #=print np.float32(np.sum(prediction == y_test))/np.float32(prediction.shape[0])

    '''Predict on the test set'''
    '''test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data,'process_test_data',optimize=True)  # preprocess the data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
    result.to_csv(r'logisticRegression_result/prediction.csv',index=False)'''
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print cross_validation.cross_val_score(clf, X,y,cv=5) 
Example 63
Project: Kaggle   Author: lawlite19   File: solution.py    MIT License
def optimize_logisticRegression():
    train_data = pd.read_csv(r"data/train.csv")
    print u"Data info:\n",train_data.info()
    print u'Data description:\n',train_data.describe()
    #display_data(train_data)  # simple display of the data info
    #display_with_process(train_data) # display with light processing, based on an understanding of the data, to verify hunches
    process_data = fe_preprocessData(train_data,'process_train_data')  # preprocess the training data
    train_data = process_data.filter(regex='Survived|Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # use a regex to extract the desired columns
    train_np = train_data.as_matrix()  # convert to a matrix
    '''Train the model'''
    X = train_np[:,1:]
    y = train_np[:,0]
    #=X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
    #=model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X_train,y_train)
    model = linear_model.LogisticRegression(C=1.0,tol=1e-6).fit(X,y)
    print pd.DataFrame({"columns":list(train_data.columns)[1:],"coef_":list(model.coef_.T)})

    '''Predict on the test set'''
    test_data = pd.read_csv(r"data/test.csv")
    process_test_data = fe_preprocessData(test_data,'process_test_data')  # preprocess the data
    test_data = process_test_data.filter(regex='Age|SibSp|Parch|Fare|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    test_np = test_data.as_matrix()
    predict = model.predict(test_np)
    result = pd.DataFrame(data={'PassengerId':process_test_data['PassengerId'].as_matrix(),'Survived':predict.astype(np.int32)})
    result.to_csv(r'optimize_logisticRegression_result/prediction.csv',index=False)
    #clf = linear_model.LogisticRegression(C=1.0,tol=1e-6)
    #print cross_validation.cross_val_score(clf, X,y,cv=5)
## map two features to polynomial terms 
Example 64
Project: automl-phase-1   Author: jamesrobertlloyd   File: sandpit.py    MIT License
def _f(x):
        # iris = load_iris()
        X, y = make_hastie_10_2(random_state=0)
        x = np.ravel(x)
        f = np.zeros(x.shape)
        for i in range(f.size):
            clf = RandomForestClassifier(n_estimators=1, min_samples_leaf=int(np.round(x[i])), random_state=0)
            # scores = cross_val_score(clf, iris.data, iris.target)
            scores = cross_val_score(clf, X, y, cv=5)
            f[i] = -scores.mean()
        return f.ravel() 
Example 65
Project: patient-data-predictions   Author: b-carter   File: classification.py    GNU General Public License v3.0
def eval_cross_validation(patients, patient_feature_vectors, clf):
	X = np.array([patient_feature_vectors.get_patient_feature_vector(patient) for patient in patients])
	y = np.array([patient.transfused for patient in patients])
	scores = cross_validation.cross_val_score(clf, X, y, cv=5)
	print scores
	print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2) 
Example 66
Project: pines   Author: dmitru   File: test_models.py    MIT License
def test_iris(self):
        dataset = load_iris()
        score = np.mean(cross_val_score(
                DecisionTreeClassifier(tree_type=self.tree_type), dataset.data, dataset.target, cv=10))
        print('iris: tree_type: {}, score = {}'.format(self.tree_type, score))
        self.assertTrue(score > 0.8) 
Example 67
Project: pines   Author: dmitru   File: test_models.py    MIT License
def test_breast_cancer(self):
        dataset = load_breast_cancer()
        score = np.mean(cross_val_score(
                DecisionTreeClassifier(tree_type=self.tree_type), dataset.data, dataset.target, cv=10))
        print('breast_cancer: tree_type: {}, score = {}'.format(self.tree_type, score))
        self.assertTrue(score > 0.8) 
Example 68
Project: pines   Author: dmitru   File: test_models.py    MIT License
def test_iris(self):
        dataset = load_iris()
        score = np.mean(cross_val_score(
                DecisionTreeClassifier(tree_type=self.tree_type), dataset.data, dataset.target, cv=10))
        self.assertTrue(score > 0.8)
        print('iris: tree_type: {}, score = {}'.format(self.tree_type, score)) 
Example 69
Project: pines   Author: dmitru   File: test_models.py    MIT License
def test_breast_cancer(self):
        dataset = load_breast_cancer()
        score = np.mean(cross_val_score(
                DecisionTreeClassifier(tree_type=self.tree_type), dataset.data, dataset.target, cv=10))
        self.assertTrue(score > 0.8)
        print('breast_cancer: tree_type: {}, score = {}'.format(self.tree_type, score)) 
Example 70
Project: map-classification   Author: hunzikp   File: pixel_classifier.py    GNU General Public License v2.0 5 votes vote down vote up
def get_cv_score(self, n_estimators, filtername, filter_d, filter_sigmacolor, filter_sigmaspace, cv):
        train_data = self.get_training_data(filtername=filtername, filter_d=filter_d, filter_sigmacolor=filter_sigmacolor, filter_sigmaspace=filter_sigmaspace)
        rf = RandomForestClassifier(n_estimators=n_estimators)
        scores = cross_validation.cross_val_score(rf, train_data[:,2:5], train_data[:,5], cv=cv)
        return scores.mean() 
Example 71
Project: map-classification   Author: hunzikp   File: character_classifier.py    GNU General Public License v2.0 5 votes vote down vote up
def tune(self, par_list, cv):
        self.icc.makefeatures(self.featurefactory)
        X, y = self.icc.gettrainingdata()
        factors = self.label2factor(y)
        par_list = np.asarray(par_list)
        score_vec = []
        for par in par_list:
            clf_rf = self.skcl
            clf_rf.n_estimators = par
            scores = cross_validation.cross_val_score(clf_rf, X, factors, cv=cv, scoring='accuracy')
            score_vec.append(scores.mean())
            print "par: " + str(par) + ", score:" + str(scores.mean())
        score_vec = np.asarray(score_vec)  # list -> array so the boolean indexing below is elementwise
        par_opt = par_list[score_vec == max(score_vec)][0]
        self.skcl.n_estimators = par_opt
        self.skcl.fit(X, factors) 
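
The loop above hand-rolls a one-parameter grid search. The same tuning can be sketched with sklearn.grid_search.GridSearchCV, the grid-search counterpart of the sklearn.cross_validation module (clf_rf, par_list, X, factors and cv are the names from the example above; treat this as an alternative, not the project's code):

from sklearn.grid_search import GridSearchCV

grid = GridSearchCV(clf_rf, param_grid={'n_estimators': list(par_list)},
                    scoring='accuracy', cv=cv)
grid.fit(X, factors)
print grid.best_params_, grid.best_score_
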
Example 72
Project: Sklearn-type-class-for-multiple-algorithm-testing   Author: shubhankar90   File: algocomparison.py    GNU General Public License v3.0 5 votes vote down vote up
def _fit_cross_val(self,X, y, model, name):
        kfold = cross_validation.KFold(n=X.shape[0], n_folds=self.crossval_n_folds, random_state=self.seed)
        cv_results = cross_validation.cross_val_score(model, X, y, cv=kfold, scoring=self.scoring)
        return cv_results 
Example 73
Project: pandas2sklearn   Author: mouradmourafiq   File: test_dateset.py    MIT License 5 votes vote down vote up
def test_with_iris_dataframe(iris_data):
    dataset = DataSet(iris_data, target_column='target')
    pipeline = Pipeline([
        ('preprocess', DataSetTransformer([
            (['petal length (cm)', 'petal width (cm)'], StandardScaler()),
            ('sepal length (cm)', MinMaxScaler()),
            ('sepal width (cm)', None),
        ])),
        ('classify', SVC(kernel='linear'))
    ])
    scores = cross_val_score(pipeline, dataset, dataset.target)
    assert scores.mean() > 0.96 
Example 74
Project: r2-learner   Author: gmum   File: fit_models.py    MIT License 5 votes vote down vote up
def nk_folds(model, params, data, n=50, n_folds=10, n_jobs=4):

    model.set_params(**params)  # set_params expects keyword arguments
    scores = []
    for _ in xrange(n):
        data = shuffle_data(data)
        scores.append(cross_val_score(estimator=model, X=data.data, y=data.target, scoring='accuracy', cv=n_folds, n_jobs=n_jobs).mean())

    return np.mean(scores), np.std(scores) 
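
nk_folds estimates the spread of CV accuracy by repeating k-fold on reshuffled copies of the data. A related, lighter pattern (a sketch under the same old sklearn API, not the author's code) uses cross_validation.ShuffleSplit to draw n independent random splits; a sample then need not appear in a test set exactly once per repetition, but the mean/std estimate is comparable. model and data are the arguments from the function above:

from sklearn import cross_validation

cv = cross_validation.ShuffleSplit(len(data.target), n_iter=50,
                                   test_size=0.1, random_state=0)
scores = cross_validation.cross_val_score(model, data.data, data.target,
                                          scoring='accuracy', cv=cv)
print scores.mean(), scores.std()
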
Example 75
Project: jamespy_py3   Author: jskDr   File: _jgrid_r0.py    MIT License 5 votes vote down vote up
def _cv_LinearRegression_r0(xM, yV):

	print(xM.shape, yV.shape)

	clf = linear_model.Ridge()  # note: despite the function name, this fits a Ridge model
	kf5 = cross_validation.KFold(xM.shape[0], n_folds=5, shuffle=True)
	cv_scores = cross_validation.cross_val_score(clf, xM, yV, scoring='r2', cv=kf5, n_jobs=-1)

	return cv_scores 
Example 76
Project: Flu-Prediction   Author: RK900   File: Flu-Tree.py    GNU General Public License v3.0 4 votes vote down vote up
def predictFluSeq(seqs): # seqs is a list of sequence records parsed from your FASTA file
    #returns cross-val scores and MSE
    X0 = []

    # adding to X and y

    for i in range(0, len(seqs) - 1):
        X0.append(seqs[i].seq)

    y0 = []
    for j in range(1, len(seqs)):
        y0.append(seqs[j].seq)

    from Encoding_v2 import encoding

    # Encoding letters into numbers

    X = []
    for k in range(len(X0)):
        encoded_X = encoding(X0[k])
        X.append(encoded_X)

    y = []
    for l in range(len(y0)):
        encoded_y = encoding(y0[l])
        y.append(encoded_y)

    from sklearn import ensemble, cross_validation, metrics

    # Cross-Validation
    rfr = ensemble.RandomForestRegressor()
    rfrscores = cross_validation.cross_val_score(rfr, X, y, cv=2)

    cv_score = ("Random Forests cross-validation score", rfrscores)
    avg_cv_score = ("Average Cross-Val Accuracy: %0.2f (+/- %0.2f)" % (rfrscores.mean()*100, rfrscores.std() *100))

    # Mean Squared Error
    X_train,X_test,y_train,y_test = cross_validation.train_test_split(X,y,test_size=0.5,random_state=50)

    rfr.fit(X_train,y_train)
    y_predicted = rfr.predict(X_test)
    mse_score = ('Random Forests MSE:', metrics.mean_squared_error(y_test,y_predicted))

    return cv_score, avg_cv_score, mse_score 
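
A call sketch, assuming the FASTA records are parsed with Biopython (the file path is hypothetical):

from Bio import SeqIO

seqs = list(SeqIO.parse('flu.fasta', 'fasta'))  # hypothetical input path
cv_score, avg_cv_score, mse_score = predictFluSeq(seqs)
print(cv_score)
print(avg_cv_score)
print(mse_score)
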
Example 77
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 4 votes vote down vote up
def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
    # The digits samples are dependent: they are apparently grouped by authors,
    # although we don't have any information on the groups' segment locations
    # for this data. We can highlight this fact by computing k-fold cross-
    # validation with and without shuffling: we observe that the shuffled case
    # wrongly makes the IID assumption and is therefore too optimistic: it
    # estimates a much higher accuracy (around 0.96) than the
    # non-shuffled variant (around 0.86).

    digits = load_digits()
    X, y = digits.data[:800], digits.target[:800]
    model = SVC(C=10, gamma=0.005)
    n = len(y)

    cv = cval.KFold(n, 5, shuffle=False)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85)

    # Shuffling the data artificially breaks the dependency and hides the
    # overfitting of the model with regards to the writing style of the authors
    # by yielding a seriously overestimated score:

    cv = cval.KFold(n, 5, shuffle=True, random_state=0)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    cv = cval.KFold(n, 5, shuffle=True, random_state=1)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    # Similarly, StratifiedKFold should try to shuffle the data as little
    # as possible (while respecting the balanced class constraints)
    # and thus be able to detect the dependency by not overestimating
    # the CV score either. As the digits dataset is approximately balanced
    # the estimated mean score is close to the score measured with
    # non-shuffled KFold

    cv = cval.StratifiedKFold(y, 5)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85) 
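
When the grouping is known, the principled fix is to keep each group entirely on one side of every split. A sketch assuming a hypothetical per-sample authors array, using cval.LabelKFold (added to sklearn.cross_validation in 0.17; later renamed GroupKFold in model_selection):

# 'authors' is a hypothetical array mapping each of the 800 samples to its writer
cv = cval.LabelKFold(authors, n_folds=5)
mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
# no writer can leak across a split, so the estimate stays near the honest
# non-shuffled figure (~0.86) rather than the inflated ~0.96
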
Example 78
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0 4 votes vote down vote up
def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
    # The digits samples are dependent: they are apparently grouped by authors,
    # although we don't have any information on the groups' segment locations
    # for this data. We can highlight this fact by computing k-fold cross-
    # validation with and without shuffling: we observe that the shuffled case
    # wrongly makes the IID assumption and is therefore too optimistic: it
    # estimates a much higher accuracy (around 0.96) than the
    # non-shuffled variant (around 0.86).

    digits = load_digits()
    X, y = digits.data[:800], digits.target[:800]
    model = SVC(C=10, gamma=0.005)
    n = len(y)

    cv = cval.KFold(n, 5, shuffle=False)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85)

    # Shuffling the data artificially breaks the dependency and hides the
    # overfitting of the model with regards to the writing style of the authors
    # by yielding a seriously overestimated score:

    cv = cval.KFold(n, 5, shuffle=True, random_state=0)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    cv = cval.KFold(n, 5, shuffle=True, random_state=1)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    # Similarly, StratifiedKFold should try to shuffle the data as little
    # as possible (while respecting the balanced class constraints)
    # and thus be able to detect the dependency by not overestimating
    # the CV score either. As the digits dataset is approximately balanced
    # the estimated mean score is close to the score measured with
    # non-shuffled KFold

    cv = cval.StratifiedKFold(y, 5)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85) 
Example 79
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0 4 votes vote down vote up
def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
    # The digits samples are dependent: they are apparently grouped by authors,
    # although we don't have any information on the groups' segment locations
    # for this data. We can highlight this fact by computing k-fold cross-
    # validation with and without shuffling: we observe that the shuffled case
    # wrongly makes the IID assumption and is therefore too optimistic: it
    # estimates a much higher accuracy (around 0.96) than the
    # non-shuffled variant (around 0.86).

    digits = load_digits()
    X, y = digits.data[:800], digits.target[:800]
    model = SVC(C=10, gamma=0.005)
    n = len(y)

    cv = cval.KFold(n, 5, shuffle=False)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85)

    # Shuffling the data artificially breaks the dependency and hides the
    # overfitting of the model with regards to the writing style of the authors
    # by yielding a seriously overestimated score:

    cv = cval.KFold(n, 5, shuffle=True, random_state=0)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    cv = cval.KFold(n, 5, shuffle=True, random_state=1)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    # Similarly, StratifiedKFold should try to shuffle the data as little
    # as possible (while respecting the balanced class constraints)
    # and thus be able to detect the dependency by not overestimating
    # the CV score either. As the digits dataset is approximately balanced
    # the estimated mean score is close to the score measured with
    # non-shuffled KFold

    cv = cval.StratifiedKFold(y, 5)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85) 
Example 80
Project: FLASH   Author: yuyuz   File: ml_framework.py    GNU General Public License v3.0 4 votes vote down vote up
def pipeline(params, **kwargs):
    # params is a dict that contains the hyperparameters
    # The values are forwarded as strings, so convert and validate them as needed

    data_path = kwargs['data_path']
    dataset = kwargs['dataset']
    enable_cache = int(kwargs['use_caching']) == 1

    data_train = os.path.join(data_path, dataset, 'train.arff')
    X, y = load_arff_data(data_train)

    dpr = get_data_preprocessor_rescaling(params)
    params = get_data_preprocessor_balancing(params, y)
    fp = get_feature_preprocessor(params)
    clf = get_classifier(params)

    # **kwargs contains further information, like for cross validation
    #    kwargs['folds'] is 1 when no cv
    #    kwargs['fold'] is the current fold. The index is zero-based

    if enable_cache:
        steps = []
        if dpr is not None:
            steps.append(dpr)
        if fp is not None:
            steps.append(fp)
        steps.append(clf)
        import cache
        score = cache.cached_run(steps, X, y)
        result = 100.0 - 100.0 * score
    else:
        # Run your algorithm and receive a result, you want to minimize
        steps = []
        if dpr is not None:
            steps.append(('data_preprocessor_rescaling', dpr))
        if fp is not None:
            steps.append(('feature_preprocessor', fp))
        steps.append(('classifier', clf))

        ppl = Pipeline(steps)
        scores = cross_val_score(ppl, X, y, cv=int(kwargs['cv_folds']))
        result = 100.0 - 100.0 * scores.mean()

    return result
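
A call sketch (the keyword names follow the kwargs the function reads; the params dict and all values are hypothetical):

params = {'classifier': 'random_forest'}  # hypothetical parameter dict
error = pipeline(params, data_path='/data', dataset='iris',
                 use_caching='0', cv_folds='5')
print('error to minimize: %.2f' % error)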