Python sklearn.cross_validation.ShuffleSplit() Examples

The following are 16 code examples of sklearn.cross_validation.ShuffleSplit(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.cross_validation , or try the search function

Example #1

Source File: ml.py From info-flow-experiments with GNU General Public License v3.0

6 votes

def split_data(X, y, splittype='timed', splitfrac=0.1, verbose=False):  
    if(splittype == 'rand'):
        rs1 = cross_validation.ShuffleSplit(len(X), n_iter=1, test_size=splitfrac)
        for train, test in rs1:
            if(verbose):
                print "Training blocks:", train 
                print "Test blocks:", test
            X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
    elif(splittype == 'timed'):
        split = int((1.-splitfrac)*len(X))
        if(verbose):
            print "Split at block ", str(split) 
        X_train, y_train, X_test, y_test = X[:split], y[:split], X[split:], y[split:]
    else:
        raw_input("Split type ERROR in ml.py")  
    return X_train, y_train, X_test, y_test

Example #2

Source File: Deopen_classification.py From Deopen with MIT License

6 votes

def data_split(inputfile):
    data = hkl.load(inputfile)
    X = data['mat']
    X_kspec = data['kmer']
    y = data['y']
    rs = ShuffleSplit(len(y), n_iter=1,random_state = 1)
    X_kspec = X_kspec.reshape((X_kspec.shape[0],1024,4))
    X = np.concatenate((X,X_kspec), axis = 1)
    X = X[:,np.newaxis]
    X = X.transpose((0,1,3,2))
    for train_idx, test_idx in rs:
        X_train = X[train_idx,:]
        y_train = y[train_idx]
        X_test = X[test_idx,:]
        y_test = y[test_idx]
    X_train = X_train.astype('float32')
    y_train = y_train.astype('int32')
    X_test = X_test.astype('float32')
    y_test = y_test.astype('int32')
    return [X_train, y_train, X_test, y_test]

#define the network architecture

Example #3

Source File: 02_tuning.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License

6 votes

def grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__stop_words=[None, "english"],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print clf

    return clf

Example #4

Source File: 04_sent.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License

6 votes

def __grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print clf

    return clf

Example #5

Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License

6 votes

def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]

Example #6

Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License

6 votes

def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test]

Example #7

Source File: Deopen_regression.py From Deopen with MIT License

5 votes

def data_split(inputfile,reads_count):
    data = hkl.load(inputfile)
    reads_count= hkl.load(reads_count)
    X = data['mat']
    X_kspec = data['kmer']
    reads_count = np.array(reads_count)
    y = np.mean(reads_count, axis = 1)
    y = np.log(y+1e-3)
    rs = ShuffleSplit(len(y), n_iter=1,random_state = 1)
    X_kspec = X_kspec.reshape((X_kspec.shape[0],1024,4))
    X = np.concatenate((X,X_kspec), axis = 1)
    X = X[:,np.newaxis]
    X = X.transpose((0,1,3,2))
    for train_idx, test_idx in rs:
        X_train = X[train_idx,:]
        y_train = y[train_idx]
        X_test = X[test_idx,:]
        y_test = y[test_idx]
    X_train = X_train.astype('float32')
    y_train = y_train.astype('float32')
    X_test = X_test.astype('float32')
    y_test = y_test.astype('float32')
    print 'Data prepration done!'
    return [X_train, y_train, X_test, y_test]


#define the network architecture

Example #8

Source File: __init__.py From kaggle-right-whale with MIT License

5 votes

def __call__(self, X, y, net):
        if self.eval_size is not None:
            if net.regression or not self.stratify:
                # test_size = self.eval_size
                # kf = ShuffleSplit(
                #     y.shape[0], test_size=test_size,
                #     random_state=self.random_state
                # )
                # train_indices, valid_indices = next(iter(kf))
                # valid_indices = shuffle(valid_indices)
                test_size = 1 - self.eval_size
                kf = ShuffleSplit(
                    y.shape[0], test_size=test_size,
                    random_state=self.random_state
                )
                valid_indices, train_indices = next(iter(kf))
            else:
                n_folds = int(round(1 / self.eval_size))
                kf = StratifiedKFold(y, n_folds=n_folds, random_state=self.random_state)
                train_indices, valid_indices = next(iter(kf))

            X_train, y_train = X[train_indices], y[train_indices]
            X_valid, y_valid = X[valid_indices], y[valid_indices]
        else:
            X_train, y_train = X, y
            X_valid, y_valid = X[len(X):], y[len(y):]

        return X_train, X_valid, y_train, y_valid

Example #9

Source File: train_model.py From kaggle-right-whale with MIT License

5 votes

def train_test_split(X, y, test_size=0.25, random_state=42, stratify=True):
    if stratify:
        n_folds = int(round(1 / test_size))
        sss = StratifiedKFold(y, n_folds=n_folds, random_state=random_state)
    else:
        sss = ShuffleSplit(len(y), test_size=test_size, random_state=random_state)
    train_idx, test_idx = iter(sss).next()
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

Example #10

Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License

5 votes

def test_shuffle_split():
    ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
    ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
    ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
    for typ in six.integer_types:
        ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0)
    for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
        assert_array_equal(t1[0], t2[0])
        assert_array_equal(t2[0], t3[0])
        assert_array_equal(t3[0], t4[0])
        assert_array_equal(t1[1], t2[1])
        assert_array_equal(t2[1], t3[1])
        assert_array_equal(t3[1], t4[1])

Example #11

Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License

5 votes

def test_shufflesplit_errors():
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=2.0)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=1.0)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=0.1,
                  train_size=0.95)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=11)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3)
    assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None,
                  train_size=None)

Example #12

Source File: test_cross_validation.py From twitter-stock-recommendation with MIT License

5 votes

def test_shufflesplit_reproducible():
    # Check that iterating twice on the ShuffleSplit gives the same
    # sequence of train-test when the random_state is given
    ss = cval.ShuffleSplit(10, random_state=21)
    assert_array_equal(list(a for a, b in ss), list(a for a, b in ss))

Example #13

Source File: ml.py From info-flow-experiments with GNU General Public License v3.0

4 votes

def crossVal_algo(k, algo, params, X, y, splittype, splitfrac, verbose=False):              # performs cross_validation
    if(splittype=='rand'):
        rs2 = cross_validation.ShuffleSplit(len(X), n_iter=k, test_size=splitfrac)
    elif(splittype=='timed'):
        rs2 = cross_validation.KFold(n=len(X), n_folds=k)
    max, max_params = 0, {}
    par = []
    for param in params.keys():
        par.append(params[param])
    for p in product(*par):
        if(verbose):
            print "val=", p
        score = 0.0
        for train, test in rs2:
            X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
            X_train = np.array([item for sublist in X_train for item in sublist])
            y_train = np.array([item for sublist in y_train for item in sublist])
            X_test = np.array([item for sublist in X_test for item in sublist])
            y_test = np.array([item for sublist in y_test for item in sublist])
            #print X_train.shape, y_train.shape, X_test.shape, y_test.shape
            if(algo == 'svc'):
                clf = LinearSVC(C=p[params.keys().index('C')],
                    penalty="l1", dual=False)               ## Larger C increases model complexity
            if(algo=='kNN'):
                clf = KNeighborsClassifier(n_neighbors=p[params.keys().index('k')], 
                    warn_on_equidistant=False, p=p[params.keys().index('p')])
            if(algo=='linearSVM'):
                clf = svm.SVC(kernel='linear', C=p[params.keys().index('C')])
            if(algo=='polySVM'):
                clf = svm.SVC(kernel='poly', degree = p[params.keys().index('degree')], 
                    C=p[params.keys().index('C')])
            if(algo=='rbfSVM'):
                clf = svm.SVC(kernel='rbf', gamma = p[params.keys().index('gamma')], 
                    C=p[params.keys().index('C')])          ## a smaller gamma gives a decision boundary with a smoother curvature
            if(algo=='logit'):
                clf = LogisticRegression(penalty=p[params.keys().index('penalty')], dual=False, 
                    C=p[params.keys().index('C')])
            if(algo=='tree'):
                clf = ExtraTreesClassifier(n_estimators=p[params.keys().index('ne')], compute_importances=True, random_state=0)
            if(algo=='randlog'):
                clf = RandomizedLogisticRegression(C=p[params.keys().index('C')])
            clf.fit(X_train, y_train)
            score += clf.score(X_test, y_test)
        score /= k
        if(verbose):
            print score
        if score>max:
            max = score
            max_params = p
            classifier = clf
    return max, max_params, classifier

Example #14

Source File: 02_tuning.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License

4 votes

def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors)

Example #15

Source File: 03_clean.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License

4 votes

def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    clfs = []  # just to later get the median

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors)

Example #16

Source File: 04_sent.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License

4 votes

def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    clfs = []  # just to later get the median

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors)