Python sklearn.cross_validation.ShuffleSplit() Examples

The following are 16 code examples of sklearn.cross_validation.ShuffleSplit(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.cross_validation , or try the search function .
Example #1
Source File: ml.py    From info-flow-experiments with GNU General Public License v3.0 6 votes vote down vote up
def split_data(X, y, splittype='timed', splitfrac=0.1, verbose=False):  
    if(splittype == 'rand'):
        rs1 = cross_validation.ShuffleSplit(len(X), n_iter=1, test_size=splitfrac)
        for train, test in rs1:
            if(verbose):
                print "Training blocks:", train 
                print "Test blocks:", test
            X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
    elif(splittype == 'timed'):
        split = int((1.-splitfrac)*len(X))
        if(verbose):
            print "Split at block ", str(split) 
        X_train, y_train, X_test, y_test = X[:split], y[:split], X[split:], y[split:]
    else:
        raw_input("Split type ERROR in ml.py")  
    return X_train, y_train, X_test, y_test 
Example #2
Source File: Deopen_classification.py    From Deopen with MIT License 6 votes vote down vote up
def data_split(inputfile):
    data = hkl.load(inputfile)
    X = data['mat']
    X_kspec = data['kmer']
    y = data['y']
    rs = ShuffleSplit(len(y), n_iter=1,random_state = 1)
    X_kspec = X_kspec.reshape((X_kspec.shape[0],1024,4))
    X = np.concatenate((X,X_kspec), axis = 1)
    X = X[:,np.newaxis]
    X = X.transpose((0,1,3,2))
    for train_idx, test_idx in rs:
        X_train = X[train_idx,:]
        y_train = y[train_idx]
        X_test = X[test_idx,:]
        y_test = y[test_idx]
    X_train = X_train.astype('float32')
    y_train = y_train.astype('int32')
    X_test = X_test.astype('float32')
    y_test = y_test.astype('int32')
    return [X_train, y_train, X_test, y_test]

#define the network architecture 
Example #3
Source File: 02_tuning.py    From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License 6 votes vote down vote up
def grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__stop_words=[None, "english"],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print clf

    return clf 
Example #4
Source File: 04_sent.py    From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License 6 votes vote down vote up
def __grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print clf

    return clf 
Example #5
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example #6
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example #7
Source File: Deopen_regression.py    From Deopen with MIT License 5 votes vote down vote up
def data_split(inputfile,reads_count):
    data = hkl.load(inputfile)
    reads_count= hkl.load(reads_count)
    X = data['mat']
    X_kspec = data['kmer']
    reads_count = np.array(reads_count)
    y = np.mean(reads_count, axis = 1)
    y = np.log(y+1e-3)
    rs = ShuffleSplit(len(y), n_iter=1,random_state = 1)
    X_kspec = X_kspec.reshape((X_kspec.shape[0],1024,4))
    X = np.concatenate((X,X_kspec), axis = 1)
    X = X[:,np.newaxis]
    X = X.transpose((0,1,3,2))
    for train_idx, test_idx in rs:
        X_train = X[train_idx,:]
        y_train = y[train_idx]
        X_test = X[test_idx,:]
        y_test = y[test_idx]
    X_train = X_train.astype('float32')
    y_train = y_train.astype('float32')
    X_test = X_test.astype('float32')
    y_test = y_test.astype('float32')
    print 'Data prepration done!'
    return [X_train, y_train, X_test, y_test]


#define the network architecture 
Example #8
Source File: __init__.py    From kaggle-right-whale with MIT License 5 votes vote down vote up
def __call__(self, X, y, net):
        if self.eval_size is not None:
            if net.regression or not self.stratify:
                # test_size = self.eval_size
                # kf = ShuffleSplit(
                #     y.shape[0], test_size=test_size,
                #     random_state=self.random_state
                # )
                # train_indices, valid_indices = next(iter(kf))
                # valid_indices = shuffle(valid_indices)
                test_size = 1 - self.eval_size
                kf = ShuffleSplit(
                    y.shape[0], test_size=test_size,
                    random_state=self.random_state
                )
                valid_indices, train_indices = next(iter(kf))
            else:
                n_folds = int(round(1 / self.eval_size))
                kf = StratifiedKFold(y, n_folds=n_folds, random_state=self.random_state)
                train_indices, valid_indices = next(iter(kf))

            X_train, y_train = X[train_indices], y[train_indices]
            X_valid, y_valid = X[valid_indices], y[valid_indices]
        else:
            X_train, y_train = X, y
            X_valid, y_valid = X[len(X):], y[len(y):]

        return X_train, X_valid, y_train, y_valid 
Example #9
Source File: train_model.py    From kaggle-right-whale with MIT License 5 votes vote down vote up
def train_test_split(X, y, test_size=0.25, random_state=42, stratify=True):
    if stratify:
        n_folds = int(round(1 / test_size))
        sss = StratifiedKFold(y, n_folds=n_folds, random_state=random_state)
    else:
        sss = ShuffleSplit(len(y), test_size=test_size, random_state=random_state)
    train_idx, test_idx = iter(sss).next()
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx] 
Example #10
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_shuffle_split():
    ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
    ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
    ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
    for typ in six.integer_types:
        ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0)
    for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
        assert_array_equal(t1[0], t2[0])
        assert_array_equal(t2[0], t3[0])
        assert_array_equal(t3[0], t4[0])
        assert_array_equal(t1[1], t2[1])
        assert_array_equal(t2[1], t3[1])
        assert_array_equal(t3[1], t4[1]) 
Example #11
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_shufflesplit_errors():
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=2.0)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=1.0)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=0.1,
                  train_size=0.95)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=11)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3)
    assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None,
                  train_size=None) 
Example #12
Source File: test_cross_validation.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_shufflesplit_reproducible():
    # Check that iterating twice on the ShuffleSplit gives the same
    # sequence of train-test when the random_state is given
    ss = cval.ShuffleSplit(10, random_state=21)
    assert_array_equal(list(a for a, b in ss), list(a for a, b in ss)) 
Example #13
Source File: ml.py    From info-flow-experiments with GNU General Public License v3.0 4 votes vote down vote up
def crossVal_algo(k, algo, params, X, y, splittype, splitfrac, verbose=False):              # performs cross_validation
    if(splittype=='rand'):
        rs2 = cross_validation.ShuffleSplit(len(X), n_iter=k, test_size=splitfrac)
    elif(splittype=='timed'):
        rs2 = cross_validation.KFold(n=len(X), n_folds=k)
    max, max_params = 0, {}
    par = []
    for param in params.keys():
        par.append(params[param])
    for p in product(*par):
        if(verbose):
            print "val=", p
        score = 0.0
        for train, test in rs2:
            X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
            X_train = np.array([item for sublist in X_train for item in sublist])
            y_train = np.array([item for sublist in y_train for item in sublist])
            X_test = np.array([item for sublist in X_test for item in sublist])
            y_test = np.array([item for sublist in y_test for item in sublist])
            #print X_train.shape, y_train.shape, X_test.shape, y_test.shape
            if(algo == 'svc'):
                clf = LinearSVC(C=p[params.keys().index('C')],
                    penalty="l1", dual=False)               ## Larger C increases model complexity
            if(algo=='kNN'):
                clf = KNeighborsClassifier(n_neighbors=p[params.keys().index('k')], 
                    warn_on_equidistant=False, p=p[params.keys().index('p')])
            if(algo=='linearSVM'):
                clf = svm.SVC(kernel='linear', C=p[params.keys().index('C')])
            if(algo=='polySVM'):
                clf = svm.SVC(kernel='poly', degree = p[params.keys().index('degree')], 
                    C=p[params.keys().index('C')])
            if(algo=='rbfSVM'):
                clf = svm.SVC(kernel='rbf', gamma = p[params.keys().index('gamma')], 
                    C=p[params.keys().index('C')])          ## a smaller gamma gives a decision boundary with a smoother curvature
            if(algo=='logit'):
                clf = LogisticRegression(penalty=p[params.keys().index('penalty')], dual=False, 
                    C=p[params.keys().index('C')])
            if(algo=='tree'):
                clf = ExtraTreesClassifier(n_estimators=p[params.keys().index('ne')], compute_importances=True, random_state=0)
            if(algo=='randlog'):
                clf = RandomizedLogisticRegression(C=p[params.keys().index('C')])
            clf.fit(X_train, y_train)
            score += clf.score(X_test, y_test)
        score /= k
        if(verbose):
            print score
        if score>max:
            max = score
            max_params = p
            classifier = clf
    return max, max_params, classifier 
Example #14
Source File: 02_tuning.py    From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License 4 votes vote down vote up
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors) 
Example #15
Source File: 03_clean.py    From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License 4 votes vote down vote up
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    clfs = []  # just to later get the median

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors) 
Example #16
Source File: 04_sent.py    From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License 4 votes vote down vote up
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    clfs = []  # just to later get the median

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors)