Python sklearn.cross_validation.StratifiedShuffleSplit() Examples

The following code examples show how to use sklearn.cross_validation.StratifiedShuffleSplit(). They are taken from open-source Python projects.
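
All of the examples below use the legacy cross_validation API, in which the labels and the number of iterations are passed to the StratifiedShuffleSplit constructor and the splitter object is iterated directly. Note that sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20. Here is a minimal sketch of the legacy usage and its model_selection equivalent (the label array is illustrative):

import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit

y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
# legacy API: labels go to the constructor and the object itself is iterable
sss = StratifiedShuffleSplit(y, n_iter=3, test_size=0.25, random_state=0)
for train_idx, test_idx in sss:
    print(train_idx, test_idx)

# modern equivalent (scikit-learn >= 0.18), where the labels move to split():
#   from sklearn.model_selection import StratifiedShuffleSplit
#   sss = StratifiedShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
#   for train_idx, test_idx in sss.split(np.zeros(len(y)), y):
#       ...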

Example 1
Project: jr-tools   Author: kingjr   File: classifiers.py    BSD 2-Clause "Simplified" License
def fit(self, X, y):
        from copy import deepcopy
        import numpy as np
        from sklearn.cross_validation import StratifiedShuffleSplit
        cv = StratifiedShuffleSplit(y, self._n_repeats,
                                    train_size=self._train_size,
                                    test_size=self._test_size,
                                    random_state=self._random_state)
        attr = dict()
        for key in self._mean_attributes:
            attr[key] = list()
        # fit on each split and collect the classifier attributes
        for train, test in cv:
            self._clf.fit(X[train], y[train])
            for key in self._mean_attributes:
                attr[key].append(deepcopy(getattr(self._clf, key)))

        # average each collected attribute across the repeats
        for key in self._mean_attributes:
            setattr(self._clf, key, np.mean(attr[key], axis=0)) 
Example 2
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_stratified_shuffle_split_init():
    y = np.asarray([0, 1, 1, 1, 2, 2, 2])
    # Check that error is raised if there is a class with only one sample
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2)

    # Check that error is raised if the test set size is smaller than n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2)
    # Check that error is raised if the train set size is smaller than
    # n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2)

    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8)

    # Train size or test size too small
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, train_size=2)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, test_size=2) 
Example 3
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_stratified_shuffle_split_iter():
    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
          np.array([-1] * 800 + [1] * 50)
          ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                          random_state=0)
        for train, test in sss:
            assert_array_equal(np.unique(y[train]), np.unique(y[test]))
            # Check that folds keep class proportions
            p_train = (np.bincount(np.unique(y[train], return_inverse=True)[1])
                       / float(len(y[train])))
            p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1])
                      / float(len(y[test])))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(y[train].size + y[test].size, y.size)
            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) 
Example 4
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_stratified_shuffle_split_init():
    y = np.asarray([0, 1, 1, 1, 2, 2, 2])
    # Check that error is raised if there is a class with only one sample
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2)

    # Check that error is raised if the test set size is smaller than n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2)
    # Check that error is raised if the train set size is smaller than
    # n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2)

    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8)

    # Train size or test size too small
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, train_size=2)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, test_size=2) 
Example 5
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_stratified_shuffle_split_iter():
    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
          np.array([-1] * 800 + [1] * 50)
          ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                          random_state=0)
        for train, test in sss:
            assert_array_equal(np.unique(y[train]), np.unique(y[test]))
            # Check that folds keep class proportions
            p_train = (np.bincount(np.unique(y[train], return_inverse=True)[1])
                       / float(len(y[train])))
            p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1])
                      / float(len(y[test])))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(y[train].size + y[test].size, y.size)
            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) 
Example 6
Project: lime   Author: jklynch   File: stability_selection.py    MIT License
def select_features(aligned_snp_df, aligned_taxa_df, lo_alpha_coef):
    X = aligned_taxa_df.values
    y = aligned_snp_df.values.flatten()
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        lars_cv = LassoLarsCV(cv=StratifiedShuffleSplit(y, n_iter=100, test_size=0.2)).fit(X, y)

    print('lars_cv.alphas_: {}'.format(lars_cv.alphas_))
    alphas = np.linspace(lars_cv.alphas_[0], lo_alpha_coef * lars_cv.alphas_[0], 10)
    print('alphas: {}'.format(alphas))
    clf = RandomizedLasso(
        alpha=alphas,
        sample_fraction=0.8,
        n_resampling=1000
        #random_state=13
    ).fit(X, y)

    feature_scores_df = pd.DataFrame(clf.scores_, index=aligned_taxa_df.columns)

    return feature_scores_df, lars_cv.alphas_ 
Example 7
Project: BloodTestReportOCR   Author: csxiaoyaojianxian   File: caffe_sex_train_predict.py    Apache License 2.0
def create_data_lmdb():

    # prepare features and labels
    X, y = extract('data_set.csv')
    vec_log = np.vectorize(lambda x: x)
    vec_int = np.vectorize(lambda s: int(s[-1]))
    features = vec_log(X)
    labels = vec_int(y)

    # train : test = 9 : 1
    sss = StratifiedShuffleSplit(labels, 1, test_size=0.1, random_state=0)
    train_idx, test_idx = list(sss)[0]  # take the single generated split

    features_training = features[train_idx]
    labels_training = labels[train_idx]

    features_testing = features[test_idx]
    labels_testing = labels[test_idx]

    # normalized data 66%, unnormalized data 53%
    features_training, features_testing = nomalize(features_training, features_testing)

    load_data_into_lmdb("train_data_lmdb", features_training, labels_training)
    load_data_into_lmdb("test_data_lmdb", features_testing, labels_testing) 
Example 8
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_stratified_shuffle_split_init():
    y = np.asarray([0, 1, 1, 1, 2, 2, 2])
    # Check that error is raised if there is a class with only one sample
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2)

    # Check that error is raised if the test set size is smaller than n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2)
    # Check that error is raised if the train set size is smaller than
    # n_classes
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2)

    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2])
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8)

    # Train size or test size too small
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, train_size=2)
    assert_raises(ValueError, cval.StratifiedShuffleSplit, y, test_size=2) 
Example 9
Project: melanoma-transfer   Author: learningtitans   File: data.py    GNU General Public License v3.0
def split_indices_old(files, labels, test_size=0.1, random_state=RANDOM_STATE):
    names = get_names(files)
    labels = get_labels(names, per_patient=True)
    spl = cross_validation.StratifiedShuffleSplit(labels[:, 0],
                                                  test_size=test_size,
                                                  random_state=random_state,
                                                  n_iter=1)
    tr, te = next(iter(spl))
    tr = np.hstack([tr * 2, tr * 2 + 1])
    te = np.hstack([te * 2, te * 2 + 1])
    return tr, te 
Example 10
Project: melanoma-transfer   Author: learningtitans   File: data.py    GNU General Public License v3.0
# Necessary for running with training on the melanoma database, not using per_patient
def split_indices(files, labels, label_file, test_size=0.1, random_state=RANDOM_STATE):
    names = get_names(files)
    labels = get_labels(names, label_file=label_file, per_patient=False)
    spl = cross_validation.StratifiedShuffleSplit(labels,
                                                  test_size=test_size,
                                                  random_state=random_state,
                                                  n_iter=1)
    tr, te = next(iter(spl))
    return tr, te 
Example 11
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_stratified_shuffle_split_iter_no_indices():
    y = np.asarray([0, 1, 2] * 10)

    sss1 = cval.StratifiedShuffleSplit(y, indices=False, random_state=0)
    train_mask, test_mask = next(iter(sss1))

    sss2 = cval.StratifiedShuffleSplit(y, indices=True, random_state=0)
    train_indices, test_indices = next(iter(sss2))

    assert_array_equal(sorted(test_indices), np.where(test_mask)[0]) 
Example 12
Project: DataMining   Author: lidalei   File: kernel_selection2.py    MIT License
def hot(X, y):
    C_range = np.logspace(-15, 15, 31, base=2.0)
    gamma_range = np.logspace(-15, 15, 31, base=2.0)
     
#     param_grid = dict(gamma=gamma_range, C=C_range)
#     cv = StratifiedShuffleSplit(y, n_iter=10, test_size=0.2, random_state=42)
    roc_auc_scorer = get_scorer("roc_auc")
    scores = []
    for C in C_range:
        for gamma in gamma_range:
            auc_scorer = []
            for train, test in KFold(n=len(X), n_folds=10, random_state=42):
                rbf_svc = svm.SVC(C=C, kernel='rbf', gamma=gamma, probability=True)
                X_train, y_train = X[train], y[train]
                X_test, y_test = X[test], y[test]
                rbf_clf = rbf_svc.fit(X_train, y_train)
                auc_scorer.append(roc_auc_scorer(rbf_clf, X_test, y_test))
            scores.append(np.mean(auc_scorer))
#     grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
#     grid.fit(X, y)
#     scores = [x[1] for x in grid.grid_scores_]
    scores = np.array(scores).reshape(len(C_range), len(gamma_range))
    print(scores)
    plt.figure(figsize=(15, 12))
    plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
               norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
    plt.xlabel('gamma')
    plt.ylabel('C')
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=90)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.title('AUC')
    plt.show() 
Example 13
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_stratified_shuffle_split_iter():
    ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
          np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
          np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2),
          np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
          np.array([-1] * 800 + [1] * 50)
          ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y, 6, test_size=0.33,
                                          random_state=0)
        test_size = np.ceil(0.33 * len(y))
        train_size = len(y) - test_size
        for train, test in sss:
            assert_array_equal(np.unique(y[train]), np.unique(y[test]))
            # Check that folds keep class proportions
            p_train = (np.bincount(np.unique(y[train],
                                   return_inverse=True)[1]) /
                       float(len(y[train])))
            p_test = (np.bincount(np.unique(y[test],
                                  return_inverse=True)[1]) /
                      float(len(y[test])))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(len(train) + len(test), y.size)
            assert_equal(len(train), train_size)
            assert_equal(len(test), test_size)
            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), []) 
Example 14
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_stratified_shuffle_split_overlap_train_test_bug():
    # See https://github.com/scikit-learn/scikit-learn/issues/6121 for
    # the original bug report
    labels = [0, 1, 2, 3] * 3 + [4, 5] * 5

    splits = cval.StratifiedShuffleSplit(labels, n_iter=1,
                                         test_size=0.5, random_state=0)
    train, test = next(iter(splits))

    assert_array_equal(np.intersect1d(train, test), []) 
Example 15
Project: kaggle_otto   Author: ahara   File: utils.py    BSD 3-Clause "New" or "Revised" License
def stratified_split(x, y, test_size=0.2):
    strat_shuffled_split = StratifiedShuffleSplit(y, n_iter=1, test_size=test_size, random_state=23)
    train_index, valid_index = next(iter(strat_shuffled_split))  # take the single split

    x_train, y_train, x_valid, y_valid = x[train_index, :], y[train_index], x[valid_index, :], y[valid_index]

    return x_train, y_train, x_valid, y_valid 
Example 16
Project: painters   Author: inejc   File: data_dirs_organizer.py    MIT License
def _train_val_split_indices(labels):
    split = StratifiedShuffleSplit(
        labels, n_iter=1, test_size=VAL_SIZE, random_state=42)
    indices_tr, indices_val = next(iter(split))

    _save_organized_data_info(
        split.classes, indices_tr, indices_val, multi_crop=False)
    _save_organized_data_info(
        split.classes, indices_tr, indices_val, multi_crop=True)
    return indices_tr, indices_val, split.classes 
Example 17
Project: dnnwsd   Author: crscardellino   File: supervised.py    BSD 3-Clause "New" or "Revised" License
def split_dataset(self):
        if self._kfolds > 0:
            dataset_split = StratifiedKFold(
                self._processor.target, n_folds=self._kfolds, shuffle=True
            )
        else:
            dataset_split = StratifiedShuffleSplit(
                self._processor.target, n_iter=1, train_size=TRAIN_RATIO, test_size=None
            )

        return dataset_split 
Example 18
Project: kaggle   Author: ldamewood   File: xgb.py    MIT License
def do_train(data, bids, params, eval_size=0.05):
    mindex = ['bidder_id', 'bid_id']
    y = data['outcome'].values.astype('int')
    sss = StratifiedShuffleSplit(y, 1, test_size=eval_size, random_state=0)
    train_idx, evalu_idx = next(iter(sss))
    train = data.iloc[train_idx]
    evalu = data.iloc[evalu_idx]
    tr = pd.merge(train, bids).set_index(mindex)
    ev = pd.merge(evalu, bids).set_index(mindex)

    X_train = tr.drop('outcome', axis=1).values.astype('float')
    y_train = tr['outcome'].values.astype('int')
    X_eval = ev.drop('outcome', axis=1).values.astype('float')
    y_eval = ev['outcome'].values.astype('int')
    dtrain = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
    deval = xgb.DMatrix(X_eval, label=y_eval, missing=np.nan)
    ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
    params['scale_pos_weight'] = ratio
    
    def logregobj(preds, dtrain):
        labels = dtrain.get_label()
        preds = 1.0 / (1.0 + np.exp(-preds))
        grad = preds - labels
        hess = preds * (1.0-preds)
        return grad, hess
    
    def evalerror(preds, dtrain):
        labels = dtrain.get_label()
        if len(labels) == tr.shape[0]:
            ids = [x[0] for x in tr.index.ravel()]
        else:
            ids = [x[0] for x in ev.index.ravel()]
        return 'auc', -auc_score(labels, preds, ids)
    
    params['validation_set'] = deval
    evals = dict()
    watchlist = [(dtrain, 'train'), (deval, 'eval')]
    return xgb.train(params, dtrain, 1000, watchlist, feval=evalerror, obj=logregobj,
                     early_stopping_rounds=25, evals_result=evals) 
Example 19
Project: kaggle   Author: ldamewood   File: otto.py    MIT License
def save_holdout(cls):
        infile = cls.__data__['train']
        df = pd.read_csv(infile, index_col='id')
        y = df['target']
        idx1, idx2 = next(iter(StratifiedShuffleSplit(y, test_size=0.05, random_state=0)))
        holdout = pd.DataFrame({'id': idx2})
        holdout.to_csv(cls.__data__['holdout'], index=False) 
Example 20
Project: kaggle   Author: ldamewood   File: xgb.py    MIT License
def train(X, y, params, deval, eval_size=0.1):
    if deval is None and eval_size > 0:
        sss = StratifiedShuffleSplit(y, 1, test_size=eval_size, random_state=0)
        train_idx, eval_idx = next(iter(sss))
        X_train, X_eval = X[train_idx], X[eval_idx]
        y_train, y_eval = y[train_idx], y[eval_idx]
        dtrain = xgb.DMatrix(X_train, label=y_train)
        deval = xgb.DMatrix(X_eval, label=y_eval)
    else:
        dtrain = xgb.DMatrix(X, label=y)
    params['validation_set'] = deval
    evals = dict()
    watchlist = [(dtrain, 'train'), (deval, 'eval')]
    return xgb.train(params, dtrain, 10000, watchlist, feval=evalerror,
                     early_stopping_rounds=100, evals_result=evals) 
Example 21
Project: Identificador-Fraude-Enron   Author: luisneto98   File: tester.py    MIT License
def test_classifier(clf, dataset, feature_list, folds = 1000):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predictions.") 
Example 22
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_stratified_shuffle_split_even():
    # Test that StratifiedShuffleSplit draws indices with an
    # equal chance
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for count in idx_counts:
            p = bf.pmf(count)
            assert_true(p > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter,
                                             test_size=1. / n_folds,
                                             random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits = 0
        for train, test in splits:
            n_splits += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for id in ids:
                    counter[id] += 1
        assert_equal(n_splits, n_iter)

        assert_equal(len(train), splits.n_train)
        assert_equal(len(test), splits.n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(splits.n_train + splits.n_test, len(labels))
        assert_equal(len(label_counts), 2)
        ex_test_p = float(splits.n_test) / n_samples
        ex_train_p = float(splits.n_train) / n_samples

        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p) 
Example 23
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0
def test_stratified_shuffle_split_even():
    # Test that StratifiedShuffleSplit draws indices with an
    # equal chance
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for count in idx_counts:
            p = bf.pmf(count)
            assert_true(p > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter,
                                             test_size=1. / n_folds,
                                             random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits = 0
        for train, test in splits:
            n_splits += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for id in ids:
                    counter[id] += 1
        assert_equal(n_splits, n_iter)

        assert_equal(len(train), splits.n_train)
        assert_equal(len(test), splits.n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(splits.n_train + splits.n_test, len(labels))
        assert_equal(len(label_counts), 2)
        ex_test_p = float(splits.n_test) / n_samples
        ex_train_p = float(splits.n_train) / n_samples

        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p) 
Example 24
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py    Apache License 2.0
def test_stratified_shuffle_split_even():
    # Test that StratifiedShuffleSplit draws indices with an
    # equal chance
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for count in idx_counts:
            p = bf.pmf(count)
            assert_true(p > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter,
                                             test_size=1. / n_folds,
                                             random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits = 0
        for train, test in splits:
            n_splits += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for id in ids:
                    counter[id] += 1
        assert_equal(n_splits, n_iter)

        assert_equal(len(train), splits.n_train)
        assert_equal(len(test), splits.n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(splits.n_train + splits.n_test, len(labels))
        assert_equal(len(label_counts), 2)
        ex_test_p = float(splits.n_test) / n_samples
        ex_train_p = float(splits.n_train) / n_samples

        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p) 
Example 25
Project: adni_rs_fmri_analysis   Author: mrahim   File: base_connectivity_classifier.py    GNU General Public License v2.0
def StratifiedSubjectShuffleSplit(dataset, groups, n_iter=100, test_size=.3,
                                  random_state=42):
    """ Stratified ShuffleSplit on subjects
    (train and test size may change depending on the number of acquistions)"""

    idx = set_group_indices(dataset.dx_group)
    groups_idx = np.hstack([idx[group] for group in groups])

    subjects = np.asarray(dataset.subjects)
    subjects = subjects[groups_idx]

    dx = np.asarray(dataset.dx_group)
    dx = dx[groups_idx]

    # extract unique subject ids and dx
    subjects_unique_values, subjects_unique_indices = np.unique(
        subjects, return_index=True)

    # extract indices for the needed groups
    dx_unique_values = dx[subjects_unique_indices]
    y = dx_unique_values

    # generate folds stratified on dx
    sss = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size,
                                 random_state=random_state)
    ssss = []
    for tr, ts in sss:
        # get training subjects
        subjects_tr = subjects_unique_values[tr]

        # get testing subjects
        subjects_ts = subjects_unique_values[ts]

        # get all subject indices
        train = []
        test = []
        for subj in subjects_tr:
            train.extend(np.where(subjects == subj)[0])
        for subj in subjects_ts:
            test.extend(np.where(subjects == subj)[0])

        # append ssss
        ssss.append([train, test])
    return ssss 
Example 26
Project: adni_rs_fmri_analysis   Author: mrahim   File: base_connectivity_classifier.py    GNU General Public License v2.0
def classify(self, dataset=None, groups=['AD', 'MCI'],
                 classifier_name='logreg_l2'):
        """ Returns accuracy scores
        """
        if hasattr(self, 'connectivity'):
            groups_idx = np.hstack([self.idx[group] for group in groups])
            subjects = np.array(dataset.subjects)
            subjects = subjects[groups_idx]
            y = np.hstack([[i] * len(self.idx[group])
                          for i, group in enumerate(groups)])

            X = self.connectivity[groups_idx, :]

            if dataset is None:
                sss = StratifiedShuffleSplit(y, n_iter=self.n_iter,
                                             test_size=.25,
                                             random_state=self.random_state)
            else:
                sss = StratifiedSubjectShuffleSplit(dataset, groups,
                                                n_iter=self.n_iter,
                                                test_size=.3,
                                                random_state=self.random_state)

#                sss = SubjectShuffleSplit(dataset, groups, n_iter=self.n_iter,
#                                          test_size=.2,
#                                          random_state=self.random_state)

            results = classify_connectivity(X, y, sss, classifier_name,
                                            n_jobs=self.n_jobs,
                                            subjects=subjects)

            # materialize results as lists so the np.asarray calls below
            # behave the same under Python 3
            self.y_pred_ = [r['y_pred'] for r in results]
            self.y_dec_ = [r['y_dec'] for r in results]
            self.y_ = [r['y'] for r in results]
            self.coefs_ = [r['coef'] for r in results]
            self.scores_ = [r['score'] for r in results]
            self.subj_ = [r['subj'] for r in results]

            self.results_ = list(map(average_predictions, self.y_, self.y_dec_,
                                     self.subj_))

            self.scores_ = np.asarray(self.scores_)
            self.coefs_ = np.asarray(self.coefs_)
            self.subj_ = np.asarray(self.subj_)
        else:
            raise ValueError('Connectivity not yet computed!')