Python sklearn.cross_validation.ShuffleSplit() Examples

The following code examples show how to use sklearn.cross_validation.ShuffleSplit(). They are drawn from open source Python projects.
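A quick orientation before the examples: in this legacy API (the sklearn.cross_validation module was deprecated in scikit-learn 0.18 and removed in 0.20 in favour of sklearn.model_selection), ShuffleSplit takes the number of samples as its first argument, and the resulting object is itself iterable, yielding (train_index, test_index) pairs. A minimal sketch, assuming a scikit-learn version old enough to still ship the module:

import numpy as np
from sklearn.cross_validation import ShuffleSplit

X = np.arange(20).reshape(10, 2)  # 10 samples, 2 features
y = np.arange(10)

# three random 80/20 train/test splits over the 10 samples
ss = ShuffleSplit(len(X), n_iter=3, test_size=0.2, random_state=0)
for train_index, test_index in ss:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(train_index.shape, test_index.shape)  # (8,) (2,)

The modern equivalent is sklearn.model_selection.ShuffleSplit(n_splits=3, test_size=0.2, random_state=0), iterated via its .split(X) method.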

Example 1
Project: euclid   Author: njpayne   File: regressors.py   License: GNU General Public License v2.0
def run_support_vector_regressor(training_features, training_labels, test_features, test_labels, passed_parameters=None):
    
    estimator = svm.SVR()

    #set up parameters for the regressor
    if passed_parameters is None:
        parameters = {'kernel': ['linear']}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    #set up tuning algorithm
    regressor = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    #fit the regressor
    regressor.fit(training_features, training_labels)

    test_prediction = regressor.predict(test_features)
    test_accuracy = regressor.score(test_features, test_labels)

    time_2 = time.time()

    return test_prediction, test_accuracy 
Example 2
Project: Deopen   Author: kimmo1019   File: Deopen_classification.py   License: MIT License
def data_split(inputfile):
    data = hkl.load(inputfile)
    X = data['mat']
    X_kspec = data['kmer']
    y = data['y']
    rs = ShuffleSplit(len(y), n_iter=1, random_state=1)
    X_kspec = X_kspec.reshape((X_kspec.shape[0],1024,4))
    X = np.concatenate((X,X_kspec), axis = 1)
    X = X[:,np.newaxis]
    X = X.transpose((0,1,3,2))
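    # n_iter=1, so this loop runs exactly once, unpacking the single split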
    for train_idx, test_idx in rs:
        X_train = X[train_idx,:]
        y_train = y[train_idx]
        X_test = X[test_idx,:]
        y_test = y[test_idx]
    X_train = X_train.astype('float32')
    y_train = y_train.astype('int32')
    X_test = X_test.astype('float32')
    y_test = y_test.astype('int32')
    return [X_train, y_train, X_test, y_test]

Example 3
Project: DaD   Author: arunvenk   File: learn_control_demo.py   License: GNU General Public License v3.0
def optimize_learner_dad(learner, X, U, iters, train_size=0.5):
    num_traj = X.shape[2]
    if train_size < 1.0:
        from sklearn import cross_validation
        rs = cross_validation.ShuffleSplit(num_traj, n_iter=1, train_size=train_size, 
                random_state=0, test_size=1.-train_size)
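        # n_iter=1: the loop runs once and its variables stay bound after it
        # exits, which is why the body is just `pass`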
        for train_index, test_index in rs:
            pass
        Xtrain = X[:,:,train_index]; Xtest = X[:,:,test_index]
        Utrain = U[:,:,train_index]; Utest = U[:,:,test_index]
    elif train_size == 1.0:
        Xtrain = X; Xtest = X
        Utrain = U; Utest = U
    else:
        raise Exception('Train size must be in (0,1]')

    dad = DaDControl()
    dad.learn(Xtrain, Utrain, learner, iters, Xtest, Utest, verbose=False)
    print(' DaD (iters:{:d}). Initial Err: {:.4g}, Best: {:.4g}'.format(iters,
        dad.initial_test_err, dad.min_test_error))
    return dad 
Example 4
Project: BuildingMachineLearning   Author: ademyanchuk   File: 02_tuning.py   License: MIT License
def grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__stop_words=[None, "english"],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print(clf)

    return clf 
Example 5
Project: BuildingMachineLearning   Author: ademyanchuk   File: 04_sent.py   License: MIT License
def __grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print(clf)

    return clf 
Example 6
Project: motion-classification   Author: matthiasplappert   File: evaluate_features.py   License: MIT License
def evaluate(X, args):
    enum = ShuffleSplit(len(X), n_iter=args.n_iterations, test_size=args.test_size)
    train_scores = []
    test_scores = []
    for train_index, test_index in enum:
        X_train = [X[idx] for idx in train_index]
        X_test = [X[idx] for idx in test_index]
        X_train, X_test = preprocess_datasets(X_train, X_test, args)
        model = GaussianHMM(n_states=args.n_states, n_training_iterations=args.n_training_iterations,
                            topology=args.topology)
        model.fit(X_train)
        train_scores.extend([model.loglikelihood(X_curr) for X_curr in X_train])
        test_scores.extend([model.loglikelihood(X_curr) for X_curr in X_test])

    train_scores_array = np.array(train_scores)
    train_mean = float(np.mean(train_scores_array))
    train_std = float(np.std(train_scores_array))
    test_scores_array = np.array(test_scores)
    test_mean = float(np.mean(test_scores_array))
    test_std = float(np.std(test_scores_array))
    return train_mean, train_std, test_mean, test_std 
Example 7
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition   Author: PacktPublishing   File: 02_tuning.py   License: MIT License
def grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__stop_words=[None, "english"],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print(clf)

    return clf 
Example 8
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition   Author: PacktPublishing   File: 04_sent.py   License: MIT License
def __grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
                      )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print(clf)

    return clf 
Example 9
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py   License: MIT License
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 10
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py   License: MIT License
def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    iris = load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = cval.ShuffleSplit(X.shape[0], test_size=0.25, random_state=0)
    tr, te = list(cv)[0]

    X_tr, y_tr = cval._safe_split(clf, X, y, tr)
    K_tr, y_tr2 = cval._safe_split(clfp, K, y, tr)
    assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T))

    X_te, y_te = cval._safe_split(clf, X, y, te, tr)
    K_te, y_te2 = cval._safe_split(clfp, K, y, te, tr)
    assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T)) 
Example 11
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py   License: Apache License 2.0
def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 12
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py   License: Apache License 2.0
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 13
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py   License: Apache License 2.0
def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 14
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py   License: Apache License 2.0
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 15
Project: euclid   Author: njpayne   File: regressors.py   License: GNU General Public License v2.0
def run_boosting(training_features, training_labels, test_features, test_labels, passed_parameters = None):
    """
    Fits the data using sklearn's AdaBoost regressor
    AdaBoost does not natively support pruning, so max_depth on the base decision tree is used to simulate it

    Parameters
    ----------
        training_features / training_labels: data used to train the regressor
        test_features / test_labels: data used to test the regressor
        passed_parameters: optional parameter grid for tuning (defaults are used when None)
    
    Returns
    -------
        prediction: predicted labels of the test data
        accuracy: percent of test data labels accurately predicted
    """
    time_1 = time.time()

    #set up underlying decision tree regressor
    base_regressor = tree.DecisionTreeRegressor()

    #set up the boosting method
    estimator = ensemble.AdaBoostRegressor(base_estimator = base_regressor)
    
    #set up parameters for the regressor (use the defaults unless the caller provided a grid)
    if passed_parameters is None:
        passed_parameters = {'base_estimator__max_depth': range(1, 5), 'n_estimators': range(10, 200, 50), 'learning_rate': [1]}

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    #set up tuning algorithm
    regressor = GridSearchCV(estimator=estimator, cv=cv, param_grid=passed_parameters)

    #fit the regressor
    regressor.fit(training_features, training_labels)

    #get the prediction and accuracy of the test set
    test_prediction = regressor.predict(test_features)
    test_accuracy = regressor.score(test_features, test_labels)

    return test_prediction, test_accuracy 
Example 16
Project: euclid   Author: njpayne   File: regressors.py   License: GNU General Public License v2.0
def run_random_forest(training_features, training_labels, test_features, test_labels, passed_parameters=None):

    estimator = ensemble.RandomForestRegressor(random_state=0, n_estimators=25)

    #set up parameters for the regressor (grid values must be lists)
    if passed_parameters is None:
        parameters = {'max_depth': [None]}
    else:
        parameters = passed_parameters

    #create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    #set up tuning algorithm
    regressor = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    #fit the regressor
    regressor.fit(training_features, training_labels)

    test_prediction = regressor.predict(test_features)
    test_accuracy = regressor.score(test_features, test_labels)

    time_2 = time.time()

    return test_prediction, test_accuracy 
Example 17
Project: AlzheTect   Author: raidel123   File: mlearning.py   License: Apache License 2.0
def random_forest_regressor(src=r"../train/TADPOLE_train_MCI.csv"):
    model_data = GetModelDataCSV(src)
    split_classes = SplitClassDataCN(indata=model_data, file=False)
    tdata = TransformData(split_classes)

    X = np.array(tdata.drop(['DXCHANGE'], 1))
    Y = np.array(tdata['DXCHANGE'])
    Y = np.array([Resulbinarizer(label) for label in Y])

    X = preprocessing.scale(X)

    names = list(tdata.drop(['DXCHANGE'], 1).columns.values)

    rf = RandomForestClassifier(n_estimators=500, max_features=20, n_jobs=-1, verbose=1)
    scores = defaultdict(list)

    #crossvalidate the scores on a number of different random splits of the data
    for train_idx, test_idx in ShuffleSplit(len(X), n_iter=100, test_size=.3):
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        r = rf.fit(X_train, Y_train)
        acc = accuracy_score(Y_test, rf.predict(X_test))
        for i in range(X.shape[1]):
            X_t = X_test.copy()
            np.random.shuffle(X_t[:, i])
            shuff_acc = accuracy_score(Y_test, rf.predict(X_t))
            scores[names[i]].append((acc-shuff_acc)/acc)
    print "Features sorted by their score:"
    for item in sorted([(round(np.mean(score), 8), feat) for feat, score in scores.items()], reverse=True)[:20]:
        print item

    json.dump(scores, open('../trained_model/random_forest/rfr_scores2.json', 'w')) 
Example 18
Project: rf_select   Author: dwpsutton   File: rf_select.py   License: GNU General Public License v3.0
def fit(self, X, y):
        '''
            This function actually calculates the importances and accuracy metric
            using cross validation.
            Usage:
              imp,acc = fit(X,y)
            Arguments:
              X: feature vector, numpy array
              y: label vector, numpy array
            Return values:
              imp: feature importance vector
              acc: estimator accuracy metric
        '''
        scores = defaultdict(list) # Any unknown element is automatically a list
        rf = copy.deepcopy(self.clf)
        #
        #crossvalidate the scores on a number of different random splits of the data
        outAcc = 0.
        for train_idx, test_idx in ShuffleSplit(len(X), n_iter=self.nCV, test_size=.3):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            rf.fit(X_train, y_train)
            # Get accuracy metric (self.metric assumed to be an instance attribute)
            if self.metric is None:
                outAcc = None
            elif self.metric == 'OOB':
                outAcc += rf.oob_score_
            elif self.metric == 'AUC':
                outAcc += sklearn.metrics.roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
            # collect one importance value per feature for this split
            if self.algorithm == 'gini':
                imp = self.giniImportance(rf, X_test, y_test)
            elif self.algorithm == 'permutation':
                imp = self.permutationImportance(rf, X_test, y_test)
            elif self.algorithm == 'conditional':
                imp = self.conditionalPermutationImportance(rf, X_test, y_test)
            for i in range(X.shape[1]):
                scores[i].append(imp[i])
        #
        # Return mean importance and metric
        importances = np.array([np.mean(scores[i]) for i in range(X.shape[1])])
        return importances, outAcc / float(self.nCV) 
Example 19
Project: Deopen   Author: kimmo1019   File: Deopen_regression.py   License: MIT License
def data_split(inputfile,reads_count):
    data = hkl.load(inputfile)
    reads_count = hkl.load(reads_count)
    X = data['mat']
    X_kspec = data['kmer']
    reads_count = np.array(reads_count)
    y = np.mean(reads_count, axis = 1)
    y = np.log(y+1e-3)
    rs = ShuffleSplit(len(y), n_iter=1, random_state=1)
    X_kspec = X_kspec.reshape((X_kspec.shape[0],1024,4))
    X = np.concatenate((X,X_kspec), axis = 1)
    X = X[:,np.newaxis]
    X = X.transpose((0,1,3,2))
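    # n_iter=1, so this loop runs exactly once, unpacking the single split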
    for train_idx, test_idx in rs:
        X_train = X[train_idx,:]
        y_train = y[train_idx]
        X_test = X[test_idx,:]
        y_test = y[test_idx]
    X_train = X_train.astype('float32')
    y_train = y_train.astype('float32')
    X_test = X_test.astype('float32')
    y_test = y_test.astype('float32')
    print('Data preparation done!')
    return [X_train, y_train, X_test, y_test]


Example 20
Project: deepjets   Author: deepjets   File: utils.py   License: BSD 3-Clause "New" or "Revised" License
def load_images(image_h5_file, n_images=-1, shuffle_seed=1):
    """Load images and auxiliary data from h5 file.

    Args:
        image_h5_file: location of h5 file containing images.
        n_images: number of images to load, -1 loads all.
        shuffle_seed: random seed used when sub-sampling images.
    Returns:
        images: array of image arrays.
        auxvars: array of auxiliary variables.
    TODO: add support for multiple classes.
    """
    with h5py.File(image_h5_file, 'r') as h5file:
        images = h5file['images']
        auxvars = h5file['auxvars']
        if n_images < 0:
            n_images = len(images)
        elif n_images > len(images):
            print("Cannot load {0} images. Only {1} images in {2}".format(
                n_images, len(images), image_h5_file))
            n_images = len(images)
        if n_images < len(images):
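            # ShuffleSplit doubles as a random sampler here: the "test" side
            # of a single split selects n_images random indices to keep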
            rs = cross_validation.ShuffleSplit(
                len(images), n_iter=1, test_size=n_images,
                random_state=shuffle_seed)
            for train, test in rs:
                keep = test
            images = np.take(images, keep, axis=0)
            auxvars = np.take(auxvars, keep, axis=0)
        else:
            images = h5file['images'][:]
            auxvars = h5file['auxvars'][:]
    return images, auxvars 
Example 21
Project: kaggle-right-whale   Author: felixlaumon   File: __init__.py   License: MIT License
def __call__(self, X, y, net):
        if self.eval_size is not None:
            if net.regression or not self.stratify:
                # test_size = self.eval_size
                # kf = ShuffleSplit(
                #     y.shape[0], test_size=test_size,
                #     random_state=self.random_state
                # )
                # train_indices, valid_indices = next(iter(kf))
                # valid_indices = shuffle(valid_indices)
                test_size = 1 - self.eval_size
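                # test_size here is the *training* fraction, so the larger
                # "test" chunk goes to training and the smaller "train" chunk
                # to validation via the swapped unpacking below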
                kf = ShuffleSplit(
                    y.shape[0], test_size=test_size,
                    random_state=self.random_state
                )
                valid_indices, train_indices = next(iter(kf))
            else:
                n_folds = int(round(1 / self.eval_size))
                kf = StratifiedKFold(y, n_folds=n_folds, random_state=self.random_state)
                train_indices, valid_indices = next(iter(kf))

            X_train, y_train = X[train_indices], y[train_indices]
            X_valid, y_valid = X[valid_indices], y[valid_indices]
        else:
            X_train, y_train = X, y
            X_valid, y_valid = X[len(X):], y[len(y):]

        return X_train, X_valid, y_train, y_valid 
Example 22
Project: kaggle-right-whale   Author: felixlaumon   File: train_model.py   License: MIT License
def train_test_split(X, y, test_size=0.25, random_state=42, stratify=True):
    if stratify:
        n_folds = int(round(1 / test_size))
        sss = StratifiedKFold(y, n_folds=n_folds, random_state=random_state)
    else:
        sss = ShuffleSplit(len(y), test_size=test_size, random_state=random_state)
    train_idx, test_idx = next(iter(sss))
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx] 
Example 23
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py   License: MIT License
def test_shuffle_split():
    ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
    ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
    ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
    for typ in six.integer_types:
        ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0)
    for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
        assert_array_equal(t1[0], t2[0])
        assert_array_equal(t2[0], t3[0])
        assert_array_equal(t3[0], t4[0])
        assert_array_equal(t1[1], t2[1])
        assert_array_equal(t2[1], t3[1])
        assert_array_equal(t3[1], t4[1]) 
Example 24
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py   License: MIT License
def test_cross_val_generator_with_mask():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = assert_warns(DeprecationWarning, cval.LeaveOneOut,
                       4, indices=False)
    lpo = assert_warns(DeprecationWarning, cval.LeavePOut,
                       4, 2, indices=False)
    kf = assert_warns(DeprecationWarning, cval.KFold,
                      4, 2, indices=False)
    skf = assert_warns(DeprecationWarning, cval.StratifiedKFold,
                       y, 2, indices=False)
    lolo = assert_warns(DeprecationWarning, cval.LeaveOneLabelOut,
                        labels, indices=False)
    lopo = assert_warns(DeprecationWarning, cval.LeavePLabelOut,
                        labels, 2, indices=False)
    ss = assert_warns(DeprecationWarning, cval.ShuffleSplit,
                      4, indices=False)
    ps = assert_warns(DeprecationWarning, cval.PredefinedSplit, [1, 1, 2, 2],
                      indices=False)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_equal(np.asarray(train).dtype.kind, 'b')
            assert_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 25
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py   License: MIT License
def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = assert_warns(DeprecationWarning, cval.LeaveOneOut,
                       4, indices=True)
    lpo = assert_warns(DeprecationWarning, cval.LeavePOut,
                       4, 2, indices=True)
    kf = assert_warns(DeprecationWarning, cval.KFold,
                      4, 2, indices=True)
    skf = assert_warns(DeprecationWarning, cval.StratifiedKFold,
                       y, 2, indices=True)
    lolo = assert_warns(DeprecationWarning, cval.LeaveOneLabelOut,
                        labels, indices=True)
    lopo = assert_warns(DeprecationWarning, cval.LeavePLabelOut,
                        labels, 2, indices=True)
    ps = assert_warns(DeprecationWarning, cval.PredefinedSplit,
                      [1, 1, 2, 2], indices=True)
    # Bootstrap as a cross-validation is deprecated
    b = assert_warns(DeprecationWarning, cval.Bootstrap, 2)
    ss = assert_warns(DeprecationWarning, cval.ShuffleSplit,
                      2, indices=True)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 26
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py   License: MIT License
def test_shufflesplit_errors():
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=2.0)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=1.0)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=0.1,
                  train_size=0.95)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=11)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3)
    assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None,
                  train_size=None) 
Example 27
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py   License: Apache License 2.0
def test_shuffle_split():
    ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
    ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
    ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
    for typ in six.integer_types:
        ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0)
    for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
        assert_array_equal(t1[0], t2[0])
        assert_array_equal(t2[0], t3[0])
        assert_array_equal(t3[0], t4[0])
        assert_array_equal(t1[1], t2[1])
        assert_array_equal(t2[1], t3[1])
        assert_array_equal(t3[1], t4[1]) 
Example 28
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py   License: Apache License 2.0
def test_shufflesplit_errors():
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=2.0)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=1.0)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=0.1,
                  train_size=0.95)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=11)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3)
    assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None,
                  train_size=None) 
Example 29
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py   License: Apache License 2.0
def test_shufflesplit_reproducible():
    # Check that iterating twice on the ShuffleSplit gives the same
    # sequence of train-test when the random_state is given
    ss = cval.ShuffleSplit(10, random_state=21)
    assert_array_equal(list(a for a, b in ss), list(a for a, b in ss)) 
Example 30
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py   License: Apache License 2.0
def test_shuffle_split():
    ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0)
    ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0)
    ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0)
    for typ in six.integer_types:
        ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0)
    for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4):
        assert_array_equal(t1[0], t2[0])
        assert_array_equal(t2[0], t3[0])
        assert_array_equal(t3[0], t4[0])
        assert_array_equal(t1[1], t2[1])
        assert_array_equal(t2[1], t3[1])
        assert_array_equal(t3[1], t4[1]) 
Example 31
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py   License: Apache License 2.0
def test_shufflesplit_errors():
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=2.0)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=1.0)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=0.1,
                  train_size=0.95)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=11)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3)
    assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j)
    assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None,
                  train_size=None) 
Example 32
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_cross_validation.py   License: Apache License 2.0
def test_shufflesplit_reproducible():
    # Check that iterating twice on the ShuffleSplit gives the same
    # sequence of train-test when the random_state is given
    ss = cval.ShuffleSplit(10, random_state=21)
    assert_array_equal(list(a for a, b in ss), list(a for a, b in ss)) 
Example 33
Project: digit-ocr   Author: Nozdi   File: train.py   License: MIT License
def cv(model, X, y, n_iter=5, test_size=0.3):
    split = cross_validation.ShuffleSplit(
        len(X), n_iter=n_iter, test_size=test_size,
    )
    return cross_validation.cross_val_score(model, X, y, cv=split,
                                            scoring='accuracy', n_jobs=-1) 
Example 34
Project: tdparse   Author: bluemonk482   File: liblinear.py   License: MIT License
def CV(ci, trfile, CV_trfile, CV_tfile, CV_pfile, CV_truey, id_train):
    """ Cross validation over training data, for parameter optimisation.
    Returns accuracy, 3-class f1 and 2-class f1 scores for each iteration.
    """
    feats = readfeats(trfile)
    # ids = readfeats(id_train)
    # cv = LabelKFold(ids, n_folds=5)
    cv = ShuffleSplit(n=len(feats), n_iter=5, test_size=0.2, random_state=0)
    acc_list = []
    f1_three_list = []
    f1_two_list = []
    count = 0
    for train_index, test_index in cv:
        count+=1
        cv_trfile = CV_trfile+str(count)
        cv_tfile = CV_tfile+str(count)
        cv_pfile = CV_pfile+str(count)
        cv_truey = CV_truey+str(count)
        X_train = feats[train_index]
        X_test = feats[test_index]
        y_test = getlabels(X_test)
        writingfile(cv_trfile, X_train)
        writingfile(cv_tfile, X_test)
        writingfile(cv_truey, y_test)   
        model="../models/cv/"+cv_trfile.split('/')[-1]+".model"     
        traincmd=["../liblinear/train", "-c", "0.001", "-q", cv_trfile, model]
        traincmd[2]=ci
        subprocess.call(traincmd)
        predcmd=["../liblinear/predict", cv_tfile, model, cv_pfile]
        p = subprocess.Popen(predcmd, stdout=subprocess.PIPE)
        output, err = p.communicate()
        y_test, y_predicted = feval(cv_truey, cv_pfile)
        acc_list.append(metrics.accuracy_score(y_test, y_predicted))
        f1_three_list.append(metrics.f1_score(y_test, y_predicted, average='macro'))
        f1_two_list.append((metrics.f1_score(y_test, y_predicted, average=None)[0]+metrics.f1_score(y_test, y_predicted, average=None)[-1])/2)
    f1_three = np.mean(np.asarray(f1_three_list))
    f1_two = np.mean(np.asarray(f1_two_list))
    acc = np.mean(np.asarray(acc_list))
    print "When C=%s, acc is %f, 2-class-f1 is %f and 3-class-f1 is %f"%(ci, acc, f1_two, f1_three)
    return [acc, f1_three, f1_two] 
Example 35
Project: nuclai15   Author: aigamedev   File: train.py   License: GNU General Public License v3.0
def main():
    # get the processed data
    X,y = preprocess_data()

    # get the dummy clf: Very important, it creates a baseline!
    dummy_clf = get_dummy_clf()
    dummy_clf.fit(X, y)
    y_hat = dummy_clf.predict(X)

    # Get the baseline predictions for x and y
    print "Dummy MSE x", mse(y[:,0], y_hat[:,0])
    print "Dummy MSE y", mse(y[:,1], y_hat[:,1])

    # create 5 different crossvalidation folds
    ss = ShuffleSplit(len(y), n_iter=5, random_state=0)

    scores_x = []
    scores_y = []
    for i, (train_index, test_index) in enumerate(ss):
        # Choose a classifier
        #clf = get_linear_clf()
        clf = get_nn_clf()
        clf.fit(X[train_index], y[train_index])
        y_hat = clf.predict(X[test_index])

        # Save the score for each fold
        score_x = mse(y[test_index,0], y_hat[:,0])
        score_y = mse(y[test_index,1], y_hat[:,1])


        # You can print the coefficients/intercept for the linear classifier
        #print clf.steps[-1][1].coef_,clf.steps[-1][1].intercept_

        scores_x.append(score_x)
        scores_y.append(score_y)
        print(scores_x, scores_y)


    print "MSE CV x", np.array(scores_x).mean()
    print "MSE CV y", np.array(scores_y).mean() 
Example 36
Project: edaHelper   Author: gibsondanield   File: edaHelper.py   License: BSD 3-Clause "New" or "Revised" License
def train_test_split(self, n_xval_folds=5, holdout=0):
        '''Splits data into test and training sets. Holdout is the proportion of data to be kept in the holdout set'''
        self.log.append('test train split')
        self.n_xval_folds = n_xval_folds
        self.holdout = holdout
        if holdout:  # use indices for this
            # unfinished stub: the old ShuffleSplit API also needs the
            # sample count as its first argument before this can run
            ss = ShuffleSplit()
            self.holdout_indices = None
#        self.df_train, self.dfholdout,self.y_train,self.y_holdout= train_test_split(self.df,self.df[y],train_size=holdout)
        if n_xval_folds:
            pass 
Example 37
Project: adni_rs_fmri_analysis   Author: mrahim   File: base_connectivity_classifier.py   License: GNU General Public License v2.0
def SubjectShuffleSplit(dataset, groups, n_iter=100,
                        test_size=.3, random_state=42):
    """ Specific ShuffleSplit (train on all subject images,
    but test only on one image of the remaining subjects)"""

    idx = set_group_indices(dataset.dx_group)
    groups_idx = np.hstack([idx[group] for group in groups])

    subjects = np.asarray(dataset.subjects)
    subjects = subjects[groups_idx]
    subjects_unique = np.unique(subjects)

    n = len(subjects_unique)
    ss = ShuffleSplit(n, n_iter=n_iter,
                      test_size=test_size, random_state=random_state)

    subj_ss = []
    for train, test in ss:
        train_set = np.array([], dtype=int)
        for subj in subjects_unique[train]:
            subj_ind = np.where(subjects == subj)
            train_set = np.concatenate((train_set, subj_ind[0]))
        test_set = np.array([], dtype=int)
        for subj in subjects_unique[test]:
            subj_ind = np.where(subjects == subj)
            test_set = np.concatenate((test_set, subj_ind[0]))
        subj_ss.append([train_set, test_set])
    return subj_ss 
Example 38
Project: TextCategorization   Author: Y-oHr-N   File: document_filter.py   License: MIT License
def __grid_search_model(self, clf_factory, documents, labels, pos_label):
        boolndarr        = labels.values == pos_label
        n                = documents.size
        n_pos            = labels[boolndarr].size
        n_neg            = n - n_pos

        param_grid       = {
            'vect__binary'      : [False, True],
            'vect__min_df'      : [1, 2],
            'vect__ngram_range' : [(1, 1), (1, 2), (1, 3)],
            'vect__smooth_idf'  : [False, True],
            'vect__stop_words'  : [None, 'english'],
            'vect__sublinear_tf': [False, True],
            'vect__use_idf'     : [False, True],
            'clf__alpha'        : [0, 0.01, 0.05, 0.1, 0.5, 1]
        }

        k                = 5
        cv               = ShuffleSplit(
            n,
            n_iter       = k,
            test_size    = 1.0 / k,
            random_state = 0
        )

        pos_weight       = n_neg / float(n_pos)  # up-weight the minority (positive) class
        sample_weight    = np.ones(n)
        sample_weight[boolndarr] *= pos_weight
        fit_params       = {'clf__sample_weight': sample_weight}

        f1_scorer        = make_scorer(f1_score, pos_label=pos_label)

        grid_search      = GridSearchCV(
            clf_factory,
            param_grid,
            cv           = cv,
            fit_params   = fit_params,
            n_jobs       = -1,
            scoring      = f1_scorer
        )

        grid_search.fit(documents, labels)
        best_estimator   = grid_search.best_estimator_
        best_score       = grid_search.best_score_
        best_params      = grid_search.best_params_

        print("Best F1 score: {0:04.3f}".format(best_score))
        print("Parameters: {0}".format(best_params))

        return best_estimator 
Example 39
Project: BuildingMachineLearning   Author: ademyanchuk   File: 02_tuning.py   License: MIT License
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors) 
Example 40
Project: BuildingMachineLearning   Author: ademyanchuk   File: 03_clean.py   License: MIT License
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    clfs = []  # just to later get the median

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors) 
Example 41
Project: BuildingMachineLearning   Author: ademyanchuk   File: 04_sent.py   License: MIT License
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    clfs = []  # just to later get the median

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors) 
Example 42
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition   Author: PacktPublishing   File: 02_tuning.py   License: MIT License
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors) 
Example 43
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition   Author: PacktPublishing   File: 03_clean.py   License: MIT License
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    clfs = []  # just to later get the median

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors) 
Example 44
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition   Author: PacktPublishing   File: 04_sent.py   License: MIT License
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    clfs = []  # just to later get the median

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors) 
Example 45
Project: mmltoolkit   Author: delton137   File: feature_importance.py   License: MIT License
def mean_decrease_accuracy(x, y, feature_names, model=RandomForestRegressor(n_estimators = 50),
                          num_to_print='all', print_latex=True):
    """
        Feature importance by mean decrease accuracy.
        Proposed for random forests in Breiman, L. Machine Learning (2001) 45: 5. https://doi.org/10.1023/A:1010933404324
        Measures the decrease in accuracy when a given feature is randomly permuted in the dataset.
        If the decrease is low, the feature is not important, and vice versa.
        The importances are reported as fractional (percentage) decreases in accuracy after permutation.
        Required arguments:
            x : data matrix (number samples x number of features), NumPy array
            y : target property vector, as NumPy array
            feature_names : list of feature names
        Optional arguments:
            model : a scikit-learn model, if none is supplied random forest is used.
        returns:
            sorted_feature_names : sorted list of feature names, most to least important
            sorted_importances : the importances  sorted by absolute value
    """
    import numpy as np
    from sklearn.cross_validation import ShuffleSplit
    from sklearn.metrics import r2_score
    from collections import defaultdict

    model = model.fit(x, y)

    scores = defaultdict(list)

    #crossvalidate the scores on a number of different random splits of the data
    for train_idx, test_idx in ShuffleSplit(len(x), n_iter=100, test_size=.3):
        X_train, X_test = x[train_idx], x[test_idx]
        Y_train, Y_test = y[train_idx], y[test_idx]
        model.fit(X_train, Y_train)
        acc = r2_score(Y_test, model.predict(X_test))
        for i in range(x.shape[1]):
            X_t = X_test.copy()
            np.random.shuffle(X_t[:, i])
            shuff_acc = r2_score(Y_test, model.predict(X_t))
            scores[feature_names[i]].append((acc - shuff_acc) / acc)
    print(sorted([(round(np.mean(score), 4), feat) for
                  feat, score in scores.items()], reverse=True))

    #average the per-split scores and sort by importance
    f_imps = {feat: np.mean(score) for feat, score in scores.items()}
    sorted_feature_names = sorted(f_imps, key=f_imps.__getitem__, reverse=True)
    sorted_values = [f_imps[name] for name in sorted_feature_names]

    if print_latex: fi_print_latex(sorted_feature_names, sorted_values, name="random forest mean decrease accuracy")

    return sorted_feature_names, sorted_values

#------------------------------------------------------------------------------------------ 
Example 46
Project: mmltoolkit   Author: delton137   File: feature_importance.py   License: MIT License
def shuffle_importance(x, y, feature_names, model=RandomForestRegressor(n_estimators = 50),
                          num_to_print='all', print_latex=False):
    """
        Feature importance by looking at accuracy drop after shuffling
        Required arguments:
            x : data matrix (number samples x number of features), NumPy array
            y : target property vector, as NumPy array
            feature_names : list of feature names
        Optional arguments:
            model : a scikit-learn model, if none is supplied random forest is used.
        returns:
            sorted_feature_names : sorted list of feature names, most to least important
            sorted_importances : the importances  sorted by absolute value
    """

    model = model.fit(x, y)

    scores = defaultdict(list)

    num_features = x.shape[1]

    #cross validate the scores on a number of different random splits of the data
    for train_idx, test_idx in ShuffleSplit(len(x), n_iter=10, test_size=.4):
        X_train, X_test = x[train_idx], x[test_idx]
        Y_train, Y_test = y[train_idx], y[test_idx]
        model = model.fit(X_train, Y_train)
        MAE = mean_absolute_error(Y_test, model.predict(X_test))
        for i in range(num_features):
            X_t = X_test.copy()
            np.random.shuffle(X_t[:, i])
            scores[feature_names[i]] += [mean_absolute_error(Y_test,model.predict(X_t))/MAE]

    #create dictionary
    f_imps = {}
    f_imps_abs = {}

    for i in range(num_features):
        this_avg_score = np.mean(scores[feature_names[i]])
        f_imps[feature_names[i]] = this_avg_score
        f_imps_abs[feature_names[i]] = np.abs(this_avg_score)

    #sort dictionary by absolute value of coefficient
    sorted_feature_names = sorted(f_imps_abs, key=f_imps_abs.__getitem__, reverse=True)
    sorted_values = [f_imps[name] for name in sorted_feature_names]

    if (print_latex): fi_print_latex(sorted_feature_names, sorted_values, name="shuffle feature importance analysis")

    return sorted_feature_names, sorted_values

#------------------------------------------------------------------------------------------ 
Example 47
Project: adni_rs_fmri_analysis   Author: mrahim   File: base_connectivity_classifier.py   License: GNU General Public License v2.0
def StratifiedSubjectShuffleSplit(dataset, groups, n_iter=100, test_size=.3,
                                  random_state=42):
    """ Stratified ShuffleSplit on subjects
    (train and test size may change depending on the number of acquisitions)"""

    idx = set_group_indices(dataset.dx_group)
    groups_idx = np.hstack([idx[group] for group in groups])

    subjects = np.asarray(dataset.subjects)
    subjects = subjects[groups_idx]

    dx = np.asarray(dataset.dx_group)
    dx = dx[groups_idx]

    # extract unique subject ids and dx
    subjects_unique_values, \
    subjects_unique_indices = np.unique(subjects, return_index=True)

    # extract indices for the needed groups
    dx_unique_values = dx[subjects_unique_indices]
    y = dx_unique_values

    # generate folds stratified on dx
    sss = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size,
                                 random_state=random_state)
    ssss = []
    for tr, ts in sss:
        # get training subjects
        subjects_tr = subjects_unique_values[tr]

        # get testing subjects
        subjects_ts = subjects_unique_values[ts]

        # get all subject indices
        train = []
        test = []
        for subj in subjects_tr:
            train.extend(np.where(subjects == subj)[0])
        for subj in subjects_ts:
            test.extend(np.where(subjects == subj)[0])

        # append ssss
        ssss.append([train, test])
    return ssss