Python sklearn.cross_validation.KFold() Examples

The following code examples show how to use sklearn.cross_validation.KFold(). They are taken from open source Python projects. Note that the sklearn.cross_validation module was deprecated in scikit-learn 0.18 in favor of sklearn.model_selection and removed entirely in 0.20, so these examples require an older scikit-learn release.

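All of these snippets use the old iterator-style API, where KFold receives the number of samples and the object itself is iterated. For orientation, here is a minimal sketch on toy data (the arrays are illustrative only) showing that API next to its sklearn.model_selection replacement; the first import only works on scikit-learn releases older than 0.20:

import numpy as np

X = np.arange(20).reshape(10, 2)   # 10 samples, 2 features
y = np.arange(10)

# Old API (scikit-learn < 0.18, removed in 0.20): KFold takes the number
# of samples and is itself iterable, yielding (train_index, test_index).
from sklearn.cross_validation import KFold
kf = KFold(10, n_folds=5, shuffle=True, random_state=0)
for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Replacement API (scikit-learn >= 0.18): KFold takes n_splits instead of
# the sample count, and the data is passed to split(), which yields the
# same kind of index pairs.
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
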
Example 1
Project: design_embeddings_jmd_2016   Author: IDEALLab   File: hp_kpca.py    MIT License
def cross_validate(gamma, alpha, X, n_folds, n_components):

    # K-fold cross-validation
    kf = KFold(X.shape[0], n_folds=n_folds, shuffle=True)
    i = 1
    loss = 0
    
    for train, test in kf:
        train = train.tolist()
        test = test.tolist()
        
        print 'cross validation: %d' % i
        i += 1
        
        if len(train)>10 and len(test): # if there are enough training and test samples
            # Get cost
            loss += kpca(X, n_components, train, test, kernel='rbf', gamma=gamma, alpha=alpha, evaluation=True)
                                  
        else:
            print 'Please add more samples!'
            
    # Get test reconstruction error
    rec_err_cv = loss/n_folds

    return rec_err_cv 
Example 2
Project: JAABF   Author: drr3d   File: estimator.py    GNU General Public License v3.0
def crossValScore(self, X, y, X_Test, y_test):
        ## Build folds over the test samples for cross-validation
        x_test = self.__vectorizers.transform(X_Test)
        print("num of test_samples: %d, num of test_features: %d \n" % x_test.shape)

        kfold = KFold(n=x_test.shape[0], n_folds=4, shuffle=True, random_state=337)  # n is the number of samples, not features
            
        X_shuf, Y_shuf = shuffle(X_Test, y_test)
        cross_val_score = cross_validation.cross_val_score(self.models_, X_shuf, \
                                                            Y_shuf, cv=kfold, \
                                                            scoring='accuracy', n_jobs=-1  # -1 = use all cores = faster
                                                            )
        print("\ncross_validation:\n",cross_val_score , "\n")
        print("Baseline: %.2f%% (%.2f%%)" % (cross_val_score.mean()*100, cross_val_score.std()*100)) 
Example 3
Project: jr-tools   Author: kingjr   File: network.py    BSD 2-Clause "Simplified" License
def quick_score(X, y, clf=None, scorer=None):
    from mne.decoding import GeneralizationAcrossTime
    from sklearn.cross_validation import KFold
    regression = (len(np.unique(y)) > 2) & isinstance(y[0], float)
    if scorer is None:
        scorer = scorer_spearman if regression else scorer_auc
    if clf is None:
        clf = RidgeCV(alphas=[(2 * C) ** -1 for C in [1e-4, 1e-2, 1]])\
            if regression else force_predict(LogisticRegression(), axis=1)
    sel = np.where(~np.isnan(y))[0]
    X = X[sel, :, :]
    y = y[sel]
    epochs = mat2mne(X, sfreq=100)
    clf = make_pipeline(StandardScaler(), clf)
    cv = KFold(len(y), 5) if regression else None
    gat = GeneralizationAcrossTime(clf=clf, n_jobs=-1, scorer=scorer, cv=cv)
    gat.fit(epochs, y)
    gat.score(epochs, y)
    return gat

Example 4
Project: data-science-projects   Author: harshilshah4251   File: main.py    MIT License
def classificationModel(model, data, predictors, outcome):
    #Accuracy
    print(model.__str__())
    model.fit(data[predictors], data[outcome])
    predictions=model.predict(data[predictors])
    #print(predictions)
    accuracy=metrics.accuracy_score(predictions, data[outcome])
    print("Accuracy : {:.3%} ".format(accuracy))
    #KFOLD CrossValidation
    kf=KFold(data.shape[0], n_folds=5)
    error=[]
    for train, test in kf:
        train_predictors=data[predictors].iloc[train,:]
        train_target=data[outcome].iloc[train]
        model.fit(train_predictors, train_target)
        error.append(model.score(data[predictors].iloc[test,:], data[outcome].iloc[test]))
    print("Cross Validation Score : {:.3%} \n".format(np.mean(error)))

    model.fit(data[predictors], data[outcome]) 
Example 5
Project: Automated-Essay-Grading   Author: vatika   File: classifiers.py    GNU General Public License v2.0
def execute(self):
        kf = KFold(len(self.x_train), n_folds=self.k_cross)
        own_kappa = []
        for train_idx, test_idx in kf:
            x_train, x_test = self.x_train[train_idx], self.x_train[test_idx]
            y_train, y_test = self.y_train[train_idx], self.y_train[test_idx]
            dim_red = LDA()
            x_train = dim_red.fit_transform(x_train, y_train)
            x_test = dim_red.transform(x_test)
            stat_obj = self.stat_class()  # instantiate the scoring model via reflection
            stat_obj.train(x_train,y_train)
            y_pred = [ 0 for i in xrange(len(y_test)) ]
            for i in range(len(x_test)):
                val = int(np.round(stat_obj.predict(x_test[i])))
                if val > self.range_max: val = self.range_max
                if val < self.range_min: val = self.range_min
                y_pred[i] = [val]
            y_pred = np.matrix(y_pred)
            cohen_kappa_rating = own_wp.quadratic_weighted_kappa(y_test,y_pred,self.range_min,self.range_max)
            self.values.append(cohen_kappa_rating)
        return str(sum(self.values)/self.k_cross) 
Example 6
Project: cage   Author: bm2-lab   File: lasso_selector.py    MIT License
def LassoSelector(x, y, cv, njob):
    cor_score = lambda x, y: pearsonr(x, y)[0]
    
    lr = linear_model.LinearRegression(n_jobs=njob)
    skf = KFold(len(y), n_folds=cv)
    model = linear_model.LassoLarsCV(fit_intercept=False, cv=cv, n_jobs=njob)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        model.fit(x, y)
    columns = np.arange(x.shape[1])[model.coef_ != 0]
    
    mdl_eval = lambda func: lambda idx_tr, idx_te: func(y[idx_te], lr.fit(x[idx_tr][:,columns], y[idx_tr]).predict(x[idx_te][:,columns]))
    res_eval = lambda func: np.average(map(mdl_eval(func), *zip(*[(idx_tr, idx_te) for idx_tr, idx_te in skf])))

    l1r2 = res_eval(r2_score)
    l1cor = res_eval(cor_score)

    lr.fit(x[:,columns], y)
        
    return Md(model=lr, idx=columns, cor=l1cor, r2=l1r2) 
Example 7
Project: aueb.twitter.sentiment   Author: nlpaueb   File: utilities.py    GNU General Public License v3.0
def getConfidenceScores(features_train, labels_train, C):
    train_confidence = []
    #confidence scores for training data are computed using K-fold cross validation
    kfold = KFold(features_train.shape[0], n_folds=10)

    for train_index,test_index in kfold:
        X_train, X_test = features_train[train_index], features_train[test_index]
        y_train, y_test = labels_train[train_index], labels_train[test_index]

        #train classifier for the subset of train data
        m = SVM.train(X_train,y_train,c=C,k="linear")

        #predict confidence for test data and append it to list
        conf = m.decision_function(X_test)
        for x in conf:
                train_confidence.append(x)

    return np.array(train_confidence)
Example 8
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_kfold_no_shuffle():
    # Manually check that KFold preserves the data ordering on toy datasets
    splits = iter(cval.KFold(4, 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 1])
    assert_array_equal(train, [2, 3])

    train, test = next(splits)
    assert_array_equal(test, [2, 3])
    assert_array_equal(train, [0, 1])

    splits = iter(cval.KFold(5, 2))
    train, test = next(splits)
    assert_array_equal(test, [0, 1, 2])
    assert_array_equal(train, [3, 4])

    train, test = next(splits)
    assert_array_equal(test, [3, 4])
    assert_array_equal(train, [0, 1, 2]) 
Example 9
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_shuffle_kfold():
    # Check the indices are shuffled properly, and that all indices are
    # returned in the different test folds
    kf = cval.KFold(300, 3, shuffle=True, random_state=0)
    ind = np.arange(300)

    all_folds = None
    for train, test in kf:
        sorted_array = np.arange(100)
        assert_true(np.any(sorted_array != ind[train]))
        sorted_array = np.arange(101, 200)
        assert_true(np.any(sorted_array != ind[train]))
        sorted_array = np.arange(201, 300)
        assert_true(np.any(sorted_array != ind[train]))
        if all_folds is None:
            all_folds = ind[test].copy()
        else:
            all_folds = np.concatenate((all_folds, ind[test]))

    all_folds.sort()
    assert_array_equal(all_folds, ind) 
Example 10
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by KFold.
    folds = -1 * np.ones(10)
    kf_train = []
    kf_test = []
    for i, (train_ind, test_ind) in enumerate(cval.KFold(10, 5, shuffle=True)):
        kf_train.append(train_ind)
        kf_test.append(test_ind)
        folds[test_ind] = i
    ps_train = []
    ps_test = []
    ps = cval.PredefinedSplit(folds)
    for train_ind, test_ind in ps:
        ps_train.append(train_ind)
        ps_test.append(test_ind)
    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test) 
Example 11
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 12
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License
def test_cross_indices_exception():
    X = coo_matrix(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]))
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4, indices=False)
    lpo = cval.LeavePOut(4, 2, indices=False)
    kf = cval.KFold(4, 2, indices=False)
    skf = cval.StratifiedKFold(y, 2, indices=False)
    lolo = cval.LeaveOneLabelOut(labels, indices=False)
    lopo = cval.LeavePLabelOut(labels, 2, indices=False)

    assert_raises(ValueError, cval.check_cv, loo, X, y)
    assert_raises(ValueError, cval.check_cv, lpo, X, y)
    assert_raises(ValueError, cval.check_cv, kf, X, y)
    assert_raises(ValueError, cval.check_cv, skf, X, y)
    assert_raises(ValueError, cval.check_cv, lolo, X, y)
    assert_raises(ValueError, cval.check_cv, lopo, X, y) 
Example 13
Project: linear_neuron   Author: uglyboxer   File: test_ridge.py    MIT License
def _test_ridge_cv(filter_):
    n_samples = X_diabetes.shape[0]

    ridge_cv = RidgeCV()
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))

    assert_equal(len(ridge_cv.coef_.shape), 1)
    assert_equal(type(ridge_cv.intercept_), np.float64)

    cv = KFold(n_samples, 5)
    ridge_cv.set_params(cv=cv)
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))

    assert_equal(len(ridge_cv.coef_.shape), 1)
    assert_equal(type(ridge_cv.intercept_), np.float64) 
Example 14
Project: kuaa   Author: rafaelwerneck   File: plugin_k_fold.py    GNU General Public License v3.0
def train_test(images, classes, parameters):
    """
    Divides the dictionary keys of the images into folds, according to the
    parameters:
        - Number of Folds: number of folds that will be created.
    
    The first n % n_folds folds have size n // n_folds + 1, other folds have
    size n // n_folds.
    """
    
    print "Train and Test: K-Fold"
    
    print parameters
    
    #Get parameters
    param_folds = parameters['Number of Folds']
    
    list_train_test = []
    
    #Split the dataset into train and test
    print "\tSplitting the dataset into train and test."
    
    #Train and Test Split
    #--------------------------------------------------------------------------
    len_images = len(images)
    images_keys = images.keys()
    k_fold = KFold(len_images, param_folds)
    
    #Transform the index of the KFold function into the keys of the images
    #dictionary
    for train_index, test_index in k_fold:
        train = []
        test = []
        for index in train_index:
            train.append(images_keys[index])
        for index in test_index:
            test.append(images_keys[index])
        list_train_test.append([train, test])
    #--------------------------------------------------------------------------
    
    return list_train_test 
Example 15
Project: design_embeddings_jmd_2016   Author: IDEALLab   File: hp_sae.py    MIT License
def cross_validate(hidden_size_l1, hidden_size_l2, hidden_size_l3, hidden_size_l4, p, 
                   l, batch_size, X, n_folds, n_components):

    # K-fold cross-validation
    kf = KFold(X.shape[0], n_folds=n_folds, shuffle=True)
    i = 1
    loss = 0
    
    for train, test in kf:
        train = train.tolist()
        test = test.tolist()
        
        print 'cross validation: %d' % i
        i += 1

        if len(train)>10 and len(test): # if there are enough training and test samples
            # Get cost
            loss += sae(X, n_components, train, test, hidden_size_l1, hidden_size_l2, hidden_size_l3, hidden_size_l4, p, 
                        l, batch_size, evaluation=True)
                                     
        else:
            print 'Please add more samples!'
            
    # Get test reconstruction error
    rec_err_cv = loss/n_folds

    return rec_err_cv 
Example 16
Project: design_embeddings_jmd_2016   Author: IDEALLab   File: hp_mlae.py    MIT License
def cross_validate(hidden_size_l1, hidden_size_l2, hidden_size_l3, hidden_size_l4, 
                   l, lr, epsilon, X, n_folds, n_components):
                           
    # K-fold cross-validation
    kf = KFold(X.shape[0], n_folds=n_folds, shuffle=True)
    i = 1
    loss = 0
    
    for train, test in kf:
        train = train.tolist()
        test = test.tolist()
        
        print 'cross validation: %d' % i
        i += 1
        
        if len(train)>10 and len(test): # if there are enough training and test samples
            # Get cost
            loss += mlae(X, n_components, train, test, hidden_size_l1, hidden_size_l2, hidden_size_l3, hidden_size_l4, 
                         l, lr, epsilon, evaluation=True)
                                     
        else:
            print 'Please add more samples!'
            
    # Get test reconstruction error
    rec_err_cv = loss/n_folds

    return rec_err_cv 
Example 17
Project: Mussy-Robot   Author: arnomoonens   File: training.py    MIT License
def evaluate_cross_validation(clf, X, y, K):
    
    # create a k-fold cross validation iterator
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # by default the score used is the one returned by the score method of the estimator (accuracy)
    scores = cross_val_score(clf, X, y, cv=cv)
    print (scores)
    print ("Mean score: {0:.3f} (+/-{1:.3f})".format(numpy.mean(scores), sem(scores))) 
Example 18
Project: news-popularity-prediction   Author: MKLab-ITI   File: ranking.py    Apache License 2.0
def folding(y, n_folds):
    k_fold = KFold(y.size, n_folds=n_folds, random_state=0)

    return k_fold 
Example 19
Project: astronet   Author: CasvandenBogaard   File: data.py    GNU General Public License v2.0
def get_folds(n, train_size=0.5, test_size=0.5, n_folds=1, shuffle=True, random_state=0):

    if n_folds > 1:
        kf = KFold(n, n_folds, shuffle=shuffle, random_state=random_state)
    else:
        kf = _single_fold(n, train_size=train_size, test_size=test_size, shuffle=shuffle, random_state=random_state)

    return kf 
Example 20
Project: astronet   Author: CasvandenBogaard   File: split.py    GNU General Public License v2.0
def __call__(self, X, y, net):
        
        if not self.cutoff:
            if self.eval_size:
                
                if net.regression or not self.stratify:
                    kf = KFold(y.shape[0], round(1. / self.eval_size))
                else:
                    kf = StratifiedKFold(y, round(1. / self.eval_size))
      
                train_indices, valid_indices = next(iter(kf))
                X_train, y_train = _sldict(X, train_indices), y[train_indices]
                X_valid, y_valid = _sldict(X, valid_indices), y[valid_indices]
                
            else:
                
                X_train, y_train = X, y
                X_valid, y_valid = _sldict(X, slice(len(y), None)), y[len(y):]
  
            return X_train, X_valid, y_train, y_valid
    
        else:
            
            train_indices, valid_indices = range(self.cutoff), range(self.cutoff, len(y))
            X_train, y_train = _sldict(X, train_indices), y[train_indices]
            X_valid, y_valid = _sldict(X, valid_indices), y[valid_indices]
            
            return X_train, X_valid, y_train, y_valid 
Example 21
Project: twitter-svm   Author: josh-byster   File: calculations.py    MIT License
def gs(X,Y,folds,parameters):
    cv=cross_validation.KFold(len(X), n_folds=folds,shuffle=True,random_state=None)
    svr = SVC()
    clf = grid_search.GridSearchCV(svr, parameters,cv=cv)
    print("About to fit...")
    clf.fit(X,Y)
    pprint.pprint(clf.grid_scores_)
    pprint.pprint(clf.best_params_) 
Example 22
Project: twitter-svm   Author: josh-byster   File: calculations.py    MIT License
def crossValidate(X,Y,folds=10,c=1):
    svm=LinearSVC(C=c)
    cv=cross_validation.KFold(len(X), n_folds=folds,shuffle=True,random_state=None)
    for i in cross_validation.cross_val_score(svm,X,Y,cv=cv):
        print(i) 
Example 23
Project: Automated-Essay-Grading   Author: vatika   File: graph_diffusion.py    GNU General Public License v2.0
def execute(self):
        kf = KFold(len(self.x_train), n_folds=self.k_cross)
        own_kappa = []
        for train_idx, test_idx in kf:
            x_train, x_test = self.x_train[train_idx], self.x_train[test_idx]
            y_train, y_test = self.y_train[train_idx], self.y_train[test_idx]
            stat_obj = self.stat_class(range_min=range_min,range_max=range_max, \
                                        similarity_measure=self.similarity_measure, \
                                        neighbourhood="stochastic") # instantiate via reflection
            stat_obj.train(x_train,x_test,y_train)
            y_pred = np.matrix(stat_obj.predict()).T
            cohen_kappa_rating = own_wp.quadratic_weighted_kappa(y_test,y_pred,\
                                    self.range_min,self.range_max)
            self.values.append(cohen_kappa_rating)
        return str(sum(self.values)/self.k_cross) 
Example 24
Project: fc-aaai18   Author: thanhan   File: cross_validation.py    MIT License
def _iter_test_indices(self):
        claim_ids = np.unique(self.data.claimId)
        cv = KFold(len(claim_ids), self.n_folds, shuffle=self.shuffle)

        for _, test in cv:
            test_claim_ids = claim_ids[test]
            test_data = self.data[self.data.claimId.isin(test_claim_ids)]
            yield test_data.iloc_index.values 
Example 25
Project: Scuba   Author: gzampieri   File: model_selection.py    GNU General Public License v2.0
def set_folds(self, genes, labels):
		""" Generate folds for cross-validation. """
		
		self.train_genes = genes
		self.test_list = genes
		self.train_labels = []
		self.test_labels = []
		
		pos_indeces = [i for i in range(len(labels)) if labels[i] == 1]
		
		if self.n_fold == 1:
			kf = KFold(len(pos_indeces), len(pos_indeces))
		else:
			kf = KFold(len(pos_indeces), self.n_fold)
		
		for train_index, test_index in kf:
			ltrain = [-1]*len(genes)
			for i in train_index:
				ltrain[pos_indeces[i]] = 1
			self.train_labels.append(ltrain)
			ltest = [-1]*len(genes)
			for i in test_index:
				ltest[pos_indeces[i]] = 1
			self.test_labels.append(ltest)
		
		return 
Example 26
Project: caltech-machine-learning   Author: zhiyanfoo   File: hw8.py    MIT License
def best_c(training_set):
    training_set = a_vs_b(1, 5, training_set)[0]
    svcs = [ SVC(kernel='poly', C=c, degree=2)
            for c in [0.0001, 0.001, 0.01, 0.1, 1] ]
    cv = KFold(n=len(training_set.y), n_folds=10, shuffle=True)
    score_c = [ np.mean(
        cross_val_score(polysvm, training_set.z, training_set.y, cv=cv))
        for polysvm in svcs ]
    return np.argmax(score_c), score_c 
Example 27
Project: dsw-ont-ijcai   Author: anonymous-ijcai   File: topic_type.py    GNU General Public License v3.0
def train_cv_clf(topics_train, classes_train, features, n_folds=10, 
                 param_grid=_PARAM_GRID, tuned_clf=SVC(C=1, kernel='linear'),
                 scoring=util.weighted_f1, random_state=0):
    """Trains the topic type classifier, given the various parameters.
    
    """
    kf = cross_validation.KFold(len(topics_train), n_folds=n_folds, random_state=random_state)
    cv_clf = GridSearchCV(estimator=tuned_clf, param_grid=param_grid, cv=kf, scoring=scoring)
    topic_vectors_train = to_features(features, topics_train)
    cv_clf.fit(topic_vectors_train, classes_train)
    return cv_clf 
Example 28
Project: text-classification-with-convnets   Author: osmanbaskaya   File: utils.py    MIT License
def cross_validate(model, X, y, n_folds, batch_size, num_epoch, func_for_evaluation=None):

    # let's shuffle first.
    seed = 5
    np.random.seed(seed)
    np.random.shuffle(X)
    np.random.seed(seed)
    np.random.shuffle(y)

    X = np.array(X)
    y = np.array(y)

    scores = np.zeros(n_folds)
    kf = KFold(len(y), n_folds=n_folds)
    for i, (train_index, test_index) in enumerate(kf):
        X_train, y_train = X[train_index, :], y[train_index]
        X_test, y_test = X[test_index, :], y[test_index]
        model.fit(X_train, y_train,
                  batch_size=batch_size,
                  nb_epoch=num_epoch)

        predictions = model.predict(X_test)
        score = func_for_evaluation(predictions[:, 0].tolist(), y_test)
        try:
            scores[i] = score[0]
        except IndexError:
            scores[i] = score


    print "{}-Fold cross validation score: {}".format(n_folds, scores.mean()) 
Example 29
Project: scikit-feature   Author: jundongl   File: test_svm_forward.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the idx of selected features from the training set
        idx = svm_forward.svm_forward(X[train], y[train], n_features)

        # obtain the dataset on the selected features
        X_selected = X[:, idx]

        # train a classification model with the selected features on the training dataset
        clf.fit(X_selected[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(X_selected[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 30
Project: scikit-feature   Author: jundongl   File: test_CFS.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the index of selected features on training set
        idx = CFS.cfs(X[train], y[train])

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 31
Project: scikit-feature   Author: jundongl   File: test_MIFS.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/BASEHOCK.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 10    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the index of each feature on the training set
        idx = MIFS.mifs(X[train], y[train], n_selected_features=num_fea)

        # obtain the dataset on the selected features
        features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        print acc
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 32
Project: scikit-feature   Author: jundongl   File: test_JMI.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 10    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the index of each feature on the training set
        idx,_,_ = JMI.jmi(X[train], y[train], n_selected_features=num_fea)

        # obtain the dataset on the selected features
        features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 33
Project: scikit-feature   Author: jundongl   File: test_decision_tree_backward.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the idx of selected features from the training set
        idx = decision_tree_backward.decision_tree_backward(X[train], y[train], n_features)

        # obtain the dataset on the selected features
        X_selected = X[:, idx]

        # train a classification model with the selected features on the training dataset
        clf.fit(X_selected[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(X_selected[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 34
Project: scikit-feature   Author: jundongl   File: test_fisher_score.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the score of each feature on the training set
        score = fisher_score.fisher_score(X[train], y[train])

        # rank features in descending order according to score
        idx = fisher_score.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 35
Project: scikit-feature   Author: jundongl   File: test_ll_l21.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    Y = construct_label_matrix_pan(y)
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the feature weight matrix
        Weight, obj, value_gamma = ll_l21.proximal_gradient_descent(X[train], Y[train], 0.1, verbose=False)

        # rank features in descending order according to their feature weights
        idx = feature_ranking(Weight)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 36
Project: scikit-feature   Author: jundongl   File: test_RFS.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    Y = construct_label_matrix(y)
    n_samples, n_features = X.shape

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the feature weight matrix
        Weight = RFS.rfs(X[train, :], Y[train, :], gamma=0.1)

        # rank features in descending order according to their feature weights
        idx = feature_ranking(Weight)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        print acc
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 37
Project: scikit-feature   Author: jundongl   File: test_FCBF.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 10    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the index of each feature on the training set
        idx = FCBF.fcbf(X[train], y[train], n_selected_features=num_fea)

        # obtain the dataset on the selected features
        features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 38
Project: scikit-feature   Author: jundongl   File: test_trace_ratio.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the index of selected features
        idx, feature_score, subset_score = trace_ratio.trace_ratio(X[train], y[train], num_fea, style='fisher')

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 39
Project: scikit-feature   Author: jundongl   File: test_svm_backward.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the idx of selected features from the training set
        idx = svm_backward.svm_backward(X[train], y[train], n_features)

        # obtain the dataset on the selected features
        X_selected = X[:, idx]

        # train a classification model with the selected features on the training dataset
        clf.fit(X_selected[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(X_selected[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 40
Project: scikit-feature   Author: jundongl   File: test_alpha_investing.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    y = y.astype(float)
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the index of selected features
        idx = alpha_investing.alpha_investing(X[train], y[train], 0.05, 0.05)

        # obtain the dataset on the selected features
        selected_features = X[:, idx]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 41
Project: scikit-feature   Author: jundongl   File: test_MIM.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 10    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the index of each feature on the training set
        idx,_,_ = MIM.mim(X[train], y[train], n_selected_features=num_fea)

        # obtain the dataset on the selected features
        features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 42
Project: scikit-feature   Author: jundongl   File: test_f_score.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the f-score of each feature on the training set
        score = f_score.f_score(X[train], y[train])

        # rank features in descending order according to score
        idx = f_score.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 43
Project: scikit-feature   Author: jundongl   File: test_t_score.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the t-score of each feature on the training set
        score = t_score.t_score(X[train], y[train])

        # rank features in descending order according to score
        idx = t_score.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 44
Project: scikit-feature   Author: jundongl   File: test_decision_tree_forward.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the idx of selected features from the training set
        idx = decision_tree_forward.decision_tree_forward(X[train], y[train], n_features)

        # obtain the dataset on the selected features
        X_selected = X[:, idx]

        # train a classification model with the selected features on the training dataset
        clf.fit(X_selected[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(X_selected[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 45
Project: scikit-feature   Author: jundongl   File: test_ls_l21.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    Y = construct_label_matrix_pan(y)
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the feature weight matrix
        Weight, obj, value_gamma = ls_l21.proximal_gradient_descent(X[train], Y[train], 0.1, verbose=False)

        # rank features in descending order according to their feature weights
        idx = feature_ranking(Weight)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 46
Project: scikit-feature   Author: jundongl   File: test_chi_square.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/BASEHOCK.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the chi-square score of each feature on the training set
        score = chi_square.chi_square(X[train], y[train])

        # rank features in descending order according to score
        idx = chi_square.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 47
Project: scikit-feature   Author: jundongl   File: test_DISR.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 10    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the index of each feature on the training set
        idx,_,_ = DISR.disr(X[train], y[train], n_selected_features=num_fea)

        # obtain the dataset on the selected features
        features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 48
Project: scikit-feature   Author: jundongl   File: test_ICAP.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 10    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the index of each feature on the training set
        idx,_,_ = ICAP.icap(X[train], y[train], n_selected_features=num_fea)

        # obtain the dataset on the selected features
        features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 49
Project: scikit-feature   Author: jundongl   File: test_gini_index.py    GNU General Public License v2.0
def main():
    # load data
    mat = scipy.io.loadmat('../data/colon.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]
    n_samples, n_features = X.shape    # number of samples and number of features

    # split data into 10 folds
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100    # number of selected features
    clf = svm.LinearSVC()    # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the gini_index score of each feature
        score = gini_index.gini_index(X[train], y[train])

        # rank features in descending order according to score
        idx = gini_index.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print 'Accuracy:', float(correct)/10 
Example 50
Project: email-categorization   Author: ajaybhat   File: classifier.py    GNU General Public License v2.0
def cross_validate():
    training_set = load_training_set()
    random.shuffle(training_set)
    average = 0
    cv = KFold(len(training_set), n_folds=10, indices=True, shuffle=False, random_state=None)
    for traincv, evalcv in cv:
        # index with the fold indices directly: slicing from the first to the
        # last train index would span the held-out fold and drop an element
        classifier = NaiveBayesClassifier.train([training_set[i] for i in traincv])
        acc = accuracy(classifier, [training_set[i] for i in evalcv])
        print 'Range: ', evalcv[0], 'to', evalcv[len(evalcv) - 1]
        print 'Accuracy: %4.2f' % acc
        average += acc
    print 'Average accuracy: %4.2f' % (average / 10) 
Example 51
Project: Kaggle   Author: InfiniteWing   File: kaggle_keras_kernel.py    MIT License
def KFold_Train(x_train,y_train,nfolds=5,batch_size=128):
    model = Amazon_Model()
    kf = KFold(len(y_train), n_folds=nfolds, shuffle=False, random_state=1)
    num_fold = 0
    for train_index, test_index in kf:
    
        X_train = x_train[train_index]
        Y_train = y_train[train_index]
        X_valid = x_train[test_index]
        Y_valid = y_train[test_index]

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))
        weight_path = os.path.join('', '../h5_128_rotate_uint8/weights_kfold_' + str(num_fold) + '.h5')
        if os.path.isfile(weight_path):
            model.load_weights(weight_path)
        
        # the exact schedule used in the original run is not recorded;
        # it was approximately the following:
        epochs_arr = [60, 15, 15]
        learn_rates = [0.001, 0.0001, 0.00001]

        for learn_rate, epochs in zip(learn_rates, epochs_arr):
            opt  = optimizers.Adam(lr=learn_rate)
            model.compile(loss='binary_crossentropy',optimizer=opt,metrics=['accuracy'])
            callbacks = [EarlyStopping(monitor='val_loss', patience=2, verbose=0),
            ModelCheckpoint(weight_path, monitor='val_loss', save_best_only=True, verbose=0)]

            model.fit(x = X_train, y= Y_train, validation_data=(X_valid, Y_valid),
                  batch_size=batch_size,verbose=2, epochs=epochs,callbacks=callbacks,shuffle=True)
        
        p_valid = model.predict(X_valid, batch_size = batch_size, verbose=2)
        print(fbeta_score(Y_valid, np.array(p_valid) > 0.18, beta=2, average='samples')) 
Example 52
Project: FaceRecognition   Author: gswycf   File: facenet.py    MIT License
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, seed):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    nrof_folds = 10
    folds = KFold(n=nrof_pairs, n_folds=nrof_folds, shuffle=True, random_state=seed)
    
    tprs = np.zeros((nrof_folds,nrof_thresholds))
    fprs = np.zeros((nrof_folds,nrof_thresholds))
    accuracy = np.zeros((nrof_folds))
    
    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff),1)
    
    for fold_idx, (train, test) in enumerate(folds):
        
        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train], actual_issame[train])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, dist[test], actual_issame[test])
        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test], actual_issame[test])
            
    # average the per-fold curves once all folds have been processed (the
    # original recomputed these inside the loop on every iteration)
    tpr = np.mean(tprs, 0)
    fpr = np.mean(fprs, 0)
    return tpr, fpr, accuracy
Example 53
Project: kaggle_submissions   Author: tdvance   File: crossvalidation.py    MIT License 5 votes vote down vote up
def main():
    classifier = GradientBoostingClassifier
    # read in data, parse into training and target sets
    dataset = np.genfromtxt(open('Data/trainPrep.csv','r'), delimiter=',', dtype='f8')[1:]    
    target = np.array([x[-1] for x in dataset])
    train = np.array([x[:-1] for x in dataset])

    # In this case we'll use gradient boosting, but this could be any classifier
    cfr = classifier()

    #Simple K-Fold cross validation. 5 folds.
    cv = cross_validation.KFold(len(train), n_folds=5, shuffle=True)

    #iterate through the training and test cross validation segments and
    #run the classifier on each one, aggregating the results into a list
    results = []
    for traincv, testcv in cv:
        predicted = cfr.fit(train[traincv], target[traincv]).predict(train[testcv])
        total=len(predicted)
        correct=0.0
        for i in range(total):
            if target[testcv][i] == predicted[i]:
                correct += 1.0
        results.append(correct/total)

    #print out the mean of the cross-validated results
    print "Results: " + str( np.array(results).mean() ) 
Example 54
Project: kaggle-right-whale   Author: felixlaumon   File: train_localization_model.py    MIT License 5 votes vote down vote up
def train_test_split(X, y, test_size=0.25, random_state=42):
    n_folds = int(1 / float(test_size))
    skv = KFold(len(X), n_folds=n_folds, random_state=random_state)
    train_idx, test_idx = iter(skv).next()
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx] 
Example 55
Project: kaggle-right-whale   Author: felixlaumon   File: train_pts_model.py    MIT License 5 votes vote down vote up
def train_test_split(X, y, test_size=0.25, random_state=42):
    n_folds = int(1 / float(test_size))
    skv = KFold(len(X), n_folds=n_folds, random_state=random_state)
    train_idx, test_idx = iter(skv).next()
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx] 
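
Two details worth flagging in this helper (it appears verbatim in both files): iter(skv).next() is the Python 2 spelling, written next(iter(skv)) in Python 3, and random_state has no effect here because the old KFold only uses it when shuffle=True. For a plain random holdout, the library already ships a one-liner; a minimal sketch:

# Hedged sketch: the dedicated helper exists in sklearn.cross_validation
# (and later in sklearn.model_selection) and shuffles by default.
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)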
Example 56
Project: BRISE   Author: dpukhkaiev   File: regression.py    MIT License 5 votes vote down vote up
def __init__(self, file_name, train_size, target, features, indices):
        del self.dict[:]
        del self.indices[:]
        for i in indices:
            self.indices.append(i)

        subset_target = []
        subset_features = []
        for i in self.indices:
            subset_target.append(target[i])
            subset_features.append(features[i])

        self.train_size = train_size

        '''
        kf = cross_validation.KFold(n=len(data), n_folds=10, shuffle=True )
        for train_index, test_index in kf:
            for i in train_index:
                self.feature_train.append(features[i])
                self.target_train.append(target[i])
            for i in test_index:
                self.feature_test.append(features[i])
                self.target_test.append(target[i])
        '''
        # print subset_target
        # print "***************"
        # print subset_features
        self.feature_train, self.feature_test, self.target_train, self.target_test = \
        cross_validation.train_test_split(subset_features, subset_target, train_size=train_size)
        return 
Example 57
Project: automl-phase-2   Author: jamesrobertlloyd   File: util.py    MIT License 5 votes vote down vote up
def convert_automl_into_automl_folds(folder, save_folder_root, n_folds=5,
                                     random_state=0, usage='testing'):
    """Convert a dataset in automl format into several folds of automl format"""
    # Load data
    input_dir, basename = os.path.split(folder)
    D = DataManager(basename, input_dir, replace_missing=True, filter_features=True)
    X = D.data['X_train']
    y = D.data['Y_train']
    info = D.info
    if usage is not None:
        info['usage'] = usage
    # Now split into folds and save
    folds = KFold(n=X.shape[0], n_folds=n_folds, shuffle=True, random_state=random_state)
    for (fold, (train_index, test_index)) in enumerate(folds):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        fold_folder = os.path.join(save_folder_root + '_fold_%02d' % (fold + 1), basename)
        mkdir(fold_folder)
        fmt = '%f'
        np.savetxt(os.path.join(fold_folder, basename + '_train.data'), X_train, fmt=fmt, delimiter=' ')
        np.savetxt(os.path.join(fold_folder, basename + '_test.data'), X_test, fmt=fmt, delimiter=' ')
        if info['task'] == 'binary.classification':
            fmt = '%d'
        np.savetxt(os.path.join(fold_folder, basename + '_train.solution'), y_train, fmt=fmt, delimiter=' ')
        np.savetxt(os.path.join(fold_folder, basename + '_test.solution'), y_test, fmt=fmt, delimiter=' ')
        info['train_num'] = X_train.shape[0]
        info['test_num'] = X_test.shape[0]
        with open(os.path.join(fold_folder, basename + '_public.info'), 'w') as info_file:
            for (key, value) in info.iteritems():
                info_file.write('%s = %s\n' % (key, value))
        shutil.copy(os.path.join(folder, basename + '_feat.type'), os.path.join(fold_folder, basename + '_feat.type')) 
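
For reference, sklearn.cross_validation was deprecated in scikit-learn 0.18; its replacement moves the sample count out of the constructor and into a split() call. A minimal sketch of the fold loop above under the newer API, assuming X, n_folds and random_state as in the example:

# Hedged sketch of the sklearn >= 0.18 equivalent.
from sklearn.model_selection import KFold

folds = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
for fold, (train_index, test_index) in enumerate(folds.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # save each fold exactly as in the example above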
Example 58
Project: linear_neuron   Author: uglyboxer   File: test_learning_curve.py    MIT License 5 votes vote down vote up
def test_learning_curve_with_boolean_indices():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)
    cv = KFold(n=30, n_folds=3, indices=False)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(train_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(test_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10)) 
Example 59
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes vote down vote up
def test_kfold_valueerrors():
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, cval.KFold, 3, 4)

    # Check that a warning is raised if the least populated class has too few
    # members.
    y = [3, 3, -1, -1, 2]

    cv = assert_warns_message(Warning, "The least populated class",
                              cval.StratifiedKFold, y, 3)

    # Check that, despite the warning, the folds are still computed even
    # though all the classes are not necessarily represented on each
    # side of the split at each split
    check_cv_coverage(cv, expected_n_iter=3, n_samples=len(y))

    # Error when number of folds is <= 1
    assert_raises(ValueError, cval.KFold, 2, 0)
    assert_raises(ValueError, cval.KFold, 2, 1)
    assert_raises(ValueError, cval.StratifiedKFold, y, 0)
    assert_raises(ValueError, cval.StratifiedKFold, y, 1)

    # When n is not integer:
    assert_raises(ValueError, cval.KFold, 2.5, 2)

    # When n_folds is not integer:
    assert_raises(ValueError, cval.KFold, 5, 1.5)
    assert_raises(ValueError, cval.StratifiedKFold, y, 1.5) 
Example 60
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes vote down vote up
def test_kfold_indices():
    # Check all indices are returned in the test folds
    kf = cval.KFold(300, 3)
    check_cv_coverage(kf, expected_n_iter=3, n_samples=300)

    # Check all indices are returned in the test folds even when equal-sized
    # folds are not possible
    kf = cval.KFold(17, 3)
    check_cv_coverage(kf, expected_n_iter=3, n_samples=17) 
Example 61
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes vote down vote up
def test_kfold_balance():
    # Check that KFold returns folds with balanced sizes
    for kf in [cval.KFold(i, 5) for i in range(11, 17)]:
        sizes = []
        for _, test in kf:
            sizes.append(len(test))

        assert_true((np.max(sizes) - np.min(sizes)) <= 1)
        assert_equal(np.sum(sizes), kf.n) 
Example 62
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes vote down vote up
def test_cross_val_score_mask():
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target
    with warnings.catch_warnings(record=True):
        cv_indices = cval.KFold(len(y), 5, indices=True)
    scores_indices = cval.cross_val_score(svm, X, y, cv=cv_indices)
    with warnings.catch_warnings(record=True):
        cv_masks = cval.KFold(len(y), 5, indices=False)
    scores_masks = cval.cross_val_score(svm, X, y, cv=cv_masks)
    assert_array_equal(scores_indices, scores_masks) 
Example 63
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes vote down vote up
def test_cross_val_generator_with_mask():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = assert_warns(DeprecationWarning, cval.LeaveOneOut,
                       4, indices=False)
    lpo = assert_warns(DeprecationWarning, cval.LeavePOut,
                       4, 2, indices=False)
    kf = assert_warns(DeprecationWarning, cval.KFold,
                      4, 2, indices=False)
    skf = assert_warns(DeprecationWarning, cval.StratifiedKFold,
                       y, 2, indices=False)
    lolo = assert_warns(DeprecationWarning, cval.LeaveOneLabelOut,
                        labels, indices=False)
    lopo = assert_warns(DeprecationWarning, cval.LeavePLabelOut,
                        labels, 2, indices=False)
    ss = assert_warns(DeprecationWarning, cval.ShuffleSplit,
                      4, indices=False)
    ps = assert_warns(DeprecationWarning, cval.PredefinedSplit, [1, 1, 2, 2],
                      indices=False)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_equal(np.asarray(train).dtype.kind, 'b')
            assert_equal(np.asarray(test).dtype.kind, 'b')
            X[train], X[test]
            y[train], y[test] 
Example 64
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes vote down vote up
def test_cross_val_generator_mask_indices_same():
    # Test that the cross validation generators return the same results when
    # indices=True and when indices=False
    y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])
    labels = np.array([1, 1, 2, 3, 3, 3, 4])

    loo_mask = cval.LeaveOneOut(5, indices=False)
    loo_ind = cval.LeaveOneOut(5, indices=True)
    lpo_mask = cval.LeavePOut(10, 2, indices=False)
    lpo_ind = cval.LeavePOut(10, 2, indices=True)
    kf_mask = cval.KFold(10, 5, indices=False, shuffle=True, random_state=1)
    kf_ind = cval.KFold(10, 5, indices=True, shuffle=True, random_state=1)
    skf_mask = cval.StratifiedKFold(y, 3, indices=False)
    skf_ind = cval.StratifiedKFold(y, 3, indices=True)
    lolo_mask = cval.LeaveOneLabelOut(labels, indices=False)
    lolo_ind = cval.LeaveOneLabelOut(labels, indices=True)
    lopo_mask = cval.LeavePLabelOut(labels, 2, indices=False)
    lopo_ind = cval.LeavePLabelOut(labels, 2, indices=True)
    ps_mask = cval.PredefinedSplit([1, 1, 2, 2], indices=False)
    ps_ind = cval.PredefinedSplit([1, 1, 2, 2], indices=True)

    for cv_mask, cv_ind in [(loo_mask, loo_ind), (lpo_mask, lpo_ind),
                            (kf_mask, kf_ind), (skf_mask, skf_ind),
                            (lolo_mask, lolo_ind), (lopo_mask, lopo_ind),
                            (ps_mask, ps_ind)]:
        for (train_mask, test_mask), (train_ind, test_ind) in \
                zip(cv_mask, cv_ind):
            assert_array_equal(np.where(train_mask)[0], train_ind)
            assert_array_equal(np.where(test_mask)[0], test_ind) 
Example 65
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes vote down vote up
def test_check_cv_return_types():
    X = np.ones((9, 2))
    cv = cval._check_cv(3, X, classifier=False)
    assert_true(isinstance(cv, cval.KFold))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = cval._check_cv(3, X, y_binary, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = cval._check_cv(3, X, y_multiclass, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    X = np.ones((5, 2))
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]

    with warnings.catch_warnings(record=True):
        # deprecated sequence of sequence format
        cv = cval._check_cv(3, X, y_seq_of_seqs, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_indicator_matrix = LabelBinarizer().fit_transform(y_seq_of_seqs)
    cv = cval._check_cv(3, X, y_indicator_matrix, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = cval._check_cv(3, X, y_multioutput, classifier=True)
    assert_true(isinstance(cv, cval.KFold)) 
Example 66
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 5 votes vote down vote up
def test_cross_val_predict():
    boston = load_boston()
    X, y = boston.data, boston.target
    cv = cval.KFold(len(boston.target))

    est = Ridge()

    # Naive loop (should be same as cross_val_predict):
    preds2 = np.zeros_like(y)
    for train, test in cv:
        est.fit(X[train], y[train])
        preds2[test] = est.predict(X[test])

    preds = cval.cross_val_predict(est, X, y, cv=cv)
    assert_array_almost_equal(preds, preds2)

    preds = cval.cross_val_predict(est, X, y)
    assert_equal(len(preds), len(y))

    cv = cval.LeaveOneOut(len(y))
    preds = cval.cross_val_predict(est, X, y, cv=cv)
    assert_equal(len(preds), len(y))

    Xsp = X.copy()
    Xsp *= (Xsp > np.median(Xsp))
    Xsp = coo_matrix(Xsp)
    preds = cval.cross_val_predict(est, Xsp, y)
    assert_array_almost_equal(len(preds), len(y))

    preds = cval.cross_val_predict(KMeans(), X)
    assert_equal(len(preds), len(y))

    def bad_cv():
        for i in range(4):
            yield np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7, 8])

    assert_raises(ValueError, cval.cross_val_predict, est, X, y, cv=bad_cv()) 
Example 67
Project: linear_neuron   Author: uglyboxer   File: test_grid_search.py    MIT License 5 votes vote down vote up
def test_y_as_list():
    # Pass y as list in GridSearchCV
    X = np.arange(100).reshape(10, 10)
    y = np.array([0] * 5 + [1] * 5)

    clf = CheckingClassifier(check_y=lambda x: isinstance(x, list))
    cv = KFold(n=len(X), n_folds=3)
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, cv=cv)
    grid_search.fit(X, y.tolist()).score(X, y)
    assert_true(hasattr(grid_search, "grid_scores_")) 
Example 68
Project: linear_neuron   Author: uglyboxer   File: test_grid_search.py    MIT License 5 votes vote down vote up
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator

    X, y = make_multilabel_classification(return_indicator=True,
                                          random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(y.shape[0], random_state=0)

    estimators = [DecisionTreeRegressor(random_state=0),
                  DecisionTreeClassifier(random_state=0)]

    # Test with grid search cv
    for est in estimators:
        grid_search = GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        for parameters, _, cv_validation_scores in grid_search.grid_scores_:
            est.set_params(**parameters)

            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score,
                                    cv_validation_scores[i])

    # Test with a randomized search
    for est in estimators:
        random_search = RandomizedSearchCV(est, est_parameters,
                                           cv=cv, n_iter=3)
        random_search.fit(X, y)
        for parameters, _, cv_validation_scores in random_search.grid_scores_:
            est.set_params(**parameters)

            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score,
                                    cv_validation_scores[i]) 
Example 69
Project: linear_neuron   Author: uglyboxer   File: test_ridge.py    MIT License 5 votes vote down vote up
def _test_ridge_classifiers(filter_):
    n_classes = np.unique(y_iris).shape[0]
    n_features = X_iris.shape[1]
    for clf in (RidgeClassifier(), RidgeClassifierCV()):
        clf.fit(filter_(X_iris), y_iris)
        assert_equal(clf.coef_.shape, (n_classes, n_features))
        y_pred = clf.predict(filter_(X_iris))
        assert_greater(np.mean(y_iris == y_pred), .79)

    n_samples = X_iris.shape[0]
    cv = KFold(n_samples, 5)
    clf = RidgeClassifierCV(cv=cv)
    clf.fit(filter_(X_iris), y_iris)
    y_pred = clf.predict(filter_(X_iris))
    assert_true(np.mean(y_iris == y_pred) >= 0.8) 
Example 70
Project: policy_diffusion   Author: dssg   File: alignment_classifier.py    MIT License 5 votes vote down vote up
def evaluate_alignment_classifier(self):
    """runs k-fold cross validation on training set to evaluate classifier"""
    
    training_examples = []
    for line in csv.reader(self._training_file):
        if ( len(line[10].split()) != len(line[11].split()) ) or line[12] not in ["0","1"]:
            continue
        if len(line[10]) <= 1 or len(line[11]) < 1:
            continue
        training_examples.append({"left": line[10].split(), "right": line[11].split(), "label": int(line[12])})

    random.shuffle(training_examples)
    X, y = self.compute_feature_matrix(training_examples)
    X, y = np.array(X), np.array(y)
    kf = KFold(n=len(X), n_folds=4, shuffle=False, random_state=None)
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        self._model.fit(X_train,y_train)
        y_pred = self._model.predict(X_test)
        print classification_report(y_test, y_pred)
    
    self._model.fit(X,y)
    feat_names =  ['length','num_gaps_l','num_gaps_l','num_gaps_r','num_mismatches','num_matches','avg_gap_length_l',
            'avg_gap_length_r','avg_consec_match_length','jaccard_score','idf_mean','idf_medien']
    
    for x in zip(feat_names,self._model.coef_.tolist()):
        print x 
Example 71
Project: policy_diffusion   Author: dssg   File: alignment_classifier.py    MIT License 5 votes vote down vote up
def train_model(self):
        """ Trains model using training examples in self._training_file and returns a trained model self._model
        
        Keywords Args:
            None

            Returns:
            None   

        """

        
        training_examples = []
        for line in csv.reader(self._training_file):
            if ( len(line[10].split()) != len(line[11].split()) ) or line[12] not in ["0","1"]:
                continue
            if len(line[10]) <= 1 or len(line[11]) < 1:
                continue
            training_examples.append({"left":line[10].split(),"right":line[11].split(),"label":int(line[12])})
        
        X,y = self.compute_feature_matrix(training_examples)

        X, y = np.array(X), np.array(y)
        kf = KFold(n=len(X), n_folds=4, shuffle=False, random_state=None)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            self._model.fit(X_train,y_train)
            y_pred = self._model.predict(X_test)
            print classification_report(y_test, y_pred)
        
        self._model.fit(X,y)
        feat_names =  ['length','num_gaps_l','num_gaps_l','num_gaps_r','num_mismatches','num_matches','avg_gap_length_l',
                'avg_gap_length_r','avg_consec_match_length','jaccard_score','idf_mean','idf_medien']
        
        for x in zip(feat_names,self._model.coef_.tolist()):
            print x 
Example 72
Project: Weiss   Author: WangWenjun559   File: test_learning_curve.py    Apache License 2.0 5 votes vote down vote up
def test_learning_curve_with_boolean_indices():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)
    cv = KFold(n=30, n_folds=3)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(train_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(test_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10)) 
Example 73
Project: Weiss   Author: WangWenjun559   File: test_cross_validation.py    Apache License 2.0 5 votes vote down vote up
def test_kfold_valueerrors():
    # Check that errors are raised if there are not enough samples
    assert_raises(ValueError, cval.KFold, 3, 4)

    # Check that a warning is raised if the least populated class has too few
    # members.
    y = [3, 3, -1, -1, 2]

    cv = assert_warns_message(Warning, "The least populated class",
                              cval.StratifiedKFold, y, 3)

    # Check that, despite the warning, the folds are still computed even
    # though all the classes are not necessarily represented on each
    # side of the split at each split
    check_cv_coverage(cv, expected_n_iter=3, n_samples=len(y))

    # Error when number of folds is <= 1
    assert_raises(ValueError, cval.KFold, 2, 0)
    assert_raises(ValueError, cval.KFold, 2, 1)
    assert_raises(ValueError, cval.StratifiedKFold, y, 0)
    assert_raises(ValueError, cval.StratifiedKFold, y, 1)

    # When n is not integer:
    assert_raises(ValueError, cval.KFold, 2.5, 2)

    # When n_folds is not integer:
    assert_raises(ValueError, cval.KFold, 5, 1.5)
    assert_raises(ValueError, cval.StratifiedKFold, y, 1.5) 
Example 74
Project: ochem_predict_nn   Author: connorcoley   File: score_candidates_from_edits_compact.py    MIT License 4 votes vote down vote up
def get_data(max_N_c = None, shuffle = False):
	'''Creates a dictionary defining data generators for 
	training and validation given pickled data/label files

	max_N_c and shuffle only refer to training data'''

	with open(DATA_FPATH, 'rb') as fid:
		legend_data = pickle.load(fid)
	with open(LABELS_FPATH, 'rb') as fid:
		legend_labels = pickle.load(fid)

	N_samples = legend_data['N_examples']

	# New approach: each generator takes whole set, but use allowable_batchNums to filter down
	from sklearn.cross_validation import KFold
	pseudoRandomCV = KFold(len(range(0, N_samples, batch_size)), n_folds = 5, shuffle = True, random_state = 0)
	
	# Workaround: a KFold object cannot be indexed, so iterate up to the desired fold
	for i, (trainval_batches, test_batches) in enumerate(pseudoRandomCV):
		if i == (FOLD_NUM - 1): break

	print(trainval_batches)
	np.random.seed(0)
	np.random.shuffle(trainval_batches)
	train_batches = trainval_batches[:int(len(trainval_batches) * split_ratio[0] / (split_ratio[0] + split_ratio[1]))]
	val_batches   = trainval_batches[int(len(trainval_batches) * split_ratio[0] / (split_ratio[0] + split_ratio[1])):]


	print('Train batches: {}'.format(train_batches))
	print('Val batches: {}'.format(val_batches))
	print('Test batches: {}'.format(test_batches))
	
	N_train = len(train_batches) * batch_size
	N_val = len(val_batches) * batch_size
	N_test = len(test_batches) * batch_size
	print('Total number of samples: {}'.format(N_samples))
	print('Training   on {}% - {}'.format(split_ratio[0]*100, N_train))
	print('Validating on {}% - {}'.format(split_ratio[1]*100, N_val))
	print('Testing    on {}% - {}'.format((1-split_ratio[1]-split_ratio[0])*100, N_test))

	return {
		'N_samples': N_samples,
		'N_train': N_train,
		#
		'train_generator': data_generator(0, N_samples, batch_size, max_N_c = max_N_c, shuffle = shuffle, allowable_batchNums = train_batches),
		'train_label_generator': label_generator(0, N_samples, batch_size, allowable_batchNums = train_batches),
		'train_nb_samples': N_train,
		#
		'val_generator': data_generator(0, N_samples, batch_size, allowable_batchNums = val_batches),
		'val_label_generator': label_generator(0, N_samples, batch_size, allowable_batchNums = val_batches),
		'val_nb_samples': N_val,
		#
		'test_generator': data_generator(0, N_samples, batch_size, allowable_batchNums = test_batches),
		'test_label_generator': label_generator(0, N_samples, batch_size, allowable_batchNums = test_batches),
		'test_nb_samples': N_test,
		#
		#
		'batch_size': batch_size,
	} 
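
The fold-picking loop above (flagged by its own comment as a workaround) can also be written without the enumerate-and-break dance; a minimal sketch using itertools, assuming pseudoRandomCV and FOLD_NUM as in the example:

# Hedged sketch: pull out the FOLD_NUM-th train/test split directly.
import itertools

trainval_batches, test_batches = next(
    itertools.islice(iter(pseudoRandomCV), FOLD_NUM - 1, None))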
Example 75
Project: skutil   Author: tgsmith61591   File: test_big.py    BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def test_large_grid():
        """In this test, we purposely overfit a RandomForest to completely random data
        in order to assert that the test error will far supercede the train error.
        """

        if not SK18:
            custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
        else:
            custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

        # define the pipe
        pipe = Pipeline([
            ('scaler', SelectiveScaler()),
            ('pca', SelectivePCA(weight=True)),
            ('rf', RandomForestClassifier(random_state=42))
        ])

        # define hyper parameters
        hp = {
            'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
            'pca__whiten': [True, False],
            'pca__weight': [True, False],
            'pca__n_components': uniform(0.75, 0.15),
            'rf__n_estimators': randint(5, 10),
            'rf__max_depth': randint(5, 15)
        }

        # define the grid
        grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42)

        # this will fail because we haven't fit yet
        assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

        # fit the grid
        grid.fit(X_train, y_train)

        # score for coverage -- this might warn...
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            grid.score(X_train, y_train)

        # coverage:
        assert grid._estimator_type == 'classifier'

        # get predictions
        tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

        # evaluate score (SHOULD be better than random...)
        accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

        # grid score reports:
        # assert fails for bad percentile
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})

        # assert fails for bad y_axis
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})

        # assert passes otherwise
        report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works 
Example 76
Project: 2016CCF-sougou   Author: prozhuchen   File: classify.py    Apache License 2.0 4 votes vote down vote up
def stacking(self,X,Y,T,wv_X,wv_T,kind):
        """
        ensemble model:stacking

        """
        print 'fitting..'
        models = self.base_models
        folds = list(KFold(len(Y), n_folds=5, random_state=0))
        S_train = np.zeros((X.shape[0], len(models)))
        S_test = np.zeros((T.shape[0], len(models)))

        for i, bm in enumerate(models):
            clf = bm[1]

            S_test_i = np.zeros((T.shape[0], len(folds)))
            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = Y[train_idx]
                X_holdout = X[test_idx]

                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_holdout)[:]
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)[:]

            S_test[:, i] = S_test_i.mean(1)

        print S_train.shape,S_test.shape

        S_train = np.concatenate((S_train,wv_X),axis=1)
        S_test = np.concatenate((S_test, wv_T), axis=1)

        print S_train.shape,S_test.shape

        print 'scaling..'
        scaler = StandardScaler()
        S_train = scaler.fit_transform(S_train)
        # transform, not fit_transform: reuse the training fit on the test block
        S_test = scaler.transform(S_test)
        print 'scaling over!'
        self.svc.fit(S_train, Y)
        yp= self.svc.predict(S_test)[:]
        return yp 
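
The inner loop builds out-of-fold predictions for each first-level model by hand. scikit-learn ships the same pattern as cross_val_predict; a minimal sketch for one base model, reusing the example's clf, X, Y and fold definition:

# Hedged sketch: out-of-fold training predictions for a single base model.
from sklearn.cross_validation import cross_val_predict

S_train_i = cross_val_predict(clf, X, Y, cv=KFold(len(Y), n_folds=5, random_state=0))

The averaged test-set column (S_test) still needs the explicit per-fold loop, since cross_val_predict only covers the training rows.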
Example 77
Project: eecs-499   Author: sandeepraju   File: run.py    BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def main():
    filename = sys.argv[1]

    X = data.load_dataset('{}_X.npy'.format(filename))
    Y = data.load_dataset('{}_Y.npy'.format(filename))

    model = network.build_model()

    # visualize the model
    network.vizualize_model(model, filename)

    # 80:20
    # print network.train_model(model, (X, Y))
    # score = model.evaluate(X, Y, verbose=0)
    # print 'Test score:', score[0]
    
    # K-Fold
    val_errors = []
    losses = []
    kf = KFold(Y.shape[0], n_folds=FOLDS, shuffle=True, random_state=None)
    for train_index, val_index in kf:
        # Generate the dataset for this fold
        X_train, X_val = X[train_index], X[val_index]
        Y_train, Y_val = Y[train_index], Y[val_index]
        print X_train.shape, X_val.shape
        print Y_train.shape, Y_val.shape

        # Train the model on this dataset
        train_history, loss_history = network.train_model(model, (X_train, Y_train), (X_val, Y_val))

        # TODO: save the losses to a file.
        losses.append(loss_history.losses)

        # Evaluate the model and record this fold's error
        val_errors.append(model.evaluate(X_val, Y_val, verbose=0))
        print 'Validation error:', val_errors[-1]

        # NOTE: hack to run only one split
        break
        
    # Print the final K-Fold error (one entry per completed fold)
    val_errors = np.array(val_errors)
    print "K-Fold Error: %0.2f (+/- %0.2f)" % (val_errors.mean(), val_errors.std() * 2)
        
    # Predict some labels
    # TODO: modify this to suit our image needs.
    counter = 0
    while counter < 1:
        idx = random.choice(xrange(Y.shape[0]))
        prediction = network.predict_model(model, np.expand_dims(X[idx,:], axis=0))
        print 'Testing: sample={}, prediction={}, actual={}'.format(
            idx, prediction, Y[idx,:])

        # save this file
        data.generate_image(prediction)
        counter += 1


    # dump the model to the file
    network.save_model(model, filename) 
Example 78
Project: automl-phase-2   Author: jamesrobertlloyd   File: util.py    MIT License 4 votes vote down vote up
def convert_mat_into_automl_folds(filename, save_folder_root, time_budget=300, n_folds=5, input_type='Numerical',
                                  random_state=0, metric='auc_metric', usage='testing', task='binary.classification',
                                  target_type='Binary'):
    """Convert a dataset in .mat format into several folds of automl format"""
    # Load data
    data = scipy.io.loadmat(filename)
    X = data['X']
    y = data['y']
    data_name = os.path.splitext(os.path.split(filename)[-1])[0]
    # Convert data if appropriate
    if task == 'binary.classification':
        y_max = y.max()
        positive = (y == y_max)  # mask first so the two relabellings cannot collide
        y[positive] = 1
        y[~positive] = 0
    # If input_type is 'infer' we now infer input types
    if input_type == 'infer':
        raise Exception('I do not know how to infer input types yet')
    else:
        input_type_list = [input_type] * X.shape[1]
    # Create info dictionary
    # TODO - some of these defaults need to be changed
    info = dict(usage=usage, name=data_name, task=task, target_type=target_type,
                feat_type='Numerical', metric=metric, feat_num=X.shape[1],
                target_num=1, label_num=0, has_categorical=0, has_missing=0, is_sparse=0,
                time_budget=time_budget, valid_num=0)
    # Now split into folds and save
    folds = KFold(n=X.shape[0], n_folds=n_folds, shuffle=True, random_state=random_state)
    for (fold, (train_index, test_index)) in enumerate(folds):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        fold_folder = os.path.join(save_folder_root + '_fold_%02d' % (fold + 1), data_name)
        mkdir(fold_folder)
        fmt = '%f'
        np.savetxt(os.path.join(fold_folder, data_name + '_train.data'), X_train, fmt=fmt, delimiter=' ')
        np.savetxt(os.path.join(fold_folder, data_name + '_test.data'), X_test, fmt=fmt, delimiter=' ')
        if task == 'binary.classification':
            fmt = '%d'
        np.savetxt(os.path.join(fold_folder, data_name + '_train.solution'), y_train, fmt=fmt, delimiter=' ')
        np.savetxt(os.path.join(fold_folder, data_name + '_test.solution'), y_test, fmt=fmt, delimiter=' ')
        info['train_num'] = X_train.shape[0]
        info['test_num'] = X_test.shape[0]
        with open(os.path.join(fold_folder, data_name + '_public.info'), 'w') as info_file:
            for (key, value) in info.iteritems():
                info_file.write('%s = %s\n' % (key, value))
        with open(os.path.join(fold_folder, data_name + '_feat.type'), 'w') as feature_file:
            for feat_type in input_type_list:
                feature_file.write('%s\n' % feat_type) 
Example 79
Project: linear_neuron   Author: uglyboxer   File: test_cross_validation.py    MIT License 4 votes vote down vote up
def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
    # The digits samples are dependent: they are apparently grouped by authors
    # although we don't have any information on the groups' segment locations
    # for this data. We can highlight this fact by computing k-fold cross-
    # validation with and without shuffling: we observe that the shuffling case
    # wrongly makes the IID assumption and is therefore too optimistic: it
    # estimates a much higher accuracy (around 0.96) than the non-shuffling
    # variant (around 0.86).

    digits = load_digits()
    X, y = digits.data[:800], digits.target[:800]
    model = SVC(C=10, gamma=0.005)
    n = len(y)

    cv = cval.KFold(n, 5, shuffle=False)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85)

    # Shuffling the data artificially breaks the dependency and hides the
    # overfitting of the model with regards to the writing style of the authors
    # by yielding a seriously overestimated score:

    cv = cval.KFold(n, 5, shuffle=True, random_state=0)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    cv = cval.KFold(n, 5, shuffle=True, random_state=1)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    # Similarly, StratifiedKFold should try to shuffle the data as little
    # as possible (while respecting the balanced class constraints)
    # and thus be able to detect the dependency by not overestimating
    # the CV score either. As the digits dataset is approximately balanced
    # the estimated mean score is close to the score measured with
    # non-shuffled KFold

    cv = cval.StratifiedKFold(y, 5)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85) 
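
When the grouping behind the dependency is actually known (here, the authors), the principled fix is grouped cross-validation rather than simply avoiding shuffling. A minimal sketch, assuming a hypothetical authors array with one group label per sample (GroupKFold requires sklearn >= 0.18; 0.17 offered LabelKFold):

# Hedged sketch: grouped CV keeps every sample of one author in a single fold.
from sklearn.model_selection import GroupKFold, cross_val_score

scores = cross_val_score(model, X, y, groups=authors,  # authors: hypothetical group labels
                         cv=GroupKFold(n_splits=5))
print scores.mean()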
Example 80
Project: deep-mlsa   Author: spinningbytes   File: fit_utils.py    Apache License 2.0 4 votes vote down vote up
def fit_model_cv(config_data, model, train_iterator, valid_iterator):
    assert train_iterator.type == 'loader'

    nb_epochs = config_data['nb_epochs']
    batch_size = config_data['batch_size']

    X_train = train_iterator.input_data
    y_train = train_iterator.output_data

    kf = KFold(n_folds=5, shuffle=True, n=len(X_train[0]))

    path = config_data['output_path']
    basename = config_data['output_basename']
    base_path = join(path, basename)
    opath = join(base_path, 'base_model.h5')
    model.save_weights(opath)

    appendices = []

    for i, (train, test) in enumerate(kf):
        model.load_weights(opath)

        input_train = [X[train] for X in X_train]
        output_train = [y[train] for y in y_train]

        input_valid = [X[test] for X in X_train]
        output_valid = [y[test] for y in y_train]

        appendix = '_{}'.format(i)
        callbacks = run_utils.get_callbacks(config_data, appendix=appendix)
        stored_model = True
        hist = model.fit(
            x=input_train,
            y=output_train,
            batch_size=batch_size,
            validation_data=(input_valid, output_valid),
            nb_epoch=nb_epochs,
            verbose=1,
            callbacks=callbacks,
            class_weight=run_utils.get_classweight(config_data)
        )

        appendices.append(appendix)

        weights_path = join(base_path, 'best_model{}.h5'.format(appendix))
        model.load_weights(weights_path)

        oline_test = run_utils.get_evaluation(config_data, model, train_iterator, basename, '')
        print(oline_test)

    return hist, stored_model, appendices
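
One version note on the fit() call above: nb_epoch is the Keras 1 keyword and was renamed to epochs in Keras 2 (which eventually dropped nb_epoch entirely). A minimal sketch of the same call under Keras 2:

# Hedged sketch: Keras 2 spelling of the fit() call in the example.
hist = model.fit(
    x=input_train,
    y=output_train,
    batch_size=batch_size,
    validation_data=(input_valid, output_valid),
    epochs=nb_epochs,
    verbose=1,
    callbacks=callbacks,
    class_weight=run_utils.get_classweight(config_data)
)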