Python sklearn.feature_selection.f_classif() Examples

The following are 30 code examples of sklearn.feature_selection.f_classif(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.feature_selection , or try the search function .
Example #1
Source File: GetMLPara.py    From dr_droid with Apache License 2.0 6 votes vote down vote up
def find_best_feature_selections(X,y):

    #select the best features usin different technique
    X_new = SelectKBest(chi2, k=80).fit_transform(X,y)
    X_new1 = SelectPercentile(chi2, percentile=20).fit_transform(X,y)

    X_new2 = SelectKBest(f_classif, k=80).fit_transform(X,y) #this one has the best performance
    X_new22 = SelectPercentile(f_classif, percentile=20).fit_transform(X,y)

    X_new3 = SelectKBest(f_classif, k=70).fit_transform(X,y)
    X_new4 = SelectKBest(f_classif, k=60).fit_transform(X,y)

    print (X_new.shape)
    #selection_parameters_for_classfier(X_new,y)
    #print (y.shape)
    train_and_test(X_new,y)
    train_and_test(X_new1,y)
    train_and_test(X_new2,y)
    train_and_test(X_new22,y)
    train_and_test(X_new3,y)
    train_and_test(X_new4,y)
    #X,y = _dataset_sample()

################################PARAMETER  Selected################################
#TODO some problem happens when using the parameter max_leaf_nodes in Dtree and RandomForest 
Example #2
Source File: FeatureSelector.py    From FAE with GNU General Public License v3.0 6 votes vote down vote up
def GetSelectedFeatureIndex(self, data_container):
        data = data_container.GetArray()
        data /= np.linalg.norm(data, ord=2, axis=0)
        label = data_container.GetLabel()

        if data.shape[1] < self.GetSelectedFeatureNumber():
            print(
                'ANOVA: The number of features {:d} in data container is smaller than the required number {:d}'.format(
                    data.shape[1], self.GetSelectedFeatureNumber()))
            self.SetSelectedFeatureNumber(data.shape[1])

        fs = SelectKBest(f_classif, k=self.GetSelectedFeatureNumber())
        fs.fit(data, label)
        feature_index = fs.get_support(True)
        f_value, p_value = f_classif(data, label)
        return feature_index.tolist(), f_value, p_value 
Example #3
Source File: test_dft.py    From pyts with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _compute_expected_results(X, y=None, n_coefs=None, drop_sum=False,
                              anova=False, norm_mean=False, norm_std=False):
    """Compute the expected results."""
    X = np.asarray(X)
    if norm_mean:
        X -= X.mean(axis=1)[:, None]
    if norm_std:
        X /= X.std(axis=1)[:, None]
    X_fft = np.fft.rfft(X)
    X_fft = np.vstack([np.real(X_fft), np.imag(X_fft)])
    X_fft = X_fft.reshape(n_samples, -1, order='F')
    if drop_sum:
        X_fft = X_fft[:, 2:-1]
    else:
        X_fft = np.hstack([X_fft[:, :1], X_fft[:, 2:-1]])
    if n_coefs is None:
        return X_fft
    else:
        if anova:
            _, p = f_classif(X_fft, y)
            support = np.argsort(p)[:n_coefs]
            return X_fft[:, support]
        else:
            return X_fft[:, :n_coefs] 
Example #4
Source File: test_base.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_clone():
    # Tests that clone creates a correct deep copy.
    # We create an estimator, make a copy of its original state
    # (which, in this case, is the current state of the estimator),
    # and check that the obtained copy is a correct deep copy.

    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    new_selector = clone(selector)
    assert selector is not new_selector
    assert_equal(selector.get_params(), new_selector.get_params())

    selector = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    new_selector = clone(selector)
    assert selector is not new_selector 
Example #5
Source File: test_base.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_clone():
    # Tests that clone creates a correct deep copy.
    # We create an estimator, make a copy of its original state
    # (which, in this case, is the current state of the estimator),
    # and check that the obtained copy is a correct deep copy.

    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    new_selector = clone(selector)
    assert_true(selector is not new_selector)
    assert_equal(selector.get_params(), new_selector.get_params())

    selector = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    new_selector = clone(selector)
    assert_true(selector is not new_selector) 
Example #6
Source File: dft.py    From pyts with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _anova(self, X_fft, y, n_coefs, n_timestamps):
        if n_coefs < X_fft.shape[1]:
            non_constant = np.where(
                ~np.isclose(X_fft.var(axis=0), np.zeros_like(X_fft.shape[1]))
            )[0]
            if non_constant.size == 0:
                raise ValueError("All the Fourier coefficients are constant. "
                                 "Your input data is weirdly homogeneous.")
            elif non_constant.size < n_coefs:
                warn("The number of non constant Fourier coefficients ({0}) "
                     "is lower than the number of coefficients to keep ({1}). "
                     "The number of coefficients to keep is truncated to {2}"
                     ".".format(non_constant.size, n_coefs, non_constant.size))
                support = non_constant
            else:
                _, p = f_classif(X_fft[:, non_constant], y)
                support = non_constant[np.argsort(p)[:n_coefs]]
        else:
            support = np.arange(n_coefs)
        return support 
Example #7
Source File: f_score.py    From scikit-feature with GNU General Public License v2.0 6 votes vote down vote up
def f_score(X, y):
    """
    This function implements the anova f_value feature selection (existing method for classification in scikit-learn),
    where f_score = sum((ni/(c-1))*(mean_i - mean)^2)/((1/(n - c))*sum((ni-1)*std_i^2))

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y : {numpy array},shape (n_samples,)
        input class labels

    Output
    ------
    F: {numpy array}, shape (n_features,)
        f-score for each feature
    """

    F, pval = f_classif(X, y)
    return F 
Example #8
Source File: export_tests.py    From tpot with GNU Lesser General Public License v3.0 6 votes vote down vote up
def test_export_pipeline():
    """Assert that exported_pipeline() generated a compile source file as expected given a fixed pipeline."""

    pipeline_string = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5),SelectPercentile(input_matrix, SelectPercentile__percentile=20))'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
    )

    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)),
        SelectPercentile(score_func=f_classif, percentile=20)
    ),
    KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) 
Example #9
Source File: dominance.py    From dominance-analysis with MIT License 6 votes vote down vote up
def get_top_k(self):
		columns=list(self.data.columns.values)
		columns.remove(self.target)
		# remove intercept from top_k
		if(self.objective):
			top_k_vars=SelectKBest(f_regression, k=self.top_k)
			top_k_vars.fit_transform(self.data[columns], self.data[self.target])
		else:
			columns.remove('intercept')
			try:
				top_k_vars=SelectKBest(chi2, k=self.top_k)
				top_k_vars.fit_transform(self.data[columns], self.data[self.target])
			except:
				top_k_vars=SelectKBest(f_classif, k=self.top_k)
				top_k_vars.fit_transform(self.data[columns], self.data[self.target])
		return [columns[i] for i in top_k_vars.get_support(indices=True)] 
Example #10
Source File: scores.py    From SecuML with GNU General Public License v2.0 5 votes vote down vote up
def compute_scoring_func(self, func):
        if func == 'variance':
            features = self.instances.features.get_values()
            annotations = self.instances.annotations.get_labels()
            if isinstance(features, spmatrix):
                variance = mean_variance_axis(features, axis=0)[1]
            else:
                variance = features.var(axis=0)
            return variance, None

        features = self.annotated_instances.features.get_values()
        annotations = self.annotated_instances.annotations.get_supervision(
                                                               self.multiclass)
        if func == 'f_classif':
            return f_classif(features, annotations)
        elif func == 'mutual_info_classif':
            if isinstance(features, spmatrix):
                discrete_indexes = True
            else:
                features_types = self.instances.features.info.types
                discrete_indexes = [i for i, t in enumerate(features_types)
                                    if t == FeatureType.binary]
                if not discrete_indexes:
                    discrete_indexes = False
            return (mutual_info_classif(features, annotations,
                                        discrete_features=discrete_indexes),
                    None)
        elif func == 'chi2':
            return chi2(features, annotations)
        else:
            assert(False) 
Example #11
Source File: GetMLPara.py    From dr_droid with Apache License 2.0 5 votes vote down vote up
def my_get_fp_fn_CV(X_original,y):

    #generate classfiers
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', n_neighbors=5, p=2, weights='uniform')

    #decision tree
    dtree = DecisionTreeClassifier( criterion='gini', min_samples_leaf=4, min_samples_split=2, random_state=None, splitter='best')

    #naive
    #nbbern = BernoulliNB()

    #random forest
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None, max_features='auto',  min_samples_leaf=1, min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False, random_state=3)

    #svm
    svmrbf= svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,  kernel='rbf', max_iter=-1, probability=True, random_state=None,
shrinking=True, tol=0.001, verbose=False)

    #reduce the size
    #X = SelectKBest(f_classif, k=80).fit_transform(X_original,y)
    skb = SelectKBest(f_classif, k=80).fit(X_original,y)
    X = skb.fit_transform(X_original,y)

    print ("KNN")
    my_get_fp_fn_inter(knn,X,y)
    print ("DTree")
    my_get_fp_fn_inter(dtree,X,y)
    print ("rforest")
    my_get_fp_fn_inter(rforest,X,y)
    #print ("naive bayes")
    #my_get_fp_fn_inter(nbbern,X,y)
    print ("SVMrbf")
    my_get_fp_fn_inter(svmrbf,X,y) 
Example #12
Source File: GetMLPara.py    From dr_droid with Apache License 2.0 5 votes vote down vote up
def precision_recall_curve_draw(X_o,y):

    X = SelectKBest(f_classif, k=80).fit_transform(X_o,y)
    print (X.shape)
    print (y.shape)

    svmrbf= svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,  kernel='rbf', max_iter=-1, probability=True, random_state=None,
shrinking=True, tol=0.001, verbose=False)
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', n_neighbors=5, p=2, weights='uniform')

    dtree = DecisionTreeClassifier( criterion='gini', min_samples_leaf=1, min_samples_split=2, random_state=None, splitter='best')

    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None, max_features='auto',  min_samples_leaf=1, min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False, random_state=3)

    p_svmrbf, r_svmrbf, auc_svmrbf = get_my_pecision_recall(svmrbf,X,y)


    p_knn, r_knn, auc_knn = get_my_pecision_recall(knn, X, y)
    p_dtree, r_dtree, auc_dtree = get_my_pecision_recall(dtree, X, y)
    p_rforest, r_rforest, auc_rforest = get_my_pecision_recall(rforest, X, y)

    plt.clf()
    plt.plot(r_svmrbf,p_svmrbf, 'y.--', label ='SVM auc=%0.3f'% auc_svmrbf)
    plt.plot(r_knn, p_knn, 'r^--', label='KNN auc=%0.3f' %auc_knn)
    plt.plot(r_dtree, p_dtree, 'b>--', label ='Decision Tree auc=%0.3f'% auc_dtree)
    plt.plot(r_rforest, p_rforest, 'go--', label ='Random Forest auc=%0.3f'% auc_rforest)

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('recall rate')
    plt.ylabel('precision rate')
    plt.title('precision-recall curve')
    plt.legend(loc="lower right")
    plt.show()

    del X
    del y
###############################################Examples to show the difference of features representation ################################## 
Example #13
Source File: utilanalisis.py    From pghumor with Apache License 2.0 5 votes vote down vote up
def f_score_feature_selection(features, clases, nombres_features_ordenadas):
    print("Realizando f-score feature selection")
    f_score = feature_selection.f_classif(features, clases)

    imprimir_importancias(f_score, "f-score", nombres_features_ordenadas) 
Example #14
Source File: test_feature_selection.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_f_classif(self):
        diabetes = datasets.load_diabetes()
        df = pdml.ModelFrame(diabetes)

        result = df.feature_selection.f_classif()
        expected = fs.f_classif(diabetes.data, diabetes.target)

        self.assertEqual(len(result), 2)
        tm.assert_numpy_array_equal(result[0], expected[0])
        tm.assert_numpy_array_equal(result[1], expected[1]) 
Example #15
Source File: PipeTasks.py    From ProFET with GNU General Public License v3.0 5 votes vote down vote up
def PlotPerfPercentFeatures(X,y,est=LinearSVC()):
    '''
    Performance of a classifier (default: SVM-Anova)
    varying the percentile of features selected (F-test) .

    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py

    See Also: (Similar but with model seelction from among classifiers):
    http://nbviewer.ipython.org/github/bugra/pydata-nyc-2014/blob/master/6.%20Scikit%20Learn%20-%20Model%20Selection.ipynb

    '''
    transform = SelectPercentile(f_classif)

    clf = Pipeline([('anova', transform), ('est', est)])
    ###############################################################################
    # Plot the cross-validation score as a function of percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (1,2,3,5,7,10,13,15,20,25,33,50,65,75,90, 99)
    # percentiles = (1,5,10,25,50,75,90)

    for percentile in percentiles:
        # print(percentile)
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, X, y,cv=StratifiedShuffleSplit(y, n_iter=7, test_size=0.3), n_jobs=-1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())
    print("Outputting Graph:")

    plt.errorbar(percentiles, score_means, np.array(score_stds))

    plt.title(
        'Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show() 
Example #16
Source File: preprocessing.py    From sk-dist with Apache License 2.0 5 votes vote down vote up
def __init__(self, selector="fpr", score_func=feature_selection.f_classif, threshold=0.05):
        self.selector = selector
        self.score_func = score_func
        self.threshold = threshold 
Example #17
Source File: scores.py    From SecuML with GNU General Public License v2.0 5 votes vote down vote up
def _set_scoring_func(self):
        self.scoring_func = [('variance', False)]
        if self.annotated_instances.num_instances() > 0:
            self.scoring_func.append(('f_classif', True))
            self.scoring_func.append(('mutual_info_classif', False))
            if self.instances.features.all_positives():
                self.scoring_func.append(('chi2', True)) 
Example #18
Source File: relation.py    From visualize_ML with MIT License 5 votes vote down vote up
def evaluate_anova(x,y):
    F_value,pvalue = f_classif(x,y)
    return F_value,pvalue

# In descriptive statistics, a box plot or boxplot is a convenient way of graphically depicting groups of numerical data through their quartiles. Box plots may also have lines extending vertically from the boxes (whiskers) indicating variability outside the upper and lower quartiles, hence the terms box-and-whisker plot and box-and-whisker diagram.
# Quartile: In descriptive statistics, the quartiles of a ranked set of data values are the three points that divide the data set into four equal groups, each group comprising a quarter of the data 
Example #19
Source File: test_pipeline.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y) 
Example #20
Source File: test_base.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_clone_2():
    # Tests that clone doesn't copy everything.
    # We first create an estimator, give it an own attribute, and
    # make a copy of its original state. Then we check that the copy doesn't
    # have the specific attribute we manually added to the initial estimator.

    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    selector.own_attribute = "test"
    new_selector = clone(selector)
    assert_false(hasattr(new_selector, "own_attribute")) 
Example #21
Source File: test_randomized_l1.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_randomized_logistic():
    # Check randomized sparse logistic regression
    iris = load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    X = X[y != 2]
    y = y[y != 2]

    F, _ = f_classif(X, y)

    scaling = 0.3
    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    X_orig = X.copy()
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(X, X_orig)   # fit does not modify X
    assert_array_equal(np.argsort(F), np.argsort(feature_scores))

    clf = RandomizedLogisticRegression(verbose=False, C=[1., 0.5],
                                       random_state=42, scaling=scaling,
                                       n_resampling=50, tol=1e-3)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(np.argsort(F), np.argsort(feature_scores))

    clf = RandomizedLogisticRegression(verbose=False, C=[[1., 0.5]])
    assert_raises(ValueError, clf.fit, X, y) 
Example #22
Source File: test_randomized_l1.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_randomized_logistic_sparse():
    # Check randomized sparse logistic regression on sparse data
    iris = load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    X = X[y != 2]
    y = y[y != 2]

    # center here because sparse matrices are usually not centered
    # labels should not be centered
    X, _, _, _, _ = _preprocess_data(X, y, True, True)

    X_sp = sparse.csr_matrix(X)

    F, _ = f_classif(X, y)

    scaling = 0.3
    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    feature_scores = clf.fit(X, y).scores_
    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    feature_scores_sp = clf.fit(X_sp, y).scores_
    assert_array_equal(feature_scores, feature_scores_sp) 
Example #23
Source File: feature_selection.py    From dataiku-contrib with Apache License 2.0 5 votes vote down vote up
def univariate_feature_selection(mode,predictors,target):
    
    if mode == 'f_regression':
        fselect = SelectPercentile(f_regression, 100)
        
    if mode == 'f_classif':
        fselect = SelectPercentile(f_classif, 100)
        
    if mode == 'chi2':
        fselect = SelectPercentile(chi2, 100)
        
    fselect.fit_transform(predictors, target)
    
    return fselect.pvalues_ 
Example #24
Source File: models.py    From aletheia with MIT License 5 votes vote down vote up
def fit(self, X, y):
        
        self.selector = SelectKBest(f_classif, k=self.max_features)
        self.selector.fit(X, y)

        X_train=self.selector.transform(X)
        y_train=y

        param_list=[]
        idx = range(len(y_train))
        for i in range(self.n_estimators):
            random.shuffle(idx)
            param_list.append((X_train[idx[:self.max_samples]], 
                               y_train[idx[:self.max_samples]]))

        pool = ThreadPool(cpu_count())
        self.clf_list = pool.map(self._prepare_classifier, param_list)
        pool.close()
        pool.join()

        """
        X2=[]
        for clf in self.clf_list:
            P=clf.predict_proba(X_train)
            if len(X2)==0:
                X2=P[:, 0]
            else:
                X2=numpy.vstack((X2, P[:, 0]))
        X2=numpy.swapaxes(X2, 0, 1)
        print "X2:", X2.shape

        from sklearn.ensemble import RandomForestClassifier
        self.clf2=RandomForestClassifier(n_estimators=100)
        self.clf2.fit(X2, y_train)
        """ 
Example #25
Source File: test_sfa.py    From pyts with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _compute_expected_results(X, y=None, n_coefs=None, n_bins=4,
                              strategy='quantile', drop_sum=False, anova=False,
                              norm_mean=False, norm_std=False, alphabet=None):
    """Compute the expected results."""
    X = np.asarray(X)
    if norm_mean:
        X -= X.mean(axis=1)[:, None]
    if norm_std:
        X /= X.std(axis=1)[:, None]
    X_fft = np.fft.rfft(X)
    X_fft = np.vstack([np.real(X_fft), np.imag(X_fft)])
    X_fft = X_fft.reshape(n_samples, -1, order='F')
    if drop_sum:
        X_fft = X_fft[:, 2:-1]
    else:
        X_fft = np.hstack([X_fft[:, :1], X_fft[:, 2:-1]])
    if n_coefs is not None:
        if anova:
            _, p = f_classif(X_fft, y)
            support = np.argsort(p)[:n_coefs]
            X_fft = X_fft[:, support]
        else:
            X_fft = X_fft[:, :n_coefs]

    mcb = MultipleCoefficientBinning(n_bins=n_bins, strategy=strategy,
                                     alphabet=alphabet)
    arr_desired = mcb.fit_transform(X_fft)
    return arr_desired 
Example #26
Source File: FieldSelector.py    From Splunking-Crime with GNU Affero General Public License v3.0 5 votes vote down vote up
def __init__(self, options):
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}),
            floats=['param'],
            strs=['type', 'mode'],
            aliases={'type': 'score_func'},
        )

        if 'score_func' not in out_params:
            out_params['score_func'] = f_classif
        else:
            if out_params['score_func'].lower() == 'categorical':
                out_params['score_func'] = f_classif
            elif out_params['score_func'].lower() in ['numerical', 'numeric']:
                out_params['score_func'] = f_regression
            else:
                raise RuntimeError('type can either be categorical or numeric.')

        if 'mode' in out_params:
            if out_params['mode'] not in ('k_best', 'fpr', 'fdr', 'fwe', 'percentile'):
                raise RuntimeError('mode can only be one of the following: fdr, fpr, fwe, k_best, and percentile')

            if out_params['mode'] in ['fpr', 'fdr', 'fwe']:
                if 'param' in out_params:
                    if not 0 < out_params['param'] < 1:
                        msg = 'Invalid param value for mode {}: param must be between 0 and 1.'.format(out_params['mode'])
                        raise ValueError(msg)

        # k_best and percentile require integer param
        if 'param' in out_params and out_params.get('mode') not in ['fdr', 'fpr', 'fwe']:
            original_value = out_params['param']
            out_params['param'] = int(out_params['param'])
            if out_params['param'] != original_value:
                msg = 'param value {} is not an integer; mode={} requires an integer.'
                msg = msg.format(original_value, out_params.get('mode', 'percentile'))
                raise ValueError(msg)

        self.estimator = GenericUnivariateSelect(**out_params) 
Example #27
Source File: FieldSelector.py    From Splunking-Crime with GNU Affero General Public License v3.0 5 votes vote down vote up
def decode(cls, obj):
        from sklearn.feature_selection import f_classif, f_regression, GenericUnivariateSelect

        new_obj = GenericUnivariateSelect.__new__(GenericUnivariateSelect)
        new_obj.__dict__ = obj['dict']

        if new_obj.score_func == 'f_classif':
            new_obj.score_func = f_classif
        elif new_obj.score_func == 'f_regression':
            new_obj.score_func = f_regression
        else:
            raise ValueError('Unsupported GenericUnivariateSelect.score_func "%s"' % new_obj.score_func)

        return new_obj 
Example #28
Source File: export_tests.py    From tpot with GNU Lesser General Public License v3.0 5 votes vote down vote up
def test_pipeline_score_save():
    """Assert that the TPOTClassifier can generate a scored pipeline export correctly."""
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    pipeline_string = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.929813743
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert_equal(expected_code, export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset, pipeline_score=0.929813743)) 
Example #29
Source File: export_tests.py    From tpot with GNU Lesser General Public License v3.0 5 votes vote down vote up
def test_operator_export():
    """Assert that a TPOT operator can export properly with a callable function as a parameter."""
    assert list(TPOTSelectPercentile.arg_types) == TPOTSelectPercentile_args
    export_string = TPOTSelectPercentile.export(5)
    assert export_string == "SelectPercentile(score_func=f_classif, percentile=5)" 
Example #30
Source File: test_pipeline.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)