Python sklearn.feature_selection.SelectKBest() Examples

The following are 30 code examples of sklearn.feature_selection.SelectKBest(), collected from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module sklearn.feature_selection.
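Before the project examples, here is a minimal, self-contained sketch of the typical SelectKBest workflow (fit on labeled data, then transform or inspect the scores); the dataset and k value below are illustrative only:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2

X, y = load_iris(return_X_y=True)
selector = SelectKBest(chi2, k=2)           # keep the 2 highest-scoring features
X_reduced = selector.fit_transform(X, y)    # shape (150, 4) -> (150, 2)
print(selector.get_support(indices=True))   # column indices that were kept
print(selector.scores_)                     # chi-square score of each original feature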
Example #1
Source File: DataAnalysis.py    From Predicting-Health-Insurance-Cost with BSD 3-Clause "New" or "Revised" License
def featuresFromFeatureSelection(X, Y, columnNames):
    for f in columnNames:
        print(f)
    # fit the selector only (despite the original variable name, no transform
    # is applied here); scores_ then holds the chi-square statistic per feature
    selector = SelectKBest(chi2, k=34).fit(X, Y)
    colors = getColorNames()

    scores_scaled = np.divide(selector.scores_, 1000)

    for counter, score in enumerate(scores_scaled):
        print('{:>34}  '.format(score))
        # plot one bar per feature
        plt.bar(counter, score, color=colors[counter])

    plt.ylabel('Scores (1k)')
    plt.title('Scores calculated by Chi-Square Test')
    plt.legend(columnNames, bbox_to_anchor=(0., 0.8, 1., .102), loc=3,
               ncol=5, mode="expand", borderaxespad=0.)
    plt.show()
Example #2
Source File: tester.py    From Text-Classification-Benchmark with MIT License
def feature_select(corpus, labels, k=1000):
    """
    select top k features through chi-square test
    """
    bin_cv = CountVectorizer(binary=True)
    le = LabelEncoder()
    X = bin_cv.fit_transform(corpus)
    y = le.fit_transform(labels).reshape(-1, 1)

    k = min(X.shape[1], k)
    skb = SelectKBest(chi2, k=k)
    skb.fit(X, y)

    feature_ids = skb.get_support(indices=True)
    feature_names = bin_cv.get_feature_names()  # renamed to get_feature_names_out() in scikit-learn 1.0+
    vocab = {}

    for new_fid, old_fid in enumerate(feature_ids):
        feature_name = feature_names[old_fid]
        vocab[feature_name] = new_fid

    # we only care about the final extracted feature vocabulary
    return vocab 
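A hedged usage sketch of feature_select above, with an invented toy corpus; this assumes the imports used by tester.py (CountVectorizer, LabelEncoder, SelectKBest, chi2) are in scope:

corpus = ['buy cheap pills now', 'meeting at noon', 'cheap pills cheap', 'lunch meeting today']
labels = ['spam', 'ham', 'spam', 'ham']
vocab = feature_select(corpus, labels, k=5)   # k is capped at the vocabulary size
print(sorted(vocab, key=vocab.get))           # selected terms, in new-index order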
Example #3
Source File: train.py    From skorch with BSD 3-Clause "New" or "Revised" License
def get_model(with_pipeline=False):
    """Get a multi-layer perceptron model.

    Optionally, put it in a pipeline that scales the data.

    """
    model = NeuralNetClassifier(MLPClassifier)
    if with_pipeline:
        model = Pipeline([
            ('scale', FeatureUnion([
                ('minmax', MinMaxScaler()),
                ('normalize', Normalizer()),
            ])),
            ('select', SelectKBest(k=N_FEATURES)),  # keep input size constant
            ('net', model),
        ])
    return model 
Example #4
Source File: Model_Parameters_CV.py    From ProFET with GNU General Public License v3.0
def ReducedFeaturesDF(X,y):
        '''
        Returns a dataframe with only a subset of features/columns retained
        '''
        from sklearn.feature_selection import RFE
        est = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, class_weight='balanced')  # 'l2'/'auto' were renamed in later scikit-learn releases
#        selectK = SelectKBest(score_func = f_classif, k=45)
        selectRFE = RFE(estimator=est, n_features_to_select=22, step=0.15)
        selectK=selectRFE

        selectK.fit(X,y)
        selectK_mask = selectK.get_support()
        # feature_names and filename are module-level names in the original script
        K_featnames = feature_names[selectK_mask]
        print("reduced RFE features:")
        print(K_featnames)
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
#        Reduced_df.to_csv('REDUCED_Feat.csv')
        return Reduced_df

#    ReducedFeaturesDF(X,y)
    # z=pd.DataFrame(data=X_SGD,index=y)
    # z.to_csv('REDUCED_Feat.csv') 
Example #5
Source File: FeatureSelector.py    From FAE with GNU General Public License v3.0
def GetSelectedFeatureIndex(self, data_container):
        data = data_container.GetArray()
        data /= np.linalg.norm(data, ord=2, axis=0)
        label = data_container.GetLabel()

        if data.shape[1] < self.GetSelectedFeatureNumber():
            print(
                'ANOVA: The number of features {:d} in data container is smaller than the required number {:d}'.format(
                    data.shape[1], self.GetSelectedFeatureNumber()))
            self.SetSelectedFeatureNumber(data.shape[1])

        fs = SelectKBest(f_classif, k=self.GetSelectedFeatureNumber())
        fs.fit(data, label)
        feature_index = fs.get_support(True)
        f_value, p_value = f_classif(data, label)
        return feature_index.tolist(), f_value, p_value 
Example #6
Source File: dominance.py    From dominance-analysis with MIT License
def get_top_k(self):
		columns=list(self.data.columns.values)
		columns.remove(self.target)
		# remove intercept from top_k
		if(self.objective):
			top_k_vars=SelectKBest(f_regression, k=self.top_k)
			top_k_vars.fit_transform(self.data[columns], self.data[self.target])
		else:
			columns.remove('intercept')
			try:
				top_k_vars=SelectKBest(chi2, k=self.top_k)
				top_k_vars.fit_transform(self.data[columns], self.data[self.target])
			except ValueError:  # chi2 needs non-negative features; fall back to the ANOVA F-test
				top_k_vars=SelectKBest(f_classif, k=self.top_k)
				top_k_vars.fit_transform(self.data[columns], self.data[self.target])
		return [columns[i] for i in top_k_vars.get_support(indices=True)] 
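The try/except above exists because chi2 requires non-negative feature values and raises a ValueError otherwise, whereas f_classif (the ANOVA F-test) accepts any real values. A minimal sketch of the failure mode on made-up data:

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2, f_classif

X = np.array([[1.0, -0.5], [2.0, 0.3], [0.5, -1.2], [1.5, 0.8]])
y = np.array([0, 1, 0, 1])
try:
    SelectKBest(chi2, k=1).fit(X, y)        # fails: second column has negative entries
except ValueError as err:
    print('chi2 rejected the data:', err)
    SelectKBest(f_classif, k=1).fit(X, y)   # the F-test fallback succeeds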
Example #7
Source File: GetMLPara.py    From dr_droid with Apache License 2.0
def find_best_feature_selections(X,y):

    # select the best features using different techniques
    X_new = SelectKBest(chi2, k=80).fit_transform(X,y)
    X_new1 = SelectPercentile(chi2, percentile=20).fit_transform(X,y)

    X_new2 = SelectKBest(f_classif, k=80).fit_transform(X,y) #this one has the best performance
    X_new22 = SelectPercentile(f_classif, percentile=20).fit_transform(X,y)

    X_new3 = SelectKBest(f_classif, k=70).fit_transform(X,y)
    X_new4 = SelectKBest(f_classif, k=60).fit_transform(X,y)

    print (X_new.shape)
    #selection_parameters_for_classfier(X_new,y)
    #print (y.shape)
    train_and_test(X_new,y)
    train_and_test(X_new1,y)
    train_and_test(X_new2,y)
    train_and_test(X_new22,y)
    train_and_test(X_new3,y)
    train_and_test(X_new4,y)
    #X,y = _dataset_sample()

################################PARAMETER  Selected################################
#TODO some problem happens when using the parameter max_leaf_nodes in Dtree and RandomForest 
Example #8
Source File: test_core_pipeline.py    From lale with Apache License 2.0
def test_export_to_sklearn_pipeline3(self):
        from lale.lib.lale import ConcatFeatures
        from lale.lib.sklearn import PCA
        from lale.lib.sklearn import KNeighborsClassifier, LogisticRegression, SVC 
        from sklearn.feature_selection import SelectKBest
        from lale.lib.sklearn import Nystroem
        from sklearn.pipeline import FeatureUnion

        lale_pipeline = (
            (PCA() >> SelectKBest(k=2))
            & (Nystroem(random_state=42) >> SelectKBest(k=3))
            & SelectKBest(k=3)
        ) >> ConcatFeatures() >> SelectKBest(k=2) >> LogisticRegression()
        trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
        sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()
        self.assertIsInstance(sklearn_pipeline.named_steps['featureunion'], FeatureUnion)
        self.assertIsInstance(sklearn_pipeline.named_steps['selectkbest'], SelectKBest)
        from sklearn.linear_model import LogisticRegression
        self.assertIsInstance(sklearn_pipeline.named_steps['logisticregression'], LogisticRegression)
        self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline) 
Example #9
Source File: test_feature_selection.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_objectmapper(self):
        df = pdml.ModelFrame([])
        self.assertIs(df.feature_selection.GenericUnivariateSelect,
                      fs.GenericUnivariateSelect)
        self.assertIs(df.feature_selection.SelectPercentile,
                      fs.SelectPercentile)
        self.assertIs(df.feature_selection.SelectKBest, fs.SelectKBest)
        self.assertIs(df.feature_selection.SelectFpr, fs.SelectFpr)
        self.assertIs(df.feature_selection.SelectFromModel,
                      fs.SelectFromModel)
        self.assertIs(df.feature_selection.SelectFdr, fs.SelectFdr)
        self.assertIs(df.feature_selection.SelectFwe, fs.SelectFwe)
        self.assertIs(df.feature_selection.RFE, fs.RFE)
        self.assertIs(df.feature_selection.RFECV, fs.RFECV)
        self.assertIs(df.feature_selection.VarianceThreshold,
                      fs.VarianceThreshold) 
Example #10
Source File: test_base.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_pipeline(self):
        from sklearn.feature_selection import SelectKBest
        from sklearn.feature_selection import f_regression
        from sklearn.pipeline import Pipeline

        diabetes = datasets.load_diabetes()
        models = ['OLS', 'GLS', 'WLS', 'GLSAR', 'QuantReg', 'GLM', 'RLM']

        for model in models:
            klass = getattr(sm, model)

            selector = SelectKBest(f_regression, k=5)
            estimator = Pipeline([('selector', selector),
                                  ('reg', base.StatsModelsRegressor(klass))])

            estimator.fit(diabetes.data, diabetes.target)
            result = estimator.predict(diabetes.data)

            data = SelectKBest(f_regression, k=5).fit_transform(diabetes.data, diabetes.target)
            expected = klass(diabetes.target, data).fit().predict(data)
            self.assert_numpy_array_almost_equal(result, expected) 
Example #11
Source File: PipeTasks.py    From ProFET with GNU General Public License v3.0
def GetKFeatures(filename, method='RFE',kbest=30,alpha=0.01, reduceMatrix = True):
    '''
    Gets best features using chosen method
    (K-best, RFE, RFECV,'L1' (RandomizedLogisticRegression),'Tree' (ExtraTreesClassifier), mrmr),
    then prints top K features' names (from featNames).
    If reduceMatrix =  True, then also returns X reduced to the K best features.

    Available methods' names are: 'RFE', 'RFECV', 'RandomizedLogisticRegression', 'K-best', 'ExtraTreesClassifier'.
    Note that effectively any scikit-learn selection method could be used, if correctly imported.
    '''
    #est = method()
    '''
    Gets the K-best features (filtered by FDR, then select best ranked by t-test , more advanced options can be implemented).
    Save the data/matrix with the resulting/kept features to a new output file, "REDUCED_Feat.csv"
    '''
    features, labels, lb_encoder,featureNames = load_data(filename)
    X, y = features, labels

    # change the names as ints back to strings
    class_names=lb_encoder.inverse_transform(y)
    print("Data and labels imported. PreFilter Feature matrix shape:")
    print(X.shape)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X,y)
    selectK_mask=selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print('X shape (selector fitted; X itself not reduced here):', X.shape)
    print("K_featnames: %s" %(K_featnames))
    if reduceMatrix:
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
        Reduced_df.to_csv('REDUCED_Feat.csv')
        print('Saved to REDUCED_Feat.csv')
        return Reduced_df

#WORKS! But unreadable with too many features! 
Example #12
Source File: Model_trainer.py    From ProFET with GNU General Public License v3.0
def featureFitting(filename, X, y, featureNames,optimalFlag, kbest=20, alpha=0.05, model=None):
    '''
    Gets the K-best features (filtered by FDR, then select best ranked by t-test, more advanced options can be implemented).
    Save the data/matrix with the resulting/kept features to a new output file, "REDUCED_Feat.csv"
    Returns the reduced feature DataFrame, the fitted FDR selector, and the fitted K-best selector
    '''
    FD = SelectFdr(alpha=alpha)
    X = FD.fit_transform(X, y)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X,y)
    selectK_mask=selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print("K_featnames: %s" %(K_featnames))
    Reduced_df = pd.read_csv(filename, index_col=0)
    Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
    Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df, FD, selectK 
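For context, a self-contained sketch of the same two-stage idea (an FDR filter followed by a K-best cut) on synthetic data; the sizes and thresholds here are illustrative, not ProFET's defaults:

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFdr, SelectKBest, f_classif

X, y = make_classification(n_samples=200, n_features=50, n_informative=8, random_state=0)
X_fdr = SelectFdr(f_classif, alpha=0.05).fit_transform(X, y)                        # stage 1: FDR filter
X_top = SelectKBest(f_classif, k=min(20, X_fdr.shape[1])).fit_transform(X_fdr, y)   # stage 2: top-k
print(X.shape, '->', X_fdr.shape, '->', X_top.shape)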
Example #13
Source File: model.py    From student-performance-prediction with MIT License
def train_and_score(X, y):
    X_train, X_test, y_train, y_test = split_data(X, y)

    clf = Pipeline([
        ('reduce_dim', SelectKBest(chi2, k=2)),
        ('train', LinearSVC(C=100))
    ])

    scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=2)
    print("Mean Model Accuracy:", np.array(scores).mean())

    clf.fit(X_train, y_train)

    confuse(y_test, clf.predict(X_test))
    print() 
Example #14
Source File: utils.py    From IoT-device-type-identification with MIT License
def perform_feature_selection(X_train, y_train, k_val):
    """ This method is used in order to perform a feature selection by selecting
    the best k_val features from X_train. It does so according to the chi2
    criterion. The method prints the chosen features and creates
    a new instance of X_train with only these features and returns it
    """
    print("**********FEATURE SELECTION**********")
    # Create and fit selector
    selector = SelectKBest(chi2, k=k_val)
    selector.fit(X_train, y_train)
    # Get indices of the columns to keep
    idxs_selected = selector.get_support(indices=True)
    print(idxs_selected)
    # reuse the fitted selector instead of fitting a second SelectKBest
    x_new = selector.transform(X_train)
    return x_new
Example #15
Source File: models.py    From IoT-device-type-identification with MIT License
def perform_feature_selection(X_train, y_train, k_val):
    """ This method is used in order to perform a feature selection by selecting
    the best k_val features from X_train. It does so according to the chi2
    criterion. The method prints the chosen features and creates
    a new instance of X_train with only these features and returns it 
    """
    print("**********FEATURE SELECTION**********")
    # Create and fit selector
    selector = SelectKBest(chi2, k=k_val)
    selector.fit(X_train, y_train)
    # Get indices of the columns to keep
    idxs_selected = selector.get_support(indices=True)
    print(idxs_selected)
    # reuse the fitted selector instead of fitting a second SelectKBest
    X_new = selector.transform(X_train)
    return X_new
Example #16
Source File: test_bagging.py    From twitter-stock-recommendation with MIT License
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
    assert_true(isinstance(estimator[0].steps[-1][1].random_state,
                           int)) 
Example #17
Source File: test_dict_vectorizer.py    From twitter-stock-recommendation with MIT License
def test_feature_selection():
    # make two feature dicts with two useful features and a bunch of useless
    # ones, in terms of chi2
    d1 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=1, useful2=20)
    d2 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=20, useful2=1)

    for indices in (True, False):
        v = DictVectorizer().fit([d1, d2])
        X = v.transform([d1, d2])
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        v.restrict(sel.get_support(indices=indices), indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"]) 
Example #18
Source File: test_chi2.py    From twitter-stock-recommendation with MIT License
def mkchi2(k):
    """Make k-best chi2 selector"""
    return SelectKBest(chi2, k=k) 
Example #19
Source File: test_pipeline.py    From twitter-stock-recommendation with MIT License
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y) 
Example #20
Source File: GetMLPara.py    From dr_droid with Apache License 2.0
def precision_recall_curve_draw(X_o,y):

    X = SelectKBest(f_classif, k=80).fit_transform(X_o,y)
    print (X.shape)
    print (y.shape)

    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
                     kernel='rbf', max_iter=-1, probability=True, random_state=None,
                     shrinking=True, tol=0.001, verbose=False)
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', n_neighbors=5, p=2, weights='uniform')

    dtree = DecisionTreeClassifier( criterion='gini', min_samples_leaf=1, min_samples_split=2, random_state=None, splitter='best')

    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None, max_features='auto',  min_samples_leaf=1, min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False, random_state=3)

    p_svmrbf, r_svmrbf, auc_svmrbf = get_my_pecision_recall(svmrbf,X,y)


    p_knn, r_knn, auc_knn = get_my_pecision_recall(knn, X, y)
    p_dtree, r_dtree, auc_dtree = get_my_pecision_recall(dtree, X, y)
    p_rforest, r_rforest, auc_rforest = get_my_pecision_recall(rforest, X, y)

    plt.clf()
    plt.plot(r_svmrbf,p_svmrbf, 'y.--', label ='SVM auc=%0.3f'% auc_svmrbf)
    plt.plot(r_knn, p_knn, 'r^--', label='KNN auc=%0.3f' %auc_knn)
    plt.plot(r_dtree, p_dtree, 'b>--', label ='Decision Tree auc=%0.3f'% auc_dtree)
    plt.plot(r_rforest, p_rforest, 'go--', label ='Random Forest auc=%0.3f'% auc_rforest)

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('recall rate')
    plt.ylabel('precision rate')
    plt.title('precision-recall curve')
    plt.legend(loc="lower right")
    plt.show()

    del X
    del y
###############################################Examples to show the difference of features representation ################################## 
Example #21
Source File: GetMLPara.py    From dr_droid with Apache License 2.0
def my_get_fp_fn_CV(X_original,y):

    #generate classfiers
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', n_neighbors=5, p=2, weights='uniform')

    #decision tree
    dtree = DecisionTreeClassifier( criterion='gini', min_samples_leaf=4, min_samples_split=2, random_state=None, splitter='best')

    #naive
    #nbbern = BernoulliNB()

    #random forest
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None, max_features='auto',  min_samples_leaf=1, min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False, random_state=3)

    #svm
    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
                     kernel='rbf', max_iter=-1, probability=True, random_state=None,
                     shrinking=True, tol=0.001, verbose=False)

    # reduce dimensionality: fit the selector once, then transform with it
    skb = SelectKBest(f_classif, k=80).fit(X_original, y)
    X = skb.transform(X_original)

    print ("KNN")
    my_get_fp_fn_inter(knn,X,y)
    print ("DTree")
    my_get_fp_fn_inter(dtree,X,y)
    print ("rforest")
    my_get_fp_fn_inter(rforest,X,y)
    #print ("naive bayes")
    #my_get_fp_fn_inter(nbbern,X,y)
    print ("SVMrbf")
    my_get_fp_fn_inter(svmrbf,X,y) 
Example #22
Source File: test_pipeline.py    From twitter-stock-recommendation with MIT License
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = PCA(n_components=2, svd_solver='randomized', random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7)) 
Example #23
Source File: test_pipeline.py    From twitter-stock-recommendation with MIT License
def test_classes_property():
    iris = load_iris()
    X = iris.data
    y = iris.target

    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    assert_raises(AttributeError, getattr, reg, "classes_")

    clf = make_pipeline(SelectKBest(k=1), LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y)) 
Example #24
Source File: kerasExperiments.py    From emailinsight with MIT License
def select_best_features(dataset, train_labels, num_best, verbose=True):
    (X_train, Y_train), (X_test, Y_test) = dataset
    if verbose:
        print('\nSelecting %d best features\n'%num_best)
    selector = SelectKBest(chi2, k=num_best)
    X_train = selector.fit_transform(X_train,train_labels)
    X_test = selector.transform(X_test)
    return ((X_train, Y_train), (X_test, Y_test)),selector.scores_ 
Example #25
Source File: FeatureSelector.py    From CDSS with GNU General Public License v3.0
def _select_K_best(self, k):
        if self._problem == FeatureSelector.CLASSIFICATION:
            score = f_classif
        else:
            score = f_regression

        self._selector = SelectKBest(score, k=k)  # k is keyword-only in recent scikit-learn releases
Example #26
Source File: FeatureSelector.py    From FAE with GNU General Public License v3.0
def GetSelectedFeatureIndex(self, data_container):
        data = data_container.GetArray()
        data /= np.linalg.norm(data, ord=2, axis=0)
        label = data_container.GetLabel()

        if data.shape[1] < self.GetSelectedFeatureNumber():
            print('KW: The number of features {:d} in data container is smaller than the required number {:d}'.format(
                data.shape[1], self.GetSelectedFeatureNumber()))
            self.SetSelectedFeatureNumber(data.shape[1])

        fs = SelectKBest(self.KruskalWallisAnalysis, k=self.GetSelectedFeatureNumber())
        fs.fit(data, label)
        feature_index = fs.get_support(True)
        self._f_value, self._p_value = self.KruskalWallisAnalysis(data, label)
        return feature_index.tolist() 
Example #27
Source File: __init__.py    From sklearn2pmml with GNU Affero General Public License v3.0
def test_fit(self):
		selector = SelectKBest(score_func = f_regression, k = 1)
		selector_proxy = SelectorProxy(selector)
		self.assertFalse(hasattr(selector_proxy, "support_mask_"))
		selector_proxy.fit(numpy.array([[0, 0], [1.0, 2.0]]), numpy.array([0.5, 1.0]))
		self.assertEqual([0, 1], selector._get_support_mask().tolist())
		self.assertEqual([0, 1], selector_proxy.support_mask_.tolist()) 
Example #28
Source File: test_core_pipeline.py    From lale with Apache License 2.0
def test_import_from_sklearn_pipeline_nested_pipeline1(self):
        from sklearn.pipeline import FeatureUnion, make_pipeline
        from sklearn.decomposition import PCA
        from sklearn.kernel_approximation import Nystroem
        from sklearn.feature_selection import SelectKBest
        from sklearn.neighbors import KNeighborsClassifier
        union = FeatureUnion([
            ("selectkbest_pca", make_pipeline(
                SelectKBest(k=3),
                FeatureUnion([
                    ('pca', PCA(n_components=1)),
                    ('nested_pipeline', make_pipeline(SelectKBest(k=2), Nystroem()))
                ]))),
            ("nys", Nystroem(n_components=2, random_state=42))
        ])
        sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
        lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
        self.assertEqual(len(lale_pipeline.edges()), 8)
        #These assertions assume topological sort, which may not be unique. So the assertions are brittle.
        from lale.lib.sklearn.pca import PCAImpl
        from lale.lib.sklearn.nystroem import NystroemImpl
        from lale.lib.lale.concat_features import ConcatFeaturesImpl
        from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
        from lale.lib.sklearn.select_k_best import SelectKBestImpl
        self.assertEqual(lale_pipeline.edges()[0][0]._impl_class(), SelectKBestImpl)
        self.assertEqual(lale_pipeline.edges()[0][1]._impl_class(), PCAImpl)
        self.assertEqual(lale_pipeline.edges()[1][0]._impl_class(), SelectKBestImpl)
        self.assertEqual(lale_pipeline.edges()[1][1]._impl_class(), SelectKBestImpl)
        self.assertEqual(lale_pipeline.edges()[2][0]._impl_class(), SelectKBestImpl)
        self.assertEqual(lale_pipeline.edges()[2][1]._impl_class(), NystroemImpl)
        self.assertEqual(lale_pipeline.edges()[3][0]._impl_class(), PCAImpl)
        self.assertEqual(lale_pipeline.edges()[3][1]._impl_class(), ConcatFeaturesImpl)
        self.assertEqual(lale_pipeline.edges()[4][0]._impl_class(), NystroemImpl)
        self.assertEqual(lale_pipeline.edges()[4][1]._impl_class(), ConcatFeaturesImpl)
        self.assertEqual(lale_pipeline.edges()[5][0]._impl_class(), ConcatFeaturesImpl)
        self.assertEqual(lale_pipeline.edges()[5][1]._impl_class(), ConcatFeaturesImpl)
        self.assertEqual(lale_pipeline.edges()[6][0]._impl_class(), NystroemImpl)
        self.assertEqual(lale_pipeline.edges()[6][1]._impl_class(), ConcatFeaturesImpl)
        self.assertEqual(lale_pipeline.edges()[7][0]._impl_class(), ConcatFeaturesImpl)
        self.assertEqual(lale_pipeline.edges()[7][1]._impl_class(), KNeighborsClassifierImpl)
        self.assert_equal_predictions(sklearn_pipeline, lale_pipeline) 
Example #29
Source File: test_pipeline.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_classes_property():
    iris = load_iris()
    X = iris.data
    y = iris.target

    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    assert_raises(AttributeError, getattr, reg, "classes_")

    clf = make_pipeline(SelectKBest(k=1), LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y)) 
Example #30
Source File: svm_classifier.py    From nlp-journey with Apache License 2.0
def __select_features(data_set):
        dataset = [clean_en_text(data) for data in data_set[0]]
        tf_idf_model = TfidfVectorizer(ngram_range=(1, 1),
                                       binary=True, 
                                       sublinear_tf=True)
        tf_vectors = tf_idf_model.fit_transform(dataset)

        # keep roughly the top sixth of the vocabulary as features
        k = int(tf_vectors.shape[1] / 6)
        chi_model = SelectKBest(chi2, k=k)
        chi_features = chi_model.fit_transform(tf_vectors, data_set[1])
        print('tf-idf:\t\t' + str(tf_vectors.shape[1]))
        print('chi:\t\t' + str(chi_features.shape[1]))

        return chi_features, tf_idf_model, chi_model
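Once fitted, the two returned models chain naturally at prediction time. A hedged sketch, assuming tf_idf_model and chi_model came from a call to the method above:

# transform unseen documents with the already-fitted vectorizer and selector
new_vectors = tf_idf_model.transform(['an unseen test document'])
new_features = chi_model.transform(new_vectors)   # same k columns as in training
print(new_features.shape)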