Python sklearn.feature_selection.f_regression() Examples

The following are 15 code examples of sklearn.feature_selection.f_regression(), drawn from open-source projects. The source file and project license are noted above each example. You may also want to check out the other functions and classes available in the sklearn.feature_selection module.
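As a quick reference before the examples: f_regression computes a univariate F-test between each feature and the target, and returns two arrays of length n_features, the F-statistics and their p-values. A minimal sketch on synthetic data (all names here are illustrative):

import numpy as np
from sklearn.feature_selection import f_regression

# Synthetic data: 100 samples, 3 features; only feature 0 drives the target.
rng = np.random.RandomState(0)
X = rng.normal(size=(100, 3))
y = 2.0 * X[:, 0] + rng.normal(size=100)

f_vals, p_vals = f_regression(X, y)
print(f_vals)  # one F-statistic per feature; feature 0 should dominate
print(p_vals)  # corresponding p-values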
Example #1
Source File: predict.py    From Loan_Default_Prediction with BSD 3-Clause "New" or "Revised" License
import math
import operator

from sklearn.feature_selection import f_regression

def getTopFeatures(train_x, train_y, n_features=100):
    f_val, p_val = f_regression(train_x, train_y)
    f_val_dict = {}
    p_val_dict = {}
    for i in range(len(f_val)):
        # f_regression can return NaN (e.g. for constant features); treat those as 0.
        if math.isnan(f_val[i]):
            f_val[i] = 0.0
        f_val_dict[i] = f_val[i]
        if math.isnan(p_val[i]):
            p_val[i] = 0.0
        p_val_dict[i] = p_val[i]

    # dict.iteritems() is Python 2 only; items() works in both Python 2 and 3.
    sorted_f = sorted(f_val_dict.items(), key=operator.itemgetter(1), reverse=True)
    sorted_p = sorted(p_val_dict.items(), key=operator.itemgetter(1), reverse=True)  # computed but unused

    # Indices of the n_features features with the largest F-values.
    feature_indexs = []
    for i in range(n_features):
        feature_indexs.append(sorted_f[i][0])

    return feature_indexs
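The sorted-dict bookkeeping above can be replaced by a single argsort; a sketch of an equivalent ranking (not part of the original project), assuming the same inputs:

import numpy as np
from sklearn.feature_selection import f_regression

def get_top_features(train_x, train_y, n_features=100):
    f_val, _ = f_regression(train_x, train_y)
    f_val = np.nan_to_num(f_val)  # treat NaN scores as 0.0, as above
    # Indices of the n_features largest F-values, best first.
    return np.argsort(f_val)[::-1][:n_features].tolist()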

Example #2
Source File: dominance.py    From dominance-analysis with MIT License
def get_top_k(self):
    # Requires: from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression
    columns = list(self.data.columns.values)
    columns.remove(self.target)
    if self.objective:
        # Regression objective: rank features by the univariate F-test.
        top_k_vars = SelectKBest(f_regression, k=self.top_k)
        top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    else:
        # Classification objective: drop the intercept from the top_k candidates.
        columns.remove('intercept')
        try:
            top_k_vars = SelectKBest(chi2, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
        except ValueError:
            # chi2 requires non-negative features; fall back to the ANOVA F-test.
            top_k_vars = SelectKBest(f_classif, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    return [columns[i] for i in top_k_vars.get_support(indices=True)]
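The try/except fallback exists because chi2 only accepts non-negative feature values; when the design matrix contains negatives, scikit-learn raises a ValueError and the classification path falls back to f_classif.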
Example #3
Source File: test_base.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_pipeline(self):
        from sklearn.feature_selection import SelectKBest
        from sklearn.feature_selection import f_regression
        from sklearn.pipeline import Pipeline

        diabetes = datasets.load_diabetes()
        models = ['OLS', 'GLS', 'WLS', 'GLSAR', 'QuantReg', 'GLM', 'RLM']

        for model in models:
            klass = getattr(sm, model)

            selector = SelectKBest(f_regression, k=5)
            estimator = Pipeline([('selector', selector),
                                  ('reg', base.StatsModelsRegressor(klass))])

            estimator.fit(diabetes.data, diabetes.target)
            result = estimator.predict(diabetes.data)

            data = SelectKBest(f_regression, k=5).fit_transform(diabetes.data, diabetes.target)
            expected = klass(diabetes.target, data).fit().predict(data)
            self.assert_numpy_array_almost_equal(result, expected) 
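The final assertion holds because SelectKBest is deterministic: fitting it standalone selects exactly the same five columns that the pipeline's selector passes to the statsmodels regressor, so both paths fit the same model on the same data.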
Example #4
Source File: feature_selection.py    From dataiku-contrib with Apache License 2.0
from sklearn.feature_selection import SelectPercentile, chi2, f_classif, f_regression

def univariate_feature_selection(mode, predictors, target):
    if mode == 'f_regression':
        fselect = SelectPercentile(f_regression, percentile=100)
    elif mode == 'f_classif':
        fselect = SelectPercentile(f_classif, percentile=100)
    elif mode == 'chi2':
        fselect = SelectPercentile(chi2, percentile=100)
    else:
        # The original fell through to a NameError on an unknown mode; fail explicitly instead.
        raise ValueError("mode must be one of 'f_regression', 'f_classif', 'chi2'")

    fselect.fit_transform(predictors, target)
    return fselect.pvalues_
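Note that percentile=100 keeps every feature, so the transform itself discards nothing; the SelectPercentile call is used purely as a convenient way to compute pvalues_ for all predictors.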
Example #5
Source File: model_recommendation.py    From DIVE-backend with GNU General Public License v3.0
def get_initial_regression_model_recommendation(project_id, dataset_id, dependent_variable_id=None, recommendation_type=MRT.LASSO.value, table_layout=MCT.LEAVE_ONE_OUT.value, data_size_cutoff=current_app.config['ANALYSIS_DATA_SIZE_CUTOFF'], categorical_value_limit=current_app.config['ANALYSIS_CATEGORICAL_VALUE_LIMIT']):
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    if len(df) > data_size_cutoff:
        df = df.sample(data_size_cutoff)
    field_properties = db_access.get_field_properties(project_id, dataset_id)
    quantitative_field_properties = [ fp for fp in field_properties if fp['general_type'] == 'q']

    dependent_variable = next((f for f in field_properties if f['id'] == dependent_variable_id), None) \
        if dependent_variable_id \
        else np.random.choice(quantitative_field_properties, size=1)[0]

    independent_variables = []
    for fp in field_properties:
        if (fp['name'] != dependent_variable['name']):
            if (fp['general_type'] == 'c' and (fp['is_unique'] or len(fp['unique_values']) > categorical_value_limit)):
                continue
            independent_variables.append(fp)

    recommendationTypeToFunction = {
        MRT.FORWARD_R2.value: forward_r2,
        MRT.LASSO.value: lasso,
        MRT.RFE.value: recursive_feature_elimination,
        MRT.FORWARD_F.value: f_regression
    }

    result = recommendationTypeToFunction[recommendation_type](df, dependent_variable, independent_variables)

    return {
        'recommended': True,
        'table_layout': table_layout,
        'recommendation_type': recommendation_type,
        'dependent_variable_id': dependent_variable['id'],
        'independent_variables_ids': [ x['id'] for x in result ],
    } 
Example #6
Source File: model_recommendation.py    From DIVE-backend with GNU General Public License v3.0
# This function shares its name with sklearn's f_regression, so the sklearn
# version must be imported under an alias to avoid calling this function recursively.
from sklearn.feature_selection import f_regression as sklearn_f_regression

def f_regression(df, dependent_variable, independent_variables, interaction_terms=[], model_limit=5):
    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables, interaction_terms, table_layout=MCT.ALL_VARIABLES.value)
    y, X = dmatrices(patsy_models[0], df, return_type='dataframe')

    # sklearn's f_regression returns one F-statistic and one p-value per column of X.
    f_test, p_vals = sklearn_f_regression(X, y, center=True)
    logger.info(f_test)
    logger.info(p_vals)
    return
Example #7
Source File: test_core_pipeline.py    From lale with Apache License 2.0
def test_import_from_sklearn_pipeline(self):
        from sklearn.feature_selection import SelectKBest
        from sklearn.feature_selection import f_regression        
        from sklearn import svm
        from sklearn.pipeline import Pipeline
        anova_filter = SelectKBest(f_regression, k=3)
        clf = svm.SVC(kernel='linear')        
        sklearn_pipeline = Pipeline([('anova', anova_filter), ('svc', clf)])  
        lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
        for i, pipeline_step in enumerate(sklearn_pipeline.named_steps):
            sklearn_step_params = sklearn_pipeline.named_steps[pipeline_step].get_params()
            lale_sklearn_params = lale_pipeline.steps()[i]._impl._wrapped_model.get_params()
            self.assertEqual(sklearn_step_params, lale_sklearn_params)
        self.assert_equal_predictions(sklearn_pipeline, lale_pipeline) 
Example #8
Source File: test_core_pipeline.py    From lale with Apache License 2.0
def test_import_from_sklearn_pipeline2(self):
        from sklearn.feature_selection import SelectKBest
        from sklearn.feature_selection import f_regression        
        from sklearn import svm
        from sklearn.pipeline import Pipeline
        anova_filter = SelectKBest(f_regression, k=3)
        clf = svm.SVC(kernel='linear')        
        sklearn_pipeline = Pipeline([('anova', anova_filter), ('svc', clf)])
        sklearn_pipeline.fit(self.X_train, self.y_train)
        lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
        lale_pipeline.predict(self.X_test) 
Example #9
Source File: test_core_pipeline.py    From lale with Apache License 2.0
def test_import_from_sklearn_pipeline3(self):
        from sklearn.feature_selection import SelectKBest
        from sklearn.feature_selection import f_regression        
        from sklearn import svm
        from sklearn.pipeline import Pipeline
        anova_filter = SelectKBest(f_regression, k=3)
        clf = svm.SVC(kernel='linear')        
        sklearn_pipeline = Pipeline([('anova', anova_filter), ('svc', clf)])
        lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline, fitted=False)
        with self.assertRaises(ValueError):  # fitted=False returns a Trainable, so calling predict is invalid.
            lale_pipeline.predict(self.X_test) 
Example #10
Source File: FieldSelector.py    From Splunking-Crime with GNU Affero General Public License v3.0
def decode(cls, obj):
        from sklearn.feature_selection import f_classif, f_regression, GenericUnivariateSelect

        new_obj = GenericUnivariateSelect.__new__(GenericUnivariateSelect)
        new_obj.__dict__ = obj['dict']

        if new_obj.score_func == 'f_classif':
            new_obj.score_func = f_classif
        elif new_obj.score_func == 'f_regression':
            new_obj.score_func = f_regression
        else:
            raise ValueError('Unsupported GenericUnivariateSelect.score_func "%s"' % new_obj.score_func)

        return new_obj 
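The score function is stored by name because function objects do not survive serialization; decode restores the actual callable from the saved string.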
Example #11
Source File: FieldSelector.py    From Splunking-Crime with GNU Affero General Public License v3.0
def __init__(self, options):
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}),
            floats=['param'],
            strs=['type', 'mode'],
            aliases={'type': 'score_func'},
        )

        if 'score_func' not in out_params:
            out_params['score_func'] = f_classif
        else:
            if out_params['score_func'].lower() == 'categorical':
                out_params['score_func'] = f_classif
            elif out_params['score_func'].lower() in ['numerical', 'numeric']:
                out_params['score_func'] = f_regression
            else:
                raise RuntimeError('type can be either categorical or numeric.')

        if 'mode' in out_params:
            if out_params['mode'] not in ('k_best', 'fpr', 'fdr', 'fwe', 'percentile'):
                raise RuntimeError('mode can only be one of the following: fdr, fpr, fwe, k_best, and percentile')

            if out_params['mode'] in ['fpr', 'fdr', 'fwe']:
                if 'param' in out_params:
                    if not 0 < out_params['param'] < 1:
                        msg = 'Invalid param value for mode {}: param must be between 0 and 1.'.format(out_params['mode'])
                        raise ValueError(msg)

        # k_best and percentile require integer param
        if 'param' in out_params and out_params.get('mode') not in ['fdr', 'fpr', 'fwe']:
            original_value = out_params['param']
            out_params['param'] = int(out_params['param'])
            if out_params['param'] != original_value:
                msg = 'param value {} is not an integer; mode={} requires an integer.'
                msg = msg.format(original_value, out_params.get('mode', 'percentile'))
                raise ValueError(msg)

        self.estimator = GenericUnivariateSelect(**out_params) 
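For instance, an options payload like the following (key names inferred from the convert_params call above, so treat them as an assumption) would configure the estimator as GenericUnivariateSelect(score_func=f_regression, mode='k_best', param=10):

# Hypothetical options dict for this constructor; 'type', 'mode' and 'param'
# are the user-facing names mapped above ('type' is aliased to score_func).
options = {'params': {'type': 'numeric', 'mode': 'k_best', 'param': 10}}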
Example #12
Source File: filters.py    From causallib with Apache License 2.0
def compute_pvals(self, X, y):
        # TODO: export to stats_utils?
        is_y_binary = (len(np.unique(y)) == 2)
        # is_binary_feature = np.sum(((X != np.nanmin(X, axis=0)[np.newaxis, :]) &
        #                             (X != np.nanmax(X, axis=0)[np.newaxis, :])), axis=0) == 0
        is_binary_feature = areColumnsBinary(X)
        p_vals = np.zeros(X.shape[1])
        if is_y_binary:
            # Process non-binary columns:
            for i in np.where(~is_binary_feature)[0]:
                x0 = X.loc[y == 0, i]
                x1 = X.loc[y == 1, i]
                if self.is_linear:
                    _, p_vals[i] = stats.ttest_ind(x0, x1)
                else:
                    _, p_vals[i] = stats.ks_2samp(x0, x1)

            # Process binary features:
            _, p_vals[is_binary_feature] = feature_selection.chi2(X.loc[:, is_binary_feature], y)

        else:
            # Process non-binary features:
            _, p_vals[~is_binary_feature] = feature_selection.f_regression(X.loc[:, ~is_binary_feature], y)

            # Process binary features:
            y_mat = np.row_stack(y)
            for i in np.where(is_binary_feature)[0]:
                _, p_vals[i] = feature_selection.f_regression(y_mat, X.loc[:, i])
        return p_vals 
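Swapping the roles of X and y in the last f_regression call is valid because the univariate F-test it implements is built on the Pearson correlation, which is symmetric in its two arguments: regressing y on a single binary feature yields the same p-value as the reverse.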
Example #13
Source File: __init__.py    From sklearn2pmml with GNU Affero General Public License v3.0
def test_init(self):
		selector = SelectKBest(score_func = f_regression, k = 1)
		selector.fit(numpy.array([[0, 0], [1.0, 2.0]]), numpy.array([0.5, 1.0]))
		self.assertEqual([0, 1], selector._get_support_mask().tolist())
		selector_proxy = SelectorProxy(selector)
		self.assertEqual([0, 1], selector_proxy.support_mask_.tolist()) 
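Note that _get_support_mask() returns a boolean array, so the expected [0, 1] compares equal to [False, True]; the same applies to support_mask_ here and in the next test.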
Example #14
Source File: __init__.py    From sklearn2pmml with GNU Affero General Public License v3.0
def test_fit(self):
		selector = SelectKBest(score_func = f_regression, k = 1)
		selector_proxy = SelectorProxy(selector)
		self.assertFalse(hasattr(selector_proxy, "support_mask_"))
		selector_proxy.fit(numpy.array([[0, 0], [1.0, 2.0]]), numpy.array([0.5, 1.0]))
		self.assertEqual([0, 1], selector._get_support_mask().tolist())
		self.assertEqual([0, 1], selector_proxy.support_mask_.tolist()) 
Example #15
Source File: test_feature_selection.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_f_regression(self):
        diabetes = datasets.load_diabetes()
        df = pdml.ModelFrame(diabetes)

        result = df.feature_selection.f_regression()
        expected = fs.f_regression(diabetes.data, diabetes.target)

        self.assertEqual(len(result), 2)
        self.assert_numpy_array_almost_equal(result[0], expected[0])
        self.assert_numpy_array_almost_equal(result[1], expected[1])