Python sklearn.feature_selection.VarianceThreshold() Examples

The following are 14 code examples of sklearn.feature_selection.VarianceThreshold(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.feature_selection, or try the search function.
Example #1
Source File: low_variance.py    From scikit-feature with GNU General Public License v2.0 6 votes vote down vote up
def low_variance_feature_selection(X, threshold):
    """
    Apply scikit-learn's VarianceThreshold selector to X and return the
    columns it keeps (low_variance feature selection).

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    threshold: {float}
        variance cut-off handed to VarianceThreshold; for boolean features
        it is commonly derived as threshold = p*(1-p)

    Output
    ------
    X_new: {numpy array}, shape (n_samples, n_selected_features)
        data with selected features
    """
    selector = VarianceThreshold(threshold)
    X_new = selector.fit_transform(X)
    return X_new
Example #2
Source File: test_feature_selection.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_objectmapper(self):
        # Each selector reachable through the ModelFrame ``feature_selection``
        # accessor must be the very same object as the attribute of the same
        # name on ``fs``.
        frame = pdml.ModelFrame([])
        selector_names = (
            'GenericUnivariateSelect',
            'SelectPercentile',
            'SelectKBest',
            'SelectFpr',
            'SelectFromModel',
            'SelectFdr',
            'SelectFwe',
            'RFE',
            'RFECV',
            'VarianceThreshold',
        )
        for selector_name in selector_names:
            exposed = getattr(frame.feature_selection, selector_name)
            self.assertIs(exposed, getattr(fs, selector_name))
Example #3
Source File: test_variance_threshold.py    From sparkit-learn with Apache License 2.0 6 votes vote down vote up
def test_same_variances(self):
        # The distributed selector must learn the same per-feature variances
        # as the local one for dense blocks, sparse blocks and a DictRDD.
        reference = VarianceThreshold()
        distributed = SparkVarianceThreshold()

        # NOTE(review): the (shape, block_size) pairs are only used to repeat
        # the checks five times; neither value is passed to the fixture
        # helpers below -- confirm whether that is intentional.
        configurations = [((10, 5), None),
                          ((1e3, 20), None),
                          ((1e3, 20), 100),
                          ((1e4, 100), None),
                          ((1e4, 100), 600)]

        for _shape, _block_size in configurations:
            dense, dense_rdd = self.make_dense_rdd()
            sparse, sparse_rdd = self.make_sparse_rdd()
            combined = DictRDD([sparse_rdd, dense_rdd], columns=('X', 'Y'))

            # Dense first, then sparse: fit both selectors and compare.
            for local_input, dist_input in ((dense, dense_rdd),
                                            (sparse, sparse_rdd)):
                reference.fit(local_input)
                distributed.fit(dist_input)
                assert_array_almost_equal(reference.variances_,
                                          distributed.variances_)

            # The local selector still holds the sparse fit at this point.
            distributed.fit(combined)
            assert_array_almost_equal(reference.variances_,
                                      distributed.variances_)
Example #4
Source File: test_variance_threshold.py    From sparkit-learn with Apache License 2.0 6 votes vote down vote up
def test_same_transform_result(self):
        # fit_transform of the distributed selector must match the local
        # selector (default threshold) for dense, sparse and DictRDD input.
        reference = VarianceThreshold()
        distributed = SparkVarianceThreshold()

        dense, dense_rdd = self.make_dense_rdd()
        sparse, sparse_rdd = self.make_sparse_rdd()
        combined_rdd = DictRDD([sparse_rdd, dense_rdd], columns=('X', 'Y'))

        # Dense blocks.
        expected = reference.fit_transform(dense)
        actual = distributed.fit_transform(dense_rdd)
        assert_true(check_rdd_dtype(actual, (np.ndarray,)))
        assert_array_almost_equal(expected, actual.toarray())

        # Sparse blocks.
        expected = reference.fit_transform(sparse)
        actual = distributed.fit_transform(sparse_rdd)
        assert_true(check_rdd_dtype(actual, (sp.spmatrix,)))
        expected_dense = expected.toarray()
        assert_array_almost_equal(expected_dense, actual.toarray())

        # DictRDD: its 'X' column is compared against the sparse reference.
        actual = distributed.fit_transform(combined_rdd)[:, 'X']
        assert_true(check_rdd_dtype(actual, (sp.spmatrix,)))
        assert_array_almost_equal(expected_dense, actual.toarray())
Example #5
Source File: test_variance_threshold.py    From sparkit-learn with Apache License 2.0 6 votes vote down vote up
def test_same_transform_with_treshold(self):
        # Same comparison as the default-threshold case, but both selectors
        # are built with an explicit variance cut-off.
        cutoff = .03
        reference = VarianceThreshold(cutoff)
        distributed = SparkVarianceThreshold(cutoff)

        dense, dense_rdd = self.make_dense_rdd()
        sparse, sparse_rdd = self.make_sparse_rdd()
        combined_rdd = DictRDD([sparse_rdd, dense_rdd], columns=('X', 'Y'))

        # Dense blocks.
        expected = reference.fit_transform(dense)
        actual = distributed.fit_transform(dense_rdd)
        assert_true(check_rdd_dtype(actual, (np.ndarray,)))
        assert_array_almost_equal(expected, actual.toarray())

        # Sparse blocks.
        expected = reference.fit_transform(sparse)
        actual = distributed.fit_transform(sparse_rdd)
        assert_true(check_rdd_dtype(actual, (sp.spmatrix,)))
        expected_dense = expected.toarray()
        assert_array_almost_equal(expected_dense, actual.toarray())

        # DictRDD: its 'X' column is compared against the sparse reference.
        actual = distributed.fit_transform(combined_rdd)[:, 'X']
        assert_true(check_rdd_dtype(actual, (sp.spmatrix,)))
        assert_array_almost_equal(expected_dense, actual.toarray())
Example #6
Source File: test_pipeline.py    From sparkit-learn with Apache License 2.0 6 votes vote down vote up
def test_pipeline_same_results(self):
        # A local and a distributed threshold -> logistic-regression pipeline
        # trained on the same data must disagree on fewer than 10% of
        # predictions on average.
        X, y, Z = self.make_classification(2, 10000, 2000)

        local_classifier = LogisticRegression()
        local_threshold = VarianceThreshold()
        local_steps = [('threshold', local_threshold),
                       ('logistic', local_classifier)]
        local_pipeline = Pipeline(local_steps)

        spark_classifier = SparkLogisticRegression()
        spark_threshold = SparkVarianceThreshold()
        spark_steps = [('threshold', spark_threshold),
                       ('logistic', spark_classifier)]
        spark_pipeline = SparkPipeline(spark_steps)

        spark_threshold.fit(Z)
        local_pipeline.fit(X, y)
        spark_pipeline.fit(Z, logistic__classes=np.unique(y))

        local_predictions = local_pipeline.predict(X)
        spark_predictions = np.concatenate(
            spark_pipeline.predict(Z[:, 'X']).collect())
        mean_abs_difference = np.mean(
            np.abs(local_predictions - spark_predictions))
        assert_true(mean_abs_difference < 0.1)
Example #7
Source File: test_variance_threshold.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_zero_variance():
    # With the default setting, the selector must keep columns 0, 1, 3 and 4
    # of the shared ``data`` fixture, whatever container holds it.
    containers = (data, csr_matrix(data), csc_matrix(data), bsr_matrix(data))
    for X in containers:
        kept_columns = VarianceThreshold().fit(X).get_support(indices=True)
        assert_array_equal([0, 1, 3, 4], kept_columns)

    # Inputs in which every column is constant must be rejected by fit.
    constant_inputs = ([[0, 1, 2, 3]], [[0, 1], [0, 1]])
    for constant in constant_inputs:
        assert_raises(ValueError, VarianceThreshold().fit, constant)
Example #8
Source File: test_variance_threshold.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_variance_threshold():
    # With a custom variance cut-off of 0.4, exactly one column of ``data``
    # is expected to survive, for dense and CSR input alike.
    expected_shape = (len(data), 1)
    for X in (data, csr_matrix(data)):
        reduced = VarianceThreshold(threshold=.4).fit_transform(X)
        assert_equal(expected_shape, reduced.shape)
Example #9
Source File: data_cleaning.py    From open-solution-value-prediction with MIT License 5 votes vote down vote up
def __init__(self, threshold):
        """Create the variance-threshold selector used by this object.

        ``threshold`` is forwarded unchanged as the ``threshold`` keyword of
        ``fs.VarianceThreshold``; the resulting selector is stored on
        ``self.selector``.
        """
        # NOTE(review): ``fs`` is presumably sklearn.feature_selection --
        # confirm against the module imports.
        self.selector = fs.VarianceThreshold(threshold=threshold)
Example #10
Source File: classifiers.py    From oddt with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, *args, **kwargs):
        """ Assemble Neural network or SVM using sklearn pipeline

        Positional arguments and plain keyword arguments are passed to the
        model constructor (``self._model``). The keyword ``steps`` and any
        keyword containing ``__`` are left for ``Pipeline.set_params``.

        Raises ValueError when ``self._model`` is None.
        """

        # Move the model-only keywords out of ``kwargs``; whatever remains
        # ('steps' and nested 'step__param' names) is pipeline configuration.
        model_kwargs = {}
        for name in list(kwargs):
            if name == 'steps' or '__' in name:
                continue
            model_kwargs[name] = kwargs.pop(name)

        if self._model is None:
            raise ValueError('Model not specified!')
        estimator = self._model(*args, **model_kwargs)

        stages = [('empty_dims_remover', VarianceThreshold()),
                  ('scaler', StandardScaler()),
                  ('model', estimator)]
        self.pipeline = Pipeline(stages).set_params(**kwargs)
Example #11
Source File: regressors.py    From oddt with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, *args, **kwargs):
        """ Assemble Neural network or SVM using sklearn pipeline

        Keyword arguments are split in two groups: ``steps`` and names that
        contain ``__`` configure the pipeline through ``set_params``; every
        other keyword, together with ``*args``, goes to ``self._model``.

        Raises ValueError when ``self._model`` is None.
        """
        # Collect (and remove from ``kwargs``) the arguments meant for the
        # model itself; 'steps' is a pipeline argument and stays behind.
        forwarded = {}
        for key in list(kwargs):
            is_pipeline_argument = key == 'steps' or '__' in key
            if not is_pipeline_argument:
                forwarded[key] = kwargs.pop(key)

        if self._model is None:
            raise ValueError('Model not specified!')
        model = self._model(*args, **forwarded)

        pipeline = Pipeline([
            ('empty_dims_remover', VarianceThreshold()),
            ('scaler', StandardScaler()),
            ('model', model),
        ])
        self.pipeline = pipeline.set_params(**kwargs)
Example #12
Source File: test_variance_threshold.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_zero_variance():
    # Default-threshold selector: columns 0, 1, 3 and 4 of ``data`` are the
    # expected survivors for the dense input and each sparse format.
    expected_support = [0, 1, 3, 4]
    candidates = [data]
    candidates.extend(to_sparse(data)
                      for to_sparse in (csr_matrix, csc_matrix, bsr_matrix))
    for X in candidates:
        selector = VarianceThreshold()
        selector.fit(X)
        assert_array_equal(expected_support,
                           selector.get_support(indices=True))

    # Fitting must raise ValueError when every column is constant.
    single_row = [[0, 1, 2, 3]]
    repeated_rows = [[0, 1], [0, 1]]
    assert_raises(ValueError, VarianceThreshold().fit, single_row)
    assert_raises(ValueError, VarianceThreshold().fit, repeated_rows)
Example #13
Source File: test_variance_threshold.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_variance_threshold():
    # Custom variance cut-off: after selection with threshold 0.4 the output
    # must keep all rows of ``data`` and a single column.
    n_rows = len(data)
    for X in (data, csr_matrix(data)):
        selector = VarianceThreshold(threshold=.4)
        X_selected = selector.fit_transform(X)
        assert_equal((n_rows, 1), X_selected.shape)
Example #14
Source File: features.py    From AlphaPy with Apache License 2.0 4 votes vote down vote up
def remove_lv_features(model, X):
    r"""Drop the low-variance columns of a feature matrix.

    When ``model.specs['lv_remove']`` is set, a column support mask is
    obtained and applied to both ``X`` and ``model.feature_names``. Outside
    predict mode the mask is fitted with ``VarianceThreshold`` and saved in
    ``model.feature_map['lv_support']``; in predict mode the saved mask is
    reused. When removal is disabled, ``X`` is returned as is.

    Parameters
    ----------
    model : alphapy.Model
        Model specifications for removing features. Reads the ``specs`` keys
        ``lv_remove``, ``lv_threshold`` and ``predict_mode``; updates
        ``feature_map`` and ``feature_names``.
    X : numpy array
        The feature matrix.

    Returns
    -------
    X_reduced : numpy array
        The reduced feature matrix.

    Raises
    ------
    AssertionError
        If the number of remaining columns differs from the number of
        feature names.

    References
    ----------
    You can find more information on low-variance feature selection here [LV]_.

    .. [LV] http://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold

    """

    logger.info("Removing Low-Variance Features")

    # Pull the settings that drive this step.
    specs = model.specs
    lv_remove = specs['lv_remove']
    lv_threshold = specs['lv_threshold']
    predict_mode = specs['predict_mode']

    if not lv_remove:
        X_reduced = X
        logger.info("Skipping Low-Variance Features")
    else:
        logger.info("Low-Variance Threshold  : %.2f", lv_threshold)
        logger.info("Original Feature Count  : %d", X.shape[1])
        if predict_mode:
            # Reuse the mask learned at training time so the same columns
            # are dropped.
            support = model.feature_map['lv_support']
        else:
            variance_filter = VarianceThreshold(threshold=lv_threshold)
            variance_filter.fit(X)
            support = variance_filter.get_support()
            model.feature_map['lv_support'] = support
        X_reduced = X[:, support]
        # Keep the names aligned with the surviving columns.
        model.feature_names = [name for name, keep
                               in zip(model.feature_names, support) if keep]
        logger.info("Reduced Feature Count   : %d", X_reduced.shape[1])

    assert X_reduced.shape[1] == len(model.feature_names), "Mismatched Features and Names"
    return X_reduced