Python sklearn.utils.sparsefuncs.mean_variance_axis() Examples

The following are 11 code examples of sklearn.utils.sparsefuncs.mean_variance_axis(), drawn from open-source projects. The originating project and source file are noted above each example. You may also want to check out the other available functions and classes of the sklearn.utils.sparsefuncs module.
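Before the project examples, here is a minimal self-contained sketch of the function's contract: given a CSR or CSC matrix and an axis, it returns a pair of dense arrays (means, variances). The matrix shape and density below are arbitrary choices for illustration.

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

# Build a small random CSR matrix (shape and density chosen arbitrarily).
rng = np.random.RandomState(0)
X = sp.random(100, 20, density=0.1, format="csr", random_state=rng)

# mean_variance_axis returns one mean and one variance per column (axis=0).
means, variances = mean_variance_axis(X, axis=0)
assert means.shape == (20,) and variances.shape == (20,)

# The variance is the uncorrected (population, ddof=0) variance,
# matching np.var on the dense equivalent.
assert np.allclose(variances, np.var(X.toarray(), axis=0))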
Example #1
Source File: test_data.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_scale_function_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)

    X_scaled = scale(X, with_mean=False)
    assert not np.any(np.isnan(X_scaled))

    X_csr_scaled = scale(X_csr, with_mean=False)
    assert not np.any(np.isnan(X_csr_scaled.data))

    # test csc has same outcome
    X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
    assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())

    # raises value error on axis != 0
    assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert X_scaled is not X

    # mean_variance_axis returns means and variances (not standard
    # deviations), so compare against the dense variance.
    X_csr_scaled_mean, X_csr_scaled_var = mean_variance_axis(X_csr_scaled, 0)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_var, X_scaled.var(axis=0))

    # null scale
    X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True)
    assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray()) 
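Example #1 only ever passes with_mean=False for the sparse inputs: sklearn refuses to center sparse matrices because subtracting the column means would destroy sparsity. A minimal sketch of that behavior (the toy matrix is arbitrary):

import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import scale

X_csr = sp.csr_matrix(np.array([[0., 1.], [2., 0.]]))

# Centering would densify the matrix, so sklearn raises instead.
try:
    scale(X_csr, with_mean=True)
except ValueError as exc:
    print("refused to center sparse input:", exc)

# Scaling by the standard deviation alone preserves sparsity.
print(scale(X_csr, with_mean=False).toarray())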
Example #2
Source File: test_preprocessing.py    From scanpy with BSD 3-Clause "New" or "Revised" License
def test_mean_var_sparse():
    from itertools import product

    import numpy as np
    import scanpy as sc
    import scipy.sparse as sp
    from sklearn.utils.sparsefuncs import mean_variance_axis

    csr64 = sp.random(10000, 1000, format="csr", dtype=np.float64)
    csc64 = csr64.tocsc()

    # Test that we're equivalent for 64 bit
    for mtx, ax in product((csr64, csc64), (0, 1)):
        scm, scv = sc.pp._utils._get_mean_var(mtx, axis=ax)
        skm, skv = mean_variance_axis(mtx, ax)
        # sklearn returns the population (ddof=0) variance; rescale it to
        # the sample (ddof=1) variance that scanpy reports.
        skv *= mtx.shape[ax] / (mtx.shape[ax] - 1)

        assert np.allclose(scm, skm)
        assert np.allclose(scv, skv)

    csr32 = csr64.astype(np.float32)
    csc32 = csc64.astype(np.float32)

    # Test whether ours is more accurate for 32 bit
    for mtx32, mtx64 in [(csc32, csc64), (csr32, csr64)]:
        scm32, scv32 = sc.pp._utils._get_mean_var(mtx32)
        scm64, scv64 = sc.pp._utils._get_mean_var(mtx64)
        skm32, skv32 = mean_variance_axis(mtx32, 0)
        skm64, skv64 = mean_variance_axis(mtx64, 0)
        # Same ddof=0 -> ddof=1 rescaling, using the row count of the
        # matrices in this loop.
        skv32 *= mtx64.shape[0] / (mtx64.shape[0] - 1)
        skv64 *= mtx64.shape[0] / (mtx64.shape[0] - 1)

        m_resid_sc = np.mean(np.abs(scm64 - scm32))
        m_resid_sk = np.mean(np.abs(skm64 - skm32))
        v_resid_sc = np.mean(np.abs(scv64 - scv32))
        v_resid_sk = np.mean(np.abs(skv64 - skv32))

        assert m_resid_sc < m_resid_sk
        assert v_resid_sc < v_resid_sk 
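The ddof bookkeeping above is the entire difference between the two libraries' variances; a minimal sketch of that relationship (toy matrix and names are mine):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

X = sp.random(50, 4, density=0.3, format="csr", random_state=0)
n = X.shape[0]

_, var_pop = mean_variance_axis(X, axis=0)   # ddof=0
var_sample = var_pop * n / (n - 1)           # Bessel's correction

dense = X.toarray()
assert np.allclose(var_pop, dense.var(axis=0, ddof=0))
assert np.allclose(var_sample, dense.var(axis=0, ddof=1))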
Example #3
Source File: equal_groups.py    From Same-Size-K-Means with BSD 3-Clause "New" or "Revised" License
def _tolerance(X, tol):
    """Return a tolerance which is independent of the dataset"""
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
    else:
        variances = np.var(X, axis=0)
    return np.mean(variances) * tol 
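This is the standard k-means trick of converting a relative tolerance into an absolute one via the mean per-feature variance, and it yields the same value whether the data is sparse or dense. A quick hedged check (random data and tol value are arbitrary):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

tol = 1e-4
X_dense = np.random.RandomState(0).randn(200, 5)
X_sparse = sp.csr_matrix(X_dense)

tol_dense = np.mean(np.var(X_dense, axis=0)) * tol
tol_sparse = np.mean(mean_variance_axis(X_sparse, axis=0)[1]) * tol
assert np.isclose(tol_dense, tol_sparse)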
Example #4
Source File: _k_means_0_23.py    From daal4py with Apache License 2.0
def _tolerance(X, rtol):
    """Compute absolute tolerance from the relative tolerance"""
    if rtol == 0.0:
        return rtol
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
        mean_var = np.mean(variances)
    else:
        mean_var = _daal_mean_var(X)
    return mean_var * rtol 
Example #5
Source File: _k_means_0_22.py    From daal4py with Apache License 2.0
def _tolerance(X, rtol):
    """Compute absolute tolerance from the relative tolerance"""
    if rtol == 0.0:
        return rtol
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
        mean_var = np.mean(variances)
    else:
        mean_var = _daal_mean_var(X)
    return mean_var * rtol 
Example #6
Source File: _k_means_0_21.py    From daal4py with Apache License 2.0
def _tolerance(X, rtol):
    """Compute absolute tolerance from the relative tolerance"""
    if rtol == 0.0:
        return rtol
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
        mean_var = np.mean(variances)
    else:
        mean_var = _daal_mean_var(X)
    return mean_var * rtol 
Example #7
Source File: scores.py    From SecuML with GNU General Public License v2.0
def compute_scoring_func(self, func):
    if func == 'variance':
        features = self.instances.features.get_values()
        annotations = self.instances.annotations.get_labels()
        if isinstance(features, spmatrix):
            variance = mean_variance_axis(features, axis=0)[1]
        else:
            variance = features.var(axis=0)
        return variance, None

    features = self.annotated_instances.features.get_values()
    annotations = self.annotated_instances.annotations.get_supervision(
                                                           self.multiclass)
    if func == 'f_classif':
        return f_classif(features, annotations)
    elif func == 'mutual_info_classif':
        if isinstance(features, spmatrix):
            discrete_indexes = True
        else:
            features_types = self.instances.features.info.types
            discrete_indexes = [i for i, t in enumerate(features_types)
                                if t == FeatureType.binary]
            if not discrete_indexes:
                discrete_indexes = False
        return (mutual_info_classif(features, annotations,
                                    discrete_features=discrete_indexes),
                None)
    elif func == 'chi2':
        return chi2(features, annotations)
    else:
        raise ValueError(f'unknown scoring function: {func}')
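The 'variance' branch above is a common sparse-aware feature-scoring pattern. A minimal standalone sketch of it (the variance_scores helper and the data are mine, for illustration):

import numpy as np
from scipy.sparse import spmatrix, random as sparse_random
from sklearn.utils.sparsefuncs import mean_variance_axis

def variance_scores(features):
    """Per-feature variance, computed without densifying sparse input."""
    if isinstance(features, spmatrix):
        return mean_variance_axis(features, axis=0)[1]
    return np.var(features, axis=0)

X = sparse_random(1000, 50, density=0.05, format='csr', random_state=0)
scores = variance_scores(X)
top5 = np.argsort(scores)[::-1][:5]   # indices of the most variable features
print(top5, scores[top5])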
Example #8
Source File: density.py    From SecuML with GNU General Public License v2.0
def _display_dataset(self, dataset):
    eps = 0.00001
    linewidth = dataset.linewidth
    delta = self.max_value - self.min_value
    density_delta = 1.2 * delta
    if delta > 0:
        x = np.arange(self.min_value - 0.1*delta,
                      self.max_value + 0.1*delta,
                      density_delta / self.num_points)
    else:
        x = np.array([self.min_value - 2*eps, self.max_value + 2*eps])
    if isinstance(dataset.values, spmatrix):
        variance = mean_variance_axis(dataset.values, axis=0)[1]
    else:
        variance = np.var(dataset.values)
    if variance < eps:
        linewidth += 2
        mean = np.mean(dataset.values)
        x = np.sort(np.append(x, [mean, mean - eps, mean + eps]))
        density = [1 if v == mean else 0 for v in x]
    else:
        self.kde.fit(dataset.values)
        x_density = [[y] for y in x]
        # kde.score_samples returns the 'log' of the density
        log_density = self.kde.score_samples(x_density).tolist()
        density = list(map(math.exp, log_density))
    self.ax.plot(x, density, label=dataset.label, color=dataset.color,
                 linewidth=linewidth, linestyle=dataset.linestyle)
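Note the asymmetry between the two branches above: for sparse input, mean_variance_axis returns an array with one entry per column, while np.var with no axis returns a scalar. A quick sketch of that difference, assuming single-column values as in this plotting code:

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

values = np.array([[0.], [1.], [1.], [3.]])
sparse_values = sp.csr_matrix(values)

var_dense = np.var(values)                                 # scalar
var_sparse = mean_variance_axis(sparse_values, axis=0)[1]  # length-1 array

# A single-element array compares like a scalar, so `variance < eps`
# behaves the same in both branches.
assert (var_sparse < 1e-5) == (var_dense < 1e-5)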
Example #9
Source File: test_data.py    From twitter-stock-recommendation with MIT License
def test_scale_function_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)

    X_scaled = scale(X, with_mean=False)
    assert_false(np.any(np.isnan(X_scaled)))

    X_csr_scaled = scale(X_csr, with_mean=False)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    # test csc has same outcome
    X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
    assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())

    # raises value error on axis != 0
    assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)

    # mean_variance_axis returns means and variances (not standard
    # deviations), so compare against the dense variance.
    X_csr_scaled_mean, X_csr_scaled_var = mean_variance_axis(X_csr_scaled, 0)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_var, X_scaled.var(axis=0))

    # null scale
    X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True)
    assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray()) 
Example #10
Source File: variance_threshold.py    From sparkit-learn with Apache License 2.0
def fit(self, Z):
    """Learn empirical variances from the data.

    Parameters
    ----------
    Z : ArrayRDD or DictRDD of {array-like, sparse matrix} blocks,
        shape (n_samples, n_features)
        Sample vectors from which to compute variances. If Z is a
        DictRDD, only its 'X' column is used; any 'y' column is ignored
        and exists only for compatibility with sklearn.pipeline.Pipeline.

    Returns
    -------
    self
    """
    X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
    check_rdd(X, (np.ndarray, sp.spmatrix))

    def mapper(X):
        """Calculate statistics for every numpy or scipy block."""
        X = check_array(X, ('csr', 'csc'), dtype=np.float64)
        if hasattr(X, "toarray"):   # sparse matrix
            mean, var = mean_variance_axis(X, axis=0)
        else:
            mean, var = np.mean(X, axis=0), np.var(X, axis=0)
        return X.shape[0], mean, var

    def reducer(a, b):
        """Combine the statistics of two blocks."""
        n_a, mean_a, var_a = a
        n_b, mean_b, var_b = b
        n_ab = n_a + n_b
        mean_ab = ((mean_a * n_a) + (mean_b * n_b)) / n_ab
        var_ab = (((n_a * var_a) + (n_b * var_b)) / n_ab) + \
                 ((n_a * n_b) * ((mean_b - mean_a) / n_ab) ** 2)
        return (n_ab, mean_ab, var_ab)

    _, _, self.variances_ = X.map(mapper).treeReduce(reducer)

    if np.all(self.variances_ <= self.threshold):
        msg = "No feature in X meets the variance threshold {0:.5f}"
        if X.shape[0] == 1:
            msg += " (X contains only one sample)"
        raise ValueError(msg.format(self.threshold))

    return self
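The reducer implements the standard pairwise combination of block counts, means, and population variances. A self-contained sketch checking that formula against direct computation on the concatenated data (block sizes are arbitrary):

import numpy as np

def combine(a, b):
    """Merge (count, mean, population variance) statistics of two blocks."""
    n_a, mean_a, var_a = a
    n_b, mean_b, var_b = b
    n_ab = n_a + n_b
    mean_ab = (mean_a * n_a + mean_b * n_b) / n_ab
    var_ab = ((n_a * var_a + n_b * var_b) / n_ab
              + (n_a * n_b) * ((mean_b - mean_a) / n_ab) ** 2)
    return n_ab, mean_ab, var_ab

rng = np.random.RandomState(0)
blocks = [rng.randn(n, 3) for n in (10, 25, 7)]

stats = [(b.shape[0], b.mean(axis=0), b.var(axis=0)) for b in blocks]
n, mean, var = stats[0]
for s in stats[1:]:
    n, mean, var = combine((n, mean, var), s)

full = np.vstack(blocks)
assert n == full.shape[0]
assert np.allclose(mean, full.mean(axis=0))
assert np.allclose(var, full.var(axis=0))   # ddof=0, matching the reducer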
Example #11
Source File: data.py    From sparkit-learn with Apache License 2.0
def fit(self, Z):
    """Compute the mean and std to be used for later scaling.

    Parameters
    ----------
    Z : DictRDD containing (X, y) pairs
        X - Training vector.
            {array-like, sparse matrix}, shape [n_samples, n_features]
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.
        y - Target labels
            Passthrough for ``Pipeline`` compatibility.
    """
    # Reset internal state before fitting
    self._reset()
    X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
    check_rdd(X, (np.ndarray, sp.spmatrix))

    def mapper(X):
        """Calculate statistics for every numpy or scipy block."""
        X = check_array(X, ('csr', 'csc'), dtype=np.float64)
        if hasattr(X, "toarray"):   # sparse matrix
            mean, var = mean_variance_axis(X, axis=0)
        else:
            mean, var = np.mean(X, axis=0), np.var(X, axis=0)
        return X.shape[0], mean, var

    def reducer(a, b):
        """Combine the statistics of two blocks."""
        n_a, mean_a, var_a = a
        n_b, mean_b, var_b = b
        n_ab = n_a + n_b
        mean_ab = ((mean_a * n_a) + (mean_b * n_b)) / n_ab
        var_ab = (((n_a * var_a) + (n_b * var_b)) / n_ab) + \
                 ((n_a * n_b) * ((mean_b - mean_a) / n_ab) ** 2)
        return (n_ab, mean_ab, var_ab)

    if check_rdd_dtype(X, sp.spmatrix):
        if self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")
    self.n_samples_seen_, self.mean_, self.var_ = \
        X.map(mapper).treeReduce(reducer)

    if self.with_std:
        self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
    else:
        self.scale_ = None

    return self
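Note the `scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))` step: zero-variance features would otherwise cause division by zero at transform time. `_handle_zeros_in_scale` is a private sklearn helper, so here is a minimal equivalent sketch (the helper body is a paraphrase of the idea, not sklearn's exact code):

import numpy as np

def handle_zeros_in_scale(scale):
    """Replace zero entries so dividing by the scale is a no-op there."""
    scale = scale.copy()
    scale[scale == 0.0] = 1.0
    return scale

var = np.array([0.0, 4.0, 0.25])        # first feature is constant
scale = handle_zeros_in_scale(np.sqrt(var))
print(scale)                             # [1.  2.  0.5]

X = np.array([[5.0, 2.0, 1.0]])
print(X / scale)                         # constant feature passes through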