Python sklearn.preprocessing.binarize() Examples

The following are 14 code examples of sklearn.preprocessing.binarize(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.preprocessing, or try the search function.
Example #1
Source File: trace.py    From neuroglia with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def transform(self, X):
        """Binarize each element of X, keeping DataFrame labels when present.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, n_features]
            The data to binarize, element by element.

        Returns
        -------
        The output of ``binarize`` called with ``self.threshold`` and
        ``self.copy``. When ``X`` exposes both ``index`` and ``columns``
        attributes, that output is wrapped in a ``pd.DataFrame`` carrying
        the same index and columns.
        """
        # Duck-typing: anything offering both attributes is handled as a frame.
        try:
            row_labels, col_labels = X.index, X.columns
        except AttributeError:
            is_frame = False
        else:
            is_frame = True

        binarized = binarize(X, threshold=self.threshold, copy=self.copy)

        if not is_frame:
            return binarized
        return pd.DataFrame(data=binarized, index=row_labels, columns=col_labels)
Example #2
Source File: multi_class_svm.py    From JusticeAI with MIT License 6 votes vote down vote up
def predict(self, data):
        """
        1) Predicts an outcome given facts
        2) Predicts probability that prediction is correct
            2.1) Range goes from [0-1] where x < 0.5 is False
            2.2) The model only returns the probability that a fact is 1
            2.3) therefore to predict that the probability that a fact is 0 we do
                 1 - x when the predicted value is 0

        The model is loaded from "multi_class_svm_model.bin" on first use if
        self.model has not been set. The input is wrapped in a one-row batch
        and binarized with a threshold of 0 before it is given to the model.

        :param data: numpy([1, 0, 0, ...])
        :return: (predictions, probabilities)
                 predictions: the value returned by self.model.predict
                 probabilities: first row of self.model.predict_proba, each
                 entry flipped to 1 - x where the prediction is 0 and then
                 formatted to two decimals
        """
        if self.model is None:
            self.model = Load.load_binary("multi_class_svm_model.bin")
        data = binarize([data], threshold=0)
        probabilities = self.model.predict_proba(data)[0]
        # Run the classifier once; the same result drives the loop below and
        # is returned to the caller.
        predictions = self.model.predict(data)
        for i, probability in enumerate(probabilities):
            if predictions[0][i] == 0:
                probability = 1 - probability
            probabilities[i] = format(probability, '.2f')
        return predictions, probabilities
Example #3
Source File: multi_class_svm.py    From JusticeAI with MIT License 6 votes vote down vote up
def reshape_dataset(self):
        """
        Restructure the data to accommodate the sklearn library
        1) Reshape the x data
            1.1) every precedent's 'facts_vector' is reshaped to 1-D and the
                 rows are stacked into a 2D numpy array: [
                    [precedent #1 facts],
                    [precedent #2 facts],
                    ...
                ]
            1.2) the stacked array is binarized with a threshold of 0
        2) Reshape the y data
            2.1) one entry per precedent, produced by
                 self.__classify_precedent, collected into a numpy array
        :return: x_total <#1>, y_total <#2>
        """

        # 1
        flattened_rows = []
        for precedent in self.data_set:
            facts = precedent['facts_vector']
            flattened_rows.append(np.reshape(facts, (len(facts), )))
        x_total = binarize(np.array(flattened_rows), threshold=0)

        # 2
        y_total = np.array(
            [self.__classify_precedent(precedent) for precedent in self.data_set])
        return x_total, y_total
Example #4
Source File: test_preprocessing.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_preprocessing_assignment(self):
        """Assigning binarized columns back into a ModelFrame.

        Checks, for a single column and then for two columns, that the frame
        is still a ``pdml.ModelFrame``, that its data equals the binarized
        columns stacked next to the untouched ones, and that the column index
        is unchanged.
        """
        sepal_length = 'sepal length (cm)'

        # --- single column assignment ---
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)
        original_columns = df.data.columns

        df[sepal_length] = df[sepal_length].preprocessing.binarize(threshold=6)
        self.assertIsInstance(df, pdml.ModelFrame)

        # atleast_2d gives a single row, hence the transpose before stacking.
        first_column = np.atleast_2d(iris.data[:, 0])
        binarized = pp.binarize(first_column, threshold=6)
        expected = np.hstack([binarized.T, iris.data[:, 1:]])
        self.assert_numpy_array_almost_equal(df.data.values, expected)
        tm.assert_index_equal(df.data.columns, original_columns)

        # --- two column assignment, starting again from fresh data ---
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        target_columns = [sepal_length, 'sepal width (cm)']
        df[target_columns] = df[target_columns].preprocessing.binarize(threshold=6)
        self.assertIsInstance(df, pdml.ModelFrame)

        binarized = pp.binarize(iris.data[:, 0:2], threshold=6)
        expected = np.hstack([binarized, iris.data[:, 2:]])
        self.assert_numpy_array_almost_equal(df.data.values, expected)
        tm.assert_index_equal(df.data.columns, original_columns)
Example #5
Source File: nb_sklearn.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _fit_data(self, X):
        """Binarize the data for each column separately.

        One ``LabelBinarizer`` is fitted per column of ``X`` and the list of
        fitted binarizers is stored on ``self._binarizers``, replacing any
        list left over from an earlier call.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            If ``self.binarize`` is not None, ``X`` is first thresholded with
            ``binarize(X, threshold=self.binarize)``.

        Returns
        -------
        X_transformed : array-like
            Returns the data where in each columns the labels are
            binarized (the result of ``self._transform_data``).

        """

        if self.binarize is not None:
            X = binarize(X, threshold=self.binarize)

        # Collect into a fresh list instead of appending to the existing one:
        # appending on a repeated fit would leave more binarizers than
        # columns, which _transform_data rejects with a ValueError.
        fitted = []

        for i in range(X.shape[1]):

            # initialise binarizer and save
            binarizer = LabelBinarizer()

            # NOTE(review): LabelBinarizer.fit appears to recompute classes_,
            # so this preset probably has no lasting effect; it is also
            # skipped for binarize=0.0 because of the truthiness test.
            # Kept as-is; confirm before removing.
            if self.binarize:
                binarizer.classes_ = np.array([0, 1])

            # fit the data to the binarizer
            binarizer.fit(X[:, i])

            fitted.append(binarizer)

        self._binarizers = fitted

        return self._transform_data(X)
Example #6
Source File: nb_sklearn.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _transform_data(self, X):
        """Binarize the data for each column separately.

        Each column of ``X`` is passed through the binarizer stored at the
        same position in ``self._binarizers`` and the pieces are joined
        along axis 1.

        Raises
        ------
        NotFittedError
            If ``self._binarizers`` is an empty list.
        ValueError
            If the number of columns of ``X`` differs from the number of
            stored binarizers.
        """

        if self._binarizers == []:
            raise NotFittedError()

        if self.binarize is not None:
            X = binarize(X, threshold=self.binarize)

        n_fitted = len(self._binarizers)
        n_features = X.shape[1]
        if n_fitted != n_features:
            raise ValueError(
                "Expected input with %d features, got %d instead" %
                (n_fitted, n_features))

        pieces = []

        for col, binarizer in enumerate(self._binarizers):

            encoded = binarizer.transform(X[:, col])
            n_classes = binarizer.classes_.shape[0]

            # sklearn returns ndarray with shape (samples, 1) on binary input.
            # One class   -> only the complement column is kept.
            # Two classes -> complement column followed by the encoded column.
            # Otherwise   -> the encoded output is used unchanged.
            if n_classes in (1, 2):
                pieces.append(1 - encoded)
            if n_classes != 1:
                pieces.append(encoded)

        return np.concatenate(pieces, axis=1)
Example #7
Source File: nb_sklearn.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, alpha=1.0, binarize=0.0, fit_prior=True,
                 class_prior=None):
        """Store the constructor arguments on the instance.

        ``alpha``, ``binarize`` and ``fit_prior`` are stored under their own
        names. ``class_prior`` is stored both as ``class_prior`` and as
        ``class_log_prior_``. ``_binarizers`` starts as an empty list.
        """

        # No per-column binarizers exist at construction time.
        self._binarizers = []

        self.alpha = alpha
        self.binarize = binarize
        self.fit_prior = fit_prior

        # Both attributes start out bound to the same value.
        self.class_prior = self.class_log_prior_ = class_prior
Example #8
Source File: nb_sklearn.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self,
                 init='jaro',
                 max_iter=100,
                 binarize=None,
                 atol=10e-5):
        """Store the constructor arguments on the instance.

        Parameters
        ----------
        init : stored as ``self.init`` (default ``'jaro'``).
        max_iter : stored as ``self.max_iter`` (default ``100``).
        binarize : stored as ``self.binarize``. Defaults to ``None``; the
            previous default was the ``binarize`` function object itself,
            which is not a usable value for an attribute that is stored as
            a setting.
        atol : stored as ``self.atol`` (default ``10e-5``).

        ``self._binarizers`` starts as an empty list.
        """
        self.init = init
        self.max_iter = max_iter
        self.binarize = binarize
        self.atol = atol

        self._binarizers = []
Example #9
Source File: test_util.py    From FeatureHub with MIT License 5 votes vote down vote up
def g(a):
    """Return ``f(a)`` after importing ``binarize`` inside the function body."""
    # The imported name is not used below.
    # NOTE(review): presumably the import is here so that calling g exercises
    # a function-local third-party import; confirm against the tests that
    # use it.
    from sklearn.preprocessing import binarize
    result = f(a)
    return result
Example #10
Source File: test_util.py    From FeatureHub with MIT License 5 votes vote down vote up
def test_run_isolated_from_function_from_source():
    """Functions built by ``get_function`` give the same result directly and via ``run_isolated``."""
    # Source for a plain function, and for one that performs a local import
    # before delegating to f.
    f_source = b'def f(a):\n    return a+1\n'
    g_source = b'def f(a):\n    return a+1\n\ndef g(a):\n    from sklearn.preprocessing import binarize\n    return f(a)\n'

    f1 = featurehub.util.get_function(f_source)
    g1 = featurehub.util.get_function(g_source)

    for value in [1,3,7]:
        for func in (f1, g1):
            assert func(value) == featurehub.util.run_isolated(func, value)
Example #11
Source File: test_util.py    From FeatureHub with MIT License 5 votes vote down vote up
def test_run_isolated_from_function2_from_source():
    """Functions built by ``get_function2`` give the same result directly and via ``run_isolated``."""
    # Source for a plain function, and for one that performs a local import
    # before delegating to f.
    f_source = b'def f(a):\n    return a+1\n'
    g_source = b'def f(a):\n    return a+1\n\ndef g(a):\n    from sklearn.preprocessing import binarize\n    return f(a)\n'

    f1 = featurehub.util.get_function2(f_source)
    g1 = featurehub.util.get_function2(g_source)

    for value in [1,3,7]:
        for func in (f1, g1):
            assert func(value) == featurehub.util.run_isolated(func, value)

# ------------------------------------------------------------------------------ 
# Test compute_dataset_hash 
Example #12
Source File: test_demo.py    From FeatureHub with MIT License 5 votes vote down vote up
def hi_lo_age(dataset):
    """Binarize the ``age`` column of ``dataset["users"]`` at a cutoff of 30.

    The column's values are reshaped into a single-column 2-D array before
    being passed to ``binarize``; the result of that call is returned.
    """
    from sklearn.preprocessing import binarize
    cutoff = 30
    ages = dataset["users"]["age"].values.reshape(-1, 1)
    # The threshold is passed by keyword: scikit-learn documents it as a
    # keyword-only argument, so a positional value is rejected there.
    return binarize(ages, threshold=cutoff)
Example #13
Source File: test_preprocessing.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_binarize(self):
        """``preprocessing.binarize`` on a ModelFrame and on a ModelSeries.

        Each accessor is checked with the default threshold and with an
        explicit one against ``pp.binarize`` applied to the raw iris data.
        """
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        # Whole frame: default threshold first, then threshold=5.
        for kwargs in ({}, {'threshold': 5}):
            result = df.preprocessing.binarize(**kwargs)
            expected = pp.binarize(iris.data, **kwargs)

            self.assertIsInstance(result, pdml.ModelFrame)
            self.assert_numpy_array_almost_equal(result.data.values, expected)
            tm.assert_index_equal(result.columns, df.data.columns)

        s = df['sepal length (cm)']
        self.assertIsInstance(s, pdml.ModelSeries)

        # Single series: default threshold first, then threshold=6.
        for kwargs in ({}, {'threshold': 6}):
            result = s.preprocessing.binarize(**kwargs)
            # pp.binarize is given the column as a 2-D single-column array.
            first_column = iris.data[:, 0].reshape(-1, 1)
            expected = pp.binarize(first_column, **kwargs)

            self.assertIsInstance(result, pdml.ModelSeries)
            self.assert_numpy_array_almost_equal(result.values, expected.flatten())
            self.assertEqual(result.name, 'sepal length (cm)')
Example #14
Source File: cross_validation.py    From Pyspatialml with GNU General Public License v3.0 4 votes vote down vote up
def score(self, tpr_threshold=None, cutoff_threshold=None):
        """
        Calculates the scoring metrics using a cutoff threshold that attains a true positive rate
        that is equal or greater than the desired tpr_threshold

        For every fold in ``self.test_idx_`` the ROC curve of the positive-class
        column is computed and appended to ``self.fpr_``, ``self.tpr_`` and
        ``self.roc_thresholds_``. The predictions are then binarized at the
        chosen cutoff and each function in ``self.scoring`` is applied.

        Args
        ----
        tpr_threshold : float
            Minimum true positive rate to achieve
        cutoff_threshold : float
            As an alternative to using a minimum true positive, a probability cutoff threshold
            can be specified to calculate the scoring. When both arguments are
            given, cutoff_threshold is the one that is used.

        Returns
        -------
        scores : OrderedDict
            Maps every key of ``self.scoring`` to the values produced by its
            scoring function, one per fold in ``self.test_idx_``

        Raises
        ------
        ValueError
            If neither tpr_threshold nor cutoff_threshold is given
        """

        if tpr_threshold is None and cutoff_threshold is None:
            raise ValueError('Either tpr_threshold or cutoff_threshold must be specified')

        scores = OrderedDict((name, []) for name in self.scoring)
        self.thresholds_ = []
        self.tpr_ = []
        self.fpr_ = []
        self.roc_thresholds_ = []

        for idx in self.test_idx_:
            # split fold
            y_true = self.y_true[idx]
            y_pred_ = self.y_pred_[idx, :]

            # get roc curve data
            fpr, tpr, thresholds = roc_curve(
                y_true, y_pred_[:, self.positive])

            self.fpr_.append(fpr)
            self.tpr_.append(tpr)
            self.roc_thresholds_.append(thresholds)

            # calculate cutoff that produces tpr >= threshold
            if cutoff_threshold is None:
                # first position along the curve where the tpr requirement holds
                opt_threshold = thresholds[np.where(tpr >= tpr_threshold)[0].min()]
                self.thresholds_ = np.append(self.thresholds_, opt_threshold)
            else:
                opt_threshold = cutoff_threshold

            # calculate performance metrics
            # threshold is passed by keyword: scikit-learn documents it as a
            # keyword-only argument, so a positional value is rejected there
            y_pred_opt = binarize(y_pred_, threshold=opt_threshold)
            positive_pred = y_pred_opt[:, self.positive]

            # calculate scores
            for name, score_func in self.scoring.items():
                scores[name] = np.append(scores[name], score_func(y_true, positive_pred))

        return scores