Python sklearn.utils.multiclass.type_of_target() Examples

The following are 30 code examples of sklearn.utils.multiclass.type_of_target(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.utils.multiclass, or try the search function.

Example #1

Source File: run.py From nyaggle with MIT License

10 votes

def _dispatch_gbdt_class(algorithm_type: str, type_of_target: str):
    """Pick the GBDT estimator class for a backend and a target type.

    ``algorithm_type`` selects the library: 'lgbm', 'cat', and anything else
    is handled by the xgboost branch (which asserts the value is 'xgb').
    A 'continuous' target selects the regressor class; every other target
    type selects the classifier class.
    """
    wants_regressor = type_of_target == 'continuous'

    if algorithm_type == 'lgbm':
        requires_lightgbm()
        from lightgbm import LGBMClassifier, LGBMRegressor
        classifier_cls, regressor_cls = LGBMClassifier, LGBMRegressor
    elif algorithm_type == 'cat':
        requires_catboost()
        from catboost import CatBoostClassifier, CatBoostRegressor
        classifier_cls, regressor_cls = CatBoostClassifier, CatBoostRegressor
    else:
        # The dependency check runs before the name is asserted.
        requires_xgboost()
        assert algorithm_type == 'xgb'
        from xgboost import XGBClassifier, XGBRegressor
        classifier_cls, regressor_cls = XGBClassifier, XGBRegressor

    return regressor_cls if wants_regressor else classifier_cls

Example #2

Source File: nested_cv.py From Nested-Cross-Validation with MIT License

6 votes

def _predict_and_score(self, X_test, y_test):
        #XXX: Implement type_of_target(y)
        
        if(self.predict_proba):
            y_type = type_of_target(y_test)
            if(y_type in ('binary')):
                pred = self.model.predict_proba(X_test)[:,1]
            else:
                pred = self.model.predict_proba(X_test)
                
        else:
            pred = self.model.predict(X_test)
        
        if(self.multiclass_average == 'binary'):
            return self.metric(y_test, pred), pred
        else:
            return self.metric(y_test, pred, average=self.multiclass_average), pred

Example #3

Source File: WOE_IV.py From exploripy with MIT License

6 votes

def feature_discretion(self, X):
        '''
        Discretize every continuous column of X and pass the others through.
        :param X : numpy array, indexed as X[:, i] per feature
        :return: numpy array rebuilt from the processed columns
        '''
        n_features = X.shape[-1]
        columns = []
        for idx in range(n_features):
            column = X[:, idx]
            is_continuous = type_of_target(column) == 'continuous'
            columns.append(self.discrete(column) if is_continuous else column)
        # Columns were collected row-wise, so transpose back.
        return np.array(columns).T

Example #4

Source File: test_multiclass.py From twitter-stock-recommendation with MIT License

6 votes

def test_type_of_target():
    """Check type_of_target on valid examples and on rejected inputs."""
    # Every example must be classified under the group it is listed in.
    for group, group_examples in EXAMPLES.items():
        for example in group_examples:
            assert_equal(type_of_target(example), group,
                         msg=('type_of_target(%r) should be %r, got %r'
                              % (example, group, type_of_target(example))))

    # Raw string: `\(` is a regex escape, not a Python string escape.
    for example in NON_ARRAY_LIKE_EXAMPLES:
        msg_regex = r'Expected array-like \(array or non-string sequence\).*'
        assert_raises_regex(ValueError, msg_regex, type_of_target, example)

    for example in MULTILABEL_SEQUENCES:
        msg = ('You appear to be using a legacy multi-label data '
               'representation. Sequence of sequences are no longer supported;'
               ' use a binary array or sparse matrix instead.')
        assert_raises_regex(ValueError, msg, type_of_target, example)

    # The remaining check needs pandas' SparseSeries; skip when unavailable.
    try:
        from pandas import SparseSeries
    except ImportError:
        raise SkipTest("Pandas not found")

    y = SparseSeries([1, 0, 0, 1, 0])
    msg = "y cannot be class 'SparseSeries'."
    assert_raises_regex(ValueError, msg, type_of_target, y)

Example #5

Source File: information_value.py From information_value with MIT License

6 votes

def feature_discretion(self, X):
        '''
        Replace each continuous feature of X by its discretized version.
        :param X : numpy array, one feature per index of the last axis
        :return: numpy array in which continuous features are discretized
                 and all other features are left as they were
        '''
        processed = []
        for col_idx in range(X.shape[-1]):
            feature = X[:, col_idx]
            if type_of_target(feature) != 'continuous':
                processed.append(feature)
                continue
            processed.append(self.discrete(feature))
        return np.array(processed).T

Example #6

Source File: base.py From polylearn with BSD 2-Clause "Simplified" License

6 votes

def _check_X_y(self, X, y):
        """Validate X and a binary y, and encode y as -1/+1 doubles.

        Raises TypeError when y has two or more columns or is not a binary
        target. Stores the fitted LabelBinarizer on ``self.label_binarizer_``.
        """
        # Detect 2-D targets up front so the user gets a helpful message.
        looks_2d = False
        if hasattr(y, 'shape'):
            dims = y.shape
            looks_2d = len(dims) > 1 and dims[1] >= 2

        if looks_2d or type_of_target(y) != 'binary':
            raise TypeError("Only binary targets supported. For training "
                            "multiclass or multilabel models, you may use the "
                            "OneVsRest or OneVsAll metaestimators in "
                            "scikit-learn.")

        features, targets = check_X_y(X, y, dtype=np.double,
                                      accept_sparse='csc', multi_output=False)

        binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        self.label_binarizer_ = binarizer
        encoded = binarizer.fit_transform(targets)
        return features, encoded.ravel().astype(np.double)

Example #7

Source File: _search.py From dask-ml with BSD 3-Clause "New" or "Revised" License

6 votes

def check_cv(cv=3, y=None, classifier=False):
    """Dask-aware counterpart of ``sklearn.model_selection.check_cv``.

    Delegates to scikit-learn unless ``y`` is a dask collection and ``cv`` is
    an integer; in that case the target type is computed through dask.
    """
    n_folds = 3 if cv is None else cv

    # scikit-learn only inspects ``y`` for integer ``cv``, so every other
    # combination can be handed over unchanged.
    if not (is_dask_collection(y) and isinstance(n_folds, numbers.Integral)):
        return model_selection.check_cv(n_folds, y, classifier=classifier)

    if not classifier:
        return KFold(n_folds)

    # ``y`` is a dask object: evaluate its target type via a delayed call.
    kind = delayed(type_of_target, pure=True)(y).compute()
    if kind in ("binary", "multiclass"):
        return StratifiedKFold(n_folds)
    return KFold(n_folds)

Example #8

Source File: nn.py From tpot with GNU Lesser General Public License v3.0

6 votes

def validate_inputs(self, X, y):
        """Validate X and y and return them, cast when they have object dtype.

        Things we don't want to allow until we've tested them:
        - Sparse inputs
        - Multiclass outputs (e.g., more than 2 classes in `y`)
        - Non-finite inputs
        - Complex inputs

        Raises
        ------
        ValueError
            For non-finite values, non-binary targets, complex data, or
            object arrays that cannot be cast to float (X) / int (y).
        """
        X, y = check_X_y(X, y, accept_sparse=False, allow_nd=False)

        # assert_all_finite checks a single array; its second parameter is
        # `allow_nan`, so X and y have to be checked in separate calls.
        assert_all_finite(X)
        assert_all_finite(y)

        if type_of_target(y) != 'binary':
            raise ValueError("Non-binary targets not supported")

        if np.any(np.iscomplex(X)) or np.any(np.iscomplex(y)):
            raise ValueError("Complex data not supported")
        if np.issubdtype(X.dtype, np.object_) or np.issubdtype(y.dtype, np.object_):
            try:
                X = X.astype(float)
                y = y.astype(int)
            except (TypeError, ValueError):
                raise ValueError("argument must be a string.* number")

        return (X, y)

Example #9

Source File: ml_stratifiers.py From iterative-stratification with BSD 3-Clause "New" or "Revised" License

6 votes

def _make_test_folds(self, X, y):
        """Assign each sample a test fold via IterativeStratification.

        ``y`` is cast to a boolean array and must be a multilabel-indicator
        target, otherwise ValueError is raised. The returned fold ids are in
        the original sample order, even when shuffling is enabled.
        """
        labels = np.asarray(y, dtype=bool)
        kind = type_of_target(labels)
        if kind != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(kind))

        rng = check_random_state(self.random_state)
        order = np.arange(labels.shape[0])
        if self.shuffle:
            rng.shuffle(order)
            labels = labels[order]

        # Every fold gets the same share of the data.
        fold_share = 1 / self.n_splits
        proportions = np.asarray([fold_share] * self.n_splits)

        folds = IterativeStratification(labels=labels, r=proportions, random_state=rng)

        # Undo the permutation so fold ids line up with the input rows.
        return folds[np.argsort(order)]

Example #10

Source File: labels.py From pumpp with ISC License

6 votes

def fit(self, y):
        """Record the target type, sparsity and classes of ``y``.

        Parameters
        ----------
        y : array of shape [n_samples,] or [n_samples, n_classes]
            Target values. A 2-d matrix should only contain 0 and 1 and
            represents multilabel classification.

        Returns
        -------
        self : this instance, with ``y_type_``, ``sparse_input_`` and
            ``classes_`` set.

        Raises
        ------
        ValueError
            If the target type is a multioutput one, or ``y`` has 0 samples.
        """
        target_kind = type_of_target(y)
        self.y_type_ = target_kind

        if 'multioutput' in target_kind:
            raise ValueError("Multioutput target data is not supported with "
                             "label binarization")

        n_rows = _num_samples(y)
        if n_rows == 0:
            raise ValueError('y has 0 samples: %r' % y)

        self.sparse_input_ = sp.issparse(y)
        self.classes_ = unique_labels(y)
        return self

Example #11

Source File: mlp_classifier.py From muffnn with BSD 3-Clause "New" or "Revised" License

6 votes

def _is_multilabel(self, y):
        """
        Tell whether the target array ``y`` describes a multilabel problem.

        Entries equal to -1 are replaced by 1 in a copy before the target
        type is determined. Raises ValueError for any target type other than
        binary, multiclass or multilabel-indicator.
        """
        probe = y.copy()
        minus_one_mask = np.zeros_like(probe, dtype=bool) | (probe == -1)
        probe[minus_one_mask] = 1

        kind = type_of_target(probe)
        if kind == 'multilabel-indicator':
            return True
        if kind in ['binary', 'multiclass']:
            return False

        # Same failure mode as
        # sklearn.utils.multiclass.check_classification_targets.
        raise ValueError("Unknown label type: %s" % kind)

Example #12

Source File: test_multiclass.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_type_of_target():
    """Check type_of_target on valid examples and on rejected inputs."""
    # Each sample must be detected as the group it is filed under.
    for expected, samples in EXAMPLES.items():
        for sample in samples:
            detected = type_of_target(sample)
            assert_equal(detected, expected,
                         msg=('type_of_target(%r) should be %r, got %r'
                              % (sample, expected, detected)))

    not_array_like = r'Expected array-like \(array or non-string sequence\).*'
    for sample in NON_ARRAY_LIKE_EXAMPLES:
        assert_raises_regex(ValueError, not_array_like, type_of_target, sample)

    legacy_format = ('You appear to be using a legacy multi-label data '
                     'representation. Sequence of sequences are no longer supported;'
                     ' use a binary array or sparse matrix instead.')
    for sample in MULTILABEL_SEQUENCES:
        assert_raises_regex(ValueError, legacy_format, type_of_target, sample)

    # The last check depends on pandas' SparseSeries being importable.
    try:
        from pandas import SparseSeries
    except ImportError:
        raise SkipTest("Pandas not found")

    sparse_y = SparseSeries([1, 0, 0, 1, 0])
    assert_raises_regex(ValueError, "y cannot be class 'SparseSeries'.",
                        type_of_target, sparse_y)

Example #13

Source File: test_averaging.py From nyaggle with MIT License

6 votes

def _make_1st_stage_preds(X, y, X_test):
    """Cross-validate three first-stage models and collect their predictions.

    Regressors are used for a 'continuous' target, classifiers otherwise.
    Returns two lists: the out-of-fold predictions and the test predictions,
    one entry per model.
    """
    if type_of_target(y) != 'continuous':
        estimators = [
            SVC(random_state=0),
            LogisticRegression(random_state=0),
            RandomForestClassifier(n_estimators=30, random_state=0)
        ]
    else:
        estimators = [
            SVR(),
            Ridge(random_state=0),
            RandomForestRegressor(n_estimators=30, random_state=0)
        ]

    outcomes = [cross_validate(est, X, y, X_test, cv=5) for est in estimators]

    oof_predictions = [outcome.oof_prediction for outcome in outcomes]
    test_predictions = [outcome.test_prediction for outcome in outcomes]
    return oof_predictions, test_predictions

Example #14

Source File: test_stacking.py From nyaggle with MIT License

6 votes

def _make_1st_stage_preds(X, y, X_test):
    """Build first-stage predictions from three cross-validated models.

    A 'continuous' target gets SVR, Ridge and a random-forest regressor; any
    other target gets SVC, logistic regression and a random-forest
    classifier. Returns (out-of-fold predictions, test predictions) as two
    lists with one entry per model.
    """
    is_regression = type_of_target(y) == 'continuous'
    if is_regression:
        first_stage = [
            SVR(),
            Ridge(random_state=0),
            RandomForestRegressor(n_estimators=30, random_state=0)
        ]
    else:
        first_stage = [
            SVC(random_state=0),
            LogisticRegression(random_state=0),
            RandomForestClassifier(n_estimators=30, random_state=0)
        ]

    cv_results = []
    for model in first_stage:
        cv_results.append(cross_validate(model, X, y, X_test, cv=5))

    oof = [res.oof_prediction for res in cv_results]
    test = [res.test_prediction for res in cv_results]
    return oof, test

Example #15

Source File: labels.py From pumpp with ISC License

5 votes

def transform(self, y):
        """Binarize ``y`` with ``label_binarize`` using the fitted classes.

        Parameters
        ----------
        y : array or sparse matrix of shape [n_samples,] or
            [n_samples, n_classes]
            Target values. A 2-d matrix should only contain 0 and 1 and
            represents multilabel classification.

        Returns
        -------
        Y : the result of ``label_binarize`` called with this instance's
            ``classes_``, ``pos_label``, ``neg_label`` and ``sparse_output``.

        Raises
        ------
        ValueError
            If ``y`` is multilabel but the instance was fitted on input that
            was not.
        """
        check_is_fitted(self, 'classes_')

        if type_of_target(y).startswith('multilabel'):
            # Multilabel input is only accepted after a multilabel fit.
            if not self.y_type_.startswith('multilabel'):
                raise ValueError("The object was not fitted with multilabel"
                                 " input.")

        binarize_options = dict(pos_label=self.pos_label,
                                neg_label=self.neg_label,
                                sparse_output=self.sparse_output)
        return label_binarize(y, self.classes_, **binarize_options)

Example #16

Source File: WOE_IV.py From exploripy with MIT License

5 votes

def check_target_binary(self, y):
        '''
        Raise ValueError unless type_of_target reports y as binary.
        :param y: target variable to check
        :return: None
        '''
        if type_of_target(y) != 'binary':
            raise ValueError('Label type must be binary')

Example #17

Source File: test_label.py From twitter-stock-recommendation with MIT License

5 votes

def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check label_binarize, the inverse helpers and LabelBinarizer on ``y``.

    All checks run once with sparse output and once with dense output.
    """
    for sparse_output in (True, False):
        if sparse_output and (pos_label == 0 or neg_label != 0):
            # For this label combination sparse output must be rejected.
            assert_raises(ValueError, label_binarize, y, classes,
                          neg_label=neg_label, pos_label=pos_label,
                          sparse_output=sparse_output)
            continue

        # Functional API: label_binarize.
        encoded = label_binarize(y, classes, neg_label=neg_label,
                                 pos_label=pos_label,
                                 sparse_output=sparse_output)
        assert_array_equal(toarray(encoded), expected)
        assert_equal(issparse(encoded), sparse_output)

        # Inverse helper, chosen by the target type of y.
        y_type = type_of_target(y)
        if y_type != "multiclass":
            midpoint = (neg_label + pos_label) / 2.
            decoded = _inverse_binarize_thresholding(encoded,
                                                     output_type=y_type,
                                                     classes=classes,
                                                     threshold=midpoint)
        else:
            decoded = _inverse_binarize_multiclass(encoded, classes=classes)

        assert_array_equal(toarray(decoded), toarray(y))

        # Estimator API: LabelBinarizer round trip.
        binarizer = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
                                   sparse_output=sparse_output)
        encoded = binarizer.fit_transform(y)
        assert_array_equal(toarray(encoded), expected)
        assert_equal(issparse(encoded), sparse_output)
        round_trip = binarizer.inverse_transform(encoded)
        assert_array_equal(toarray(round_trip), toarray(y))
        assert_equal(issparse(round_trip), issparse(y))

Example #18

Source File: split.py From nyaggle with MIT License

5 votes

def check_cv(cv: Union[int, Iterable, BaseCrossValidator] = 5,
             y: Optional[Union[pd.Series, np.ndarray]] = None,
             stratified: bool = False,
             random_state: int = 0):
    """Build a cross-validation splitter from ``cv``.

    Args:
        cv: Number of folds, an iterable of splits, or a splitter object.
            ``None`` means 5 folds.
        y: Target values; only inspected when ``cv`` is an integer and
            ``stratified`` is set.
        stratified: Use ``StratifiedKFold`` for binary/multiclass targets.
        random_state: Seed for the shuffled (Stratified)KFold.

    Returns:
        A shuffled ``StratifiedKFold``/``KFold`` for integer ``cv``;
        otherwise whatever ``sklearn.model_selection.check_cv`` returns.
    """
    if cv is None:
        cv = 5
    if isinstance(cv, numbers.Integral):
        if stratified and (y is not None) and (type_of_target(y) in ('binary', 'multiclass')):
            return StratifiedKFold(cv, shuffle=True, random_state=random_state)
        return KFold(cv, shuffle=True, random_state=random_state)

    # `classifier` is keyword-only in current scikit-learn; passing it by
    # name works across versions.
    return model_selection.check_cv(cv, y, classifier=stratified)

Example #19

Source File: test_common.py From twitter-stock-recommendation with MIT License

5 votes

def check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize,
                    y_score):
    """Run _check_averaging for the metric registered under ``name``.

    Metrics in METRICS_WITH_AVERAGING are checked against the predictions;
    those in THRESHOLDED_METRICS_WITH_AVERAGING against the scores. Any
    other name raises ValueError.
    """
    is_multilabel = type_of_target(y_true).startswith("multilabel")

    metric = ALL_METRICS[name]

    if name in METRICS_WITH_AVERAGING:
        outputs, outputs_binarize = y_pred, y_pred_binarize
    elif name in THRESHOLDED_METRICS_WITH_AVERAGING:
        # Scores are used both as raw and as binarized output.
        outputs, outputs_binarize = y_score, y_score
    else:
        raise ValueError("Metric is not recorded as having an average option")

    _check_averaging(metric, y_true, outputs, y_true_binarize,
                     outputs_binarize, is_multilabel)

Example #20

Source File: __init__.py From autogluon with Apache License 2.0

5 votes

def __call__(self, y_true, y_pred, sample_weight=None):
        """Score ``y_pred`` against ``y_true`` with the wrapped score function.

        Parameters
        ----------
        y_true : array-like
            Gold standard target values. These must be class labels,
            not probabilities. Lists are converted to numpy arrays.

        y_pred : array-like, [n_samples x n_classes]
            Model predictions. Lists are converted to numpy arrays.

        sample_weight : array-like, optional (default=None)
            Sample weights; forwarded to the score function only when given.

        Returns
        -------
        score : float
            ``self._sign`` times the score function's result.

        Raises
        ------
        ValueError
            If ``y_true`` is neither binary nor multilabel-indicator.
        """
        if isinstance(y_true, list):
            y_true = np.array(y_true)
        if isinstance(y_pred, list):
            y_pred = np.array(y_pred)

        y_type = type_of_target(y_true)
        if y_type not in ("binary", "multilabel-indicator"):
            raise ValueError("{0} format is not supported".format(y_type))

        # Predictions are scored exactly as given. The former
        # `elif isinstance(y_pred, list)` stacking branch was unreachable,
        # because list predictions are already converted to arrays above.

        if sample_weight is not None:
            return self._sign * self._score_func(y_true, y_pred,
                                                 sample_weight=sample_weight,
                                                 **self._kwargs)
        return self._sign * self._score_func(y_true, y_pred, **self._kwargs)

Example #21

Source File: information_value.py From information_value with MIT License

5 votes

def check_target_binary(self, y):
        '''
        Ensure the target variable is binary; raise ValueError otherwise.
        :param y: target variable to check
        :return: None
        '''
        detected = type_of_target(y)
        if detected == 'binary':
            return
        raise ValueError('Label type must be binary')

Example #22

Source File: test_corner_cases.py From smrt with BSD 3-Clause "New" or "Revised" License

5 votes

def test_label_corner_cases():
    """smote_balance must raise ValueError for three unsupported targets."""
    # One class more than the configured maximum (the limit may change).
    n_classes = base.MAX_N_CLASSES + 1

    # Repeat every label so each occurs twice; sklearn then reports a
    # multiclass target instead of a continuous one.
    labels = np.tile(np.arange(n_classes), 2)

    y_type = type_of_target(labels)
    assert y_type == 'multiclass', y_type

    # Random features; their values are irrelevant here.
    x = np.random.rand(labels.shape[0], 4)
    n_rows = x.shape[0]

    # Case 1: too many classes.
    assert_raises(ValueError, smote_balance, x, labels)

    # Case 2: a continuous target is an improper y_type.
    labels = np.linspace(0, 1000, n_rows)
    assert_raises(ValueError, smote_balance, x, labels)

    # Case 3: binary target where one class has a single observation.
    labels = np.zeros(n_rows)
    labels[0] = 1
    y_type = type_of_target(labels)
    assert y_type == 'binary', y_type

    assert_raises(ValueError, smote_balance, x, labels)

Example #23

Source File: test_common.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize,
                    y_score):
    """Dispatch the averaging check for the metric called ``name``.

    Uses the predictions for METRICS_WITH_AVERAGING and the scores for
    THRESHOLDED_METRICS_WITH_AVERAGING; raises ValueError for other names.
    """
    is_multilabel = type_of_target(y_true).startswith("multilabel")
    metric = ALL_METRICS[name]

    uses_predictions = name in METRICS_WITH_AVERAGING
    if not uses_predictions and name not in THRESHOLDED_METRICS_WITH_AVERAGING:
        raise ValueError("Metric is not recorded as having an average option")

    if uses_predictions:
        _check_averaging(metric, y_true, y_pred, y_true_binarize,
                         y_pred_binarize, is_multilabel)
    else:
        _check_averaging(metric, y_true, y_score, y_true_binarize,
                         y_score, is_multilabel)

Example #24

Source File: preprocessing.py From reportgen with MIT License

5 votes

def _posibility(self, x, tag, event=1):
        """Compute event / non-event rates for every distinct value of ``x``.

        Parameters:
        ----------
            x (Sequence): - sequence of discrete feature values
            tag (Sequence): - label sequence; must be a binary target
            event (any): - the label value counted as the event
        Returns:
        ----------
            Dict[value, Tuple[rate_event, rate_non_event]]: - for each
            non-null value of ``x``, its share of all event labels and its
            share of all non-event labels
        """
        if type_of_target(tag) != 'binary':
            raise AttributeError("tag must be a binary array")

        labels = np.array(tag)
        values = np.array(x)

        n_events = (labels == event).sum()
        n_non_events = labels.shape[-1] - n_events

        rates = {}
        # Null entries of x are filtered out before the distinct values
        # are enumerated.
        for value in pd.unique(values[pd.notnull(values)]):
            matched = labels[np.where(values == value)[0]]
            hits = (matched == event).sum()
            misses = matched.shape[-1] - hits
            rates[value] = (1.0 * hits / n_events,
                            1.0 * misses / n_non_events)
        return rates

Example #25

Source File: metrics.py From hyperparameter_hunter with MIT License

5 votes

def get_clean_prediction(target: ArrayLike, prediction: ArrayLike):
    """Create `prediction` that is of a form comparable to `target`

    Parameters
    ----------
    target: Array-like
        True labels for the data. Should be same shape as `prediction`
    prediction: Array-like
        Predicted labels for the data. Should be same shape as `target`

    Returns
    -------
    prediction: Array-like
        If `target` types are ints, and `prediction` types are not, given predicted labels clipped
        between the min, and max of `target`, then rounded to the nearest integer. Else, if
        `target` is a classification target and `prediction` is continuous, the result of
        `classify_output(target, prediction)`. Else, original predicted labels"""
    # Both target types are computed up front; they are only consulted by the `elif` below
    target_type = type_of_target(target)
    prediction_type = type_of_target(prediction)
    # ValueError probably: "Classification metrics can't handle a mix of binary and continuous targets"
    # NOTE(review): `_is_int` is defined outside this block - presumably an integer-dtype check; confirm
    if _is_int(target) and not _is_int(prediction):
        #################### Get Minimum/Maximum ####################
        target_min, target_max = target.min(), target.max()

        with suppress(TypeError):  # Bypass one-dimensional arrays, whose min/max should be a scalar
            # `len()` of a scalar raises TypeError, which is suppressed; length-1 results are unwrapped
            if (len(target_min) == 1) and (len(target_max) == 1):
                target_min, target_max = target_min[0], target_max[0]

        #################### Clip/Round `prediction` ####################
        try:
            prediction = np.clip(prediction, target_min, target_max)
        except ValueError:
            # Fallback to the object's own `clip` with `axis=1`
            # NOTE(review): presumably `prediction` is a pandas object here - confirm
            prediction = prediction.clip(target_min, target_max, axis=1)
        finally:
            # `finally` runs whichever clip path was taken, so the cast and rounding always apply
            prediction = prediction.astype(np.float64)
            prediction = np.rint(prediction)
    elif target_type in classification_target_types and prediction_type.startswith("continuous"):
        # NOTE(review): `classification_target_types` and `classify_output` are defined elsewhere
        prediction = classify_output(target, prediction)

    # TODO: One-hot-encoded outputs will be of type "multiclass-multioutput" - Handle it
    return prediction

Example #26

Source File: ml_stratifiers.py From iterative-stratification with BSD 3-Clause "New" or "Revised" License

5 votes

def _iter_indices(self, X, y, groups=None):
        """Yield ``self.n_splits`` pairs of (train, test) index arrays.

        ``y`` is validated, cast to a boolean array and must be a
        multilabel-indicator target, otherwise ValueError is raised. Each
        split shuffles the samples, stratifies them with
        IterativeStratification and maps the result back to input order.
        ``groups`` is not used.
        """
        n_samples = _num_samples(X)
        labels = np.asarray(check_array(y, ensure_2d=False, dtype=None),
                            dtype=bool)
        kind = type_of_target(labels)

        if kind != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(
                    kind))

        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
                                                  self.train_size)

        n_rows = labels.shape[0]
        rng = check_random_state(self.random_state)
        pristine = labels.copy()

        # Desired train/test proportions handed to the stratifier.
        proportions = np.array([n_train, n_test]) / (n_train + n_test)

        for _ in range(self.n_splits):
            order = np.arange(n_rows)
            rng.shuffle(order)
            shuffled = pristine[order]

            folds = IterativeStratification(labels=shuffled, r=proportions,
                                            random_state=rng)

            # Restore input order; fold id 1 marks the test samples.
            in_test = folds[np.argsort(order)] == 1
            test = np.where(in_test)[0]
            train = np.where(~in_test)[0]

            yield train, test

Example #27

Source File: test_label.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Verify binarization of ``y`` and its inversion, sparse and dense.

    Covers label_binarize, the private inverse helpers and a LabelBinarizer
    round trip.
    """
    label_options = dict(neg_label=neg_label, pos_label=pos_label)

    for sparse_output in (True, False):
        rejects_sparse = pos_label == 0 or neg_label != 0
        if rejects_sparse and sparse_output:
            # Sparse output must raise for this label combination.
            assert_raises(ValueError, label_binarize, y, classes,
                          sparse_output=sparse_output, **label_options)
            continue

        # label_binarize must reproduce the expected matrix.
        binary_matrix = label_binarize(y, classes,
                                       sparse_output=sparse_output,
                                       **label_options)
        assert_array_equal(toarray(binary_matrix), expected)
        assert_equal(issparse(binary_matrix), sparse_output)

        # The matching inverse helper must recover y.
        y_type = type_of_target(y)
        if y_type == "multiclass":
            recovered = _inverse_binarize_multiclass(binary_matrix,
                                                     classes=classes)
        else:
            halfway = (neg_label + pos_label) / 2.
            recovered = _inverse_binarize_thresholding(binary_matrix,
                                                       output_type=y_type,
                                                       classes=classes,
                                                       threshold=halfway)

        assert_array_equal(toarray(recovered), toarray(y))

        # LabelBinarizer must agree in both directions.
        lb = LabelBinarizer(sparse_output=sparse_output, **label_options)
        binary_matrix = lb.fit_transform(y)
        assert_array_equal(toarray(binary_matrix), expected)
        assert_equal(issparse(binary_matrix), sparse_output)
        restored = lb.inverse_transform(binary_matrix)
        assert_array_equal(toarray(restored), toarray(y))
        assert_equal(issparse(restored), issparse(y))

Example #28

Source File: models.py From scikit-uplift with MIT License

5 votes

def fit(self, X, y, treatment, estimator_fit_params=None):
        """Fit the wrapped estimator on a target derived from y and treatment.

        The estimator is trained on ``y_mod``, which is 1 where ``y`` equals
        ``treatment`` and 0 elsewhere.

        Args:
            X (array-like, shape (n_samples, n_features)): Training vector.
            y (array-like, shape (n_samples,)): Target vector relative to X;
                must be a binary target.
            treatment (array-like, shape (n_samples,)): Treatment vector
                relative to X.
            estimator_fit_params (dict, optional): Keyword arguments passed
                to the estimator's fit method.

        Returns:
            object: self

        Raises:
            ValueError: If ``y`` is not a binary target.
        """

        # TODO: check the treatment is binary
        check_consistent_length(X, y, treatment)

        target_kind = type_of_target(y)
        self._type_of_target = target_kind
        if target_kind != 'binary':
            raise ValueError("This approach is only suitable for binary classification problem")

        # Warn when the first two treatment groups differ in size.
        group_sizes = np.unique(treatment, return_counts=True)[1]
        if group_sizes[0] != group_sizes[1]:
            warnings.warn(
                "It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.",
                category=UserWarning,
                stacklevel=2
            )

        agreement = np.array(y) == np.array(treatment)
        y_mod = agreement.astype(int)

        fit_kwargs = {} if estimator_fit_params is None else estimator_fit_params
        self.estimator.fit(X, y_mod, **fit_kwargs)
        return self

Example #29

Source File: base.py From smrt with BSD 3-Clause "New" or "Revised" License

4 votes

def _validate_X_y_ratio_classes(X, y, ratio):
    """Validate balancer inputs and derive the class statistics.

    Returns the tuple ``(X, y, n_classes, present_classes, counts,
    majority_label, target_count)``. Raises ValueError for an unsupported
    target type, more than MAX_N_CLASSES classes, or any class with fewer
    than MIN_N_SAMPLES observations.
    """
    # Cheap scalar validation first, before any array is copied.
    validate_float(ratio, 'balance_ratio')

    X = check_array(X, accept_sparse=False, dtype=NPDTYPE, ensure_2d=True, copy=True)
    checked_y = check_array(y, accept_sparse=False, ensure_2d=False, dtype=None)
    y = column_or_1d(checked_y, warn=False)

    # Only genuine class labels are accepted (no floats or similar).
    y_type = type_of_target(y)
    supported_types = {'multiclass', 'binary'}
    if y_type not in supported_types:
        supported_repr = "(" + ', '.join(supported_types) + ")"
        raise ValueError('balancers only support %r, but got %r'
                         % (supported_repr, y_type))

    labels, label_counts = np.unique(y, return_counts=True)
    n_classes = len(labels)

    if n_classes > MAX_N_CLASSES:
        raise ValueError('balancers currently only support a maximum of %i '
                         'unique class labels, but %i were identified.' % (MAX_N_CLASSES, n_classes))

    # The majority class and the count the other classes are balanced to.
    top = np.argmax(label_counts, axis=0)
    majority_label = labels[top]
    majority_count = label_counts[top]
    target_count = max(int(ratio * majority_count), 1)

    # Historical note: classes needing more synthetic samples than they have
    # real ones were once rejected. That is now allowed (rare events may
    # need lots of bolstering), even though it may be discouraged.

    for count in label_counts:
        if count < MIN_N_SAMPLES:
            raise ValueError('All label counts must be >= %i' % MIN_N_SAMPLES)

    return X, y, n_classes, labels, label_counts, majority_label, target_count

Example #30

Source File: base.py From skoot with MIT License

4 votes

def _validate_X_y_ratio_classes(X, y, ratio):
    """Check the balancer inputs and compute per-class statistics.

    Returns ``(X, y, n_classes, present_classes, counts, majority_label,
    target_count)``. Raises ValueError when the target type is unsupported,
    when there are more than MAX_N_CLASSES classes, or when any class has
    fewer than MIN_N_SAMPLES observations.
    """
    # The scalar check is cheap, so it goes before any array handling.
    validate_float(ratio, 'balance_ratio')

    X, y = indexable(X, y)  # want to allow pd.DataFrame
    y = column_or_1d(y, warn=False)  # type: np.ndarray

    # Reject anything that is not a set of class labels.
    y_type = type_of_target(y)
    supported_types = {'multiclass', 'binary'}
    if y_type not in supported_types:
        listing = "(" + ', '.join(supported_types) + ")"
        raise ValueError('balancers only support %r, but got %r'
                         % (listing, y_type))

    unique_labels_found, occurrences = np.unique(y, return_counts=True)
    n_classes = len(unique_labels_found)

    if n_classes > MAX_N_CLASSES:
        raise ValueError('balancers currently only support a maximum of %i '
                         'unique class labels, but %i were identified.'
                         % (MAX_N_CLASSES, n_classes))

    # Locate the majority class; its count scaled by ``ratio`` (at least 1)
    # is the target count.
    biggest = np.argmax(occurrences, axis=0)
    majority_label = unique_labels_found[biggest]
    target_count = max(int(ratio * occurrences[biggest]), 1)

    # Historical note: an earlier version refused classes that needed more
    # synthetic examples than they had real ones. That is a valid use-case
    # (rare events may need lots of bolstering), so it is allowed now.

    too_small = any(n < MIN_N_SAMPLES for n in occurrences)
    if too_small:
        raise ValueError('All label counts must be >= %i' % MIN_N_SAMPLES)

    return (X, y, n_classes, unique_labels_found, occurrences,
            majority_label, target_count)