Python sklearn.base.is_classifier() Examples

The following are 19 code examples of sklearn.base.is_classifier(), collected from open-source projects. Each example lists its source file, the project it comes from, and that project's license. You may also want to check out the other available functions and classes of the sklearn.base module.
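As a quick orientation before the examples, here is a minimal sketch (estimator choices are illustrative) of what is_classifier() reports for a plain estimator and for a meta-estimator such as a Pipeline, which delegates to its final step:

from sklearn.base import is_classifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

print(is_classifier(LogisticRegression()))  # True
print(is_classifier(LinearRegression()))    # False

# Pipelines report the type of their final step, so this is "a classifier" too.
pipe = Pipeline([("scale", StandardScaler()), ("model", LogisticRegression())])
print(is_classifier(pipe))                  # True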
Example #1
Source File: pipline.py    From MachineLearning with Apache License 2.0
def transform(self, X, y=None):
        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
        
        X_prob = np.zeros((X.shape[0], self.n_classes))
        X_pred = np.zeros(X.shape[0])
        
        for estimator, (_, test) in zip(self.estimators_, cv.split(X)):
            X_prob[test] = estimator.predict_proba(X[test])
            X_pred[test] = estimator.predict(X[test])
        return np.hstack([X_prob, np.array([X_pred]).T]) 
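The classifier=is_classifier(self.estimator) flag passed to check_cv is what decides whether the folds are stratified; note that a stratified splitter needs the labels at split time (the companion fit method in Example #9 passes them). A minimal sketch of the two outcomes, assuming a simple binary target:

import numpy as np
from sklearn.model_selection import check_cv

y = np.array([0, 1, 0, 1, 0, 1])

print(type(check_cv(3, y, classifier=True)).__name__)   # StratifiedKFold
print(type(check_cv(3, y, classifier=False)).__name__)  # KFold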
Example #2
Source File: grid_search_cv.py    From sklearn-onnx with MIT License
def convert_sklearn_grid_search_cv(scope, operator, container):
    """
    Converter for scikit-learn's GridSearchCV.
    """
    opts = scope.get_options(operator.raw_operator)
    grid_search_op = operator.raw_operator
    best_estimator = grid_search_op.best_estimator_
    op_type = sklearn_operator_name_map[type(best_estimator)]
    grid_search_operator = scope.declare_local_operator(op_type)
    grid_search_operator.raw_operator = best_estimator
    container.add_options(id(best_estimator), opts)
    grid_search_operator.inputs = operator.inputs
    label_name = scope.declare_local_variable('label')
    grid_search_operator.outputs.append(label_name)
    if is_classifier(best_estimator):
        proba_name = scope.declare_local_variable('probability_tensor',
                                                  FloatTensorType())
        grid_search_operator.outputs.append(proba_name)
    apply_identity(scope, label_name.full_name,
                   operator.outputs[0].full_name, container)
    if is_classifier(best_estimator):
        apply_identity(scope, proba_name.full_name,
                       operator.outputs[1].full_name, container) 
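A hedged usage sketch of how this converter is typically reached, via skl2onnx's top-level convert_sklearn call on a fitted GridSearchCV (dataset and hyper-parameter grid here are illustrative); because the best estimator is a classifier, the exported graph ends up with both the label output and the probability output wired up above:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

X, y = load_iris(return_X_y=True)
X = X.astype(np.float32)

grid = GridSearchCV(SVC(probability=True), {"C": [0.1, 1.0]}).fit(X, y)

onnx_model = convert_sklearn(
    grid, initial_types=[("input", FloatTensorType([None, X.shape[1]]))]
)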
Example #3
Source File: externals.py    From sports-betting with MIT License
def fit(self, X, y, sample_weight=None):
        """Fit a separate classifier for each output variable."""

        for _, clf in self.classifiers:
            if not hasattr(clf, 'fit'):
                raise ValueError('Every base classifier should implement a fit method.')

        X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)

        if is_classifier(self):
            check_classification_targets(y)

        if y.ndim == 1:
            raise ValueError('Output y must have at least two dimensions for multi-output classification but has only one.')

        if sample_weight is not None and any([not has_fit_parameter(clf, 'sample_weight') for _, clf in self.classifiers]):
            raise ValueError('One of the base classifiers does not support sample weights.')

        self.classifiers_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_estimator)(clf, X, y[:, i], sample_weight) 
                                                        for i, (_, clf) in zip(range(y.shape[1]), self.classifiers))
        
        return self 
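The _fit_estimator helper is not part of this excerpt. A plausible minimal version, modeled on the helper of the same name in sklearn.multioutput (an assumption about the project's actual implementation):

from sklearn.base import clone

def _fit_estimator(estimator, X, y, sample_weight=None):
    # Clone so each output column gets its own independently fitted copy.
    estimator = clone(estimator)
    if sample_weight is not None:
        estimator.fit(X, y, sample_weight=sample_weight)
    else:
        estimator.fit(X, y)
    return estimator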
Example #4
Source File: cart.py    From Hands-on-Supervised-Machine-Learning-with-Python with MIT License
def __init__(self, X, y, criterion, min_samples_split, max_depth,
                 n_val_sample, random_state):
        # make sure max_depth > 1
        if max_depth < 2:
            raise ValueError("max depth must be > 1")

        # check the input arrays, and if it's classification validate the
        # target values in y
        X, y = check_X_y(X, y, accept_sparse=False, dtype=None, copy=True)
        if is_classifier(self):
            check_classification_targets(y)

        # hyper parameters so we can later inspect attributes of the model
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_val_sample = n_val_sample
        self.random_state = random_state

        # create the splitting class
        random_state = check_random_state(random_state)
        self.splitter = RandomSplitter(random_state, criterion, n_val_sample)

        # grow the tree depth first
        self.tree = self._find_next_split(X, y, 0) 
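The is_classifier(self) check only fires if the class advertises itself as a classifier, which in scikit-learn happens through the _estimator_type attribute set by ClassifierMixin (or RegressorMixin). A minimal sketch with illustrative class names:

from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, is_classifier

class MyCARTClassifier(ClassifierMixin, BaseEstimator):
    pass  # _estimator_type == "classifier"

class MyCARTRegressor(RegressorMixin, BaseEstimator):
    pass  # _estimator_type == "regressor"

print(is_classifier(MyCARTClassifier()))  # True  -> check_classification_targets(y) runs
print(is_classifier(MyCARTRegressor()))   # False -> the target check is skipped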
Example #5
Source File: __init__.py    From oddt with BSD 3-Clause "New" or "Revised" License
def __init__(self, models):
        """Proxy class to build an ensemble of models with an API as one

        Parameters
        ----------
        models: array
            An array of models
        """
        self._models = models if len(models) else None
        if self._models is not None:
            if is_classifier(self._models[0]):
                check_type = is_classifier
                self._scoring_fun = accuracy_score
            elif is_regressor(self._models[0]):
                check_type = is_regressor
                self._scoring_fun = r2_score
            else:
                raise ValueError('Expected regressors or classifiers,'
                                 ' got %s instead' % type(self._models[0]))
            for model in self._models:
                if not check_type(model):
                    raise ValueError('Different types of models found, provide'
                                     ' either regressors or classifiers.') 
Example #6
Source File: model.py    From gobbli with Apache License 2.0
def _validate_estimator(estimator: BaseEstimator):
        """
        Run some checks on the given object to determine if it's an estimator which is
        valid for our purposes.
        """
        # sklearn has a function that does a lot more intensive checking regarding
        # the interface of a candidate Estimator
        # (sklearn.utils.estimator_checks.check_estimator), but the function
        # doesn't work well for our use case as of version 0.22.  It doesn't properly
        # detect Pipeline X_types based on the first pipeline component and won't
        # test anything that doesn't accept a 2-D numpy array as input.  We'll settle
        # for lax checks here until sklearn has something that works better for us.
        if not is_classifier(estimator):
            raise ValueError(
                "Estimator must be a classifier according to sklearn.base.is_classifier()"
            )

        if not hasattr(estimator, "predict_proba"):
            raise ValueError(
                "Estimator must support the predict_proba() method to fulfill gobbli's "
                "interface requirements for a prediction model."
            ) 
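A hedged usage sketch of these checks, assuming _validate_estimator is callable as the staticmethod its signature suggests: LinearRegression fails the first check, and LinearSVC (a classifier without predict_proba) fails the second, while LogisticRegression passes both.

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import LinearSVC

_validate_estimator(LogisticRegression())       # OK: classifier with predict_proba

for bad in (LinearRegression(), LinearSVC()):
    try:
        _validate_estimator(bad)
    except ValueError as exc:
        print(type(bad).__name__, "rejected:", exc)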
Example #7
Source File: sklearn_pipeline.py    From ramp-workflow with BSD 3-Clause "New" or "Revised" License
def test_submission(self, estimator_fitted, X):
        """Predict using a fitted estimator.

        Parameters
        ----------
        estimator_fitted : estimator object
            A fitted scikit-learn estimator.
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features)
            The test data set.

        Returns
        -------
        pred : ndarray of shape (n_samples, n_classes) or (n_samples)
        """
        if is_classifier(estimator_fitted):
            return estimator_fitted.predict_proba(X)
        return estimator_fitted.predict(X) 
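A standalone illustration of the same branch and the shapes it yields (not using the workflow class itself; estimator choices are illustrative):

from sklearn.base import is_classifier
from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression, LogisticRegression

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000).fit(X, y)
reg = LinearRegression().fit(X, y)

for est in (clf, reg):
    pred = est.predict_proba(X) if is_classifier(est) else est.predict(X)
    print(type(est).__name__, pred.shape)  # (150, 3) for the classifier, (150,) for the regressor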
Example #8
Source File: test_base.py    From twitter-stock-recommendation with MIT License
def test_is_classifier():
    svc = SVC()
    assert_true(is_classifier(svc))
    assert_true(is_classifier(GridSearchCV(svc, {'C': [0.1, 1]})))
    assert_true(is_classifier(Pipeline([('svc', svc)])))
    assert_true(is_classifier(Pipeline(
        [('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))]))) 
Example #9
Source File: pipline.py    From MachineLearning with Apache License 2.0
def fit(self, X, y):
        y_labels = self._get_labels(y)
        cv = check_cv(self.cv, y_labels, classifier=is_classifier(self.estimator))
        self.estimators_ = []
        
        for train, _ in cv.split(X, y_labels):
            self.estimators_.append(
                clone(self.estimator).fit(X[train], y_labels[train])
            )
        return self 
Example #10
Source File: _validation.py    From mriqc with BSD 3-Clause "New" or "Revised" License
def cross_val_score(
    estimator,
    X,
    y=None,
    groups=None,
    scoring=None,
    cv=None,
    n_jobs=1,
    verbose=0,
    fit_params=None,
    pre_dispatch="2*n_jobs",
):
    """
    Evaluate a score by cross-validation
    """
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fit_and_score)(
            clone(estimator), X, y, scorer, train, test, verbose, None, fit_params
        )
        for train, test in splits
    )

    group_order = []
    if hasattr(cv, "groups"):
        group_order = [np.array(cv.groups)[test].tolist()[0] for _, test in splits]
    return np.squeeze(np.array(scores)), group_order 
Example #11
Source File: cross_validation.py    From Pyspatialml with GNU General Public License v3.0
def fit(self, X, y=None, groups=None, **fit_params):
        """
        Run fit method with all sets of parameters

        Args
        ----
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning

        groups : array-like, shape = [n_samples], optional
            Training vector groups for cross-validation

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """

        # check estimator and cv methods are valid
        self.cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

        # check for binary response
        if len(np.unique(y)) > 2:
            raise ValueError('Only a binary response vector is currently supported')

        # check that scoring metric has been specified
        if self.scoring is None:
            raise ValueError('No score function is defined')

        # perform cross validation prediction
        self.y_pred_ = cross_val_predict(
            estimator=self.estimator, X=X, y=y, groups=groups, cv=self.cv,
            method='predict_proba', n_jobs=self.n_jobs, **fit_params)
        self.y_true = y

        # add fold id to the predictions
        self.test_idx_ = [indexes[1] for indexes in self.cv.split(X, y, groups)] 
Example #12
Source File: test_sgd.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_validation_set_not_used_for_training(klass):
    X, Y = iris.data, iris.target
    validation_fraction = 0.4
    seed = 42
    shuffle = False
    max_iter = 10
    clf1 = klass(early_stopping=True,
                 random_state=np.random.RandomState(seed),
                 validation_fraction=validation_fraction,
                 learning_rate='constant', eta0=0.01,
                 tol=None, max_iter=max_iter, shuffle=shuffle)
    clf1.fit(X, Y)
    assert clf1.n_iter_ == max_iter

    clf2 = klass(early_stopping=False,
                 random_state=np.random.RandomState(seed),
                 learning_rate='constant', eta0=0.01,
                 tol=None, max_iter=max_iter, shuffle=shuffle)

    if is_classifier(clf2):
        cv = StratifiedShuffleSplit(test_size=validation_fraction,
                                    random_state=seed)
    else:
        cv = ShuffleSplit(test_size=validation_fraction,
                          random_state=seed)
    idx_train, idx_val = next(cv.split(X, Y))
    idx_train = np.sort(idx_train)  # remove shuffling
    clf2.fit(X[idx_train], Y[idx_train])
    assert clf2.n_iter_ == max_iter

    assert_array_equal(clf1.coef_, clf2.coef_) 
Example #13
Source File: test_sgd.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_late_onset_averaging_not_reached(klass):
    clf1 = klass(average=600)
    clf2 = klass()
    for _ in range(100):
        if is_classifier(clf1):
            clf1.partial_fit(X, Y, classes=np.unique(Y))
            clf2.partial_fit(X, Y, classes=np.unique(Y))
        else:
            clf1.partial_fit(X, Y)
            clf2.partial_fit(X, Y)

    assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16)
    assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16) 
Example #14
Source File: test_base.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_is_classifier():
    svc = SVC()
    assert is_classifier(svc)
    assert is_classifier(GridSearchCV(svc, {'C': [0.1, 1]}))
    assert is_classifier(Pipeline([('svc', svc)]))
    assert is_classifier(Pipeline(
        [('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))])) 
Example #15
Source File: _validation.py    From mriqc with BSD 3-Clause "New" or "Revised" License
def permutation_test_score(
    estimator,
    X,
    y,
    groups=None,
    cv=None,
    n_permutations=100,
    n_jobs=1,
    random_state=0,
    verbose=0,
    scoring=None,
):
    """
    Evaluate the significance of a cross-validated score with permutations,
    as in test 1 of [Ojala2010]_.

    A modified version of scikit-learn's original permutation_test_score: the
    p-value is evaluated outside this function, so that the permutation scores
    can be reused elsewhere.


    .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier
                   Performance.  The Journal of Machine Learning Research (2010)
                   vol. 11

    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, groups, random_state), groups, cv, scorer
        )
        for _ in range(n_permutations)
    )
    permutation_scores = np.array(permutation_scores)
    return permutation_scores 
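As the docstring notes, the p-value is evaluated outside this function. Given the permutation_scores it returns and a reference score computed separately on the unpermuted data, the standard formula (the one used by scikit-learn's original permutation_test_score; the helper name here is illustrative) is:

import numpy as np

def permutation_pvalue(score, permutation_scores):
    # Fraction of permutations that scored at least as well as the real score,
    # with the +1 correction so the p-value is never exactly zero.
    n_permutations = len(permutation_scores)
    return (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)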
Example #16
Source File: sklearn_patches.py    From tslearn with BSD 2-Clause "Simplified" License
def yield_all_checks(name, estimator):
    tags = estimator._get_tags()
    if "2darray" not in tags["X_types"]:
        warnings.warn("Can't test estimator {} which requires input "
                      " of type {}".format(name, tags["X_types"]),
                      SkipTestWarning)
        return
    if tags["_skip_test"]:
        warnings.warn("Explicit SKIP via _skip_test tag for estimator "
                      "{}.".format(name),
                      SkipTestWarning)
        return

    yield from _yield_checks(name, estimator)
    if is_classifier(estimator):
        yield from _yield_classifier_checks(name, estimator)
    if is_regressor(estimator):
        yield from _yield_regressor_checks(name, estimator)
    if hasattr(estimator, 'transform'):
        if not tags["allow_variable_length"]:
            # Transformer tests ensure that shapes are the same at fit and
            # transform time, hence we need to skip them for estimators that
            # allow variable-length inputs
            yield from _yield_transformer_checks(name, estimator)
    if isinstance(estimator, ClusterMixin):
        yield from _yield_clustering_checks(name, estimator)
    if is_outlier_detector(estimator):
        yield from _yield_outliers_checks(name, estimator)
    # We are not strict on presence/absence of the 3rd dimension
    # yield check_fit2d_predict1d

    if not tags["non_deterministic"]:
        yield check_methods_subset_invariance

    yield check_fit2d_1sample
    yield check_fit2d_1feature
    yield check_fit1d
    yield check_get_params_invariance
    yield check_set_params
    yield check_dict_unchanged
    yield check_dont_overwrite_parameters
    yield check_fit_idempotent

    if (is_classifier(estimator) or
            is_regressor(estimator) or
            isinstance(estimator, ClusterMixin)):
        if tags["allow_variable_length"]:
            yield check_different_length_fit_predict_transform 
Example #17
Source File: sklearn.py    From optuna with MIT License
def _cross_validate_with_pruning(
        self,
        trial,  # type: trial_module.Trial
        estimator,  # type: BaseEstimator
    ):
        # type: (...) -> Dict[str, OneDimArrayLikeType]

        if is_classifier(estimator):
            partial_fit_params = self.fit_params.copy()
            classes = np.unique(self.y)

            partial_fit_params.setdefault("classes", classes)

        else:
            partial_fit_params = self.fit_params

        n_splits = self.cv.get_n_splits(self.X, self.y, groups=self.groups)
        estimators = [clone(estimator) for _ in range(n_splits)]
        scores = {
            "fit_time": np.zeros(n_splits),
            "score_time": np.zeros(n_splits),
            "test_score": np.empty(n_splits),
        }

        if self.return_train_score:
            scores["train_score"] = np.empty(n_splits)

        for step in range(self.max_iter):
            for i, (train, test) in enumerate(self.cv.split(self.X, self.y, groups=self.groups)):
                out = self._partial_fit_and_score(estimators[i], train, test, partial_fit_params)

                if self.return_train_score:
                    scores["train_score"][i] = out.pop(0)

                scores["test_score"][i] = out[0]
                scores["fit_time"][i] += out[1]
                scores["score_time"][i] += out[2]

            intermediate_value = np.nanmean(scores["test_score"])

            trial.report(intermediate_value, step=step)

            if trial.should_prune():
                self._store_scores(trial, scores)

                raise TrialPruned("trial was pruned at iteration {}.".format(step))

        return scores 
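The classifier branch exists because incremental classifiers must be told the full set of labels on the first partial_fit call, while regressors take no classes argument. A minimal sketch:

import numpy as np
from sklearn.linear_model import SGDClassifier, SGDRegressor

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])

clf = SGDClassifier()
clf.partial_fit(X, y, classes=np.unique(y))  # classes is required on the first call

reg = SGDRegressor()
reg.partial_fit(X, y.astype(float))          # regressors have no classes parameter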
Example #18
Source File: test_export.py    From twitter-stock-recommendation with MIT License
def test_precision():

    rng_reg = RandomState(2)
    rng_clf = RandomState(8)
    for X, y, clf in zip(
            (rng_reg.random_sample((5, 2)),
             rng_clf.random_sample((1000, 4))),
            (rng_reg.random_sample((5, )),
             rng_clf.randint(2, size=(1000, ))),
            (DecisionTreeRegressor(criterion="friedman_mse", random_state=0,
                                   max_depth=1),
             DecisionTreeClassifier(max_depth=1, random_state=0))):

        clf.fit(X, y)
        for precision in (4, 3):
            dot_data = export_graphviz(clf, out_file=None, precision=precision,
                                       proportion=True)

            # With the current random state, the impurity and the threshold
            # will have the number of precision set in the export_graphviz
            # function. We will check the number of precision with a strict
            # equality. The value reported will have only 2 precision and
            # therefore, only a less equal comparison will be done.

            # check value
            for finding in finditer("value = \d+\.\d+", dot_data):
                assert_less_equal(
                    len(search("\.\d+", finding.group()).group()),
                    precision + 1)
            # check impurity
            if is_classifier(clf):
                pattern = "gini = \d+\.\d+"
            else:
                pattern = "friedman_mse = \d+\.\d+"

            # check impurity
            for finding in finditer(pattern, dot_data):
                assert_equal(len(search("\.\d+", finding.group()).group()),
                             precision + 1)
            # check threshold
            for finding in finditer("<= \d+\.\d+", dot_data):
                assert_equal(len(search("\.\d+", finding.group()).group()),
                             precision + 1) 
Example #19
Source File: test_export.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_precision():

    rng_reg = RandomState(2)
    rng_clf = RandomState(8)
    for X, y, clf in zip(
            (rng_reg.random_sample((5, 2)),
             rng_clf.random_sample((1000, 4))),
            (rng_reg.random_sample((5, )),
             rng_clf.randint(2, size=(1000, ))),
            (DecisionTreeRegressor(criterion="friedman_mse", random_state=0,
                                   max_depth=1),
             DecisionTreeClassifier(max_depth=1, random_state=0))):

        clf.fit(X, y)
        for precision in (4, 3):
            dot_data = export_graphviz(clf, out_file=None, precision=precision,
                                       proportion=True)

            # With the current random state, the impurity and the threshold
            # will have the number of precision set in the export_graphviz
            # function. We will check the number of precision with a strict
            # equality. The value reported will have only 2 precision and
            # therefore, only a less equal comparison will be done.

            # check value
            for finding in finditer(r"value = \d+\.\d+", dot_data):
                assert_less_equal(
                    len(search(r"\.\d+", finding.group()).group()),
                    precision + 1)
            # check impurity
            if is_classifier(clf):
                pattern = r"gini = \d+\.\d+"
            else:
                pattern = r"friedman_mse = \d+\.\d+"

            # check impurity
            for finding in finditer(pattern, dot_data):
                assert_equal(len(search(r"\.\d+", finding.group()).group()),
                             precision + 1)
            # check threshold
            for finding in finditer(r"<= \d+\.\d+", dot_data):
                assert_equal(len(search(r"\.\d+", finding.group()).group()),
                             precision + 1)