Python sklearn.model_selection() Examples

The following are 16 code examples of sklearn.model_selection(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn, or try the search function.
Example #1
Source File: fixes.py    From skutil with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _cv_len(cv, X, y):
    """Return the number of splits produced by a cross validation object.

    The lookup is version agnostic: which API is used depends on the
    module-level ``SK18`` flag.

    Parameters
    ----------

    cv : `sklearn.cross_validation._PartitionIterator` or `sklearn.model_selection.BaseCrossValidator`
        The cv object whose length is wanted. When ``SK18`` is falsy the
        length is ``len(cv)``; otherwise it is ``cv.get_n_splits(X, y)``.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    Returns
    -------

    int
        The number of train/test splits.
    """
    if SK18:
        # New-style splitters report their length through get_n_splits.
        return cv.get_n_splits(X, y)
    # Old-style partition iterators support len() directly.
    return len(cv)
Example #2
Source File: clf_helpers.py    From ibeis with Apache License 2.0 6 votes vote down vote up
def stratified_kfold_indices(samples, **xval_kw):
        """
        Build the list of stratified cross-validation splits for ``samples``.

        TODO: check xval label frequency

        Args:
            samples: object supporting ``len()``, an ``encoded_1d()`` method
                whose result has a ``values`` attribute (used as the labels
                ``y``), and a ``group_ids`` attribute (used as the groups).
            **xval_kw: forwarded to the splitter constructor. The optional
                ``type`` key selects the splitter: ``'StratifiedGroupKFold'``
                (default) or ``'StratifiedKFold'``.

        Returns:
            list: the ``(train, test)`` items yielded by ``splitter.split``.

        Raises:
            ValueError: if ``type`` is not one of the supported names.
        """
        # Only the number of rows matters to the splitters, so X has no columns.
        X = np.empty((len(samples), 0))
        y = samples.encoded_1d().values
        groups = samples.group_ids

        type_ = xval_kw.pop('type', 'StratifiedGroupKFold')
        if type_ == 'StratifiedGroupKFold':
            assert groups is not None
            # FIXME: The StratifiedGroupKFold could be implemented better.
            splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw)
            skf_list = list(splitter.split(X=X, y=y, groups=groups))
        elif type_ == 'StratifiedKFold':
            # Imported lazily: only this branch needs sklearn.model_selection.
            from sklearn import model_selection
            splitter = model_selection.StratifiedKFold(**xval_kw)
            skf_list = list(splitter.split(X=X, y=y))
        else:
            # Previously an unknown type fell through and crashed with an
            # UnboundLocalError on ``skf_list``.
            raise ValueError(
                'Unknown cross-validation type %r; expected '
                "'StratifiedGroupKFold' or 'StratifiedKFold'" % (type_,))
        return skf_list
Example #3
Source File: classify_shark.py    From ibeis with Apache License 2.0 6 votes vote down vote up
def gen_crossval_idxs(problem, n_folds=2):
        """
        Yield ``(train_idx, test_idx)`` pairs for stratified cross-validation.

        Args:
            problem: object whose ``ds`` attribute provides ``target`` (the
                labels), ``name`` (used in the progress message) and,
                optionally, ``nids``.
            n_folds (int): number of folds (default 2).

        Yields:
            tuple: ``(train_idx, test_idx)`` for each fold.
        """
        y = problem.ds.target
        rng = 43432
        if hasattr(problem.ds, 'nids'):
            # Ensure that an individual does not appear in both the train
            # and the test dataset
            from ibeis_cnn.dataset import stratified_kfold_label_split
            labels = problem.ds.nids
            _iter = stratified_kfold_label_split(y, labels, n_folds=n_folds, rng=rng)
        else:
            try:
                from sklearn.model_selection import StratifiedKFold
            except ImportError:
                # Legacy scikit-learn (< 0.18): the splitter takes the labels
                # at construction time and is itself iterable.
                import sklearn.cross_validation
                xvalkw = dict(n_folds=n_folds, shuffle=True, random_state=rng)
                _iter = sklearn.cross_validation.StratifiedKFold(y, **xvalkw)
            else:
                # sklearn.cross_validation was removed in scikit-learn 0.20;
                # the model_selection splitter takes the data in split().
                import numpy as np
                skf = StratifiedKFold(n_splits=n_folds, shuffle=True,
                                      random_state=rng)
                # X is only a placeholder with one entry per label.
                _iter = skf.split(X=np.empty(len(y)), y=y)
        msg = 'cross-val test on %s' % (problem.ds.name)
        progiter = ut.ProgIter(_iter, length=n_folds, lbl=msg)
        for train_idx, test_idx in progiter:
            yield train_idx, test_idx


# @ut.reloadable_class 
Example #4
Source File: sklearn_intent_classifier.py    From rasa_nlu with Apache License 2.0 6 votes vote down vote up
def __init__(self,
                 component_config: Dict[Text, Any] = None,
                 clf: 'sklearn.model_selection.GridSearchCV' = None,
                 le: Optional['sklearn.preprocessing.LabelEncoder'] = None
                 ) -> None:
        """Set up an sklearn-based intent classifier.

        ``component_config`` is forwarded to the parent constructor, ``clf``
        is stored as-is, and ``le`` is stored if given; otherwise a fresh
        ``LabelEncoder`` is created.
        """
        from sklearn.preprocessing import LabelEncoder

        super(SklearnIntentClassifier, self).__init__(component_config)

        # Fall back to a new encoder only when the caller supplied none.
        self.le = LabelEncoder() if le is None else le
        self.clf = clf

        _sklearn_numpy_warning_fix()
Example #5
Source File: sklearn_intent_classifier.py    From rasa_nlu with Apache License 2.0 6 votes vote down vote up
def _create_classifier(self, num_threads, y):
        """Build the grid-search wrapper around a class-balanced SVC.

        The search grid (``C``, ``gamma``, ``kernels``) and the scoring
        function are read from ``self.component_config``; the number of CV
        splits comes from ``self._num_cv_splits(y)``; ``num_threads`` is
        passed as ``n_jobs``.
        """
        from sklearn.model_selection import GridSearchCV
        from sklearn.svm import SVC

        c_values = self.component_config["C"]
        kernel_values = self.component_config["kernels"]
        gamma_values = self.component_config["gamma"]

        # dirty str fix because sklearn is expecting
        # str not instance of basestr...
        kernel_names = [str(kernel) for kernel in kernel_values]
        search_grid = [{"C": c_values,
                        "gamma": gamma_values,
                        "kernel": kernel_names}]

        # aim for 5 examples in each fold
        n_splits = self._num_cv_splits(y)

        base_estimator = SVC(C=1,
                             probability=True,
                             class_weight='balanced')
        return GridSearchCV(base_estimator,
                            param_grid=search_grid,
                            n_jobs=num_threads,
                            cv=n_splits,
                            scoring=self.component_config['scoring_function'],
                            verbose=1)
Example #6
Source File: sklearn_intent_classifier.py    From rasa-for-botfront with Apache License 2.0 6 votes vote down vote up
def __init__(
        self,
        component_config: Optional[Dict[Text, Any]] = None,
        clf: "sklearn.model_selection.GridSearchCV" = None,
        le: Optional["sklearn.preprocessing.LabelEncoder"] = None,
    ) -> None:
        """Set up an sklearn-based intent classifier.

        ``component_config`` is forwarded to the parent constructor, ``clf``
        is stored as-is, and ``le`` is stored if given; otherwise a fresh
        ``LabelEncoder`` is created.
        """
        from sklearn.preprocessing import LabelEncoder

        super().__init__(component_config)

        # Fall back to a new encoder only when the caller supplied none.
        self.le = LabelEncoder() if le is None else le
        self.clf = clf
Example #7
Source File: sklearn_intent_classifier.py    From Rasa_NLU_Chi with Apache License 2.0 6 votes vote down vote up
def __init__(self,
                 component_config=None,  # type: Dict[Text, Any]
                 clf=None,  # type: sklearn.model_selection.GridSearchCV
                 le=None  # type: sklearn.preprocessing.LabelEncoder
                 ):
        # type: (...) -> None
        """Set up an sklearn-based intent classifier.

        ``component_config`` is forwarded to the parent constructor, ``clf``
        is stored as-is, and ``le`` is stored if given; otherwise a fresh
        ``LabelEncoder`` is created.
        """
        from sklearn.preprocessing import LabelEncoder

        super(SklearnIntentClassifier, self).__init__(component_config)

        # Fall back to a new encoder only when the caller supplied none.
        self.le = LabelEncoder() if le is None else le
        self.clf = clf

        _sklearn_numpy_warning_fix()
Example #8
Source File: sklearn_intent_classifier.py    From Rasa_NLU_Chi with Apache License 2.0 6 votes vote down vote up
def _create_classifier(self, num_threads, y):
        """Build the grid-search wrapper around a class-balanced SVC.

        The search grid (``C`` and ``kernels``) is read from
        ``self.component_config``; the number of CV splits comes from
        ``self._num_cv_splits(y)``; ``num_threads`` is passed as ``n_jobs``.
        Candidates are scored with ``'f1_weighted'``.
        """
        from sklearn.model_selection import GridSearchCV
        from sklearn.svm import SVC

        c_values = self.component_config["C"]
        kernel_values = self.component_config["kernels"]

        # dirty str fix because sklearn is expecting
        # str not instance of basestr...
        kernel_names = [str(kernel) for kernel in kernel_values]
        search_grid = [{"C": c_values,
                        "kernel": kernel_names}]

        # aim for 5 examples in each fold
        n_splits = self._num_cv_splits(y)

        base_estimator = SVC(C=1,
                             probability=True,
                             class_weight='balanced')
        return GridSearchCV(base_estimator,
                            param_grid=search_grid,
                            n_jobs=num_threads,
                            cv=n_splits,
                            scoring='f1_weighted',
                            verbose=1)
Example #9
Source File: fixes.py    From skutil with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _set_cv(cv, X, y, classifier):
    """This method returns either a `sklearn.cross_validation._PartitionIterator` or
    `sklearn.model_selection.BaseCrossValidator` depending on whether sklearn-0.17
    or sklearn-0.18 is being used.

    Parameters
    ----------

    cv : int, `_PartitionIterator` or `BaseCrossValidator`
        The CV object or int to check. If an int, will be converted
        into the appropriate class of crossvalidator.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.
        Only forwarded to ``check_cv`` when ``SK18`` is falsy.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    classifier : bool
        Whether the estimator being fit is a classifier

    Returns
    -------

    `_PartitionIterator` or `BaseCrossValidator`
        Whatever ``check_cv`` returns for the given arguments.
    """
    if SK18:
        # ``classifier`` is keyword-only in recent scikit-learn releases of
        # model_selection.check_cv, so it must not be passed positionally.
        return check_cv(cv, y, classifier=classifier)
    # Legacy cross_validation.check_cv takes X as its second argument.
    return check_cv(cv, X, y, classifier)
Example #10
Source File: build_model.py    From gordo with GNU Affero General Public License v3.0 5 votes vote down vote up
def build_split_dict(X: pd.DataFrame, split_obj: Type[BaseCrossValidator]) -> dict:
        """
        Summarise each cross-validation fold of ``X`` as flat metadata.

        Parameters
        ----------
        X: pd.DataFrame
            The training dataset that will be split during cross-validation.
        split_obj: Type[sklearn.model_selection.BaseCrossValidator]
            The cross-validation object whose ``split(X)`` yields
            ``(train_indices, test_indices)`` pairs.

        Returns
        -------
        split_metadata: Dict[str,Any]
            For every fold ``k`` (counted from 1): the ``X.index`` labels at
            the first and last train/test positions, and the number of
            train and test samples.
        """
        split_metadata: Dict[str, Any] = {}
        for fold_no, (train_ind, test_ind) in enumerate(split_obj.split(X), start=1):
            prefix = f"fold-{fold_no}"
            # Index labels at the first/last position of each partition.
            split_metadata[f"{prefix}-train-start"] = X.index[train_ind[0]]
            split_metadata[f"{prefix}-train-end"] = X.index[train_ind[-1]]
            split_metadata[f"{prefix}-test-start"] = X.index[test_ind[0]]
            split_metadata[f"{prefix}-test-end"] = X.index[test_ind[-1]]
            # Partition sizes.
            split_metadata[f"{prefix}-n-train"] = len(train_ind)
            split_metadata[f"{prefix}-n-test"] = len(test_ind)
        return split_metadata
Example #11
Source File: smk_pipeline.py    From ibeis with Apache License 2.0 5 votes vote down vote up
def testdata_smk(*args, **kwargs):
    """
    Build an SMK request on the ``PZ_MTEST`` test database.

    The annotations are split with the first fold of a 4-fold stratified
    split over their name ids: train indices become the database
    annotations and test indices become the query annotations. ``kwargs``
    override the default request config (``num_words=1000``).

    >>> from ibeis.algo.smk.smk_pipeline import *  # NOQA
    >>> kwargs = {}
    """
    import ibeis
    import sklearn
    import sklearn.cross_validation
    # import sklearn.model_selection
    ibs, aid_list = ibeis.testdata_aids(defaultdb='PZ_MTEST')
    name_ids = np.array(ibs.annots(aid_list).nids)
    rng = ut.ensure_rng(0)

    folds = sklearn.cross_validation.StratifiedKFold(
        name_ids, n_folds=4, shuffle=False, random_state=rng)
    # Only the first fold is used.
    train_idx, test_idx = next(iter(folds))
    daids = ut.take(aid_list, train_idx)
    qaids = ut.take(aid_list, test_idx)

    # Caller-supplied keyword arguments take precedence over the default.
    config = {'num_words': 1000, **kwargs}
    qreq_ = SMKRequest(ibs, qaids, daids, config)
    smk = qreq_.smk
    #qreq_ = ibs.new_query_request(qaids, daids, cfgdict={'pipeline_root': 'smk', 'proot': 'smk'})
    #qreq_ = ibs.new_query_request(qaids, daids, cfgdict={})
    return ibs, smk, qreq_
Example #12
Source File: clf_helpers.py    From ibeis with Apache License 2.0 5 votes vote down vote up
def subsplit_indices(samples, subset_idx, **xval_kw):
        """
        Split an existing subset of ``samples`` into stratified folds.

        Args:
            samples: object with an ``encoded_1d()`` method whose result has
                a ``values`` attribute (labels) and a ``group_ids``
                attribute; both are indexed with ``subset_idx``.
            subset_idx: indices of the subset to split; the returned
                indices are elements of this array.
            **xval_kw: forwarded to the splitter constructor. ``n_splits``
                defaults to 3. The optional ``type`` key selects the
                splitter: ``'StratifiedGroupKFold'`` (default) or
                ``'StratifiedKFold'``.

        Returns:
            list: ``(idx1, idx2)`` pairs expressed in the original
            coordinates (i.e. values taken from ``subset_idx``).

        Raises:
            ValueError: if ``type`` is not one of the supported names.
        """
        # Only the number of rows matters to the splitters, so X has no columns.
        X = np.empty((len(subset_idx), 0))
        y = samples.encoded_1d().values[subset_idx]
        groups = samples.group_ids[subset_idx]

        xval_kw_ = xval_kw.copy()
        xval_kw_.setdefault('n_splits', 3)
        type_ = xval_kw_.pop('type', 'StratifiedGroupKFold')
        if type_ == 'StratifiedGroupKFold':
            assert groups is not None
            # FIXME: The StratifiedGroupKFold could be implemented better.
            splitter = sklearn_utils.StratifiedGroupKFold(**xval_kw_)
            rel_skf_list = list(splitter.split(X=X, y=y, groups=groups))
        elif type_ == 'StratifiedKFold':
            # Imported lazily: only this branch needs sklearn.model_selection.
            from sklearn import model_selection
            splitter = model_selection.StratifiedKFold(**xval_kw_)
            rel_skf_list = list(splitter.split(X=X, y=y))
        else:
            # Previously an unknown type fell through and crashed with an
            # UnboundLocalError on ``rel_skf_list``.
            raise ValueError(
                'Unknown cross-validation type %r; expected '
                "'StratifiedGroupKFold' or 'StratifiedKFold'" % (type_,))

        # map back into original coords
        skf_list = [(subset_idx[rel_idx1], subset_idx[rel_idx2])
                    for rel_idx1, rel_idx2 in rel_skf_list]

        # Sanity check: every returned index must come from subset_idx.
        for idx1, idx2 in skf_list:
            assert len(np.intersect1d(subset_idx, idx1)) == len(idx1)
            assert len(np.intersect1d(subset_idx, idx2)) == len(idx2)
        return skf_list
Example #13
Source File: sklearn_intent_classifier.py    From rasa-for-botfront with Apache License 2.0 5 votes vote down vote up
def _create_classifier(
        self, num_threads: int, y
    ) -> "sklearn.model_selection.GridSearchCV":
        """Build the grid-search wrapper around a class-balanced SVC.

        The search grid (``C``, ``gamma``, ``kernels``) and the scoring
        function are read from ``self.component_config``; the number of CV
        splits comes from ``self._num_cv_splits(y)``; ``num_threads`` is
        passed as ``n_jobs``.

        ``iid=False`` is passed only when the installed ``GridSearchCV``
        still accepts an ``iid`` parameter.
        """
        import inspect

        from sklearn.model_selection import GridSearchCV
        from sklearn.svm import SVC

        C = self.component_config["C"]
        kernels = self.component_config["kernels"]
        gamma = self.component_config["gamma"]
        # dirty str fix because sklearn is expecting
        # str not instance of basestr...
        tuned_parameters = [
            {"C": C, "gamma": gamma, "kernel": [str(k) for k in kernels]}
        ]

        # aim for 5 examples in each fold

        cv_splits = self._num_cv_splits(y)

        # ``iid`` was deprecated in scikit-learn 0.22 and removed in 0.24;
        # passing it unconditionally raises TypeError on newer releases.
        optional_kwargs = {}
        if "iid" in inspect.signature(GridSearchCV.__init__).parameters:
            optional_kwargs["iid"] = False

        return GridSearchCV(
            SVC(C=1, probability=True, class_weight="balanced"),
            param_grid=tuned_parameters,
            n_jobs=num_threads,
            cv=cv_splits,
            scoring=self.component_config["scoring_function"],
            verbose=1,
            **optional_kwargs,
        )
Example #14
Source File: test_shap.py    From AIX360 with Apache License 2.0 4 votes vote down vote up
def test_Shap(self):
        """Exercise KernelExplainer on the iris and adult datasets.

        Fits KNN and linear-SVC classifiers, explains their predictions and
        prints the results; nothing is asserted.
        """

        def _dump(label, explainer, values):
            # Print the label, the SHAP values, the first expected value and
            # the first SHAP entry, in that order.
            print(label)
            print(values)
            print(explainer.explainer.expected_value[0])
            print(values[0])

        np.random.seed(1)
        X_train, X_test, Y_train, Y_test = train_test_split(*shap.datasets.iris(), test_size=0.2, random_state=0)

        # K-nearest neighbors
        iris_knn = sklearn.neighbors.KNeighborsClassifier()
        iris_knn.fit(X_train, Y_train)
        accuracy = 100*np.sum(iris_knn.predict(X_test) == Y_test)/len(Y_test)
        print("Accuracy = {0}%".format(accuracy))

        # Explain a single prediction from the test set
        knn_explainer = KernelExplainer(iris_knn.predict_proba, X_train)
        single_values = knn_explainer.explain_instance(X_test.iloc[0,:])  # TODO test against original SHAP Lib
        _dump('knn X_test iloc_0', knn_explainer, single_values)

        # Explain all the predictions in the test set
        all_values = knn_explainer.explain_instance(X_test)
        _dump('knn X_test', knn_explainer, all_values)

        # SV machine with a linear kernel
        svc_linear = sklearn.svm.SVC(kernel='linear', probability=True)
        svc_linear.fit(X_train, Y_train)
        accuracy = 100*np.sum(svc_linear.predict(X_test) == Y_test)/len(Y_test)
        print("Accuracy = {0}%".format(accuracy))

        # Explain all the predictions in the test set
        svc_explainer = KernelExplainer(svc_linear.predict_proba, X_train)
        svc_values = svc_explainer.explain_instance(X_test)
        _dump('svc X_test', svc_explainer, svc_values)

        # Tabular example on the adult dataset
        np.random.seed(1)
        X, y = shap.datasets.adult()
        X_train, X_valid, y_train, y_valid = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=7)

        adult_knn = sklearn.neighbors.KNeighborsClassifier()
        adult_knn.fit(X_train, y_train)

        # Explain only the second probability column, using the per-column
        # training medians as the single background row.
        second_column_proba = lambda x: adult_knn.predict_proba(x)[:,1]
        median_row = X_train.median().values.reshape((1,X_train.shape[1]))
        adult_explainer = KernelExplainer(second_column_proba, median_row)
        shap_values_single = adult_explainer.explain_instance(X.iloc[0,:], nsamples=1000)
        print('Shap Tabular Example')
        print(adult_explainer.explainer.expected_value)
        print(shap_values_single)
        print("Invoked Shap KernelExplainer")
Example #15
Source File: old_vsone.py    From ibeis with Apache License 2.0 4 votes vote down vote up
def gridsearch_ratio_thresh(matches):
    """Grid-search the ``ratio_thresh`` parameter by ROC-AUC over ``matches``.

    Each match is labelled positive when ``annot1['nid'] == annot2['nid']``.
    For 50 thresholds between 0.6 and 0.7 the summed ``match.fs`` values
    that pass ``match.ratio_test_flags`` are used as the score, and the AUC
    against the labels is computed. The AUC curve over all matches is
    plotted; the search is then repeated on the training part of each of
    10 stratified folds and the mean of the per-fold best thresholds is
    printed. Finally an interactive ``utool.embed()`` session is started.
    """
    import sklearn
    import sklearn.metrics
    # Must be imported explicitly: ``import sklearn`` alone does not
    # guarantee that the ``model_selection`` submodule is loaded.
    import sklearn.model_selection
    import vtool_ibeis as vt
    # Param search for vsone
    import plottool_ibeis as pt
    pt.qt4ensure()

    # No ``random_state`` here: without ``shuffle=True`` it has no effect on
    # the folds, and newer scikit-learn releases reject that combination.
    skf = sklearn.model_selection.StratifiedKFold(n_splits=10)

    y = np.array([m.annot1['nid'] == m.annot2['nid'] for m in matches])

    basis = {'ratio_thresh': np.linspace(.6, .7, 50).tolist()}
    grid = ut.all_dict_combinations(basis)
    xdata = np.array(ut.take_column(grid, 'ratio_thresh'))

    def _ratio_thresh(y_true, match_list):
        # Try and find optional ratio threshold: one AUC per grid config.
        auc_list = []
        for cfgdict in ut.ProgIter(grid, lbl='gridsearch'):
            y_score = [
                match.fs.compress(match.ratio_test_flags(cfgdict)).sum()
                for match in match_list
            ]
            auc = sklearn.metrics.roc_auc_score(y_true, y_score)
            auc_list.append(auc)
        auc_list = np.array(auc_list)
        return auc_list

    auc_list = _ratio_thresh(y, matches)
    pt.plot(xdata, auc_list)
    subx, suby = vt.argsubmaxima(auc_list, xdata)
    best_ratio_thresh = subx[suby.argmax()]

    skf_results = []
    for train_idx, test_idx in skf.split(matches, y):
        # Only the training part of each fold is searched.
        match_list_ = ut.take(matches, train_idx)
        y_train = y.take(train_idx)
        auc_list = _ratio_thresh(y_train, match_list_)
        subx, suby = vt.argsubmaxima(auc_list, xdata, maxima_thresh=.8)
        best_ratio_thresh = subx[suby.argmax()]
        skf_results.append(best_ratio_thresh)
    print('skf_results.append = %r' % (np.mean(skf_results),))
    # NOTE(review): drops into an interactive shell; looks like a leftover
    # debugging aid - confirm before using this non-interactively.
    import utool
    utool.embed()
Example #16
Source File: pyglmnet.py    From pyglmnet with MIT License 4 votes vote down vote up
def _set_cv(cv, estimator=None, X=None, y=None):
        """Set the default CV depending on whether clf
           is classifier/regressor.

        Parameters
        ----------
        cv : int | str | cross-validation object
            An integer number of folds, the name of a cross-validation
            class in the sklearn module in use, or an object accepted by
            ``check_cv``.
        estimator : str | estimator object
            Either ``'classifier'`` / ``'regressor'``, or an object that is
            passed to ``is_classifier``.
        X : array-like | None
            Only forwarded to the legacy ``cross_validation.check_cv``.
        y : array-like
            Targets, used for stratification and to size the splits.

        Returns
        -------
        cv : cross-validation object
            The object returned by ``check_cv``.
        cv_splits : list of (train, test)
            The materialised train/test index pairs.

        Raises
        ------
        ValueError
            If ``cv`` is an unknown name or a fold has no training samples.
        NotImplementedError
            With legacy sklearn, if ``cv`` names a class other than
            ``KFold`` or ``LeaveOneOut``.
        """
        # Detect whether classification or regression
        if estimator in ['classifier', 'regressor']:
            est_is_classifier = estimator == 'classifier'
        else:
            est_is_classifier = is_classifier(estimator)
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy; ``np.integer`` also accepts NumPy integer
        # scalars.
        int_types = (int, np.integer)
        # Setup CV
        if check_version('sklearn', '0.18'):
            from sklearn import model_selection as models
            from sklearn.model_selection import (check_cv,
                                                 StratifiedKFold, KFold)
            if isinstance(cv, int_types):
                XFold = StratifiedKFold if est_is_classifier else KFold
                cv = XFold(n_splits=cv)
            elif isinstance(cv, str):
                if not hasattr(models, cv):
                    raise ValueError('Unknown cross-validation')
                cv = getattr(models, cv)
                cv = cv()
            cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)
        else:
            from sklearn import cross_validation as models
            from sklearn.cross_validation import (check_cv,
                                                  StratifiedKFold, KFold)
            if isinstance(cv, int_types):
                if est_is_classifier:
                    cv = StratifiedKFold(y=y, n_folds=cv)
                else:
                    cv = KFold(n=len(y), n_folds=cv)
            elif isinstance(cv, str):
                if not hasattr(models, cv):
                    raise ValueError('Unknown cross-validation')
                cv = getattr(models, cv)
                if cv.__name__ not in ['KFold', 'LeaveOneOut']:
                    raise NotImplementedError('CV cannot be defined with str'
                                              ' for sklearn < .017.')
                cv = cv(len(y))
            cv = check_cv(cv=cv, X=X, y=y, classifier=est_is_classifier)

        # Extract train and test set to retrieve them at predict time
        if hasattr(cv, 'split'):
            # X is only a placeholder with the same shape as y.
            cv_splits = [(train, test) for train, test in
                         cv.split(X=np.zeros_like(y), y=y)]
        else:
            # XXX support sklearn.cross_validation cv
            cv_splits = [(train, test) for train, test in cv]

        if not np.all([len(train) for train, _ in cv_splits]):
            raise ValueError('Some folds do not have any train epochs.')

        return cv, cv_splits