from __future__ import print_function from __future__ import division from abc import ABCMeta, abstractmethod import warnings import numpy as np import six from sklearn.base import BaseEstimator, clone from sklearn.utils.metaestimators import if_delegate_has_method from sklearn.model_selection import check_cv from sklearn.utils import tosequence, check_X_y from sklearn.externals.joblib import Parallel, delayed try: # TODO: Avoid using a private function from scikit-learn. # _check_fit_params was added at sklearn 0.22.1 from sklearn.utils.validation import _check_fit_params except ImportError: # _index_param_value was removed in sklearn 0.22.1 # See: https://github.com/scikit-learn/scikit-learn/pull/15863 from sklearn.model_selection._validation import _index_param_value warnings.warn( 'Your civisml-extensions installation uses private functions from ' 'scikit-learn < v0.22.1. Please upgrade scikit-learn to v0.22.1 ' 'or beyond. A future version of civisml-extensions will no longer ' 'be compatible with scikit-learn < v0.22.1.', FutureWarning ) def _check_fit_params(X, fit_params, train): return {k: _index_param_value(X, v, train) for k, v in fit_params.items()} def _fit_est(est, X, y, **fit_params): return est.fit(X, y, **fit_params) def _reshape_2d_long(arr): # Reshape output so it's always 2-d and long if arr.ndim < 2: arr = arr.reshape(-1, 1) return arr def _regressor_predict(est, X): """standardized predictions for regression""" return _reshape_2d_long(est.predict(X)) def _regressor_fit_predict(est, Xtrn, ytrn, Xtst, **fit_params): """function for doing fit and predict for regressors""" est.fit(Xtrn, ytrn, **fit_params) return _regressor_predict(est, Xtst) def _classifier_predict(est, X): # Note: this prefers a decision_function to predict_proba # when both are present (e.g. logistic regression), per the # convention in CalibratedClassifierCV. if hasattr(est, "decision_function"): ypred = est.decision_function(X) elif hasattr(est, "predict_proba"): # predict_proba rows always sum to 1, so drop last col ypred = est.predict_proba(X)[:, :-1] elif hasattr(est, "predict"): # you may want to allow predict for pass-through estimators ypred = est.predict(X) else: raise RuntimeError("Estimator without a `decision_function`, " "`predict_proba`, or `predict` method supplied to a" " StackedClassifier.") return _reshape_2d_long(ypred) def _classifier_fit_predict(est, Xtrn, ytrn, Xtst, **fit_params): """function for doing fit and predict for classifiers""" est.fit(Xtrn, ytrn, **fit_params) return _classifier_predict(est, Xtst) @six.add_metaclass(ABCMeta) class BaseStackedModel(BaseEstimator): """Abstract base class for StackedClassifier and StackedRegressor. It is loosely based on sklearn.pipeline.Pipeline. """ def __init__(self, estimator_list, cv=3, n_jobs=1, pre_dispatch='2*n_jobs', verbose=0): self.estimator_list = tosequence(estimator_list) self.cv = cv self.n_jobs = n_jobs self.pre_dispatch = pre_dispatch self.verbose = verbose def get_params(self, deep=True): """Get parameters for this estimator. Parameters ---------- deep: boolean, optional (default: True) If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns ------- out : mapping of string to any Parameter names mapped to their values. """ # If not deep, just get the params of the BaseStackedModel object out = super(BaseStackedModel, self).get_params(deep=False) if not deep: return out # If deep, extract parameters from estimators too est_list = getattr(self, 'estimator_list') # If est_list is an empty list, don't do anything else if len(est_list) > 0: out.update(self.named_base_estimators.copy()) out.update({self.meta_estimator_name: self.meta_estimator}.copy()) for name, estimator in est_list: for key, value in estimator.get_params(deep=True).items(): out['%s__%s' % (name, key)] = value return out def set_params(self, **params): """Set the parameters of this estimator. Valid parameter keys can be listed with ``get_params()``. Returns ------- self """ # Ensure strict ordering of parameter setting: # 1. All estimators if 'estimator_list' in params: setattr(self, 'estimator_list', params.pop('estimator_list')) # 2. Estimator replacement est_names, _ = zip(*getattr(self, 'estimator_list')) for name in list(params.keys()): if '__' not in name and name in est_names: self._replace_est('estimator_list', name, params.pop(name)) # 3. Estimator parameters and other initilisation arguments super(BaseStackedModel, self).set_params(**params) return self def fit(self, X, y, **fit_params): """Fit the model Fit the base estimators on CV folds, then use their prediction on the validation folds to train the meta-estimator. Then re-fit base estimators on full training set. Parameters ---------- X : np.ndarray, list of numbers Training data. y : np.ndarray, list of numbers Training targets. **fit_params : dict of {string, object} Parameters passed to the ``fit`` method of each estimator, where each parameter name is prefixed such that parameter ``p`` for estimator ``s`` has key ``s__p``. Returns ------- self : BaseStackedModel This estimator """ self._validate_estimators() X, y = check_X_y(X, y, multi_output=True) # Fit base estimators on CV training folds, produce features for # meta-estimator from predictions on CV test folds. Xmeta, ymeta, meta_params = self._base_est_fit_predict(X, y, **fit_params) # Fit meta-estimator on test fold predictions of base estimators. self.meta_estimator.fit(Xmeta, ymeta, **meta_params) # Now fit base estimators again, this time on full training set self._base_est_fit(X, y, **fit_params) return self # _replace_est copied nearly verbatim from sklearn.pipeline._BasePipeline # v0.18.1 "_replace_step" method. def _replace_est(self, ests_attr, name, new_val): # assumes `name` is a valid est name new_ests = getattr(self, ests_attr)[:] for i, (est_name, _) in enumerate(new_ests): if est_name == name: new_ests[i] = (name, new_val) break setattr(self, ests_attr, new_ests) # _validate_names copied nearly verbatim from # sklearn.pipeline._BasePipeline v0.18.1 def _validate_names(self, names): if len(set(names)) != len(names): raise ValueError('Names provided are not unique: ' '{0!r}'.format(list(names))) invalid_names = set(names).intersection(self.get_params(deep=False)) if invalid_names: raise ValueError('Estimator names conflict with constructor ' 'arguments: {0!r}'.format(sorted(invalid_names))) invalid_names = [name for name in names if '__' in name] if invalid_names: raise ValueError('Estimator names must not contain __: got ' '{0!r}'.format(invalid_names)) @abstractmethod def _validate_estimators(self): pass def _extract_fit_params(self, **fit_params): """Extract fit parameters for each estimator and store in nested dict """ fit_params_ests = dict((name, {}) for name, est in self.estimator_list) for pname, pval in fit_params.items(): est, param = pname.split('__', 1) fit_params_ests[est][param] = pval return fit_params_ests def _base_est_fit(self, X, y, **fit_params): """Fit the base estimators on X and y. """ fit_params_ests = self._extract_fit_params(**fit_params) _jobs = [] for name, est in self.estimator_list[:-1]: _jobs.append(delayed(_fit_est)( clone(est), X, y, **fit_params_ests[name])) _out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=self.pre_dispatch)(_jobs) for name, _ in self.estimator_list[:-1]: self._replace_est('estimator_list', name, _out.pop(0)) @abstractmethod def _check_cv(self, y): pass def _base_est_fit_predict(self, X, y, **fit_params): """Fit the base estimators on CV training folds, and return their out-of-sample predictions on the test folds as features for the meta-estimator. Also return the fit_params for the meta-estimator. """ y = y.squeeze() # Construct CV iterator cv = self._check_cv(y=y) # Extract CV indices since we need them twice, and un-seeded CV # generators with `shuffle=True` split differently each time. train_inds = [] test_inds = [] for train, test in cv.split(X, y): train_inds.append(train) test_inds.append(test) fit_params_ests = self._extract_fit_params(**fit_params) _fit_predict = self._get_fit_predict_function() _jobs = [] # Loop over CV folds to get out-of-sample predictions, which become the # features for the meta-estimator. for train, test in zip(train_inds, test_inds): for name, est in self.estimator_list[:-1]: # adapted from sklearn.model_selection._fit_and_predict # Adjust length of sample weights fit_params_est_adjusted = _check_fit_params( X, fit_params_ests[name], train ) # Fit estimator on training set and score out-of-sample _jobs.append(delayed(_fit_predict)( clone(est), X[train], y[train], X[test], **fit_params_est_adjusted)) _out = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=self.pre_dispatch)(_jobs) # Extract the results from joblib Xmeta, ymeta = None, None for test in test_inds: ybase = np.empty((y[test].shape[0], 0)) for name, est in self.estimator_list[:-1]: # Build design matrix out of out-of-sample predictions ybase = np.hstack((ybase, _out.pop(0))) # Append the test outputs to what will eventually be the features # for the meta-estimator. if Xmeta is not None: ymeta = np.concatenate((ymeta, y[test])) Xmeta = np.vstack((Xmeta, ybase)) else: Xmeta = ybase ymeta = y[test] return Xmeta, ymeta, fit_params_ests[self.meta_estimator_name] def _base_est_predict(self, X): """Return base estimator predictions on X as meta-features. """ Xmeta = np.empty((X.shape[0], 0)) for name, est in self.estimator_list[:-1]: Xmeta = np.hstack((Xmeta, self._est_predict(est, X))) return Xmeta @abstractmethod def _est_predict(self, est, X): """This function ensures that the relevant prediction function is consistently called for a given base estimator. """ pass @abstractmethod def _get_fit_predict_function(self): """return the function to be used for the fit and out-of-sample predictions""" pass @if_delegate_has_method(delegate='meta_estimator') def predict(self, X): """Run predictions through base estimators, then predict with the meta-estimator on the output of the base estimators. Parameters ---------- X : np.ndarray, list of numbers Data to predict on. Returns ------- y_pred : array-like """ Xmeta = self._base_est_predict(X) return self.meta_estimator.predict(Xmeta) @if_delegate_has_method(delegate='meta_estimator') def score(self, X, y, **params): """Run predictions through base estimators, then score with the meta-estimator on the output of the base estimators. Parameters ---------- X : np.ndarray, list of numbers Data to predict on. y : np.ndarray, list of numbers Targets for scoring. Must fulfill label requirements for meta-estimator. params: dict of {string, object} Parameters passed to the ``score`` method of the meta-estimator. Returns ------- y_score : array-like, shape = [n_samples, n_classes] """ Xmeta = self._base_est_predict(X) return self.meta_estimator.score(Xmeta, y, **params) @property def named_base_estimators(self): return dict(self.estimator_list[:-1]) @property def meta_estimator(self): return self.estimator_list[-1][-1] @property def meta_estimator_name(self): return self.estimator_list[-1][0] class StackedClassifier(BaseStackedModel): """Builds a stacked classification model from a list of named estimators, using the final estimator in the list as the meta-estimator. This class takes a list of named estimators, and it fits all but the last of these estimators (called the "base estimators") on part of the training data passed to ``fit``. The remaining training data is used to create out-of-sample predictions from these base estimators. The final named estimator, called the meta-estimator, is trained on these out-of-sample predictions. This allows the meta-estimator to optimally aggregate the predictions of several base estimators, hopefully improving on their individual predictive powers. It is loosely based on sklearn.pipeline.Pipeline. Parameters ---------- estimator_list: list of (str, estimator) tuples This contains tuples holding the name and estimator of the desired base and meta-estimators. The meta-estimator MUST be the final item in the list. The order of the base estimators is irrelevant, as long as they occur before the meta-estimator. cv : int, cross-validation generator, or iterable, optional (default: 3) Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross-validation, - integer, to specify the number of folds. - An object to be used as a cross-validation generator. - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, :class:`sklearn.model_selection.StratifiedKFold` is used. Refer :ref:`User Guide <cross_validation>` for the various cross-validation strategies that can be used here. n_jobs : int, (default: 1) Number of jobs to run in parallel. pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use this for lightweight and fast-running jobs, to avoid delays due to on-demand spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' verbose : integer Controls the verbosity: the higher, the more messages. A value of 10 gives a moderate level of logging. 50 or more is the most amount of logging. Attributes ---------- named_base_estimators : dict Read-only attribute to access any base estimator by user-given name. Keys are estimator names and values are estimators. meta_estimator: estimator The meta-estimator, provided as a separately accessible property. Example ------- >>> from sklearn.linear_model import LogisticRegression >>> from sklearn.ensemble import RandomForestClassifier >>> from civismlext.stacking import StackedClassifier >>> # Note that the final estimator 'metalr' is the meta-estimator >>> estlist = [('rf', RandomForestClassifier()), >>> ('lr', LogisticRegression()), >>> ('metalr', LogisticRegression())] >>> mysm = StackedClassifier(estlist) >>> # Set some parameters, if you didn't set them at instantiation >>> mysm.set_params(rf__random_state=7, lr__random_state=8, >>> metalr__random_state=9, metalr__C=10**7) >>> # Fit >>> mysm.fit(Xtrain, ytrain) >>> # Predict! >>> ypred = mysm.predict_proba(Xtest) """ def __init__(self, estimator_list, cv=3, n_jobs=1, pre_dispatch='2*n_jobs', verbose=0): super(StackedClassifier, self).__init__( estimator_list, cv, n_jobs, pre_dispatch, verbose) def _validate_estimators(self): """Validates that the names and methods of the estimators match expectiations. Overrides `validate_estimators` in `BaseStackedModel`. """ names, estimators = zip(*self.estimator_list) if len(self.estimator_list) < 2: raise RuntimeError("You must have two or more estimators to fit a " "StackedClassifier!") # validate names self._validate_names(names) # validate meta-estimator if not hasattr(self.meta_estimator, "fit"): raise TypeError("Meta-estimator '%s' does not have fit method." % self.meta_estimator_name) _check_classifier_methods(self.meta_estimator, self.meta_estimator_name) # validate base estimators for name, est in self.estimator_list[:-1]: if not hasattr(est, "fit"): raise TypeError("Estimator '%s' does not have fit method." % name) _check_classifier_methods(est, name) def _check_cv(self, y): """Overrides base class _check_cv """ # Squeezed target should be 1-dimensional if len(y.shape) != 1: raise NotImplementedError("StackedClassifier does not currently " "support multi-column classification " "problems. If your target is a one-hot " "encoded multi-class problem, please " "recast it to a single column.") return check_cv(self.cv, y=y, classifier=True) def _est_predict(self, est, X): """This function ensures that the relevant prediction function is consistently called for a given base estimator. """ return _classifier_predict(est, X) def _get_fit_predict_function(self): """return the function to be used for the fit and out-of-sample predictions""" return _classifier_fit_predict @if_delegate_has_method(delegate='meta_estimator') def predict_proba(self, X): """Run predictions through base estimators, then predict class probabilities with the meta-estimator on the output of the base estimators. Parameters ---------- X : np.ndarray, list of numbers Data to predict on. Returns ------- y_proba : array-like, shape = [n_samples, n_classes] """ Xmeta = self._base_est_predict(X) return self.meta_estimator.predict_proba(Xmeta) @if_delegate_has_method(delegate='meta_estimator') def decision_function(self, X): """Run predictions through base estimators, then pass the output of the base estimators to the meta-estimator's decision_function. Parameters ---------- X : np.ndarray, list of numbers Data to predict on. Returns ------- y_score : array-like, shape = [n_samples, n_classes] """ Xmeta = self._base_est_predict(X) return self.meta_estimator.decision_function(Xmeta) @if_delegate_has_method(delegate='meta_estimator') def predict_log_proba(self, X): """Run predictions through base estimators, then predict class log probabilities with the meta-estimator on the output of the base estimators. Parameters ---------- X : np.ndarray, list of numbers Data to predict on. Returns ------- y_score : array-like, shape = [n_samples, n_classes] """ Xmeta = self._base_est_predict(X) return self.meta_estimator.predict_log_proba(Xmeta) @property def classes_(self): return self.meta_estimator.classes_ def _check_classifier_methods(clf, name): """Checks whether clf has either a `predict_proba`, `decision_function`, or `predict` method. Raises if none of these methods are present. Parameters ---------- clf: estimator Estimator to check. Expected to be a classifier. name: str Name of estimator, to be output in case estimator is missing expected classification functionality. """ if not (hasattr(clf, "predict_proba") or hasattr(clf, "predict") or hasattr(clf, "decision_function")): raise RuntimeError("Estimator '%s' does not have `predict_prob`, " "`decision_function`, or `predict` method." % name) class StackedRegressor(BaseStackedModel): """Builds a stacked regression model from a list of named estimators, using the final estimator in the list as the meta-estimator. This class takes a list of named estimators, and it fits all but the last of these estimators (called the "base estimators") on part of the training data passed to ``fit``. The remaining training data is used to create out-of-sample predictions from these base estimators. The final named estimator, called the meta-estimator, is trained on these out-of-sample predictions. This allows the meta-estimator to optimally aggregate the predictions of several base estimators, hopefully improving on their individual predictive powers. It is loosely based on sklearn.pipeline.Pipeline. Parameters ---------- estimator_list: list of (str, estimator) tuples This contains tuples holding the name and estimator of the desired base and meta-estimators. The meta-estimator MUST be the final item in the list. The order of the base estimators is irrelevant, as long as they occur before the meta-estimator. cv : int, cross-validation generator, or iterable, optional (default: 3) Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross-validation, - integer, to specify the number of folds. - An object to be used as a cross-validation generator. - An iterable yielding train/test splits. :class:`sklearn.model_selection.KFold` is used by default for regression targets. Refer :ref:`User Guide <cross_validation>` for the various cross-validation strategies that can be used here. n_jobs : int, (default: 1) Number of jobs to run in parallel. pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use this for lightweight and fast-running jobs, to avoid delays due to on-demand spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' verbose : integer Controls the verbosity: the higher, the more messages. A value of 10 gives a moderate level of logging. 50 or more is the most amount of logging. Attributes ---------- named_base_estimators : dict Read-only attribute to access any base estimator by user-given name. Keys are estimator names and values are estimators. meta_estimator: estimator The meta-estimator, provided as a separately accessible property. Example ------- >>> from sklearn.linear_model import LinearRegression >>> from sklearn.ensemble import RandomForestRegressor >>> from civismlext.stacking import StackedRegressor >>> from civismlext.nonnegative import NonNegativeLinearRegression >>> # Note that the final estimator 'meta_nnr' is the meta-estimator >>> estlist = [('rf', RandomForestRegressor()), >>> ('lr', LinearRegression()), >>> ('meta_nnr', NonNegativeLinearRegression())] >>> mysm = StackedRegressor(estlist) >>> # Set some parameters, if you didn't set them at instantiation >>> mysm.set_params(rf__random_state=7) >>> # Fit >>> mysm.fit(Xtrain, ytrain) >>> # Predict! >>> ypred = mysm.predict(Xtest) """ def __init__(self, estimator_list, cv=3, n_jobs=1, pre_dispatch='2*n_jobs', verbose=0): super(StackedRegressor, self).__init__( estimator_list, cv, n_jobs, pre_dispatch, verbose) def _validate_estimators(self): """Validates that the names and methods of the estimators match expectations. Overrides `validate_estimators` in `BaseStackedModel`. """ names, estimators = zip(*self.estimator_list) if len(self.estimator_list) < 2: raise RuntimeError("You must have two or more estimators to fit a " "StackedRegressor!") # validate names self._validate_names(names) # validate meta-estimator if not hasattr(self.meta_estimator, "fit"): raise TypeError("Meta-estimator '%s' does not have fit method." % self.meta_estimator_name) # validate base estimators for name, est in self.estimator_list[:-1]: if not hasattr(est, "fit"): raise TypeError("Estimator '%s' does not have fit method." % name) def _check_cv(self, y): """Overrides base class _check_cv """ return check_cv(self.cv, y=y, classifier=False) def _est_predict(self, est, X): """This function ensures that the relevant prediction function is consistently called for a given base estimator. """ return _regressor_predict(est, X) def _get_fit_predict_function(self): """return the function to be used for the fit and out-of-sample predictions""" return _regressor_fit_predict