""" Gradient Boosting decision trees for classification and regression. """ from abc import ABC, abstractmethod import numpy as np from numba import njit, prange from time import time from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin from sklearn.utils import check_X_y, check_random_state, check_array from sklearn.utils.validation import check_is_fitted from sklearn.utils.multiclass import check_classification_targets from sklearn.metrics import check_scoring from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from pygbm.binning import BinMapper from pygbm.grower import TreeGrower from pygbm.loss import _LOSSES class BaseGradientBoostingMachine(BaseEstimator, ABC): """Base class for gradient boosting estimators.""" @abstractmethod def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, max_depth, min_samples_leaf, l2_regularization, max_bins, scoring, validation_split, n_iter_no_change, tol, verbose, random_state): self.loss = loss self.learning_rate = learning_rate self.max_iter = max_iter self.max_leaf_nodes = max_leaf_nodes self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf self.l2_regularization = l2_regularization self.max_bins = max_bins self.n_iter_no_change = n_iter_no_change self.validation_split = validation_split self.scoring = scoring self.tol = tol self.verbose = verbose self.random_state = random_state def _validate_parameters(self, X): """Validate parameters passed to __init__. The parameters that are directly passed to the grower are checked in TreeGrower.""" if self.loss not in self._VALID_LOSSES: raise ValueError( "Loss {} is not supported for {}. Accepted losses" "are {}.".format(self.loss, self.__class__.__name__, ', '.join(self._VALID_LOSSES))) if self.learning_rate <= 0: raise ValueError(f'learning_rate={self.learning_rate} must ' f'be strictly positive') if self.max_iter < 1: raise ValueError(f'max_iter={self.max_iter} must ' f'not be smaller than 1.') if self.n_iter_no_change is not None and self.n_iter_no_change < 0: raise ValueError(f'n_iter_no_change={self.n_iter_no_change} ' f'must be positive.') if self.validation_split is not None and self.validation_split <= 0: raise ValueError(f'validation_split={self.validation_split} ' f'must be strictly positive, or None.') if self.tol is not None and self.tol < 0: raise ValueError(f'tol={self.tol} ' f'must not be smaller than 0.') if X.dtype == np.uint8: # pre-binned data max_bin_index = X.max() if self.max_bins < max_bin_index + 1: raise ValueError( f'max_bins is set to {self.max_bins} but the data is ' f'pre-binned with {max_bin_index + 1} bins.' ) def fit(self, X, y): """Fit the gradient boosting model. Parameters ---------- X : array-like, shape=(n_samples, n_features) The input samples. If ``X.dtype == np.uint8``, the data is assumed to be pre-binned and the prediction methods (``predict``, ``predict_proba``) will only accept pre-binned data as well. y : array-like, shape=(n_samples,) Target values. Returns ------- self : object """ fit_start_time = time() acc_find_split_time = 0. # time spent finding the best splits acc_apply_split_time = 0. # time spent splitting nodes # time spent predicting X for gradient and hessians update acc_prediction_time = 0. # TODO: add support for mixed-typed (numerical + categorical) data # TODO: add support for missing data X, y = check_X_y(X, y, dtype=[np.float32, np.float64, np.uint8]) y = self._encode_y(y) if X.shape[0] == 1 or X.shape[1] == 1: raise ValueError( 'Passing only one sample or one feature is not supported yet. ' 'See numba issue #3569.' ) rng = check_random_state(self.random_state) self._validate_parameters(X) self.n_features_ = X.shape[1] # used for validation in predict() if X.dtype == np.uint8: # data is pre-binned if self.verbose: print("X is pre-binned.") X_binned = X self.bin_mapper_ = None numerical_thresholds = None n_bins_per_feature = X.max(axis=0).astype(np.uint32) else: if self.verbose: print(f"Binning {X.nbytes / 1e9:.3f} GB of data: ", end="", flush=True) tic = time() self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng) X_binned = self.bin_mapper_.fit_transform(X) numerical_thresholds = self.bin_mapper_.numerical_thresholds_ n_bins_per_feature = self.bin_mapper_.n_bins_per_feature_ toc = time() if self.verbose: duration = toc - tic throughput = X.nbytes / duration print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)") self.loss_ = self._get_loss() do_early_stopping = (self.n_iter_no_change is not None and self.n_iter_no_change > 0) if do_early_stopping and self.validation_split is not None: # stratify for classification stratify = y if hasattr(self.loss_, 'predict_proba') else None X_binned_train, X_binned_val, y_train, y_val = train_test_split( X_binned, y, test_size=self.validation_split, stratify=stratify, random_state=rng) if X_binned_train.size == 0 or X_binned_val.size == 0: raise ValueError( f'Not enough data (n_samples={X_binned.shape[0]}) to ' f'perform early stopping with validation_split=' f'{self.validation_split}. Use more training data or ' f'adjust validation_split.' ) # Predicting is faster of C-contiguous arrays, training is faster # on Fortran arrays. X_binned_val = np.ascontiguousarray(X_binned_val) X_binned_train = np.asfortranarray(X_binned_train) else: X_binned_train, y_train = X_binned, y X_binned_val, y_val = None, None # Subsample the training set for score-based monitoring. if do_early_stopping: subsample_size = 10000 n_samples_train = X_binned_train.shape[0] if n_samples_train > subsample_size: indices = rng.choice(X_binned_train.shape[0], subsample_size) X_binned_small_train = X_binned_train[indices] y_small_train = y_train[indices] else: X_binned_small_train = X_binned_train y_small_train = y_train # Predicting is faster of C-contiguous arrays. X_binned_small_train = np.ascontiguousarray(X_binned_small_train) if self.verbose: print("Fitting gradient boosted rounds:") n_samples = X_binned_train.shape[0] self.baseline_prediction_ = self.loss_.get_baseline_prediction( y_train, self.n_trees_per_iteration_) # raw_predictions are the accumulated values predicted by the trees # for the training data. raw_predictions = np.zeros( shape=(n_samples, self.n_trees_per_iteration_), dtype=self.baseline_prediction_.dtype ) raw_predictions += self.baseline_prediction_ # gradients and hessians are 1D arrays of size # n_samples * n_trees_per_iteration gradients, hessians = self.loss_.init_gradients_and_hessians( n_samples=n_samples, prediction_dim=self.n_trees_per_iteration_ ) # predictors_ is a matrix of TreePredictor objects with shape # (n_iter_, n_trees_per_iteration) self.predictors_ = predictors = [] # scorer_ is a callable with signature (est, X, y) and calls # est.predict() or est.predict_proba() depending on its nature. self.scorer_ = check_scoring(self, self.scoring) self.train_scores_ = [] self.validation_scores_ = [] if do_early_stopping: # Add predictions of the initial model (before the first tree) self.train_scores_.append( self._get_scores(X_binned_train, y_train)) if self.validation_split is not None: self.validation_scores_.append( self._get_scores(X_binned_val, y_val)) for iteration in range(self.max_iter): if self.verbose: iteration_start_time = time() print(f"[{iteration + 1}/{self.max_iter}] ", end='', flush=True) # Update gradients and hessians, inplace self.loss_.update_gradients_and_hessians(gradients, hessians, y_train, raw_predictions) predictors.append([]) # Build `n_trees_per_iteration` trees. for k, (gradients_at_k, hessians_at_k) in enumerate(zip( np.array_split(gradients, self.n_trees_per_iteration_), np.array_split(hessians, self.n_trees_per_iteration_))): # the xxxx_at_k arrays are **views** on the original arrays. # Note that for binary classif and regressions, # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the # whole array. grower = TreeGrower( X_binned_train, gradients_at_k, hessians_at_k, max_bins=self.max_bins, n_bins_per_feature=n_bins_per_feature, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, l2_regularization=self.l2_regularization, shrinkage=self.learning_rate) grower.grow() acc_apply_split_time += grower.total_apply_split_time acc_find_split_time += grower.total_find_split_time predictor = grower.make_predictor(numerical_thresholds) predictors[-1].append(predictor) tic_pred = time() # prepare leaves_data so that _update_raw_predictions can be # @njitted leaves_data = [(l.value, l.sample_indices) for l in grower.finalized_leaves] _update_raw_predictions(leaves_data, raw_predictions[:, k]) toc_pred = time() acc_prediction_time += toc_pred - tic_pred should_early_stop = False if do_early_stopping: should_early_stop = self._check_early_stopping( X_binned_small_train, y_small_train, X_binned_val, y_val) if self.verbose: self._print_iteration_stats(iteration_start_time, do_early_stopping) if should_early_stop: break if self.verbose: duration = time() - fit_start_time n_total_leaves = sum( predictor.get_n_leaf_nodes() for predictors_at_ith_iteration in self.predictors_ for predictor in predictors_at_ith_iteration) n_predictors = sum( len(predictors_at_ith_iteration) for predictors_at_ith_iteration in self.predictors_) print(f"Fit {n_predictors} trees in {duration:.3f} s, " f"({n_total_leaves} total leaves)") print(f"{'Time spent finding best splits:':<32} " f"{acc_find_split_time:.3f}s") print(f"{'Time spent applying splits:':<32} " f"{acc_apply_split_time:.3f}s") print(f"{'Time spent predicting:':<32} " f"{acc_prediction_time:.3f}s") self.train_scores_ = np.asarray(self.train_scores_) self.validation_scores_ = np.asarray(self.validation_scores_) return self def _check_early_stopping(self, X_binned_train, y_train, X_binned_val, y_val): """Check if fitting should be early-stopped. Scores are computed on validation data or on training data. """ self.train_scores_.append( self._get_scores(X_binned_train, y_train)) if self.validation_split is not None: self.validation_scores_.append( self._get_scores(X_binned_val, y_val)) return self._should_stop(self.validation_scores_) return self._should_stop(self.train_scores_) def _should_stop(self, scores): """ Return True (do early stopping) if the last n scores aren't better than the (n-1)th-to-last score, up to some tolerance. """ reference_position = self.n_iter_no_change + 1 if len(scores) < reference_position: return False # A higher score is always better. Higher tol means that it will be # harder for subsequent iteration to be considered an improvement upon # the reference score, and therefore it is more likely to early stop # because of the lack of significant improvement. tol = 0 if self.tol is None else self.tol reference_score = scores[-reference_position] + tol recent_scores = scores[-reference_position + 1:] recent_improvements = [score > reference_score for score in recent_scores] return not any(recent_improvements) def _get_scores(self, X, y): """Compute scores on data X with target y. Scores are either computed with a scorer if scoring parameter is not None, else with the loss. As higher is always better, we return -loss_value. """ if self.scoring is not None: return self.scorer_(self, X, y) # Else, use the negative loss as score. raw_predictions = self._raw_predict(X) return -self.loss_(y, raw_predictions) def _print_iteration_stats(self, iteration_start_time, do_early_stopping): """Print info about the current fitting iteration.""" log_msg = '' predictors_of_ith_iteration = [ predictors_list for predictors_list in self.predictors_[-1] if predictors_list ] n_trees = len(predictors_of_ith_iteration) max_depth = max(predictor.get_max_depth() for predictor in predictors_of_ith_iteration) n_leaves = sum(predictor.get_n_leaf_nodes() for predictor in predictors_of_ith_iteration) if n_trees == 1: log_msg += (f"{n_trees} tree, {n_leaves} leaves, ") else: log_msg += (f"{n_trees} trees, {n_leaves} leaves ") log_msg += (f"({int(n_leaves / n_trees)} on avg), ") log_msg += f"max depth = {max_depth}, " if do_early_stopping: log_msg += f"{self.scoring} train: {self.train_scores_[-1]:.5f}, " if self.validation_split is not None: log_msg += (f"{self.scoring} val: " f"{self.validation_scores_[-1]:.5f}, ") iteration_time = time() - iteration_start_time log_msg += f"in {iteration_time:0.3f}s" print(log_msg) def _raw_predict(self, X): """Return the sum of the leaves values over all predictors. Parameters ---------- X : array-like, shape=(n_samples, n_features) The input samples. If ``X.dtype == np.uint8``, the data is assumed to be pre-binned and the estimator must have been fitted with pre-binned data. Returns ------- raw_predictions : array, shape (n_samples * n_trees_per_iteration,) The raw predicted values. """ X = check_array(X) check_is_fitted(self, 'predictors_') if X.shape[1] != self.n_features_: raise ValueError( f'X has {X.shape[1]} features but this estimator was ' f'trained with {self.n_features_} features.' ) is_binned = X.dtype == np.uint8 if not is_binned and self.bin_mapper_ is None: raise ValueError( 'This estimator was fitted with pre-binned data and ' 'can only predict pre-binned data as well. If your data *is* ' 'already pre-binnned, convert it to uint8 using e.g. ' 'X.astype(np.uint8). If the data passed to fit() was *not* ' 'pre-binned, convert it to float32 and call fit() again.' ) n_samples = X.shape[0] raw_predictions = np.zeros( shape=(n_samples, self.n_trees_per_iteration_), dtype=self.baseline_prediction_.dtype ) raw_predictions += self.baseline_prediction_ # Should we parallelize this? for predictors_of_ith_iteration in self.predictors_: for k, predictor in enumerate(predictors_of_ith_iteration): predict = (predictor.predict_binned if is_binned else predictor.predict) raw_predictions[:, k] += predict(X) return raw_predictions @abstractmethod def _get_loss(self): pass @abstractmethod def _encode_y(self, y=None): pass @property def n_iter_(self): check_is_fitted(self, 'predictors_') return len(self.predictors_) class GradientBoostingRegressor(BaseGradientBoostingMachine, RegressorMixin): """Scikit-learn compatible Gradient Boosting Tree for regression. Parameters ---------- loss : {'least_squares'}, optional(default='least_squares') The loss function to use in the boosting process. learning_rate : float, optional(default=0.1) The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no shrinkage. max_iter : int, optional(default=100) The maximum number of iterations of the boosting process, i.e. the maximum number of trees. max_leaf_nodes : int or None, optional(default=None) The maximum number of leaves for each tree. If None, there is no maximum limit. max_depth : int or None, optional(default=None) The maximum depth of each tree. The depth of a tree is the number of nodes to go from the root to the deepest leaf. min_samples_leaf : int, optional(default=20) The minimum number of samples per leaf. l2_regularization : float, optional(default=0) The L2 regularization parameter. Use 0 for no regularization. max_bins : int, optional(default=256) The maximum number of bins to use. Before training, each feature of the input array ``X`` is binned into at most ``max_bins`` bins, which allows for a much faster training stage. Features with a small number of unique values may use less than ``max_bins`` bins. Must be no larger than 256. scoring : str or callable or None, \ optional (default=None) Scoring parameter to use for early stopping (see sklearn.metrics for available options). If None, early stopping is check w.r.t the loss value. validation_split : int or float or None, optional(default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on the training data. n_iter_no_change : int or None, optional (default=5) Used to determine when to "early stop". The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1``th-to-last one, up to some tolerance. If None or 0, no early-stopping is done. tol : float or None optional (default=1e-7) The absolute tolerance to use when comparing scores. The higher the tolerance, the more likely we are to early stop: higher tolerance means that it will be harder for subsequent iterations to be considered an improvement upon the reference score. verbose: int, optional (default=0) The verbosity level. If not zero, print some information about the fitting process. random_state : int, np.random.RandomStateInstance or None, \ optional (default=None) Pseudo-random number generator to control the subsampling in the binning process, and the train/validation data split if early stopping is enabled. See `scikit-learn glossary <https://scikit-learn.org/stable/glossary.html#term-random-state>`_. Examples -------- >>> from sklearn.datasets import load_boston >>> from pygbm import GradientBoostingRegressor >>> X, y = load_boston(return_X_y=True) >>> est = GradientBoostingRegressor().fit(X, y) >>> est.score(X, y) 0.92... """ _VALID_LOSSES = ('least_squares',) def __init__(self, loss='least_squares', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=256, scoring=None, validation_split=0.1, n_iter_no_change=5, tol=1e-7, verbose=0, random_state=None): super(GradientBoostingRegressor, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, scoring=scoring, validation_split=validation_split, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) def predict(self, X): """Predict values for X. Parameters ---------- X : array-like, shape=(n_samples, n_features) The input samples. If ``X.dtype == np.uint8``, the data is assumed to be pre-binned and the estimator must have been fitted with pre-binned data. Returns ------- y : array, shape (n_samples,) The predicted values. """ # Return raw predictions after converting shape # (n_samples, 1) to (n_samples,) return self._raw_predict(X).ravel() def _encode_y(self, y): # Just convert y to float32 self.n_trees_per_iteration_ = 1 y = y.astype(np.float32, copy=False) return y def _get_loss(self): return _LOSSES[self.loss]() class GradientBoostingClassifier(BaseGradientBoostingMachine, ClassifierMixin): """Scikit-learn compatible Gradient Boosting Tree for classification. Parameters ---------- loss : {'auto', 'binary_crossentropy', 'categorical_crossentropy'}, \ optional(default='auto') The loss function to use in the boosting process. 'binary_crossentropy' (also known as logistic loss) is used for binary classification and generalizes to 'categorical_crossentropy' for multiclass classification. 'auto' will automatically choose either loss depending on the nature of the problem. learning_rate : float, optional(default=1) The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no shrinkage. max_iter : int, optional(default=100) The maximum number of iterations of the boosting process, i.e. the maximum number of trees for binary classification. For multiclass classification, `n_classes` trees per iteration are built. max_leaf_nodes : int or None, optional(default=None) The maximum number of leaves for each tree. If None, there is no maximum limit. max_depth : int or None, optional(default=None) The maximum depth of each tree. The depth of a tree is the number of nodes to go from the root to the deepest leaf. min_samples_leaf : int, optional(default=20) The minimum number of samples per leaf. l2_regularization : float, optional(default=0) The L2 regularization parameter. Use 0 for no regularization. max_bins : int, optional(default=256) The maximum number of bins to use. Before training, each feature of the input array ``X`` is binned into at most ``max_bins`` bins, which allows for a much faster training stage. Features with a small number of unique values may use less than ``max_bins`` bins. Must be no larger than 256. scoring : str or callable or None, optional (default=None) Scoring parameter to use for early stopping (see sklearn.metrics for available options). If None, early stopping is check w.r.t the loss value. validation_split : int or float or None, optional(default=0.1) Proportion (or absolute size) of training data to set aside as validation data for early stopping. If None, early stopping is done on the training data. n_iter_no_change : int or None, optional (default=5) Used to determine when to "early stop". The fitting process is stopped when none of the last ``n_iter_no_change`` scores are better than the ``n_iter_no_change - 1``th-to-last one, up to some tolerance. If None or 0, no early-stopping is done. tol : float or None optional (default=1e-7) The absolute tolerance to use when comparing scores. The higher the tolerance, the more likely we are to early stop: higher tolerance means that it will be harder for subsequent iterations to be considered an improvement upon the reference score. verbose: int, optional(default=0) The verbosity level. If not zero, print some information about the fitting process. random_state : int, np.random.RandomStateInstance or None, \ optional(default=None) Pseudo-random number generator to control the subsampling in the binning process, and the train/validation data split if early stopping is enabled. See `scikit-learn glossary <https://scikit-learn.org/stable/glossary.html#term-random-state>`_. Examples -------- >>> from sklearn.datasets import load_iris >>> from pygbm import GradientBoostingClassifier >>> X, y = load_iris(return_X_y=True) >>> clf = GradientBoostingClassifier().fit(X, y) >>> clf.score(X, y) 0.97... """ _VALID_LOSSES = ('binary_crossentropy', 'categorical_crossentropy', 'auto') def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=256, scoring=None, validation_split=0.1, n_iter_no_change=5, tol=1e-7, verbose=0, random_state=None): super(GradientBoostingClassifier, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, scoring=scoring, validation_split=validation_split, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) def predict(self, X): """Predict classes for X. Parameters ---------- X : array-like, shape=(n_samples, n_features) The input samples. If ``X.dtype == np.uint8``, the data is assumed to be pre-binned and the estimator must have been fitted with pre-binned data. Returns ------- y : array, shape (n_samples,) The predicted classes. """ # This could be done in parallel encoded_classes = np.argmax(self.predict_proba(X), axis=1) return self.classes_[encoded_classes] def predict_proba(self, X): """Predict class probabilities for X. Parameters ---------- X : array-like, shape=(n_samples, n_features) The input samples. If ``X.dtype == np.uint8``, the data is assumed to be pre-binned and the estimator must have been fitted with pre-binned data. Returns ------- p : array, shape (n_samples, n_classes) The class probabilities of the input samples. """ raw_predictions = self._raw_predict(X) return self.loss_.predict_proba(raw_predictions) def _encode_y(self, y): # encode classes into 0 ... n_classes - 1 and sets attributes classes_ # and n_trees_per_iteration_ check_classification_targets(y) label_encoder = LabelEncoder() encoded_y = label_encoder.fit_transform(y) self.classes_ = label_encoder.classes_ n_classes = self.classes_.shape[0] # only 1 tree for binary classification. For multiclass classification, # we build 1 tree per class. self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes encoded_y = encoded_y.astype(np.float32, copy=False) return encoded_y def _get_loss(self): if self.loss == 'auto': if self.n_trees_per_iteration_ == 1: return _LOSSES['binary_crossentropy']() else: return _LOSSES['categorical_crossentropy']() return _LOSSES[self.loss]() @njit(parallel=True) def _update_raw_predictions(leaves_data, raw_predictions): """Update raw_predictions by reading the predictions of the ith tree directly form the leaves. Can only be used for predicting the training data. raw_predictions contains the sum of the tree values from iteration 0 to i - 1. This adds the predictions of the ith tree to raw_predictions. Parameters ---------- leaves_data: list of tuples (leaf.value, leaf.sample_indices) The leaves data used to update raw_predictions. raw_predictions : array-like, shape=(n_samples,) The raw predictions for the training data. """ for leaf_idx in prange(len(leaves_data)): leaf_value, sample_indices = leaves_data[leaf_idx] for sample_idx in sample_indices: raw_predictions[sample_idx] += leaf_value