""" Implementation of LambdaMART. Interface is very similar to sklearn's tree ensembles. In fact, the majority of this code is just a port of GradientBoostingRegressor customized for LTR. The most notable difference is that fit() now takes another `qids` parameter containing query ids for all the samples. """ # Derivative of scikit-learn # https://github.com/scikit-learn/scikit-learn/ # sklearn/ensemble/gradient_boosting.py # License: BSD 3 clause import numbers import numpy as np import scipy import sklearn.ensemble import sklearn.externals import sklearn.utils import sklearn.tree import time from . import AdditiveModel from .. import metrics from ..util.group import check_qids, get_groups from ..util.sort import get_sorted_y_positions from sklearn.externals.six.moves import range class LambdaMART(AdditiveModel): """Tree-based learning to rank model. Parameters ---------- metric : object The metric to be maximized by the model. learning_rate : float, optional (default=0.1) Shrinks the contribution of each tree by `learning_rate`. There is a trade-off between learning_rate and n_estimators. n_estimators : int, optional (default=100) The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance. max_depth : int, optional (default=3) Maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. Tune this parameter for best performance; the best value depends on the interaction of the input variables. Ignored if ``max_leaf_nodes`` is not None. min_samples_split : int, optional (default=2) The minimum number of samples required to split an internal node. min_samples_leaf : int, optional (default=1) The minimum number of samples required to be at a leaf node. subsample : float, optional (default=1.0) The fraction of samples to be used for fitting the individual base learners. I have no idea why one would set this to something lower than one, and results will probably be strange if this is changed from the default. query_subsample : float, optional (default=1.0) The fraction of queries to be used for fitting the individual base learners. max_features : int, float, string or None, optional (default=None) The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Choosing `max_features < n_features` leads to a reduction of variance and an increase in bias. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features. max_leaf_nodes : int or None, optional (default=None) Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes. If not None then ``max_depth`` will be ignored. verbose : int, optional (default=0) Enable verbose output. If 1 then it prints progress and performance once in a while (the more trees the lower the frequency). If greater than 1 then it prints progress and performance for every tree. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just erase the previous solution. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Attributes ---------- feature_importances_ : array, shape = [n_features] The feature importances (the higher, the more important the feature). oob_improvement_ : array, shape = [n_estimators] The improvement in loss (= deviance) on the out-of-bag samples relative to the previous iteration. ``oob_improvement_[0]`` is the improvement in loss of the first stage over the ``init`` estimator. train_score_ : array, shape = [n_estimators] The i-th score ``train_score_[i]`` is the deviance (= loss) of the model at iteration ``i`` on the in-bag sample. If ``subsample == 1`` this is the deviance on the training data. estimators_ : ndarray of DecisionTreeRegressor, shape = [n_estimators, 1] The collection of fitted sub-estimators. ``loss_.K`` is 1 for binary classification, otherwise n_classes. estimators_fitted_ : int The number of sub-estimators actually fitted. This may be different from n_estimators in the case of early stoppage, trimming, etc. """ def __init__(self, metric=None, learning_rate=0.1, n_estimators=100, query_subsample=1.0, subsample=1.0, min_samples_split=2, min_samples_leaf=1, max_depth=3, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=True): super(LambdaMART, self).__init__() self.metric = metrics.dcg.NDCG() if metric is None else metric self.learning_rate = learning_rate self.n_estimators = n_estimators self.query_subsample = query_subsample self.subsample = subsample self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.max_depth = max_depth self.random_state = random_state self.max_features = max_features self.verbose = verbose self.max_leaf_nodes = max_leaf_nodes self.warm_start = warm_start def fit(self, X, y, qids, monitor=None): """Fit lambdamart onto a dataset. Parameters ---------- X : array_like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array_like, shape = [n_samples] Target values (integers in classification, real numbers in regression) For classification, labels must correspond to classes. qids : array_like, shape = [n_samples] Query ids for each sample. Samples must be grouped by query such that all queries with the same qid appear in one contiguous block. monitor : callable, optional The monitor is called after each iteration with the current iteration, a reference to the estimator and the local variables of ``_fit_stages`` as keyword arguments ``callable(i, self, locals())``. If the callable returns ``True`` the fitting procedure is stopped. The monitor can be used for various things such as computing held-out estimates, early stopping, model introspecting, and snapshoting. """ if not self.warm_start: self._clear_state() X, y = sklearn.utils.check_X_y(X, y, dtype=sklearn.tree._tree.DTYPE) n_samples, self.n_features = X.shape sklearn.utils.check_consistent_length(X, y, qids) if y.dtype.kind == 'O': y = y.astype(np.float64) random_state = sklearn.utils.check_random_state(self.random_state) self._check_params() if not self._is_initialized(): self._init_state() begin_at_stage = 0 y_pred = np.zeros(y.shape[0]) else: if self.n_estimators < self.estimators_.shape[0]: raise ValueError('n_estimators=%d must be larger or equal to ' 'estimators_.shape[0]=%d when ' 'warm_start==True' % (self.n_estimators, self.estimators_.shape[0])) begin_at_stage = self.estimators_.shape[0] self.estimators_fitted_ = begin_at_stage self.estimators_.resize((self.n_estimators, 1)) self.train_score_.resize(self.n_estimators) if self.query_subsample < 1.0: self.oob_improvement_.resize(self.n_estimators) y_pred = self.predict(X) n_stages = self._fit_stages(X, y, qids, y_pred, random_state, begin_at_stage, monitor) if n_stages < self.estimators_.shape[0]: self.trim(n_stages) return self def predict(self, X): X = sklearn.utils.validation.check_array( X, dtype=sklearn.tree._tree.DTYPE, order='C') score = np.zeros((X.shape[0], 1)) estimators = self.estimators_ if self.estimators_fitted_ < len(estimators): estimators = estimators[:self.estimators_fitted_] sklearn.ensemble._gradient_boosting.predict_stages( estimators, X, self.learning_rate, score) return score.ravel() def iter_y_delta(self, i, X): assert i >= 0 and i < self.estimators_fitted_ X = sklearn.utils.validation.check_array( X, dtype=sklearn.tree._tree.DTYPE, order='C') score = np.zeros((X.shape[0], 1)) sklearn.ensemble._gradient_boosting.predict_stage( self.estimators_, i, X, self.learning_rate, score) return score.ravel() def trim(self, n): assert n <= self.estimators_fitted_ self.estimators_fitted_ = n self.estimators_ = self.estimators_[:n] self.train_score_ = self.train_score_[:n] if hasattr(self, 'oob_improvement_'): self.oob_improvement_ = self.oob_improvement_[:n] @property def feature_importances_(self): """Return the feature importances (the higher, the more important the feature). Returns ------- feature_importances_ : array, shape = [n_features] Array of summed variance reductions. """ if self.estimators_ is None or len(self.estimators_) == 0: raise NotFittedError("Estimator not fitted, call `fit` before" " `feature_importances_`.") total_sum = np.zeros((self.n_features, ), dtype=np.float64) for stage in self.estimators_: stage_sum = sum(tree.feature_importances_ for tree in stage) / len(stage) total_sum += stage_sum importances = total_sum / len(self.estimators_) return importances def _calc_lambdas_deltas(self, qid, y, y_pred): ns = y.shape[0] positions = get_sorted_y_positions(y, y_pred, check=False) actual = y[positions] swap_deltas = self.metric.calc_swap_deltas(qid, actual) max_k = self.metric.max_k() if max_k is None or ns < max_k: max_k = ns lambdas = np.zeros(ns) deltas = np.zeros(ns) for i in range(max_k): for j in range(i + 1, ns): if actual[i] == actual[j]: continue delta_metric = swap_deltas[i, j] if delta_metric == 0.0: continue a, b = positions[i], positions[j] # invariant: y_pred[a] >= y_pred[b] if actual[i] < actual[j]: assert delta_metric > 0.0 logistic = scipy.special.expit(y_pred[a] - y_pred[b]) l = logistic * delta_metric lambdas[a] -= l lambdas[b] += l else: assert delta_metric < 0.0 logistic = scipy.special.expit(y_pred[b] - y_pred[a]) l = logistic * -delta_metric lambdas[a] += l lambdas[b] -= l gradient = (1 - logistic) * l deltas[a] += gradient deltas[b] += gradient return lambdas, deltas def _update_terminal_regions(self, tree, X, y, lambdas, deltas, y_pred, sample_mask): terminal_regions = tree.apply(X) masked_terminal_regions = terminal_regions.copy() masked_terminal_regions[~sample_mask] = -1 for leaf in np.where(tree.children_left == sklearn.tree._tree.TREE_LEAF)[0]: terminal_region = np.where(masked_terminal_regions == leaf) suml = np.sum(lambdas[terminal_region]) sumd = np.sum(deltas[terminal_region]) tree.value[leaf, 0, 0] = 0.0 if abs(sumd) < 1e-300 else (suml / sumd) y_pred += tree.value[terminal_regions, 0, 0] * self.learning_rate def _fit_stage(self, i, X, y, qids, y_pred, sample_weight, sample_mask, query_groups, random_state): """Fit another tree to the boosting model.""" assert sample_mask.dtype == np.bool n_samples = X.shape[0] all_lambdas = np.zeros(n_samples) all_deltas = np.zeros(n_samples) for qid, a, b, _ in query_groups: lambdas, deltas = self._calc_lambdas_deltas(qid, y[a:b], y_pred[a:b]) all_lambdas[a:b] = lambdas all_deltas[a:b] = deltas tree = sklearn.tree.DecisionTreeRegressor( criterion='friedman_mse', splitter='best', presort=True, max_depth=self.max_depth, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, min_weight_fraction_leaf=0.0, max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, random_state=random_state) if self.subsample < 1.0 or self.query_subsample < 1.0: sample_weight = sample_weight * sample_mask.astype(np.float64) tree.fit(X, all_lambdas, sample_weight=sample_weight, check_input=False) self._update_terminal_regions(tree.tree_, X, y, all_lambdas, all_deltas, y_pred, sample_mask) self.estimators_[i, 0] = tree self.estimators_fitted_ = i + 1 return y_pred def _fit_stages(self, X, y, qids, y_pred, random_state, begin_at_stage=0, monitor=None): n_samples = X.shape[0] do_subsample = self.subsample < 1.0 sample_weight = np.ones(n_samples, dtype=np.float64) n_queries = check_qids(qids) query_groups = np.array([(qid, a, b, np.arange(a, b)) for qid, a, b in get_groups(qids)], dtype=np.object) assert n_queries == len(query_groups) do_query_oob = self.query_subsample < 1.0 query_mask = np.ones(n_queries, dtype=np.bool) query_idx = np.arange(n_queries) q_inbag = max(1, int(self.query_subsample * n_queries)) if self.verbose: verbose_reporter = _VerboseReporter(self.verbose) verbose_reporter.init(self, begin_at_stage) for i in range(begin_at_stage, self.n_estimators): if do_query_oob: random_state.shuffle(query_idx) query_mask = np.zeros(n_queries, dtype=np.bool) query_mask[query_idx[:q_inbag]] = 1 query_groups_to_use = query_groups[query_mask] sample_mask = np.zeros(n_samples, dtype=np.bool) for qid, a, b, sidx in query_groups_to_use: sidx_to_use = sidx if do_subsample: query_samples_inbag = max( 1, int(self.subsample * (b - 1))) random_state.shuffle(sidx) sidx_to_use = sidx[:query_samples_inbag] sample_mask[sidx_to_use] = 1 if do_query_oob: old_oob_total_score = 0.0 for qid, a, b, _ in query_groups[~query_mask]: old_oob_total_score += self.metric.evaluate_preds( qid, y[a:b], y_pred[a:b]) y_pred = self._fit_stage(i, X, y, qids, y_pred, sample_weight, sample_mask, query_groups_to_use, random_state) train_total_score, oob_total_score = 0.0, 0.0 for qidx, (qid, a, b, _) in enumerate(query_groups): score = self.metric.evaluate_preds( qid, y[a:b], y_pred[a:b]) if query_mask[qidx]: train_total_score += score else: oob_total_score += score self.train_score_[i] = train_total_score / q_inbag if do_query_oob: if q_inbag < n_queries: self.oob_improvement_[i] = \ ((oob_total_score - old_oob_total_score) / (n_queries - q_inbag)) early_stop = False monitor_output = None if monitor is not None: monitor_output = monitor(i, self, locals()) if monitor_output is True: early_stop = True if self.verbose > 0: verbose_reporter.update(i, self, monitor_output) if early_stop: break return i + 1 def _is_initialized(self): return len(getattr(self, 'estimators_', [])) > 0 def _init_state(self): self.estimators_ = np.empty((self.n_estimators, 1), dtype=np.object) self.estimators_fitted_ = 0 self.train_score_ = np.zeros(self.n_estimators, dtype=np.float64) if self.query_subsample < 1.0: self.oob_improvement_ = np.zeros((self.n_estimators,), dtype=np.float64) def _clear_state(self): if hasattr(self, 'estimators_'): del self.estimators_ if hasattr(self, 'esimators_trained'): del self.estimators_fitted_ if hasattr(self, 'train_score_'): del self.train_score_ if hasattr(self, 'oob_improvement_'): del self.oob_improvement_ def _check_params(self): if self.n_estimators <= 0: raise ValueError("n_estimators must be greater than 0 but " "was %r" % self.n_estimators) if self.learning_rate <= 0.0: raise ValueError("learning_rate must be greater than 0 but " "was %r" % self.learning_rate) if not (0.0 < self.subsample <= 1.0): raise ValueError("subsample must be in (0,1] but " "was %r" % self.subsample) if not (0.0 < self.query_subsample <= 1.0): raise ValueError("query_subsample must be in (0,1] but " "was %r" % self.query_subsample) if isinstance(self.max_features, sklearn.externals.six.string_types): if self.max_features == "auto": max_features = self.n_features elif self.max_features == "sqrt": max_features = max(1, int(np.sqrt(self.n_features))) elif self.max_features == "log2": max_features = max(1, int(np.log2(self.n_features))) else: raise ValueError("Invalid value for max_features: %r. " "Allowed string values are 'auto', 'sqrt' " "or 'log2'." % self.max_features) elif self.max_features is None: max_features = self.n_features elif isinstance(self.max_features, (numbers.Integral, np.integer)): max_features = self.max_features else: # float if 0. < self.max_features <= 1.: max_features = max(int(self.max_features * self.n_features), 1) else: raise ValueError("max_features must be in (0,1]") self.max_features_ = max_features class _VerboseReporter(object): """Reports verbose output to stdout. If ``verbose==1`` output is printed once in a while (when iteration mod verbose_mod is zero).; if larger than 1 then output is printed for each update. """ def __init__(self, verbose): self.verbose = verbose def init(self, est, begin_at_stage=0): # header fields and line format str header_fields = ['Iter', 'Train score'] verbose_fmt = ['{iter:>5d}', '{train_score:>12.4f}'] # do oob? if est.query_subsample < 1: header_fields.append('OOB Improve') verbose_fmt.append('{oob_impr:>12.4f}') header_fields.append('Remaining') verbose_fmt.append('{remaining_time:>12s}') header_fields.append('Monitor Output') verbose_fmt.append('{monitor_output:>40s}') # print the header line print(('%5s ' + '%12s ' * (len(header_fields) - 2) + '%40s ') % tuple(header_fields)) self.verbose_fmt = ' '.join(verbose_fmt) # plot verbose info each time i % verbose_mod == 0 self.verbose_mod = 1 self.start_time = time.time() self.begin_at_stage = begin_at_stage def update(self, j, est, monitor_output): """Update reporter with new iteration. """ if monitor_output is True: print('Early termination at iteration ', j) return do_query_oob = est.query_subsample < 1 # we need to take into account if we fit additional estimators. i = j - self.begin_at_stage # iteration relative to the start iter if self.verbose > 1 or (i + 1) % self.verbose_mod == 0: oob_impr = est.oob_improvement_[j] if do_query_oob else 0 remaining_time = ((est.n_estimators - (j + 1)) * (time.time() - self.start_time) / float(i + 1)) if remaining_time > 60: remaining_time = '{0:.2f}m'.format(remaining_time / 60.0) else: remaining_time = '{0:.2f}s'.format(remaining_time) if monitor_output is None: monitor_output = '' print(self.verbose_fmt.format(iter=j + 1, train_score=est.train_score_[j], oob_impr=oob_impr, remaining_time=remaining_time, monitor_output=monitor_output)) if i + 1 >= 10: self.verbose_mod = 5 if i + 1 >= 50: self.verbose_mod = 10 if i + 1 >= 100: self.verbose_mod = 20 if i + 1 >= 500: self.verbose_mod = 50 if i + 1 >= 1000: self.verbose_mod = 100