# Licensed under the MIT License - https://opensource.org/licenses/MIT from sklearn import linear_model from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.svm import SVR from sklearn.neural_network import MLPRegressor from sklearn.utils import shuffle from sklearn.base import BaseEstimator from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.model_selection import GridSearchCV import math import numpy as np import random import logging import numbers logger = logging.getLogger('pycobra.kernelcobra') class KernelCobra(BaseEstimator): """ Regression algorithm as introduced by Kernel-COBRA: A combined regression-classification strategy using Kernels. Based on the paper by Guedj, Srinivasa Desikan [2018]. Parameters ---------- random_state: integer or a numpy.random.RandomState object. Set the state of the random number generator to pass on to shuffle and loading machines, to ensure reproducibility of your experiments, for example. Attributes ---------- estimators_: A dictionary which maps machine names to the machine objects. The machine object must have a predict method for it to be used during aggregation. machine_predictions_: A dictionary which maps machine name to it's predictions over X_l This value is used to determine which points from y_l are used to aggregate. all_predictions_: numpy array with all the predictions, to be used for bandwidth manipulation. """ def __init__(self, random_state=None, machine_list='basic'): self.random_state = random_state self.machine_list = machine_list def fit(self, X, y, default=True, X_k=None, X_l=None, y_k=None, y_l=None): """ Parameters ---------- X: array-like, [n_samples, n_features] Training data which will be used to create the COBRA aggregate. y: array-like, shape = [n_samples] Target values used to train the machines used in the aggregation. default: bool, optional If set as true then sets up COBRA with default machines and splitting. X_k : shape = [n_samples, n_features] Training data which is used to train the machines used in the aggregation. Can be loaded directly into COBRA; if not, the split_data method is used as default. y_k : array-like, shape = [n_samples] Target values used to train the machines used in the aggregation. X_l : shape = [n_samples, n_features] Training data which is used to form the aggregate. Can be loaded directly into COBRA; if not, the split_data method is used as default. y_l : array-like, shape = [n_samples] Target values which are actually used to form the aggregate. """ X, y = check_X_y(X, y) self.X_ = X self.y_ = y self.X_k_ = X_k self.X_l_ = X_l self.y_k_ = y_k self.y_l_ = y_l self.estimators_ = {} # set-up COBRA with default machines if default: self.split_data() self.load_default(machine_list=self.machine_list) self.load_machine_predictions() return self def pred(self, X, kernel=None, metric=None, bandwidth=1, **kwargs): """ Performs the Kernel-COBRA aggregation scheme, used in predict method. Parameters ---------- X: array-like, [n_features] kernel: function, optional kernel refers to the kernel method which we wish to use to perform the aggregation. metric: function, optional metric refers to the metric method which we wish to use to perform the aggregation. bandwidth: float, optional Bandwidth for the deafult kernel value (gaussian), and is set to 1. kwargs requires you to pass arguments with "kernel_params" and "metric_params", if the custom kernel or metric has more paramteres. Returns ------- avg: prediction """ a = np.zeros(len(self.X_l_)) for machine in self.estimators_: val = self.estimators_[machine].predict(X) for index, value in np.ndenumerate(self.machine_predictions_[machine]): if metric is not None: try: a[index] += metric(value, val, kwargs["metric_params"]) except KeyError: a[index] += metric(value, val) else: a[index] += math.fabs(value - val) # normalise the array if kernel is not None: try: a = np.divide(kernel(a, kwargs["kernel_params"]), np.sum(kernel(a, kwargs["kernel_params"]))) except KeyError: a = np.divide(kernel(a), np.sum(kernel(a))) else: exp = np.nan_to_num(np.exp(- bandwidth * a)) a = np.nan_to_num(np.divide(exp, np.sum(exp))) return np.sum(np.multiply(self.y_l_, a)) def predict(self, X, kernel=None, metric=None, bandwidth=1, **kwargs): """ Performs the Kernel-COBRA aggregation scheme, calls pred. Parameters ---------- X: array-like, [n_features] kernel: function, optional kernel refers to the kernel method which we wish to use to perform the aggregation. metric: function, optional metric refers to the metric method which we wish to use to perform the aggregation. bandwidth: float, optional Bandwidth for the deafult kernel value (gaussian), and is set to 1. kwargs requires you to pass arguments with "kernel_params" and "metric_params", if the custom kernel or metric has more paramteres. Returns ------- avg: prediction """ X = check_array(X) if X.ndim == 1: return self.pred(X.reshape(1, -1)) result = np.zeros(len(X)) avg_points = 0 index = 0 for vector in X: result[index] = self.pred(vector.reshape(1, -1), kernel=kernel, metric=metric, bandwidth=bandwidth, **kwargs) index += 1 return result def split_data(self, k=None, l=None, shuffle_data=False): """ Split the data into different parts for training machines and for aggregation. Parameters ---------- k : int, optional k is the number of points used to train the machines. Those are the first k points of the data provided. l: int, optional l is the number of points used to form the COBRA aggregate. shuffle: bool, optional Boolean value to decide to shuffle the data before splitting. Returns ------- self : returns an instance of self. """ if shuffle_data: self.X_, self.y_ = shuffle(self.X_, self.y_, random_state=self.random_state) if k is None and l is None: k = int(len(self.X_) / 2) l = int(len(self.X_)) if k is not None and l is None: l = len(self.X_) - k if l is not None and k is None: k = len(self.X_) - l self.X_k_ = self.X_[:k] self.X_l_ = self.X_[k:l] self.y_k_ = self.y_[:k] self.y_l_ = self.y_[k:l] return self def load_default(self, machine_list='basic'): """ Loads 4 different scikit-learn regressors by default. The advanced list adds more machines. Parameters ---------- machine_list: optional, list of strings List of default machine names to be loaded. Default is basic, Returns ------- self : returns an instance of self. """ if machine_list == 'basic': machine_list = ['tree', 'ridge', 'random_forest', 'svm'] if machine_list == 'advanced': machine_list=['lasso', 'tree', 'ridge', 'random_forest', 'svm', 'bayesian_ridge', 'sgd'] self.estimators_ = {} for machine in machine_list: try: if machine == 'lasso': self.estimators_['lasso'] = linear_model.LassoCV(random_state=self.random_state).fit(self.X_k_, self.y_k_) if machine == 'tree': self.estimators_['tree'] = DecisionTreeRegressor(random_state=self.random_state).fit(self.X_k_, self.y_k_) if machine == 'ridge': self.estimators_['ridge'] = linear_model.RidgeCV().fit(self.X_k_, self.y_k_) if machine == 'random_forest': self.estimators_['random_forest'] = RandomForestRegressor(random_state=self.random_state).fit(self.X_k_, self.y_k_) if machine == 'svm': self.estimators_['svm'] = SVR().fit(self.X_k_, self.y_k_) if machine == 'sgd': self.estimators_['sgd'] = linear_model.SGDRegressor(random_state=self.random_state).fit(self.X_k_, self.y_k_) if machine == 'bayesian_ridge': self.estimators_['bayesian_ridge'] = linear_model.BayesianRidge().fit(self.X_k_, self.y_k_) except ValueError: continue return self def load_machine(self, machine_name, machine): """ Adds a machine to be used during the aggregation strategy. The machine object must have been trained using X_k and y_k, and must have a 'predict()' method. After the machine is loaded, for it to be used during aggregation, load_machine_predictions must be run. Parameters ---------- machine_name : string Name of the machine you are loading machine: machine/regressor object The regressor machine object which is mapped to the machine_name Returns ------- self : returns an instance of self. """ self.estimators_[machine_name] = machine return self def load_machine_predictions(self, predictions=None): """ Stores the trained machines' predicitons on training data in a dictionary, to be used for predictions. Should be run after all the machines to be used for aggregation is loaded. Parameters ---------- predictions: dictionary, optional A pre-existing machine:predictions dictionary can also be loaded. Returns ------- self : returns an instance of self. """ self.machine_predictions_ = {} self.all_predictions_ = np.array([]) if predictions is None: for machine in self.estimators_: self.machine_predictions_[machine] = self.estimators_[machine].predict(self.X_l_) # all_predictions_ is used in the diagnostics class, and for initialising epsilon self.all_predictions_ = np.append(self.all_predictions_, self.machine_predictions_[machine]) if predictions is not None: self.machine_predictions_ = predictions return self