# -*- coding: utf-8 -*-
"""
@author: Chenglong Chen <c.chenglong@gmail.com>
@brief: utils for scikit-learn models

"""

import numpy as np

import sklearn.svm
import sklearn.neighbors
import sklearn.ensemble
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from . import dist_utils


class SVR:
    """Wrapper around sklearn.svm.SVR with optional feature standardization."""
    def __init__(self, kernel='rbf', degree=3, gamma='auto', C=1.0, epsilon=0.1,
                 normalize=True, cache_size=2048):
        svr = sklearn.svm.SVR(kernel=kernel, degree=degree, gamma=gamma,
                              C=C, epsilon=epsilon, cache_size=cache_size)
        if normalize:
            self.model = Pipeline([('ss', StandardScaler()), ('svr', svr)])
        else:
            self.model = svr

    def __str__(self):
        return "SVR"

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        y_pred = self.model.predict(X)
        return y_pred


class LinearSVR:
    """Wrapper around sklearn.svm.LinearSVR with optional feature standardization."""
    def __init__(self, epsilon=0.0, C=1.0, loss='epsilon_insensitive',
                 random_state=None, normalize=True):
        lsvr = sklearn.svm.LinearSVR(epsilon=epsilon, C=C, loss=loss,
                                     random_state=random_state)
        if normalize:
            self.model = Pipeline([('ss', StandardScaler()), ('lsvr', lsvr)])
        else:
            self.model = lsvr

    def __str__(self):
        return "LinearSVR"

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        y_pred = self.model.predict(X)
        return y_pred


class KNNRegressor:
    """Wrapper around sklearn.neighbors.KNeighborsRegressor; supports a cosine metric."""
    def __init__(self, n_neighbors=5, weights='uniform', leaf_size=30,
                 metric='minkowski', normalize=True):
        if metric == 'cosine':
            # use the cosine similarity from dist_utils as a callable metric
            metric = lambda x, y: dist_utils._cosine_sim(x, y)
        knn = sklearn.neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                                    weights=weights,
                                                    leaf_size=leaf_size,
                                                    metric=metric)
        if normalize:
            self.model = Pipeline([('ss', StandardScaler()), ('knn', knn)])
        else:
            self.model = knn

    def __str__(self):
        return "KNNRegressor"

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        y_pred = self.model.predict(X)
        return y_pred


class AdaBoostRegressor:
    """Wrapper around sklearn.ensemble.AdaBoostRegressor with a tree base estimator."""
    def __init__(self, base_estimator=None, n_estimators=50, max_features=1.0,
                 max_depth=6, learning_rate=1.0, loss='linear', random_state=None):
        if base_estimator and base_estimator == 'etr':
            base_estimator = ExtraTreeRegressor(max_depth=max_depth,
                                                max_features=max_features)
        else:
            base_estimator = DecisionTreeRegressor(max_depth=max_depth,
                                                   max_features=max_features)
        self.model = sklearn.ensemble.AdaBoostRegressor(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=random_state,
            loss=loss)

    def __str__(self):
        return "AdaBoostRegressor"

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        y_pred = self.model.predict(X)
        return y_pred


class RandomRidge:
    """Bagging-style ensemble of Ridge regressors, each fit on a random subset
    of samples and features, optionally with degree-2 polynomial features."""
    def __init__(self, alpha=1.0, normalize=True, poly=False,
                 n_estimators=10, max_features=1.0,
                 bootstrap=True, subsample=1.0, random_state=2016):
        self.alpha = alpha
        self.normalize = normalize
        self.poly = poly
        self.n_estimators = n_estimators
        if isinstance(max_features, float):
            assert max_features > 0 and max_features <= 1
        self.max_features = max_features
        self.bootstrap = bootstrap
        assert subsample > 0 and subsample <= 1
        self.subsample = subsample
        self.random_state = random_state
        self.ridge_list = [0] * self.n_estimators
        self.feature_idx_list = [0] * self.n_estimators

    def __str__(self):
        return "RandomRidge"

    def _random_feature_idx(self, fdim, random_state):
        """Sample feature indices without replacement."""
        rng = np.random.RandomState(random_state)
        if isinstance(self.max_features, int):
            size = min(fdim, self.max_features)
        else:
            size = int(fdim * self.max_features)
        idx = rng.permutation(fdim)[:size]
        return idx

    def _random_sample_idx(self, sdim, random_state):
        """Sample row indices, with replacement if bootstrap else without."""
        rng = np.random.RandomState(random_state)
        size = int(sdim * self.subsample)
        if self.bootstrap:
            idx = rng.randint(sdim, size=size)
        else:
            idx = rng.permutation(sdim)[:size]
        return idx

    def fit(self, X, y):
        sdim, fdim = X.shape
        for i in range(self.n_estimators):
            ridge = Ridge(alpha=self.alpha, normalize=self.normalize,
                          random_state=self.random_state)
            fidx = self._random_feature_idx(fdim, self.random_state + i * 100)
            sidx = self._random_sample_idx(sdim, self.random_state + i * 10)
            X_tmp = X[sidx][:, fidx]
            if self.poly:
                # drop the constant column added by PolynomialFeatures
                X_tmp = PolynomialFeatures(degree=2).fit_transform(X_tmp)[:, 1:]
            ridge.fit(X_tmp, y[sidx])
            self.ridge_list[i] = ridge
            self.feature_idx_list[i] = fidx
        return self

    def predict(self, X):
        y_pred = np.zeros((X.shape[0], self.n_estimators))
        for i in range(self.n_estimators):
            fidx = self.feature_idx_list[i]
            ridge = self.ridge_list[i]
            X_tmp = X[:, fidx]
            if self.poly:
                X_tmp = PolynomialFeatures(degree=2).fit_transform(X_tmp)[:, 1:]
            y_pred[:, i] = ridge.predict(X_tmp)
        y_pred = np.mean(y_pred, axis=1)
        return y_pred
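

# ---------------------------------------------------------------------------
# Usage sketch (an assumption, not part of the original module): all wrappers
# above expose the same fit/predict interface, so they can be swapped freely.
# The data below is synthetic and for illustration only; because of the
# relative `dist_utils` import, this only runs when the module is executed as
# part of its package (e.g. `python -m <package>.skl_utils`) under the
# scikit-learn version the module targets.
if __name__ == "__main__":
    rng = np.random.RandomState(2016)
    X = rng.rand(100, 10)
    y = 2.0 * X[:, 0] + 0.1 * rng.rand(100)

    models = [
        SVR(C=1.0),
        LinearSVR(),
        KNNRegressor(n_neighbors=3),
        AdaBoostRegressor(n_estimators=10),
        RandomRidge(n_estimators=5, max_features=0.8, subsample=0.8),
    ]
    for model in models:
        y_pred = model.fit(X, y).predict(X)
        print("%s train MSE: %.4f" % (str(model), np.mean((y - y_pred) ** 2)))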