"""ML-ENSEMBLE Benchmark of ML-Ensemble against Scikit-learn estimators using Scikit-learn's friedman1 dataset. All estimators are instantiated with default settings, and all estimators in the ensemble are part of the benchmark. The default ensemble configuration achieves a 25% score improvement as compared to the best benchmark estimator (GradientBoostingRegressor). Example output -------------- Benchmark of ML-ENSEMBLE against Scikit-learn estimators on the friedman1 dataset. Scoring metric: Root Mean Squared Error. Available CPUs: 4 Ensemble architecture Num layers: 2 layer-1 | Min Max Scaling - Estimators: ['svr']. layer-1 | Standard Scaling - Estimators: ['elasticnet', 'lasso', 'kneighborsregressor']. layer-1 | No Preprocessing - Estimators: ['randomforestregressor', 'gradientboostingregressor']. layer-2 | (meta) GradientBoostingRegressor Benchmark estimators: GBM KNN Kernel Ridge Lasso Random Forest SVR Elastic-Net Data Features: 10 Training set sizes: from 2000 to 20000 with step size 2000. SCORES size | Ensemble | GBM | KNN | Kern Rid | Lasso | Random F | ... 2000 | 0.83 | 0.92 | 2.26 | 2.42 | 3.13 | 1.61 | ... 4000 | 0.75 | 0.91 | 2.11 | 2.49 | 3.13 | 1.39 | ... 6000 | 0.66 | 0.83 | 2.02 | 2.43 | 3.21 | 1.29 | ... 8000 | 0.66 | 0.84 | 1.95 | 2.43 | 3.19 | 1.24 | ... 10000 | 0.62 | 0.79 | 1.90 | 2.46 | 3.17 | 1.16 | ... 12000 | 0.68 | 0.86 | 1.84 | 2.46 | 3.16 | 1.10 | ... 14000 | 0.59 | 0.75 | 1.78 | 2.45 | 3.15 | 1.05 | ... 16000 | 0.62 | 0.80 | 1.76 | 2.45 | 3.15 | 1.02 | ... 18000 | 0.59 | 0.79 | 1.73 | 2.43 | 3.12 | 1.01 | ... 20000 | 0.56 | 0.73 | 1.70 | 2.42 | 4.87 | 0.99 | ... size | SVR | elNet | 2000 | 2.32 | 3.18 | 4000 | 2.31 | 3.16 | 6000 | 2.18 | 3.25 | 8000 | 2.09 | 3.24 | 10000 | 2.03 | 3.21 | 12000 | 1.97 | 3.21 | 16000 | 1.92 | 3.20 | 16000 | 1.87 | 3.19 | 18000 | 1.83 | 3.17 | 20000 | 1.81 | 4.75 | FIT TIMES size | Ensemble | GBM | KNN | Kern Rid | Lasso | Random F | 2000 | 0:00 | 0:00 | 0:00 | 0:00 | 0:00 | 0:00 | 4000 | 0:01 | 0:00 | 0:00 | 0:00 | 0:00 | 0:00 | 6000 | 0:02 | 0:00 | 0:00 | 0:01 | 0:00 | 0:00 | 8000 | 0:04 | 0:00 | 0:00 | 0:04 | 0:00 | 0:00 | 10000 | 0:06 | 0:01 | 0:00 | 0:08 | 0:00 | 0:00 | 12000 | 0:08 | 0:01 | 0:00 | 0:12 | 0:00 | 0:00 | 14000 | 0:10 | 0:01 | 0:00 | 0:20 | 0:00 | 0:00 | 16000 | 0:13 | 0:02 | 0:00 | 0:34 | 0:00 | 0:00 | 18000 | 0:17 | 0:02 | 0:00 | 0:47 | 0:00 | 0:00 | 20000 | 0:20 | 0:02 | 0:00 | 1:20 | 0:00 | 0:00 | size | SVR | elNet | 2000 | 0:00 | 0:00 | 4000 | 0:00 | 0:00 | 6000 | 0:01 | 0:00 | 8000 | 0:02 | 0:00 | 10000 | 0:03 | 0:00 | 12000 | 0:04 | 0:00 | 16000 | 0:06 | 0:00 | 16000 | 0:08 | 0:00 | 18000 | 0:10 | 0:00 | 20000 | 0:13 | 0:00 | """ import numpy as np import os from mlens.ensemble import SuperLearner from mlens.utils import safe_print from mlens.metrics import rmse from sklearn.datasets import make_friedman1 from sklearn.base import clone from sklearn.pipeline import make_pipeline from sklearn.linear_model import Lasso, ElasticNet from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from sklearn.svm import SVR from sklearn.neighbors import KNeighborsRegressor from sklearn.preprocessing import StandardScaler, MinMaxScaler from sklearn.kernel_ridge import KernelRidge from time import time def build_ensemble(**kwargs): """Generate ensemble.""" ens = SuperLearner(**kwargs) prep = {'Standard Scaling': [StandardScaler()], 'Min Max Scaling': [MinMaxScaler()], 'No Preprocessing': []} est = {'Standard Scaling': [ElasticNet(), Lasso(), KNeighborsRegressor()], 'Min Max Scaling': [SVR()], 'No Preprocessing': [RandomForestRegressor(random_state=SEED), GradientBoostingRegressor()]} ens.add(est, prep) ens.add(GradientBoostingRegressor(), meta=True) return ens if __name__ == '__main__': safe_print("\nML-ENSEMBLE\n") safe_print("Benchmark of ML-ENSEMBLE against Scikit-learn estimators " "on the friedman1 dataset.\n") safe_print("Scoring metric: Root Mean Squared Error.\n") safe_print("Available CPUs: %i\n" % os.cpu_count()) SEED = 2017 np.random.seed(SEED) step = 4000 mi = step mx = 40000 + step ens_multi = build_ensemble(folds=2, shuffle=False, n_jobs=-1) ESTIMATORS = {'Ensemble': ens_multi, 'Random F': RandomForestRegressor(random_state=SEED, n_jobs=-1), ' elNet': make_pipeline(StandardScaler(), ElasticNet()), ' Lasso': make_pipeline(StandardScaler(), Lasso()), 'Kern Rid': make_pipeline(MinMaxScaler(), KernelRidge()), ' SVR': make_pipeline(MinMaxScaler(), SVR()), ' GBM': GradientBoostingRegressor(), ' KNN': KNeighborsRegressor(n_jobs=-1)} names = {k.strip(' '): k for k in ESTIMATORS} times = {e: [] for e in ESTIMATORS} scores = {e: [] for e in ESTIMATORS} sizes = range(mi, mx, step) safe_print('Ensemble architecture') safe_print("Num layers: %i" % ens_multi.layers.n_layers) for layer in ens_multi.layers.layers: for case in ens_multi.layers.summary[layer]['cases']: if case is None: continue el = ens_multi.layers.layers[layer].estimators[case] safe_print('%s | %s - Estimators: %r.' % (layer, case, [e for e, _ in el])) safe_print("%s | %s" % ('layer-2', '(meta) ' + ens_multi.layers.layers['layer-2'].estimators[0][0])) safe_print("\nBenchmark estimators", end=": ") for name in sorted(names): if name == 'Ensemble': continue safe_print(name, end=" ") safe_print('\n') safe_print('Data') safe_print('Features: %i' % 10) safe_print('Training set sizes: from %i to %i with step size %i.\n' % ( np.floor(mi / 2), np.floor((mx - step) / 2), np.floor(step/2))) safe_print('SCORES') safe_print('%6s' % 'size', end=' | ') for name in sorted(names): safe_print('%s' % names[name], end=' | ') safe_print() for size in sizes: n = int(np.floor(size / 2)) X, y = make_friedman1(n_samples=size, random_state=SEED) safe_print('%6i' % n, end=' | ') for name in sorted(names): e = clone(ESTIMATORS[names[name]]) t0 = time() e.fit(X[:n], y[:n]) t1 = time() - t0 times[names[name]].append(t1) s = rmse(y[n:], e.predict(X[n:])) scores[names[name]].append(s) safe_print('%8.2f' % (s), end=' | ', flush=True) safe_print() safe_print('\nFIT TIMES') safe_print('%6s' % 'size', end=' | ') for name in sorted(names): safe_print('%s' % names[name], end=' | ') safe_print() for i, size in enumerate(sizes): n = int(np.floor(size / 2)) safe_print('%6i' % n, end=' | ') for name in sorted(names): t = times[names[name]][i] m, s = divmod(t, 60) safe_print('%5d:%02d' % (m, s), end=' | ') safe_print()