"""ML-ENSEMBLE

Comparison of multiprocessing performance as data scales.

Example Output
--------------

ML-ENSEMBLE

Ensemble scale benchmark for data dimensioned up to (250000, 20)
Available CPUs: 4

Ensemble architecture
Num layers: 2
layer-1 | Estimators: ['svr', 'randomforestregressor', 'gradientboostingregressor', 'lasso', 'mlpregressor'].
layer-2 | Meta Estimator: lasso

SCORES (TIME TO FIT)
Sample size
      20000 SuperLearner : 0.807 ( 19.83s) | BlendEnsemble : 0.823 (  4.09s) | Subsemble : 0.789 (  9.84s) |
      40000 SuperLearner : 0.396 ( 42.94s) | BlendEnsemble : 0.462 ( 11.37s) | Subsemble : 0.777 ( 28.49s) |
      60000 SuperLearner : 0.280 ( 75.08s) | BlendEnsemble : 0.328 ( 23.43s) | Subsemble : 0.570 ( 56.93s) |
      80000 SuperLearner : 0.310 (126.59s) | BlendEnsemble : 0.414 ( 41.75s) | Subsemble : 0.434 ( 90.66s) |
     100000 SuperLearner : 0.447 (180.77s) | BlendEnsemble : 0.308 ( 63.80s) | Subsemble : 0.541 (111.31s) |
     120000 SuperLearner : 0.306 (243.34s) | BlendEnsemble : 0.281 ( 92.71s) | Subsemble : 0.323 (129.15s) |
     140000 SuperLearner : 0.269 (312.58s) | BlendEnsemble : 0.408 (107.19s) | Subsemble : 0.303 (165.86s) |
     160000 SuperLearner : 0.298 (410.33s) | BlendEnsemble : 0.312 (145.76s) | Subsemble : 0.343 (234.12s) |
     180000 SuperLearner : 0.250 (614.27s) | BlendEnsemble : 0.279 (195.74s) | Subsemble : 0.272 (295.76s) |
     200000 SuperLearner : 0.301 (594.41s) | BlendEnsemble : 0.390 (208.11s) | Subsemble : 0.260 (265.42s) |
     220000 SuperLearner : 0.280 (787.79s) | BlendEnsemble : 0.260 (251.45s) | Subsemble : 0.407 (356.17s) |
     240000 SuperLearner : 0.304 (928.15s) | BlendEnsemble : 0.299 (314.76s) | Subsemble : 0.300 (459.59s) |
     260000 SuperLearner : 0.252 (1226.66s) | BlendEnsemble : 0.273 (350.77s) | Subsemble : 0.279 (462.97s) |
Benchmark done | 04:20:34

Plotting results...
Figure written to /Users/Sebastian/Documents/python/mlens_dev/scale_benchmark2_time.png
Figure written to /Users/Sebastian/Documents/python/mlens_dev/scale_benchmark2_score.png
done.

"""

import os
import numpy as np

from mlens.ensemble import SuperLearner, BlendEnsemble, Subsemble
from mlens.utils import print_time
from mlens.metrics import rmse

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.base import clone
from sklearn.datasets import make_friedman1
from time import perf_counter
import warnings

# When true, the script renders the timing and score figures after the run.
PLOT = True

# Ensemble classes under test and, position by position, the extra keyword
# arguments each class is constructed with (the two lists are zipped).
ENS = [SuperLearner, BlendEnsemble, Subsemble]
KWG = [
    {'folds': 2},
    {},
    {'partitions': 3, 'folds': 2},
]

# Sample-size grid: sizes start at STEP and grow in increments of STEP.
MAX = 250000
STEP = 20000

# Number of feature columns in the generated data set.
COLS = 20

# One seed shared by numpy's global RNG and the data generator.
SEED = 2017
np.random.seed(SEED)


def build_ensemble(kls, **kwargs):
    """Instantiate ``kls`` and attach the benchmark's estimators to it.

    Parameters
    ----------
    kls : class
        Ensemble class to instantiate. The resulting instance must expose
        ``add`` and ``add_meta``.
    **kwargs
        Passed through unchanged to the ``kls`` constructor.

    Returns
    -------
    object
        The ``kls`` instance with one layer of five base regressors and a
        Lasso meta estimator added.
    """
    ensemble = kls(**kwargs)

    base_learners = [
        SVR(),
        RandomForestRegressor(),
        GradientBoostingRegressor(),
        Lasso(copy_X=False),
        MLPRegressor(shuffle=False, alpha=0.001),
    ]
    ensemble.add(base_learners)

    meta_learner = Lasso(copy_X=False)
    ensemble.add_meta(meta_learner)

    return ensemble

if __name__ == '__main__':
    # Script entry point: build one ensemble per class in ENS, fit each on
    # growing data sets, print score and fit time per sample size, and
    # optionally plot both series.

    warnings.filterwarnings("ignore")

    # os.cpu_count() returns None when the count cannot be determined;
    # formatting None with %i would raise TypeError, so fall back to text.
    n_cpus = os.cpu_count()
    cpu_label = 'unknown' if n_cpus is None else str(n_cpus)

    ens = [build_ensemble(kls, n_jobs=-1, **kwd) for kls, kwd in zip(ENS, KWG)]

    # Sample sizes used for both the benchmark and the plots. Because the
    # stop value is MAX + STEP, the final size can exceed MAX when MAX is
    # not a multiple of STEP.
    sizes = range(STEP, MAX + STEP, STEP)

    ###########################################################################
    # PRINTED MESSAGE
    print("\nML-ENSEMBLE\n")
    print("Ensemble scale benchmark for data "
          "dimensioned up to (%i, %i)" % (MAX, COLS))
    print("Available CPUs: %s\n" % cpu_label)
    print('Ensemble architecture')

    # All ensembles share the same estimators, so the first one is used to
    # describe the architecture.
    layers = ens[0].layers
    print("Num layers: %i" % layers.n_layers)

    for lyr in layers.layers:
        # The layer whose trailing digit equals the layer count is the meta
        # layer; it is reported separately below.
        if int(lyr[-1]) == layers.n_layers:
            continue

        print('%s | Estimators: %r.' %
              (lyr, [est for est, _ in layers.layers[lyr].estimators]))

    print("%s | Meta Estimator: %s" %
          ('layer-2', layers.layers['layer-2'].estimators[0][0]))

    print('\nSCORES (TIME TO FIT)')
    print('%11s' % 'Sample size', flush=True)

    ###########################################################################
    # ESTIMATION
    # Key the result tables by class name without instantiating the classes.
    times = {kls.__name__: [] for kls in ENS}
    scores = {kls.__name__: [] for kls in ENS}

    ts = perf_counter()
    for s in sizes:

        # First half of the rows is used for fitting, the rest for scoring.
        q = s // 2

        print('%11i' % s, end=" ", flush=True)

        X, y = make_friedman1(n_samples=s, n_features=COLS, random_state=SEED)

        # Fit a fresh clone of every ensemble and time the fit only.
        for template in ens:
            name = template.__class__.__name__
            model = clone(template)

            t0 = perf_counter()
            model.fit(X[:q], y[:q])
            elapsed = perf_counter() - t0

            sc = rmse(y[q:], model.predict(X[q:]))

            times[name].append(elapsed)
            scores[name].append(sc)

            print('%s : %.3f (%6.2fs) |' % (name, sc, elapsed),
                  end=" ", flush=True)
        print()

    print_time(ts, "Benchmark done")

    if PLOT:
        # The handler deliberately covers the whole section, so any
        # ImportError raised while plotting is reported the same way.
        try:
            import matplotlib.pyplot as plt

            plt.ion()
            print("Plotting results...", flush=True)

            # Every second sample size gets a tick; the label is half the
            # sample size, i.e. the number of rows used for fitting.
            ticks = list(range(STEP, MAX + STEP, 2 * STEP))
            tick_labels = [t // 2 for t in ticks]
            colors = [plt.cm.rainbow(v)
                      for v in np.linspace(0, 1.0, len(ENS))]

            # (series, title, y-axis label, output path relative to cwd)
            figures = [
                (times, 'Time to fit', 'Time to fit (sec)',
                 'scale_ens_time.png'),
                (scores, 'Test set accuracy', 'Root Mean Square Error',
                 'docs/img/scale_ens_score.png'),
            ]

            for series, title, ylabel, rel_path in figures:
                plt.figure(figsize=(8, 8))

                for color, (label, values) in zip(colors, series.items()):
                    plt.plot(sizes, values, color=color, marker='.',
                             label='%s' % label)

                plt.xticks(ticks, tick_labels)
                plt.title(title)
                plt.xlabel('Sample size')
                plt.ylabel(ylabel)
                plt.legend(frameon=False)

                f = os.path.join(os.getcwd(), rel_path)
                # Create the target directory first so that saving does not
                # fail (after a long benchmark) when it is missing.
                os.makedirs(os.path.dirname(f), exist_ok=True)
                plt.savefig(f, bbox_inches='tight', dpi=600)
                print("Figure written to %s" % f)

            print("done.")
        except ImportError:
            print("Could not import matplotlib. Will ignore PLOT flag.")