python source code of model

#
# Copyright (c) 2018 Jonathan Weyn <jweyn@uw.edu>
#
# See the file LICENSE for your rights.
#

"""
Methods for training scikit-learn models.
"""

from datetime import datetime, timedelta
import pandas as pd

from mosx.util import get_object, to_bool, dewpoint
from mosx.estimators import TimeSeriesEstimator, RainTuningEstimator, BootStrapEnsembleEstimator
import pickle
import numpy as np


def build_estimator(config):
    """
    Build the estimator object from the parameters in config.

    :param config:
    :return:
    """
    regressor = config['Model']['regressor']
    sklearn_kwargs = config['Model']['Parameters']
    train_individual = config['Model']['train_individual']
    ada_boost = config['Model'].get('Ada boosting', None)
    rain_tuning = config['Model'].get('Rain tuning', None)
    bootstrap = config['Model'].get('Bootstrapping', None)
    Regressor = get_object('sklearn.%s' % regressor)
    if config['verbose']:
        print('build_estimator: using sklearn.%s as estimator...' % regressor)

    from sklearn.preprocessing import Imputer
    from sklearn.preprocessing import StandardScaler as Scaler
    from sklearn.pipeline import Pipeline

    # Create and train the learning algorithm
    if config['verbose']:
        print('build_estimator: here are the parameters passed to the learning algorithm...')
        print(sklearn_kwargs)

    # Create the pipeline list
    pipeline = [("imputer", Imputer(missing_values=np.nan, strategy="mean", axis=0))]
    if config['Model']['predict_timeseries']:
        pipeline_timeseries = [("imputer", Imputer(missing_values=np.nan, strategy="mean", axis=0))]

    if not (regressor.startswith('ensemble')):
        # Need to add feature scaling
        pipeline.append(("scaler", Scaler()))
        if config['Model']['predict_timeseries']:
            pipeline_timeseries.append(("scaler", Scaler()))

    # Create the regressor object
    regressor_obj = Regressor(**sklearn_kwargs)
    if ada_boost is not None:
        if config['verbose']:
            print('build_estimator: using Ada boosting...')
        from sklearn.ensemble import AdaBoostRegressor
        regressor_obj = AdaBoostRegressor(regressor_obj, **ada_boost)
    if train_individual:
        if config['verbose']:
            print('build_estimator: training separate models for each parameter...')
        from sklearn.multioutput import MultiOutputRegressor
        multi_regressor = MultiOutputRegressor(regressor_obj, 4)
        pipeline.append(("regressor", multi_regressor))
    else:
        pipeline.append(("regressor", regressor_obj))
    if config['Model']['predict_timeseries']:
        pipeline_timeseries.append(("regressor", regressor_obj))

    # Make the final estimator with a Pipeline
    if config['Model']['predict_timeseries']:
        estimator = TimeSeriesEstimator(Pipeline(pipeline), Pipeline(pipeline_timeseries))
    else:
        estimator = Pipeline(pipeline)

    if rain_tuning is not None and regressor.startswith('ensemble'):
        if config['verbose']:
            print('build_estimator: using rain tuning...')
        rain_kwargs = rain_tuning.copy()
        rain_kwargs.pop('use_raw_rain', None)
        estimator = RainTuningEstimator(estimator, **rain_kwargs)

    # Add bootstrapping if requested
    if bootstrap is not None:
        if config['verbose']:
            print('build_estimator: using bootstrapping ensemble...')
        estimator = BootStrapEnsembleEstimator(estimator, **bootstrap)

    return estimator


def build_train_data(config, predictor_file, no_obs=False, no_models=False, test_size=0):
    """
    Build the array of training (and optionally testing) data.

    :param config:
    :param predictor_file:
    :param no_obs:
    :param no_models:
    :param test_size:
    :return:
    """
    from sklearn.model_selection import train_test_split

    if config['verbose']:
        print('build_train_data: reading predictor file')
    rain_tuning = config['Model'].get('Rain tuning', None)
    with open(predictor_file, 'rb') as handle:
        data = pickle.load(handle)

    # Select data
    if no_obs and no_models:
        no_obs = False
        no_models = False
    if no_obs:
        if config['verbose']:
            print('build_train_data: not using observations to train')
        predictors = data['BUFKIT']
    elif no_models:
        if config['verbose']:
            print('build_train_data: not using models to train')
        predictors = data['OBS']
    else:
        predictors = np.concatenate((data['BUFKIT'], data['OBS']), axis=1)
    if rain_tuning is not None and to_bool(rain_tuning.get('use_raw_rain', False)):
        predictors = np.concatenate((predictors, data.rain), axis=1)
        rain_shape = data.rain.shape[-1]
    targets = data['VERIF']

    if test_size > 0:
        p_train, p_test, t_train, t_test = train_test_split(predictors, targets, test_size=test_size)
        if rain_tuning is not None and to_bool(rain_tuning.get('use_raw_rain', False)):
            r_train = p_train[:, -1*rain_shape:]
            p_train = p_train[:, :-1*rain_shape]
            r_test = p_test[:, -1 * rain_shape:]
            p_test = p_test[:, :-1 * rain_shape]
        else:
            r_train = None
            r_test = None
        return p_train, t_train, r_train, p_test, t_test, r_test
    else:
        if rain_tuning is not None and to_bool(rain_tuning.get('use_raw_rain', False)):
            return predictors, targets, data.rain
        else:
            return predictors, targets, None


def train(config, predictor_file, estimator_file=None, no_obs=False, no_models=False, test_size=0):
    """
    Generate and train a scikit-learn machine learning estimator. The estimator object is saved as a pickle so that it
    may be imported and used for predictions at any time.

    :param config:
    :param predictor_file: str: full path to saved file of predictor data
    :param estimator_file: str: full path to output model file
    :param no_obs: bool: if True, generates the model with no OBS data
    :param no_models: bool: if True, generates the model with no BUFR data
    :param test_size: int: if > 0, returns a subset of the training data of size 'test_size' to test on
    :return: matplotlib Figure if plot_learning_curve is True
    """
    estimator = build_estimator(config)
    rain_tuning = config['Model'].get('Rain tuning', None)
    if test_size > 0:
        p_train, t_train, r_train, p_test, t_test, r_test = build_train_data(config, predictor_file, no_obs=no_obs,
                                                                             no_models=no_models, test_size=test_size)
    else:
        p_train, t_train, r_train = build_train_data(config, predictor_file, no_obs=no_obs, no_models=no_models)

    print('train: training the estimator')
    if rain_tuning is not None and to_bool(rain_tuning.get('use_raw_rain', False)):
        estimator.fit(p_train, t_train, rain_array=r_train)
    else:
        estimator.fit(p_train, t_train)

    if estimator_file is None:
        estimator_file = '%s/%s_mosx.pkl' % (config['MOSX_ROOT'], config['station_id'])
    print('train: -> exporting to %s' % estimator_file)
    with open(estimator_file, 'wb') as handle:
        pickle.dump(estimator, handle, protocol=pickle.HIGHEST_PROTOCOL)

    if test_size > 0:
        return p_test, t_test, r_test
    return


def _plot_learning_curve(estimator, X, y, ylim=None, cv=None, scoring=None, title=None, n_jobs=1,
                         train_sizes=np.linspace(.1, 1.0, 5)):
    import matplotlib.pyplot as plt
    from sklearn.model_selection import learning_curve

    fig = plt.figure()
    if title is not None:
        plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return fig


def plot_learning_curve(config, predictor_file, no_obs=False, no_models=False, ylim=None, cv=None, scoring=None,
                        title=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve. From scikit-learn:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

    Parameters
    ----------
    config :

    predictor_file : string
        Full path to file containing predictor data

    no_obs : boolean
        Train model without observations

    no_models : boolean
        Train model without model data

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    scoring :
        Scoring function for the error calculation; should be a scikit-learn scorer object

    title : string
        Title for the chart.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : iterable, optional
        Sequence of subsets of training data used in learning curve plot
    """
    estimator = build_estimator(config)
    X, y = build_train_data(config, predictor_file, no_obs=no_obs, no_models=no_models)
    fig = _plot_learning_curve(estimator, X, y, ylim=ylim, cv=cv, scoring=scoring, title=title, n_jobs=n_jobs,
                               train_sizes=train_sizes)
    return fig


def combine_train_test(config, train_file, test_file, no_obs=False, no_models=False, return_count_test=True):
    """
    Concatenates the arrays of predictors and verification values from the train file and the test file. Useful for
    implementing cross-validation using scikit-learn's methods and the SplitConsecutive class.

    :param config:
    :param train_file: str: full path to predictor file for training
    :param test_file: str: full path to predictor file for validation
    :param no_obs: bool: if True, generates the model with no OBS data
    :param no_models: bool: if True, generates the model with no BUFR data
    :param return_count_test: bool: if True, also returns the number of samples in the test set (see SplitConsecutive)
    :return: predictors, verifications: concatenated arrays of predictors and verification values; count: number of
    samples in the test set
    """
    p_train, t_train = build_train_data(config, train_file, no_obs=no_obs, no_models=no_models)
    p_test, t_test = build_train_data(config, test_file, no_obs=no_obs, no_models=no_models)
    p_combined = np.concatenate((p_train, p_test), axis=0)
    t_combined = np.concatenate((t_train, t_test), axis=0)
    if return_count_test:
        return p_combined, t_combined, t_test.shape[0]
    else:
        return p_combined, t_combined


class SplitConsecutive(object):
    """
    Implements a split method to subset a training set into train and test sets, using the first or last n samples in
    the set.
    """

    def __init__(self, first=False, n_samples=0.2):
        """
        Create an instance of SplitConsecutive.

        :param first: bool: if True, gets test data from the beginning of the data set; otherwise from the end
        :param n_samples: float or int: if float, subsets a fraction (0 to 1) of the data into the test set; if int,
        subsets a specific number of samples.
        """
        if type(first) is not bool:
            raise TypeError("'first' must be a boolean type.")
        try:
            n_samples = int(n_samples)
        except:
            pass
        if type(n_samples) is float and (n_samples <= 0. or n_samples >= 1.):
            raise ValueError("if float, 'n_samples' must be between 0 and 1.")
        if type(n_samples) is not float and type(n_samples) is not int:
            raise TypeError("'n_samples' must be float or int type.")
        self.first = first
        self.n_samples = n_samples
        self.n_splits = 1

    def split(self, X, y=None, groups=None):
        """
        Produces arrays of indices to use for model and test splits.

        :param X: array-like, shape (samples, features): predictor data
        :param y: array-like, shape (samples, outputs) or None: verification data; ignored
        :param groups: ignored
        :return: model, test: 1-D arrays of sample indices in the model and test sets
        """
        num_samples = X.shape[0]
        indices = np.arange(0, num_samples, 1, dtype=np.int32)
        if type(self.n_samples) is float:
            self.n_samples = int(np.round(num_samples * self.n_samples))
        if self.first:
            test = indices[:self.n_samples]
            train = indices[self.n_samples:]
        else:
            test = indices[-self.n_samples:]
            train = indices[:num_samples - self.n_samples]
        yield train, test

    def get_n_splits(self, X=None, y=None, groups=None):
        """
        Return the number of splits. Dummy function for compatibility.

        :param X: ignored
        :param y: ignored
        :param groups: ignored
        :return:
        """
        return self.n_splits


def predict_all(config, predictor_file, ensemble=False, time_series_date=None, naive_rain_correction=False,
                round_result=False, **kwargs):
    """
    Predict forecasts from the estimator in config. Also return probabilities and time series.

    :param config:
    :param predictor_file: str: file containing predictor data from mosx.model.format_predictors
    :param ensemble: bool: if True, return an array of num_trees-by-4 of the predictions of each tree in the estimator
    :param time_series_date: datetime: if set, returns a time series prediction from the estimator, where the datetime
    provided is the day the forecast is for (only works for single-day runs, or assumes last day)
    :param naive_rain_correction: bool: if True, applies manual tuning to the rain forecast
    :param round_result: bool: if True, rounds the predicted estimate
    :param kwargs: passed to the estimator's 'predict' method
    :return:
    predicted: ndarray: num_samples x num_predicted_variables predictions
    all_predicted: ndarray: num_samples x num_predicted_variables x num_ensemble_members predictions for all trees
    predicted_timeseries: DataFrame: time series for final sample
    """
    # Load the predictor data and estimator
    with open(predictor_file, 'rb') as handle:
        predictor_data = pickle.load(handle)
    rain_tuning = config['Model'].get('Rain tuning', None)
    if config['verbose']:
        print('predict: loading estimator %s' % config['Model']['estimator_file'])
    with open(config['Model']['estimator_file'], 'rb') as handle:
        estimator = pickle.load(handle)

    predictors = np.concatenate((predictor_data['BUFKIT'], predictor_data['OBS']), axis=1)
    if config['Model']['rain_forecast_type'] == 'pop' and getattr(estimator, 'is_classifier', False):
        predict_method = estimator.predict_proba
    else:
        predict_method = estimator.predict
    if rain_tuning is not None and to_bool(rain_tuning.get('use_raw_rain', False)):
        predicted = predict_method(predictors, rain_array=predictor_data.rain, **kwargs)
    else:
        predicted = predict_method(predictors, **kwargs)
    precip = predictor_data.rain

    # Check for precipitation override
    if naive_rain_correction:
        for day in range(predicted.shape[0]):
            if sum(precip[day]) < 0.01:
                if config['verbose']:
                    print('predict: warning: overriding MOS-X rain prediction of %0.2f on day %s with 0' %
                          (predicted[day, 3], day))
                predicted[day, 3] = 0.
            elif predicted[day, 3] > max(precip[day]) or predicted[day, 3] < min(precip[day]):
                if config['verbose']:
                    print('predict: warning: overriding MOS-X prediction of %0.2f on day %s with model mean' %
                          (predicted[day, 3], day))
                predicted[day, 3] = max(0., np.mean(precip[day] + [predicted[day, 3]]))
    else:
        # At least make sure we aren't predicting negative values...
        predicted[:, 3][predicted[:, 3] < 0] = 0.0

    # Round off daily values, if selected
    if round_result:
        predicted[:, :3] = np.round(predicted[:, :3])
        predicted[:, 3] = np.round(predicted[:, 3], 2)

    # If probabilities are requested and available, get the results from each tree
    if ensemble:
        num_samples = predictors.shape[0]
        if not hasattr(estimator, 'named_steps'):
            forest = estimator
        else:
            imputer = estimator.named_steps['imputer']
            forest = estimator.named_steps['regressor']
            predictors = imputer.transform(predictors)
        # If we generated our own ensemble by bootstrapping, it must be treated as such
        if config['Model']['train_individual'] and config['Model'].get('Bootstrapping', None) is None:
            num_trees = len(forest.estimators_[0].estimators_)
            all_predicted = np.zeros((num_samples, 4, num_trees))
            for v in range(4):
                for t in range(num_trees):
                    try:
                        all_predicted[:, v, t] = forest.estimators_[v].estimators_[t].predict(predictors)
                    except AttributeError:
                        # Work around the 2-D array of estimators for GBTrees
                        all_predicted[:, v, t] = forest.estimators_[v].estimators_[t][0].predict(predictors)
        else:
            num_trees = len(forest.estimators_)
            all_predicted = np.zeros((num_samples, 4, num_trees))
            for t in range(num_trees):
                try:
                    all_predicted[:, :, t] = forest.estimators_[t].predict(predictors)[:, :4]
                except AttributeError:
                    # Work around the 2-D array of estimators for GBTrees
                    all_predicted[:, :, t] = forest.estimators_[t][0].predict(predictors)[:, :4]
    else:
        all_predicted = None

    if config['Model']['predict_timeseries']:
        if time_series_date is None:
            date_now = datetime.utcnow()
            time_series_date = datetime(date_now.year, date_now.month, date_now.day) + timedelta(days=1)
            print('predict: warning: set time series start date to %s (was unspecified)' % time_series_date)
        num_hours = int(24 / config['time_series_interval']) + 1
        predicted_array = predicted[-1, 4:].reshape((4, num_hours)).T
        # Get dewpoint
        predicted_array[:, 2] = dewpoint(predicted_array[:, 0], predicted_array[:, 2])
        times = pd.date_range(time_series_date.replace(hour=6), periods=num_hours,
                              freq='%dH' % config['time_series_interval']).to_pydatetime().tolist()
        variables = ['temperature', 'rain', 'dewpoint', 'windSpeed']
        round_dict = {'temperature': 0, 'rain': 2, 'dewpoint': 0, 'windSpeed': 0}
        predicted_timeseries = pd.DataFrame(predicted_array, index=times, columns=variables)
        predicted_timeseries = predicted_timeseries.round(round_dict)
    else:
        predicted_timeseries = None

    return predicted, all_predicted, predicted_timeseries


def predict(config, predictor_file, naive_rain_correction=False, round=False, **kwargs):
    """
    Predict forecasts from the estimator in config. Only returns daily values.

    :param config:
    :param predictor_file: str: file containing predictor data from mosx.model.format_predictors
    :param naive_rain_correction: bool: if True, applies manual tuning to the rain forecast
    :param round: bool: if True, rounds the predicted estimate
    :param kwargs: passed to the estimator's 'predict' method
    :return:
    """

    predicted, all_predicted, predicted_timeseries = predict_all(config, predictor_file,
                                                                 naive_rain_correction=naive_rain_correction,
                                                                 round_result=round, **kwargs)
    return predicted


def predict_rain_proba(config, predictor_file):
    """
    Predict probabilistic rain forecasts for 'pop' or 'categorical' types.

    :param config:
    :param predictor_file: str: file containing predictor data from mosx.model.format_predictors
    :return:
    """
    if config['Model']['rain_forecast_type'] not in ['pop', 'categorical']:
        raise TypeError("'quantity' rain forecast is not probabilistic, cannot get probabilities")
    rain_tuning = config['Model'].get('Rain tuning', None)
    if rain_tuning is None:
        raise TypeError('Probabilistic rain forecasts are only possible with a RainTuningEstimator')

    # Load the predictor data and estimator
    with open(predictor_file, 'rb') as handle:
        predictor_data = pickle.load(handle)
    if config['verbose']:
        print('predict: loading estimator %s' % config['Model']['estimator_file'])
    with open(config['Model']['estimator_file'], 'rb') as handle:
        estimator = pickle.load(handle)

    predictors = np.concatenate((predictor_data['BUFKIT'], predictor_data['OBS']), axis=1)
    if to_bool(rain_tuning.get('use_raw_rain', False)):
        rain_proba = estimator.predict_rain_proba(predictors, rain_array=predictor_data.rain)
    else:
        rain_proba = estimator.predict_rain_proba(predictors)

    return rain_proba