from collections import OrderedDict
import math

from auto_ml import utils
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, make_scorer, brier_score_loss, accuracy_score, explained_variance_score, mean_absolute_error, median_absolute_error, r2_score, log_loss, roc_auc_score
import numpy as np
from tabulate import tabulate


bad_vals_as_strings = set([str(float('nan')), str(float('inf')), str(float('-inf')), 'None', 'none', 'NaN', 'NAN', 'nan', 'NULL', 'null', '', 'inf', '-inf', 'np.nan', 'numpy.nan'])


def advanced_scoring_classifiers(probas, actuals, name=None):
    # pandas Series don't play nicely here. Make sure our actuals list is indeed a list
    actuals = list(actuals)
    predictions = list(probas)

    print('Here is our brier-score-loss, which is the default value we optimized for while training, and is the value returned from .score() unless you requested a custom scoring metric')
    print('It is a measure of how close the PROBABILITY predictions are.')
    if name is not None:
        print(name)

    # Sometimes we are given "flattened" probabilities (only the probability of the positive label), while other
    # times we are given "nested" probabilities (probabilities of both the negative and positive label, in a list, for each item).
    try:
        probas = [proba[1] for proba in probas]
    except:
        pass

    brier_score = brier_score_loss(actuals, probas)
    print(format(brier_score, '.4f'))

    print('\nHere is the trained estimator\'s overall accuracy (when it predicts a label, how frequently is that the correct label?)')
    predicted_labels = []
    for pred in probas:
        if pred >= 0.5:
            predicted_labels.append(1)
        else:
            predicted_labels.append(0)
    print(format(accuracy_score(y_true=actuals, y_pred=predicted_labels) * 100, '.1f') + '%')

    print('\nHere is a confusion matrix showing predictions vs. actuals by label:')
    # It would make sense to use sklearn's confusion_matrix here, but it does not label the rows and columns.
    # Took this idea instead from: http://stats.stackexchange.com/a/109015
    conf = pd.crosstab(pd.Series(actuals), pd.Series(predicted_labels), rownames=['v Actual v'], colnames=['Predicted >'], margins=True)
    print(conf)

    # I like knowing the per-class accuracy to see if the model is mishandling imbalanced data.
    # For example, if it is predicting 100% of observations to one class just because it is the majority class.
    # Wikipedia seems to call this the positive/negative predictive value.
    print('\nHere is the predictive value by class:')
    df = pd.concat([pd.Series(actuals, name='actuals'), pd.Series(predicted_labels, name='predicted')], axis=1)
    targets = list(df.predicted.unique())
    for i in range(0, len(targets)):
        tot_count = len(df[df.predicted == targets[i]])
        true_count = len(df[(df.predicted == targets[i]) & (df.actuals == targets[i])])
        print('Class: ', targets[i], '=', float(true_count) / tot_count)
    # qcut is super fickle, so try to use 10 buckets first, then 5 if that fails, then skip the bucketing entirely
    try:
        try:
            bucket_results = pd.qcut(probas, q=10, duplicates='drop')
        except:
            bucket_results = pd.qcut(probas, q=5, duplicates='drop')

        df_probas = pd.DataFrame(probas, columns=['Predicted Probability Of Bucket'])
        df_probas['Actual Probability of Bucket'] = actuals
        df_probas['Bucket Edges'] = bucket_results

        df_buckets = df_probas.groupby(df_probas['Bucket Edges'])
        try:
            print(tabulate(df_buckets.mean(), headers='keys', floatfmt='.4f', tablefmt='psql', showindex='always'))
        except TypeError:
            # Older versions of tabulate don't accept the showindex keyword
            print(tabulate(df_buckets.mean(), headers='keys', floatfmt='.4f', tablefmt='psql'))

        print('\nHere is the accuracy of our trained estimator at each level of predicted probabilities')
        print('For a verbose description of what this means, please visit the docs:')
        print('http://auto-ml.readthedocs.io/en/latest/analytics.html#interpreting-predicted-probability-buckets-for-classifiers')
    except:
        pass

    print('\n\n')
    return brier_score


def calculate_and_print_differences(predictions, actuals, name=None):
    pos_differences = []
    neg_differences = []
    # Technically, we're ignoring cases where we are spot on
    for idx, pred in enumerate(predictions):
        difference = pred - actuals[idx]
        if difference > 0:
            pos_differences.append(difference)
        elif difference < 0:
            neg_differences.append(difference)

    if name is not None:
        print(name)
    print('Count of positive differences (prediction > actual):')
    print(len(pos_differences))
    print('Count of negative differences:')
    print(len(neg_differences))
    if len(pos_differences) > 0:
        print('Average positive difference:')
        print(sum(pos_differences) * 1.0 / len(pos_differences))
    if len(neg_differences) > 0:
        print('Average negative difference:')
        print(sum(neg_differences) * 1.0 / len(neg_differences))


def advanced_scoring_regressors(predictions, actuals, verbose=2, name=None):
    # pandas Series don't play nicely here. Make sure our actuals list is indeed a list
    actuals = list(actuals)
    predictions = list(predictions)

    print('\n\n***********************************************')
    if name is not None:
        print(name)
    print('Advanced scoring metrics for the trained regression model on this particular dataset:\n')

    # 1. overall RMSE
    print('Here is the overall RMSE for these predictions:')
    rmse = mean_squared_error(actuals, predictions)**0.5
    print(rmse)

    # 2. overall average of the predictions
    print('\nHere is the average of the predictions:')
    print(sum(predictions) * 1.0 / len(predictions))

    # 3. overall average of the actuals
    print('\nHere is the average actual value on this validation set:')
    print(sum(actuals) * 1.0 / len(actuals))

    # 2(a). median prediction
    print('\nHere is the median prediction:')
    print(np.median(predictions))

    # 3(a). median actual
    print('\nHere is the median actual value:')
    print(np.median(actuals))

    # 4. average differences (not RMSE)
    print('\nHere is the mean absolute error:')
    print(mean_absolute_error(actuals, predictions))

    print('\nHere is the median absolute error (robust to outliers):')
    print(median_absolute_error(actuals, predictions))

    print('\nHere is the explained variance:')
    print(explained_variance_score(actuals, predictions))

    print('\nHere is the R-squared value:')
    print(r2_score(actuals, predictions))
    # 5. positive and negative differences
    calculate_and_print_differences(predictions=predictions, actuals=actuals, name=name)

    actuals_preds = list(zip(actuals, predictions))
    # Sort by PREDICTED value, since this is what we will know at the time we make a prediction
    actuals_preds.sort(key=lambda pair: pair[1])
    actuals_sorted = [act for act, pred in actuals_preds]
    predictions_sorted = [pred for act, pred in actuals_preds]

    if verbose > 2:
        print('Here\'s how the trained predictor did on each successive decile (ten percent chunk) of the predictions:')
        for i in range(1, 11):
            print('\n**************')
            print('Bucket number:')
            print(i)
            # There's probably some fenceposting error here
            min_idx = int((i - 1) / 10.0 * len(actuals_sorted))
            max_idx = int(i / 10.0 * len(actuals_sorted))
            actuals_for_this_decile = actuals_sorted[min_idx:max_idx]
            predictions_for_this_decile = predictions_sorted[min_idx:max_idx]

            print('Avg predicted val in this bucket')
            print(sum(predictions_for_this_decile) * 1.0 / len(predictions_for_this_decile))
            print('Avg actual val in this bucket')
            print(sum(actuals_for_this_decile) * 1.0 / len(actuals_for_this_decile))
            print('RMSE for this bucket')
            print(mean_squared_error(actuals_for_this_decile, predictions_for_this_decile)**0.5)
            calculate_and_print_differences(predictions_for_this_decile, actuals_for_this_decile)

    print('')
    print('\n***********************************************\n\n')
    return rmse


def rmse_func(y, predictions):
    return mean_squared_error(y, predictions)**0.5


scoring_name_function_map = {
    'rmse': rmse_func,
    'median_absolute_error': median_absolute_error,
    'r2': r2_score,
    'r-squared': r2_score,
    'mean_absolute_error': mean_absolute_error,
    'accuracy': accuracy_score,
    'accuracy_score': accuracy_score,
    'log_loss': log_loss,
    'roc_auc': roc_auc_score,
    'brier_score_loss': brier_score_loss,
}


class RegressionScorer(object):

    def __init__(self, scoring_method=None):
        if scoring_method is None:
            scoring_method = 'rmse'
        self.scoring_method = scoring_method

        if callable(scoring_method):
            self.scoring_func = scoring_method
        else:
            self.scoring_func = scoring_name_function_map[scoring_method]

    def get(self, prop_name, default=None):
        # dict-style attribute access, with a default if the attribute is missing
        try:
            return getattr(self, prop_name)
        except AttributeError:
            return default

    def score(self, estimator, X, y, took_log_of_y=False, advanced_scoring=False, verbose=2, name=None):
        X, y = utils.drop_missing_y_vals(X, y, output_column=None)
        if isinstance(estimator, GradientBoostingRegressor):
            X = X.toarray()

        predictions = estimator.predict(X)

        if took_log_of_y:
            for idx, val in enumerate(predictions):
                predictions[idx] = math.exp(val)

        try:
            score = self.scoring_func(y, predictions)
        except ValueError:
            # Filter out rows where either the prediction or the actual is null/infinite, then score the rest
            bad_val_indices = []
            for idx, val in enumerate(y):
                if str(val) in bad_vals_as_strings or str(predictions[idx]) in bad_vals_as_strings:
                    bad_val_indices.append(idx)

            predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
            y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]
            print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the predicted or y values. We will ignore these, and report the score on the rest of the dataset')
            score = self.scoring_func(y, predictions)

        if advanced_scoring:
            if hasattr(estimator, 'name'):
                print(estimator.name)
            advanced_scoring_regressors(predictions, y, verbose=verbose, name=name)

        return -1 * score


class ClassificationScorer(object):

    def __init__(self, scoring_method=None):
        if scoring_method is None:
            scoring_method = 'brier_score_loss'
        self.scoring_method = scoring_method

        if callable(scoring_method):
            self.scoring_func = scoring_method
        else:
            self.scoring_func = scoring_name_function_map[scoring_method]

    def get(self, prop_name, default=None):
        # dict-style attribute access, with a default if the attribute is missing
        try:
            return getattr(self, prop_name)
        except AttributeError:
            return default

    def clean_probas(self, probas):
        print('Warning: We have found some values in the predicted probabilities that fall outside the range [0, 1]')
        print('This is likely the result of a model being trained on too little data, or with a bad set of hyperparameters. If you get this warning while doing a hyperparameter search, for instance, you can probably safely ignore it')
        print('We will cap those values at 0 or 1 for the purposes of scoring, but you should be careful to have similar safeguards in place in prod if you use this model')
        if not isinstance(probas[0], list):
            probas = [val if str(val) not in bad_vals_as_strings else 0 for val in probas]
            probas = [min(max(pred, 0), 1) for pred in probas]
            return probas
        else:
            cleaned_probas = []
            for proba_tuple in probas:
                cleaned_tuple = []
                for item in proba_tuple:
                    if str(item) in bad_vals_as_strings:
                        item = 0
                    cleaned_tuple.append(max(min(item, 1), 0))
                cleaned_probas.append(cleaned_tuple)
            return cleaned_probas

    def score(self, estimator, X, y, advanced_scoring=False):
        X, y = utils.drop_missing_y_vals(X, y, output_column=None)
        if isinstance(estimator, GradientBoostingClassifier):
            X = X.toarray()

        predictions = estimator.predict_proba(X)

        if self.scoring_method == 'brier_score_loss':
            # At the moment, Microsoft's LightGBM can return probabilities > 1 and < 0, which can break some scoring
            # functions. So we take the min of 1 and the pred, and the max of 0 and the pred.
            probas = [max(min(row[1], 1), 0) for row in predictions]
            predictions = probas

        try:
            score = self.scoring_func(y, predictions)
        except ValueError:
            bad_val_indices = []
            for idx, val in enumerate(y):
                if str(val) in bad_vals_as_strings:
                    bad_val_indices.append(idx)

            predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
            y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]

            print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the y values. We will ignore these, and report the score on the rest of the dataset')
            try:
                score = self.scoring_func(y, predictions)
            except ValueError:
                # Sometimes, particularly for a badly fit model using either too little data or a really bad set of
                # hyperparameters during a grid search, we can predict probas that are > 1 or < 0. We'll cap those here,
                # while warning the user, because they're unlikely to occur in a model that's properly trained with
                # enough data and reasonable params.
                predictions = self.clean_probas(predictions)
                score = self.scoring_func(y, predictions)

        if advanced_scoring:
            return (-1 * score, predictions)
        else:
            return -1 * score
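

# A minimal usage sketch (not part of the original module): it shows how the two standalone reporting
# helpers above can be called directly on toy data. The values below are made up purely for illustration;
# inside auto_ml these functions are normally invoked by RegressionScorer / ClassificationScorer with a
# trained estimator's predictions on a validation set.
if __name__ == '__main__':
    # Classification: predicted probabilities of the positive class vs. 0/1 actuals
    toy_probas = [0.05, 0.10, 0.20, 0.35, 0.62, 0.75, 0.88, 0.90]
    toy_labels = [0, 0, 0, 1, 1, 1, 1, 1]
    advanced_scoring_classifiers(toy_probas, toy_labels, name='toy classifier')

    # Regression: continuous predictions vs. continuous actuals (verbose=2 skips the per-decile breakdown)
    toy_preds = [1.2, 2.9, 4.1, 5.8, 7.3, 9.0]
    toy_actuals = [1.0, 3.0, 4.0, 6.0, 7.0, 9.5]
    advanced_scoring_regressors(toy_preds, toy_actuals, verbose=2, name='toy regressor')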