# Python source code of investigate

# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import textwrap
import warnings
from types import MethodType
import numpy
from numpy.testing import assert_almost_equal
from scipy.sparse.csr import csr_matrix
from sklearn.base import TransformerMixin, ClassifierMixin
from sklearn.base import RegressorMixin, BaseEstimator
from sklearn.pipeline import Pipeline, FeatureUnion
try:
    from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
except ImportError:
    # not available in 0.19
    ColumnTransformer = None
    TransformedTargetRegressor = None


def enumerate_pipeline_models(pipe, coor=None, vs=None):
    """
    Walks through a pipeline and yields every model found in it,
    starting with *pipe* itself.

    :param pipe: *scikit-learn* model or pipeline, or an object exposing
        attribute ``mapper`` or ``built_features``
    :param coor: coordinates of *pipe*, a tuple of indices,
        ``(0,)`` is used if it is None
    :param vs: columns attached to *pipe*, yielded as is
    :return: generator of tuples *(coordinates, model, columns)*,
        the coordinates of a nested model are the coordinates
        of its container extended with its position in this container

    The function raises *NotImplementedError* for a non empty
    ``transformer_and_mapper_list`` and for *TransformedTargetRegressor*,
    *TypeError* if an object is none of the supported kinds.
    """
    if coor is None:
        coor = (0,)
    yield coor, pipe, vs

    if hasattr(pipe, 'transformer_and_mapper_list') and len(
            pipe.transformer_and_mapper_list):
        # azureml DataTransformer
        raise NotImplementedError("Unable to handle this specific case.")

    if hasattr(pipe, 'mapper') and pipe.mapper:
        # azureml DataTransformer
        yield from enumerate_pipeline_models(pipe.mapper, coor + (0,))
        return

    if hasattr(pipe, 'built_features'):
        # sklearn_pandas.dataframe_mapper.DataFrameMapper
        for index, (columns, transformers, _) in enumerate(
                pipe.built_features):
            position = coor + (index,)
            if isinstance(columns, str):
                columns = (columns,)
            if transformers is None:
                # No model for these columns, only the columns are reported.
                yield position, None, columns
            else:
                yield from enumerate_pipeline_models(
                    transformers, position, columns)
        return

    # Containers only differ by the attribute holding the nested models
    # and by the presence of columns: they are all mapped to a lazy
    # sequence of couples (model, columns).
    if isinstance(pipe, Pipeline):
        children = ((model, None) for _, model in pipe.steps)
    elif ColumnTransformer is not None and isinstance(pipe, ColumnTransformer):
        children = ((model, column) for _, model, column in pipe.transformers)
    elif isinstance(pipe, FeatureUnion):
        children = ((model, None) for _, model in pipe.transformer_list)
    else:
        children = None

    if children is not None:
        for index, (model, columns) in enumerate(children):
            yield from enumerate_pipeline_models(
                model, coor + (index,), columns)
        return

    if TransformedTargetRegressor is not None and isinstance(
            pipe, TransformedTargetRegressor):
        raise NotImplementedError(
            "Not yet implemented for TransformedTargetRegressor.")

    # Any other scikit-learn object is a leaf, it was already yielded.
    leaf_types = (TransformerMixin, ClassifierMixin, RegressorMixin,
                  BaseEstimator)
    if not isinstance(pipe, leaf_types):
        raise TypeError(
            "Parameter pipe is not a scikit-learn object: {}\n{}".format(
                type(pipe), pipe))


class BaseEstimatorDebugInformation:
    """
    Stores the inputs and outputs seen by the methods of a model
    while the outputs of a pipeline are computed. It is added by function
    :func:`_alter_model_for_debugging`.

    The constructor modifies the model: every supported method
    (see ``_DEBUGGED_METHODS``) the model implements is saved on the
    model itself under attribute ``_debug_<method name>``.
    Attribute *methods* maps a method name to a function
    ``f(model, X, *args, **kwargs)`` calling the saved method,
    attributes *inputs* and *outputs* are dictionaries indexed by
    method name, both are empty after the constructor.
    """

    # Methods which can be observed, in the order they are registered.
    _DEBUGGED_METHODS = ("transform", "predict", "predict_proba",
                         "decision_function")

    def __init__(self, model):
        """
        :param model: model to observe, it receives one new attribute
            ``_debug_<method name>`` per supported method
        """
        self.model = model
        self.inputs = {}
        self.outputs = {}
        self.methods = {}
        for name in self._DEBUGGED_METHODS:
            method = getattr(model, name, None)
            if not callable(method):
                continue
            backup = "_debug_" + name
            setattr(model, backup, method)
            self.methods[name] = self._make_caller(backup)

    @staticmethod
    def _make_caller(backup):
        """
        Returns a function calling the method saved on the model under
        attribute *backup*. A dedicated function is needed to bind
        *backup* for each method, a lambda created in a loop would
        always see the last name.
        """
        def call(model, X, *args, **kwargs):
            # Additional arguments are forwarded: the wrappers installed
            # by _alter_model_for_debugging pass them along.
            return getattr(model, backup)(X, *args, **kwargs)
        return call

    def __repr__(self):
        """
        usual
        """
        return self.to_str()

    def to_str(self, nrows=5):
        """
        Tries to produce a readable message.

        :param nrows: maximum number of text lines displayed
            for every input or output
        :return: string

        The function raises *KeyError* if an input was stored
        for a method without any output.
        """
        rows = ['BaseEstimatorDebugInformation({})'.format(
            self.model.__class__.__name__)]
        for k in sorted(self.inputs):
            if k not in self.outputs:
                raise KeyError(
                    "Unable to find output for method '{}'.".format(k))
            rows.append('  ' + k + '(')
            rows.append(textwrap.indent(
                self.display(self.inputs[k], nrows), '   '))
            rows.append('  ) -> (')
            rows.append(textwrap.indent(
                self.display(self.outputs[k], nrows), '   '))
            rows.append('  )')
        return "\n".join(rows)

    def display(self, data, nrows):
        """
        Displays the first *nrows* lines of the string representation
        of *data*, followed by ``...`` if the representation is longer.
        The shape is added as a first line if *data* has attribute *shape*.

        :param data: any object
        :param nrows: maximum number of lines to keep
        :return: string
        """
        text = str(data)
        rows = text.split('\n')
        if len(rows) > nrows:
            rows = rows[:nrows]
            rows.append('...')
        if hasattr(data, 'shape'):
            rows.insert(0, "shape={}".format(data.shape))
        return "\n".join(rows)


def _alter_model_for_debugging(skl_model, recursive=False):
    """
    Overwrite methods transform, predict or predict_proba
    to collect the last inputs and outputs
    seen in these methods.

    :param skl_model: *scikit-learn* pipeline or model
    :param recursive: alter the current model (False) or git into
        contained models
    """

    def transform(self, X, *args, **kwargs):
        self._debug.inputs['transform'] = X
        y = self._debug.methods['transform'](self, X, *args, **kwargs)
        self._debug.outputs['transform'] = y
        return y

    def predict(self, X, *args, **kwargs):
        self._debug.inputs['predict'] = X
        y = self._debug.methods['predict'](self, X, *args, **kwargs)
        self._debug.outputs['predict'] = y
        return y

    def predict_proba(self, X, *args, **kwargs):
        self._debug.inputs['predict_proba'] = X
        y = self._debug.methods['predict_proba'](self, X, *args, **kwargs)
        self._debug.outputs['predict_proba'] = y
        return y

    def decision_function(self, X, *args, **kwargs):
        self._debug.inputs['decision_function'] = X
        y = self._debug.methods['decision_function'](self, X, *args, **kwargs)
        self._debug.outputs['decision_function'] = y
        return y

    new_methods = {
        'decision_function': decision_function,
        'transform': transform,
        'predict': predict,
        'predict_proba': predict_proba,
    }

    if hasattr(skl_model, '_debug'):
        raise RuntimeError("The same operator cannot be used twice in "
                           "the same pipeline or this method was called "
                           "a second time.")

    if recursive:
        for model_ in enumerate_pipeline_models(skl_model):
            model = model_[1]
            model._debug = BaseEstimatorDebugInformation(model)
            for k in model._debug.methods:
                try:
                    setattr(model, k, MethodType(new_methods[k], model))
                except AttributeError:
                    warnings.warn("Unable to overwrite method '{}' for class "
                                  "{}.".format(k, type(model)))
    else:
        skl_model._debug = BaseEstimatorDebugInformation(skl_model)
        for k in skl_model._debug.methods:
            try:
                setattr(skl_model, k, MethodType(new_methods[k], skl_model))
            except AttributeError:
                warnings.warn("Unable to overwrite method '{}' for class "
                              "{}.".format(k, type(skl_model)))


def collect_intermediate_steps(model, *args, **kwargs):
    """
    Converts a scikit-learn model into ONNX with :func:`convert_sklearn`
    and returns intermediate results for each included operator.

    :param model: model or pipeline to convert
    :param args: arguments for :func:`convert_sklearn`
    :param kwargs: optional arguments for :func:`convert_sklearn`,
        parameter *intermediate* is always set to True,
        *ValueError* is raised if it is specified with a false value
    :return: list of dictionaries, one per operator attached to a model,
        with keys ``'model'``, ``'model_onnx'``, ``'inputs'``,
        ``'outputs'``, ``'onnx_step'``

    The model *model* is modified by the function,
    it should be pickled first to be retrieved unaltered.
    This function is used to check every intermediate model in
    a pipeline.
    """
    # The parameter is removed from kwargs because it is explicitly
    # given to convert_sklearn below.
    if not kwargs.pop('intermediate', True):
        raise ValueError("Parameter intermediate must be true.")

    from .. import convert_sklearn
    from ..helpers.onnx_helper import select_model_inputs_outputs
    from ..common import MissingShapeCalculator, MissingConverter
    try:
        model_onnx, topology = convert_sklearn(
            model, *args, intermediate=True, **kwargs)
    except (MissingShapeCalculator, MissingConverter):
        # The model cannot be converted, the exception is not handled here.
        raise

    steps = []
    for operator in topology.topological_operator_iterator():
        raw_model = operator.raw_operator
        if raw_model is None:
            # Operator not attached to any model.
            continue
        _alter_model_for_debugging(raw_model)
        input_names = [i.full_name for i in operator.inputs]
        output_names = [o.full_name for o in operator.outputs]
        onnx_step = select_model_inputs_outputs(
            model_onnx, outputs=output_names)
        steps.append({
            'model': raw_model,
            'model_onnx': model_onnx,
            'inputs': input_names,
            'outputs': output_names,
            'onnx_step': onnx_step,
        })
    return steps


def compare_objects(o1, o2, decimal=4):
    """
    Compares two objects assuming they are vectors or matrices.
    *o1* and *o2* can be a numpy array, a sparse matrix (*csr_matrix*),
    a series, a list or a tuple. The function raises an exception
    if it cannot convert both objects into the same type or
    the comparison fails.

    :param o1: a series, an array, a sparse matrix, a list, a tuple
    :param o2: a series, an array, a sparse matrix, a list, a tuple,
        a list of dictionaries is converted into a matrix,
        every key being used as a column index
    :param decimal: parameter decimal for assert_almost_equal

    The function raises *TypeError* if a type is not supported
    or both objects cannot be compared, *ValueError* if they are
    different. It returns nothing if they are equal.
    """

    def convert(o):
        # A list with a single array is replaced by the array itself
        # or by the list of its elements for strings and objects.
        if isinstance(o, list) and len(o) == 1 and isinstance(
                o[0], numpy.ndarray):
            # dtype.kind is used instead of alias numpy.str
            # which was removed in numpy 1.24.
            if o[0].dtype.kind in ('U', 'O'):
                o = list(o[0])
            else:
                o = o[0]
        # Following line avoid importing pandas and taking
        # dependency on pandas.
        if o.__class__.__name__ == "Series":
            return list(o)
        if isinstance(o, numpy.ndarray):
            return o
        if isinstance(o, csr_matrix):
            return o.todense()
        if isinstance(o, list):
            return o.copy()
        if isinstance(o, tuple):
            return list(o)
        raise TypeError("Unexpected type {}.".format(type(o)))

    def to_string(c):
        # Truncated representation used in the error message.
        s = str(c)
        if len(s) > 200:
            s = s[:200] + "..."
        return s

    c1 = convert(o1)
    c2 = convert(o2)
    reason = None
    # The length is checked first, an empty list has no first element.
    if isinstance(c2, list) and len(c2) > 0 and isinstance(c2[0], dict):
        # NOTE(review): keys are used as column indices, this assumes
        # integer keys smaller than the size of the biggest dictionary.
        res = numpy.zeros((len(c2), max(len(row) for row in c2)))
        for i, row in enumerate(c2):
            for k, v in row.items():
                res[i, k] = v
        c2 = res
    if isinstance(c1, numpy.ndarray) and isinstance(c2, list):
        c1 = list(c1.ravel())
    if isinstance(c1, list) and isinstance(c2, list):
        try:
            res = c1 == c2
            reason = 'list-equal'
        except ValueError:
            # Raised when the elements are arrays: the truth value
            # of their comparison is ambiguous.
            res = False
            reason = 'list'
    elif isinstance(c1, numpy.ndarray) and isinstance(c2, numpy.ndarray):
        try:
            assert_almost_equal(c1, c2, decimal=decimal)
            res = True
        except (AssertionError, TypeError):
            # Second chance: the values are compared
            # once both arrays are flattened.
            reason = 'array'
            cc1 = c1.ravel()
            cc2 = c2.ravel()
            try:
                assert_almost_equal(cc1, cc2, decimal=decimal)
                res = True
            except (AssertionError, TypeError) as e:
                res = False
                reason = 'array-ravel' + str(e)
    else:
        raise TypeError("Types {} and {}".format(type(c1), type(c2)))
    if not res:
        msg = "o1 and o2 are different ({})\n---o1---\n{}\n---o2---\n{}"
        raise ValueError(msg.format(reason, to_string(c1), to_string(c2)))