# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------

import numpy as np

from sklearn import pipeline
from sklearn.base import (
    ClassifierMixin, ClusterMixin, is_classifier
)
try:
    from sklearn.base import OutlierMixin
except ImportError:
    # scikit-learn <= 0.19
    class OutlierMixin:
        pass

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, NuSVC, SVC
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    # ColumnTransformer was introduced in 0.20.
    ColumnTransformer = None

from ._supported_operators import (
    _get_sklearn_operator_name, cluster_list, outlier_list
)
from ._supported_operators import (
    sklearn_classifier_list, sklearn_operator_name_map
)
from .common._container import SklearnModelContainerNode
from .common._registration import _converter_pool, _shape_calculator_pool
from .common._topology import Topology
from .common.data_types import DictionaryType
from .common.data_types import Int64TensorType, SequenceType
from .common.data_types import StringTensorType, TensorType
from .common.utils import get_column_indices
from .common.utils_checking import check_signature
from .common.utils_classifier import get_label_classes

# Transformer classes for which the ColumnTransformer parser keeps the
# selected columns as separate inputs instead of concatenating them.
# ColumnTransformer is None when scikit-learn does not provide it and is
# then left out of the tuple.
do_not_merge_columns = tuple(
    op for op in (OneHotEncoder, ColumnTransformer) if op is not None)

def _fetch_input_slice(scope, inputs, column_indices):
    """
    Selects the columns *column_indices* of a single input variable.

    :param scope: Scope object used to declare the extraction operator
        and its output variable
    :param inputs: A list holding exactly one variable
    :param column_indices: indices of the columns to keep
    :return: *inputs* unchanged when the variable is a 2D tensor whose
        second dimension already equals ``len(column_indices)``,
        otherwise the outputs of the newly declared extraction operator
    :raises TypeError: if *inputs* is not a list
    :raises RuntimeError: if *inputs* does not hold exactly one variable
    """
    if not isinstance(inputs, list):
        raise TypeError("Parameter inputs must be a list.")
    if len(inputs) == 0:
        raise RuntimeError("Operator ArrayFeatureExtractor requires at "
                           "least one inputs.")
    if len(inputs) != 1:
        raise RuntimeError("Operator ArrayFeatureExtractor does not support "
                           "multiple input tensors.")
    if (isinstance(inputs[0].type, TensorType) and
            len(inputs[0].type.shape) == 2 and
            inputs[0].type.shape[1] == len(column_indices)):
        # No need to extract.
        return inputs
    # NOTE(review): the operator alias was missing from this call; it is
    # restored as 'SklearnArrayFeatureExtractor' to match the error
    # messages above and the other 'Sklearn*' aliases used in this file.
    # Confirm against the registered converters.
    array_feature_extractor_operator = scope.declare_local_operator(
        'SklearnArrayFeatureExtractor')
    array_feature_extractor_operator.inputs = inputs
    array_feature_extractor_operator.column_indices = column_indices
    output_variable_name = scope.declare_local_variable(
        'extracted_feature_columns', inputs[0].type)
    # Without this the operator would be returned with no output attached.
    array_feature_extractor_operator.outputs.append(output_variable_name)
    return array_feature_extractor_operator.outputs

def _parse_sklearn_simple_model(scope, model, inputs, custom_parsers=None):
    """
    This function handles all non-pipeline models.

    :param scope: Scope object
    :param model: A scikit-learn object (e.g., *OneHotEncoder*
        or *LogisticRegression*)
    :param inputs: A list of variables
    :param custom_parsers: not used here, accepted so that this function
        has the same signature as the other parsers
    :return: A list of output variables which will be passed to next
        stage
    :raises RuntimeError: if *model* is a string
    """
    # alias can be None
    if isinstance(model, str):
        raise RuntimeError("Parameter model must be an object not a "
                           "string '{0}'.".format(model))
    alias = _get_sklearn_operator_name(type(model))
    this_operator = scope.declare_local_operator(alias, model)
    this_operator.inputs = inputs

    if hasattr(model, 'onnx_parser'):
        # The model describes its own outputs: onnx_parser returns either
        # None or a callable producing the output names.
        parser_names = model.onnx_parser(inputs=inputs)
        if parser_names is not None:
            names = parser_names()
            for name in names:
                var = scope.declare_local_variable(name, scope.tensor_type())
                this_operator.outputs.append(var)
            return this_operator.outputs

    if (type(model) in sklearn_classifier_list
            or isinstance(model, ClassifierMixin)
            or (isinstance(model, GridSearchCV)
                and is_classifier(model))):
        # For classifiers, we may have two outputs, one for label and
        # the other one for probabilities of all classes. Notice that
        # their types here are not necessarily correct and they will
        # be fixed in shape inference phase
        label_variable = scope.declare_local_variable(
            'label', Int64TensorType())
        probability_tensor_variable = scope.declare_local_variable(
            'probabilities', scope.tensor_type())
        this_operator.outputs.append(label_variable)
        this_operator.outputs.append(probability_tensor_variable)

    elif type(model) in cluster_list or isinstance(model, ClusterMixin):
        # For clustering, we may have two outputs, one for label and
        # the other one for scores of all classes. Notice that their
        # types here are not necessarily correct and they will be fixed
        # in shape inference phase
        label_variable = scope.declare_local_variable(
            'label', Int64TensorType())
        score_tensor_variable = scope.declare_local_variable(
            'scores', scope.tensor_type())
        this_operator.outputs.append(label_variable)
        this_operator.outputs.append(score_tensor_variable)

    elif type(model) in outlier_list or isinstance(model, OutlierMixin):
        # For outlier detection, we may have two outputs, one for label
        # and the other one for scores.
        label_variable = scope.declare_local_variable(
            'label', Int64TensorType())
        score_tensor_variable = scope.declare_local_variable(
            'scores', scope.tensor_type())
        this_operator.outputs.append(label_variable)
        this_operator.outputs.append(score_tensor_variable)

    elif type(model) == NearestNeighbors:
        # For Nearest Neighbours, we have two outputs, one for nearest
        # neighbours' indices and the other one for distances
        # NOTE(review): the type arguments of both declarations were
        # missing; Int64TensorType for indices and scope.tensor_type()
        # for distances follow the other branches. Confirm.
        index_variable = scope.declare_local_variable(
            'index', Int64TensorType())
        distance_variable = scope.declare_local_variable(
            'distance', scope.tensor_type())
        this_operator.outputs.append(index_variable)
        this_operator.outputs.append(distance_variable)

    elif type(model) in {GaussianMixture, BayesianGaussianMixture}:
        # NOTE(review): type arguments restored by analogy with the
        # classifier branch. Confirm.
        label_variable = scope.declare_local_variable(
            'label', Int64TensorType())
        prob_variable = scope.declare_local_variable(
            'probabilities', scope.tensor_type())
        this_operator.outputs.append(label_variable)
        this_operator.outputs.append(prob_variable)
        # A third output is only declared when option score_samples is set.
        options = scope.get_options(model, dict(score_samples=False))
        if options['score_samples']:
            scores_var = scope.declare_local_variable(
                'score_samples', scope.tensor_type())
            this_operator.outputs.append(scores_var)

    else:
        # We assume that all scikit-learn operator produce a single output.
        variable = scope.declare_local_variable(
            'variable', scope.tensor_type())
        this_operator.outputs.append(variable)

    return this_operator.outputs

def _parse_sklearn_pipeline(scope, model, inputs, custom_parsers=None):
    """
    The basic ideas of scikit-learn parsing:
        1. Sequentially go though all stages defined in the considered
           scikit-learn pipeline
        2. The output variables of one stage will be fed into its next
           stage as the inputs.

    :param scope: Scope object defined in _topology.py
    :param model: scikit-learn pipeline object
    :param inputs: A list of Variable objects
    :param custom_parsers: forwarded to :func:`parse_sklearn` for every
        step
    :return: A list of output variables produced by the input pipeline,
        *inputs* itself if the pipeline has no step
    """
    for step in model.steps:
        # step is a pair (name, estimator); only the estimator is parsed.
        inputs = parse_sklearn(scope, step[1], inputs,
                               custom_parsers=custom_parsers)
    return inputs

def _parse_sklearn_feature_union(scope, model, inputs, custom_parsers=None):
    """
    :param scope: Scope object
    :param model: A scikit-learn FeatureUnion object
    :param inputs: A list of Variable objects
    :param custom_parsers: forwarded to the parser of every transformer
    :return: A list of output variables produced by feature union
    """
    # Output variable name of each transform. It's a list of string.
    transformed_result_names = []
    # Encode each transform as our IR object
    for name, transform in model.transformer_list:
        # Only the first output of each transformer is concatenated.
        # NOTE(review): the callee name was missing from this call; it is
        # restored as _parse_sklearn_simple_model. Confirm.
        transformed_result_names.append(
            _parse_sklearn_simple_model(
                scope, transform, inputs,
                custom_parsers=custom_parsers)[0])
        if (model.transformer_weights is not None and name in
                model.transformer_weights):
            # Replace the result just added with its weighted version.
            transform_result = [transformed_result_names.pop()]
            # Create a Multiply ONNX node
            multiply_operator = scope.declare_local_operator('SklearnMultiply')
            multiply_operator.inputs = transform_result
            multiply_operator.operand = model.transformer_weights[name]
            multiply_output = scope.declare_local_variable(
                'multiply_output', scope.tensor_type())
            multiply_operator.outputs.append(multiply_output)
            transformed_result_names.append(multiply_operator.outputs[0])

    # Create a Concat ONNX node
    concat_operator = scope.declare_local_operator('SklearnConcat')
    concat_operator.inputs = transformed_result_names

    # Declare output name of scikit-learn FeatureUnion
    union_name = scope.declare_local_variable('union', scope.tensor_type())
    concat_operator.outputs.append(union_name)

    return concat_operator.outputs

def _parse_sklearn_column_transformer(scope, model, inputs,
                                      custom_parsers=None):
    """
    :param scope: Scope object
    :param model: A *scikit-learn* *ColumnTransformer* object
    :param inputs: A list of Variable objects
    :param custom_parsers: forwarded to :func:`parse_sklearn` for every
        transformer
    :return: A list of output variables produced by column transformer
    :raises RuntimeError: if a transformer is given as a string other
        than ``"passthrough"`` or ``"drop"``
    """
    # Output variable name of each transform. It's a list of string.
    transformed_result_names = []
    # Encode each transform as our IR object
    for name, op, column_indices in model.transformers_:
        if op == 'drop':
            continue
        # Normalise the column selection into a list of indices or names.
        if isinstance(column_indices, slice):
            column_indices = list(range(
                column_indices.start
                if column_indices.start is not None else 0,
                column_indices.stop, column_indices.step
                if column_indices.step is not None else 1))
        elif isinstance(column_indices, (int, str)):
            column_indices = [column_indices]
        names = get_column_indices(column_indices, inputs, multiple=True)
        transform_inputs = []
        for onnx_var, onnx_is in names.items():
            tr_inputs = _fetch_input_slice(scope, [inputs[onnx_var]], onnx_is)
            transform_inputs.extend(tr_inputs)

        # Columns coming from several inputs are merged unless the
        # transformer (or the first step of a pipeline) is listed in
        # do_not_merge_columns.
        merged_cols = False
        if len(transform_inputs) > 1:
            if isinstance(op, Pipeline):
                if not isinstance(op.steps[0][1], do_not_merge_columns):
                    merged_cols = True
            elif not isinstance(op, do_not_merge_columns):
                merged_cols = True

        if merged_cols:
            # Many ONNX operators expect one input vector,
            # the default behaviour is to merge columns.
            ty = transform_inputs[0].type.__class__([None, None])

            conc_op = scope.declare_local_operator('SklearnConcat')
            conc_op.inputs = transform_inputs
            conc_names = scope.declare_local_variable('merged_columns', ty)
            conc_op.outputs.append(conc_names)
            transform_inputs = [conc_names]

        model_obj = model.named_transformers_[name]
        if isinstance(model_obj, str):
            if model_obj == "passthrough":
                var_out = transform_inputs[0]
            elif model_obj == "drop":
                var_out = None
            else:
                # NOTE(review): the end of this message was missing and
                # is restored from the visible fragments. Confirm wording.
                raise RuntimeError("Unknown operator alias "
                                   "'{0}'. These are specified in "
                                   "_supported_operators.py."
                                   "".format(model_obj))
        else:
            var_out = parse_sklearn(
                scope, model_obj,
                transform_inputs, custom_parsers=custom_parsers)[0]
            if (model.transformer_weights is not None and name in
                    model.transformer_weights):
                # Create a Multiply ONNX node
                multiply_operator = scope.declare_local_operator(
                    'SklearnMultiply')
                multiply_operator.inputs = [var_out]
                multiply_operator.operand = model.transformer_weights[name]
                var_out = scope.declare_local_variable(
                    'multiply_output', scope.tensor_type())
                multiply_operator.outputs.append(var_out)
        if var_out:
            transformed_result_names.append(var_out)

    # Create a Concat ONNX node
    if len(transformed_result_names) > 1:
        ty = transformed_result_names[0].type.__class__([None, None])
        concat_operator = scope.declare_local_operator('SklearnConcat')
        concat_operator.inputs = transformed_result_names

        # Declare output name of scikit-learn ColumnTransformer
        transformed_column_name = scope.declare_local_variable(
            'transformed_column', ty)
        concat_operator.outputs.append(transformed_column_name)
        return concat_operator.outputs
    # Zero or one result: no concatenation is needed.
    return transformed_result_names

def _parse_sklearn_grid_search_cv(scope, model, inputs, custom_parsers=None):
    """
    Parses a *GridSearchCV* object with the classifier parser if it
    wraps a classifier and with the simple model parser otherwise.

    :param scope: Scope object
    :param model: A scikit-learn *GridSearchCV* object
    :param inputs: A list of variables
    :param custom_parsers: forwarded to the selected parser
    :return: The output variables declared by the selected parser
    """
    # custom_parsers is forwarded on both paths; the classifier path used
    # to receive a hard-coded None.
    if is_classifier(model):
        return _parse_sklearn_classifier(
            scope, model, inputs, custom_parsers=custom_parsers)
    return _parse_sklearn_simple_model(
        scope, model, inputs, custom_parsers=custom_parsers)

def _parse_sklearn_classifier(scope, model, inputs, custom_parsers=None):
    """
    Parses a classifier and, unless disabled, appends a *SklearnZipMap*
    operator after it.

    :param scope: Scope object
    :param model: A scikit-learn classifier
    :param inputs: A list of variables
    :param custom_parsers: forwarded to the simple model parser
    :return: The outputs of the classifier operator when the model is a
        *NuSVC* or *SVC* with ``probability`` false or when option
        ``zipmap`` is false, otherwise the outputs of the
        *SklearnZipMap* operator
    :raises RuntimeError: if class labels are floats which are not
        integers
    """
    probability_tensor = _parse_sklearn_simple_model(
        scope, model, inputs, custom_parsers=custom_parsers)
    if model.__class__ in [NuSVC, SVC] and not model.probability:
        return probability_tensor
    options = scope.get_options(model, dict(zipmap=True))
    if not options['zipmap']:
        return probability_tensor
    this_operator = scope.declare_local_operator('SklearnZipMap')
    this_operator.inputs = probability_tensor
    label_type = Int64TensorType([None])
    classes = get_label_classes(scope, model)

    if (isinstance(model.classes_, list) and
            isinstance(model.classes_[0], np.ndarray)):
        # multi-label problem
        pass
    elif np.issubdtype(classes.dtype, np.floating):
        classes = np.array(list(map(lambda x: int(x), classes)))
        # Converting to int must not lose information.
        if set(map(lambda x: float(x), classes)) != set(model.classes_):
            raise RuntimeError("skl2onnx implicitly converts float class "
                               "labels into integers but at least one label "
                               "is not an integer. Class labels should "
                               "be integers or strings.")
        this_operator.classlabels_int64s = classes
    elif np.issubdtype(classes.dtype, np.signedinteger):
        this_operator.classlabels_int64s = classes
    elif np.issubdtype(classes.dtype, np.unsignedinteger):
        this_operator.classlabels_int64s = classes
    else:
        # Any remaining dtype is handled as string labels.
        classes = np.array([s.encode('utf-8') for s in classes])
        this_operator.classlabels_strings = classes
        label_type = StringTensorType([None])

    output_label = scope.declare_local_variable('output_label', label_type)
    # NOTE(review): the variable name was missing from this declaration;
    # 'output_probability' mirrors 'output_label' above. Confirm.
    output_probability = scope.declare_local_variable(
        'output_probability',
        SequenceType(DictionaryType(label_type, scope.tensor_type())))
    this_operator.outputs.append(output_label)
    this_operator.outputs.append(output_probability)
    return this_operator.outputs

def _parse_sklearn_gaussian_process(scope, model, inputs, custom_parsers=None):
    """
    Parses a *GaussianProcessRegressor*.

    :param scope: Scope object
    :param model: A scikit-learn *GaussianProcessRegressor* object
    :param inputs: A list of variables
    :param custom_parsers: not used here, accepted so that this function
        has the same signature as the other parsers
    :return: A list with output ``GPmean``, followed by ``GPcovstd``
        when option ``return_std`` or ``return_cov`` is set
    :raises RuntimeError: if both options ``return_std`` and
        ``return_cov`` are set
    """
    options = scope.get_options(
        model, dict(return_cov=False, return_std=False))
    if options['return_std'] and options['return_cov']:
        raise RuntimeError(
            "Not returning standard deviation of predictions when "
            "returning full covariance.")

    alias = _get_sklearn_operator_name(type(model))
    this_operator = scope.declare_local_operator(alias, model)
    mean_tensor = scope.declare_local_variable("GPmean", scope.tensor_type())
    this_operator.inputs = inputs
    this_operator.outputs.append(mean_tensor)

    if options['return_std'] or options['return_cov']:
        # covariance or standard deviation
        # NOTE(review): the type argument was missing and is restored as
        # scope.tensor_type(), same as GPmean. Confirm.
        covstd_tensor = scope.declare_local_variable(
            'GPcovstd', scope.tensor_type())
        this_operator.outputs.append(covstd_tensor)
    return this_operator.outputs

def parse_sklearn(scope, model, inputs, custom_parsers=None, final_types=None):
    """
    This is a delegate function. It does nothing but invokes the
    correct parsing function according to the input model's type.

    :param scope: Scope object
    :param model: A scikit-learn object (e.g., OneHotEncoder
        and LogisticRegression)
    :param inputs: A list of variables
    :param custom_parsers: parsers determines which outputs is expected
        for which particular task, default parsers are defined for
        classifiers, regressors, pipeline but they can be rewritten,
        *custom_parsers* is a dictionary ``{ type: fct_parser(scope,
        model, inputs, custom_parsers=None) }``
    :param final_types: a python list. Works the same way as initial_types
        but not mandatory, it is used to overwrites the type
        (if type is not None) and the name of every output.
    :return: The output variables produced by the input model
    :raises RuntimeError: if a name in *final_types* cannot be declared
        as is, or if the model does not produce as many outputs as
        *final_types* declares
    """
    if final_types is not None:
        # Declare the requested outputs first so that they get the exact
        # names the user asked for.
        outputs = []
        for name, ty in final_types:
            var = scope.declare_local_variable(name, ty)
            if var.onnx_name != name:
                raise RuntimeError(
                    "Unable to add duplicated output '{}', '{}'.".format(
                        var.onnx_name, name))
            outputs.append(var)

        hidden_outputs = parse_sklearn(
            scope, model, inputs, custom_parsers=custom_parsers)
        if len(hidden_outputs) != len(outputs):
            raise RuntimeError(
                "Number of declared outputs is unexpected, declared '{}' "
                "found '{}'.".format(
                    ", ".join(_.onnx_name for _ in outputs),
                    ", ".join(_.onnx_name for _ in hidden_outputs)))
        # Link every output of the model to its declared counterpart:
        # identity when no type was requested, cast otherwise.
        for h, o in zip(hidden_outputs, outputs):
            if o.type is None:
                iop = scope.declare_local_operator('SklearnIdentity')
            else:
                iop = scope.declare_local_operator('SklearnCast')
            iop.inputs = [h]
            iop.outputs = [o]
        return outputs

    # Lookup order: custom parsers, registered parsers (exact type),
    # pipeline subclasses, then the generic parser.
    tmodel = type(model)
    if custom_parsers is not None and tmodel in custom_parsers:
        outputs = custom_parsers[tmodel](scope, model, inputs,
                                         custom_parsers=custom_parsers)
    elif tmodel in sklearn_parsers_map:
        outputs = sklearn_parsers_map[tmodel](scope, model, inputs,
                                              custom_parsers=custom_parsers)
    elif isinstance(model, pipeline.Pipeline):
        parser = sklearn_parsers_map[pipeline.Pipeline]
        outputs = parser(scope, model, inputs, custom_parsers=custom_parsers)
    else:
        outputs = _parse_sklearn_simple_model(scope, model, inputs,
                                              custom_parsers=custom_parsers)
    return outputs

def parse_sklearn_model(model, initial_types=None, target_opset=None,
                        custom_conversion_functions=None,
                        custom_shape_calculators=None,
                        custom_parsers=None, dtype=np.float32,
                        options=None, white_op=None,
                        black_op=None, final_types=None):
    """
    Puts *scikit-learn* object into an abstract container so that
    our framework can work seamlessly on models created
    with different machine learning tools.

    :param model: A scikit-learn model
    :param initial_types: a python list. Each element is a tuple of a
        variable name and a type defined in data_types.py
    :param target_opset: number, for example, 7 for ONNX 1.2,
        and 8 for ONNX 1.3.
    :param custom_conversion_functions: a dictionary for specifying
        the user customized conversion function if not registered
    :param custom_shape_calculators: a dictionary for specifying the
        user customized shape calculator if not registered
    :param custom_parsers: parsers determines which outputs is expected
        for which particular task, default parsers are defined for
        classifiers, regressors, pipeline but they can be rewritten,
        *custom_parsers* is a dictionary
        ``{ type: fct_parser(scope, model, inputs, custom_parsers=None) }``
    :param dtype: parameter which defines the type for
        float computation (float32 or float64)
    :param options: specific options given to converters
        (see :ref:`l-conv-options`)
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes not allowed
        while converting a pipeline, if empty, none are blacklisted
    :param final_types: a python list. Works the same way as initial_types
        but not mandatory, it is used to overwrites the type
        (if type is not None) and the name of every output.
    :return: :class:`Topology <skl2onnx.common._topology.Topology>`
    """
    # NOTE(review): custom_conversion_functions and
    # custom_shape_calculators are documented above but were missing from
    # the signature; they are restored in the documented position.
    # Confirm that callers passing arguments positionally expect this
    # order.
    raw_model_container = SklearnModelContainerNode(
        model, dtype, white_op=white_op, black_op=black_op)

    # Declare a computational graph. It will become a representation of
    # the input scikit-learn model after parsing.
    # NOTE(review): only 'initial_types', 'conv' and 'shape' were left of
    # this call; the other keywords are restored from the parameters of
    # this function and the otherwise unused sklearn_operator_name_map
    # import. Confirm against Topology's signature.
    topology = Topology(
        raw_model_container, initial_types=initial_types,
        target_opset=target_opset,
        custom_conversion_functions=custom_conversion_functions,
        custom_shape_calculators=custom_shape_calculators,
        registered_models=dict(
            conv=_converter_pool, shape=_shape_calculator_pool,
            aliases=sklearn_operator_name_map))

    # Declare an object to provide variables' and operators' naming mechanism.
    # In contrast to CoreML, one global scope
    # is enough for parsing scikit-learn models.
    scope = topology.declare_scope('__root__', options=options, dtype=dtype)

    # Declare input variables. They should be the inputs of the scikit-learn
    # model you want to convert into ONNX.
    inputs = []
    for var_name, initial_type in initial_types:
        inputs.append(scope.declare_local_variable(var_name, initial_type))

    # The object raw_model_container is a part of the topology
    # we're going to return. We use it to store the inputs of
    # the scikit-learn's computational graph.
    # NOTE(review): the loop body was missing; add_input is assumed to be
    # the container method registering an input. Confirm.
    for variable in inputs:
        raw_model_container.add_input(variable)

    # Parse the input scikit-learn model as a Topology object.
    outputs = parse_sklearn(scope, model, inputs,
                            custom_parsers=custom_parsers,
                            final_types=final_types)

    # The object raw_model_container is a part of the topology we're
    # going to return. We use it to store the outputs of the
    # scikit-learn's computational graph.
    # NOTE(review): the loop body was missing; add_output is assumed to
    # be the container method registering an output. Confirm.
    for variable in outputs:
        raw_model_container.add_output(variable)

    return topology

def build_sklearn_parsers_map():
    """
    Builds the dictionary mapping a model class to its parser.

    :return: a dictionary ``{ type: fct_parser }`` with the parsers for
        pipelines, feature unions, gaussian processes, grid search,
        *ColumnTransformer* when scikit-learn provides it, and the
        classifier parser for every class in *sklearn_classifier_list*
        except *LinearSVC*
    """
    map_parser = {
        pipeline.Pipeline: _parse_sklearn_pipeline,
        pipeline.FeatureUnion: _parse_sklearn_feature_union,
        GaussianProcessRegressor: _parse_sklearn_gaussian_process,
        GridSearchCV: _parse_sklearn_grid_search_cv,
    }
    # ColumnTransformer is None when scikit-learn does not provide it.
    if ColumnTransformer is not None:
        map_parser[ColumnTransformer] = _parse_sklearn_column_transformer

    for tmodel in sklearn_classifier_list:
        if tmodel not in [LinearSVC]:
            map_parser[tmodel] = _parse_sklearn_classifier
    return map_parser

def update_registered_parser(model, parser_fct):
    """
    Registers or updates a parser for a new model.
    A parser returns the expected output of a model.

    :param model: model class
    :param parser_fct: parser, signature is the same as
        :func:`parse_sklearn <skl2onnx._parse.parse_sklearn>`
    """
    # The signature is checked against the classifier parser before the
    # entry is added or replaced.
    check_signature(parser_fct, _parse_sklearn_classifier)
    sklearn_parsers_map[model] = parser_fct

# Registered parsers: maps a model class to the function parsing it.
# Built once when the module is imported; parse_sklearn looks types up
# in it and update_registered_parser adds or replaces entries.
sklearn_parsers_map = build_sklearn_parsers_map()