# Python source code of the decision tree converter

from functools import partial

from sklearn.base import ClassifierMixin, RegressorMixin
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree._tree import Tree, TREE_LEAF
import numpy as np

from sklearn_pmml.convert.model import EstimatorConverter, ModelMode, Schema
from sklearn_pmml.convert.features import Feature, CategoricalFeature, NumericFeature
import sklearn_pmml.pmml as pmml
from sklearn_pmml.convert.utils import estimator_to_converter


class DecisionTreeConverter(EstimatorConverter):
    """
    Converts a fitted sklearn decision tree into a PMML TreeModel.
    The mode given to the constructor selects classification or regression handling.
    """
    SPLIT_BINARY = 'binarySplit'
    OPERATOR_LE = 'lessOrEqual'
    NODE_ROOT = 0
    OUTPUT_PROBABILITY = 'proba'
    # NOTE(review): same value as OUTPUT_PROBABILITY, so both classification branches in
    # __init__ store 'proba' in prediction_output -- looks like a copy-paste slip, confirm intent
    OUTPUT_LABEL = 'proba'

    def __init__(self, estimator, context, mode):
        """
        Check that the estimator, the schemas of the context and the mode agree with each other
        and register the categorical outputs as the internal schema.
        :param estimator: tree estimator; must expose a tree_ attribute
        :param context: transformation context; its OUTPUT schema must hold exactly one feature
        :param mode: ModelMode.CLASSIFICATION or ModelMode.REGRESSION
        """
        super(DecisionTreeConverter, self).__init__(estimator, context, mode)

        output_schema = self.context.schemas[Schema.OUTPUT]
        assert len(output_schema) == 1, 'Only one-label trees are supported'
        assert hasattr(estimator, 'tree_'), 'Estimator has no tree_ attribute'

        if mode == ModelMode.CLASSIFICATION:
            label_is_categorical = isinstance(output_schema[0], CategoricalFeature)
            self.prediction_output = self.OUTPUT_LABEL if label_is_categorical else self.OUTPUT_PROBABILITY
            assert isinstance(self.estimator, ClassifierMixin), \
                'Only a classifier can be serialized in classification mode'
        if mode == ModelMode.REGRESSION:
            assert isinstance(output_schema[0], NumericFeature), \
                'Only a numeric feature can be an output of regression'
            assert isinstance(self.estimator, RegressorMixin), \
                'Only a regressor can be serialized in regression mode'

        # the second axis of tree_.value must match the number of declared output fields
        assert estimator.tree_.value.shape[1] == len(output_schema), \
            'Tree outputs {} results while the schema specifies {} output fields'.format(
                estimator.tree_.value.shape[1], len(output_schema))

        # create hidden variables for each categorical output
        # TODO: this code is copied from the ClassifierConverter. To make things right, we need an abstract tree
        # TODO: converter and subclasses for classifier and regression converters
        self.context.schemas[Schema.INTERNAL] = [
            candidate for candidate in output_schema if isinstance(candidate, CategoricalFeature)
        ]

    def _model(self):
        """
        Assemble the TreeModel element: mining schema, output section and the node hierarchy.
        Requires the NUMERIC schema to be present in the context.
        :return: pmml.TreeModel
        """
        assert Schema.NUMERIC in self.context.schemas, \
            'Either build transformation dictionary or provide {} schema in context'.format(Schema.NUMERIC)
        tree_model = pmml.TreeModel(
            functionName=self.model_function.value,
            splitCharacteristic=self.SPLIT_BINARY
        )
        tree_model.append(self.mining_schema())
        tree_model.append(self.output())

        # the recursion starts at the root and resolves split features against the NUMERIC schema
        schemas = self.context.schemas
        tree_model.Node = self._transform_node(
            self.estimator.tree_, self.NODE_ROOT, schemas[Schema.NUMERIC], schemas[Schema.OUTPUT][0]
        )
        return tree_model

    def model(self, verification_data=None):
        """
        Build the TreeModel and, when verification data is supplied, attach a ModelVerification to it.
        :param verification_data: optional data handed to model_verification
        :return: pmml.TreeModel
        """
        assert Schema.NUMERIC in self.context.schemas, \
            'Either build transformation dictionary or provide {} schema in context'.format(Schema.NUMERIC)
        tree_model = self._model()
        if verification_data is None:
            return tree_model
        tree_model.ModelVerification = self.model_verification(verification_data)
        return tree_model

    def _transform_node(self, tree, index, input_schema, output_feature, enter_condition=None):
        """
        Recursively translate the sklearn node at the given index, with everything below it, into a PMML Node
        :param tree: sklearn Tree that is being converted
        :param index: index of the node inside the tree arrays
        :param input_schema: list of features; tree.feature[index] is used as a position in this list
        :param output_feature: feature whose from_number maps a class index to the reported value
        :param enter_condition: predicate guarding the node; an always-true predicate is used when omitted
        :return: Node element
        """
        assert isinstance(tree, Tree)
        assert isinstance(input_schema, list)
        assert isinstance(output_feature, Feature)

        node = pmml.Node()
        # the predicate goes in first, before any score distribution or child node
        node.append(pmml.True_() if enter_condition is None else enter_condition)
        node.recordCount = tree.n_node_samples[index]

        classifying = self.model_function == ModelMode.CLASSIFICATION

        if tree.children_left[index] == TREE_LEAF:
            # leaf: scores are taken directly from the values stored for this node
            leaf_values = np.array(tree.value[index][0])
            if classifying:
                confidences = leaf_values / float(leaf_values.sum())
                for class_index, (confidence, count) in enumerate(zip(confidences, leaf_values)):
                    node.append(pmml.ScoreDistribution(
                        confidence=confidence,
                        recordCount=count,
                        value_=output_feature.from_number(class_index)
                    ))
                node.score = output_feature.from_number(confidences.argmax())
            elif self.model_function == ModelMode.REGRESSION:
                node.score = leaf_values[0]
            return node

        # inner node: the left child is guarded by "feature <= threshold",
        # the right child gets no condition and therefore the always-true predicate
        split_feature = input_schema[tree.feature[index]]
        assert isinstance(split_feature, Feature)
        goes_left = pmml.SimplePredicate(
            field=split_feature.full_name,
            operator=DecisionTreeConverter.OPERATOR_LE,
            value_=tree.threshold[index]
        )
        left_child = self._transform_node(
            tree, tree.children_left[index], input_schema, output_feature, enter_condition=goes_left
        )
        right_child = self._transform_node(tree, tree.children_right[index], input_schema, output_feature)

        if classifying:
            # the distribution of an inner node is the per-class sum of its children's distributions
            best_value, best_confidence = None, 0.0
            for left_entry, right_entry in zip(left_child.ScoreDistribution, right_child.ScoreDistribution):
                merged_count = left_entry.recordCount + right_entry.recordCount
                confidence = float(merged_count) / node.recordCount
                node.append(pmml.ScoreDistribution(
                    recordCount=merged_count,
                    value_=left_entry.value_,
                    confidence=confidence
                ))
                # strict comparison: on a tie the class seen first stays the score
                if best_confidence < confidence:
                    best_value, best_confidence = left_entry.value_, confidence
            node.score = best_value

        node.append(left_child).append(right_child)
        return node

    def output(self):
        """
        Build the Output section of the PMML model.
        Every feature of the OUTPUT schema becomes a 'predictedValue' field and every feature of the
        CATEGORIES schema becomes a 'probability' field for the value named by that feature.
        :return: pmml.Output
        """
        output = pmml.Output()
        schemas = self.context.schemas

        # predicted value of each response variable
        for response in schemas[Schema.OUTPUT]:
            output.append(pmml.OutputField(
                name=Schema.OUTPUT.extract_feature_name(response),
                feature='predictedValue',
                optype=response.optype.value,
                dataType=response.data_type.value
            ))

        # the probabilities for categories; should only be populated for classification jobs
        for category in schemas[Schema.CATEGORIES]:
            output.append(pmml.OutputField(
                name=Schema.CATEGORIES.extract_feature_name(category),
                optype=category.optype.value,
                dataType=category.data_type.value,
                feature='probability',
                targetField=Schema.INTERNAL.extract_feature_name(category.namespace),
                value_=category.name
            ))

        return output


# Register the converter for both sklearn tree estimators; partial pre-binds the mode
# that matches the estimator type.
estimator_to_converter[DecisionTreeClassifier] = partial(DecisionTreeConverter, mode=ModelMode.CLASSIFICATION)
estimator_to_converter[DecisionTreeRegressor] = partial(DecisionTreeConverter, mode=ModelMode.REGRESSION)