python source code of base

# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#      http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.utils.validation import check_array, check_is_fitted


def is_finite_numeric(arr):
    """Helper function to check if values in an array can be converted to finite numeric
    """

    def _is_finite_numeric(val):
        try:
            f = float(val)
            return np.isfinite(f)
        except ValueError:
            return False

    return np.vectorize(_is_finite_numeric)(arr)


def _get_mask(X, vectorized_mask_function):
    """Compute boolean mask of X for vectorized_mask_function(X) == False
    """
    return np.logical_not(vectorized_mask_function(X).astype("bool"))


def _apply_mask(X, mask):
    X[mask] = np.nan
    return X


class RobustImputer(BaseEstimator, TransformerMixin):
    """Imputer for completing missing values.

    Similar to sklearn.impute.SimpleImputer with added functionality
    - RobustImputer uses a custom mask_function to determine values to impute.
      The default mask_function is sagemaker_sklearn_extension.impute.is_finite_numeric
      which checks if a value can be converted into a float.
    - RobustImputer can perform multi-column imputation with different values
      for each column (strategy=="constant")

    Parameters
    ----------
    dtype : string, type, list of types or None (default=None)
        Data type for output.

        - If left to default, numeric imputation strategies ("median" and "mean"),
        output array dtype will always be floating point dtype. Otherwise it will be
        np.dtype('O')

    strategy : string, optional (default='median')
        The imputation strategy.

        - If "mean", then replace missing values using the mean along
          each column. Can only be used with numeric data.
        - If "median", then replace missing values using the median along
          each column. Can only be used with numeric data.
        - If "most_frequent", then replace missing using the most frequent
          value along each column. Can be used with strings or numeric data.
        - If "constant", then replace missing values with fill_values.
          fill_values can be a singular value or a list of values equal to
          number of columns. Can be used with strings or numeric data.
          If fill_values is not set, fill_value will be 0 when imputing numerical
          data and "missing_value" for strings or object data types.

    fill_values : string, numerical value, or list, optional (default=None)
        When strategy=="constant", fill_values is used to replace all
        values that should be imputed.

        - If string or numerical value, that one value will be used to replace
          all values that should be imputed.
        - If list, fill_values must equal to number of columns of input. Each
          column will be imputed with the corresponding value in fill_values.
          fill_values[i] will replace ith column (X[:,i]).
        - If left to the default, fill_value will be 0 when imputing numerical
          data and "missing_value" for strings or object data types.

    mask_function : callable -> np.array, dtype('bool') (default=None)
        A vectorized python function, accepts np.array, returns np.array
        with dtype('bool')

        For each value, if mask_function(val) == False, that value will
        be imputed. mask_function is used to create a boolean mask that determines
        which values in the input to impute.

        Use np.vectorize to vectorize singular python functions.

        If left to default, mask_function will be
        sagemaker_sklearn_extension.impute.is_finite_numeric

    Notes
    -----
    only accepts 2D, non-sparse inputs
    """

    def __init__(self, dtype=None, strategy="median", fill_values=None, mask_function=None):
        self.dtype = dtype
        self.strategy = strategy
        self.fill_values = fill_values
        self.mask_function = mask_function

    def _validate_input(self, X):
        if self._is_constant_multicolumn_imputation():
            if len(self.fill_values) != X.shape[1]:
                raise ValueError(
                    "'fill_values' should have length equal to number of features in X {num_features}, "
                    "got {fill_values_length}".format(num_features=X.shape[1], fill_values_length=len(self.fill_values))
                )

        dtype = self.dtype or np.dtype("O")

        if hasattr(X, "dtype") and X.dtype is not None and hasattr(X.dtype, "kind") and X.dtype.kind == "c":
            raise ValueError("Complex data not supported\n{}\n".format(X))

        return check_array(X, dtype=dtype, copy=True, force_all_finite=False, ensure_2d=True)

    def _is_constant_multicolumn_imputation(self):
        return self.strategy == "constant" and isinstance(self.fill_values, (list, tuple, np.ndarray))

    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : RobustImputer
        """
        X = self._validate_input(X)

        self.vectorized_mask_function_ = self.mask_function or is_finite_numeric
        X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_))

        if self._is_constant_multicolumn_imputation():
            self.simple_imputer_ = SimpleImputer(strategy=self.strategy)
        else:
            self.simple_imputer_ = SimpleImputer(strategy=self.strategy, fill_value=self.fill_values)

        self.simple_imputer_.fit(X)

        # set "SimpleImputer.statistics_" for multicolumn imputations with different column fill values
        # SimpleImputer cannot preform multicolumn imputation with different column fill values
        if self._is_constant_multicolumn_imputation():
            self.simple_imputer_.statistics_ = np.asarray(self.fill_values)

        return self

    def transform(self, X):
        """Impute all missing values in X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : {ndarray}, shape (n_samples, n_features)
            The imputed input data. The data type of ``Xt``
            will depend on your input dtype.
        """
        check_is_fitted(self, ["simple_imputer_", "vectorized_mask_function_"])
        X = self._validate_input(X)

        if X.shape[1] != self.simple_imputer_.statistics_.shape[0]:
            raise ValueError(
                "'transform' input X has {transform_dim} features per sample, "
                "expected {fit_dim} from 'fit' input".format(
                    transform_dim=X.shape[1], fit_dim=self.simple_imputer_.statistics_.shape[0]
                )
            )

        X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_))

        return self.simple_imputer_.transform(X).astype(self.dtype)

    def _more_tags(self):
        return {"allow_nan": True}


class RobustMissingIndicator(BaseEstimator, TransformerMixin):
    """Binary indicators for missing values.

    Note that this component typically should not be used in a vanilla
    :class:`sklearn.pipeline.Pipeline` consisting of transformers and a classifier,
    but rather could be added using a :class:`sklearn.pipeline.FeatureUnion` or
    :class:`sklearn.compose.ColumnTransformer`.

    Similar to sklearn.impute.MissingIndicator with added functionality
    - RobustMissingIndicator uses a custom mask_function to determine the boolean mask.
      The default mask_function is sagemaker_sklearn_extension.impute.is_finite_numeric
      which checks whether or not a value can be converted into a float.

    Parameters
    ----------
    features : str, optional (default="all")
        Whether the imputer mask should represent all or a subset of
        features.

        - If "missing-only", the imputer mask will only represent
          features containing missing values during fit time.
        - If "all" (default), the imputer mask will represent all features.

    error_on_new : boolean, optional (default=True)
        If True (default), transform will raise an error when there are
        features with missing values in transform that have no missing values
        in fit. This is applicable only when ``features="missing-only"``.

    mask_function : callable -> np.array, dtype('bool') (default=None)
        A vectorized python function, accepts np.array, returns np.array
        with dtype('bool')

        For each value, if mask_function(val) == False, that value will
        be imputed. mask_function is used to create a boolean mask that determines
        which values in the input to impute.

        Use np.vectorize to vectorize singular python functions.

        By default, mask_function will be
        sagemaker_sklearn_extension.impute.is_finite_numeric

    Notes
    -----
    only accepts 2D, non-sparse inputs
    """

    def __init__(self, features="all", error_on_new=True, mask_function=None):
        self.features = features
        self.error_on_new = error_on_new
        self.mask_function = mask_function

    def _validate_input(self, X):
        if hasattr(X, "dtype") and X.dtype is not None and hasattr(X.dtype, "kind") and X.dtype.kind == "c":
            raise ValueError("Complex data not supported\n{}\n".format(X))

        return check_array(X, dtype=np.dtype("O"), copy=True, force_all_finite=False, ensure_2d=True)

    def fit(self, X, y=None):
        """Fit the transformer on X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : RobustMissingIndicator
        """
        X = self._validate_input(X)

        self.vectorized_mask_function_ = self.mask_function or is_finite_numeric
        X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_))

        self.missing_indicator_ = MissingIndicator(features=self.features, error_on_new=self.error_on_new)
        self.missing_indicator_.fit(X)

        return self

    def transform(self, X):
        """Generate missing values indicator for X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : {ndarray}, shape (n_samples, n_features)
            The missing indicator for input data. The data type of ``Xt``
            will be boolean.
        """
        check_is_fitted(self, ["missing_indicator_", "vectorized_mask_function_"])
        X = self._validate_input(X)

        X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_))

        return self.missing_indicator_.transform(X)

    def _more_tags(self):
        return {"allow_nan": True}