from abc import ABC, abstractmethod
from copy import deepcopy
import inspect

import numpy as np
from scipy.stats import mode
from sklearn.base import (
    BaseEstimator,
    ClassifierMixin,
    RegressorMixin,
    clone,
    is_classifier,
    is_regressor,
)
from sklearn.model_selection import check_cv
from sklearn.neighbors import NearestNeighbors
from sklearn.utils.extmath import weighted_mode


class SpatialLagBase(ABC, BaseEstimator):
    """Base class for spatial lag estimators.

    A spatial lag estimator uses a weighted mean (regression) or weighted mode
    (classification) of the values of the K-neighboring observations to augment the
    base_estimator. The weighted mean/mode of the surrounding observations is
    appended as a new feature in the right-most column of the training data.

    The K-neighboring observations are determined using the distance metric specified in
    the `metric` argument. The default metric is minkowski, and with p=2 is equivalent
    to the standard Euclidean metric.

    Parameters
    ----------
    base_estimator : estimator object
        The estimator that is fitted on the augmented data. This is assumed to
        implement the scikit-learn estimator interface.

    n_neighbors : int, default = 7
        Number of neighbors to use by default for kneighbors queries.
    
    weights : {'uniform', 'distance'} or callable, default='distance'
        Weight function used in prediction. Possible values:

            - 'uniform' : uniform weights. All points in each neighborhood are weighted equally.
            - 'distance' : weight points by the inverse of their distance. In this case,
              closer neighbors of a query point will have a greater influence than
              neighbors which are further away.
            - [callable] : a user-defined function which accepts an array of distances,
              and returns an array of the same shape containing the weights.

    radius : float, default=1.0
        Range of parameter space to use by default for radius_neighbors queries.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

            - 'ball_tree' will use BallTree
            - 'kd_tree' will use KDTree
            - 'brute' will use a brute-force search.
            - 'auto' will attempt to decide the most appropriate algorithm based on the
              values passed to the fit method.
            - Note: fitting on sparse input will override the setting of this parameter,
              using brute force.

    leaf_size : int, default=30
        Leaf size passed to BallTree or KDTree. This can affect the speed of the construction
        and query, as well as the memory required to store the tree. The optimal value depends
        on the nature of the problem.
    
    metric : str or callable, default='minkowski'
        The distance metric to use for the tree. The default metric is minkowski, and
        with p=2 is equivalent to the standard Euclidean metric. See the documentation
        of DistanceMetric for a list of available metrics. If metric is "precomputed",
        X is assumed to be a distance matrix and must be square during fit. X may be a
        sparse graph, in which case only "nonzero" elements may be considered neighbors.

    p : int, default=2
        Parameter for the Minkowski metric from sklearn.metrics.pairwise.pairwise_distances.
        When p = 1, this is equivalent to using manhattan_distance (l1), and
        euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
    
    metric_params : dict, default=None
        Additional keyword arguments for the metric function.
    
    feature_indices : list, default=None
        By default, the nearest neighbors are determined from the distance metric calculated
        using all of the features. If `feature_indices` are supplied then the distance
        calculation is restricted to the specific column indices. For spatial data, these
        might represent the x,y coordinates for example.

    n_jobs : int, default=1
        The number of parallel jobs to run for neighbors search. None means 1 unless in a
        joblib.parallel_backend context. -1 means using all processors. See Glossary
        for more details.
    """
    def __init__(
        self,
        base_estimator,
        n_neighbors=7,
        weights="distance",
        radius=1.0,
        algorithm="auto",
        leaf_size=30,
        metric="minkowski",
        p=2,
        metric_params=None,
        feature_indices=None,
        n_jobs=1,
    ):

        self.base_estimator = base_estimator
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.radius = radius
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.feature_indices = feature_indices
        self.n_jobs = n_jobs

        self.knn = NearestNeighbors(
            n_neighbors=self.n_neighbors,
            radius=self.radius,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            metric=self.metric,
            p=self.p,
            metric_params=self.metric_params,
            n_jobs=self.n_jobs,
        )
        
        self.y_ = None

    @abstractmethod
    def _distance_weighting(self, neighbor_vals, neighbor_dist):
        pass        

    @abstractmethod
    def _uniform_weighting(self, neighbor_vals):
        pass
    
    @abstractmethod
    def _custom_weighting(self, neighbor_vals, neighbor_dist):
        pass

    @abstractmethod
    def _validate_base_estimator(self):
        pass
    
    def fit(self, X, y):
        """Fit the base_estimator with features from X {n_samples, n_features} and with an
        additional spatially lagged variable added to the right-most column of the 
        training data. 

        During fitting, the k-neighbors to each training point are used to
        estimate the spatial lag component. The training point is not included in the
        calculation, i.e. the training point is not considered its own neighbor.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.
        
        y : array-like of shape (n_samples,)
            The target values (class labels in classification, real numbers in regression).
        """
        # some checks
        self.base_estimator = clone(self.base_estimator)
        self._validate_base_estimator()

        self.y_ = deepcopy(y)
        distance_data = deepcopy(X)
        
        # use only selected columns in the data for the distances
        if self.feature_indices is not None:
            distance_data = distance_data[:, self.feature_indices]

        # fit knn and get values of neighbors
        self.knn.fit(distance_data)
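        # calling kneighbors() without a query argument returns the neighbors of
        # each indexed training point, with each point excluded from its own
        # neighborhood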
        neighbor_dist, neighbor_ids = self.knn.kneighbors()

        # mask any zero distances to avoid division by zero in inverse-distance weighting
        neighbor_dist = np.ma.masked_equal(neighbor_dist, 0)
        
        # get y values of neighbouring points
        neighbor_vals = np.array([y[i] for i in neighbor_ids])
        neighbor_vals = np.ma.masked_array(neighbor_vals, mask=neighbor_dist.mask)
                
        # calculate the weighted mean/mode of the neighboring values
        if self.weights == "distance":
            new_X = self._distance_weighting(neighbor_vals, neighbor_dist)
        
        elif self.weights == "uniform":
            new_X = self._uniform_weighting(neighbor_vals)
        
        elif callable(self.weights):
            new_X = self._custom_weighting(neighbor_vals, neighbor_dist)
        
        # fit base estimator on augmented data
        self.base_estimator.fit(np.column_stack((X, new_X)), y)

        return self

    def predict(self, X, y=None):
        """Predict method for spatial lag models.

        Augments new observations with a spatial lag variable created from a weighted
        mean/mode (regression/classification) of the k-neighboring training observations.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            New samples for the prediction.

        y : None
            Not used.
        """
        distance_data = deepcopy(X)

        # use only selected columns in the data for the distances
        if self.feature_indices is not None:
            distance_data = distance_data[:, self.feature_indices]

        # get distances from training points to new data
        neighbor_dist, neighbor_ids = self.knn.kneighbors(X=distance_data)
        
        # mask zero distances
        neighbor_dist = np.ma.masked_equal(neighbor_dist, 0)
        
        # get values of closest training points to new data
        neighbor_vals = np.array([self.y_[i] for i in neighbor_ids])
        neighbor_vals = np.ma.masked_array(neighbor_vals, mask=neighbor_dist.mask)

        # calculate the weighted mean/mode of the neighboring values
        if self.weights == "distance":
            new_X = self._distance_weighting(neighbor_vals, neighbor_dist)

        elif self.weights == "uniform":
            new_X = self._uniform_weighting(neighbor_vals)

        elif callable(self.weights):
            new_X = self._custom_weighting(neighbor_vals, neighbor_dist)
        
        # predict with the base estimator on the augmented data
        preds = self.base_estimator.predict(np.column_stack((X, new_X)))

        return preds


class SpatialLagRegressor(RegressorMixin, SpatialLagBase):    
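    """Spatial lag regressor.

    Augments the feature matrix with a weighted mean of the target values of the
    k-neighboring observations before fitting ``base_estimator``.

    Examples
    --------
    A minimal sketch; the dataset and the choice of coordinate columns below are
    illustrative assumptions, not part of this module::

        from sklearn.datasets import make_regression
        from sklearn.ensemble import RandomForestRegressor

        X, y = make_regression(n_samples=100, n_features=4, random_state=0)

        # treat the first two columns as spatial coordinates
        model = SpatialLagRegressor(
            base_estimator=RandomForestRegressor(random_state=0),
            n_neighbors=5,
            feature_indices=[0, 1],
        )
        model.fit(X, y)
        preds = model.predict(X)
    """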
    @staticmethod
    def _distance_weighting(neighbor_vals, neighbor_dist):
        neighbor_weights = 1 / neighbor_dist
        X = np.ma.average(neighbor_vals, weights=neighbor_weights, axis=1)
        return X
        
    @staticmethod
    def _uniform_weighting(neighbor_vals):
        X = np.ma.average(neighbor_vals, axis=1)
        return X

    def _custom_weighting(self, neighbor_vals, neighbor_dist):
        neighbor_weights = self.weights(neighbor_dist)
        new_X = np.ma.average(neighbor_vals, weights=neighbor_weights, axis=1)
        return new_X
    
    def _validate_base_estimator(self):
        if not is_regressor(self.base_estimator):
            raise ValueError(
                "'base_estimator' parameter should be a regressor. Got {}"
                .format(self.base_estimator)
            )


class SpatialLagClassifier(ClassifierMixin, SpatialLagBase):    
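    """Spatial lag classifier.

    Augments the feature matrix with the weighted mode of the class labels of the
    k-neighboring observations before fitting ``base_estimator``.

    Examples
    --------
    A minimal sketch; the dataset and the choice of coordinate columns below are
    illustrative assumptions, not part of this module::

        from sklearn.datasets import make_classification
        from sklearn.ensemble import RandomForestClassifier

        X, y = make_classification(n_samples=100, n_features=4, random_state=0)

        # treat the first two columns as spatial coordinates
        model = SpatialLagClassifier(
            base_estimator=RandomForestClassifier(random_state=0),
            n_neighbors=5,
            feature_indices=[0, 1],
        )
        model.fit(X, y)
        preds = model.predict(X)
    """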
    @staticmethod
    def _distance_weighting(neighbor_vals, neighbor_dist):
        neighbor_weights = 1 / neighbor_dist
        # weighted_mode returns (modes, scores); keep the modal values only
        X, _ = weighted_mode(neighbor_vals, neighbor_weights, axis=1)
        return X.ravel()

    @staticmethod
    def _uniform_weighting(neighbor_vals):
        # scipy.stats.mode returns (modes, counts); keep the modal values only
        X = mode(neighbor_vals, axis=1).mode
        return X.ravel()

    def _custom_weighting(self, neighbor_vals, neighbor_dist):
        neighbor_weights = self.weights(neighbor_dist)
        new_X, _ = weighted_mode(neighbor_vals, neighbor_weights, axis=1)
        return new_X.ravel()

    def _validate_base_estimator(self):
        if not is_classifier(self.base_estimator):
            raise ValueError(
                "'base_estimator' parameter should be a classifier. Got {}"
                .format(self.base_estimator)
            )


class ThresholdClassifierCV(BaseEstimator, ClassifierMixin):
    """
    Metaclassifier to perform cutoff threshold optimization

    This implementation is restricted to binary classification problems

    Notes
    -----

    - The training data are partitioned in k-1, and k sets
    - The metaclassifier trains the BaseEstimator on the k-1 partitions
    - The Kth paritions are used to determine the optimal cutoff taking
      the mean of the thresholds that maximize the scoring metric
    - The optimal cutoff is applied to all classifier predictions
    """

    def __init__(self, estimator, thresholds=np.arange(0.1, 0.9, 0.01),
                 scoring=None, refit=False, cv=3, random_state=None):

        """Initialize a ThresholdClassifierCV instance

        Parameters
        ----------
        estimator : estimator object implementing 'fit'
            The object to use to fit the data.

        thresholds : array-like
            Threshold values to search for the optimal cutoff, for example a
            list or array of cutoff thresholds to use for scoring

        scoring : callable or dict
            A callable or dict of key : callable pairs of scoring metrics to
            evaluate at the cutoff thresholds

        refit : string or False
            String specifying the key name of the metric to use to determine
            the optimal cutoff threshold. Only required when multiple scoring
            metrics are used

        cv : int, cross-validation generator or an iterable, optional
            Determines the cross-validation splitting strategy.

            Possible inputs for cv are:
            - None, to use the default 3-fold cross validation,
            - integer, to specify the number of folds in a `(Stratified)KFold`,
            - An object to be used as a cross-validation generator.
            - An iterable yielding train, test splits.

            For integer/None inputs, if the estimator is a classifier and ``y`` is
            either binary or multiclass, :class:`StratifiedKFold` is used. In all
            other cases, :class:`KFold` is used.
            Refer :ref:`User Guide <cross_validation>` for the various
            cross-validation strategies that can be used here.

        random_state : int, RandomState instance or None, optional (default=None)
            If int, random_state is the seed used by the random number generator;
            If RandomState instance, random_state is the random number generator;
            If None, the random number generator is the RandomState instance used
            by `np.random`.

        Notes
        -----
        Rules for creating a custom scikit-learn estimator
        - All arguments of __init__ must have a default value, so it is possible to
          initialize the classifier just by typing MyClassifier()
        - No validation of input parameters should happen in the __init__ method;
          that belongs in the fit method
        - All arguments of the __init__ method should have the same names as the
          attributes they become on the created object
        - Do not take data as an argument here; it belongs in the fit method

        TODO
        ----
        Parallelize the cross validation loop used to find the optimal threshold
        in the fit method

        Change the behaviour of the scoring parameter so that it accepts arguments
        in the same manner as scikit-learn functions such as GridSearchCV, i.e. it
        accepts a string, callable, list/tuple, or dict
        """

        # get dict of arguments from function call
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")

        for arg, val in values.items():
            setattr(self, arg, val)

    def _find_threshold(self, X, y=None):
        """
        Finds the optimal cutoff threshold by maximizing the scoring metric
        """

        estimator = self.estimator
        thresholds = self.thresholds
        scorers = self.scorers_
        refit = self.refit

        y_pred = estimator.predict_proba(X)
        scores = {name: np.zeros(len(thresholds)) for name in scorers}

        for i, cutoff in enumerate(thresholds):
            for name, scorer in scorers.items():
                scores[name][i] = self._scorer_cutoff(y, y_pred, scorer, cutoff)

        top_score = scores[refit][scores[refit].argmax()]
        top_threshold = thresholds[scores[refit].argmax()]
        return top_threshold, top_score, scores

    def score(self, X, y):
        """
        Overloading of classifier score method
        score method is required for compatibility with GridSearch
        The scoring metric should be one that can be maximized (bigger=better)
        """

        _threshold, score, _threshold_scores = self._find_threshold(X, y)

        return score

    @staticmethod
    def _scorer_cutoff(y, y_pred, scorer, cutoff):
        """
        Helper method to binarize the probability scores based on a cutoff
        """

        y_pred = (y_pred[:, 1] > cutoff).astype(int)
        return scorer(y, y_pred)

    def fit(self, X, y=None, groups=None, **fit_params):
        """
        Run fit method with all sets of parameters

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning

        groups : array-like, shape = [n_samples], optional
            Training vector groups for cross-validation

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator

        Notes
        -----

        Rules

        - Parameters are checked during the fit method
        - New attributes created during fitting should end in _, i.e. fitted_
        - Fit method needs to return self for compatibility reasons with sklearn
        - The response vector, i.e. y, should default to None
        """

        # check estimator and cv methods are valid
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        # check for binary response
        if len(np.unique(y)) > 2:
            raise ValueError('Only a binary response vector is currently supported')

        # check that scoring metric has been specified
        if self.scoring is None:
            raise ValueError('No score function is defined')

        # convert scoring callables to dict of key: scorer pairs
        if callable(self.scoring):
            self.scorers_ = {'score': self.scoring}
            self.refit = 'score'

        elif isinstance(self.scoring, dict):
            self.scorers_ = self.scoring

            if self.refit is False:
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key "
                                 "to determine which scoring method "
                                 "is used to optimize the classifiers "
                                 "cutoff threshold")

        # cross-validate the probability threshold
        self.best_threshold_ = 0
        self.threshold_scores_ = dict([(k, np.zeros((len(self.thresholds)))) for k in self.scorers_])

        for train, cal in cv.split(X, y, groups):
            X_train, y_train, X_cal, y_cal = X[train], y[train], X[cal], y[cal]

            if groups is not None:
                groups_train, groups_cal = groups[train], groups[cal]
                estimator.fit(X_train, y_train, groups_train, **fit_params)
            else:
                estimator.fit(X_train, y_train, **fit_params)

            # find the cutoff threshold on the calibration set, unless a single
            # cutoff threshold is specified, which sets the classifier to this
            # threshold
            if isinstance(self.thresholds, (list, np.ndarray)):
                best_threshold, _score, threshold_scores = self._find_threshold(X_cal, y_cal)

                # sum the scores across the folds
                self.best_threshold_ += best_threshold
                self.threshold_scores_ = {
                    key: (self.threshold_scores_[key] + threshold_scores[key])
                    for key in self.threshold_scores_}

            else:
                self.best_threshold_ = self.thresholds

        # average the summed scores over the cross-validation folds
        if isinstance(self.thresholds, (list, np.ndarray)):
            self.best_threshold_ /= cv.get_n_splits()
            self.threshold_scores_ = dict(
                [(k, v / cv.get_n_splits()) for (k, v) in self.threshold_scores_.items()])

        self.classes_ = self.estimator.classes_

        return self

    def predict(self, X, y=None):
        y_score = self.estimator.predict_proba(X)
        # map the thresholded positive-class probability back to class labels
        return self.classes_[(y_score[:, 1] >= self.best_threshold_).astype(int)]

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)
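

if __name__ == "__main__":
    # A minimal usage sketch for ThresholdClassifierCV. The dataset, base
    # estimator and scoring choice below are illustrative assumptions, not
    # part of the API.
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import f1_score

    X, y = make_classification(n_samples=200, n_features=6, random_state=0)

    # optimize the decision threshold of a logistic regression for F1 score
    clf = ThresholdClassifierCV(
        estimator=LogisticRegression(),
        scoring=f1_score,
        cv=3,
    )
    clf.fit(X, y)

    print("best threshold:", clf.best_threshold_)
    print("accuracy at the optimized threshold:", (clf.predict(X) == y).mean())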