python source code of linear

# MIT License
#
# Copyright (C) IBM Corporation 2019
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
# New BSD License
#
# Copyright (c) 2007–2019 The scikit-learn developers.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
# following conditions are met:
#
#   a. Redistributions of source code must retain the above copyright notice, this list of conditions and the following
#      disclaimer.
#   b. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
#      following disclaimer in the documentation and/or other materials provided with the distribution.
#   c. Neither the name of the Scikit-learn Developers  nor the names of its contributors may be used to endorse or
#      promote products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
"""
Linear Regression with differential privacy
"""
import warnings

import numpy as np
import sklearn.linear_model as sk_lr
from sklearn.utils import check_X_y, check_array
from sklearn.utils.validation import FLOAT_DTYPES

from diffprivlib.accountant import BudgetAccountant
from diffprivlib.mechanisms import Wishart
from diffprivlib.tools import mean
from diffprivlib.utils import warn_unused_args, PrivacyLeakWarning
from diffprivlib.validation import clip_to_norm, check_bounds, clip_to_bounds

_range = range


# noinspection PyPep8Naming
def _preprocess_data(X, y, fit_intercept, epsilon=1.0, bounds_X=None, bounds_y=None, copy=True, check_input=True,
                     **unused_args):
    warn_unused_args(unused_args)

    if check_input:
        X = check_array(X, copy=copy, accept_sparse=False, dtype=FLOAT_DTYPES)
    elif copy:
        X = X.copy(order='K')

    y = np.asarray(y, dtype=X.dtype)
    X_scale = np.ones(X.shape[1], dtype=X.dtype)

    if fit_intercept:
        bounds_X = check_bounds(bounds_X, X.shape[1])
        bounds_y = check_bounds(bounds_y, y.shape[1] if y.ndim > 1 else 1)

        X = clip_to_bounds(X, bounds_X)
        y = clip_to_bounds(y, bounds_y)

        X_offset = mean(X, axis=0, bounds=bounds_X, epsilon=epsilon, accountant=BudgetAccountant())
        X -= X_offset
        y_offset = mean(y, axis=0, bounds=bounds_y, epsilon=epsilon, accountant=BudgetAccountant())
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale


# noinspection PyPep8Naming,PyAttributeOutsideInit
class LinearRegression(sk_lr.LinearRegression):
    r"""
    Ordinary least squares Linear Regression with differential privacy.

    LinearRegression fits a linear model with coefficients w = (w1, ..., wp) to minimize the residual sum of squares
    between the observed targets in the dataset, and the targets predicted by the linear approximation.  Differential
    privacy is guaranteed with respect to the training sample.

    Differential privacy is achieved by adding noise to the second moment matrix using the :class:`.Wishart` mechanism.
    This method is demonstrated in  [She15]_, but our implementation takes inspiration from the use of the Wishart
    distribution in  [IS16]_ to achieve a strict differential privacy guarantee.

    Parameters
    ----------
    epsilon : float, default: 1.0
        Privacy parameter :math:`\epsilon`.

    data_norm : float, optional
        The max l2 norm of any row of the concatenated dataset A = [X; y].  This defines the spread of data that will be
        protected by differential privacy.

        If not specified, the max norm is taken from the data when ``.fit()`` is first called, but will result in a
        :class:`.PrivacyLeakWarning`, as it reveals information about the data.  To preserve differential privacy fully,
        `data_norm` should be selected independently of the data, i.e. with domain knowledge.

    bounds_X:  tuple, optional
        Bounds of the data, provided as a tuple of the form (min, max).  `min` and `max` can either be scalars, covering
        the min/max of the entire data, or vectors with one entry per feature.  If not provided, the bounds are computed
        on the data when ``.fit()`` is first called, resulting in a :class:`.PrivacyLeakWarning`.

    bounds_y : tuple
        Same as `bounds_X`, but for the training label set `y`.

    fit_intercept : bool, default: True
        Whether to calculate the intercept for this model.  If set to False, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    copy_X : bool, default: True
        If True, X will be copied; else, it may be overwritten.

    accountant : BudgetAccountant, optional
        Accountant to keep track of privacy budget.

    Attributes
    ----------
    coef_ : array of shape (n_features, ) or (n_targets, n_features)
        Estimated coefficients for the linear regression problem.  If multiple targets are passed during the fit (y 2D),
        this is a 2D array of shape (n_targets, n_features), while if only one target is passed, this is a 1D array of
        length n_features.

    rank_ : int
        Rank of matrix `X`.

    singular_ : array of shape (min(X, y),)
        Singular values of `X`.

    intercept_ : float or array of shape of (n_targets,)
        Independent term in the linear model.  Set to 0.0 if `fit_intercept = False`.

    References
    ----------
    .. [She15] Sheffet, Or. "Private approximations of the 2nd-moment matrix using existing techniques in linear
        regression." arXiv preprint arXiv:1507.00056 (2015).

    .. [IS16] Imtiaz, Hafiz, and Anand D. Sarwate. "Symmetric matrix perturbation for differentially-private principal
        component analysis." In 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP),
        pp. 2339-2343. IEEE, 2016.
    """
    def __init__(self, epsilon=1.0, data_norm=None, bounds_X=None, bounds_y=None, fit_intercept=True, copy_X=True,
                 accountant=None, **unused_args):
        super().__init__(fit_intercept=fit_intercept, normalize=False, copy_X=copy_X, n_jobs=None)

        self.epsilon = epsilon
        self.data_norm = data_norm
        self.bounds_X = bounds_X
        self.bounds_y = bounds_y
        self.accountant = BudgetAccountant.load_default(accountant)

        warn_unused_args(unused_args)

    def fit(self, X, y, sample_weight=None):
        """
        Fit linear model.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data

        y : array_like, shape (n_samples, n_targets)
            Target values.  Will be cast to X's dtype if necessary

        sample_weight : ignored
            Ignored by diffprivlib.  Present for consistency with sklearn API.

        Returns
        -------
        self : returns an instance of self.
        """
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        X, y = check_X_y(X, y, accept_sparse=False, y_numeric=True, multi_output=True)

        if self.fit_intercept:
            if self.bounds_X is None or self.bounds_y is None:
                warnings.warn(
                    "Bounds parameters haven't been specified, so falling back to determining bounds from the "
                    "data.\n"
                    "This will result in additional privacy leakage. To ensure differential privacy with no "
                    "additional privacy loss, specify `bounds_X` and `bounds_y`.",
                    PrivacyLeakWarning)

                if self.bounds_X is None:
                    self.bounds_X = (np.min(X, axis=0), np.max(X, axis=0))
                if self.bounds_y is None:
                    self.bounds_y = (np.min(y, axis=0), np.max(y, axis=0))

            self.bounds_X = check_bounds(self.bounds_X, X.shape[1])
            self.bounds_y = check_bounds(self.bounds_y, y.shape[1] if y.ndim > 1 else 1)

        n_features = X.shape[1]
        epsilon_intercept_scale = 1 / (n_features + 1) if self.fit_intercept else 0

        X, y, X_offset, y_offset, X_scale = self._preprocess_data(X, y, fit_intercept=self.fit_intercept,
                                                                  bounds_X=self.bounds_X, bounds_y=self.bounds_y,
                                                                  epsilon=self.epsilon * epsilon_intercept_scale,
                                                                  copy=self.copy_X)

        if self.data_norm is None:
            warnings.warn("Data norm has not been specified and will be calculated on the data provided.  This will "
                          "result in additional privacy leakage. To ensure differential privacy and no additional "
                          "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning)
            self.data_norm = np.linalg.norm(X, axis=1).max()

        X = clip_to_norm(X, self.data_norm)

        A = np.hstack((X, y[:, np.newaxis] if y.ndim == 1 else y))
        AtA = np.dot(A.T, A)

        mech = Wishart().set_epsilon(self.epsilon * (1 - epsilon_intercept_scale)).set_sensitivity(self.data_norm)
        noisy_AtA = mech.randomise(AtA)

        noisy_AtA = noisy_AtA[:n_features, :]
        XtX = noisy_AtA[:, :n_features]
        Xty = noisy_AtA[:, n_features:]

        self.coef_, self._residues, self.rank_, self.singular_ = np.linalg.lstsq(XtX, Xty, rcond=-1)
        self.coef_ = self.coef_.T

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
        self._set_intercept(X_offset, y_offset, X_scale)

        self.accountant.spend(self.epsilon, 0)

        return self

    _preprocess_data = staticmethod(_preprocess_data)