"""
.. codeauthor:: Michele Donini <m.donini@ucl.ac.uk>
.. codeauthor:: Ivano Lauriola <ivanolauriola@gmail.com>

A Kernel Method for the Optimization of the Margin Distribution
"""
 
from cvxopt import matrix, solvers
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics.pairwise import rbf_kernel, polynomial_kernel, linear_kernel
from sklearn.utils import check_array
from sklearn.utils import validation
from sklearn.exceptions import NotFittedError
from sklearn.utils.multiclass import check_classification_targets
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

class KOMD(BaseEstimator, ClassifierMixin):
    """KOMD.
    
    KOMD is a kernel method for classification and ranking.
    
    Read more in http://www.math.unipd.it/~dasan/papers/km-omd.icann08.pdf
    by F. Aiolli, G. Da San Martino, and A. Sperduti.
    
    For details on the precise mathematical formulation of the provided
    kernel functions and how `gamma`, `coef0` and `degree` affect each
    other, see the corresponding section in the narrative documentation:
    :ref:`svm_kernels`.
    
    Parameters
    ----------
    lam : float, (default=0.1)
        Specifies the lambda value, between 0.0 and 1.0.
    
    kernel : optional (default='rbf')
        Specifies the kernel function used by the algorithm.
        It must be one of 'linear', 'poly', 'rbf', 'precomputed', a callable
        or a Gram matrix. If none is given, 'rbf' will be used. If a callable
        is given it is used to pre-compute the kernel matrix from data
        matrices; that matrix should be an array of shape
        ``(n_samples, n_samples)``.
    
    rbf_gamma : float, optional (default=0.1)
        Coefficient for the 'rbf' and 'poly' kernels.
        Ignored by all other kernels.
    
    degree : float, optional (default=2.0)
        Specifies the degree of the 'poly' kernel.
        Ignored by all other kernels.
    
    coef0 : float, optional (default=0.0)
        Specifies the coef0 term of the 'poly' kernel.
        Ignored by all other kernels.
    
    max_iter : int, optional (default=100)
        Hard limit on the number of iterations within the solver;
        it must be non-negative.
    
    verbose : bool, (default=False)
        Enable verbose output during fit.
    
    multiclass_strategy : string, optional (default='ova')
        Specifies the strategy used for multiclass classification:
        'ova' for the one-vs-all pattern (also called one-vs-rest),
        'ovo' for the one-vs-one pattern.
        Any other string falls back to 'ova'.
    
    Attributes
    ----------
    gamma : array-like, shape = [n_samples]
        Probability-like vector that defines the distance vector
        over the two classes.
    
    classes_ : array-like, shape = [n_classes]
        Vector containing all possible labels.
    
    multiclass_ : boolean
        True if the number of classes is greater than 2.
    
    Examples
    --------
    >>> import numpy as np
    >>> from ??.komd import KOMD
    >>> X = np.array([[1, 2, i] for i in range(5)])
    >>> Y = np.array([1, 1, 1, -1, -1])
    >>> cls = KOMD()
    >>> cls = cls.fit(X, Y)
    >>> pred = cls.predict([[1, 1, 5]])
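    
    A further sketch with an explicit kernel choice; the hyperparameter
    values below are illustrative, not tuned:
    
    >>> cls = KOMD(lam=0.5, kernel='poly', degree=3.0, coef0=1.0)
    >>> cls = cls.fit(X, Y)
    >>> scores = cls.decision_function([[1, 1, 5]])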
    
    References
    ----------
    `A Kernel Method for the Optimization of the Margin Distribution
    <http://www.math.unipd.it/~dasan/papers/km-omd.icann08.pdf>`__
    """
    
    def __init__(self, lam=0.1, kernel='rbf', rbf_gamma=0.1, degree=2.0,
                 coef0=0.0, max_iter=100, verbose=False,
                 multiclass_strategy='ova'):
        self.lam = lam
        self.gamma = None
        self.bias = None
        self.X = None
        self.Y = None
        self.is_fitted = False
        self.rbf_gamma = rbf_gamma
        self.degree = degree
        self.coef0 = coef0
        self.max_iter = max_iter
        self.verbose = verbose
        self.kernel = kernel
        self.multiclass_strategy = multiclass_strategy
        self.multiclass_ = None
        self.classes_ = None
        self._pairwise = self.kernel == 'precomputed'

    def __kernel_definition__(self):
        """Select the kernel function.
        
        Returns
        -------
        kernel : callable
            The callable associated with the selected kernel.
        """
        if hasattr(self.kernel, '__call__'):
            return self.kernel
        if self.kernel == 'rbf' or self.kernel is None:
            return lambda X, Y: rbf_kernel(X, Y, self.rbf_gamma)
        if self.kernel == 'poly':
            return lambda X, Y: polynomial_kernel(X, Y, degree=self.degree, gamma=self.rbf_gamma, coef0=self.coef0)
        if self.kernel == 'linear':
            return lambda X, Y: linear_kernel(X, Y)
        if self.kernel == 'precomputed':
            return lambda X, Y: X
        raise ValueError("Unknown kernel: %r" % self.kernel)
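    
    # A minimal sketch of the two non-string kernel modes; `my_kernel` and
    # `K` below are illustrative names, not part of this class:
    #
    #     def my_kernel(A, B):                       # any kernel function
    #         return np.dot(A, B.T)                  # returning (n_A, n_B)
    #     clf = KOMD(kernel=my_kernel).fit(X, Y)
    #
    #     K = np.dot(X, X.T)                         # precomputed Gram matrix
    #     clf = KOMD(kernel='precomputed').fit(K, Y)
    #     clf.predict(np.dot(X_test, X.T))           # pass K(test, train) here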
    
    def fit(self, X, Y):
        """Fit the model according to the given training data.
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Matrix of the examples, where
            n_samples is the number of samples and
            n_features is the number of features.
        
        Y : array-like, shape = [n_samples]
            Array of labels corresponding to X.
        
        Returns
        -------
        self : object
            Returns self
        """
        X, Y = validation.check_X_y(X, Y, dtype=np.float64, order='C', accept_sparse='csr')
        check_classification_targets(Y)
        
        self.classes_ = np.unique(Y)
        if len(self.classes_) < 2:
            raise ValueError("The number of classes must be at least 2; got %d"
                             % len(self.classes_))
        
        if len(self.classes_) == 2:
            self.multiclass_ = False
            return self._fit(X, Y)
        else:
            self.multiclass_ = True
            if self.multiclass_strategy == 'ovo':
                return self._one_vs_one(X, Y)
            else:
                return self._one_vs_rest(X, Y)
    
    def _one_vs_one(self, X, Y):
        self.cls = OneVsOneClassifier(KOMD(**self.get_params())).fit(X, Y)
        self.is_fitted = True
        return self
    
    def _one_vs_rest(self, X, Y):
        self.cls = OneVsRestClassifier(KOMD(**self.get_params())).fit(X, Y)
        self.is_fitted = True
        return self
        
    def _fit(self, X, Y):
        self.X = X
        values = np.unique(Y)
        # Map the two labels onto {-1, +1}; values[1] becomes the positive class.
        Y = [1 if l == values[1] else -1 for l in Y]
        self.Y = Y
        npos = len([1.0 for l in Y if l == 1])
        nneg = len([1.0 for l in Y if l == -1])
        YY = matrix(np.diag(list(matrix(Y))))

        Kf = self.__kernel_definition__()
        ker_matrix = matrix(Kf(X,X).astype(np.double))
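        # The QP assembled below is the KOMD problem: minimize
        #     (1 - lam) * gamma' * Y*K*Y * gamma + lam * c * ||gamma||^2,
        # with c = npos*nneg/(npos+nneg), over gamma >= 0 whose entries sum
        # to 1 on the positive class and to 1 on the negative class, i.e.
        # gamma is a probability distribution over each class's examples.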
        KLL = (1.0 - self.lam) * YY * ker_matrix * YY
        LID = matrix(np.diag([self.lam * (npos * nneg / (npos + nneg))] * len(Y)))
        Q = 2 * (KLL + LID)
        p = matrix([0.0] * len(Y))
        G = -matrix(np.diag([1.0] * len(Y)))
        h = matrix([0.0] * len(Y), (len(Y), 1))
        A = matrix([[1.0 if lab == +1 else 0.0 for lab in Y],
                    [1.0 if lab2 == -1 else 0.0 for lab2 in Y]]).T
        b = matrix([[1.0], [1.0]], (2, 1))
        
        solvers.options['show_progress'] = False
        solvers.options['maxiters'] = self.max_iter
        sol = solvers.qp(Q, p, G, h, A, b)
        self.gamma = sol['x']
        if self.verbose:
            print('[KOMD]')
            print('optimization finished, #iter = %d' % sol['iterations'])
            print('status of the solution: %s' % sol['status'])
            print('objval: %.5f' % sol['primal objective'])
            
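        # The bias is the halfway point between the gamma-weighted average
        # scores of the two classes, so thresholding the score at 0 places
        # the decision boundary midway between them.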
        bias = 0.5 * self.gamma.T * ker_matrix * YY * self.gamma
        self.bias = bias
        self.is_fitted = True
        self.ker_matrix = ker_matrix
        return self
        
    def predict(self, X):
        """Perform classification on samples in X.
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Matrix containing new samples
        
        Returns
        -------
        y_pred : array, shape = [n_samples]
            The predicted class for each sample.
        """
        
        if not self.is_fitted:
            raise NotFittedError("This KOMD instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.")
        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")
        if self.multiclass_:
            return self.cls.predict(X)
        
        return np.array([self.classes_[1] if p >= 0 else self.classes_[0]
                         for p in self.decision_function(X)])

    def get_params(self, deep=True):
        # this estimator has parameters:
        return {"lam": self.lam, "kernel": self.kernel, "rbf_gamma":self.rbf_gamma,
                "degree":self.degree, "coef0":self.coef0, "max_iter":self.max_iter,
                "verbose":self.verbose, "multiclass_strategy":self.multiclass_strategy}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self


    def decision_function(self, X):
        """Distance of the samples in X to the separating hyperplane.
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        
        Returns
        -------
        Z : array-like, shape = [n_samples]
            The decision function value for each sample.
        """
        
        if not self.is_fitted:
            raise NotFittedError("This KOMD instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.")
        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")
        
        if self.multiclass_:
            return self.cls.decision_function(X)
        
        Kf = self.__kernel_definition__()
        YY = matrix(np.diag(list(matrix(self.Y))))
        # Score each sample against the training set and center on the bias.
        ker_matrix = matrix(Kf(X, self.X).astype(np.double))
        z = ker_matrix * YY * self.gamma
        z = z - self.bias
        return np.array(list(z))
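

# A minimal smoke test, kept behind the __main__ guard so it never runs on
# import; the data and hyperparameters below are purely illustrative.
if __name__ == '__main__':
    X = np.array([[1.0, 2.0, float(i)] for i in range(5)])
    Y = np.array([1, 1, 1, -1, -1])
    clf = KOMD(lam=0.1, kernel='rbf', rbf_gamma=0.1).fit(X, Y)
    print(clf.predict([[1.0, 1.0, 5.0]]))
    print(clf.decision_function([[1.0, 1.0, 5.0]]))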