import math

import numpy as np
import scipy.sparse
import scipy.sparse.linalg
from scipy.sparse import csr_matrix, triu, find
from scipy.sparse.csgraph import minimum_spanning_tree, connected_components
from scipy.spatial import distance


class RccCluster:
    """
    Computes a clustering following:
    Robust Continuous Clustering (Shah and Koltun, 2017).

    The interface is based on the sklearn.cluster module.

    Parameters
    ----------
    k (int) number of neighbors for each sample in X
    measure (string) distance metric, one of 'cosine' or 'euclidean'
    clustering_threshold (float) threshold to assign points together in a cluster.
        Higher means fewer, larger clusters
    eps (float) numerical epsilon used for computation
    verbose (boolean) verbosity
    """

    def __init__(self, k=10, measure='euclidean', clustering_threshold=1.,
                 eps=1e-5, verbose=True):
        self.k = k
        self.measure = measure
        self.clustering_threshold = clustering_threshold
        self.eps = eps
        self.verbose = verbose
        self.labels_ = None
        self.U = None
        self.i = None
        self.j = None
        self.n_samples = None

    def compute_assignment(self, epsilon):
        """
        Assigns points to clusters based on their representatives.
        Two points are part of the same cluster if their representatives are
        close enough, i.e. their euclidean distance is below
        clustering_threshold * epsilon.
        """
        diff = np.sum((self.U[self.i, :] - self.U[self.j, :])**2, axis=1)

        # computing connected components: each component is one cluster
        is_conn = np.sqrt(diff) <= self.clustering_threshold * epsilon
        G = scipy.sparse.coo_matrix(
            (np.ones((2 * np.sum(is_conn),)),
             (np.concatenate([self.i[is_conn], self.j[is_conn]], axis=0),
              np.concatenate([self.j[is_conn], self.i[is_conn]], axis=0))),
            shape=[self.n_samples, self.n_samples])
        num_components, labels = connected_components(G, directed=False)
        return labels, num_components

    @staticmethod
    def geman_mcclure(data, mu):
        """
        Closed-form update of the connection penalties lpq under the
        Geman-McClure estimator (equation 5 in the paper).
        See: Bayesian image analysis: an application to single photon emission
        tomography, Geman and McClure (1985).

        Parameters
        ----------
        data (array) 2d numpy array of data
        mu (float) scale parameter
        """
        return (mu / (mu + np.sum(data**2, axis=1)))**2

    def compute_obj(self, X, U, lpq, i, j, lambda_, mu, weights, iter_num):
        """
        Computes the value of the objective function.

        Parameters
        ----------
        X (array) data points, 2d numpy array of shape (n_samples, n_features)
        U (array) representative points, 2d numpy array of shape (n_samples, n_features)
        lpq (array) penalty term on the connections
        i (array) first slice of w, used for convenience
        j (array) second slice of w, used for convenience
        lambda_ (float) term balancing the contributions of the losses
        mu (float) scale parameter
        weights (array) weights of the connections
        iter_num (int) current iteration, only used for printing to screen if verbose=True
        """
        # computing the objective as in equation [2]
        data = 0.5 * np.sum(np.sum((X - U)**2))
        diff = np.sum((U[i, :] - U[j, :])**2, axis=1)
        smooth = lambda_ * 0.5 * (np.inner(lpq * weights, diff)
                                  + mu * np.inner(weights, (np.sqrt(lpq + self.eps) - 1)**2))

        # final objective
        obj = data + smooth
        if self.verbose:
            print(' {} | {} | {} | {}'.format(iter_num, data, smooth, obj))
        return obj

    @staticmethod
    def m_knn(X, k, measure='euclidean'):
        """
        This code is taken from:
        https://bitbucket.org/sohilas/robust-continuous-clustering/src/
        The original terms of the license apply.

        Constructs the mutual kNN graph for a large scale dataset.
        If j is one of i's closest neighbors and i is also one of j's closest
        neighbors, the edge appears once as (i, j) with i < j.

        Parameters
        ----------
        X (array) 2d array of data of shape (n_samples, n_dim)
        k (int) number of neighbors for each sample in X
        measure (string) distance metric, one of 'cosine' or 'euclidean'
        """
        samples = X.shape[0]
        batch_size = 10000
        b = np.arange(k + 1)
        b = tuple(b[1:].ravel())

        z = np.zeros((samples, k))
        weigh = np.zeros_like(z)

        # This loop speeds up the computation by operating in batches.
        # It could be parallelized to further utilize CPU/GPU resources.
        for x in np.arange(0, samples, batch_size):
            start = x
            end = min(x + batch_size, samples)

            w = distance.cdist(X[start:end], X, measure)
            y = np.argpartition(w, b, axis=1)

            z[start:end, :] = y[:, 1:k + 1]
            weigh[start:end, :] = np.reshape(
                w[tuple(np.repeat(np.arange(end - start), k)),
                  tuple(y[:, 1:k + 1].ravel())],
                (end - start, k))
            del w

        ind = np.repeat(np.arange(samples), k)
        P = csr_matrix((np.ones(samples * k), (ind.ravel(), z.ravel())),
                       shape=(samples, samples))
        Q = csr_matrix((weigh.ravel(), (ind.ravel(), z.ravel())),
                       shape=(samples, samples))

        Tcsr = minimum_spanning_tree(Q)
        P = P.minimum(P.transpose()) + Tcsr.maximum(Tcsr.transpose())
        P = triu(P, k=1)

        V = np.asarray(find(P)).T
        return V[:, :2].astype(np.int32)

    def run_rcc(self, X, w, max_iter=100, inner_iter=4):
        """
        Main function for computing the clustering.

        Parameters
        ----------
        X (array) 2d array of data of shape (n_samples, n_dim).
        w (array) list of edges (start and end node per row), as computed by
            the mutual knn graph construction.
        max_iter (int) maximum number of iterations to run the algorithm.
        inner_iter (int) number of inner iterations. 4 works well in most cases.
        """
        X = X.astype(np.float32)  # features stacked as N x D (D is the dimension)
        w = w.astype(np.int32)  # list of edges represented by start and end nodes
        assert w.shape[1] == 2

        # slice w for convenience
        i = w[:, 0]
        j = w[:, 1]

        # initialization
        n_samples, n_features = X.shape
        n_pairs = w.shape[0]

        # precomputing xi, the spectral norm of X
        xi = np.linalg.norm(X, 2)

        # set the weights as given in equation [S1] (supplementary information),
        # making sure to exploit the data sparsity
        R = scipy.sparse.coo_matrix(
            (np.ones((i.shape[0] * 2,)),
             (np.concatenate([i, j], axis=0), np.concatenate([j, i], axis=0))),
            shape=[n_samples, n_samples])

        # number of connections
        n_conn = np.sum(R, axis=1)
        # make sure to convert back to a numpy array from a numpy matrix, since
        # the output of the sum() operation on a sparse matrix is a numpy matrix
        n_conn = np.asarray(n_conn)

        # equation [S1]
        weights = np.mean(n_conn) / np.sqrt(n_conn[i] * n_conn[j])
        weights = weights[:, 0]  # squeeze out the unnecessary dimension

        # initializing the representatives U to have the same value as X
        U = X.copy()

        # initialize lpq to 1, that is, all connections are active;
        # lpq is a penalty term on the connections
        lpq = np.ones((i.shape[0],))

        # compute delta and mu, see SI for details
        epsilon = np.sqrt(np.sum((X[i, :] - X[j, :])**2 + self.eps, axis=1))

        # Note: suppress low values. This hard-coded threshold could lead to
        # issues with very poorly normalized data.
        epsilon[epsilon / np.sqrt(n_features) < 1e-2] = np.max(epsilon)
        epsilon = np.sort(epsilon)

        # compute mu, see section Graduated Nonconvexity in the SI
        mu = 3.0 * epsilon[-1]**2

        # take the top 1% of the closest neighbours as a heuristic
        top_samples = np.minimum(250.0, math.ceil(n_pairs * 0.01))
        delta = np.mean(epsilon[:int(top_samples)])
        epsilon = np.mean(epsilon[:int(math.ceil(n_pairs * 0.01))])

        # computation of matrix A = D - R (here D is the diagonal matrix and R
        # is the symmetric matrix), see equation (8)
        R = scipy.sparse.coo_matrix(
            (np.concatenate([weights * lpq, weights * lpq], axis=0),
             (np.concatenate([i, j], axis=0), np.concatenate([j, i], axis=0))),
            shape=[n_samples, n_samples])
        D = scipy.sparse.coo_matrix(
            (np.squeeze(np.asarray(np.sum(R, axis=1))),
             (range(n_samples), range(n_samples))),
            shape=(n_samples, n_samples))

        # initial computation of lambda (lambda is a reserved keyword in
        # python, so we use lambda_), as per equation 9.
        # note: compute the largest magnitude eigenvalue instead of the matrix
        # norm as it is faster to compute
        eigval = scipy.sparse.linalg.eigs(D - R, k=1, return_eigenvectors=False).real
        lambda_ = xi / eigval[0]

        if self.verbose:
            print('mu = {}, lambda = {}, epsilon = {}, delta = {}'.format(mu, lambda_, epsilon, delta))
            print(' Iter | Data \t | Smooth \t | Obj \t')

        # pre-allocate memory for the values of the objective function
        obj = np.zeros((max_iter,))
        inner_iter_count = 0

        # start of optimization phase
        for iter_num in range(1, max_iter):
            # update lpq. Equation 5.
            lpq = self.geman_mcclure(U[i, :] - U[j, :], mu)

            # compute objective. Equation 6.
            obj[iter_num] = self.compute_obj(X, U, lpq, i, j, lambda_, mu, weights, iter_num)

            # update U. Equation 7. For efficiency we form the sparse matrices
            # R and D separately, then combine them.
            R = scipy.sparse.coo_matrix(
                (np.concatenate([weights * lpq, weights * lpq], axis=0),
                 (np.concatenate([i, j], axis=0), np.concatenate([j, i], axis=0))),
                shape=[n_samples, n_samples])
            D = scipy.sparse.coo_matrix(
                (np.asarray(np.sum(R, axis=1))[:, 0],
                 (range(n_samples), range(n_samples))),
                shape=(n_samples, n_samples))
            M = scipy.sparse.eye(n_samples) + lambda_ * (D - R)

            # solve for U. This could be further optimized through appropriate
            # preconditioning.
            U = scipy.sparse.linalg.spsolve(M, X)

            # check for stopping criteria
            inner_iter_count += 1

            # check for the termination conditions and modulate mu if necessary
            if (abs(obj[iter_num - 1] - obj[iter_num]) < 1e-1) or inner_iter_count == inner_iter:
                if mu >= delta:
                    mu /= 2.0
                elif inner_iter_count == inner_iter:
                    mu = 0.5 * delta
                else:
                    break

                # mu changed, so recompute lambda and reset the inner counter
                eigval = scipy.sparse.linalg.eigs(D - R, k=1, return_eigenvectors=False).real
                lambda_ = xi / eigval[0]
                inner_iter_count = 0

        # at the end of the run, assign values to the class members
        self.U = U.copy()
        self.i = i
        self.j = j
        self.n_samples = n_samples

        C, num_components = self.compute_assignment(epsilon)
        return U, C

    def fit(self, X):
        """
        Computes the clustering and returns the labels.

        Parameters
        ----------
        X (array) numpy array of data to cluster with shape (n_samples, n_features)
        """
        assert isinstance(X, np.ndarray)
        assert X.ndim == 2

        # compute the mutual knn graph
        mknn_matrix = self.m_knn(X, self.k, measure=self.measure)

        # perform the RCC clustering
        U, C = self.run_rcc(X, mknn_matrix)

        # store the class labels in the appropriate class member to match the
        # sklearn.cluster interface
        self.labels_ = C.copy()

        # return the computed labels
        return self.labels_
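

# Minimal usage sketch, not part of the original module: it exercises the
# sklearn-style fit() interface on two well-separated synthetic Gaussian
# blobs. The blob centers, sample counts, random seed, and the k=10 setting
# below are illustrative assumptions, not values prescribed by the paper.
if __name__ == '__main__':
    rng = np.random.default_rng(0)

    # two tight 2d blobs of 100 points each, centered at 0 and 5
    blob_a = rng.normal(loc=0.0, scale=0.1, size=(100, 2))
    blob_b = rng.normal(loc=5.0, scale=0.1, size=(100, 2))
    X_demo = np.concatenate([blob_a, blob_b], axis=0)

    # fit() builds the mutual kNN graph, runs the RCC optimization, and
    # returns one integer label per sample
    clusterer = RccCluster(k=10, measure='euclidean', verbose=False)
    labels = clusterer.fit(X_demo)
    print('number of clusters found:', len(np.unique(labels)))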