# Python source code of csi

"""
    biclustlib: A Python library of biclustering algorithms and evaluation measures.
    Copyright (C) 2017  Victor Alexandre Padilha

    This file is part of biclustlib.

    biclustlib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    biclustlib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import numpy as np

from scipy import sparse as sp
from .check import check_biclusterings

def csi(predicted_biclustering, reference_biclustering, num_rows, num_cols, sparse=True):
    """The Campello Soft Index (CSI) external evaluation measure.

    CSI computes the similarity between two soft clusterings. This measure was originally
    introduced in (Campello, 2010). In this package, it was implemented following the approach
    presented in (Horta and Campello, 2014), which first transforms each biclustering solution to
    a soft clustering representation and then applies the CSI measure. CSI lies in the interval
    [0, 1], where values close to 1 indicate better biclustering solutions.

    Reference
    ---------
    Campello, R. J. G. B. (2010). Generalized external indexes for comparing data partitions with
    overlapping categories. Pattern Recognition Letters, 31(9), 966-975.

    Horta, D., & Campello, R. J. G. B. (2014). Similarity measures for comparing biclusterings.
    IEEE/ACM Transactions on Computational Biology and Bioinformatics, 11(5), 942-954.

    Parameters
    ----------
    predicted_biclustering : biclustlib.model.Biclustering
        Predicted biclustering solution.

    reference_biclustering : biclustlib.model.Biclustering
        Reference biclustering solution.

    num_rows : int
        Number of rows of the dataset.

    num_cols : int
        Number of columns of the dataset.

    sparse : bool, default: True
        Whether the (co)association matrices will be represented as sparse matrices. In most cases
        setting this parameter to True will increase computation efficiency.

    Returns
    -------
    csi_value : float
        Similarity score between 0.0 and 1.0. If the preliminary check of the two solutions
        already yields a float, that value is returned unchanged. If neither agreements nor
        disagreements are found (for instance when both solutions reduce to all-singleton
        clusterings), 1.0 is returned instead of the undefined ratio 0 / 0.
    """

    check = check_biclusterings(predicted_biclustering, reference_biclustering)

    # A float coming back from the check is taken as the final score.
    if isinstance(check, float):
        return check

    # Each matrix cell becomes one object; each bicluster becomes one (soft) cluster.
    predicted_clustering = _biclustering_to_soft_clustering(predicted_biclustering, num_rows, num_cols)
    reference_clustering = _biclustering_to_soft_clustering(reference_biclustering, num_rows, num_cols)

    predicted_association = _calculate_association(predicted_clustering, num_rows, num_cols, sparse)
    predicted_coassociation = _calculate_coassociation(predicted_association)
    predicted_beta = _calculate_beta(predicted_association)

    reference_association = _calculate_association(reference_clustering, num_rows, num_cols, sparse)
    reference_coassociation = _calculate_coassociation(reference_association)
    reference_beta = _calculate_beta(reference_association)

    agreements = _calculate_agreements(predicted_coassociation, reference_coassociation, predicted_beta, reference_beta, sparse)
    disagreements = _calculate_disagreements(predicted_coassociation, reference_coassociation, predicted_beta, reference_beta, sparse)

    total = agreements + disagreements

    # Both terms are sums of non-negative quantities, so a zero total means there is
    # nothing on which the two solutions could differ; avoid returning NaN from 0 / 0.
    if total == 0:
        return 1.0

    return float(agreements) / total

def _biclustering_to_soft_clustering(biclustering, num_rows, num_cols):
    """Convert a biclustering into a list of clusters over flattened matrix cells.

    Every cell (row, col) of the data matrix is treated as one object whose index is
    ``row + col * num_rows``. Each bicluster contributes one cluster holding the indices
    of all cells it covers; every cell not covered by any bicluster is appended afterwards
    as a scalar index of its own.

    Parameters
    ----------
    biclustering : object
        Object exposing a ``biclusters`` iterable whose items have ``rows`` and ``cols``
        attributes (used here as NumPy integer index arrays).

    num_rows : int
        Number of rows of the dataset.

    num_cols : int
        Number of columns of the dataset.

    Returns
    -------
    soft_clustering : list
        One 1-D index array per bicluster, followed by one scalar index per uncovered cell.
    """
    # The builtin ``bool`` replaces the ``np.bool`` alias, which no longer exists in
    # current NumPy releases.
    is_singleton = np.ones(num_rows * num_cols, dtype=bool)
    soft_clustering = []

    for b in biclustering.biclusters:
        # Broadcasting a column of row indices against the scaled column indices
        # enumerates every (row, col) cell of the bicluster.
        cluster = (b.rows[:, np.newaxis] + b.cols * num_rows).flatten()
        soft_clustering.append(cluster)
        is_singleton[cluster] = False

    # Cells that were never covered are added one by one, as individual indices.
    soft_clustering.extend(np.where(is_singleton)[0])

    return soft_clustering

def _calculate_association(clustering, num_rows, num_cols, sparse):
    """Build the binary cluster-by-object membership matrix.

    Parameters
    ----------
    clustering : sequence
        One entry per cluster; each entry is an object index or an array of object indices.

    num_rows : int
        Number of rows of the dataset.

    num_cols : int
        Number of columns of the dataset.

    sparse : bool
        If True the matrix is assembled as a DOK matrix and returned as CSR; otherwise a
        dense ndarray is returned.

    Returns
    -------
    association : scipy.sparse.csr_matrix or numpy.ndarray
        Matrix of shape (len(clustering), num_rows * num_cols) where entry (k, i) is 1 when
        object i belongs to cluster k and 0 otherwise.
    """
    shape = (len(clustering), num_rows * num_cols)

    # The builtin ``int`` replaces the ``np.int`` alias, which no longer exists in
    # current NumPy releases; it selects the same default integer dtype.
    if sparse:
        association = sp.dok_matrix(shape, dtype=int)
    else:
        association = np.zeros(shape, dtype=int)

    for k, c in enumerate(clustering):
        association[k, c] = 1

    if sparse:
        # DOK is convenient for incremental assignment; CSR is returned for the
        # matrix products and reductions that follow.
        return sp.csr_matrix(association)
    return association

def _calculate_coassociation(association):
    """Return the object-by-object product of the association matrix with itself.

    Entry (i, j) of the result is the dot product of columns i and j of ``association``.
    """
    objects_by_clusters = association.transpose()
    return objects_by_clusters.dot(association)

def _calculate_beta(association):
    """Return, for each column of ``association``, its column sum minus one."""
    column_totals = association.sum(axis=0)
    return column_totals - 1

def _calculate_agreements(predicted_coassociation, reference_coassociation, predicted_beta, reference_beta, sparse):
    """Return the agreement term used as the numerator of the CSI ratio.

    The term adds the element-wise minimum of the two co-association matrices, summed over
    the strictly upper triangle (each unordered pair once), to the summed element-wise
    minimum of the two beta vectors scaled by ``num_objects - 1``.

    Parameters
    ----------
    predicted_coassociation, reference_coassociation : sparse matrix or numpy.ndarray
        Square object-by-object co-association matrices of identical shape.

    predicted_beta, reference_beta : array-like
        Per-object beta values of each solution.

    sparse : bool
        Whether the co-association matrices are SciPy sparse matrices.

    Returns
    -------
    agreements : numeric scalar
    """
    num_objects = predicted_coassociation.shape[0]

    # SciPy sparse matrices provide an element-wise ``minimum`` method, whereas plain
    # ndarrays do not, so the dense path has to go through ``np.minimum``.
    if sparse:
        shared = predicted_coassociation.minimum(reference_coassociation)
    else:
        shared = np.minimum(predicted_coassociation, reference_coassociation)

    min_alpha = _triu(shared, sparse)
    min_beta = np.minimum(predicted_beta, reference_beta)
    return min_alpha.sum() + min_beta.sum() * (num_objects - 1)

def _calculate_disagreements(predicted_coassociation, reference_coassociation, predicted_beta, reference_beta, sparse):
    """Return the disagreement term of the CSI ratio.

    The term adds the absolute difference of the two co-association matrices, summed over
    the strictly upper triangle, to the summed absolute difference of the two beta vectors
    scaled by the number of objects minus one.
    """
    pairwise_gap = _triu(predicted_coassociation - reference_coassociation, sparse)
    pair_term = abs(pairwise_gap).sum()

    membership_gap = predicted_beta - reference_beta
    other_objects = predicted_coassociation.shape[0] - 1
    beta_term = abs(membership_gap).sum() * other_objects

    return pair_term + beta_term

def _triu(a, sparse):
    """Return the strictly upper-triangular part of ``a`` (main diagonal excluded).

    ``sparse`` selects the SciPy sparse implementation; otherwise NumPy's is used.
    """
    upper_triangle = sp.triu if sparse else np.triu
    return upper_triangle(a, k=1)