python source code of cluster

# -*- coding: utf-8 -*-
"""
Created on Wed May 25 04:20:00 CEST 2016

@authors: Juan C Entizne, Juan L. Trincado
@email: juancarlos.entizne01[at]estudiant.upf.edu,
        juanluis.trincado[at]upf.edu
"""

import os
import logging
from itertools import islice
from sklearn.cluster import DBSCAN
from collections import defaultdict
from sklearn.metrics import silhouette_score
from lib.optics import *



def sliding_windows(seq, n=2):
    '''
    Returns a sliding sliding_windows (of width n) over data from the iterable
    s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...
    '''

    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result


def is_index_valid(indexes, lst):

    # Setting logging preferences
    logger = logging.getLogger(__name__)

    last_el = int(indexes[-1][-1])
    lst_len = int(len(lst))
    if last_el < len(lst):
        logger.error("Invalid index. Index %d is smaller than the number of columns in the file (%d)."
                     % (last_el, lst_len))
        sys.exit(1)
    elif last_el > lst_len:
        logger.error("Invalid index. Index %d is larger than the number of columns in the file (%d)."
                     % (last_el, lst_len))
        sys.exit(1)
    else:
        pass

    # If the groups of indexes are continuous the diff between the value of the last element of the first
    # indexes_group minus the first element of the next index_group must be equal to 1
    for indx_pair in sliding_windows(indexes):
        diff = indx_pair[1][0]-indx_pair[0][-1]

        if diff <= 0:
            msg = str(indx_pair[0][0])+"-"+str(indx_pair[0][-1])+", "+str(indx_pair[1][0])+"-"+str(indx_pair[1][-1])
            logger.error("Indexes %s are not valid. Cluster groups must not overlap." % msg)
            sys.exit(1)
        elif diff > 1:
            msg = str(indx_pair[0][0])+"-"+str(indx_pair[0][-1])+", "+str(indx_pair[1][0])+"-"+str(indx_pair[1][-1])
            logger.error("Indexes %s are not valid. Cluster groups must be continuous." % msg)
            sys.exit(1)
        else:
            pass

    return True


def average_psi_per_condition(psi_vals, indexes):

    # Setting logging preferences
    logger = logging.getLogger(__name__)

    new_vector = []

    # Check if there is empty elements in index list
    if '' in indexes.split(","):
        logger.error("One of the indexes group is not valid.")
        sys.exit(1)

    grouped_ranges = [range_pair.split("-") for range_pair in indexes.split(",")]
    group_indexes = [list(map(int, ranges)) for ranges in grouped_ranges]

    if is_index_valid(group_indexes, psi_vals):
        for indx in group_indexes:
            frst = indx[0]-1
            last = indx[-1]
            group = np.mean(psi_vals[frst:last])

            new_vector.append(group)

    return new_vector


def process_cluster_input(dpsi, psivec, sig_threshold, dpsi_threshold, indexes):

    sig_events = []
    with open(dpsi) as fh:
        next(fh)
        for ln in fh:
            sl = ln.strip("\n").split("\t")
            ev_id = sl[0]
            ev_dpsi_vals = list(map(abs, map(float, sl[1::2])))
            ev_pvals = list(map(float, sl[2::2]))
            for dpsi_val, p_val in zip(ev_dpsi_vals, ev_pvals):
                if dpsi_val > dpsi_threshold and p_val < sig_threshold:
                    sig_events.append(ev_id)

    psi_matrix, ev_id_lst = ([] for _ in range(2))
    with open(psivec) as fh:
        next(fh)
        for ln in fh:
            sl = ln.strip("\n").split("\t")
            ev_id = sl[0]
            psi_vals_str = sl[1:]
            if ev_id in sig_events:
                # Strict form to filter events: event excluded if -1.0 in any sample
                if "-1.0" not in psi_vals_str:
                    psi_vals = [float(val) for val in psi_vals_str]
                    if indexes:
                        vals = average_psi_per_condition(psi_vals, indexes)
                    else:
                        vals = psi_vals

                    psi_matrix.append(vals)
                    ev_id_lst.append(ev_id)

    #psi_matrix is the average psi per condition (only for events that at least one of the comparisons yielded a significant comparison)
    #ev_id_lst is just the list of the events included in the previous analysis
    return psi_matrix, ev_id_lst


def DBSCAN_cluster(psi_matrix, eventid_lst, dist, minpts, metric):

    # Setting logging preferences
    logger = logging.getLogger(__name__)

    # The metric is "cosine" works only with the algorithm "brute"
    if metric == "cosine":
        alg = 'brute'
    else:
        alg = 'auto'

    try:
        db = DBSCAN(eps=dist, min_samples=minpts, metric=metric, algorithm=alg).fit(psi_matrix)
        labels = db.labels_
    except:
        logger.error("Unknown error: {}".format(sys.exc_info()))
        sys.exit(1)

    eventid_labels_dict = {k: v for k, v in zip(eventid_lst, labels)}

    return eventid_labels_dict, labels


def write_averaged_cluster_output(psi_matrix, eventid_lst, eventid_labels_dict, output):

    res = []
    for ev_id, avg in zip(eventid_lst, psi_matrix):
        avg_vals = "\t".join(list(map(str, avg)))
        cluster_id = eventid_labels_dict[ev_id]
        el = (ev_id, cluster_id, avg_vals)
        res.append(el)

    res.sort(key=lambda x: x[1], reverse=True)

    with open("%s.clustvec" % output, "w+") as fh:
        for el in res:
            ln = "%s\t%d\t%s\n" % (el[0], el[1], el[2])
            fh.write(ln)


def calculate_rmsstd(cluster_vectors):

    n_cond = len(cluster_vectors[0])
    # Degrees of Freedom
    dof = n_cond-1

    return np.sqrt(sum(np.std(cluster_vectors, axis=0))/dof)


def calculate_cluster_scores(x, cluster_labels, output):

    with open("%s_scores.log" % output, "w+") as fh:
        # Filter out singleton "cluster" (labeled as -1)
        filtered_x, filtered_cluster_labels, singletons = ([] for _ in range(3))
        cluster_groups = defaultdict(list)
        for vec, lab in zip(x, cluster_labels):
            if not lab == -1:
                filtered_x.append(vec)
                filtered_cluster_labels.append(lab)

                cluster_groups[lab].append(vec)
            else:
                singletons.append(vec)

        ln = "Number of clustered events: %d/%d (%f%%)\n" % (len(filtered_x), len(filtered_x)+len(singletons),
                                                           (len(filtered_x)/(len(filtered_x)+len(singletons)))*100)
        print(ln.strip("\n"))
        fh.write(ln)

        for group in cluster_groups:
                n_events = len(cluster_groups[group])
                ln = "Cluster %d contains %d events\n" % (group, n_events)
                print(ln.strip("\n"))
                fh.write(ln)

        rmsstd_scores = []
        for group in cluster_groups:
            rmsstd = calculate_rmsstd(np.array(cluster_groups[group]))
            ln = "The RMSSTD score for cluster %d is %f\n" % (group, rmsstd)
            print(ln.strip("\n"))
            fh.write(ln)

            rmsstd_scores.append(rmsstd)

        try:
            silhouette_avg = silhouette_score(np.array(filtered_x), np.array(filtered_cluster_labels))
            ln = "The average silhouette score is : %f\n" % silhouette_avg
            print(ln.strip("\n"))
            fh.write(ln)
        except:
            silhouette_avg = float("nan")
            ln = "Impossible to calculate silhouette score. Only 1 cluster group identified.\n"
            print(ln.strip("\n"))
            fh.write(ln)

    return silhouette_avg, rmsstd_scores


def create_points_list(psi_matrix, eventid_lst):
    '''Generate a list of points from the psi_matrix'''
    points_list = []
    for i,x in enumerate(psi_matrix):
        points_list.append(Point(eventid_lst[i],x))
    return points_list

def generate_labels(clusters, eventid_lst):
    '''From the clusters generated from OPTICS, label the event_ids'''
    eventid_labels_dict = {}
    # Initialize to -1 both structures
    for x in eventid_lst : eventid_labels_dict[x] = -1
    labels = [-1 for x in range(len(eventid_lst))]

    #Read clusters, annotating the cluster number for each event
    if(len(clusters)!=0):
        for number, cluster in enumerate(clusters):
            for point in cluster.points:
                eventid_labels_dict[point.id] = number
                labels[eventid_lst.index(point.id)] = number

    return eventid_labels_dict, labels


def cluster_analysis(dpsi, psivec, sig_threshold, dpsi_threshold, eps, minpts, metric, indexes, clustering,
                     separation, output):

    path = os.path.dirname(os.path.realpath(dpsi))
    os.chdir(path)

    psi_matrix, eventid_lst = process_cluster_input(dpsi, psivec, sig_threshold, dpsi_threshold, indexes)

    if(clustering=="DBSCAN"):
        eventid_labels_dict, labels = DBSCAN_cluster(psi_matrix, eventid_lst, eps, minpts, metric)
        #eventid_labels_dict are the labels of the clustering for eacg event

        write_averaged_cluster_output(psi_matrix, eventid_lst, eventid_labels_dict, output)
        calculate_cluster_scores(psi_matrix, labels, output)

    else:
        #OPTICS
        points_list = create_points_list(psi_matrix, eventid_lst) #Transform the points on psi_matrix to Points from optics.py
        optics = Optics(points_list, eps, minpts)  # Maximum radius to be considered, cluster size >= 2 points
        optics.run()  # run the algorithm
        clusters = optics.cluster(separation)  # minimum threshold for clustering (upper limit to separate the clusters)
        eventid_labels_dict, labels = generate_labels(clusters, eventid_lst)
        write_averaged_cluster_output(psi_matrix, eventid_lst, eventid_labels_dict, output)
        calculate_cluster_scores(psi_matrix, labels, output)