Python scipy.cluster.hierarchy.fcluster() Examples

The following are 29 code examples of scipy.cluster.hierarchy.fcluster(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scipy.cluster.hierarchy, or try the search function.
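Before the project examples, here is a minimal sketch of the usual linkage()/fcluster() workflow. The sample points and the distance threshold are purely illustrative and are not taken from any of the projects below.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist

# Five 2-D points forming two well-separated groups (illustrative data)
X = np.array([[0.0, 0.0], [0.1, 0.2], [0.2, 0.1],
              [5.0, 5.0], [5.1, 4.9]])

# Condensed pairwise distances -> linkage matrix -> flat cluster labels
Z = linkage(pdist(X), method='average')
labels = fcluster(Z, t=1.0, criterion='distance')  # labels start at 1
print(labels)  # e.g. [1 1 1 2 2]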
Example #1
Source File: subroutines.py    From SigProfilerExtractor with BSD 2-Clause "Simplified" License
def dendrogram(data, threshold, layer_directory):
    colnames = data.columns
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single',  'cosine')
    plt.figure(figsize=(15, 9))
    dn = hierarchy.dendrogram(Z, labels = colnames, color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures" )
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    #plt.ylim((0,1))
    plt.savefig(layer_directory+'/dendrogram.pdf',figsize=(10, 8), dpi=300)
    # which datapoints goes to which cluster
    # The indices of the datapoints will be displayed as the ids 
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster":Y, "Sample Names":list(colnames)})
    dataframe = dataframe.set_index("Sample Names")
    #print(dataframe)
    dictionary = {"clusters":Y, "informations":dn}
    
    return dataframe 


######################################## Plot the reconstruction error vs stabilities and select the optimum number of signature #################################################### 
Example #2
Source File: make_bed.py    From mCaller with MIT License
def cluster(currents,context,original_labels,chrom,pos1,plot,plotdir,cluster=False):
    colours = {'m6A':'#B4656F','A':'#55B196'} #TODO update for other labels
    if len(currents) > 1 and cluster :
        pdistance = ssd.pdist(currents,metric='correlation')
        dm = ssd.squareform(pdistance)
        link = linkage(dm,method='complete',metric='correlation')
        klabels = fcluster(link,2,'maxclust') #1,'inconsistent') #2,'maxclust')
        #klabels = [1 if x == 1 else 0 for x in klabels]
        #labels = ['m6A']*len(klabels)
        strategy = 'correlation'
    else:
        klabels = [1 if x==1 else 0 for x in original_labels]
        strategy = 'classifierProb'
    if plot:
        plot_w_labels(klabels,original_labels,currents,strategy,context,'chrom.'+chrom+'.pos.'+pos1,plotdir,colours)
    #for cluster in clusters: 
Example #3
Source File: cluster.py    From cesi with Apache License 2.0
def getClusters(self, embed):

		n, m 	= len(embed), self.p.embed_dims
		X 	= np.empty((n, m), np.float32)

		for i in range(len(embed)): 
			X[i, :] = embed[i]

		dist 	  = pdist(X, 	  metric=self.p.metric)
		clust_res = linkage(dist, method=self.p.linkage)
		labels    = fcluster(clust_res, t=self.p.thresh_val, criterion='distance') - 1
		clusters  = [[] for i in range(max(labels) + 1)]

		for i in range(len(labels)): 
			clusters[labels[i]].append(i)

		return clusters 
Example #4
Source File: subroutines.py    From SigProfilerExtractor with BSD 2-Clause "Simplified" License
def dendrogram(data, threshold, layer_directory):
    colnames = data.columns
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single',  'cosine')
    plt.figure(figsize=(15, 9))
    dn = hierarchy.dendrogram(Z, labels = colnames, color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures" )
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    #plt.ylim((0,1))
    plt.savefig(layer_directory+'/dendrogram.pdf',figsize=(10, 8), dpi=300)
    # which datapoints goes to which cluster
    # The indices of the datapoints will be displayed as the ids 
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster":Y, "Sample Names":list(colnames)})
    dataframe = dataframe.set_index("Sample Names")
    #print(dataframe)
    dictionary = {"clusters":Y, "informations":dn}
    
    return dataframe 


######################################## Plot the reconstruction error vs stabilities and select the optimum number of signature #################################################### 
Example #5
Source File: common.py    From plastering with MIT License
def hier_clustering(d, threshold=3):
    srcids = d.keys()
    tokenizer = lambda x: x.split()
    vectorizer = TfidfVectorizer(tokenizer=tokenizer)
    assert isinstance(d, dict)
    assert isinstance(list(d.values())[0], list)
    assert isinstance(list(d.values())[0][0], str)
    doc = [' '.join(d[srcid]) for srcid in srcids]
    vect = vectorizer.fit_transform(doc)
    #TODO: Make vect aligned to the required format
    z = linkage(vect.toarray(), metric='cityblock', method='complete')
    dists = list(set(z[:,2]))
#    threshold = 3
    #threshold = (dists[2] + dists[3]) / 2
    b = hier.fcluster(z, threshold, criterion='distance')
    cluster_dict = defaultdict(list)
    for srcid, cluster_id in zip(srcids, b):
        cluster_dict[str(cluster_id)].append(srcid)
    value_lengther = lambda x: len(x[1])
    return OrderedDict(\
               sorted(cluster_dict.items(), key=value_lengther, reverse=True)) 
Example #6
Source File: zodiac.py    From plastering with MIT License
def create_cluster_map(self, bow, srcids):
        cluster_map = {}
        z = linkage(bow, metric='cityblock', method='complete')
        dists = list(set(z[:, 2]))
        thresh = (dists[1] + dists[2]) / 2
        self.logger.info('Threshold: {0}'.format(thresh))
        b = hier.fcluster(z, thresh, criterion='distance')
        assert bow.shape[0] == len(b)
        assert len(b) == len(srcids)
        for cid, srcid in zip(b, srcids):
            cluster_map[cid] = cluster_map.get(cid, []) + [srcid]

        self.logger.info('# of clusters: {0}'.format(len(b)))
        self.logger.info('sizes of clustsers:{0}'.format(sorted(map(len, cluster_map.values()))))

        return cluster_map 
Example #7
Source File: acquisition_scheme.py    From dmipy with MIT License
def calculate_shell_bvalues_and_indices(bvalues, max_distance=20e6):
    """
    Calculates which measurements belong to different acquisition shells.
    It uses scipy's linkage clustering algorithm, which uses the max_distance
    input as a limit of including measurements in the same cluster.

    For example, if bvalues were [1, 2, 3, 4, 5] and max_distance was 1, then
    all bvalues would belong to the same cluster.
    However, if bvalues were [1, 2, 4, 5] and max_distance was 1, then this
    would result in 2 clusters.

    Parameters
    ----------
    bvalues: 1D numpy array of shape (Ndata)
        bvalues of the acquisition in s/m^2.
    max_distance: float
        maximum b-value distance for a measurement to be included in the same
        shell.

    Returns
    -------
    shell_indices: 1D numpy array of shape (Ndata)
        array of integers, starting from 0, representing to which shell a
        measurement belongs. The number itself has no meaning other than just
        being different for different shells.
    shell_bvalues: 1D numpy array of shape (Nshells)
        array of the mean bvalues for every acquisition shell.
    """
    linkage_matrix = linkage(np.c_[bvalues])
    clusters = fcluster(linkage_matrix, max_distance, criterion='distance')
    shell_indices = np.empty_like(bvalues, dtype=int)
    cluster_bvalues = np.zeros((np.max(clusters), 2))
    for ind in np.unique(clusters):
        cluster_bvalues[ind - 1] = np.mean(bvalues[clusters == ind]), ind
    shell_bvalues, ordered_cluster_indices = (
        cluster_bvalues[cluster_bvalues[:, 0].argsort()].T)
    for i, ind in enumerate(ordered_cluster_indices):
        shell_indices[clusters == ind] = i
    return shell_indices, shell_bvalues 
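A small usage sketch for the function above, assuming the function and the module's numpy/scipy imports are in scope; the b-values are made up to match the two-shell situation described in the docstring:

import numpy as np

# Illustrative b-values in s/m^2: two shells near 1e9 and 3e9, with adjacent
# b-values 1e7 apart, below the default max_distance of 20e6
bvalues = np.array([0.99e9, 1.00e9, 1.01e9, 2.99e9, 3.00e9, 3.01e9])

shell_indices, shell_bvalues = calculate_shell_bvalues_and_indices(
    bvalues, max_distance=20e6)
# shell_indices -> [0 0 0 1 1 1]
# shell_bvalues -> [1.0e9 3.0e9] (the per-shell mean b-values)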
Example #8
Source File: helper.py    From Ensemble-Bayesian-Optimization with MIT License
def mean_z(z_all, dim_limit):
    # use correlation clustering to average group assignments
    lz = hi.linkage(z_all.T, 'single', 'hamming')
    # not sure why cluster id starts from 1
    z = hi.fcluster(lz, 0) - 1
    all_cat = np.unique(z)
    for a in all_cat:
        a_size = np.sum(a == z)
        if a_size > dim_limit:
            z[a == z] = sample_multinomial([1.] * a_size, a_size, dim_limit)
    return z 
Example #9
Source File: heatmap.py    From SqueezeMeta with GNU General Public License v3.0
def plotDendrogram(self, matrix, axis, clusteringThreshold, orientation):

        d = dist.pdist(matrix)
        linkage = cluster.linkage(dist.squareform(d), method='average', metric='cityblock')
        dendrogram = cluster.dendrogram(linkage, orientation=orientation, link_color_func=lambda k: 'k')
        index = cluster.fcluster(linkage, clusteringThreshold * max(linkage[:,2]), 'distance')
        axis.set_xticks([])
        axis.set_yticks([])

        return index, dendrogram['leaves'] 
Example #10
Source File: regions.py    From TOBIAS with MIT License
def cluster(self, threshold=0.5, method="average"):
		""" Main function to cluster the overlap dictionary into clusters"""

		self.overlap_to_distance()

		if len(self.names) > 1:
			self.linkage_mat = linkage(squareform(self.distance_mat), method)
			self.labels = fcluster(self.linkage_mat, threshold, criterion="distance")		#ordering of the dendrogram

			#Find clusters below threshold
			self.linkage_clusters = dict(zip(range(self.n), [[num] for num in range(self.n)]))
			for i, row in enumerate(self.linkage_mat):
				ID1 = int(row[0])
				ID2 = int(row[1])
				new = self.n + i
				dist = row[2]

				if dist <= threshold:
					self.linkage_clusters[new] = self.linkage_clusters[ID1] + self.linkage_clusters[ID2] + [new]
					del self.linkage_clusters[ID1]
					del self.linkage_clusters[ID2]

			#Add member-names to clusters
			for cluster in self.linkage_clusters:

				self.clusters[cluster] = {"member_idx": [idx for idx in self.linkage_clusters[cluster] if idx < self.n]}
				self.clusters[cluster]["member_names"] = [self.names[idx] for idx in self.clusters[cluster]["member_idx"]]
		
		else:	#only one TF
			self.linkage_clusters = {0:[0]}
			self.linkage_mat = np.array([[0]])
			self.clusters[0] = {"member_idx":[0]}
			self.clusters[0]["member_names"] = [self.names[idx] for idx in self.clusters[0]["member_idx"]]

		self.get_cluster_names()	#Set names of clusters
		self.assign_colors() 
Example #11
Source File: motifs.py    From TOBIAS with MIT License
def cluster(self, threshold=0.5, metric = "pcc", clust_method = "average"):
		""" 
		Returns:
		----------
		dict
			A dictionary with keys=cluster names and values=MotifList objects
		"""

		#Needs gimmemotif
		from gimmemotifs.motif import Motif
		from gimmemotifs.comparison import MotifComparer
		sns.set_style("ticks")	#set style back to ticks, as this is set globally during gimmemotifs import

		#Fill in self.gimme_obj variable
		motif_list = [motif.get_gimmemotif().gimme_obj for motif in self]	#list of gimmemotif objects

		#Similarities between all motifs
		mc = MotifComparer()
		score_dict = mc.get_all_scores(motif_list, motif_list, match = "total", metric = metric, combine = "mean")   #metric can be: seqcor, pcc, ed, distance, wic, chisq, akl or ssd
		self.similarity_matrix = generate_similarity_matrix(score_dict)

		# Clustering
		vector = ssd.squareform(self.similarity_matrix.to_numpy())
		self.linkage_mat = linkage(vector, method=clust_method)

		# Flatten clusters
		fclust_labels = fcluster(self.linkage_mat, threshold, criterion="distance")			#cluster membership per motif
		formatted_labels = ["Cluster_{0}".format(label) for label in fclust_labels]

		# Extract motifs belonging to each cluster
		cluster_dict = {label: MotifList() for label in formatted_labels}	#initialize dictionary
		for i, cluster_label in enumerate(formatted_labels):
			cluster_dict[cluster_label].append(self[i])

		return cluster_dict 
Example #12
Source File: clustering.py    From clust with GNU Lesser General Public License v3.0
def chc(X, K, params=()):
    pnames = ['linkage_method',  'distance']
    dflts  = [          'ward', 'euclidean']
    if isinstance(params, np.ndarray):
        paramsloc = params.tolist()
    else:
        paramsloc = params
    (linkage_method, distance) = ds.resolveargumentpairs(pnames, dflts, paramsloc)

    Z = sphc.linkage(X, method=linkage_method, metric=distance)
    C = sphc.fcluster(Z, K, criterion='maxclust')
    return clustVec2partMat(C, K)


# Other related functions 
Example #13
Source File: construction.py    From FinanceHub with MIT License
def __init__(self, data, method='single', metric='euclidean'):
        """
        Combines the assets in `data` using HRP
        returns an object with the following attributes:
            - 'cov': covariance matrix of the returns
            - 'corr': correlation matrix of the returns
            - 'sort_ix': list of sorted column names according to cluster
            - 'link': linkage matrix of size (N-1)x4 with structure Y=[{y_m,1  y_m,2  y_m,3  y_m,4}_m=1,N-1].
                      At the i-th iteration, clusters with indices link[i, 0] and link[i, 1] are combined to form
                      cluster n+1. A cluster with an index less than n corresponds to one of the original observations.
                      The distance between clusters link[i, 0] and link[i, 1] is given by link[i, 2]. The fourth value
                      link[i, 3] represents the number of original observations in the newly formed cluster.
            - 'weights': final weights for each asset

        :param data: pandas DataFrame where each column is a series of returns
        :param method: any method available in scipy.cluster.hierarchy.linkage
        :param metric: any metric available in scipy.cluster.hierarchy.linkage
        """

        assert isinstance(data, pd.DataFrame), "input 'data' must be a pandas DataFrame"

        self.cov = data.cov()
        self.corr = data.corr()
        self.method = method
        self.metric = metric

        self.link = self._tree_clustering(self.corr, self.method, self.metric)
        self.sort_ix = self._get_quasi_diag(self.link)
        self.sort_ix = self.corr.index[self.sort_ix].tolist()  # recover labels
        self.sorted_corr = self.corr.loc[self.sort_ix, self.sort_ix]  # reorder correlation matrix
        self.weights = self._get_recursive_bisection(self.cov, self.sort_ix)
        # TODO self.cluster_nember = sch.fcluster(self.link, t=5, criterion='maxclust') 
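The docstring above spells out the row structure of the 'link' matrix; the sketch below shows, under illustrative assumptions (random returns for four assets and an arbitrary choice of two clusters), how flat cluster labels could be pulled from such a linkage matrix with sch.fcluster, in the spirit of the TODO on the last line:

import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch

# Hypothetical daily returns for four assets (random data, for illustration only)
rng = np.random.default_rng(0)
returns = pd.DataFrame(rng.normal(size=(250, 4)), columns=list('ABCD'))

corr = returns.corr()
# A common correlation-based distance: sqrt(0.5 * (1 - corr))
dist = np.sqrt(0.5 * (1.0 - corr))
link = sch.linkage(sch.distance.squareform(dist.values, checks=False), method='single')

# Each row of `link` reads [idx1, idx2, distance, n_original_observations],
# matching the structure described in the docstring.
labels = sch.fcluster(link, t=2, criterion='maxclust')
print(dict(zip(corr.columns, labels)))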
Example #14
Source File: heatmap.py    From CompareM with GNU General Public License v3.0
def plotDendrogram(self, matrix, axis, clusteringThreshold, orientation):

        d = dist.pdist(matrix)
        linkage = cluster.linkage(dist.squareform(d), method='average', metric='cityblock')
        dendrogram = cluster.dendrogram(linkage, orientation=orientation, link_color_func=lambda k: 'k')
        index = cluster.fcluster(linkage, clusteringThreshold * max(linkage[:,2]), 'distance')
        axis.set_xticks([])
        axis.set_yticks([])

        return index, dendrogram['leaves'] 
Example #15
Source File: normalizer.py    From HoloClean-Legacy-deprecated with Apache License 2.0
def _normalize_col(self, df, ci):
        """
        Normalizing column in given dataframe

        :param df: input dataframe
        :param ci: column name

        :return: normalized dataframe with respect to given column
        """

        col_name = ci.col_name
        col = df.select(col_name).collect()

        col = [row[col_name].encode('utf-8', 'replace')
               if row[col_name] is not None else '' for row in col]

        distinct = list(set(col))

        if len(distinct) > self.max_distinct or len(distinct) <= 1:
            return df

        similarity = self._compute_distances(distinct, ci.distance_fcn)

        z = linkage(similarity)

        labels = fcluster(z, ci.threshold, 'distance')

        # sets up map from value to most common value in that cluster
        clusters = self._get_exemplars(col, labels, distinct)

        new_col = [clusters[val][0] for val in col]

        df = df.na.replace(col, new_col, col_name)

        return df 
Example #16
Source File: listing_6_4_find_metric_groups.py    From fight-churn with MIT License
def find_correlation_clusters(corr,corr_thresh):
    dissimilarity = 1.0 - corr
    hierarchy = linkage(squareform(dissimilarity), method='single')
    diss_thresh = 1.0 - corr_thresh
    labels = fcluster(hierarchy, diss_thresh, criterion='distance')
    return labels 
Example #17
Source File: handcrafting.py    From pysystemtrade with GNU General Public License v3.0
def _cluster_breakdown(self):
        """
        Creates clusters from the portfolio (doesn't create sub portfolios, but tells you which ones to make)

        Credit to this notebook: https://github.com/TheLoneNut/CorrelationMatrixClustering/blob/master/CorrelationMatrixClustering.ipynb

        :return: list of int same length as instruments
        """

        X = self.corr_matrix.values
        d = sch.distance.pdist(X)
        L = sch.linkage(d, method='complete')
        ind = sch.fcluster(L, MAX_CLUSTER_SIZE, criterion='maxclust')

        return list(ind) 
Example #18
Source File: SVIM_clustering.py    From svim with GNU General Public License v3.0
def clusters_from_partitions(partitions, options):
    """Finds clusters in partitions using span-position distance and hierarchical clustering. 
    Assumes that all signatures in the given partition are of the same type and on the same contig"""
    clusters_final = []
    large_partitions = 0
    # Find clusters in each partition individually.
    for partition in partitions:
        if len(partition) == 1:
            clusters_final.append([partition[0]])
            continue
        elif len(partition) > 100:
            partition_sample = sample(partition, 100)
            large_partitions += 1
        else:
            partition_sample = partition
        element_type = partition_sample[0].type
        if element_type == "DEL" or element_type == "INV" or element_type == "DUP_TAN":
            data = np.array( [[signature.get_source()[1], signature.get_source()[2], options.distance_normalizer] for signature in partition_sample])
            Z = linkage(data, method = "average", metric = span_position_distance)
        elif element_type == "INS":
            data = np.array( [[signature.get_source()[1], signature.get_source()[2], options.distance_normalizer] for signature in partition_sample])
            Z = linkage(data, method = "average", metric = span_position_distance_insertions)
        elif element_type == "DUP_INT":
            data = np.array( [[signature.get_source()[1], signature.get_source()[2], signature.get_destination()[1], options.distance_normalizer] for signature in partition_sample])
            Z = linkage(data, method = "average", metric = span_position_distance_intdups)

        cluster_indices = list(fcluster(Z, options.cluster_max_distance, criterion='distance'))
        new_clusters = [[] for i in range(max(cluster_indices))]
        for signature_index, cluster_index in enumerate(cluster_indices):
            new_clusters[cluster_index-1].append(partition_sample[signature_index])
        clusters_final.extend(new_clusters)
    if len(partitions) > 0:
        if len(partitions[0]) > 0:
            logging.debug("%d out of %d partitions for %s exceeded 100 elements." % (large_partitions, len(partitions), partitions[0][0].type))
    return clusters_final 
Example #19
Source File: SemanticAnalysis.py    From CAN_Reverse_Engineering with GNU General Public License v3.0
def signal_clustering(corr_matrix:      DataFrame,
                      threshold:        float,
                      cluster_pickle:   str = "",
                      linkage_pickle:   str = "",
                      force:            bool = False):
    if force:
        if path.isfile(cluster_pickle):
            remove(cluster_pickle)
        if path.isfile(linkage_pickle):
            remove(linkage_pickle)
    if path.isfile(cluster_pickle) and path.isfile(linkage_pickle):
        print("\nSignal clustering already completed and forcing is turned off. Using pickled data...")
        return [load(open(cluster_pickle, "rb")), load(open(linkage_pickle, "rb"))]

    # Remove negative values from the correlation matrix and invert the values
    corr_matrix.where(corr_matrix > 0, 0, inplace=True)
    corr_matrix = 1 - corr_matrix
    X = corr_matrix.values  # type: ndarray
    Y = clip(ssd.squareform(X), 0, None)
    # Z is the linkage matrix. This can serve as input to the scipy.cluster.hierarchy.dendrogram method
    Z = linkage(Y, method='single', optimal_ordering=True)
    fclus = fcluster(Z, t=threshold, criterion='distance')
    cluster_dict = {}
    for i, cluster_label in enumerate(fclus):
        if cluster_label in cluster_dict:
            cluster_dict[cluster_label].append(corr_matrix.index[i])
        else:
            cluster_dict[cluster_label] = [corr_matrix.index[i]]
    return cluster_dict, Z 
Example #20
Source File: atlas3.py    From ssbio with MIT License
def remove_correlated_feats(df):
    tmp = df.T
    # Remove columns with no variation
    nunique = tmp.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index
    tmp.drop(cols_to_drop, axis=1, inplace=True)

    perc_spearman = scipy.stats.spearmanr(tmp)
    abs_corr = np.subtract(np.ones(shape=perc_spearman.correlation.shape),
                           np.absolute(perc_spearman.correlation))
    np.fill_diagonal(abs_corr, 0)
    abs_corr_clean = np.maximum(abs_corr,
                                abs_corr.transpose())  # some floating point mismatches, just make symmetric
    clustering = linkage(squareform(abs_corr_clean), method='average')
    clusters = fcluster(clustering, .1, criterion='distance')
    names = tmp.columns.tolist()
    names_to_cluster = list(zip(names, clusters))
    indices_to_keep = []
    ### Extract models closest to cluster centroids
    for x in range(1, len(set(clusters)) + 1):
        # Create mask from the list of assignments for extracting submatrix of the cluster
        mask = np.array([1 if i == x else 0 for i in clusters], dtype=bool)

        # Take the index of the column with the smallest sum of distances from the submatrix
        idx = np.argmin(sum(abs_corr_clean[:, mask][mask, :]))

        # Extract names of cluster elements from names_to_cluster
        sublist = [name for (name, cluster) in names_to_cluster if cluster == x]

        # Element closest to centroid
        centroid = sublist[idx]
        indices_to_keep.append(centroid)

    return df.loc[df.index.isin(indices_to_keep)] 
Example #21
Source File: hierarchy.py    From malss with MIT License
def fit_predict(self, X, y=None):
        self.model = linkage(X, method=self.method, metric=self.metric)
        return fcluster(self.model, t=self.n_clusters, criterion='maxclust') - 1 
Example #22
Source File: LogClustering.py    From loglizer with MIT License
def _offline_clustering(self, X):
        print('Starting offline clustering...')
        p_dist = pdist(X, metric=self._distance_metric)
        Z = linkage(p_dist, 'complete')
        cluster_index = fcluster(Z, self.max_dist, criterion='distance')
        self._extract_representatives(X, cluster_index)
        print('Processed {} instances.'.format(X.shape[0]))
        print('Found {} clusters offline.\n'.format(len(self.representatives)))
        # print('The representive vectors are:')
        # pprint.pprint(self.representatives.tolist()) 
Example #23
Source File: common.py    From plastering with MIT License
def get_word_clusters(sentence_dict):
    srcids = list(sentence_dict.keys())
    sentences = []
    for srcid in srcids:
        sentence = []
        for metadata_type, sent in sentence_dict[srcid].items():
            sentence.append(''.join(sent))
        sentence = '\n'.join(sentence)
        sentence = ' '.join(re.findall('[a-z]+', sentence))
        sentences.append(sentence)
    vect = TfidfVectorizer()
    #vect = CountVectorizer()
    bow = vect.fit_transform(sentences).toarray()
    try:
        z = linkage(bow, metric='cityblock', method='complete')
    except:
        pdb.set_trace()
    dists = list(set(z[:,2]))
    thresh = (dists[2] + dists[3]) /2
    #thresh = (dists[1] + dists[2]) /2
    print("Threshold: ", thresh)
    b = hier.fcluster(z,thresh, criterion='distance')
    cluster_dict = defaultdict(list)

    for srcid, cluster_id in zip(srcids, b):
        cluster_dict[cluster_id].append(srcid)
    return dict(cluster_dict) 
Example #24
Source File: cluster.py    From catch with MIT License
def cluster_from_dist_matrix(dist_matrix, threshold):
    """Use scipy to cluster a distance matrix.

    Args:
        dist_matrix: distance matrix, represented in scipy's 1d condensed form
        threshold: maximum inter-cluster distance to merge clusters (higher
            results in fewer clusters)

    Returns:
        list c such that c[i] is a collection of all the observations
        (whose pairwise distances are indexed in dist) in the i'th
        cluster, in sorted order by descending cluster size
    """
    linkage = hierarchy.linkage(dist_matrix, method='average')
    clusters = hierarchy.fcluster(linkage, threshold, criterion='distance')

    # clusters are numbered starting at 1, but base the count on
    # first_clust_num just in case this changes
    first_clust_num = min(clusters)
    num_clusters = max(clusters) + 1 - first_clust_num
    elements_in_cluster = defaultdict(list)
    for i, clust_num in enumerate(clusters):
        elements_in_cluster[clust_num].append(i)
    cluster_sizes = {c: len(elements_in_cluster[c])
                     for c in range(first_clust_num,
                                    num_clusters + first_clust_num)}

    elements_in_cluster_sorted = []
    for clust_num, _ in sorted(cluster_sizes.items(),
            key=operator.itemgetter(1), reverse=True):
        elements_in_cluster_sorted += [elements_in_cluster[clust_num]]
    return elements_in_cluster_sorted 
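A hedged usage sketch of the function above, assuming it and its module-level imports (scipy.cluster.hierarchy, collections.defaultdict, operator) are in scope; the observations are made up:

import numpy as np
from scipy.spatial.distance import pdist

# Four hypothetical 1-D observations: three close together plus one outlier
obs = np.array([[0.0], [0.1], [0.2], [10.0]])
dist_matrix = pdist(obs)  # scipy's 1-d condensed form, as required by the docstring

clusters = cluster_from_dist_matrix(dist_matrix, threshold=1.0)
# Expected: [[0, 1, 2], [3]] -- clusters sorted by descending size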
Example #25
Source File: nominal.py    From dython with BSD 3-Clause "New" or "Revised" License
def cluster_correlations(corr_mat, indices=None):
    """
    Apply agglomerative clustering in order to sort
    a correlation matrix.

    Based on https://github.com/TheLoneNut/CorrelationMatrixClustering/blob/master/CorrelationMatrixClustering.ipynb

    Parameters:
    -----------
    - corr_mat : a square correlation matrix (pandas DataFrame)
    - indices : cluster labels [None]; if not provided we'll do
        an agglomerative clustering to get cluster labels.

    Returns:
    --------
    - corr : a sorted correlation matrix
    - indices : cluster indexes based on the original dataset

    Example:
    --------
    >> assoc = associations(
        customers,
        plot=False
    )
    >> correlations = assoc['corr']
    >> correlations, _ = cluster_correlations(correlations)
    """
    if indices is None:
        X = corr_mat.values
        d = sch.distance.pdist(X)
        L = sch.linkage(d, method='complete')
        indices = sch.fcluster(L, 0.5 * d.max(), 'distance')
    columns = [corr_mat.columns.tolist()[i]
               for i in list((np.argsort(indices)))]
    corr_mat = corr_mat.reindex(columns=columns).reindex(index=columns)
    return corr_mat, indices 
Example #26
Source File: env_corr.py    From glosim with MIT License
def clusterdistmat(distmatrixfile,sim,dcut,mode='average',plot=False):
	# Compute the clusturing on dist^2 so that the average 
	# distance of a cluster with an other is the RMS distance
	sim2 = sim*sim
	Z = sc.linkage(sim2,mode)

	cdist = Z[:,2]
	# get the full tree
	# dendo = sc.dendrogram(Z)
	# clist = dendo['leaves']
	nclust = cluster.estimate_ncluster(cdist,dcut)

	clist = sc.fcluster(Z,nclust,criterion='maxclust')
	c_count = Counter(clist)
	nbclst = len(c_count)

	print "Number of clusters", nbclst 
	
	rep_ind = getrep_ind(sim2,clist,c_count)

	# Write the groupe indices and representatives
	filename=basename(distmatrixfile)+'-cluster.index'
	f=open(filename,"w")
	f.write(" # groupid representative \n ")
	for i in range(len(sim)):
		iselect=0
		if i in rep_ind: iselect=2
		f.write("%d   %d \n " %(clist[i]-1,  iselect)) 
	f.close()

   
	if plot: 
		filename=basename(distmatrixfile)+'-dendogram.eps'
		plotdendro(Z,nclust,filename,rep_ind)
	c_list = np.zeros(len(sim))

	# Change cluster groups numbering to (0:n-1)
	for i in range(len(sim)):
		c_list[i] = int(clist[i]-1)

	return c_list,Z

# Determine the representative element of each cluster group 
Example #27
Source File: cluster_eac.py    From combo with BSD 2-Clause "Simplified" License
def fit(self, X):
        """Fit estimators.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.
        """

        # Validate inputs X
        X = check_array(X)
        n_samples = X.shape[0]

        # initialize similarity matrix
        sim_mat_all = np.zeros([n_samples, n_samples])

        if self.pre_fitted:
            print("Training Skipped")

        else:
            for clf in self.base_estimators:
                clf.fit(X)
                clf.fitted_ = True

        for i, estimator in enumerate(self.base_estimators):
            check_is_fitted(estimator, ['labels_'])

            # get the labels from each base estimator
            labels = estimator.labels_.reshape(n_samples, 1)

            # generate the similarity matrix for the current estimator
            sim_mat = _generate_similarity_mat(labels)

            # add to the main similarity mat
            sim_mat_all = sim_mat_all + sim_mat

        # get the average of the similarity mat
        sim_mat_avg = np.divide(sim_mat_all, self.n_base_estimators_)

        # flip the similarity. smaller value implies more similarity
        sim_mat_avg = np.abs(np.max(sim_mat_avg) - sim_mat_avg)

        # build clusters
        self.Z_ = linkage(sim_mat_avg, method=self.linkage_method)
        self.labels_ = fcluster(self.Z_, self.n_clusters, criterion='maxclust')

        # it may lead to a different number of clusters than specified by the user
        if len(np.unique(self.labels_)) != self.n_clusters:
            warnings.warn(
                'EAC generates {n} clusters instead of {n_clusters}'.format(
                    n=len(np.unique(self.labels_)),
                    n_clusters=self.n_clusters))

        return self 
Example #28
Source File: atlas3.py    From ssbio with MIT License
def clean_data(self, keep_features=None, remove_correlated_feats=True):
        self.features_df = self.features_df.astype(float).fillna(0)
        self.features_df = self.features_df.loc[(self.features_df > 0).any(axis=1)]

        if keep_features:
            self.features_df = self.features_df.loc[self.features_df.index.isin(keep_features)]

        if remove_correlated_feats:
            tmp = self.features_df.T

            # Remove columns with no variation
            nunique = tmp.apply(pd.Series.nunique)
            cols_to_drop = nunique[nunique == 1].index
            tmp.drop(cols_to_drop, axis=1, inplace=True)

            perc_spearman = scipy.stats.spearmanr(tmp)
            abs_corr = np.subtract(np.ones(shape=perc_spearman.correlation.shape),
                                   np.absolute(perc_spearman.correlation))
            np.fill_diagonal(abs_corr, 0)
            abs_corr_clean = np.maximum(abs_corr,
                                        abs_corr.transpose())  # some floating point mismatches, just make symmetric
            clustering = linkage(squareform(abs_corr_clean), method='average')
            clusters = fcluster(clustering, .1, criterion='distance')
            names = tmp.columns.tolist()
            names_to_cluster = list(zip(names, clusters))
            indices_to_keep = []
            ### Extract models closest to cluster centroids
            for x in range(1, len(set(clusters)) + 1):
                # Create mask from the list of assignments for extracting submatrix of the cluster
                mask = np.array([1 if i == x else 0 for i in clusters], dtype=bool)

                # Take the index of the column with the smallest sum of distances from the submatrix
                idx = np.argmin(sum(abs_corr_clean[:, mask][mask, :]))

                # Extract names of cluster elements from names_to_cluster
                sublist = [name for (name, cluster) in names_to_cluster if cluster == x]

                # Element closest to centroid
                centroid = sublist[idx]
                indices_to_keep.append(centroid)

            self.features_df = self.features_df.loc[self.features_df.index.isin(indices_to_keep)] 
Example #29
Source File: safe.py    From safepy with GNU General Public License v3.0
def define_domains(self, **kwargs):

        # Overwriting global settings, if necessary
        if 'attribute_distance_threshold' in kwargs:
            self.attribute_distance_threshold = kwargs['attribute_distance_threshold']

        # Make sure that the settings are still valid
        self.validate_config()

        m = self.nes_binary[:, self.attributes['top']].T
        Z = linkage(m, method='average', metric=self.attribute_distance_metric)
        max_d = np.max(Z[:, 2] * self.attribute_distance_threshold)
        domains = fcluster(Z, max_d, criterion='distance')

        self.attributes['domain'] = 0
        self.attributes.loc[self.attributes['top'], 'domain'] = domains

        # Assign nodes to domains
        node2nes = pd.DataFrame(data=self.nes,
                                    columns=[self.attributes.index.values, self.attributes['domain']])
        node2nes_binary = pd.DataFrame(data=self.nes_binary,
                                           columns=[self.attributes.index.values, self.attributes['domain']])

        # # A node belongs to the domain that contains the attribute
        # for which the node has the highest enrichment
        # self.node2domain = node2es.groupby(level='domain', axis=1).max()
        # t_max = self.node2domain.loc[:, 1:].max(axis=1)
        # t_idxmax = self.node2domain.loc[:, 1:].idxmax(axis=1)
        # t_idxmax[t_max < -np.log10(self.enrichment_threshold)] = 0

        # A node belongs to the domain that contains the highest number of attributes
        # for which the nodes is significantly enriched
        self.node2domain = node2nes_binary.groupby(level='domain', axis=1).sum()
        t_max = self.node2domain.loc[:, 1:].max(axis=1)
        t_idxmax = self.node2domain.loc[:, 1:].idxmax(axis=1)
        t_idxmax[t_max == 0] = 0

        self.node2domain['primary_domain'] = t_idxmax

        # Get the max NES for the primary domain
        o = node2nes.groupby(level='domain', axis=1).max()
        i = pd.Series(t_idxmax)
        self.node2domain['primary_nes'] = o.lookup(i.index, i.values)

        if self.verbose:
            num_domains = len(np.unique(domains))
            num_attributes_per_domain = self.attributes.loc[self.attributes['domain'] > 0].groupby('domain')['id'].count()
            min_num_attributes = num_attributes_per_domain.min()
            max_num_attributes = num_attributes_per_domain.max()
            print('Number of domains: %d (containing %d-%d attributes)' %
                  (num_domains, min_num_attributes, max_num_attributes))