def dendrogram(data, threshold, layer_directory):
    """Cluster the columns of ``data`` and save a dendrogram of the result.

    The columns of ``data`` are clustered with single-linkage hierarchical
    clustering on cosine distance. The tree is drawn and written to
    ``<layer_directory>/dendrogram.pdf``, then cut at ``threshold`` to obtain
    flat clusters.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per item to cluster; the column labels become leaf labels.
    threshold : float
        Distance cut-off, used both to colour the dendrogram branches and to
        form the flat clusters.
    layer_directory : str
        Directory in which ``dendrogram.pdf`` is written.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Sample Names", with a single "Cluster" column holding the
        flat-cluster id assigned to every column of ``data``.
    """
    colnames = list(data.columns)
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single', 'cosine')
    # The figure size is fixed here; savefig() takes no ``figsize`` argument,
    # so the one the original passed there was dropped.
    plt.figure(figsize=(15, 9))
    hierarchy.dendrogram(Z, labels=colnames, color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures" )
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    plt.savefig(layer_directory+'/dendrogram.pdf', dpi=300)

    # Which data point goes to which cluster: one flat-cluster id per column.
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster":Y, "Sample Names":colnames})
    return dataframe.set_index("Sample Names")

######################################## Plot the reconstruction error vs. stabilities and select the optimal number of signatures ####################################################
def cluster(currents,context,original_labels,chrom,pos1,plot,plotdir,cluster=False):
    # Optionally clusters `currents` hierarchically and derives binary labels
    # from `original_labels`.
    # NOTE(review): this definition appears truncated -- the `if plot:` branch
    # at the end has no body, so the block does not parse as written.
    # NOTE(review): the `cluster` parameter shares its name with the function.
    colours = {'m6A':'#B4656F','A':'#55B196'} #TODO update for other labels
    if len(currents) > 1 and cluster :
        # Pairwise correlation distances, expanded to a square matrix.
        pdistance = ssd.pdist(currents,metric='correlation')
        dm = ssd.squareform(pdistance)
        # NOTE(review): linkage() is given the square matrix `dm` plus a
        # metric, so its rows are presumably treated as observations rather
        # than as precomputed distances -- confirm this is intended.
        link = linkage(dm,method='complete',metric='correlation')
        # Cut the tree into at most two flat clusters.
        klabels = fcluster(link,2,'maxclust') #1,'inconsistent') #2,'maxclust')
        #klabels = [1 if x == 1 else 0 for x in klabels]
        #labels = ['m6A']*len(klabels)
        strategy = 'correlation'
        # NOTE(review): the next two lines overwrite the `klabels` and
        # `strategy` values assigned just above; presumably they belonged to a
        # separate branch that was lost -- verify against the original.
        klabels = [1 if x==1 else 0 for x in original_labels]
        strategy = 'classifierProb'
    if plot:
    #for cluster in clusters: 
def getClusters(self, embed):
		# Hierarchically clusters the vectors in `embed` and returns a list with
		# one (initially empty) list per flat cluster.
		# NOTE(review): the final `for` loop has no body in this copy, so the
		# block does not parse and the per-cluster lists are never filled;
		# what should be appended (index or vector) cannot be told from here.

		n, m 	= len(embed), self.p.embed_dims
		X 	= np.empty((n, m), np.float32)

		# Copy every embedding into one row of the float32 matrix.
		for i in range(len(embed)): 
			X[i, :] = embed[i]

		# Metric, linkage method and cut threshold all come from self.p.
		dist 	  = pdist(X, 	  metric=self.p.metric)
		clust_res = linkage(dist, method=self.p.linkage)
		# Subtract 1 so the labels can index `clusters` from 0.
		labels    = fcluster(clust_res, t=self.p.thresh_val, criterion='distance') - 1
		clusters  = [[] for i in range(max(labels) + 1)]

		for i in range(len(labels)): 

		return clusters 
def dendrogram(data, threshold, layer_directory):
    """Cluster the columns of ``data`` and save a dendrogram of the result.

    The columns of ``data`` are clustered with single-linkage hierarchical
    clustering on cosine distance. The tree is drawn and written to
    ``<layer_directory>/dendrogram.pdf``, then cut at ``threshold`` to obtain
    flat clusters.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per item to cluster; the column labels become leaf labels.
    threshold : float
        Distance cut-off, used both to colour the dendrogram branches and to
        form the flat clusters.
    layer_directory : str
        Directory in which ``dendrogram.pdf`` is written.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Sample Names", with a single "Cluster" column holding the
        flat-cluster id assigned to every column of ``data``.
    """
    colnames = list(data.columns)
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single', 'cosine')
    # The figure size is fixed here; savefig() takes no ``figsize`` argument,
    # so the one the original passed there was dropped.
    plt.figure(figsize=(15, 9))
    hierarchy.dendrogram(Z, labels=colnames, color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures" )
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    plt.savefig(layer_directory+'/dendrogram.pdf', dpi=300)

    # Which data point goes to which cluster: one flat-cluster id per column.
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster":Y, "Sample Names":colnames})
    return dataframe.set_index("Sample Names")

######################################## Plot the reconstruction error vs. stabilities and select the optimal number of signatures ####################################################
def hier_clustering(d, threshold=3):
    """Group source ids whose token lists are similar, largest group first.

    Each value of ``d`` is joined into one whitespace-separated document, the
    documents are TF-IDF vectorised, and the vectors are clustered with
    complete linkage on city-block distance. The tree is cut at ``threshold``.

    Parameters
    ----------
    d : dict
        Maps a source id to a list of string tokens.
    threshold : float
        Distance at which the linkage tree is cut into flat clusters.

    Returns
    -------
    collections.OrderedDict
        Maps flat-cluster id to the list of source ids in that cluster,
        ordered by decreasing cluster size.

    Raises
    ------
    AssertionError
        If ``d`` is not a dict of lists of strings (only the first value is
        inspected).
    """
    # Validate before touching ``d`` so bad input fails with the intended
    # assertion rather than an unrelated attribute error.
    assert isinstance(d, dict)
    assert isinstance(list(d.values())[0], list)
    assert isinstance(list(d.values())[0][0], str)

    srcids = list(d.keys())
    vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split())
    doc = [' '.join(d[srcid]) for srcid in srcids]
    vect = vectorizer.fit_transform(doc)
    #TODO: Make vect aligned to the required format
    z = linkage(vect.toarray(), metric='cityblock', method='complete')
    b = hier.fcluster(z, threshold, criterion='distance')

    # Collect the source ids that share a flat-cluster label.
    cluster_dict = defaultdict(list)
    for srcid, cluster_id in zip(srcids, b):
        cluster_dict[cluster_id].append(srcid)

    value_lengther = lambda x: len(x[1])
    return OrderedDict(
        sorted(cluster_dict.items(), key=value_lengther, reverse=True))
def create_cluster_map(self, bow, srcids):
    """Group ``srcids`` by hierarchical clustering of their bag-of-words rows.

    The rows of ``bow`` are clustered with complete linkage on city-block
    distance. The tree is cut halfway between the second and third smallest
    distinct merge distances.

    Parameters
    ----------
    bow : numpy.ndarray
        2-D array with one row per entry of ``srcids``.
    srcids : sequence
        Identifiers, in the same order as the rows of ``bow``.

    Returns
    -------
    dict
        Maps flat-cluster id to the list of source ids in that cluster.

    Raises
    ------
    IndexError
        If the linkage has fewer than three distinct merge distances.
    AssertionError
        If ``bow`` and ``srcids`` do not describe the same number of items.
    """
    import logging  # local import so the block does not depend on file-level imports

    z = linkage(bow, metric='cityblock', method='complete')
    # A bare set has no defined iteration order, so sort the distinct merge
    # distances before picking the second and third smallest.
    dists = sorted(set(z[:, 2]))
    thresh = (dists[1] + dists[2]) / 2
    logging.info('Threshold: {0}'.format(thresh))
    b = hier.fcluster(z, thresh, criterion='distance')
    assert bow.shape[0] == len(b)
    assert len(b) == len(srcids)

    cluster_map = {}
    for cid, srcid in zip(b, srcids):
        cluster_map.setdefault(cid, []).append(srcid)

    # Report the number of clusters, not the number of labelled samples.
    logging.info('# of clusters: {0}'.format(len(cluster_map)))
    logging.info('sizes of clustsers:{0}'.format(sorted(map(len, cluster_map.values()))))

    return cluster_map
def calculate_shell_bvalues_and_indices(bvalues, max_distance=20e6):
    """
    Calculates which measurements belong to different acquisition shells.
    It uses scipy's linkage clustering algorithm, which uses the max_distance
    input as a limit of including measurements in the same cluster.

    For example, if bvalues were [1, 2, 3, 4, 5] and max_distance was 1, then
    all bvalues would belong to the same cluster.
    However, if bvalues were [1, 2, 4, 5] and max_distance was 1, then this
    would result in 2 clusters.

    Parameters
    ----------
    bvalues: 1D numpy array of shape (Ndata)
        bvalues of the acquisition in s/m^2.
    max_distance: float
        maximum b-value distance for a measurement to be included in the same
        shell.

    Returns
    -------
    shell_indices: 1D numpy array of shape (Ndata)
        array of integers, starting from 0, representing to which shell a
        measurement belongs. The number itself has no meaning other than just
        being different for different shells.
    shell_bvalues: 1D numpy array of shape (Nshells)
        array of the mean bvalues for every acquisition shell.
    """
    # np.c_ turns the 1-D b-values into a column of 1-D observations.
    linkage_matrix = linkage(np.c_[bvalues])
    clusters = fcluster(linkage_matrix, max_distance, criterion='distance')
    shell_indices = np.empty_like(bvalues, dtype=int)
    # One row per cluster: (mean b-value, cluster label).
    cluster_bvalues = np.zeros((np.max(clusters), 2))
    for ind in np.unique(clusters):
        cluster_bvalues[ind - 1] = np.mean(bvalues[clusters == ind]), ind
    # Order the clusters by mean b-value so shell 0 has the lowest mean.
    shell_bvalues, ordered_cluster_indices = (
        cluster_bvalues[cluster_bvalues[:, 0].argsort()].T)
    for i, ind in enumerate(ordered_cluster_indices):
        shell_indices[clusters == ind] = i
    return shell_indices, shell_bvalues
def mean_z(z_all, dim_limit):
    """Derive one group assignment per column of ``z_all``.

    The columns of ``z_all`` are clustered with single linkage on Hamming
    distance and the tree is flattened with ``fcluster`` at threshold 0.
    Any resulting group with more than ``dim_limit`` members has its members
    relabelled by ``sample_multinomial``.

    Returns the 0-based label array.
    """
    tree = hi.linkage(z_all.T, 'single', 'hamming')
    # fcluster labels start at 1; shift them so they start at 0
    z = hi.fcluster(tree, 0) - 1
    for label in np.unique(z):
        group_size = np.sum(label == z)
        if group_size > dim_limit:
            z[label == z] = sample_multinomial([1.] * group_size, group_size, dim_limit)
    return z
def plotDendrogram(self, matrix, axis, clusteringThreshold, orientation):
    """Draw a dendrogram for ``matrix`` and return flat clusters and leaf order.

    ``clusteringThreshold`` is a fraction of the largest merge distance in the
    tree; the tree is cut at that distance. ``axis`` is not used here.

    NOTE(review): the square pairwise-distance matrix is handed to
    ``linkage`` together with a metric, so its rows are presumably clustered
    as observations rather than used as precomputed distances -- confirm this
    is intended.

    Returns a ``(flat cluster labels, dendrogram leaf order)`` pair.
    """
    pairwise = dist.pdist(matrix)
    square = dist.squareform(pairwise)
    tree = cluster.linkage(square, method='average', metric='cityblock')
    # Every link is drawn in black.
    drawn = cluster.dendrogram(tree, orientation=orientation, link_color_func=lambda k: 'k')
    cutoff = clusteringThreshold * max(tree[:,2])
    index = cluster.fcluster(tree, cutoff, 'distance')

    return index, drawn['leaves']
def cluster(self, threshold=0.5, method="average"):
    """Cluster the entries of ``self.names`` from ``self.distance_mat``.

    With more than one name, sets ``self.linkage_mat``, ``self.labels`` and
    ``self.linkage_clusters``, merging every linkage step whose distance is at
    most ``threshold``. With a single name, a one-member cluster is recorded.
    In both cases ``self.clusters`` is filled with ``member_idx`` and
    ``member_names`` per cluster, and ``self.get_cluster_names()`` is called
    at the end.
    """
    if len(self.names) > 1:
        condensed = squareform(self.distance_mat)
        self.linkage_mat = linkage(condensed, method)
        # flat cluster label per entry
        self.labels = fcluster(self.linkage_mat, threshold, criterion="distance")

        # Start with one cluster per leaf, then replay the merges that stay
        # at or below the threshold.
        self.linkage_clusters = {leaf: [leaf] for leaf in range(self.n)}
        for step, merge in enumerate(self.linkage_mat):
            left = int(merge[0])
            right = int(merge[1])
            merged_id = self.n + step
            height = merge[2]

            if height <= threshold:
                members = self.linkage_clusters[left] + self.linkage_clusters[right] + [merged_id]
                self.linkage_clusters[merged_id] = members
                del self.linkage_clusters[left]
                del self.linkage_clusters[right]

        # Keep only leaf ids (ids below self.n) as members and attach names.
        for cluster_id in self.linkage_clusters:
            leaf_ids = [idx for idx in self.linkage_clusters[cluster_id] if idx < self.n]
            self.clusters[cluster_id] = {"member_idx": leaf_ids}
            self.clusters[cluster_id]["member_names"] = [self.names[idx] for idx in self.clusters[cluster_id]["member_idx"]]
    else:
        # only one TF
        self.linkage_clusters = {0:[0]}
        self.linkage_mat = np.array([[0]])
        self.clusters[0] = {"member_idx":[0]}
        self.clusters[0]["member_names"] = [self.names[idx] for idx in self.clusters[0]["member_idx"]]

    # Set names of clusters
    self.get_cluster_names()
def cluster(self, threshold=0.5, metric = "pcc", clust_method = "average"):
		# Scores all motifs in `self` against each other with gimmemotifs,
		# clusters them hierarchically and groups them by flat-cluster label.
		# NOTE(review): this definition appears damaged -- the line below is a
		# leftover of a docstring whose delimiters and remaining text are
		# missing, and the final `for` loop has no body, so the block does not
		# parse and `cluster_dict` is returned with empty MotifList values.
			A dictionary with keys=cluster names and values=MotifList objects

		#Needs gimmemotif
		from gimmemotifs.motif import Motif
		from gimmemotifs.comparison import MotifComparer
		sns.set_style("ticks")	#set style back to ticks, as this is set globally during gimmemotifs import

		#Fill in self.gimme_obj variable
		motif_list = [motif.get_gimmemotif().gimme_obj for motif in self]	#list of gimmemotif objects

		#Similarities between all motifs
		mc = MotifComparer()
		score_dict = mc.get_all_scores(motif_list, motif_list, match = "total", metric = metric, combine = "mean")   #metric can be: seqcor, pcc, ed, distance, wic, chisq, akl or ssd
		self.similarity_matrix = generate_similarity_matrix(score_dict)

		# Clustering
		# NOTE(review): squareform() condenses the matrix as if it held
		# distances; presumably generate_similarity_matrix already returns a
		# distance-like matrix -- verify.
		vector = ssd.squareform(self.similarity_matrix.to_numpy())
		self.linkage_mat = linkage(vector, method=clust_method)

		# Flatten clusters
		fclust_labels = fcluster(self.linkage_mat, threshold, criterion="distance")			#cluster membership per motif
		formatted_labels = ["Cluster_{0}".format(label) for label in fclust_labels]

		# Extract motifs belonging to each cluster
		cluster_dict = {label: MotifList() for label in formatted_labels}	#initialize dictionary
		for i, cluster_label in enumerate(formatted_labels):

		return cluster_dict 
def chc(X, K, params=()):
    """Hierarchically cluster ``X`` into at most ``K`` flat clusters.

    Parameters
    ----------
    X : array-like
        Data passed to the linkage routine.
    K : int
        Maximum number of flat clusters (``criterion='maxclust'``).
    params : sequence or numpy.ndarray
        Optional name/value argument pairs overriding ``linkage_method``
        (default ``'ward'``) and ``distance`` (default ``'euclidean'``);
        resolved by ``ds.resolveargumentpairs``.

    Returns
    -------
    The result of ``clustVec2partMat`` applied to the flat-cluster labels.
    """
    pnames = ['linkage_method',  'distance']
    dflts  = [          'ward', 'euclidean']
    # Only ndarray input needs converting; without the ``else`` the converted
    # list was immediately overwritten by the raw array.
    if isinstance(params, np.ndarray):
        paramsloc = params.tolist()
    else:
        paramsloc = params
    (linkage_method, distance) = ds.resolveargumentpairs(pnames, dflts, paramsloc)

    Z = sphc.linkage(X, method=linkage_method, metric=distance)
    C = sphc.fcluster(Z, K, criterion='maxclust')
    return clustVec2partMat(C, K)

# Other related functions 
def __init__(self, data, method='single', metric='euclidean'):
    """
    Combines the assets in `data` using HRP
    returns an object with the following attributes:
        - 'cov': covariance matrix of the returns
        - 'corr': correlation matrix of the returns
        - 'sort_ix': list of sorted column names according to cluster
        - 'link': linkage matrix of size (N-1)x4 with structure Y=[{y_m,1  y_m,2  y_m,3  y_m,4}_m=1,N-1].
                  At the i-th iteration, clusters with indices link[i, 0] and link[i, 1] are combined to form
                  cluster n+1. A cluster with an index less than n corresponds to one of the original observations.
                  The distance between clusters link[i, 0] and link[i, 1] is given by link[i, 2]. The fourth value
                  link[i, 3] represents the number of original observations in the newly formed cluster.
        - 'weights': final weights for each asset

    :param data: pandas DataFrame where each column is a series of returns
    :param method: any method available in scipy.cluster.hierarchy.linkage
    :param metric: any metric available in scipy.cluster.hierarchy.linkage
    """
    assert isinstance(data, pd.DataFrame), "input 'data' must be a pandas DataFrame"

    self.cov = data.cov()
    self.corr = data.corr()
    self.method = method
    self.metric = metric
    # NOTE(review): the original two lines were garbled (``self.metric`` was
    # rebound to the clustering result and the ``_get_quasi_diag`` call was
    # left unclosed). They are restored so that ``link`` holds the linkage
    # matrix, as the attribute list above documents -- confirm that
    # ``_get_quasi_diag`` takes the linkage matrix as its only argument.
    self.link = self._tree_clustering(self.corr, self.method, self.metric)
    self.sort_ix = self._get_quasi_diag(self.link)
    self.sort_ix = self.corr.index[self.sort_ix].tolist()  # recover labels
    self.sorted_corr = self.corr.loc[self.sort_ix, self.sort_ix]  # reorder correlation matrix
    self.weights = self._get_recursive_bisection(self.cov, self.sort_ix)
    # TODO self.cluster_nember = sch.fcluster(, t=5, criterion='maxclust')
def plotDendrogram(self, matrix, axis, clusteringThreshold, orientation):
    """Draw a dendrogram for ``matrix`` and return flat clusters and leaf order.

    ``clusteringThreshold`` is a fraction of the largest merge distance in the
    tree; the tree is cut at that distance. ``axis`` is not used here.

    NOTE(review): the square pairwise-distance matrix is handed to
    ``linkage`` together with a metric, so its rows are presumably clustered
    as observations rather than used as precomputed distances -- confirm this
    is intended.

    Returns a ``(flat cluster labels, dendrogram leaf order)`` pair.
    """
    pairwise = dist.pdist(matrix)
    square = dist.squareform(pairwise)
    tree = cluster.linkage(square, method='average', metric='cityblock')
    # Every link is drawn in black.
    drawn = cluster.dendrogram(tree, orientation=orientation, link_color_func=lambda k: 'k')
    cutoff = clusteringThreshold * max(tree[:,2])
    index = cluster.fcluster(tree, cutoff, 'distance')

    return index, drawn['leaves']
def _normalize_col(self, df, ci):
        # NOTE(review): this definition appears damaged -- the prose lines below
        # are a docstring whose delimiters are missing, and two statements
        # (`col =` and `df =, new_col, col_name)`) have lost part of their
        # right-hand side, so the block does not parse as written. The missing
        # expressions cannot be recovered from this file.
        Normalizing column in given dataframe

        :param df: input dataframe
        :param ci: column name

        :return: normalized dataframe with respect to given column

        col_name = ci.col_name
        col =

        # Encode every non-None value as UTF-8; None becomes ''.
        col = [row[col_name].encode('utf-8', 'replace')
               if row[col_name] is not None else ''for row in col]

        distinct = list(set(col))

        # Leave the dataframe untouched when there are too many distinct values
        # or at most one.
        if len(distinct) > self.max_distinct or len(distinct) <= 1:
            return df

        similarity = self._compute_distances(distinct, ci.distance_fcn)

        z = linkage(similarity)

        # Cut the tree at the per-column distance threshold.
        labels = fcluster(z, ci.threshold, 'distance')

        # sets up map from value to most common value in that cluster
        clusters = self._get_exemplars(col, labels, distinct)

        new_col = [clusters[val][0] for val in col]

        df =, new_col, col_name)

        return df 
def find_correlation_clusters(corr,corr_thresh):
    """Label the variables of a correlation matrix by single-linkage clustering.

    The dissimilarity ``1 - corr`` is condensed and clustered with single
    linkage; the tree is cut at ``1 - corr_thresh``.

    Returns the array of flat-cluster labels, one per row of ``corr``.
    """
    condensed = squareform(1.0 - corr)
    tree = linkage(condensed, method='single')
    cut_height = 1.0 - corr_thresh
    return fcluster(tree, cut_height, criterion='distance')
def _cluster_breakdown(self):
    """
    Creates clusters from the portfolio (doesn't create sub portfolios, but tells you which ones to make)

    Credit to a notebook whose link is not preserved in this file.

    :return: list of int same length as instruments
    """
    X = self.corr_matrix.values
    # Pairwise distances between the rows of the correlation matrix.
    d = sch.distance.pdist(X)
    L = sch.linkage(d, method='complete')
    # With criterion='maxclust' the constant is used as the maximum number of
    # flat clusters.
    ind = sch.fcluster(L, MAX_CLUSTER_SIZE, criterion='maxclust')

    return list(ind)
def clusters_from_partitions(partitions, options):
    """Finds clusters in partitions using span-position distance and hierarchical clustering. 
    Assumes that all signatures in the given partition are of the same type and on the same contig"""
    # NOTE(review): this definition appears truncated -- the
    # `if len(partition) == 1:` branch and the final `for signature_index ...`
    # loop have no bodies, and `partition_sample = partition` presumably
    # belonged to a lost `else:` branch. As written the block does not parse
    # and nothing visible is ever added to `clusters_final`.
    clusters_final = []
    large_partitions = 0
    # Find clusters in each partition individually.
    for partition in partitions:
        if len(partition) == 1:
        elif len(partition) > 100:
            # Partitions with more than 100 elements are randomly down-sampled to 100.
            partition_sample = sample(partition, 100)
            large_partitions += 1
            partition_sample = partition
        # The element type of the first signature selects the distance function.
        element_type = partition_sample[0].type
        if element_type == "DEL" or element_type == "INV" or element_type == "DUP_TAN":
            data = np.array( [[signature.get_source()[1], signature.get_source()[2], options.distance_normalizer] for signature in partition_sample])
            Z = linkage(data, method = "average", metric = span_position_distance)
        elif element_type == "INS":
            data = np.array( [[signature.get_source()[1], signature.get_source()[2], options.distance_normalizer] for signature in partition_sample])
            Z = linkage(data, method = "average", metric = span_position_distance_insertions)
        elif element_type == "DUP_INT":
            data = np.array( [[signature.get_source()[1], signature.get_source()[2], signature.get_destination()[1], options.distance_normalizer] for signature in partition_sample])
            Z = linkage(data, method = "average", metric = span_position_distance_intdups)

        # NOTE(review): `Z` is unbound here for any other element type.
        cluster_indices = list(fcluster(Z, options.cluster_max_distance, criterion='distance'))
        # One empty list per flat-cluster label.
        new_clusters = [[] for i in range(max(cluster_indices))]
        for signature_index, cluster_index in enumerate(cluster_indices):
    if len(partitions) > 0:
        if len(partitions[0]) > 0:
            logging.debug("%d out of %d partitions for %s exceeded 100 elements." % (large_partitions, len(partitions), partitions[0][0].type))
    return clusters_final 
def signal_clustering(corr_matrix:      DataFrame,
                      threshold:        float,
                      cluster_pickle:   str = "",
                      linkage_pickle:   str = "",
                      force:            bool = False):
    # Single-linkage clustering of the rows of a correlation matrix, with
    # reuse of previously pickled results when both pickle files exist.
    # NOTE(review): this definition appears truncated -- the two
    # `if path.isfile(...)` checks under `if force:` have no bodies, and the
    # `cluster_dict` loop at the end is missing lines, so the block does not
    # parse as written.
    if force:
        if path.isfile(cluster_pickle):
        if path.isfile(linkage_pickle):
    if path.isfile(cluster_pickle) and path.isfile(linkage_pickle):
        print("\nSignal clustering already completed and forcing is turned off. Using pickled data...")
        # NOTE(review): this path returns a list while the computed path below
        # returns a tuple, and the files opened here are never closed.
        # `load` is presumably pickle.load -- only use it on trusted files.
        return [load(open(cluster_pickle, "rb")), load(open(linkage_pickle, "rb"))]

    # Remove negative values from the correlation matrix and invert the values
    # NOTE: `inplace=True` modifies the DataFrame passed in by the caller.
    corr_matrix.where(corr_matrix > 0, 0, inplace=True)
    corr_matrix = 1 - corr_matrix
    X = corr_matrix.values  # type: ndarray
    # Condense to the upper triangle and clamp negative entries to 0.
    Y = clip(ssd.squareform(X), 0, None)
    # Z is the linkage matrix. This can serve as input to the scipy.cluster.hierarchy.dendrogram method
    Z = linkage(Y, method='single', optimal_ordering=True)
    fclus = fcluster(Z, t=threshold, criterion='distance')
    cluster_dict = {}
    for i, cluster_label in enumerate(fclus):
        # NOTE(review): as written, a label already present is overwritten and
        # a new label is never added; presumably append/else lines were lost.
        if cluster_label in cluster_dict:
            cluster_dict[cluster_label] = [corr_matrix.index[i]]
    return cluster_dict, Z 
def remove_correlated_feats(df):
    # Clusters the rows of `df` by Spearman correlation and returns the rows
    # whose index is in `indices_to_keep`.
    # NOTE(review): this definition appears truncated -- the `np.subtract(`
    # call below is never closed (its second argument is missing), and nothing
    # visible ever adds `centroid` to `indices_to_keep`, so the block does not
    # parse as written.
    tmp = df.T
    # Remove columns with no variation
    nunique = tmp.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index
    tmp.drop(cols_to_drop, axis=1, inplace=True)

    perc_spearman = scipy.stats.spearmanr(tmp)
    abs_corr = np.subtract(np.ones(shape=perc_spearman.correlation.shape),
    np.fill_diagonal(abs_corr, 0)
    abs_corr_clean = np.maximum(abs_corr,
                                abs_corr.transpose())  # some floating point mismatches, just make symmetric
    # Average-linkage clustering, cut at distance 0.1.
    clustering = linkage(squareform(abs_corr_clean), method='average')
    clusters = fcluster(clustering, .1, criterion='distance')
    names = tmp.columns.tolist()
    names_to_cluster = list(zip(names, clusters))
    indices_to_keep = []
    ### Extract models closest to cluster centroids
    for x in range(1, len(set(clusters)) + 1):
        # Create mask from the list of assignments for extracting submatrix of the cluster
        mask = np.array([1 if i == x else 0 for i in clusters], dtype=bool)

        # Take the index of the column with the smallest sum of distances from the submatrix
        idx = np.argmin(sum(abs_corr_clean[:, mask][mask, :]))

        # Extract names of cluster elements from names_to_cluster
        sublist = [name for (name, cluster) in names_to_cluster if cluster == x]

        # Element closest to centroid
        centroid = sublist[idx]

    return df.loc[df.index.isin(indices_to_keep)] 
def fit_predict(self, X, y=None):
    """Cluster ``X`` hierarchically and return 0-based flat-cluster labels.

    The linkage matrix, built with ``self.method`` and ``self.metric``, is
    stored on ``self.model``. The tree is cut into at most
    ``self.n_clusters`` clusters. ``y`` is ignored.
    """
    tree = linkage(X, method=self.method, metric=self.metric)
    self.model = tree
    one_based = fcluster(tree, t=self.n_clusters, criterion='maxclust')
    # fcluster labels start at 1
    return one_based - 1
def _offline_clustering(self, X):
    """Cluster ``X`` offline and hand the labels to the representative extractor.

    Pairwise distances use ``self._distance_metric``; the complete-linkage
    tree is cut at ``self.max_dist``. The labels are passed to
    ``self._extract_representatives`` and progress is printed.
    """
    print('Starting offline clustering...')
    condensed = pdist(X, metric=self._distance_metric)
    tree = linkage(condensed, 'complete')
    flat_labels = fcluster(tree, self.max_dist, criterion='distance')
    self._extract_representatives(X, flat_labels)
    print('Processed {} instances.'.format(X.shape[0]))
    print('Found {} clusters offline.\n'.format(len(self.representatives)))
def get_word_clusters(sentence_dict):
    # Clusters source ids by the TF-IDF vectors of their sentences.
    # NOTE(review): this definition appears truncated -- the inner
    # `for metadata_type, sent ...` loop and the final `for srcid, cluster_id`
    # loop have no bodies, nothing visible appends to `sentences`, and the
    # `z = linkage(...)` line is indented one level too deep, so the block
    # does not parse as written.
    srcids = list(sentence_dict.keys())
    sentences = []
    for srcid in srcids:
        sentence = []
        for metadata_type, sent in sentence_dict[srcid].items():
        sentence = '\n'.join(sentence)
        # Keep only runs of lowercase ASCII letters, joined by single spaces.
        sentence = ' '.join(re.findall('[a-z]+', sentence))
    vect = TfidfVectorizer()
    #vect = CountVectorizer()
    bow = vect.fit_transform(sentences).toarray()
        z = linkage(bow, metric='cityblock', method='complete')
    # NOTE(review): a set has no defined order, so `dists[2]` and `dists[3]`
    # are not guaranteed to be the 3rd and 4th smallest merge distances.
    dists = list(set(z[:,2]))
    thresh = (dists[2] + dists[3]) /2
    #thresh = (dists[1] + dists[2]) /2
    print("Threshold: ", thresh)
    b = hier.fcluster(z,thresh, criterion='distance')
    cluster_dict = defaultdict(list)

    for srcid, cluster_id in zip(srcids, b):
    return dict(cluster_dict) 
def cluster_from_dist_matrix(dist_matrix, threshold):
    """Use scipy to cluster a distance matrix.

    Args:
        dist_matrix: distance matrix, represented in scipy's 1d condensed form
        threshold: maximum inter-cluster distance to merge clusters (higher
            results in fewer clusters)

    Returns:
        list c such that c[i] is a collection of all the observations
        (whose pairwise distances are indexed in dist) in the i'th
        cluster, in sorted order by descending cluster size
    """
    linkage = hierarchy.linkage(dist_matrix, method='average')
    clusters = hierarchy.fcluster(linkage, threshold, criterion='distance')

    # clusters are numbered starting at 1, but base the count on
    # first_clust_num just in case this changes
    first_clust_num = min(clusters)
    num_clusters = max(clusters) + 1 - first_clust_num
    # Collect the observation indices that share a cluster number.
    elements_in_cluster = defaultdict(list)
    for i, clust_num in enumerate(clusters):
        elements_in_cluster[clust_num].append(i)
    cluster_sizes = {c: len(elements_in_cluster[c])
                     for c in range(first_clust_num,
                                    num_clusters + first_clust_num)}

    # Largest cluster first; the sort is stable, so equal-sized clusters keep
    # ascending cluster-number order.
    elements_in_cluster_sorted = []
    for clust_num, _ in sorted(cluster_sizes.items(),
            key=operator.itemgetter(1), reverse=True):
        elements_in_cluster_sorted += [elements_in_cluster[clust_num]]
    return elements_in_cluster_sorted
def cluster_correlations(corr_mat, indices=None):
    """
    Apply agglomerative clustering in order to sort
    a correlation matrix.

    Based on an external reference whose link is not preserved in this file.

    Parameters:
    - corr_mat : a square correlation matrix (pandas DataFrame)
    - indices : cluster labels [None]; if not provided we'll do
        an agglomerative clustering to get cluster labels.

    Returns:
    - corr : a sorted correlation matrix
    - indices : cluster indexes based on the original dataset

    Example:
    >> assoc = associations(...)
    >> correlations = assoc['corr']
    >> correlations, _ = cluster_correlations(correlations)
    """
    if indices is None:
        X = corr_mat.values
        d = sch.distance.pdist(X)
        L = sch.linkage(d, method='complete')
        # Cut the tree at half of the largest pairwise distance.
        indices = sch.fcluster(L, 0.5 * d.max(), 'distance')
    # Reorder rows and columns so members of the same cluster are adjacent.
    columns = [corr_mat.columns.tolist()[i]
               for i in list((np.argsort(indices)))]
    corr_mat = corr_mat.reindex(columns=columns).reindex(index=columns)
    return corr_mat, indices
def clusterdistmat(distmatrixfile,sim,dcut,mode='average',plot=False):
	# Clusters the squared entries of `sim`, writes one "group id / selection
	# flag" line per item to `f`, and returns 0-based cluster labels plus the
	# linkage matrix.
	# NOTE(review): this definition appears truncated -- `f` is never opened in
	# the visible code (presumably from `distmatrixfile`), `iselect` is only
	# assigned for representatives, and `if plot:` has no body. It also uses
	# the Python 2 print statement, so it does not parse under Python 3.
	# Compute the clustering on dist^2 so that the average 
	# distance of a cluster with another is the RMS distance
	sim2 = sim*sim
	Z = sc.linkage(sim2,mode)

	# Third column of the linkage matrix: the merge distances.
	cdist = Z[:,2]
	# get the full tree
	# dendo = sc.dendrogram(Z)
	# clist = dendo['leaves']
   	nclust = cluster.estimate_ncluster(cdist,dcut)

	clist = sc.fcluster(Z,nclust,criterion='maxclust')
	c_count = Counter(clist)
	nbclst = len(c_count)

	print "Number of clusters", nbclst 
	rep_ind = getrep_ind(sim2,clist,c_count)

	# Write the group indices and representatives
	f.write(" # groupid representative \n ")
	for i in range(len(sim)):
		if i in rep_ind: iselect=2
		f.write("%d   %d \n " %(clist[i]-1,  iselect)) 

	if plot: 
	c_list = np.zeros(len(sim))

	# Change cluster groups numbering to (0:n-1)
	for i in range(len(sim)):
		c_list[i] = int(clist[i]-1)

	return c_list,Z

# Determine the representative element of each cluster group 
Example #27
        """Fit estimators.

        X : numpy array of shape (n_samples, n_features)
            The input samples.

        # Validate inputs X
        X = check_array(X)
        n_samples = X.shape[0]

        # initialize similarity matrix
        sim_mat_all = np.zeros([n_samples, n_samples])

        if self.pre_fitted:
            print("Training Skipped")

            for clf in self.base_estimators:
                clf.fitted_ = True

        for i, estimator in enumerate(self.base_estimators):
            check_is_fitted(estimator, ['labels_'])

            # get the labels from each base estimator
            labels = estimator.labels_.reshape(n_samples, 1)

            # generate the similarity matrix for the current estimator
            sim_mat = _generate_similarity_mat(labels)

            # add to the main similarity mat
            sim_mat_all = sim_mat_all + sim_mat

        # get the average of the similarity mat
        sim_mat_avg = np.divide(sim_mat_all, self.n_base_estimators_)

        # flip the similarity. smaller value implies more similarity
        sim_mat_avg = np.abs(np.max(sim_mat_avg) - sim_mat_avg)

        # build clusters
        self.Z_ = linkage(sim_mat_avg, method=self.linkage_method)
        self.labels_ = fcluster(self.Z_, self.n_clusters, criterion='maxclust')

        # it may leads to different number of clusters as specified by the user
        if len(np.unique(self.labels_)) != self.n_clusters:
                'EAC generates {n} clusters instead of {n_clusters}'.format(

        return self 
def clean_data(self, keep_features=None, remove_correlated_feats=True):
        # Cleans `self.features_df` in place: casts to float, fills NaN with 0,
        # keeps rows with at least one positive value, optionally restricts to
        # `keep_features`, and optionally filters by correlation clustering.
        # NOTE(review): the correlation branch appears truncated -- the
        # `np.subtract(` call is never closed (its second argument is missing)
        # and nothing visible ever adds `centroid` to `indices_to_keep`, so the
        # block does not parse as written.
        self.features_df = self.features_df.astype(float).fillna(0)
        self.features_df = self.features_df.loc[(self.features_df > 0).any(axis=1)]

        if keep_features:
            self.features_df = self.features_df.loc[self.features_df.index.isin(keep_features)]

        if remove_correlated_feats:
            tmp = self.features_df.T

            # Remove columns with no variation
            nunique = tmp.apply(pd.Series.nunique)
            cols_to_drop = nunique[nunique == 1].index
            tmp.drop(cols_to_drop, axis=1, inplace=True)

            perc_spearman = scipy.stats.spearmanr(tmp)
            abs_corr = np.subtract(np.ones(shape=perc_spearman.correlation.shape),
            np.fill_diagonal(abs_corr, 0)
            abs_corr_clean = np.maximum(abs_corr,
                                        abs_corr.transpose())  # some floating point mismatches, just make symmetric
            # Average-linkage clustering, cut at distance 0.1.
            clustering = linkage(squareform(abs_corr_clean), method='average')
            clusters = fcluster(clustering, .1, criterion='distance')
            names = tmp.columns.tolist()
            names_to_cluster = list(zip(names, clusters))
            indices_to_keep = []
            ### Extract models closest to cluster centroids
            for x in range(1, len(set(clusters)) + 1):
                # Create mask from the list of assignments for extracting submatrix of the cluster
                mask = np.array([1 if i == x else 0 for i in clusters], dtype=bool)

                # Take the index of the column with the smallest sum of distances from the submatrix
                idx = np.argmin(sum(abs_corr_clean[:, mask][mask, :]))

                # Extract names of cluster elements from names_to_cluster
                sublist = [name for (name, cluster) in names_to_cluster if cluster == x]

                # Element closest to centroid
                centroid = sublist[idx]

            self.features_df = self.features_df.loc[self.features_df.index.isin(indices_to_keep)] 
Example #29
        # Overwriting global settings, if necessary
        if 'attribute_distance_threshold' in kwargs:
            self.attribute_distance_threshold = kwargs['attribute_distance_threshold']

        # Make sure that the settings are still valid

        m = self.nes_binary[:, self.attributes['top']].T
        Z = linkage(m, method='average', metric=self.attribute_distance_metric)
        max_d = np.max(Z[:, 2] * self.attribute_distance_threshold)
        domains = fcluster(Z, max_d, criterion='distance')

        self.attributes['domain'] = 0
        self.attributes.loc[self.attributes['top'], 'domain'] = domains

        # Assign nodes to domains
        node2nes = pd.DataFrame(data=self.nes,
                                    columns=[self.attributes.index.values, self.attributes['domain']])
        node2nes_binary = pd.DataFrame(data=self.nes_binary,
                                           columns=[self.attributes.index.values, self.attributes['domain']])

        # # A node belongs to the domain that contains the attribute
        # for which the node has the highest enrichment
        # self.node2domain = node2es.groupby(level='domain', axis=1).max()
        # t_max = self.node2domain.loc[:, 1:].max(axis=1)
        # t_idxmax = self.node2domain.loc[:, 1:].idxmax(axis=1)
        # t_idxmax[t_max < -np.log10(self.enrichment_threshold)] = 0

        # A node belongs to the domain that contains the highest number of attributes
        # for which the nodes is significantly enriched
        self.node2domain = node2nes_binary.groupby(level='domain', axis=1).sum()
        t_max = self.node2domain.loc[:, 1:].max(axis=1)
        t_idxmax = self.node2domain.loc[:, 1:].idxmax(axis=1)
        t_idxmax[t_max == 0] = 0

        self.node2domain['primary_domain'] = t_idxmax

        # Get the max NES for the primary domain
        o = node2nes.groupby(level='domain', axis=1).max()
        i = pd.Series(t_idxmax)
        self.node2domain['primary_nes'] = o.lookup(i.index, i.values)

        if self.verbose:
            num_domains = len(np.unique(domains))
            num_attributes_per_domain = self.attributes.loc[self.attributes['domain'] > 0].groupby('domain')['id'].count()
            min_num_attributes = num_attributes_per_domain.min()
            max_num_attributes = num_attributes_per_domain.max()
            print('Number of domains: %d (containing %d-%d attributes)' %
                  (num_domains, min_num_attributes, max_num_attributes))