Python scipy.cluster Examples

The following are 21 code examples of the scipy.cluster module, collected from open-source projects. The project and source file each example was taken from are noted above it. You may also want to check out the other available functions and classes of the scipy module.
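Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of the two scipy.cluster sub-packages they rely on: scipy.cluster.hierarchy for agglomerative clustering and scipy.cluster.vq for k-means; the data and parameter values are purely illustrative.

import numpy as np
from scipy.cluster import hierarchy, vq

rng = np.random.default_rng(0)
points = rng.normal(size=(20, 3))                 # 20 observations, 3 features

# Agglomerative clustering: build a linkage matrix, then cut it at a distance threshold
Z = hierarchy.linkage(points, method="average", metric="euclidean")
labels = hierarchy.fcluster(Z, t=1.5, criterion="distance")

# K-means via vector quantization: whiten, find centroids, then assign each observation
whitened = vq.whiten(points)
centroids, distortion = vq.kmeans(whitened, 3)
cluster_ids, dists = vq.vq(whitened, centroids)

print(labels, cluster_ids)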
Example #1
Source File: signal_recompose.py    From NeuroKit with MIT License
def _signal_recompose_sum(components, clusters):
    # Reorient components
    components = components.T

    # Reconstruct Time Series from correlated components
    clusters = [np.where(clusters == cluster)[0] for cluster in np.unique(clusters)]

    if len(clusters) == 0:
        raise ValueError("Not enough clusters of components detected. Please decrease the `threshold`.")
    # Initialize the recomposed matrix (one column per cluster of components)
    recomposed = np.zeros((len(components), len(clusters)))
    for i, indices in enumerate(clusters):
        recomposed[:, i] = components[:, indices].sum(axis=1)
    return recomposed.T


# =============================================================================
# Clustering Methods
# =============================================================================

# Weighted Correlation
# ---------------------------------------------------------------------------- 
Example #2
Source File: subroutines.py    From SigProfilerExtractor with BSD 2-Clause "Simplified" License
def dendrogram(data, threshold, layer_directory):
    colnames = data.columns
    data = np.array(data)

    Z = hierarchy.linkage(data.T, 'single',  'cosine')
    plt.figure(figsize=(15, 9))
    dn = hierarchy.dendrogram(Z, labels = colnames, color_threshold=threshold)
    plt.title("Clustering of Samples Based on Mutational Signatures" )
    plt.ylabel("Cosine Distance")
    plt.xlabel("Sample IDs")
    #plt.ylim((0,1))
    plt.savefig(layer_directory + '/dendrogram.pdf', dpi=300)  # figure size is set by plt.figure above; savefig takes no figsize argument
    # which data points go to which cluster:
    # the indices of the data points are used as the ids
    Y = hierarchy.fcluster(Z, threshold, criterion='distance', R=None, monocrit=None)
    dataframe = pd.DataFrame({"Cluster":Y, "Sample Names":list(colnames)})
    dataframe = dataframe.set_index("Sample Names")
    #print(dataframe)
    dictionary = {"clusters":Y, "informations":dn}
    
    return dataframe 


######################################## Plot the reconstruction error vs stabilities and select the optimum number of signature #################################################### 
Example #3
Source File: diarizationFunctions.py    From pyBK with MIT License
def performClusteringLinkage(segmentBKTable, segmentCVTable, N_init, linkageCriterion,linkageMetric ):
    from scipy.cluster.hierarchy import linkage
    from scipy import cluster
    if linkageMetric == 'jaccard':
      observations = segmentBKTable
    elif linkageMetric == 'cosine':
      observations = segmentCVTable
    else:
      observations = segmentCVTable      
    clusteringTable = np.zeros([np.size(segmentCVTable,0),N_init]) 
    Z = linkage(observations,method=linkageCriterion,metric=linkageMetric)
    for i in np.arange(N_init):
      clusteringTable[:,i] = cluster.hierarchy.cut_tree(Z,N_init-i).T+1  
    k=N_init
    print('done')
    return clusteringTable, k 
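As a point of reference for the loop above, this small standalone sketch (not pyBK code, illustrative data only) shows what scipy.cluster.hierarchy.cut_tree returns for a single requested number of clusters; the +1 mirrors the 1-based labels stored in clusteringTable.

import numpy as np
from scipy.cluster import hierarchy

obs = np.random.default_rng(1).normal(size=(10, 4))
Z = hierarchy.linkage(obs, method='average', metric='cosine')

# cut_tree returns an (n_observations, 1) array of 0-based labels for a scalar n_clusters
labels_3 = hierarchy.cut_tree(Z, n_clusters=3).flatten() + 1
labels_2 = hierarchy.cut_tree(Z, n_clusters=2).flatten() + 1
print(labels_3, labels_2)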
Example #4
Source File: utils.py    From epiScanpy with BSD 3-Clause "New" or "Revised" License
def hierarch_cluster(M):
    """Cluster matrix using hierarchical clustering.

    Parameters
    ----------
    M : np.ndarray
        Matrix, for example, distance matrix.

    Returns
    -------
    Mclus : np.ndarray
        Clustered matrix.
    indices : np.ndarray
        Indices used to cluster the matrix.
    """
    import scipy as sp
    import scipy.cluster
    link = sp.cluster.hierarchy.linkage(M)
    indices = sp.cluster.hierarchy.leaves_list(link)
    Mclus = np.array(M[:, indices])
    Mclus = Mclus[indices, :]
    if False:
        pl.matshow(Mclus)
        pl.colorbar()
    return Mclus, indices 
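For orientation, a minimal standalone sketch (not epiScanpy code, toy data only) of the same leaves_list reordering that hierarch_cluster performs, applied to a small pairwise-distance matrix.

import numpy as np
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist, squareform

X = np.random.default_rng(2).normal(size=(6, 3))
M = squareform(pdist(X))          # symmetric distance matrix, as in the docstring above
link = sch.linkage(M)             # same call as above; linkage treats M's rows as observation vectors
order = sch.leaves_list(link)     # dendrogram leaf order
Mclus = M[np.ix_(order, order)]   # reorder rows and columns together
print(order)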
Example #5
Source File: utils.py    From epiScanpy with BSD 3-Clause "New" or "Revised" License
def compute_group_overlap_score(ref_labels, pred_labels,
                                threshold_overlap_pred=0.5,
                                threshold_overlap_ref=0.5):
    """How well do the pred_labels explain the ref_labels?

    A predicted cluster explains a reference cluster if it is contained within the reference
    cluster with at least 50% (threshold_overlap_pred) of its points and these correspond
    to at least 50% (threshold_overlap_ref) of the reference cluster.
    """
    ref_unique, ref_counts = np.unique(ref_labels, return_counts=True)
    ref_dict = dict(zip(ref_unique, ref_counts))
    pred_unique, pred_counts = np.unique(pred_labels, return_counts=True)
    pred_dict = dict(zip(pred_unique, pred_counts))
    summary = []
    for true in ref_unique:
        sub_pred_unique, sub_pred_counts = np.unique(pred_labels[true == ref_labels], return_counts=True)
        relative_overlaps_pred = [sub_pred_counts[i] / pred_dict[n] for i, n in enumerate(sub_pred_unique)]
        relative_overlaps_ref = [sub_pred_counts[i] / ref_dict[true] for i, n in enumerate(sub_pred_unique)]
        pred_best_index = np.argmax(relative_overlaps_pred)
        summary.append(1 if (relative_overlaps_pred[pred_best_index] >= threshold_overlap_pred and
                             relative_overlaps_ref[pred_best_index] >= threshold_overlap_ref)
                       else 0)
        # print(true, sub_pred_unique[pred_best_index], relative_overlaps_pred[pred_best_index],
        #       relative_overlaps_ref[pred_best_index], summary[-1])
    return sum(summary)/len(summary) 
Example #6
Source File: dbscan.py    From link-prediction_with_deep-learning with MIT License
def process_options(args):    
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.eps <= 0.0:
        raise ValueError('eps must be > 0')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options 
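The whitening step above rescales every feature to unit variance before clustering. A short standalone sketch (illustrative data, not from this project) of what scipy.cluster.vq.whiten does:

import numpy as np
import scipy.cluster.vq

vectors = np.random.default_rng(3).normal(scale=[1.0, 10.0], size=(1000, 2))
whitened = scipy.cluster.vq.whiten(vectors)
print(vectors.std(axis=0))     # roughly [1, 10]
print(whitened.std(axis=0))    # roughly [1, 1] after whitening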
Example #7
Source File: cmag.py    From neuropythy with GNU Affero General Public License v3.0
def sigma_bin_walls(sigma, bins):
        import scipy, scipy.cluster, scipy.cluster.vq as vq
        std = np.std(sigma)
        if np.isclose(std, 0): return pimms.imm_array([0, np.max(sigma)])
        cl = sorted(std * vq.kmeans(sigma/std, bins)[0])
        cl = np.mean([cl[:-1],cl[1:]], axis=0)
        return pimms.imm_array(np.concatenate(([0], cl, [np.max(sigma)]))) 
Example #8
Source File: leveler.py    From Maybe-Useful-Cogs with MIT License
def _auto_color(self, url:str, ranks):
        phrases = ["Calculating colors..."] # in case I want more
        #try:
        await self.bot.say("**{}**".format(random.choice(phrases)))
        clusters = 10

        async with aiohttp.get(url) as r:
            image = await r.content.read()
        with open('data/leveler/temp_auto.png','wb') as f:
            f.write(image)

        im = Image.open('data/leveler/temp_auto.png').convert('RGBA')
        im = im.resize((290, 290)) # resized to reduce time
        ar = scipy.misc.fromimage(im)
        shape = ar.shape
        ar = ar.reshape(scipy.product(shape[:2]), shape[2])

        codes, dist = scipy.cluster.vq.kmeans(ar.astype(float), clusters)
        vecs, dist = scipy.cluster.vq.vq(ar, codes)         # assign codes
        counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences

        # sort counts
        freq_index = []
        index = 0
        for count in counts:
            freq_index.append((index, count))
            index += 1
        sorted_list = sorted(freq_index, key=operator.itemgetter(1), reverse=True)

        colors = []
        for rank in ranks:
            color_index = min(rank, len(codes) - 1)  # clamp to a valid index into sorted_list
            peak = codes[sorted_list[color_index][0]] # gets the original index
            peak = peak.astype(int)

            colors.append(''.join(format(c, '02x') for c in peak))
        return colors # returns array
        #except:
            #await self.bot.say("```Error or no scipy. Install scipy doing 'pip3 install numpy' and 'pip3 install scipy' or read here: https://github.com/AznStevy/Maybe-Useful-Cogs/blob/master/README.md```")

    # converts hex to rgb 
Example #9
Source File: kmeans.py    From link-prediction_with_deep-learning with MIT License
def write_cluster_ids(words, cluster_ids, out=None):
    """Write given list of words and their corresponding cluster ids to out."""

    assert len(words) == len(cluster_ids), 'word/cluster ids number mismatch'

    if out is None:
        out = sys.stdout
    for word, cid in izip(words, cluster_ids):
        print >> out, '%s\t%d' % (word, cid) 
Example #10
Source File: data_viewing.py    From lumin with Apache License 2.0
def plot_rank_order_dendrogram(df:pd.DataFrame, threshold:float=0.8, savename:Optional[str]=None, settings:PlotSettings=PlotSettings()) \
        -> Dict[str,Union[List[str],float]]:
    r'''
    Plots a dendrogram of features in df clustered via Spearman's rank correlation coefficient.
    Also returns the sets of features with correlation coefficients greater than the threshold.

    Arguments:
        df: Pandas DataFrame containing data
        threshold: Threshold on correlation coefficient
        savename: Optional name of file to which to save the plot of feature importances
        settings: :class:`~lumin.plotting.plot_settings.PlotSettings` class to control figure appearance

    Returns:
        Dict of sets of features with correlation coefficients greater than the threshold and cluster distance
    '''

    corr = np.round(scipy.stats.spearmanr(df).correlation, 4)
    corr_condensed = hc.distance.squareform(1 - np.abs(corr))  # abs because the negative of a feature is a trivial transformation: information unaffected
    z = hc.linkage(corr_condensed, method='average', optimal_ordering=True)

    with sns.axes_style('white'), sns.color_palette(settings.cat_palette):
        plt.figure(figsize=(settings.w_large, (0.5*len(df.columns))))
        hc.dendrogram(z, labels=df.columns, orientation='left', leaf_font_size=settings.lbl_sz, color_threshold=1-threshold)
        plt.xlabel("Distance (1 - |Spearman's Rank Correlation Coefficient|)", fontsize=settings.lbl_sz, color=settings.lbl_col)
        plt.xticks(fontsize=settings.tk_sz, color=settings.tk_col)
        if savename is not None: plt.savefig(settings.savepath/f'{savename}{settings.format}', bbox_inches='tight')
        plt.show()

    feats = df.columns
    sets = {}
    for i, merge in enumerate(z):
        if merge[2] > 1-threshold: continue
        if merge[0] <= len(z): a = [feats[int(merge[0])]]
        else:                  a = sets.pop(int(merge[0]))['children']
        if merge[1] <= len(z): b = [feats[int(merge[1])]]
        else:                  b = sets.pop(int(merge[1]))['children']
        sets[1 + i + len(z)] = {'children': [*a, *b], 'distance': merge[2]}
    return sets 
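A hedged usage sketch of the function above on toy data (it assumes lumin's module-level imports, such as numpy, pandas, scipy and the hc alias for scipy.cluster.hierarchy, are available): two nearly identical columns should come back as one correlated set, while the unrelated column is left out.

import numpy as np
import pandas as pd

rng = np.random.default_rng(4)
a = rng.normal(size=500)
df = pd.DataFrame({'a': a,
                   'b': a + 0.01 * rng.normal(size=500),   # almost a duplicate of 'a'
                   'c': rng.normal(size=500)})             # unrelated feature
sets = plot_rank_order_dendrogram(df, threshold=0.8)
print(sets)   # expect a single entry whose 'children' contain 'a' and 'b'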
Example #11
Source File: kmeans.py    From link-prediction_with_deep-learning with MIT License
def kmeans(vectors, k, jobs=1):
    vectors = numpy.array(vectors)
    if with_sklearn:
        if jobs == 1:
            kmeans = sklearn.cluster.KMeans(k)
        else:
            kmeans = sklearn.cluster.KMeans(k, n_jobs=jobs) # sklearn > 0.10
        kmeans.fit(vectors)
        return kmeans.labels_
    else:
        codebook, distortion = scipy.cluster.vq.kmeans(vectors, k)
        cluster_ids, dist = scipy.cluster.vq.vq(vectors, codebook)
        return cluster_ids 
Example #12
Source File: kmeans.py    From link-prediction_with_deep-learning with MIT License
def minibatch_kmeans(vectors, k):
    if not with_sklearn:
        raise NotImplementedError
    # Sculley (http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf)
    # uses batch size 1000. sklearn KMeans defaults to n_init 10
    kmeans = sklearn.cluster.MiniBatchKMeans(k, batch_size=1000, n_init=10)
    kmeans.fit(vectors)
    return kmeans.labels_ 
Example #13
Source File: kmeans.py    From link-prediction_with_deep-learning with MIT License
def process_options(args):    
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.k is not None and options.k < 2:
        raise ValueError('cluster number must be >= 2')

    if options.method == MINIBATCH_KMEANS and not with_sklearn:
        logging.warning('minibatch kmeans not available, using kmeans (slow)')
        options.method = KMEANS

    if options.jobs != 1 and (options.method != KMEANS or not with_sklearn):
        logging.warning('jobs > 1 only supported with scikit-learn %s' % KMEANS)
        options.jobs = 1

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.k is None:
        options.k = int(math.ceil((len(wv.words())/2)**0.5))
        logging.info('set k=%d (%d words)' % (options.k, len(wv.words())))

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options 
Example #14
Source File: signal_recompose.py    From NeuroKit with MIT License
def _signal_recompose_wcorr(components, threshold=0.5, metric="chebyshev"):
    """"""
    # Calculate the w-correlation matrix.
    wcorr = _signal_recompose_get_wcorr(components, show=False)

    # Find clusters in correlation matrix
    pairwise_distances = scipy.cluster.hierarchy.distance.pdist(wcorr, metric=metric)
    linkage = scipy.cluster.hierarchy.linkage(pairwise_distances, method="complete")
    threshold = threshold * pairwise_distances.max()
    clusters = scipy.cluster.hierarchy.fcluster(linkage, threshold, "distance")

    return clusters 
Example #15
Source File: bounding.py    From dynesty with MIT License
def _get_covariance_from_clusters(self, points):
        """Compute covariance from re-centered clusters."""

        # Compute pairwise distances.
        distances = spatial.distance.pdist(points, metric='mahalanobis',
                                           VI=self.am)

        # Identify conglomerates of points by constructing a linkage matrix.
        linkages = cluster.hierarchy.single(distances)

        # Cut when linkage between clusters exceed the radius.
        clusteridxs = cluster.hierarchy.fcluster(linkages, 1.0,
                                                 criterion='distance')
        nclusters = np.max(clusteridxs)
        if nclusters == 1:
            return self._get_covariance_from_all_points(points)
        else:
            i = 0
            overlapped_points = np.empty_like(points)
            for idx in np.unique(clusteridxs):
                group_points = points[clusteridxs == idx, :]
                group_mean = group_points.mean(axis=0).reshape((1, -1))
                j = i + len(group_points)
                overlapped_points[i:j, :] = group_points - group_mean
                i = j
            return self._get_covariance_from_all_points(overlapped_points)


##################
# HELPER FUNCTIONS
################## 
Example #16
Source File: clust_color.py    From pyPESTO with BSD 3-Clause "New" or "Revised" License
def assign_colors_for_result_list(num_results, colors=None):
    """
    Creates a list of colors for a list of pypesto.Result objects, or checks
    a user-provided list of colors and uses it if it is valid.

    Parameters
    ----------

    num_results: int
        number of results in list

    colors: list, or RGBA, optional
        list of colors, or single color

    Returns
    -------

    colors: list of RGBA
        One for each result.
    """

    # if the user did not specify any colors:
    if colors is None:
        # default colors will be used, one for each entry in the result list.
        # Colors are created from assign_colors, which needs a dummy list
        dummy_clusters = np.array(list(range(num_results)) * 2)

        # we don't want alpha levels for all plotting routines in this case...
        colors = assign_colors(dummy_clusters, balance_alpha=False,
                               highlight_global=False)

        # dummy cluster had twice as many entries as really there. Reduce.
        real_indices = list(range(int(colors.shape[0] / 2)))
        return colors[real_indices]

    # if the user-specified color list does not match the number of results
    if len(colors) != num_results:
        raise ValueError(
            'Incorrect color input. Colors must be specified either as a '
            'list of [r, g, b, alpha] with length equal to the number of '
            'results (here: ' + str(num_results) + '), or as one single '
            '[r, g, b, alpha] color.')

    return colors 
Example #17
Source File: clust_color.py    From pyPESTO with BSD 3-Clause "New" or "Revised" License
def assign_clusters(vals):
    """
    Find clustering.

    Parameters
    ----------

    vals: numeric list or array
        List to be clustered.

    Returns
    -------

    clust: numeric list
         Indicating the corresponding cluster of each element from
         'vals'.

    clustsize: numeric list
        Size of clusters, length equals number of clusters.
    """

    # sanity checks
    if vals is None or len(vals) == 0:
        return [], []
    elif len(vals) == 1:
        return np.array([0]), np.array([1.])

    # linkage requires (n, 1) data array
    vals = np.reshape(vals, (-1, 1))

    # fcluster's labels do not follow the order of the values in 'vals', so they are remapped below.
    # Create a preallocated output array first
    cluster_indices = np.zeros(vals.size, dtype=int)

    # get clustering based on distance
    clust = cluster.hierarchy.fcluster(
        cluster.hierarchy.linkage(vals),
        t=0.1, criterion='distance')

    # get unique clusters
    _, ind_clust = np.unique(clust, return_index=True)
    unique_clust = clust[np.sort(ind_clust)]
    cluster_size = np.zeros(unique_clust.size, dtype=int)

    # loop over clusters: resort and count number of entries
    for index, i_clust in enumerate(unique_clust):
        cluster_indices[np.where(clust == i_clust)] = index
        cluster_size[index] = sum(clust == i_clust)

    return cluster_indices, cluster_size 
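A hedged usage sketch of assign_clusters on illustrative values (it assumes the module's numpy as np and from scipy import cluster imports); values closer than the 0.1 distance threshold end up in the same cluster, and clusters are numbered in order of first appearance.

vals = [0.010, 0.012, 0.011, 0.50, 0.51, 3.2]
cluster_indices, cluster_size = assign_clusters(vals)
print(cluster_indices)   # expected: [0 0 0 1 1 2]
print(cluster_size)      # expected: [3 2 1]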
Example #18
Source File: bounding.py    From dynesty with MIT License
def bounding_ellipsoids(points, pointvol=0., vol_dec=0.5, vol_check=2.):
    """
    Calculate a set of ellipsoids that bound the collection of points.

    Parameters
    ----------
    points : `~numpy.ndarray` with shape (npoints, ndim)
        A set of coordinates.

    pointvol : float, optional
        Volume represented by a single point. When provided,
        used to set a minimum bound on the ellipsoid volume
        as `npoints * pointvol`. Default is `0.`.

    vol_dec : float, optional
        The required fractional reduction in volume after splitting an
        ellipsoid in order to accept the split. Default is `0.5`.

    vol_check : float, optional
        The factor used when checking whether the volume of the
        original bounding ellipsoid is large enough to warrant more
        trial splits via `ell.vol > vol_check * npoints * pointvol`.
        Default is `2.0`.

    Returns
    -------
    mell : :class:`MultiEllipsoid` object
        The :class:`MultiEllipsoid` object used to bound the
        collection of points.

    """

    if not HAVE_KMEANS:
        raise ValueError("scipy.cluster.vq.kmeans2 is required to compute "
                         "ellipsoid decompositions.")  # pragma: no cover

    # Calculate the bounding ellipsoid for the points possibly
    # enlarged to a minimum volume.
    ell = bounding_ellipsoid(points, pointvol=pointvol)

    # Recursively split the bounding ellipsoid until the volume of each
    # split no longer decreases by a factor of `vol_dec`.
    ells = _bounding_ellipsoids(points, ell, pointvol=pointvol,
                                vol_dec=vol_dec, vol_check=vol_check)

    return MultiEllipsoid(ells=ells) 
Example #19
Source File: subroutines.py    From SigProfilerExtractor with BSD 2-Clause "Simplified" License
def get_normalization_cutoff(data, manual_cutoff=9600):
    
    col_sums = np.array(np.sum(data, axis=0))

    # keep looping while the difference between the cluster means is larger than 2*2*STD of the larger cluster
    while True:

        try:
            # reshape the column sums for clustering
            col_sums_for_cluster = col_sums.reshape(-1,1)
            
            #separate distributions using kmean
            #kmeans = KMeans(n_clusters=2, random_state=0).fit(col_sums_for_cluster)
            #labels = kmeans.labels_
            
            #separate distributions using mixture model
            clf = mixture.GaussianMixture(n_components=2, covariance_type='full')
            clf.fit(col_sums_for_cluster)
            labels = clf.predict(col_sums_for_cluster)
            
            unique, counts = np.unique(labels, return_counts=True)
            if len(unique)==1:
                break
            bigger_cluster = unique[np.argmax(counts)]
            smaller_cluster = unique[np.argmin(counts)]
            
            # estimate the magnitude of the discrepancy between the clusters
            bigger_cluster__dist = col_sums[labels==bigger_cluster]
            smaller_cluster__dist = col_sums[labels==smaller_cluster]
            bigger_cluster__dist_mean = np.mean(bigger_cluster__dist) 
            bigger_cluster__dist_std = np.std(bigger_cluster__dist) 
            smaller_cluster__dist_mean = np.mean(smaller_cluster__dist) 
            #smaller_cluster__dist_std = np.std(smaller_cluster__dist) 
            
             
            
            #print("bigger_cluster__dist_mean ", bigger_cluster__dist_mean)
            #print("bigger_cluster__dist_std ", bigger_cluster__dist_std)
            #print("smaller_cluster__dist_mean ", smaller_cluster__dist_mean)
            #print("smaller_cluster__dist_std ", smaller_cluster__dist_std)
            
            # stop when the difference between the means is within 2*2*STD of the larger cluster
            
            if abs(bigger_cluster__dist_mean-smaller_cluster__dist_mean)< 2*2*bigger_cluster__dist_std:
                break
                
            
            # col_sums will be equal to bigger_cluster__dist for the next iteration 
            col_sums = bigger_cluster__dist
        except:
            break
            
     
    mean = np.mean(col_sums)  
    std = np.std(col_sums)
    cutoff = (mean + 2*(std)).astype(int)
    
    if cutoff<manual_cutoff:
        cutoff = manual_cutoff
    
    return cutoff 
Example #20
Source File: diarizationFunctions.py    From pyBK with MIT License
def performClustering( speechMapping, segmentTable, segmentBKTable, segmentCVTable, Vg, bitsPerSegmentFactor, kbmSize, N_init, initialClustering, clusteringMetric):
    numberOfSegments = np.size(segmentTable,0)
    clusteringTable = np.zeros([numberOfSegments, N_init])
    finalClusteringTable = np.zeros([numberOfSegments, N_init])
    activeClusters = np.ones([N_init,1]) 
    clustersBKTable = np.zeros([N_init, kbmSize])
    clustersCVTable = np.zeros([N_init, kbmSize])    
    clustersBKTable, clustersCVTable = calcClusters(clustersCVTable,clustersBKTable,activeClusters,initialClustering,N_init,segmentTable,kbmSize,speechMapping,Vg,bitsPerSegmentFactor)   
    ####### Here the clustering algorithm begins. Steps are:
    ####### 1. Reassign all data among all existing signatures and retrain them
    ####### using the new clustering
    ####### 2. Save the resulting clustering solution
    ####### 3. Compare all signatures with each other and merge those two with
    ####### highest similarity, creating a new signature for the resulting
    ####### cluster
    ####### 4. Back to 1 if #clusters > 1      
    for k in range(N_init):
    ####### 1. Data reassignment. Calculate the similarity between each segment and all clusters and assign it to the one which maximizes
    ####### the similarity. Finally re-calculate the binary keys for all clusters
    # before doing anything, check if there are remaining clusters
    # if there is only one active cluster, break    
        if np.sum(activeClusters)==1:
            break          
        clustersStillActive=np.zeros([1,N_init])
        segmentToClustersSimilarityMatrix = binaryKeySimilarity_cdist(clusteringMetric,segmentBKTable,segmentCVTable,clustersBKTable,clustersCVTable)
        # clusteringTable[:,k] = finalClusteringTable[:,k] = np.argmax(segmentToClustersSimilarityMatrix,axis=1)+1
        clusteringTable[:,k] = finalClusteringTable[:,k] = np.nanargmax(segmentToClustersSimilarityMatrix,axis=1)+1
        # clustersStillActive[:,np.unique(clusteringTable[:,k]).astype(int)-1] = 1
        clustersStillActive[:,np.unique(clusteringTable[:,k]).astype(int)-1] = 1       
        ####### update all binaryKeys for all new clusters        
        activeClusters = clustersStillActive
        clustersBKTable, clustersCVTable = calcClusters(clustersCVTable,clustersBKTable,activeClusters.T,clusteringTable[:,k].astype(int),N_init,segmentTable,kbmSize,speechMapping,Vg,bitsPerSegmentFactor)                
        ####### 2. Compare all signatures with each other and merge the two with highest similarity, creating a new signature for the resulting cluster
        clusterSimilarityMatrix = binaryKeySimilarity_cdist(clusteringMetric,clustersBKTable,clustersCVTable,clustersBKTable,clustersCVTable)        
        np.fill_diagonal(clusterSimilarityMatrix,np.nan)        
        value = np.nanmax(clusterSimilarityMatrix)
        location = np.nanargmax(clusterSimilarityMatrix)        
        R,C = np.unravel_index(location,(N_init,N_init))        
        ### Then we merge clusters R and C
        #print('Merging clusters',R+1,'and',C+1,'with a similarity score of',np.around(value,decimals=4))
        print('Merging clusters','%3s'%str(R+1),'and','%3s'%str(C+1),'with a similarity score of',np.around(value,decimals=4))
        activeClusters[0,C]=0        
        ### 3. Save the resulting clustering and go back to 1 if the number of clusters >1
        mergingClusteringIndices = np.where(clusteringTable[:,k]==C+1)
        # update clustering table
        clusteringTable[mergingClusteringIndices[0],k]=R+1
        # remove binarykey for removed cluster
        clustersBKTable[C,:]=np.zeros([1,kbmSize])
        clustersCVTable[C,:]=np.nan
        # prepare the vector with the indices of the features of the new cluster and then binarize
        segmentsToBinarize = np.where(clusteringTable[:,k]==R+1)[0]
        M=[]
        for l in np.arange(np.size(segmentsToBinarize,0)):
            M = np.append(M,np.arange(int(segmentTable[segmentsToBinarize][:][l,1]),int(segmentTable[segmentsToBinarize][:][l,2])+1))
        clustersBKTable[R,:], clustersCVTable[R,:]=binarizeFeatures(kbmSize,Vg[np.array(speechMapping[np.array(M,dtype='int')],dtype='int')-1].T,bitsPerSegmentFactor)
    print('done')
    return clusteringTable, k 
Example #21
Source File: utils.py    From opensurfaces with MIT License
def get_dominant_image_colors(image, num_clusters=4):
    """
    Returns the dominant image colors that aren't pure white or black.  Uses
    kmeans on the colors.  Returns the result as RGB hex strings in the format
    ['#rrggbb', '#rrggbb', ...].

    :param image: PIL image or path
    """

    if isinstance(image, basestring):
        image = Image.open(image)

    # downsample for speed
    im = image.resize((512, 512), Image.ANTIALIAS)

    # reshape
    ar0 = scipy.misc.fromimage(im)
    shape = ar0.shape
    npixels = scipy.product(shape[:2])
    ar0 = ar0.reshape(npixels, shape[2])

    # keep only nontransparent elements
    ar = ar0[ar0[:, 3] == 255][:, 0:3]

    try:
        # kmeans clustering
        codes, dist = scipy.cluster.vq.kmeans(ar, num_clusters)
    except:
        # kmeans sometimes fails -- if that is the case, use the mean color and
        # nothing else.
        arf = ar.astype(float)
        clamp = lambda p: max(0, min(255, int(p)))
        # mean of each channel over the pixels (rows), clamped into the valid 0-255 range
        return ['#' + ''.join(['%0.2x' % clamp(arf[:, i].sum() / float(arf.shape[0])) for i in (0, 1, 2)])]

    vecs, dist = scipy.cluster.vq.vq(ar, codes)         # assign codes
    counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences

    # sort by count frequency
    indices = [i[0] for i in
               sorted(enumerate(counts), key=lambda x:x[1], reverse=True)]

    # convert to hex strings
    colors = [''.join(chr(c) for c in code).encode('hex') for code in codes]

    results = []
    for idx in indices:
        color = colors[idx]
        if color != 'ffffff' and color != '000000':
            results.append('#' + color)

    return results