Python sklearn.metrics.silhouette_score() Examples

The following are 30 code examples of sklearn.metrics.silhouette_score(), drawn from open-source projects; each example notes its source file, the project it comes from, and that project's license. For related functionality, see the other functions and classes of the sklearn.metrics module.
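
Before the project examples, here is a minimal self-contained sketch of the basic call (the dataset and parameter values are illustrative, not taken from any project below): silhouette_score(X, labels) takes a feature matrix and predicted cluster labels and returns the mean silhouette coefficient, a float in [-1, 1].

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# toy data: three well-separated Gaussian blobs
X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)

# mean silhouette coefficient over all samples, in [-1, 1]
print(silhouette_score(X, labels))                   # high for dense, well-separated clusters
print(silhouette_score(X, labels, metric='cosine'))  # any sklearn pairwise metric is accepted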
Example #1
Source File: clustering.py    From torchsupport with MIT License
def _cluster_plot(self, embedding, labels):
    silhouette = silhouette_score(embedding.squeeze(), labels)
    chs = calinski_harabaz_score(embedding.squeeze(), labels)  # old spelling; renamed calinski_harabasz_score in sklearn 0.20
    dbs = davies_bouldin_score(embedding.squeeze(), labels)

    n_labels = len(set(labels))

    self.writer.add_scalar(f"silhouette {n_labels}", silhouette, self.step_id)
    self.writer.add_scalar(f"chs {n_labels}", chs, self.step_id)
    self.writer.add_scalar(f"dbs {n_labels}", dbs, self.step_id)

    indices = list(range(len(labels)))
    random.shuffle(indices)
    samples_to_plot = indices[:1000]
    sample_labels = [labels[idx] for idx in samples_to_plot]
    sample_embedding = embedding[samples_to_plot]
    pca = PCA(2).fit_transform(sample_embedding.squeeze())
    fig, ax = plt.subplots()
    ax.scatter(pca[:, 0], pca[:, 1], c=sample_labels, cmap="tab20")
    self.writer.add_figure(f"clustering {n_labels}", fig, self.step_id) 
Example #2
Source File: cluster.py    From PHATE with GNU General Public License v2.0
def silhouette_score(phate_op, n_clusters, random_state=None, **kwargs):
    """Compute the Silhouette score on KMeans on the PHATE potential

    Parameters
    ----------
    phate_op : phate.PHATE
        Fitted PHATE operator
    n_clusters : int
        Number of clusters.
    random_state : int or None, optional (default: None)
        Random seed for k-means

    Returns
    -------
    score : float
    """
    cluster_labels = kmeans(phate_op, n_clusters=n_clusters, random_state=random_state, **kwargs)
    return metrics.silhouette_score(phate_op.diff_potential, cluster_labels) 
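
A hedged usage sketch for this wrapper, assuming the phate package is installed; the random data and the sweep over candidate cluster counts are illustrative, not part of the PHATE source:

import numpy as np
import phate

data = np.random.default_rng(0).normal(size=(200, 50))  # stand-in for real data
phate_op = phate.PHATE().fit(data)

# score a few candidate cluster counts and keep the best one
ks = list(range(2, 10))
scores = [silhouette_score(phate_op, k, random_state=42) for k in ks]
best_k = ks[int(np.argmax(scores))]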
Example #3
Source File: precluster.py    From texta with GNU General Public License v3.0
def _find_optimal_clustering(self,clusterings):

        max_score = float('-inf')
        max_clustering = None

        for clustering in clusterings:
            labeled_vectors = [(node.vector,cluster_idx) for cluster_idx in range(len(clustering)) for node in _get_cluster_nodes(clustering[cluster_idx][1]) ]
            vectors,labels = [np.array(x) for x in zip(*labeled_vectors)]
            if np.in1d([1],labels)[0]:
                score = silhouette_score(vectors,labels,metric='cosine')
            else:
                continue # silhouette doesn't work with just one cluster
            if score > max_score:
                max_score = score
                max_clustering = clustering

        return list(zip(*max_clustering))[1] if max_clustering else list(zip(*clusterings[0]))[1] 
Example #4
Source File: test_combat.py    From scanpy with BSD 3-Clause "New" or "Revised" License
def test_silhouette():
    # this test checks whether combat can align data from several gaussians
    # it checks this by computing the silhouette coefficient in a pca embedding

    # load in data
    adata = sc.datasets.blobs()

    # apply combat
    sc.pp.combat(adata, 'blobs')

    # compute pca
    sc.tl.pca(adata)
    X_pca = adata.obsm['X_pca']

    # compute silhouette coefficient in pca
    sh = silhouette_score(X_pca[:, :2], adata.obs['blobs'].values)

    assert sh < 0.1 
Example #5
Source File: posterior.py    From scVI with MIT License
def clustering_scores(self, prediction_algorithm: str = "knn") -> Tuple:
        if self.gene_dataset.n_labels > 1:
            latent, _, labels = self.get_latent()
            if prediction_algorithm == "knn":
                labels_pred = KMeans(
                    self.gene_dataset.n_labels, n_init=200
                ).fit_predict(
                    latent
                )  # n_jobs>1 ?
            elif prediction_algorithm == "gmm":
                gmm = GMM(self.gene_dataset.n_labels)
                gmm.fit(latent)
                labels_pred = gmm.predict(latent)

            asw_score = silhouette_score(latent, labels)
            nmi_score = NMI(labels, labels_pred)
            ari_score = ARI(labels, labels_pred)
            uca_score = unsupervised_clustering_accuracy(labels, labels_pred)[0]
            logger.debug(
                "Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f"
                % (asw_score, nmi_score, ari_score, uca_score)
            )
            return asw_score, nmi_score, ari_score, uca_score 
Example #6
Source File: clustering.py    From malss with MIT License
def calc_scores(cls, model, data, min_clusters, max_clusters, random_state=0):
        silhouettes = []
        davieses = []
        calinskies = []
        if model.__class__.__name__ == 'HierarchicalClustering':
            linkage_matrix = model.fit(data)
        else:
            linkage_matrix = None
        for nc in range(min_clusters, max_clusters + 1):
            model.n_clusters = nc
            model.random_state = random_state
            pred_labels = model.fit_predict(data)
            silhouettes.append(silhouette_score(data, pred_labels, random_state=random_state))
            davieses.append(davies_bouldin_score(data, pred_labels))
            calinskies.append(calinski_harabasz_score(data, pred_labels))

        sil_nc = np.argmax(silhouettes) + min_clusters
        dav_nc = np.argmin(davieses) + min_clusters
        cal_nc = np.argmax(calinskies) + min_clusters

        return silhouettes, sil_nc, davieses, dav_nc, calinskies, cal_nc, linkage_matrix 
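
The selection logic above encodes the direction of each index: silhouette and Calinski-Harabasz are maximized, Davies-Bouldin is minimized. A minimal standalone sketch of the same rule (synthetic data, illustrative only):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score, davies_bouldin_score

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
sil, dav = [], []
for nc in range(2, 9):
    pred = KMeans(n_clusters=nc, n_init=10, random_state=0).fit_predict(X)
    sil.append(silhouette_score(X, pred))
    dav.append(davies_bouldin_score(X, pred))
# offset by 2 because the candidate cluster counts start at 2
print(np.argmax(sil) + 2, np.argmin(dav) + 2)  # both typically recover 4 here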
Example #7
Source File: k_means_plot.py    From machine-learning with GNU General Public License v3.0
def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size))) 
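
Note that labels and sample_size are defined elsewhere in the original script. silhouette_score computes pairwise distances, which is quadratic in the number of samples, so passing sample_size scores a random subsample instead of the full dataset; a hedged self-contained sketch of that pattern (illustrative values):

from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.metrics import silhouette_score

data = load_digits().data
est = KMeans(n_clusters=10, n_init=10, random_state=0).fit(data)

# score a random subsample of 300 points; random_state makes the
# subsample (and hence the score) reproducible
score = silhouette_score(data, est.labels_, metric='euclidean',
                         sample_size=300, random_state=0)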
Example #8
Source File: Util.py    From TrackR-CNN with MIT License
def fit(self, X, y=None, sample_weight=None):
    silhouette_avgs = []
    for n_clusters in range(1, self.max_n_clusters):
      self.clusterers[n_clusters-1].fit(X, y, sample_weight)
      if n_clusters == 1:
        silhouette_avgs.append(-1.1)  # TODO
      else:
        silhouette_avgs.append(silhouette_score(X, self.clusterers[n_clusters-1].labels_))
    self.best_n_clusters = silhouette_avgs.index(max(silhouette_avgs)) + 1
    self.labels_ = self.clusterers[self.best_n_clusters-1].labels_
    self.cluster_centers_ = self.clusterers[self.best_n_clusters-1].cluster_centers_ 
Example #9
Source File: ABuMLExecute.py    From abu with GNU General Public License v3.0
def run_silhouette_cv_estimator(estimator, x, n_folds=10):
    """
    只针对kmean的cv验证,使用silhouette_score对聚类后的结果labels_
    进行度量使用silhouette_score,kmean的cv验证只是简单的通过np.random.choice
    进行随机筛选x数据进行聚类的silhouette_score度量,并不涉及训练集测试集
    :param estimator: keman或者支持estimator.labels_, 只通过if not isinstance(estimator, ClusterMixin)进行过滤
    :param x: x特征矩阵
    :param n_folds: int,透传KFold参数,切割训练集测试集参数,默认10
    :return: eg: array([ 0.693 ,  0.652 ,  0.6845,  0.6696,  0.6732,  0.6874,  0.668 ,
                         0.6743,  0.6748,  0.671 ])
    """

    if not isinstance(estimator, ClusterMixin):
        print('estimator must be ClusterMixin')
        return

    silhouette_list = list()
    # eg: n_folds = 10, len(x) = 150 -> 150 * 0.9 = 135
    choice_cnt = int(len(x) * ((n_folds - 1) / n_folds))
    choice_source = np.arange(0, x.shape[0])

    # clone a fresh estimator for all fit operations
    estimator = clone(estimator)
    for _ in np.arange(0, n_folds):
        # simply draw a random subset of x via np.random.choice (with replacement by default)
        choice_index = np.random.choice(choice_source, choice_cnt)
        x_choice = x[choice_index]
        estimator.fit(x_choice)
        # score the clustering with silhouette_score
        silhouette_score = metrics.silhouette_score(x_choice, estimator.labels_, metric='euclidean')
        silhouette_list.append(silhouette_score)
    return silhouette_list 
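
A possible call pattern for the helper above (hypothetical usage, not from the abu source; KMeans passes the ClusterMixin check, and the iris data matches the len(x) = 150 example in the docstring):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

x = load_iris().data
scores = run_silhouette_cv_estimator(KMeans(n_clusters=3), x, n_folds=10)
print(np.mean(scores))  # one silhouette value per random subsample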
Example #10
Source File: test_cluster.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_KMeans_scores(self):
        digits = datasets.load_digits()
        df = pdml.ModelFrame(digits)

        scaled = pp.scale(digits.data)
        df.data = df.data.pp.scale()
        self.assert_numpy_array_almost_equal(df.data.values, scaled)

        clf1 = cluster.KMeans(init='k-means++', n_clusters=10,
                              n_init=10, random_state=self.random_state)
        clf2 = df.cluster.KMeans(init='k-means++', n_clusters=10,
                                 n_init=10, random_state=self.random_state)
        clf1.fit(scaled)
        df.fit_predict(clf2)

        expected = m.homogeneity_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.homogeneity_score(), expected)

        expected = m.completeness_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.completeness_score(), expected)

        expected = m.v_measure_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.v_measure_score(), expected)

        expected = m.adjusted_rand_score(digits.target, clf1.labels_)
        self.assertEqual(df.metrics.adjusted_rand_score(), expected)

        expected = m.silhouette_score(scaled, clf1.labels_, metric='euclidean',
                                      sample_size=300, random_state=self.random_state)
        result = df.metrics.silhouette_score(metric='euclidean', sample_size=300,
                                             random_state=self.random_state)
        self.assertAlmostEqual(result, expected) 
Example #11
Source File: test_metrics.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_silhouette_score(self):
        result = self.df.metrics.silhouette_score()
        expected = metrics.silhouette_score(self.data, self.pred)
        self.assertAlmostEqual(result, expected) 
Example #12
Source File: plot_kmeans_digits.py    From Computer-Vision-with-Python-3 with MIT License
def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
          % (name, (time() - t0), estimator.inertia_,
             metrics.homogeneity_score(labels, estimator.labels_),
             metrics.completeness_score(labels, estimator.labels_),
             metrics.v_measure_score(labels, estimator.labels_),
             metrics.adjusted_rand_score(labels, estimator.labels_),
             metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean',
                                      sample_size=sample_size))) 
Example #13
Source File: cluster.py    From hyperstar with MIT License
def evaluate(k):
    km = kmeans[k]
    score = silhouette_score(train_offsets, km.labels_, metric='euclidean', random_state=RANDOM_SEED)
    print('Silhouette score for k=%d is %f.' % (k, score))
    return (k, score) 
Example #14
Source File: algorithm_footprint.py    From CAVE with BSD 3-Clause "New" or "Revised" License
def get_clusters(self, features_2d):
        """ Mapping instances to clusters, using silhouette-scores to determine
        number of cluster.

        Returns
        -------
        paths: List[str]
            paths to plots
        """
        # get silhouette scores for k_means with 2 to 12 clusters
        # use number of clusters with highest silhouette score
        best_score, best_n_clusters = -1, -1
        min_clusters, max_clusters = 2, min(features_2d.shape[0], 12)
        clusters = None
        for n_clusters in range(min_clusters, max_clusters):
            km = KMeans(n_clusters=n_clusters)
            y_pred = km.fit_predict(features_2d)
            score = silhouette_score(features_2d, y_pred)
            if score > best_score:
                best_n_clusters = n_clusters
                best_score = score
                clusters = y_pred

        self.logger.debug("%d clusters detected using silhouette scores",
                          best_n_clusters)

        cluster_dict = {n: [] for n in range(best_n_clusters)}
        for i, c in enumerate(clusters):
            cluster_dict[c].append(self.insts[i])

        self.logger.debug("Distribution over clusters: %s",
                          str({k: len(v) for k, v in cluster_dict.items()}))

        return clusters, cluster_dict 
Example #15
Source File: test_silhouette.py    From pyclust with GNU General Public License v2.0
def test_gmm():
    sil = pyclust.validate.Silhouette()
    sil_score = sil.score(X, ypred, sample_size=None)

    print(sil_score[0])

    print(sil.sample_scores[:10])

    print(silhouette_score(X, ypred, sample_size=None))
    
    print(silhouette_samples(X, ypred)[:10]) 
Example #16
Source File: omniglot_est_k.py    From DTC with MIT License
def labeled_val_fun(u_feats, l_feats, l_targets, k):
    if device=='cuda':
        torch.cuda.empty_cache()
    l_num=len(l_targets)
    kmeans = K_Means(k, pairwise_batch_size = 200)
    kmeans.fit_mix(torch.from_numpy(u_feats).to(device), torch.from_numpy(l_feats).to(device), torch.from_numpy(l_targets).to(device))
    cat_pred = kmeans.labels_.cpu().numpy() 
    u_pred = cat_pred[l_num:]
    silh_score = silhouette_score(u_feats, u_pred)
    del kmeans
    return silh_score, cat_pred 
Example #17
Source File: imagenet_est_k.py    From DTC with MIT License
def labeled_val_fun(u_feats, l_feats, l_targets, k):
    if device=='cuda':
        torch.cuda.empty_cache()
    l_num=len(l_targets)
    kmeans = K_Means(k, pairwise_batch_size=256)
    kmeans.fit_mix(torch.from_numpy(u_feats).to(device), torch.from_numpy(l_feats).to(device), torch.from_numpy(l_targets).to(device))
    cat_pred = kmeans.labels_.cpu().numpy() 
    u_pred = cat_pred[l_num:]
    silh_score = silhouette_score(u_feats, u_pred)
    return silh_score, cat_pred 
Example #18
Source File: cifar100_est_k.py    From DTC with MIT License
def labeled_val_fun(u_feats, l_feats, l_targets, k):
    if device=='cuda':
        torch.cuda.empty_cache()
    l_num=len(l_targets)
    kmeans = K_Means(k, pairwise_batch_size=256)
    kmeans.fit_mix(torch.from_numpy(u_feats).to(device), torch.from_numpy(l_feats).to(device), torch.from_numpy(l_targets).to(device))
    cat_pred = kmeans.labels_.cpu().numpy() 
    u_pred = cat_pred[l_num:]
    silh_score = silhouette_score(u_feats, u_pred)
    return silh_score, cat_pred 
Example #19
Source File: clusterer.py    From yelp with GNU Lesser General Public License v2.1
def evaluate_performance(data, labels, metric='euclidean'):
        score = skmetrics.silhouette_score(data, labels, metric=metric)
        print('Labels:', labels)
        print('Score:', score)

        return score 
Example #20
Source File: k_means_clustering.py    From FunUtils with MIT License
def bench_k_means(estimator, name, data):
    estimator.fit(data)
    # A short explanation for every score:
    # homogeneity:          each cluster contains only members of a single class (range 0 - 1)
    # completeness:         all members of a given class are assigned to the same cluster (range 0 - 1)
    # v_measure:            harmonic mean of homogeneity and completeness
    # adjusted_rand:        similarity of the actual values and their predictions,
    #                       ignoring permutations and with chance normalization
    #                       (range -1 to 1, -1 being bad, 1 being perfect and 0 being random)
    # adjusted_mutual_info: agreement of the actual values and predictions, ignoring permutations
    #                       (range 0 - 1, with 0 being random agreement and 1 being perfect agreement)
    # silhouette:           uses the mean distance between a sample and all other points in the same class,
    #                       as well as the mean distance between a sample and all other points in the nearest cluster
    #                       to calculate a score (range: -1 to 1, with -1 indicating incorrect
    #                       clustering, 1 indicating highly dense clustering,
    #                       and 0 indicating overlapping clusters).
    print('%-9s \t%i \thomogeneity: %.3f \tcompleteness: %.3f \tv-measure: %.3f \tadjusted-rand: %.3f \t'
          'adjusted-mutual-info: %.3f \tsilhouette: %.3f'
          % (name, estimator.inertia_,
             metrics.homogeneity_score(y, estimator.labels_),
             metrics.completeness_score(y, estimator.labels_),
             metrics.v_measure_score(y, estimator.labels_),
             metrics.adjusted_rand_score(y, estimator.labels_),
             metrics.adjusted_mutual_info_score(y,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean'))) 
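
To make the silhouette range described in the comments concrete, a small sketch (synthetic data, illustrative only): tight, well-separated blobs score close to 1, while heavily overlapping blobs drive the score toward 0.

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

for spread in (0.5, 5.0):  # tight blobs vs. heavily overlapping blobs
    X, _ = make_blobs(n_samples=300, centers=3, cluster_std=spread, random_state=0)
    pred = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
    print(spread, silhouette_score(X, pred))  # high for 0.5, much lower for 5.0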
Example #21
Source File: infer.py    From NLP_Toolkit with Apache License 2.0
def n_cluster_embeddings(self, features=None, n_clusters=3, method='ac'):
        '''
        clusters the nodes based on embedding features
        features = None (use DGI generated embeddings)
        '''
        if method == 'ac':
            clustering = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean',\
                                                 linkage='ward')
            clustering.fit(self.embeddings if features is None else features)
            self.labels = clustering.labels_
            self.score = silhouette_score(self.embeddings if features is None else features,\
                                          self.labels)
        return {'labels': self.labels, 'score': self.score} 
Example #22
Source File: firmware_clustering.py    From Firmware_Slap with GNU General Public License v3.0
def get_single_cluster(all_functions, centroid_count=2):
    return_dict = {}
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)

    func_sparse = transformer.transform(func_sparse)

    # svd = TruncatedSVD(random_state=2)
    # svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

    # func_sparse = svd.fit_transform(func_sparse)

    labels = []

    result = KMeans(n_clusters=centroid_count, random_state=2).fit(func_sparse)

    score = silhouette_score(func_sparse,
                             result.labels_,
                             metric="cosine",
                             random_state=2,
                             sample_size=5000)
    labels.append(result.labels_)

    #print("Clusters {:<3} | Silhoette Score : {}".format(centroid_count, score))
    return_dict['count'] = centroid_count
    return_dict['score'] = score
    return_dict['labels'] = result.labels_

    return return_dict 
Example #23
Source File: firmware_clustering.py    From Firmware_Slap with GNU General Public License v3.0
def single_cluster(all_functions, centroid_count=2):
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)

    func_sparse = transformer.transform(func_sparse)

    # svd = TruncatedSVD(random_state=2)
    # svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

    # func_sparse = svd.fit_transform(func_sparse)

    labels = []

    result = KMeans(n_clusters=centroid_count, random_state=2).fit(func_sparse)

    score = silhouette_score(func_sparse,
                             result.labels_,
                             metric="cosine",
                             random_state=2,
                             sample_size=5000)
    labels.append(result.labels_)

    print("Clusters {:<3} | Silhoette Score : {}".format(
        centroid_count, score))

    return result.labels_ 
Example #24
Source File: helper.py    From practicalDataAnalysisCookbook with GNU General Public License v2.0
def printClustersSummary(data, labels, centroids):
    '''
        Helper method to automate model assessment
    '''
    print('Pseudo_F: ', pseudo_F(data, labels, centroids))
    print('Davies-Bouldin: ', 
        davis_bouldin(data, labels, centroids))
    print('Silhouette score: ', 
        mt.silhouette_score(data, np.array(labels), 
            metric='euclidean')) 
Example #25
Source File: solr-similarity.py    From tika-similarity with Apache License 2.0
def sk_kmeans(core): #, kval=3

    solrURL = "http://localhost:8983/solr/" + core
    solrInstance = Solr(solrURL)

    list_of_points = []
    docs = solrInstance.query_iterator(query="*:*", start=0)

    for doc in docs:
        list_of_points.append(Vector(doc['id'], doc))

    list_of_Dicts = (point.features for point in list_of_points)

    df = pd.DataFrame(list_of_Dicts)
    df = df.fillna(0)

    silhouettes = {}
    for k in range(2, 10):

        kmeans = KMeans(n_clusters=k,
                    init='k-means++',
                    max_iter=300,  # k-means convergence
                    n_init=10,  # find global minima
                    n_jobs=-2,  # parallelize (deprecated in sklearn 0.23, removed in 1.0)
                    )

        labels = kmeans.fit_predict(df)
        silhouettes[k] = silhouette_score(df, labels)


    return str(silhouettes) 
Example #26
Source File: function_clustering.py    From Firmware_Slap with GNU General Public License v3.0
def test():
    parser = argparse.ArgumentParser()

    parser.add_argument("File")

    args = parser.parse_args()

    info = fh.get_function_information(args.File)
    #info = fh.get_arg_funcs(args.File)

    info = trim_funcs(info, args.File)

    vect, func_sparse = funcs_to_sparse(info)

    transformer = Normalizer().fit(func_sparse)

    func_sparse = transformer.transform(func_sparse)

    #svd = TruncatedSVD(random_state=2)
    svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

    func_sparse = svd.fit_transform(func_sparse)

    scores = []
    clust_count = []
    for x in range(2, 20):
        result = KMeans(n_clusters=x, random_state=2).fit(func_sparse)

        score = silhouette_score(func_sparse, result.labels_, metric="cosine")
        scores.append(score)
        clust_count.append(x)

        print("Clusters {:<3} | Silhoette Score : {}".format(x, score))

    plt.plot(clust_count, scores)
    plt.xlabel("Cluster Centroid Count")
    plt.ylabel("Silhoette Score")
    plt.grid = True
    plt.show()

    pass 
Example #27
Source File: clustering.py    From retentioneering-tools with Mozilla Public License 2.0
def calc_all_metrics(data, km):
    """
    Calculates all quality metrics: Cluster Stability Index, Silhouette score, Homogeneity, distances for clustering.

    Parameters
    --------
    data: pd.DataFrame
        Dataframe with features for clustering indexed as in ``retention_config.index_col``
    km:
        Already fitted clusterer.

    Returns
    --------
    Metrics scores

    Return type
    --------
    Dict
    """
    res = {}
    cl = km.labels_
    res['mean_pd'] = calc_mean_pd(data, cl)
    if hasattr(km, 'cluster_centers_'):
        res['mean_fc'] = calc_mean_dist_from_center(data, km)
    if len(set(cl)) > 1:
        res['silhouette'] = silhouette_score(data, cl, metric='cosine')
    return res 
Example #28
Source File: clustering.py    From retentioneering-tools with Mozilla Public License 2.0
def find_best_n_clusters(data, clusterer, max_n_clusters, random_state, **kwargs):
    """
    Finds best number of clusters for KMeans and Gaussian Mixture.

    Parameters
    -------
    data: pd.DataFrame
        Dataframe with features for clustering with index as in ``retention_config.index_col``
    clusterer: sklearn clusterer class
        For instance, ``sklearn.cluster.KMeans`` or ``sklearn.mixture.GaussianMixture``.
    max_n_clusters: int
        Maximal number of clusters for searching.
    random_state: int
        Random state for clusterer.

    Returns
    -------
    Optimal keyword arguments for clustering method.

    Return type
    ------
    Dict
    """
    args = {i: j for i, j in kwargs.items() if i in clusterer.get_params(clusterer)}
    if 'n_clusters' in clusterer.get_params(clusterer):
        kms = True
    else:
        kms = False
    args.pop('n_clusters' if kms else 'n_components', None)
    args.update({'random_state': random_state})
    score = {}
    for i in range(2, max_n_clusters + 1):
        args.update({'n_clusters' if kms else 'n_components': i})
        km = clusterer(**args)
        score[i] = silhouette_score(data, km.fit_predict(data), metric='cosine')
    best = pd.Series(score).idxmax()
    args.update({'n_clusters' if kms else 'n_components': best})
    print(f'Best number of clusters is {best}')
    return args 
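
A hedged usage sketch (illustrative data; note that the clusterer class itself is passed, not an instance, which relies on a scikit-learn version whose get_params tolerates being called on the class, as the code above already assumes):

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=4, random_state=0)
data = pd.DataFrame(X)

best_args = find_best_n_clusters(data, KMeans, max_n_clusters=8, random_state=0)
km = KMeans(**best_args).fit(data)  # refit with the selected n_clusters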
Example #29
Source File: kclust.py    From graspy with Apache License 2.0
def fit(self, X, y=None):
        """
        Fits kmeans model to the data. 

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        y : array-like, shape (n_samples,), optional (default=None)
            List of labels for `X` if available. Used to compute ARI scores.

        Returns
        -------
        self
        """
        # Deal with number of clusters
        if self.max_clusters > X.shape[0]:
            msg = "n_components must be >= n_samples, but got \
                n_components = {}, n_samples = {}".format(
                self.max_clusters, X.shape[0]
            )
            raise ValueError(msg)
        else:
            max_clusters = self.max_clusters

        # Get parameters
        random_state = self.random_state

        # Compute all models
        models = []
        silhouettes = []
        aris = []
        for n in range(2, max_clusters + 1):
            model = KMeans(n_clusters=n, random_state=random_state)

            # Fit and compute values
            predictions = model.fit_predict(X)
            models.append(model)
            silhouettes.append(silhouette_score(X, predictions))
            if y is not None:
                aris.append(adjusted_rand_score(y, predictions))

        if y is not None:
            self.ari_ = aris
            self.silhouette_ = silhouettes
            self.n_clusters_ = np.argmax(aris) + 2  # models are fit for n_clusters starting at 2
            self.model_ = models[np.argmax(aris)]
        else:
            self.ari_ = None
            self.silhouette_ = silhouettes
            self.n_clusters_ = np.argmax(silhouettes) + 2  # models are fit for n_clusters starting at 2
            self.model_ = models[np.argmax(silhouettes)]

        return self 
Example #30
Source File: cluster.py    From PHATE with GNU General Public License v2.0
def kmeans(phate_op, n_clusters='auto', max_clusters=10, random_state=None, k=None, **kwargs):
    """KMeans on the PHATE potential

    Clustering on the PHATE operator as introduced in Moon et al.
    This is similar to spectral clustering.

    Parameters
    ----------
    phate_op : phate.PHATE
        Fitted PHATE operator
    n_clusters : int, optional (default: 'auto')
        Number of clusters.
        If 'auto', uses the Silhouette score to determine the optimal number of clusters
    max_clusters : int, optional (default: 10)
        Maximum number of clusters to test if using the Silhouette score.
    random_state : int or None, optional (default: None)
        Random seed for k-means
    k : deprecated for `n_clusters`
    kwargs : additional arguments for `sklearn.cluster.KMeans`

    Returns
    -------
    clusters : np.ndarray
        Integer array of cluster assignments
    """
    if k is not None:
        warnings.warn(
            "k is deprecated. Please use n_clusters in future.", FutureWarning
        )
        n_clusters = k
    if not isinstance(phate_op, PHATE):
        raise TypeError("Expected phate_op to be of type PHATE. Got {}".format(phate_op))
    if phate_op.graph is not None:
        if n_clusters == 'auto':
            n_clusters = np.arange(2, max_clusters)
            silhouette_scores = [silhouette_score(phate_op, k, random_state=random_state, **kwargs) for k in n_clusters]
            n_clusters = n_clusters[np.argmax(silhouette_scores)]
        return cluster.KMeans(n_clusters, random_state=random_state, **kwargs).fit_predict(
            phate_op.diff_potential
        )
    else:
        raise exceptions.NotFittedError(
            "This PHATE instance is not fitted yet. Call "
            "'fit' with appropriate arguments before "
            "using this method."
        )
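
A hedged usage sketch, assuming the phate package is installed; the random data is a stand-in for a real feature matrix:

import numpy as np
import phate

data = np.random.default_rng(0).normal(size=(200, 50))  # stand-in for real data
phate_op = phate.PHATE().fit(data)
clusters = kmeans(phate_op, n_clusters='auto', max_clusters=10, random_state=42)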