Python sklearn.cluster.KMeans() Examples

The following are 30 code examples of sklearn.cluster.KMeans(), drawn from open-source projects. The original project and source file are noted above each example. You may also want to check out the other available functions and classes of the sklearn.cluster module.
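Before the project-specific examples, here is a minimal, self-contained sketch of the typical KMeans workflow: construct the estimator, fit it on a feature matrix, then inspect the labels, cluster centers, and inertia, and assign new points with predict(). The toy data and parameter values are illustrative only and are not taken from any of the projects below.

import numpy as np
from sklearn.cluster import KMeans

# Six 2-D points forming two well-separated groups (toy data).
X = np.array([[1.0, 2.0], [1.5, 1.8], [1.0, 0.6],
              [5.0, 8.0], [8.0, 8.0], [9.0, 11.0]])

km = KMeans(n_clusters=2, n_init=10, random_state=0)
km.fit(X)

print(km.labels_)            # cluster index assigned to each training sample
print(km.cluster_centers_)   # coordinates of the two cluster centers
print(km.inertia_)           # sum of squared distances to the closest center
print(km.predict([[0.0, 0.0], [10.0, 10.0]]))  # assign new points to the clusters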
Example #1
Source File: test_k_means.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_k_means_new_centers():
    # Explore the part of the code where a new center is reassigned
    X = np.array([[0, 0, 1, 1],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 0, 0],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0]])
    labels = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([[+0, 1, 0, 0],
                            [.2, 0, .2, .2],
                            [+0, 0, 0, 0]])

    km = KMeans(n_clusters=3, init=bad_centers, n_init=1, max_iter=10,
                random_state=1)
    for this_X in (X, sp.coo_matrix(X)):
        km.fit(this_X)
        this_labels = km.labels_
        # Reorder the labels so that the first instance is in cluster 0,
        # the second in cluster 1, ...
        this_labels = np.unique(this_labels, return_index=True)[1][this_labels]
        np.testing.assert_array_equal(this_labels, labels) 
Example #2
Source File: guess.py    From Keras-BiGAN with MIT License
def cluster(points, means = 8):
    kk = KMeans(n_clusters = means)
    kk.fit(points)

    labels = kk.predict(points)

    r = []

    for i in range(means):
        row = []
        while(len(row) < 8):
            image = random.randint(0, data.files.shape[0] - 1)
            if labels[image] == i:
                row.append(data.files[image])

        r.append(np.concatenate(row, axis=1))

    c = np.concatenate(r, axis=0)

    x = Image.fromarray(c)
    x.save('Results/clusters.png') 
Example #3
Source File: posterior.py    From scVI with MIT License
def clustering_scores(self, prediction_algorithm: str = "knn") -> Tuple:
        if self.gene_dataset.n_labels > 1:
            latent, _, labels = self.get_latent()
            if prediction_algorithm == "knn":
                labels_pred = KMeans(
                    self.gene_dataset.n_labels, n_init=200
                ).fit_predict(
                    latent
                )  # n_jobs>1 ?
            elif prediction_algorithm == "gmm":
                gmm = GMM(self.gene_dataset.n_labels)
                gmm.fit(latent)
                labels_pred = gmm.predict(latent)

            asw_score = silhouette_score(latent, labels)
            nmi_score = NMI(labels, labels_pred)
            ari_score = ARI(labels, labels_pred)
            uca_score = unsupervised_clustering_accuracy(labels, labels_pred)[0]
            logger.debug(
                "Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f"
                % (asw_score, nmi_score, ari_score, uca_score)
            )
            return asw_score, nmi_score, ari_score, uca_score 
Example #4
Source File: discretize.py    From msppy with BSD 3-Clause "New" or "Revised" License
def SAA(self):
        """Use K-means method to discretize the Markovian process."""
        from sklearn.cluster import KMeans
        if self.int_flag == 0:
            labels = numpy.zeros(self.n_samples,dtype=int)
        self._initialize_matrix()
        for t in range(1,self.T):
            kmeans = KMeans(
                n_clusters=self.n_Markov_states[t],
                random_state=0,
            ).fit(self.samples[:,t,:])
            self.Markov_states[t] = kmeans.cluster_centers_
            if self.int_flag == 0:
                labels_new = kmeans.labels_
                counts = numpy.zeros([self.n_Markov_states[t-1],1])
                for i in range(self.n_samples):
                    counts[labels[i]] += 1
                    self.transition_matrix[t][labels[i]][labels_new[i]] += 1
                self.transition_matrix[t] /= counts
                labels = labels_new
        if self.int_flag == 1:
            self.train_transition_matrix()

        return (self.Markov_states,self.transition_matrix) 
Example #5
Source File: argva_node_clustering.py    From pytorch_geometric with MIT License
def test():
    model.eval()
    z = model.encode(data.x, data.train_pos_edge_index)

    # Cluster embedded values using k-means.
    kmeans_input = z.cpu().numpy()
    kmeans = KMeans(n_clusters=7, random_state=0).fit(kmeans_input)
    pred = kmeans.predict(kmeans_input)

    labels = data.y.cpu().numpy()
    completeness = completeness_score(labels, pred)
    hm = homogeneity_score(labels, pred)
    nmi = v_measure_score(labels, pred)

    auc, ap = model.test(z, data.test_pos_edge_index, data.test_neg_edge_index)

    return auc, ap, completeness, hm, nmi 
Example #6
Source File: density_weighted_meta.py    From libact with BSD 2-Clause "Simplified" License
def __init__(self, dataset, base_query_strategy, similarity_metric=None,
                 clustering_method=None, beta=1.0, random_state=None):
        super(DensityWeightedMeta, self).__init__(dataset=dataset)
        if not isinstance(base_query_strategy, QueryStrategy):
            raise TypeError(
                "'base_query_strategy' has to be an instance of 'QueryStrategy'"
            )
        if base_query_strategy.dataset != self.dataset:
            raise ValueError("base_query_strategy should share the same"
                             "dataset instance with DensityWeightedMeta")

        self.base_query_strategy = base_query_strategy
        self.beta = beta
        self.random_state_ = seed_random_state(random_state)

        if clustering_method is not None:
            self.clustering_method = clustering_method
        else:
            self.clustering_method = KMeans(
                n_clusters=5, random_state=self.random_state_)
        
        if similarity_metric is not None:
            self.similarity_metric = similarity_metric
        else:
            self.similarity_metric = cosine_similarity 
Example #7
Source File: clustering.py    From retentioneering-tools with Mozilla Public License 2.0
def calc_mean_dist_from_center(data, km):
    """
    Calculates the mean distance from cluster centers. Note that it is calculated only for KMeans and GMM, because DBSCAN clusters may have ambiguous shapes without meaningful centers.

    Parameters
    --------
    data: pd.DataFrame
        Dataframe with features for clustering indexed as in ``retention_config.index_col``
    km:
        Already fitted clusterer.

    Returns
    -------
    Mapping of cluster names to mean distance from cluster centers.

    Return type
    -------
    Dict
    """
    res = {}
    cl = km.labels_
    cs = km.cluster_centers_
    for i in set(cl):
        res[i] = _cosine_dist(data[cl == i], cs[i]).mean()
    return res 
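The function above relies on a private helper, _cosine_dist, that is not shown on this page. Below is a minimal sketch of what such a helper might look like, assuming it returns the cosine distance of every row of ``data`` to a single center; this is an assumed implementation for illustration, not the actual retentioneering-tools code.

import numpy as np

def _cosine_dist(data, center):
    # Assumed helper: cosine distance of each row in `data` to one center vector.
    X = np.asarray(data, dtype=float)
    c = np.asarray(center, dtype=float)
    eps = 1e-12  # guard against zero-norm rows
    num = X @ c
    denom = np.linalg.norm(X, axis=1) * np.linalg.norm(c) + eps
    return 1.0 - num / denom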
Example #8
Source File: test_clusters.py    From mabwiser with Apache License 2.0
def test_copy(self):
        arms, mab = self.predict(arms=[1, 2, 3, 4],
                                 decisions=[1, 1, 1, 2, 2, 3, 3, 3, 3, 3],
                                 rewards=[0, 1, 1, 0, 0, 0, 0, 1, 1, 1],
                                 learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0),
                                 neighborhood_policy=NeighborhoodPolicy.Clusters(2),
                                 context_history=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1], [0, 0, 1, 0, 0],
                                                  [0, 2, 2, 3, 5], [1, 3, 1, 1, 1], [0, 0, 0, 0, 0],
                                                  [0, 1, 4, 3, 5], [0, 1, 2, 4, 5], [1, 2, 1, 1, 3],
                                                  [0, 2, 1, 0, 0]],
                                 contexts=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1]],
                                 seed=123456,
                                 num_run=1,
                                 is_predict=True)

        clusters = deepcopy(mab._imp)
        self.assertIsNot(clusters, mab._imp)
        self.assertIsInstance(clusters.lp_list[0], _EpsilonGreedy)
        self.assertIsInstance(clusters.lp_list[1], _EpsilonGreedy)
        self.assertIsInstance(clusters.kmeans, KMeans)
        self.assertIsNot(clusters.kmeans, mab._imp.kmeans)
        self.assertIsNot(clusters.lp_list[0], mab._imp.lp_list[0])
        self.assertIsNot(clusters.lp_list[1], mab._imp.lp_list[1])
        self.assertEqual(clusters.lp_list[0].epsilon, mab._imp.lp_list[0].epsilon)
        self.assertEqual(clusters.lp_list[1].epsilon, mab._imp.lp_list[1].epsilon) 
Example #9
Source File: test_clusters.py    From mabwiser with Apache License 2.0
def test_greedy0_n2(self):

        arms, mab = self.predict(arms=[1, 2, 3, 4],
                                 decisions=[1, 1, 1, 2, 2, 3, 3, 3, 3, 3],
                                 rewards=[0, 1, 1, 0, 0, 0, 0, 1, 1, 1],
                                 learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0),
                                 neighborhood_policy=NeighborhoodPolicy.Clusters(2),
                                 context_history=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1], [0, 0, 1, 0, 0],
                                                  [0, 2, 2, 3, 5], [1, 3, 1, 1, 1], [0, 0, 0, 0, 0],
                                                  [0, 1, 4, 3, 5], [0, 1, 2, 4, 5], [1, 2, 1, 1, 3],
                                                  [0, 2, 1, 0, 0]],
                                 contexts=[[0, 1, 2, 3, 5], [1, 1, 1, 1, 1]],
                                 seed=123456,
                                 num_run=1,
                                 is_predict=True)

        self.assertListEqual(arms, [3, 1])
        self.assertTrue(isinstance(mab._imp.kmeans, KMeans)) 
Example #10
Source File: test_k_means.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_elkan_results(distribution):
    # check that results are identical between lloyd and elkan algorithms
    rnd = np.random.RandomState(0)
    if distribution == 'normal':
        X = rnd.normal(size=(50, 10))
    else:
        X, _ = make_blobs(random_state=rnd)

    km_full = KMeans(algorithm='full', n_clusters=5, random_state=0, n_init=1)
    km_elkan = KMeans(algorithm='elkan', n_clusters=5,
                      random_state=0, n_init=1)

    km_full.fit(X)
    km_elkan.fit(X)
    assert_array_almost_equal(km_elkan.cluster_centers_,
                              km_full.cluster_centers_)
    assert_array_equal(km_elkan.labels_, km_full.labels_) 
Example #11
Source File: test_k_means.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_kmeans_results(representation, algo, dtype):
    # checks that kmeans works as intended
    array_constr = {'dense': np.array, 'sparse': sp.csr_matrix}[representation]
    X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype)
    sample_weight = [3, 1, 1, 3]  # will be rescaled to [1.5, 0.5, 0.5, 1.5]
    init_centers = np.array([[0, 0], [1, 1]], dtype=dtype)

    expected_labels = [0, 0, 1, 1]
    expected_inertia = 0.1875
    expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype)
    expected_n_iter = 2

    kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo)
    kmeans.fit(X, sample_weight=sample_weight)

    assert_array_equal(kmeans.labels_, expected_labels)
    assert_almost_equal(kmeans.inertia_, expected_inertia)
    assert_array_almost_equal(kmeans.cluster_centers_, expected_centers)
    assert kmeans.n_iter_ == expected_n_iter 
Example #12
Source File: test_k_means.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_k_means_non_collapsed():
    # Check that k_means with a bad initialization does not yield a singleton.
    # Starting with bad centers that are quickly ignored should not
    # result in a repositioning of the centers to the center of mass that
    # would lead to collapsed centers, which in turn would make the clustering
    # dependent on numerical instabilities.
    my_X = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 1.1]])
    array_init = np.array([[1.0, 1.0], [5.0, 5.0], [-5.0, -5.0]])
    km = KMeans(init=array_init, n_clusters=3, random_state=42, n_init=1)
    km.fit(my_X)

    # centers must not be collapsed
    assert_equal(len(np.unique(km.labels_)), 3)

    centers = km.cluster_centers_
    assert np.linalg.norm(centers[0] - centers[1]) >= 0.1
    assert np.linalg.norm(centers[0] - centers[2]) >= 0.1
    assert np.linalg.norm(centers[1] - centers[2]) >= 0.1 
Example #13
Source File: spectral_graph_partition.py    From LanczosNetwork with MIT License
def spectral_clustering(L, K, seed=1234):
  """
  Implement paper "Shi, J. and Malik, J., 2000. Normalized cuts and image 
  segmentation. IEEE Transactions on pattern analysis and machine intelligence, 
  22(8), pp.888-905."

  Args:
    L: graph Laplacian, numpy or scipy matrix
    K: int, number of clusters

  Returns:
    node_label: list

  N.B.: for simplicity, we only consider simple and undirected graph
  """
  num_nodes = L.shape[0]
  assert (K < num_nodes - 1)

  eig, eig_vec = scipy.sparse.linalg.eigsh(
      L, k=K, which='LM', maxiter=num_nodes * 10000, tol=0, mode='normal')
  kmeans = KMeans(n_clusters=K, random_state=seed).fit(eig_vec.real)

  return kmeans.labels_ 
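A short usage sketch of the function above on a toy graph; the adjacency matrix, the Laplacian construction, and K are illustrative assumptions, and the function's own module is assumed to provide its scipy and sklearn imports.

import numpy as np

# Toy undirected graph: two triangles (nodes 0-2 and 3-5) joined by the edge (2, 3).
A = np.array([[0, 1, 1, 0, 0, 0],
              [1, 0, 1, 0, 0, 0],
              [1, 1, 0, 1, 0, 0],
              [0, 0, 1, 0, 1, 1],
              [0, 0, 0, 1, 0, 1],
              [0, 0, 0, 1, 1, 0]], dtype=float)

deg = A.sum(axis=1)
D_inv_sqrt = np.diag(1.0 / np.sqrt(deg))
L = np.eye(A.shape[0]) - D_inv_sqrt @ A @ D_inv_sqrt  # symmetric normalized Laplacian

node_label = spectral_clustering(L, K=2)  # one cluster id per node
print(node_label)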
Example #14
Source File: test_pipeline.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_fit_predict_on_pipeline():
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    iris = load_iris()
    scaler = StandardScaler()
    km = KMeans(random_state=0)
    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    scaler_for_pipeline = StandardScaler()
    km_for_pipeline = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([
        ('scaler', scaler_for_pipeline),
        ('Kmeans', km_for_pipeline)
    ])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred) 
Example #15
Source File: clustering_kmeans.py    From practicalDataAnalysisCookbook with GNU General Public License v2.0
def findClusters_kmeans(data):
    '''
        Cluster data using k-means
    '''
    # create the clustering estimator
    kmeans = cl.KMeans(
        n_clusters=4,
        n_jobs=-1,
        verbose=0,
        n_init=30
    )

    # fit the data
    return kmeans.fit(data)

# the file name of the dataset 
Example #16
Source File: pancreas_tests.py    From scanorama with MIT License
def entropy_test(datasets_dimred, ds_labels):
    
    ds_labels = np.array(ds_labels)
    X_dimred = np.concatenate(datasets_dimred)
    embedding = None
    
    for k in range(10, 21):
        km = KMeans(n_clusters=k, n_jobs=-1, verbose=0)
        km.fit(X_dimred)

        if False and k % 5 == 0:
            embedding = visualize(
                datasets_dimred,
                km.labels_, NAMESPACE + '_km{}'.format(k),
                [ str(x) for x in range(k) ],
                embedding=embedding
            )
        
        print('k = {}, average normalized entropy = {}'
              .format(k, avg_norm_entropy(ds_labels, km.labels_))) 
Example #17
Source File: feature_preprocess.py    From MassImageRetrieval with Apache License 2.0
def analysis_KMeans():
	mean_distortions = []
	K = len(labels_idx)
	K_range = range(320, 1000)
	for k in K_range:
		print("Cluster k is {}".format(k))
		kmeans_model = KMeans(n_clusters=k, init="k-means++", n_jobs=-1)
		kmeans_model.fit(np_features)
		t_distortions = sum(
			np.min(cdist(np_features, kmeans_model.cluster_centers_, 'euclidean'), axis=1)) / np_features.shape[0]
		mean_distortions.append(t_distortions)

	with open("./kmeans_cluster.csv", "a+") as wh:
		for idx in range(len(K_range)):
			wh.write("{},{}\n".format(K_range[idx], mean_distortions[idx]))

	# plt.plot(K_range, mean_distortions, 'bx-')
	# plt.xlabel('k')
	# plt.ylabel(u'Average distortion degree')
	# plt.title(u'Elbow rule to select the best K value')
	# plt.savefig("kmeans_cluster.png") 
Example #18
Source File: cluster.py    From PHATE with GNU General Public License v2.0
def silhouette_score(phate_op, n_clusters, random_state=None, **kwargs):
    """Compute the Silhouette score on KMeans on the PHATE potential

    Parameters
    ----------
    phate_op : phate.PHATE
        Fitted PHATE operator
    n_clusters : int
        Number of clusters.
    random_state : int or None, optional (default: None)
        Random seed for k-means

    Returns
    -------
    score : float
    """
    cluster_labels = kmeans(phate_op, n_clusters=n_clusters, random_state=random_state, **kwargs)
    return metrics.silhouette_score(phate_op.diff_potential, cluster_labels) 
Example #19
Source File: SpectralClustering.py    From sparse-subspace-clustering-python with MIT License
def SpectralClustering(CKSym, n):
    # This is a direct port of JHU vision lab code. Could probably use sklearn SpectralClustering; a sketch of that alternative follows this example.
    CKSym = CKSym.astype(float)
    N, _ = CKSym.shape
    MAXiter = 1000  # Maximum number of iterations for KMeans
    REPlic = 20  # Number of replications for KMeans

    DN = np.diag(np.divide(1, np.sqrt(np.sum(CKSym, axis=0) + np.finfo(float).eps)))
    LapN = identity(N).toarray().astype(float) - np.matmul(np.matmul(DN, CKSym), DN)
    _, _, vN = np.linalg.svd(LapN)
    vN = vN.T
    kerN = vN[:, N - n:N]
    normN = np.sqrt(np.sum(np.square(kerN), axis=1))
    kerNS = np.divide(kerN, normN.reshape(len(normN), 1) + np.finfo(float).eps)
    km = KMeans(n_clusters=n, n_init=REPlic, max_iter=MAXiter, n_jobs=-1).fit(kerNS)
    return km.labels_ 
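The comment above suggests that sklearn's built-in SpectralClustering could probably be used instead of this port. A minimal sketch of that alternative follows, assuming CKSym is a precomputed symmetric affinity matrix; the parameter choices are assumptions, and the two implementations are not guaranteed to return identical labelings.

from sklearn.cluster import SpectralClustering

def spectral_clustering_sklearn(CKSym, n):
    # Treat CKSym as a precomputed affinity (similarity) matrix.
    sc = SpectralClustering(n_clusters=n, affinity='precomputed',
                            n_init=20, random_state=0)
    return sc.fit_predict(CKSym)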
Example #20
Source File: test_spectral_embedding.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_pipeline_spectral_clustering(seed=36):
    # Test using pipeline to do spectral clustering
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(n_components=n_clusters,
                               affinity="rbf",
                               random_state=random_state)
    se_knn = SpectralEmbedding(n_components=n_clusters,
                               affinity="nearest_neighbors",
                               n_neighbors=5,
                               random_state=random_state)
    for se in [se_rbf, se_knn]:
        km = KMeans(n_clusters=n_clusters, random_state=random_state)
        km.fit(se.fit_transform(S))
        assert_array_almost_equal(
            normalized_mutual_info_score(
                km.labels_,
                true_labels), 1.0, 2) 
Example #21
Source File: clustering.py    From retentioneering-tools with Mozilla Public License 2.0
def simple_cluster(data, max_n_clusters=None, use_csi=True, random_state=0, **kwargs):
    """
    Finds clusters of users in data.

    Parameters
    -------
    data: pd.DataFrame
        Dataframe with features for clustering indexed as in ``retention_config.index_col``
    max_n_clusters: int, optional
        Maximal number of clusters for automatic selection of the number of clusters. If ``None``, then ``n_clusters`` from the keyword arguments is used. Default: ``None``
    use_csi: bool, optional
        If ``True``, then cluster stability index will be calculated. IMPORTANT: it may take a lot of time. Default: ``True``
    random_state: int, optional
        Random state for KMeans clusterer. Default: ``0``
    kwargs: optional
        Parameters for ``sklearn.cluster.KMeans``

    Returns
    -------
    Array of clusters

    Return type
    -------
    np.array
    """
    if max_n_clusters is not None:
        kmargs = find_best_n_clusters(data, KMeans, max_n_clusters, random_state, **kwargs)
    else:
        kmargs = {i: j for i, j in kwargs.items() if i in __KMEANS_FILTER__}
    kmargs.update({'random_state': random_state})
    km = KMeans(**kmargs)
    cl = km.fit_predict(data.values)
    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    return cl, metrics 
Example #22
Source File: clustering.py    From retentioneering-tools with Mozilla Public License 2.0
def find_best_n_clusters(data, clusterer, max_n_clusters, random_state, **kwargs):
    """
    Finds the best number of clusters for KMeans and Gaussian Mixture.

    Parameters
    -------
    data: pd.DataFrame
        Dataframe with features for clustering with index as in ``retention_config.index_col``
    clusterer: sklearn clusterer class
        For instance, ``sklearn.cluster.KMeans`` or ``sklearn.mixture.GaussianMixture``.
    max_n_clusters: int
        Maximal number of clusters for searching.
    random_state: int
        Random state for clusterer.

    Returns
    -------
    Optimal keyword arguments for clustering method.

    Return type
    ------
    Dict
    """
    args = {i: j for i, j in kwargs.items() if i in clusterer.get_params(clusterer)}
    if 'n_clusters' in clusterer.get_params(clusterer):
        kms = True
    else:
        kms = False
    args.pop('n_clusters' if kms else 'n_components', None)
    args.update({'random_state': random_state})
    score = {}
    for i in range(2, max_n_clusters + 1):
        args.update({'n_clusters' if kms else 'n_components': i})
        km = clusterer(**args)
        score[i] = silhouette_score(data, km.fit_predict(data), metric='cosine')
    best = pd.Series(score).idxmax()
    args.update({'n_clusters' if kms else 'n_components': best})
    print(f'Best number of clusters is {best}')
    return args 
Example #23
Source File: K-Means_scikit-learn.py    From MachineLearning_Python with MIT License
def kMenas():
    data = spio.loadmat("data.mat")
    X = data['X']   
    model = KMeans(n_clusters=3).fit(X)  # n_clusters specifies 3 clusters; fit the data
    centroids = model.cluster_centers_  # the cluster centers

    plt.scatter(X[:, 0], X[:, 1])  # scatter plot of the original data
    plt.plot(centroids[:, 0], centroids[:, 1], 'r^', markersize=10)  # cluster centers
    plt.show() 
Example #24
Source File: clustering.py    From retentioneering-tools with Mozilla Public License 2.0
def dbscan(data, use_csi=True, epsq=None, max_cl_number=None, **kwargs):
    """
    Finds clusters of users in data using DBSCAN.

    Parameters
    -------
    data: pd.DataFrame
        Dataframe with features for clustering indexed by users (sessions)
    use_csi: bool, optional
        If ``True``, then cluster stability index will be calculated. IMPORTANT: it may take a lot of time. Default: ``True``
    epsq: float, optional
        Quantile of the positive nearest-neighbor distances between points; the resulting value is used as ``eps``. If ``None``, then ``eps`` from the keyword arguments is used. Default: ``None``
    max_cl_number: int, optional
        Maximal number of clusters for aggregation of small clusters. Default: ``None``
    kwargs: optional
        Parameters for ``sklearn.cluster.DBSCAN``

    Returns
    --------
    Array of clusters

    Return type
    -------
    np.array
    """
    kmargs = {i: j for i, j in kwargs.items() if i in DBSCAN.get_params(DBSCAN)}
    if epsq is not None:
        kmargs.update({'eps': find_best_eps(data, epsq)})
    km = DBSCAN(**kmargs)
    cl = km.fit_predict(data.values)
    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    if max_cl_number is not None:
        cl = aggregate_cl(cl, max_cl_number)
    return cl, metrics 
Example #25
Source File: test_weight_boosting.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_sample_weight_missing():
    from sklearn.cluster import KMeans

    clf = AdaBoostClassifier(KMeans(), algorithm="SAMME")
    assert_raises(ValueError, clf.fit, X, y_regr)

    clf = AdaBoostRegressor(KMeans())
    assert_raises(ValueError, clf.fit, X, y_regr) 
Example #26
Source File: clustering_kmeans_search.py    From practicalDataAnalysisCookbook with GNU General Public License v2.0
def findClusters_kmeans(data, no_of_clusters):
    '''
        Cluster data using k-means
    '''
    # create the clustering estimator
    kmeans = cl.KMeans(
        n_clusters=no_of_clusters,
        n_jobs=-1,
        verbose=0,
        n_init=30
    )

    # fit the data
    return kmeans.fit(data) 
Example #27
Source File: cnn_lcd.py    From CNN_LCD with GNU General Public License v3.0
def cluster_kmeans(sim):
    """Run k-means on similarity matrix and segment"""
    sim_dim = sim.shape[0]
    sim = sim.reshape(-1, 1)

    # Augment with spatial coordinates
    sim_aug = np.concatenate(
        [sim,
         np.mgrid[:sim_dim, :sim_dim].reshape(-1, sim_dim ** 2).T],
        axis=1
    )

    # Empirical metric for number of loop-closures given number of images
    # in sequence (assumption: equally-spaced samples):
    n_clusters = int(np.sqrt(sim_dim))
    print('Performing clustering via KMeans(n={}).'.format(n_clusters))

    km = KMeans(n_clusters=n_clusters, n_jobs=2,
                max_iter=300)
    labels = km.fit_predict(sim_aug)
    print('Got cluster labels')

    for i in range(n_clusters):
        lab_idx = (labels == i)
        if lab_idx.size:
            cc = sim[lab_idx].mean()
            # cc = sim[lab_idx].max()
            sim[lab_idx] = cc

    # Re-normalize and reshape
    sim = sim.reshape(sim_dim, sim_dim) / sim.max()
    return sim 
Example #28
Source File: sam_knn.py    From scikit-multiflow with BSD 3-Clause "New" or "Revised" License
def cluster_down(self, samples, labels):
        """Performs classwise kMeans++ clustering for given samples with corresponding labels.
        The number of samples is halved per class."""
        logging.debug('cluster Down %d' % self.trainStepCount)
        uniqueLabels = np.unique(labels)
        newSamples = np.empty(shape=(0, samples.shape[1]))
        newLabels = np.empty(shape=(0), dtype=np.int32)
        for label in uniqueLabels:
            tmpSamples = samples[labels == label]
            newLength = int(max(tmpSamples.shape[0]/2, 1))
            clustering = KMeans(n_clusters=newLength, n_init=1, random_state=0)
            clustering.fit(tmpSamples)
            newSamples = np.vstack([newSamples, clustering.cluster_centers_])
            newLabels = np.append(newLabels, label*np.ones(shape=newLength, dtype=np.int32))
        return newSamples, newLabels 
Example #29
Source File: clustering_kmeans_search_alternative.py    From practicalDataAnalysisCookbook with GNU General Public License v2.0
def findClusters_kmeans(data, no_of_clusters):
    '''
        Cluster data using k-means
    '''
    # create the clustering estimator
    kmeans = cl.KMeans(
        n_clusters=no_of_clusters,
        n_jobs=-1,
        verbose=0,
        n_init=30
    )

    # fit the data
    return kmeans.fit(data) 
Example #30
Source File: cluster_features.py    From bert-extractive-summarizer with MIT License
def __get_model(self, k: int):
        """
        Retrieve clustering model

        :param k: number of clusters
        :return: Clustering model

        """

        if self.algorithm == 'gmm':
            return GaussianMixture(n_components=k, random_state=self.random_state)
        return KMeans(n_clusters=k, random_state=self.random_state)