Python sklearn.cluster.AgglomerativeClustering() Examples

The following are 26 code examples of sklearn.cluster.AgglomerativeClustering(), collected from open-source projects. The source file, project, and license are noted above each example. You may also want to check out all available functions and classes of the module sklearn.cluster.
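
Before the project examples, a minimal, self-contained sketch of the basic API may help orient readers; the data and parameter values below are illustrative only. Note also that scikit-learn 1.2 renamed the estimator's affinity parameter to metric (affinity was removed entirely in 1.4), so the older snippets below may need that substitution on recent versions.

import numpy as np
from sklearn.cluster import AgglomerativeClustering

X = np.array([[0, 0], [0, 1], [1, 0],
              [8, 8], [8, 9], [9, 8]])

# Fixed number of clusters, Ward linkage on Euclidean distances (the defaults)
model = AgglomerativeClustering(n_clusters=2).fit(X)
print(model.labels_)  # two groups of three points each

# Alternatively, cut the merge tree at a linkage-distance threshold
model = AgglomerativeClustering(n_clusters=None, distance_threshold=5.0).fit(X)
print(model.n_clusters_)  # 2 clusters at this threshold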
Example #1
Source File: graph_eval.py    From nodevectors with MIT License
def print_labeled_tests(w, y, test_size=0.2, seed=42):
    """
    Clustering and label prediction tests
    """
    X_train, X_test, y_train, y_test = train_test_split(
        w, y, test_size=test_size, random_state=seed)
    # Print Label Prediction Tests
    res = LabelPrediction(w, y, test_size=test_size, seed=seed)
    # Can only cluster on single-label (not multioutput)
    if len(y.shape) < 2:
        n_clusters = np.unique(y).size
        umpagglo = cluster.AgglomerativeClustering(
            n_clusters=n_clusters, 
            affinity='cosine', 
            linkage='average'
        ).fit(w).labels_
        x = evalClusteringOnLabels(umpagglo, y, verbose=True)
        res = {**res, **x}
    return res 
Example #2
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)

    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0]) 
Example #3
Source File: test_hierarchical.py    From twitter-stock-recommendation with MIT License
def test_compute_full_tree():
    # Test that the full tree is computed if n_clusters is small
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)
    connectivity = kneighbors_graph(X, 5, include_self=False)

    # When n_clusters is small, the full tree should be built,
    # that is, the number of merges should be n_samples - 1
    agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity)
    agc.fit(X)
    n_samples = X.shape[0]
    n_nodes = agc.children_.shape[0]
    assert_equal(n_nodes, n_samples - 1)

    # When n_clusters is large (greater than the max of 100 and
    # 0.02 * n_samples), merging should stop once n_clusters clusters remain.
    n_clusters = 101
    X = rng.randn(200, 2)
    connectivity = kneighbors_graph(X, 10, include_self=False)
    agc = AgglomerativeClustering(n_clusters=n_clusters,
                                  connectivity=connectivity)
    agc.fit(X)
    n_samples = X.shape[0]
    n_nodes = agc.children_.shape[0]
    assert_equal(n_nodes, n_samples - n_clusters) 
Example #4
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_single_linkage_clustering():
    # Check that we get the correct result in two emblematic cases
    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(moons)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     moon_labels), 1)

    circles, circle_labels = make_circles(factor=0.5, noise=0.025,
                                          random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(circles)
    assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                     circle_labels), 1) 
Example #5
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_identical_points():
    # Ensure identical points are handled correctly when using the minimum
    # spanning tree (MST) with a sparse connectivity matrix
    X = np.array([[0, 0, 0], [0, 0, 0],
                  [1, 1, 1], [1, 1, 1],
                  [2, 2, 2], [2, 2, 2]])
    true_labels = np.array([0, 0, 1, 1, 2, 2])
    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    connectivity, n_components = _fix_connectivity(X,
                                                   connectivity,
                                                   'euclidean')

    for linkage in ('single', 'average', 'complete', 'ward'):
        clustering = AgglomerativeClustering(n_clusters=3,
                                             linkage=linkage,
                                             connectivity=connectivity)
        clustering.fit(X)

        assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                         true_labels), 1) 
Example #6
Source File: agglomerative.py    From Python-Machine-Learning-Cookbook-Second-Edition with MIT License
def perform_clustering(X, connectivity, title, num_clusters=3, linkage='ward'):
    plt.figure()
    model = AgglomerativeClustering(linkage=linkage,
                                    connectivity=connectivity,
                                    n_clusters=num_clusters)
    model.fit(X)

    # extract labels
    labels = model.labels_

    # specify marker shapes for different clusters
    markers = '.vx'

    for i, marker in zip(range(num_clusters), markers):
        # plot the points belonging to the current cluster
        plt.scatter(X[labels==i, 0], X[labels==i, 1], s=50, 
                    marker=marker, color='k', facecolors='none')

    plt.title(title) 
Example #7
Source File: cluster_manager.py    From texta with GNU General Public License v3.0
def _cluster_documents(self):
        method = self.params['cluster_method']
        n_clusters = int(self.params['cluster_n_clusters'])
        n_samples = len(self.document_vectors)

        if n_clusters > n_samples:
            n_clusters = n_samples

        if method == 'kmeans':
            clusterer = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=1)
        else:
            clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete', affinity='cosine')

        clustering = clusterer.fit(self.document_vectors)
        cluster_labels = clustering.labels_
        clustering_dict = clustering.__dict__
        clusters = {}

        for document_id, cluster_label in enumerate(cluster_labels):
            if cluster_label not in clusters:
                clusters[cluster_label] = []
            clusters[cluster_label].append(document_id)

        return clusters 
Example #8
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_cluster_distances_with_distance_threshold():
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.randint(-10, 10, size=(n_samples, 3))
    # check the distances within the clusters and with other clusters
    distance_threshold = 4
    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage="single").fit(X)
    labels = clustering.labels_
    D = pairwise_distances(X, metric="minkowski", p=2)
    # to avoid taking the 0 diagonal in min()
    np.fill_diagonal(D, np.inf)
    for label in np.unique(labels):
        in_cluster_mask = labels == label
        max_in_cluster_distance = (D[in_cluster_mask][:, in_cluster_mask]
                                   .min(axis=0).max())
        min_out_cluster_distance = (D[in_cluster_mask][:, ~in_cluster_mask]
                                    .min(axis=0).min())
        # single data point clusters only have that inf diagonal here
        if in_cluster_mask.sum() > 1:
            assert max_in_cluster_distance < distance_threshold
        assert min_out_cluster_distance >= distance_threshold 
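With n_clusters=None, as in the test above, merging simply stops at the first linkage distance that reaches distance_threshold; for single linkage this is exactly why every within-cluster nearest-neighbour distance stays below the threshold while between-cluster distances do not. A minimal sketch of that calling pattern, on illustrative data:

import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.RandomState(0)
X = rng.randint(-10, 10, size=(100, 3)).astype(float)

# no fixed n_clusters: stop merging once the linkage distance reaches 4
clustering = AgglomerativeClustering(n_clusters=None,
                                     distance_threshold=4,
                                     linkage="single").fit(X)
print(clustering.n_clusters_)  # number of clusters implied by the threshold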
Example #9
Source File: predicting.py    From ImageSetCleaner with GNU General Public License v3.0
def detection_with_agglomaritve_clustering(image_set):
    """
    Really good if the classes you are analyzing are close to what the network learned.

    :param image_set: The bottleneck values of the relevant images.
    :return: Predictions vector

     N.B : The detector breaks with a full black image.
    """

    # http://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_clustering.html#sphx-glr-auto-examples-cluster-plot-agglomerative-clustering-py
    clf = cluster.AgglomerativeClustering(n_clusters=2, affinity="l2", linkage="complete")

    clf.fit(image_set)

    predictions = clf.labels_
    predictions = normalize_predictions(predictions)

    return predictions 
Example #10
Source File: clusterings.py    From parcellation_fragmenter with BSD 3-Clause "New" or "Revised" License
def ward(n_clusters, samples):

    """
    Run Ward clustering on vertex coordinates.

    Parameters:
    - - - - -
    n_clusters : int
        number of clusters to generate
    samples : array
        Euclidean-space coordinates of vertices
    """

    # Generate KNN graph
    knn_graph = neighbors.kneighbors_graph(
        samples, n_neighbors=20, mode='connectivity', metric='minkowski', p=2,
        include_self=False, n_jobs=-1)

    # Apply Ward-Agglomerative clustering
    ward = cluster.AgglomerativeClustering(
        n_clusters=n_clusters, affinity='euclidean', connectivity=knn_graph,
        linkage='ward')

    ward.fit(samples)
    labels = ward.labels_.copy()
    labels = labels.astype(np.int32)+1

    return labels 
Example #11
Source File: __init__.py    From dials with BSD 3-Clause "New" or "Revised" License
def _agglomerative_clustering(self):
        X = self.coords.as_numpy_array()

        # Perform cluster analysis
        from sklearn.cluster import AgglomerativeClustering
        import numpy as np

        model = AgglomerativeClustering(
            n_clusters=self.params.cluster.n_clusters,
            linkage="average",
            affinity="cosine",
        )
        model.fit(X)
        return flex.int(model.labels_.astype(np.int32)) 
Example #12
Source File: agglomerative.py    From trajminer with MIT License
def __init__(self, n_clusters, linkage='ward', measure='precomputed',
                 n_jobs=1):
        # forward the linkage choice to scikit-learn (the original snippet
        # accepted it but never passed it on); note that 'ward' only works
        # on raw Euclidean input, so choose e.g. 'complete' or 'average'
        # when the distances are precomputed
        self.agglomerative = skAgglomerative(n_clusters=n_clusters,
                                             affinity='precomputed',
                                             linkage=linkage)
        self.n_clusters = n_clusters
        self.measure = measure
        self.n_jobs = n_jobs 
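Because the wrapper above hard-codes affinity='precomputed', fit must receive a square distance matrix rather than raw feature vectors. A minimal sketch of that pattern, independent of trajminer (the data is illustrative, and complete linkage is used since Ward requires raw Euclidean input):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(10, 4)
D = pairwise_distances(X)  # square (10, 10) matrix of Euclidean distances

labels = AgglomerativeClustering(n_clusters=3,
                                 affinity='precomputed',
                                 linkage='complete').fit_predict(D)
print(labels)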
Example #13
Source File: cluster_images.py    From NucleiDetectron with Apache License 2.0
def create_color_cluster_agglomerative_clustering(in_df, num_clusters):
    cluster_maker = AgglomerativeClustering(linkage='average', n_clusters=num_clusters)

    cluster_maker.fit(in_df[color_features_names])

    in_df['cluster-id'] = cluster_maker.labels_

    in_df['cluster-id'] = in_df['cluster-id'].map(lambda x: str(x))
    return in_df 
Example #14
Source File: create_endpoints_mask_with_clustering.py    From TractSeg with Apache License 2.0
def cluster(points, algorithm="DBSCAN"):
    # note: the default must be the string "DBSCAN", not the DBSCAN class,
    # or neither branch below matches and labels is never assigned
    print("Running {}...".format(algorithm))
    if algorithm == "KMeans":
        # not good at finding clusters if close together
        labels = KMeans(n_clusters=2, random_state=0, n_jobs=-1).fit_predict(points)
    elif algorithm == "DBSCAN":
        # no fixed number of labels; slow with high eps
        labels = DBSCAN(eps=3.0, n_jobs=-1).fit_predict(points)
    # labels = SpectralClustering(n_clusters=2, n_jobs=-1).fit_predict(points)  # slow (> 1min)
    # labels = AgglomerativeClustering(n_clusters=2).fit_predict(points)  # fast
    points_start, points_end = select_two_biggest_clusters(labels, points)
    return points_start, points_end 
Example #15
Source File: compare_clustering_algs.py    From mmvt with GNU General Public License v3.0
def compare(data, n_groups, output_fol):
    # plot_clusters(data.astype(np.float), scipy.cluster.vq.kmeans, 'scipy.cluster.vq.kmeans', output_fol, (n_groups,), {})
    plot_clusters(data, cluster.KMeans, 'KMeans', output_fol, (), {'n_clusters': n_groups})
    for ct in ['spherical', 'tied', 'diag', 'full']:
        plot_clusters(data, mixture.GaussianMixture, 'GMM_{}'.format(ct), output_fol, (),
                      {'n_components': n_groups, 'covariance_type': ct})
    plot_clusters(data, cluster.AffinityPropagation, 'AffinityPropagation', output_fol, (), {'preference': -5.0, 'damping': 0.95})
    plot_clusters(data, cluster.MeanShift, 'MeanShift', output_fol, (0.175,), {'cluster_all': False})
    plot_clusters(data, cluster.SpectralClustering, 'SpectralClustering', output_fol, (), {'n_clusters': n_groups})
    plot_clusters(data, cluster.AgglomerativeClustering, 'AgglomerativeClustering', output_fol, (), {'n_clusters': n_groups, 'linkage': 'ward'})
    plot_clusters(data, cluster.DBSCAN, 'DBSCAN', output_fol, (), {'eps': 0.025})
    # plot_clusters(data, hdbscan.HDBSCAN, 'HDBSCAN', output_fol, (), {'min_cluster_size': 15}) 
Example #16
Source File: sklearn_cluster.py    From learn-to-cluster with MIT License
def hierarchy(feat, n_clusters, knn, **kwargs):
    from sklearn.neighbors import kneighbors_graph
    knn_graph = kneighbors_graph(feat, knn, include_self=False)
    hierarchy = cluster.AgglomerativeClustering(n_clusters=n_clusters,
                                                connectivity=knn_graph,
                                                linkage='ward').fit(feat)
    return hierarchy.labels_ 
Example #17
Source File: test_hierarchical.py    From twitter-stock-recommendation with MIT License
def test_agglomerative_clustering_wrong_arg_memory():
    # Test that an error is raised when memory is neither a str nor a
    # joblib.Memory instance
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    memory = 5
    clustering = AgglomerativeClustering(memory=memory)
    assert_raises(ValueError, clustering.fit, X) 
Example #18
Source File: test_hierarchical.py    From twitter-stock-recommendation with MIT License
def test_connectivity_propagation():
    # Check that connectivity in the ward tree is propagated correctly during
    # merging.
    X = np.array([(.014, .120), (.014, .099), (.014, .097),
                  (.017, .153), (.017, .153), (.018, .153),
                  (.018, .153), (.018, .153), (.018, .153),
                  (.018, .153), (.018, .153), (.018, .153),
                  (.018, .152), (.018, .149), (.018, .144)])
    connectivity = kneighbors_graph(X, 10, include_self=False)
    ward = AgglomerativeClustering(
        n_clusters=4, connectivity=connectivity, linkage='ward')
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X) 
Example #19
Source File: test_hierarchical.py    From twitter-stock-recommendation with MIT License
def test_connectivity_fixing_non_lil():
    # Non-regression check for a bug where a connectivity matrix that does
    # not support item assignment is provided with more than one component.
    # create dummy data
    x = np.array([[0, 0], [1, 1]])
    # create a mask with several components to force connectivity fixing
    m = np.array([[True, False], [False, True]])
    c = grid_to_graph(n_x=2, n_y=2, mask=m)
    w = AgglomerativeClustering(connectivity=c, linkage='ward')
    assert_warns(UserWarning, w.fit, x) 
Example #20
Source File: test_hierarchical.py    From twitter-stock-recommendation with MIT License
def test_connectivity_callable():
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    connectivity = kneighbors_graph(X, 3, include_self=False)
    aglc1 = AgglomerativeClustering(connectivity=connectivity)
    aglc2 = AgglomerativeClustering(
        connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False))
    aglc1.fit(X)
    aglc2.fit(X)
    assert_array_equal(aglc1.labels_, aglc2.labels_) 
Example #21
Source File: test_hierarchical.py    From twitter-stock-recommendation with MIT License
def test_agg_n_clusters():
    # Test that an error is raised when n_clusters <= 0

    rng = np.random.RandomState(0)
    X = rng.rand(20, 10)
    for n_clus in [-1, 0]:
        agc = AgglomerativeClustering(n_clusters=n_clus)
        msg = ("n_clusters should be an integer greater than 0."
               " %s was provided." % str(agc.n_clusters))
        assert_raise_message(ValueError, msg, agc.fit, X) 
Example #22
Source File: stitch_patches_page.py    From ScanSSD with MIT License
def clustering(math_regions, char_data, image, algorithm, thresh_votes):

    math_regions = np.asarray(math_regions)

    # cluster the regions on the centers of their bounding boxes
    centers = []
    for math_region in math_regions:
        center = [(math_region[0] + math_region[2]) / 2,
                  (math_region[1] + math_region[3]) / 2]
        centers.append(center)

    clustering = AgglomerativeClustering().fit(centers)

    # group the regions by their per-sample cluster label; the original
    # indexed with the array of unique labels (a bug) and returned nothing
    clusters = {}
    for label in np.unique(clustering.labels_):
        clusters[label] = math_regions[clustering.labels_ == label]

    return clusters 
Example #23
Source File: baseline_clustering.py    From cdp with MIT License
def hierarchy(feat, n_clusters=2, knn=30):
    from sklearn.neighbors import kneighbors_graph
    knn_graph = kneighbors_graph(feat, knn, include_self=False)
    hierarchy = cluster.AgglomerativeClustering(n_clusters=n_clusters,
                                                connectivity=knn_graph,
                                                linkage='ward').fit(feat)
    return hierarchy.labels_ 
Example #24
Source File: diarization.py    From VBDiarization with Apache License 2.0
def run_ahc(self, n_clusters, embeddings, scores_matrix):
        """ Run agglomerative hierarchical clustering.

        Returns:
            np.array: means of clusters
        """
        # min-max normalise the similarity scores and negate them so that the
        # most similar pairs have the smallest value in the precomputed matrix
        scores_matrix = -((scores_matrix - np.min(scores_matrix)) /
                          (np.max(scores_matrix) - np.min(scores_matrix)))
        ahc = AgglomerativeClustering(affinity='precomputed', linkage='complete', n_clusters=n_clusters)
        labels = ahc.fit_predict(scores_matrix)
        return np.array([np.mean(embeddings[np.where(labels == i)], axis=0) for i in range(n_clusters)]) 
Example #25
Source File: infer.py    From NLP_Toolkit with Apache License 2.0
def n_cluster_embeddings(self, features=None, n_clusters=3, method='ac'):
        '''
        clusters the nodes based on embedding features
        features = None (use DGI generated embeddings)
        '''
        if method == 'ac':
            clustering = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean',\
                                                 linkage='ward')
            clustering.fit(self.embeddings if features is None else features)
            self.labels = clustering.labels_
            self.score = silhouette_score(self.embeddings if features is None else features,\
                                          self.labels)
        return {'labels': self.labels, 'score': self.score} 
Example #26
Source File: test_hierarchical.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_n_components_deprecation():
    # Test that a Deprecation warning is thrown when n_components_
    # attribute is accessed

    X = np.array([[1, 2], [1, 4], [1, 0], [4, 2]])
    agc = AgglomerativeClustering().fit(X)

    match = ("``n_components_`` attribute was deprecated "
             "in favor of ``n_connected_components_``")
    with pytest.warns(DeprecationWarning, match=match):
        n = agc.n_components_
    assert n == agc.n_connected_components_