Python sklearn.datasets.samples_generator.make_blobs() Examples

The following are 30 code examples of sklearn.datasets.samples_generator.make_blobs(), drawn from open-source projects; the project and source file for each are noted above the example. Note that the samples_generator module was deprecated in scikit-learn 0.22 and removed in 0.24, so in current releases make_blobs is imported directly from sklearn.datasets. You may also want to check out all available functions and classes of the sklearn.datasets.samples_generator module.
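Because the legacy import path no longer exists in current scikit-learn, a minimal, version-tolerant sketch may be useful before the examples. The try/except fallback below is an assumption about your environment: it prefers the legacy samples_generator path used throughout this page and falls back to the modern sklearn.datasets path when the legacy module is absent.

import numpy as np

try:
    # legacy path, removed in scikit-learn 0.24
    from sklearn.datasets.samples_generator import make_blobs
except ImportError:
    # modern path (scikit-learn >= 0.24)
    from sklearn.datasets import make_blobs

# draw 100 two-dimensional points around 3 Gaussian centers
X, y = make_blobs(n_samples=100, centers=3, n_features=2,
                  cluster_std=1.0, random_state=0)
print(X.shape)       # (100, 2)
print(np.unique(y))  # [0 1 2]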
Example #1
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_minibatch_sensible_reassign_fit():
    # check if identical initial clusters are reassigned
    # also a regression test for when there are more desired reassignments than
    # samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42,
                                 init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)

    # do the same with batch-size > X.shape[0] (regression test)
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10) 
Example #2
Source File: make_data.py    From DCC with MIT License
def make_easy_visual_data(path, N=600):
    """Make 3 clusters of 2D data where the cluster centers lie along a line.
    The latent variable would be just their x or y value since that uniquely defines their projection onto the line.
    """

    line = (1.5, 1)
    centers = [(m, m * line[0] + line[1]) for m in (-4, 0, 6)]
    cluster_std = [1, 1, 1.5]
    X, labels = make_blobs(n_samples=N, cluster_std=cluster_std, centers=centers, n_features=len(centers[0]))

    # scale data
    minmaxscale = MinMaxScaler().fit(X)
    X = minmaxscale.transform(X)

    save_misc_data(path, X, labels, N)
    return X, labels 
Example #3
Source File: test_cluster.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_affinity_propagation_class(self):
        from sklearn.datasets.samples_generator import make_blobs

        centers = [[1, 1], [-1, -1], [1, -1]]
        X, labels_true = make_blobs(n_samples=300, centers=centers,
                                    cluster_std=0.5, random_state=0)

        df = pdml.ModelFrame(data=X, target=labels_true)
        af = df.cluster.AffinityPropagation(preference=-50)
        df.fit(af)

        af2 = cluster.AffinityPropagation(preference=-50).fit(X)

        tm.assert_numpy_array_equal(af.cluster_centers_indices_,
                                    af2.cluster_centers_indices_)
        tm.assert_numpy_array_equal(af.labels_, af2.labels_) 
Example #4
Source File: test_optics.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_dbscan_optics_parity(eps, min_samples):
    # Test that OPTICS clustering labels differ from DBSCAN's on at most 5% of points

    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    # calculate optics with dbscan extract at 0.3 epsilon
    op = OPTICS(min_samples=min_samples, cluster_method='dbscan',
                eps=eps).fit(X)

    # calculate dbscan labels
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

    contingency = contingency_matrix(db.labels_, op.labels_)
    agree = min(np.sum(np.max(contingency, axis=0)),
                np.sum(np.max(contingency, axis=1)))
    disagree = X.shape[0] - agree

    percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)

    # verify label mismatch is <= 5% labels
    assert percent_mismatch <= 0.05 
Example #5
Source File: test_k_means.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_elkan_results(distribution):
    # check that results are identical between lloyd and elkan algorithms
    rnd = np.random.RandomState(0)
    if distribution == 'normal':
        X = rnd.normal(size=(50, 10))
    else:
        X, _ = make_blobs(random_state=rnd)

    km_full = KMeans(algorithm='full', n_clusters=5, random_state=0, n_init=1)
    km_elkan = KMeans(algorithm='elkan', n_clusters=5,
                      random_state=0, n_init=1)

    km_full.fit(X)
    km_elkan.fit(X)
    assert_array_almost_equal(km_elkan.cluster_centers_,
                              km_full.cluster_centers_)
    assert_array_equal(km_elkan.labels_, km_full.labels_) 
Example #6
Source File: test_k_means.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_minibatch_sensible_reassign_fit():
    # check if identical initial clusters are reassigned
    # also a regression test for when there are more desired reassignments than
    # samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10, random_state=42,
                                 init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)

    # do the same with batch-size > X.shape[0] (regression test)
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10) 
Example #7
Source File: gmm.py    From intro_ds with Apache License 2.0
def generateCaseTwo(n):
    """
    Randomly generate data whose clusters have different internal variances
    """
    centers = [[-2, 0], [0, 2], [2, 4]]
    std = [0.1, 1, 0.2]
    data, _ = make_blobs(n_samples=n, centers=centers, cluster_std=std)
    return data 
Example #8
Source File: kmeans.py    From MachineLearning with BSD 3-Clause "New" or "Revised" License
def plot_kmeans():
    X, y = make_blobs(n_samples=300, centers=4,
                      random_state=0, cluster_std=0.60)

    y_pred = KMeans(4).fit(X).predict(X)

    fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    ax[0].scatter(X[:, 0], X[:, 1])
    ax[0].set_title('Input')

    ax[1].scatter(X[:, 0], X[:, 1], c=y)
    ax[1].set_title('Labels determined by K Means') 
Example #9
Source File: kmeans_limitations.py    From intro_ds with Apache License 2.0
def generateCaseTwo(n):
    """
    Randomly generate data whose clusters have different internal variances
    """
    centers = [[-2, 0], [0, 2], [2, 4]]
    std = [0.1, 1, 0.2]
    data, _ = make_blobs(n_samples=n, centers=centers, cluster_std=std)
    return data 
Example #10
Source File: kmeans.py    From intro_ds with Apache License 2.0
def generateData(n):
    """
    Generate random clustering data
    """
    centers = [[1, 1], [-1, -1]]
    X, _ = make_blobs(n_samples=n, centers=centers, cluster_std=0.5)
    return X 
Example #11
Source File: gmm_choose_k.py    From intro_ds with Apache License 2.0
def generateData(n):
    """
    Randomly generate data whose clusters have different internal variances
    """
    centers = [[-2, 0], [0, 2], [2, 4]]
    std = [0.1, 1, 0.2]
    data, _ = make_blobs(n_samples=n, centers=centers, cluster_std=std)
    return data 
Example #12
Source File: kmeans_choose_k.py    From intro_ds with Apache License 2.0
def generateData(n):
    """
    Generate random clustering data with 3 cluster centers
    """
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(n_samples=n, centers=centers, cluster_std=0.5)
    return X 
Example #13
Source File: object_ranking_data_generator.py    From cs-ranking with Apache License 2.0
def make_gp_non_transitive(
        self,
        n_instances=1000,
        n_objects=5,
        n_features=100,
        center_box=(-10.0, 10.0),
        cluster_std=2.0,
        seed=42,
        **kwd,
    ):
        n_samples = n_instances * n_objects
        random_state = check_random_state(seed=seed)
        x, y = make_blobs(
            n_samples=n_samples,
            centers=n_objects,
            n_features=n_features,
            cluster_std=cluster_std,
            center_box=center_box,
            random_state=random_state,
            shuffle=True,
        )
        y = np.array([y])
        samples = np.append(x, y.T, axis=1)
        samples = samples[samples[:, n_features].argsort()]
        pairwise_prob = create_pairwise_prob_matrix(n_objects)
        X = []
        Y = []
        for inst in range(n_instances):
            feature = np.array(
                [samples[inst + i * n_instances, 0:-1] for i in range(n_objects)]
            )
            matrix = np.random.binomial(1, pairwise_prob)
            objects = list(np.arange(n_objects))
            ordering = np.array(quicksort(objects, matrix))
            ranking = np.argsort(ordering)
            X.append(feature)
            Y.append(ranking)
        X = np.array(X)
        Y = np.array(Y)
        return X, Y 
Example #14
Source File: test_spectral.py    From twitter-stock-recommendation with MIT License
def test_spectral_amg_mode():
    # Test the amg mode of SpectralClustering
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    try:
        from pyamg import smoothed_aggregation_solver  # noqa

        amg_loaded = True
    except ImportError:
        amg_loaded = False
    if amg_loaded:
        labels = spectral_clustering(S, n_clusters=len(centers),
                                     random_state=0, eigen_solver="amg")
        # We don't care too much that it's good, just that it *worked*.
        # There does have to be some lower limit on the performance though.
        assert_greater(np.mean(labels == true_labels), .3)
    else:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers),
                      random_state=0, eigen_solver="amg") 
Example #15
Source File: test_spectral.py    From twitter-stock-recommendation with MIT License
def test_spectral_unknown_mode():
    # Test that SpectralClustering fails with an unknown mode set.
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    assert_raises(ValueError, spectral_clustering, S, n_clusters=2,
                  random_state=0, eigen_solver="<unknown>") 
Example #16
Source File: test_spectral.py    From twitter-stock-recommendation with MIT License
def test_spectral_unknown_assign_labels():
    # Test that SpectralClustering fails with an unknown assign_labels set.
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    assert_raises(ValueError, spectral_clustering, S, n_clusters=2,
                  random_state=0, assign_labels="<unknown>") 
Example #17
Source File: test_spectral.py    From twitter-stock-recommendation with MIT License
def test_spectral_clustering_sparse():
    X, y = make_blobs(n_samples=20, random_state=0,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.01)

    S = rbf_kernel(X, gamma=1)
    S = np.maximum(S - 1e-4, 0)
    S = sparse.coo_matrix(S)

    labels = SpectralClustering(random_state=0, n_clusters=2,
                                affinity='precomputed').fit(S).labels_
    assert_equal(adjusted_rand_score(y, labels), 1) 
Example #18
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_elkan_results():
    rnd = np.random.RandomState(0)
    X_normal = rnd.normal(size=(50, 10))
    X_blobs, _ = make_blobs(random_state=0)
    km_full = KMeans(algorithm='full', n_clusters=5, random_state=0, n_init=1)
    km_elkan = KMeans(algorithm='elkan', n_clusters=5,
                      random_state=0, n_init=1)
    for X in [X_normal, X_blobs]:
        km_full.fit(X)
        km_elkan.fit(X)
        assert_array_almost_equal(km_elkan.cluster_centers_,
                                  km_full.cluster_centers_)
        assert_array_equal(km_elkan.labels_, km_full.labels_) 
Example #19
Source File: test_k_means.py    From twitter-stock-recommendation with MIT License
def test_minibatch_sensible_reassign_partial_fit():
    # n_samples is a module-level constant defined in the original test file
    zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42, init="random")
    for i in range(100):
        mb_k_means.partial_fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10) 
Example #20
Source File: sgd_separator.py    From ESAC-stats-2014 with BSD 2-Clause "Simplified" License
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)

    # fit the model
    # note: n_iter was renamed to max_iter in scikit-learn 0.19+
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        max_iter=200, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])  # decision_function expects a 2D array of samples
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

    ax.axis('tight') 
Example #21
Source File: sgd_separator.py    From MachineLearning with BSD 3-Clause "New" or "Revised" License
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)

    # fit the model
    # note: n_iter was renamed to max_iter in scikit-learn 0.19+
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        max_iter=200, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])  # decision_function expects a 2D array of samples
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, s=60)

    ax.axis('tight') 
Example #22
Source File: sgd_separator.py    From sklearn_pydata2015 with BSD 3-Clause "New" or "Revised" License
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)

    # fit the model
    # note: n_iter was renamed to max_iter in scikit-learn 0.19+
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        max_iter=200, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])  # decision_function expects a 2D array of samples
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

    ax.axis('tight') 
Example #23
Source File: gmm_plots.py    From numpy-ml with GNU General Public License v3.0
def plot():
    fig, axes = plt.subplots(4, 4)
    fig.set_size_inches(10, 10)
    for i, ax in enumerate(axes.flatten()):
        n_ex = 150
        n_in = 2
        n_classes = np.random.randint(2, 4)
        X, y = make_blobs(
            n_samples=n_ex, centers=n_classes, n_features=n_in, random_state=i
        )
        X -= X.mean(axis=0)

        # take best fit over 10 runs
        best_elbo = -np.inf
        for k in range(10):
            _G = GMM(C=n_classes, seed=i * 3)
            ret = _G.fit(X, max_iter=100, verbose=False)
            while ret != 0:
                print("Components collapsed; Refitting")
                ret = _G.fit(X, max_iter=100, verbose=False)

            if _G.best_elbo > best_elbo:
                best_elbo = _G.best_elbo
                G = _G

        ax = plot_clusters(G, X, ax)
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
        ax.set_title("# Classes: {}; Final VLB: {:.2f}".format(n_classes, G.best_elbo))

    plt.tight_layout()
    plt.savefig("img/plot.png", dpi=300)
    plt.close("all") 
Example #24
Source File: lm_plots.py    From numpy-ml with GNU General Public License v3.0
def random_classification_problem(n_ex, n_classes, n_in, seed=0):
    X, y = make_blobs(
        n_samples=n_ex, centers=n_classes, n_features=n_in, random_state=seed
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )
    return X_train, y_train, X_test, y_test


#######################################################################
#                                Plots                                #
####################################################################### 
Example #25
Source File: test_k_means.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_minibatch_sensible_reassign_partial_fit():
    # n_samples is a module-level constant defined in the original test file
    zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42, init="random")
    for i in range(100):
        mb_k_means.partial_fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10) 
Example #26
Source File: test_optics.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_close_extract():
    # Test extract where extraction eps is close to scaled max_eps

    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    # Compute OPTICS
    clust = OPTICS(max_eps=1.0, cluster_method='dbscan',
                   eps=0.3, min_samples=10).fit(X)
    # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters
    assert_equal(max(clust.labels_), 2) 
Example #27
Source File: test_optics.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_bad_reachability():
    msg = "All reachability values are inf. Set a larger max_eps."
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    with pytest.warns(UserWarning, match=msg):
        clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015)
        clust.fit(X) 
Example #28
Source File: test_optics.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_bad_extract():
    # Test an extraction of eps too close to original eps
    msg = "Specify an epsilon smaller than 0.15. Got 0.3."
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    # Compute OPTICS
    clust = OPTICS(max_eps=5.0 * 0.03,
                   cluster_method='dbscan',
                   eps=0.3, min_samples=10)
    assert_raise_message(ValueError, msg, clust.fit, X) 
Example #29
Source File: test_mean_shift.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_parallel():
    centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
    X, _ = make_blobs(n_samples=50, n_features=2, centers=centers,
                      cluster_std=0.4, shuffle=True, random_state=11)

    ms1 = MeanShift(n_jobs=2)
    ms1.fit(X)

    ms2 = MeanShift()
    ms2.fit(X)

    assert_array_almost_equal(ms1.cluster_centers_, ms2.cluster_centers_)
    assert_array_equal(ms1.labels_, ms2.labels_) 
Example #30
Source File: test_spectral.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_spectral_clustering_sparse():
    X, y = make_blobs(n_samples=20, random_state=0,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.01)

    S = rbf_kernel(X, gamma=1)
    S = np.maximum(S - 1e-4, 0)
    S = sparse.coo_matrix(S)

    labels = SpectralClustering(random_state=0, n_clusters=2,
                                affinity='precomputed').fit(S).labels_
    assert adjusted_rand_score(y, labels) == 1