python source code of tututils

from pylab import *
import matplotlib.cm as cm
import numpy as np
import scipy.linalg as la
from scipy.stats import chi2
from scipy.spatial import Voronoi, voronoi_plot_2d
from sklearn.datasets import make_blobs


def plot_2d_clusters(X, labels, centers):
    """
    Given an observation array, a label vector, and the location of the centers
    plot the clusters
    """

    clabels = set(labels)
    K = len(clabels)

    if len(centers) != K:
        raise ValueError("Expecting the number of unique labels and centres to"
                         " be the same!")

    # Plot the true clusters
    figure(figsize=(10, 10))
    ax = gca()

    vor = Voronoi(centers)

    voronoi_plot_2d(vor, ax)

    colors = cm.hsv(np.arange(K)/float(K))
    for k, col in enumerate(colors):
        my_members = labels == k
        scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)

    for k, col in enumerate(colors):
        cluster_center = centers[k]
        scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)

    axis('tight')
    axis('equal')
    title('Clusters')


def plot_2d_GMMs(X, labels, means, covs, percentcontour=0.66, npoints=30):
    """
    Given an observation array, a label vector (integer values), and GMM mean
    and covariance parameters, plot the clusters and parameters.
    """

    clabels = set(labels)
    K = len(clabels)

    if len(means) != len(covs) != K:
        raise ValueError("Expecting the number of unique labels, means and"
                         "covariances to be the same!")

    phi = np.linspace(-np.pi, np.pi, npoints)

    circle = np.array([np.sin(phi), np.cos(phi)]).T

    figure(figsize=(10, 10))
    gca()

    colors = cm.hsv(np.arange(K)/float(K))
    for k, col in zip(clabels, colors):

        # points
        my_members = labels == k
        scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)

        # means
        cluster_center = means[k, :]
        scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)

        # covariance
        L = la.cholesky(np.array(covs[k]) * chi2.ppf(percentcontour, [3])
                        + 1e-5 * np.eye(covs[k].shape[0]))
        covpoints = circle.dot(L) + means[k, :]
        plot(covpoints[:, 0], covpoints[:, 1], color=col, linewidth=3)

    axis('tight')
    axis('equal')
    title('Clusters')


def load_2d_simple():
    """
    Should be easily clustered with K-Means.
    """
    centres = [[1, 1], [-0.5, 0], [1, -1]]
    X, labels_true = make_blobs(n_samples=1000, centers=centres,
                                cluster_std=[[0.3, 0.3]])
    return X


def load_2d_hard():
    """
    Returns non-isotropoic data to motivate the use of non-euclidean norms (as
    well as the ground truth).
    """

    centres = np.array([[3., -1.], [-2., 1.], [2., 5.]])
    covs = []
    covs.append(np.array([[4., 2.], [2., 1.5]]))
    covs.append(np.array([[1, -1.5], [-1.5, 3.]]))
    covs.append(np.array([[1., 0.], [0., 1.]]))

    N = [1000, 500, 300]

    X = [np.random.randn(n, 2).dot(la.cholesky(c, lower=True)) + m
         for n, m, c in zip(N, centres, covs)]
    X = np.vstack(X)

    labels = np.concatenate((np.zeros(N[0]), np.ones(N[1]), 2*np.ones(N[2])))

    return X, labels