import os import io import shutil import numpy as np import pandas import matplotlib.pyplot as plt from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score from jinja2 import Environment, FileSystemLoader from .algorithm import Algorithm from .hierarchy import HierarchicalClustering class Clustering(object): @staticmethod def choose_algorithm(min_clusters, max_clusters, random_state): algorithms = [] algorithms.append( Algorithm( KMeans(random_state=random_state), [], 'K-Means', ('https://scikit-learn.org/stable/modules/generated/' 'sklearn.cluster.KMeans.html') ) ) algorithms.append( Algorithm( HierarchicalClustering(random_state=random_state), [], 'Hierarchical Clustering', ('https://docs.scipy.org/doc/scipy/reference/generated/' 'scipy.cluster.hierarchy.linkage.html') ) ) return algorithms @classmethod def analyze(cls, algorithms, data, min_clusters, max_clusters, random_state, verbose): for i in range(len(algorithms)): if verbose: print(' %s' % algorithms[i].name) if isinstance(data.X, pandas.DataFrame): X = data.X.to_numpy() else: X = data.X gap, sk, nc = Clustering.calc_gap(algorithms[i].estimator, X, min_clusters, max_clusters, random_state=random_state) algorithms[i].results['gap'] = gap algorithms[i].results['gap_sk'] = sk algorithms[i].results['gap_nc'] = nc algorithms[i].results['min_nc'] = min_clusters algorithms[i].results['max_nc'] = max_clusters sil, sil_nc, dav, dav_nc, cal, cal_nc, Z = Clustering.calc_scores( algorithms[i].estimator, X, min_clusters, max_clusters, random_state=random_state) algorithms[i].results['silhouette'] = sil algorithms[i].results['silhouette_nc'] = sil_nc algorithms[i].results['davies'] = dav algorithms[i].results['davies_nc'] = dav_nc algorithms[i].results['calinski'] = cal algorithms[i].results['calinski_nc'] = cal_nc algorithms[i].results['linkage'] = Z # Estimate number of clusters votes = np.zeros(algorithms[0].results['max_nc'] + 1) for alg in algorithms: votes[alg.results['gap_nc']] += 1 votes[alg.results['silhouette_nc']] += 1 votes[alg.results['davies_nc']] += 1 votes[alg.results['calinski_nc']] += 1 nc = list(np.where(votes == votes.max())[0]) for alg in algorithms: alg.results['est_nc'] = nc alg.estimator.n_clusters = nc[0] alg.estimator = alg.estimator.fit(X) @classmethod def predict(cls, algorithms, data): ret = None for alg in algorithms: if hasattr(alg.estimator, 'predict'): if ret is None: ret = {} ret[alg.name] = alg.estimator.predict(data) return ret @staticmethod def calc_inertia(data, labels): inertia = 0 for nc in np.unique(labels): x = data[labels == nc, :] N = x.shape[0] s = 0 for i1 in range(N - 1): for i2 in range(i1, N): if i1 == i2: continue s += np.sum((x[i1] - x[i2]) ** 2) inertia += s / N return inertia @classmethod def calc_gap(cls, model, data, min_clusters, max_clusters, num_iter=50, random_state=0, svd=True): if svd: U, s, V = np.linalg.svd(data, full_matrices=True) X = np.dot(data, V) else: X = data X_max = X.max(axis=0) X_min = X.min(axis=0) gap = [] sk = [] gap_star = [] sk_star = [] for nc in range(min_clusters, max_clusters+1): model.n_clusters = nc model.random_state = random_state pred_labels = model.fit_predict(data) if hasattr(model, 'inertia_'): dispersion = model.inertia_ else: dispersion = Clustering.calc_inertia(data, pred_labels) ref_dispersions = [] for iter in range(num_iter): np.random.seed(random_state + iter) ref = np.random.rand(*data.shape) ref = (ref * (X_max - X_min)) + X_min if svd: ref = np.dot(ref, V) model.n_clusters = nc pred_labels = model.fit_predict(ref) if hasattr(model, 'inertia_'): ref_dispersions.append(model.inertia_) else: ref_dispersions.append(Clustering.calc_inertia(ref, pred_labels)) ref_log_dispersion = np.mean(np.log(ref_dispersions)) log_dispersion = np.log(dispersion) gap.append(ref_log_dispersion - log_dispersion) sd = np.std(np.log(ref_dispersions), ddof=0) sk.append(np.sqrt(1 + 1 /num_iter) * sd) gap_star.append(np.mean(ref_dispersions) - dispersion) sdk_star = np.std(ref_dispersions) sk_star.append(np.sqrt(1.0 + 1.0 / num_iter) * sdk_star) nc = -1 for i in range(len(gap) - 1): if gap[i] >= gap[i + 1] - sk[i + 1]: nc = min_clusters + i break if nc == -1: nc = max_clusters return gap, sk, nc @classmethod def plot_gap(cls, algorithm, dname): if dname is None: return if not os.path.exists(dname): os.mkdir(dname) plt.figure() plt.title(algorithm.estimator.__class__.__name__) plt.xlabel("Number of clusters") plt.ylabel("Gap statistic") plt.plot(range(algorithm.results['min_nc'], algorithm.results['max_nc'] + 1), algorithm.results['gap'], 'o-', color='dodgerblue') plt.errorbar(range(algorithm.results['min_nc'], algorithm.results['max_nc'] + 1), algorithm.results['gap'], algorithm.results['gap_sk'], capsize=3) plt.axvline(x=algorithm.results['gap_nc'], ls='--', C='gray', zorder=0) plt.savefig('%s/gap_%s.png' % (dname, algorithm.estimator.__class__.__name__), bbox_inches='tight', dpi=75) plt.close() @classmethod def calc_scores(cls, model, data, min_clusters, max_clusters, random_state=0): silhouettes = [] davieses = [] calinskies = [] if model.__class__.__name__ == 'HierarchicalClustering': linkage_matrix = model.fit(data) else: linkage_matrix = None for nc in range(min_clusters, max_clusters + 1): model.n_clusters = nc model.random_state = random_state pred_labels = model.fit_predict(data) silhouettes.append(silhouette_score(data, pred_labels, random_state=random_state)) davieses.append(davies_bouldin_score(data, pred_labels)) calinskies.append(calinski_harabasz_score(data, pred_labels)) sil_nc = np.argmax(silhouettes) + min_clusters dav_nc = np.argmin(davieses) + min_clusters cal_nc = np.argmax(calinskies) + min_clusters return silhouettes, sil_nc, davieses, dav_nc, calinskies, cal_nc, linkage_matrix @classmethod def plot_silhouette(cls, algorithm, dname): if dname is None: return if not os.path.exists(dname): os.mkdir(dname) plt.figure() plt.title(algorithm.estimator.__class__.__name__) plt.xlabel("Number of clusters") plt.ylabel("Silhouette score") plt.plot(range(algorithm.results['min_nc'], algorithm.results['max_nc'] + 1), algorithm.results['silhouette'], 'o-', color='darkorange') plt.axvline(x=algorithm.results['silhouette_nc'], ls='--', C='gray', zorder=0) plt.savefig('%s/silhouette_%s.png' % (dname, algorithm.estimator.__class__.__name__), bbox_inches='tight', dpi=75) plt.close() @classmethod def plot_davies(cls, algorithm, dname): if dname is None: return if not os.path.exists(dname): os.mkdir(dname) plt.figure() plt.title(algorithm.estimator.__class__.__name__) plt.xlabel("Number of clusters") plt.ylabel("Davies-Bouldin score") plt.plot(range(algorithm.results['min_nc'], algorithm.results['max_nc'] + 1), algorithm.results['davies'], 'o-', color='limegreen') plt.axvline(x=algorithm.results['davies_nc'], ls='--', C='gray', zorder=0) plt.savefig('%s/davies_%s.png' % (dname, algorithm.estimator.__class__.__name__), bbox_inches='tight', dpi=75) plt.close() @classmethod def plot_calinski(cls, algorithm, dname): if dname is None: return if not os.path.exists(dname): os.mkdir(dname) plt.figure() plt.title(algorithm.estimator.__class__.__name__) plt.xlabel("Number of clusters") plt.ylabel("Calinski and Harabasz score") plt.plot(range(algorithm.results['min_nc'], algorithm.results['max_nc'] + 1), algorithm.results['calinski'], 'o-', color='crimson') plt.axvline(x=algorithm.results['calinski_nc'], ls='--', C='gray', zorder=0) plt.savefig('%s/calinski_%s.png' % (dname, algorithm.estimator.__class__.__name__), bbox_inches='tight', dpi=75) plt.close() @classmethod def plot_dendrogram(cls, algorithm, dname): if dname is None: return if not os.path.exists(dname): os.mkdir(dname) plt.figure() plt.title(algorithm.estimator.__class__.__name__) plt.title('Dendrogram') plt.ylabel('Distance') plt.xlabel(('A number in brackets represents the number of samples in the nodes\n' 'and a number not in brackets represents the index of the sample in the data.')) algorithm.estimator.dendrogram() plt.savefig('%s/dendrogram_%s.png' % (dname, algorithm.estimator.__class__.__name__), bbox_inches='tight', dpi=100) plt.close() @classmethod def make_report(cls, algorithms, data, dname, lang): if not os.path.exists(dname): os.mkdir(dname) shutil.copy(os.path.abspath(os.path.dirname(__file__)) + '/static/kmeans_mouse.png', dname + '/kmeans_mouse.png') nc = ', '.join(map(str, algorithms[0].results['est_nc'])) for algorithm in algorithms: Clustering.plot_gap(algorithm, dname) Clustering.plot_silhouette(algorithm, dname) Clustering.plot_davies(algorithm, dname) Clustering.plot_calinski(algorithm, dname) if algorithm.estimator.__class__.__name__ == 'HierarchicalClustering': Clustering.plot_dendrogram(algorithm, dname) env = Environment( loader=FileSystemLoader( os.path.abspath( os.path.dirname(__file__)) + '/template', encoding='utf8')) if lang == 'jp': tmpl = env.get_template('report_clustering_jp.html.tmp') else: tmpl = env.get_template('report_clustering.html.tmp') html = tmpl.render(algorithms=algorithms, data=data, nc=nc).encode('utf-8') fo = io.open(dname + '/report.html', 'w', encoding='utf-8') fo.write(html.decode('utf-8')) fo.close()