#!/usr/bin/env python
"""Provides scikit interface."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import random
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
from toolz.curried import first, second, groupby
from toolz.sandbox.core import unzip

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

from eden.display import plot_aucs
from eden.display import plot_confusion_matrices
from eden.util import timeit

logger = logging.getLogger()


@timeit
def process_vec_info(g, n_clusters=8):
    """process_vec_info."""
    # extract node vec information and make np data matrix
    data_matrix = np.array([g.node[u]['vec'] for u in g.nodes()])
    # cluster with kmeans
    clu = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
    clu.fit(data_matrix)
    preds = clu.predict(data_matrix)
    vecs = clu.transform(data_matrix)
    vecs = 1 / (1 + vecs)
    # replace node information
    graph = g.copy()
    for u in graph.nodes():
        graph.node[u]['label'] = str(preds[u])
        graph.node[u]['vec'] = list(vecs[u])
    return graph
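
# Example (sketch, kept as a comment so importing the module stays
# side-effect free): cluster the node vectors of a toy graph. The use of
# networkx and the random 'vec' attributes are assumptions for illustration.
#
#   import networkx as nx
#   g = nx.path_graph(6)
#   for u in g.nodes():
#       g.nodes[u]['vec'] = list(np.random.rand(16))
#   g2 = process_vec_info(g, n_clusters=2)
#   print([g2.nodes[u]['label'] for u in g2.nodes()])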


def paired_shuffle(iterable1, iterable2):
    """paired_shuffle."""
    i1i2 = list(zip(iterable1, iterable2))
    random.shuffle(i1i2)
    i1, i2 = unzip(i1i2)
    return list(i1), list(i2)


@timeit
def subsample(graphs, targets, subsample_size=100):
    """subsample."""
    tg = zip(targets, graphs)
    num_classes = len(set(targets))
    class_graphs = groupby(first, tg)
    subgraphs = []
    subtargets = []
    for y in class_graphs:
        # take an equal share from each class; integer division keeps the
        # slice index an int under true division
        class_subgraphs = class_graphs[y][:subsample_size // num_classes]
        class_subgraphs = [second(x) for x in class_subgraphs]
        subgraphs += class_subgraphs
        subtargets += [y] * len(class_subgraphs)
    subgraphs, subtargets = paired_shuffle(subgraphs, subtargets)
    return list(subgraphs), list(subtargets)
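
# Example (sketch): draw a class-balanced subsample of roughly 20 graphs;
# the toy graphs and alternating targets are placeholders.
#
#   import networkx as nx
#   graphs = [nx.path_graph(3) for _ in range(100)]
#   targets = [i % 2 for i in range(100)]
#   sub_graphs, sub_targets = subsample(graphs, targets, subsample_size=20)
#   print(Counter(sub_targets))   # 10 instances per class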


@timeit
def balance(graphs, targets, estimator, ratio=2):
    """balance."""
    # identify the most and least frequent classes
    class_counts = Counter(targets).most_common()
    majority_class = class_counts[0][0]
    minority_class, min_count = class_counts[-1]

    desired_size = int(min_count * ratio)

    tg = zip(targets, graphs)
    class_graphs = groupby(first, tg)
    maj_graphs = [second(x) for x in class_graphs[majority_class]]
    min_graphs = [second(x) for x in class_graphs[minority_class]]

    if estimator:
        # select only the instances in the majority class that
        # have a small margin
        preds = estimator.decision_function(maj_graphs)
    else:
        # select at random
        preds = [random.random() for i in range(len(maj_graphs))]
    preds = [abs(pred) for pred in preds]
    # keep the majority instances with the smallest score; sort on the
    # score only, since graphs themselves are not orderable
    pred_graphs = sorted(zip(preds, maj_graphs), key=first)[:desired_size]
    maj_graphs = [g for p, g in pred_graphs]

    bal_graphs = min_graphs + maj_graphs
    bal_pos = [minority_class] * len(min_graphs)
    bal_neg = [majority_class] * len(maj_graphs)
    bal_targets = bal_pos + bal_neg

    return paired_shuffle(bal_graphs, bal_targets)
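
# Example (sketch): random undersampling of the majority class
# (estimator=None); the toy inputs below are placeholders.
#
#   import networkx as nx
#   graphs = [nx.path_graph(3) for _ in range(30)]
#   targets = [1] * 10 + [0] * 20
#   bal_graphs, bal_targets = balance(graphs, targets, estimator=None, ratio=1)
#   print(Counter(bal_targets))   # 10 instances of each class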


def make_train_test_sets(pos_graphs, neg_graphs,
                         test_proportion=.3, random_state=2):
    """make_train_test_sets."""
    random.seed(random_state)
    random.shuffle(pos_graphs)
    random.shuffle(neg_graphs)
    pos_dim = len(pos_graphs)
    neg_dim = len(neg_graphs)
    tr_pos_graphs = pos_graphs[:-int(pos_dim * test_proportion)]
    te_pos_graphs = pos_graphs[-int(pos_dim * test_proportion):]
    tr_neg_graphs = neg_graphs[:-int(neg_dim * test_proportion)]
    te_neg_graphs = neg_graphs[-int(neg_dim * test_proportion):]
    tr_graphs = tr_pos_graphs + tr_neg_graphs
    te_graphs = te_pos_graphs + te_neg_graphs
    tr_targets = [1] * len(tr_pos_graphs) + [0] * len(tr_neg_graphs)
    te_targets = [1] * len(te_pos_graphs) + [0] * len(te_neg_graphs)
    tr_graphs, tr_targets = paired_shuffle(tr_graphs, tr_targets)
    te_graphs, te_targets = paired_shuffle(te_graphs, te_targets)
    return (tr_graphs, np.array(tr_targets)), (te_graphs, np.array(te_targets))
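
# Example (sketch): 70/30 split of toy positive/negative graph lists;
# the networkx graphs are placeholders.
#
#   import networkx as nx
#   pos = [nx.path_graph(3) for _ in range(20)]
#   neg = [nx.cycle_graph(3) for _ in range(20)]
#   (tr_graphs, tr_targets), (te_graphs, te_targets) = \
#       make_train_test_sets(pos, neg, test_proportion=.3)
#   print(len(tr_graphs), len(te_graphs))   # 28 12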


@timeit
def estimate_predictive_performance(x_y,
                                    estimator=None,
                                    n_splits=10,
                                    random_state=1):
    """estimate_predictive_performance."""
    x, y = x_y
    cv = ShuffleSplit(n_splits=n_splits,
                      test_size=0.3,
                      random_state=random_state)
    # average precision needs continuous decision scores, not hard predictions
    scoring = make_scorer(average_precision_score, needs_threshold=True)
    scores = cross_val_score(estimator, x, y, cv=cv, scoring=scoring)
    return scores
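
# Example (sketch): the function only forwards x, y and the estimator to
# cross_val_score, so a plain vector dataset and a linear model are enough
# to illustrate it; in this module x is typically a list of graphs and the
# estimator a graph-aware pipeline. The dataset below is a placeholder.
#
#   from sklearn.datasets import make_classification
#   from sklearn.linear_model import SGDClassifier
#   x, y = make_classification(n_samples=200, random_state=1)
#   scores = estimate_predictive_performance(
#       (x, y), estimator=SGDClassifier(), n_splits=5)
#   output_avg_and_std(scores)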


def output_avg_and_std(iterable):
    """output_avg_and_std."""
    print(('score: %.2f +-%.2f' % (np.mean(iterable), np.std(iterable))))
    return iterable


@timeit
def perf(y_true, y_pred, y_score):
    """perf."""
    print('Accuracy: %.2f' % accuracy_score(y_true, y_pred))
    print(' AUC ROC: %.2f' % roc_auc_score(y_true, y_score))
    print('  AUC AP: %.2f' % average_precision_score(y_true, y_score))
    print()
    print('Classification Report:')
    print(classification_report(y_true, y_pred))
    print()
    plot_confusion_matrices(y_true, y_pred, size=int(len(set(y_true)) * 2.5))
    print()
    plot_aucs(y_true, y_score, size=10)


def compute_stats(scores):
    """compute_stats."""
    median = np.percentile(scores, 50, axis=1)
    low = np.percentile(scores, 25, axis=1)
    high = np.percentile(scores, 75, axis=1)
    low10 = np.percentile(scores, 10, axis=1)
    high90 = np.percentile(scores, 90, axis=1)
    return median, low, high, low10, high90


def plot_stats(x=None, y=None, label=None, color='navy'):
    """plot_stats."""
    y = np.array(y)
    y0 = y[0]
    y1 = y[1]
    y2 = y[2]
    y3 = y[3]
    y4 = y[4]
    plt.fill_between(x, y3, y4, color=color, alpha=0.08)
    plt.fill_between(x, y1, y2, color=color, alpha=0.08)
    plt.plot(x, y0, '-', lw=2, color=color, label=label)
    plt.plot(x, y0,
             linestyle='None',
             markerfacecolor='white',
             markeredgecolor=color,
             marker='o',
             markeredgewidth=2,
             markersize=8)


def plot_learning_curve(train_sizes, train_scores, test_scores):
    """plot_learning_curve."""
    plt.figure(figsize=(15, 5))
    plt.title('Learning Curve')
    plt.xlabel("Training examples")
    plt.ylabel("AUC ROC")
    tr_ys = compute_stats(train_scores)
    te_ys = compute_stats(test_scores)
    plot_stats(train_sizes, tr_ys,
               label='Training score',
               color='navy')
    plot_stats(train_sizes, te_ys,
               label='Cross-validation score',
               color='orange')
    plt.grid(linestyle=":")
    plt.legend(loc="best")
    plt.show()
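
# Example (sketch): scores produced by sklearn's learning_curve have shape
# (n_ticks, n_folds), which matches the axis=1 percentiles in compute_stats.
# The dataset and estimator below are placeholders.
#
#   from sklearn.datasets import make_classification
#   from sklearn.linear_model import SGDClassifier
#   from sklearn.model_selection import learning_curve
#   x, y = make_classification(n_samples=300, random_state=1)
#   sizes, tr_scores, te_scores = learning_curve(
#       SGDClassifier(), x, y, cv=5, scoring='roc_auc',
#       train_sizes=np.linspace(.1, 1.0, 5))
#   plot_learning_curve(sizes, tr_scores, te_scores)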