python source code of plotting

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, MaxNLocator
import numpy as np
import plotly.graph_objs as go
import seaborn as sns
import plotly.figure_factory as pff

DEFAULT_COLORSCALE = "Viridis"


def plot_distribution(ls, column, bins=None):
    """Plot the distribution of numerical columns.

    Create a plotly plot with a histogram of the values in a column. The
    number of bin in the histogram is decided according to the
    Freedman-Diaconis rule unless given by the `bins` parameter.

    Parameters
    ----------
    ls : :class:`~lens.Summary`
        Lens `Summary`.
    column : str
        Name of the column.
    bins : int, optional
        Number of bins to use for histogram. If not given, the
        Freedman-Diaconis rule will be used to estimate the best number of
        bins. This argument also accepts the formats taken by the `bins`
        parameter of matplotlib's :function:`~matplotlib.pyplot.hist`.

    Returns
    -------
    :class:`~matplotlib.Axes`
        Matplotlib axes containing the distribution plot.
    """
    column_summary = ls.summary(column)
    if column_summary["notnulls"] <= 2:
        # Plotly refuses to plot histograms if
        # the tdigest has too few values
        raise ValueError(
            "There are fewer than two non-null values in this column"
        )

    if bins is None:
        counts, edges = ls.histogram(column)
    else:
        xs, counts = ls.tdigest_centroids(column)
        counts, edges = np.histogram(xs, weights=counts, bins=bins)

    fig, ax = plt.subplots()

    ax.bar(
        edges[:-1], counts, width=np.diff(edges), label=column, align="edge"
    )

    ax.set_ylim(bottom=0)

    ax.set_xlabel(column)
    ax.set_title('Distribution of column "{}"'.format(column))

    ax.figure.tight_layout()

    return fig


def _set_integer_tick_labels(axis, labels):
    """Use labels dict to set labels on axis"""
    axis.set_major_formatter(FuncFormatter(lambda x, _: labels.get(x, "")))
    axis.set_major_locator(MaxNLocator(integer=True))


def plot_pairdensity_mpl(ls, column1, column2):
    """Plot the pairwise density between two columns.

    This plot is an approximation of a scatterplot through a 2D Kernel
    Density Estimate for two numerical variables. When one of the variables
    is categorical, a 1D KDE for each of the categories is shown,
    normalised to the total number of non-null observations. For two
    categorical variables, the plot produced is a heatmap representation of
    the contingency table.

    Parameters
    ----------
    ls : :class:`~lens.Summary`
        Lens `Summary`.
    column1 : str
        First column.
    column2 : str
        Second column.

    Returns
    -------
    :class:`plt.Figure`
        Matplotlib figure containing the pairwise density plot.
    """
    pair_details = ls.pair_details(column1, column2)
    pairdensity = pair_details["pairdensity"]

    x = np.array(pairdensity["x"])
    y = np.array(pairdensity["y"])
    Z = np.array(pairdensity["density"])

    fig, ax = plt.subplots()

    if ls.summary(column1)["desc"] == "categorical":
        idx = np.argsort(x)
        x = x[idx]
        Z = Z[:, idx]
        # Create labels and positions for categorical axis
        x_labels = dict(enumerate(x))
        _set_integer_tick_labels(ax.xaxis, x_labels)
        x = np.arange(-0.5, len(x), 1.0)

    if ls.summary(column2)["desc"] == "categorical":
        idx = np.argsort(y)
        y = y[idx]
        Z = Z[idx]
        y_labels = dict(enumerate(y))
        _set_integer_tick_labels(ax.yaxis, y_labels)
        y = np.arange(-0.5, len(y), 1.0)

    X, Y = np.meshgrid(x, y)

    ax.pcolormesh(X, Y, Z, cmap=DEFAULT_COLORSCALE.lower())

    ax.set_xlabel(column1)
    ax.set_ylabel(column2)

    ax.set_title(r"$\it{{ {} }}$ vs $\it{{ {} }}$".format(column1, column2))

    return fig


def plot_correlation_mpl(ls, include=None, exclude=None):
    """Plot the correlation matrix for numeric columns

    Plot a Spearman rank order correlation coefficient matrix showing the
    correlation between columns. The matrix is reordered to group together
    columns that have a higher correlation coefficient.  The columns to be
    plotted in the correlation plot can be selected through either the
    ``include`` or ``exclude`` keyword arguments. Only one of them can be
    given.

    Parameters
    ----------
    ls : :class:`~lens.Summary`
        Lens `Summary`.
    include : list of str
        List of columns to include in the correlation plot.
    exclude : list of str
        List of columns to exclude from the correlation plot.

    Returns
    -------
    :class:`plt.Figure`
        Matplotlib figure containing the pairwise density plot.
    """

    columns, correlation_matrix = ls.correlation_matrix(include, exclude)
    num_cols = len(columns)

    if num_cols > 10:
        annotate = False
    else:
        annotate = True

    fig, ax = plt.subplots()
    sns.heatmap(
        correlation_matrix,
        annot=annotate,
        fmt=".2f",
        ax=ax,
        xticklabels=columns,
        yticklabels=columns,
        vmin=-1,
        vmax=1,
        cmap="RdBu_r",
        square=True,
    )

    ax.xaxis.tick_top()

    # Enforces a width of 2.5 inches per cell in the plot,
    # unless this exceeds 10 inches.
    width_inches = len(columns) * 2.5
    while width_inches > 10:
        width_inches = 10

    fig.set_size_inches(width_inches, width_inches)

    return fig


def plot_cdf(ls, column, N_cdf=100):
    """Plot the empirical cumulative distribution function of a column.

    Creates a plotly plot with the empirical CDF of a column.

    Parameters
    ----------
    ls : :class:`~lens.Summary`
        Lens `Summary`.
    column : str
        Name of the column.
    N_cdf : int
        Number of points in the CDF plot.

    Returns
    -------
    :class:`~matplotlib.Axes`
        Matplotlib axes containing the distribution plot.
    """
    tdigest = ls.tdigest(column)

    cdfs = np.linspace(0, 100, N_cdf)
    xs = [tdigest.percentile(p) for p in cdfs]

    fig, ax = plt.subplots()

    ax.set_ylabel("Percentile")
    ax.set_xlabel(column)
    ax.plot(xs, cdfs)

    if ls._report["column_summary"][column]["logtrans"]:
        ax.set_xscale("log")

    ax.set_title("Empirical Cumulative Distribution Function")

    return fig


def plot_pairdensity(ls, column1, column2):
    """Plot the pairwise density between two columns.

    This plot is an approximation of a scatterplot through a 2D Kernel
    Density Estimate for two numerical variables. When one of the variables
    is categorical, a 1D KDE for each of the categories is shown,
    normalised to the total number of non-null observations. For two
    categorical variables, the plot produced is a heatmap representation of
    the contingency table.

    Parameters
    ----------
    ls : :class:`~lens.Summary`
        Lens `Summary`.
    column1 : str
        First column.
    column2 : str
        Second column.

    Returns
    -------
    :class:`plotly.Figure`
        Plotly figure containing the pairwise density plot.
    """
    pair_details = ls.pair_details(column1, column2)
    pairdensity = pair_details["pairdensity"]

    x = np.array(pairdensity["x"])
    y = np.array(pairdensity["y"])
    Z = np.array(pairdensity["density"])

    if ls.summary(column1)["desc"] == "categorical":
        idx = np.argsort(x)
        x = x[idx]
        Z = Z[:, idx]

    if ls.summary(column2)["desc"] == "categorical":
        idx = np.argsort(y)
        y = y[idx]
        Z = Z[idx]

    data = [go.Heatmap(z=Z, x=x, y=y, colorscale=DEFAULT_COLORSCALE)]
    layout = go.Layout(title="<i>{}</i> vs <i>{}</i>".format(column1, column2))
    layout["xaxis"] = {
        "type": pairdensity["x_scale"],
        "autorange": True,
        "title": column1,
    }
    layout["yaxis"] = {
        "type": pairdensity["y_scale"],
        "autorange": True,
        "title": column2,
    }
    fig = go.Figure(data=data, layout=layout)
    fig.data[0]["showscale"] = False

    return fig


def plot_correlation(ls, include=None, exclude=None):
    """Plot the correlation matrix for numeric columns

    Plot a Spearman rank order correlation coefficient matrix showing the
    correlation between columns. The matrix is reordered to group together
    columns that have a higher correlation coefficient.  The columns to be
    plotted in the correlation plot can be selected through either the
    ``include`` or ``exclude`` keyword arguments. Only one of them can be
    given.

    Parameters
    ----------
    ls : :class:`~lens.Summary`
        Lens `Summary`.
    include : list of str
        List of columns to include in the correlation plot.
    exclude : list of str
        List of columns to exclude from the correlation plot.

    Returns
    -------
    :class:`plotly.Figure`
        Plotly figure containing the pairwise density plot.
    """

    columns, correlation_matrix = ls.correlation_matrix(include, exclude)
    num_cols = len(columns)

    if num_cols > 10:
        annotate = False
    else:
        annotate = True

    hover_text = []
    for i in range(num_cols):
        hover_text.append(
            [
                "Corr({}, {}) = {:.2g}".format(
                    columns[i], columns[j], correlation_matrix[i, j]
                )
                for j in range(num_cols)
            ]
        )

    if annotate:
        t = np.reshape(
            ["{:.2g}".format(x) for x in correlation_matrix.flatten()],
            correlation_matrix.shape,
        )[::-1].tolist()
    else:
        nrows, ncolumns = correlation_matrix.shape
        t = [["" for i in range(nrows)] for j in range(ncolumns)]

    fig = pff.create_annotated_heatmap(
        z=correlation_matrix.tolist()[::-1],
        colorscale="RdBu",
        x=columns,
        y=columns[::-1],
        zmin=-1.0,
        zmax=1.0,
        annotation_text=t,
        text=hover_text[::-1],
        hoverinfo="text",
    )
    w = len(columns) * 2.5 * 72
    while w > 600:
        w /= np.sqrt(1.4)
    fig.layout["width"] = w
    fig.layout["height"] = w
    fig.data[0]["showscale"] = True

    return fig