python source code of resampled

import os
from functools import partial
from multiprocessing import Pool
from contextlib import closing
from itertools import repeat
import numpy as np
import numpy.ma as ma
import pandas as pd
from scipy.stats.mstats import count_tied_groups, rankdata
from scipy.stats.mstats import kruskalwallis as _kruskalwallis
from scipy.special import erfc
from statsmodels.sandbox.stats.multicomp import multipletests


def get_memory():
    """
    """
    return os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') / (1024 ** 3)


def _mannwhitneyu(x, y, use_continuity=True):
    """
    Computes the Mann-Whitney statistic
    Missing values in `x` and/or `y` are discarded.
    Parameters
    ----------
    x : ndarray,
        Input, vector or observations x features matrix
    y : ndarray,
        Input, vector or observations x features matrix. If matrix, must have
        same number of features as x
    use_continuity : {True, False}, optional
        Whether a continuity correction (1/2.) should be taken into account.
    Returns
    -------
    statistic : float
        The Mann-Whitney statistic
    approx z : float
        The normal-approximated z-score for U.
    pvalue : float
        Approximate p-value assuming a normal distribution.
    """
    if x.ndim == 1 and y.ndim == 1:
        x, y = x[:, np.newaxis], y[:, np.newaxis]
    ranks = rankdata(np.concatenate([x, y]), axis=0)
    nx, ny = x.shape[0], y.shape[0]
    nt = nx + ny
    U = ranks[:nx].sum(0) - nx * (nx + 1) / 2.

    mu = (nx * ny) / 2.
    u = np.amin([U, nx*ny - U], axis=0)  # get smaller U by convention

    sigsq = np.ones(ranks.shape[1]) * (nt ** 3 - nt) / 12.

    for i in np.arange(len(sigsq)):
        ties = count_tied_groups(ranks[:, i])
        sigsq[i] -= np.sum(v * (k ** 3 - k) for (k, v) in ties.items()) / 12.
    sigsq *= nx * ny / float(nt * (nt - 1))

    if use_continuity:
        z = (U - 1 / 2. - mu) / np.sqrt(sigsq)
    else:
        z = (U - mu) / np.sqrt(sigsq)

    prob = erfc(abs(z) / np.sqrt(2))
    return np.vstack([u, z, prob]).T


def find_sampling_value(group_data, percentile):
    """

    :param group_data:
    :param int percentile:
    :return:
    """
    return min(np.percentile(g.sum(axis=1), percentile) for g in group_data)


def normalize(data, downsample_value, upsample=False, labels=None):
    """
    :param data:
    :param downsample_value: value to normalize cell counts to. In current implementation,
        a small number of cells (10%) are upsampled to this value.
    :param upsample: if False, all observations with size < downsample_value are excluded.
        if True, those cells are upsampled to downsample_value.
    :return:
    """
    obs_size = data.sum(axis=1)
    if not upsample:
        keep = obs_size >= downsample_value
        data = data[keep, :]
        if labels is not None:
            labels = labels[keep]
    norm = (data * downsample_value) / data.sum(axis=1)[:, np.newaxis]
    if labels is not None:
        return norm, labels
    else:
        return norm


def _draw_sample(normalized_data, n):
    """
    :param normalized_data:
    :param n:
    """
    np.random.seed()
    idx = np.random.randint(0, normalized_data.shape[0], n)
    sample = normalized_data[idx, :]
    p = np.random.sample(sample.shape)  # round samples probabilistically

    return np.floor(sample) + (sample % 1 > p).astype(int)


def _mw_sampling_function(norm_data, n_cell):
    """
    :param norm_data:
    :param n_cell:
    :return:
    """
    a, b = (_draw_sample(d, n_cell) for d in norm_data)
    return _mannwhitneyu(a, b)  # dim = (n_genes, 3)


def confidence_interval(z):
    """

    :param z:
    :return:
    """
    return np.percentile(z, [2.5, 97.5], axis=0).T


def mannwhitneyu(
        x, y, n_iter=50, sampling_percentile=10, alpha=0.05, verbose=False,
        upsample=False):
    """
    :param x: observations by features array or DataFrame (ndim must be 2, although there
        needn't be more than one feature)
    :param y: observations by features array or DataFrama. Features must be the same as x
    :param n_iter: number of times to sample x and y
    :param sampling_percentile: percentile to downsample to. observations with row sums
        lower than this value will be excluded
    :param alpha: significance threshold for FDR correction
    :param verbose: if True, report number of cells sampled in each iteration and the
        integer value to which cells are downsampled
    :param upsample: if False, cells with size lower than sampling_percentile are
        discarded. If True, those cells are upsampled.
    :return pd.DataFrame: DataFrame with columns:
        U: median u-statistic over the n_iter iterations of the test
        z_approx: median approximate tie-corrected z-score for the mann-whitney U-test
        z_lo: lower bound, 95% confidence interval over z
        z_hi: upper bound, 95% confidence interval over z
        p: p-value for z_approx
        q: FDR-corrected q-value over all tests in output, using two-stage BH-FDR.
    """

    # do some sanity checks on input data
    if isinstance(x, pd.DataFrame) and isinstance(y, pd.DataFrame):
        assert np.array_equal(x.columns, y.columns)
        labels = x.columns
        x = x.values
        y = y.values
    elif x.ndim > 1:
        assert x.shape[1] == y.shape[1]
        labels = None
    else:
        labels = None

    # calculate sampling values
    v = find_sampling_value([x, y], sampling_percentile)
    norm_data = [normalize(d, v, upsample) for d in [x, y]]
    n_cell = min(d.shape[0] for d in norm_data)
    sampling_function = partial(_mw_sampling_function, n_cell=n_cell)

    if verbose:  # report sampling values
        print('sampling %d cells (with replacement) per iteration' % n_cell)
        print('sampling %d molecules per cell' % v)

    with closing(Pool()) as pool:
        results = pool.map(sampling_function, repeat(norm_data, n_iter))

    results = np.stack(results)  # u, z, p

    ci = confidence_interval(results[:, :, 1])
    results = pd.DataFrame(
        data=np.concatenate([np.median(results, axis=0), ci], axis=1),
        index=labels,
        columns=['U', 'z_approx', 'p', 'z_lo', 'z_hi'])

    # add multiple-testing correction
    results['q'] = multipletests(results['p'], alpha=alpha, method='fdr_tsbh')[1]

    # remove low-value genes whose median sampling value is -inf
    neginf = np.isneginf(results['z_approx'])
    results.ix[neginf, 'z_lo'] = np.nan
    results.ix[neginf, 'z_approx'] = 0
    results.ix[neginf, ['p', 'q']] = 1.

    results = results[['U', 'z_approx', 'z_lo', 'z_hi', 'p', 'q']].sort_values('q')
    results.iloc[:, 1:4] = np.round(results.iloc[:, 1:4], 2)

    return results


def _kw_sampling_function(data, splits, n_cell):
    data = [_draw_sample(d, n_cell) for d in np.split(data, splits)]
    return _kruskal(data)


def _kruskal(data):
    """
    Compute the Kruskal-Wallis H-test for independent samples
    Parameters
    ----------
    sample1, sample2, ... : array_like
       Two or more arrays with the sample measurements can be given as
       arguments.
    Returns
    -------
    statistic : float
       The Kruskal-Wallis H statistic, corrected for ties
    pvalue : float
       The p-value for the test using the assumption that H has a chi
       square distribution
    Notes
    -----
    For more details on `kruskal`, see `stats.kruskal`.
    """
    results = []
    for i in np.arange(data[0].shape[1]):
        args = [d[:, i] for d in data]
        try:
            results.append(_kruskalwallis(*args))
        except ValueError:
            results.append([0, 1.])
    return results


def category_to_numeric(labels):
    """transform categorical labels to a numeric array"""
    labels = np.array(labels)
    if np.issubdtype(labels.dtype, np.integer):
        return labels
    else:
        cats = np.unique(labels)
        map_ = dict(zip(cats, np.arange(cats.shape[0])))
        return np.array([map_[i] for i in labels])


def kruskalwallis(
        data, labels, n_iter=50, sampling_percentile=10, alpha=0.05, verbose=False,
        upsample=False):
    """
    :param data: np.ndarray or pd.DataFrame of observations x features
    :param labels: observation labels for categories to be compared
    :param n_iter: number of times to sample x and y
    :param sampling_percentile: percentile to downsample to. observations with row sums
        lower than this value will be excluded
    :param alpha: significance threshold for FDR correction
    :param verbose: if True, report number of cells sampled in each iteration and the
        integer value to which cells are downsampled
    :param upsample: if False, cells with size lower than sampling_percentile are
        discarded. If True, those cells are upsampled.
    :return pd.DataFrame: DataFrame with columns:
        H: median u-statistic over the n_iter iterations of the test
        z_approx: median approximate tie-corrected z-score for the mann-whitney U-test
        z_lo: lower bound, 95% confidence interval over z
        z_hi: upper bound, 95% confidence interval over z
        p: p-value for z_approx
        q: FDR-corrected q-value over all tests in output, using two-stage BH-FDR.
    """

    if isinstance(data, pd.DataFrame):
        features = data.columns
        data = data.values
    elif isinstance(data, np.ndarray):
        features = None
    else:
        raise ValueError('data must be a np.ndarray or pd.DataFrame, not %s' %
                         repr(type(data)))

    # if labels are not numeric, transform to numeric categories
    labels = category_to_numeric(labels)
    if not labels.shape[0] == data.shape[0]:
        raise ValueError('labels (shape=%s) must match dimension 0 of data (shape=%s)' %
                         (repr(labels.shape), repr(labels.data)))

    idx = np.argsort(labels)
    data = data[idx, :]  # will copy
    labels = labels[idx]

    splits = np.where(np.diff(labels))[0] + 1

    # calculate sampling values and downsample data
    v = find_sampling_value(np.split(data, splits), sampling_percentile)
    norm_data, labels = normalize(data, v, upsample, labels)

    splits = np.where(np.diff(labels))[0] + 1  # rediff, norm_data causes loss

    n_cell = min(d.shape[0] for d in np.split(norm_data, splits))
    sampling_function = partial(_kw_sampling_function, n_cell=n_cell, splits=splits)

    if verbose:  # report sampling values
        print('sampling %d cells (with replacement) per iteration' % n_cell)
        print('sampling %d molecules per cell' % v)

    with closing(Pool()) as pool:
        results = pool.map(sampling_function, repeat(norm_data, n_iter))

    results = np.stack(results)  # H, p

    ci = confidence_interval(results[:, :, 0])  # around H
    results = pd.DataFrame(
        data=np.concatenate([np.median(results, axis=0), ci], axis=1),
        index=features,
        columns=['H', 'p', 'H_lo', 'H_hi'])

    results['q'] = multipletests(results['p'], alpha=alpha, method='fdr_tsbh')[1]
    results = results[['H', 'H_lo', 'H_hi', 'p', 'q']]
    return results