python source code of hypothesis

"""
Statistical Hypothesis Tests
"""

import pandas as pd
import scipy.stats as stats
import numpy as np
import sklearn.metrics as metrics
import rpy2.robjects as ro
from collections import Counter


def g_test(data, correction=False):
    """
    G-test (likelihood ratio test).

    Parameters
    ----------
    data :
        the contingency table

    correction :
        whether to apply continuity corrections

    Returns
    -------
    g :
        the test statistic
    p :
        the p-value
    df:
        the number of degrees of freedom
    expected:
        the expected frequencies

    References
    ----------
    https://en.wikipedia.org/wiki/G-test
    """
    if isinstance(data, pd.DataFrame):
        data = data.values

    # remove zero rows/columns
    data = data[~np.all(data == 0, axis=1)]
    data = data[:, ~np.all(data == 0, axis=0)]

    if data.sum() == 0:
        return 0, 1.0, 1, None

    return stats.chi2_contingency(data, correction=correction,
                                  lambda_="log-likelihood")


def z_test(stat, sigma):
    """
    Z-test.

    Parameters
    ----------
    stat :
        the asymptotically normal statistic

    sigma :
        the statistic's standard deviation

    Returns
    -------
    pval :
        the p-value

    References
    ----------
    https://en.wikipedia.org/wiki/Z-test
    """
    z = stat/sigma
    pval = 2*stats.norm.sf(abs(z))
    return pval


def permutation_test_ct2(data, num_samples=10000):
    """
    Monte-Carlo permutation test for a 2-way contingency table

    Parameters
    ----------
    data :
        the contingency table

    num_samples :
        the number of random permutations to perform

    Returns
    -------
    pval :
        the p-value

    References
    ----------
    https://en.wikipedia.org/wiki/Resampling_(statistics)
    """
    if isinstance(data, pd.DataFrame):
        data = np.array(data)

    dim = data.shape
    data_x = []
    data_y = []

    for x in range(0, dim[0]):
        for y in range(0, dim[1]):
            data_x += [x]*data[x, y]
            data_y += [y]*data[x, y]

    stat_0 = metrics.mutual_info_score(data_x, data_y)

    k = 0
    for _ in range(num_samples):
        np.random.shuffle(data_x)
        mi = metrics.mutual_info_score(data_x, data_y)
        k += stat_0 < mi

    pval = (1.0*k) / num_samples
    return max(pval, 1.0/num_samples)


def permutation_test_ct(data, num_samples=100000):
    """
    Monte-Carlo permutation test for a contingency table.
    Uses the chi square statistic.

    Parameters
    ----------
    data :
        the contingency table

    num_samples :
        the number of random permutations to perform

    Returns
    -------
    pval :
        the p-value

    Notes
    -----
    Uses the R 'coin' package that can directly handle contingency
    tables instead of having to convert into arrays x,y

    References
    ----------
    https://en.wikipedia.org/wiki/Resampling_(statistics)
    """
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)
    data = data[data.columns[(data != 0).any()]]
    data = data[(data.T != 0).any()]

    # print 'permutation test of size {}'.format(data.sum())

    data = np.array(data, dtype='int')
    if len(data.shape) != 2:
        return 1
    if data.shape[0] < 2 or data.shape[1] < 2:
        return 1

    ro.globalenv['ct'] = data
    pval = ro.r('chisq.test(ct, simulate.p.value = TRUE, B = {})$p.value'.
                format(num_samples))[0]
    return max(pval, 1.0/num_samples)


def permutation_test_corr(x, y, num_samples=10000):
    """
    Monte-Carlo permutation test for correlation

    Parameters
    ----------
    x :
        Values for the first dimension

    y :
        Values for the second dimension

    num_samples :
        the number of random permutations to perform

    Returns
    -------
    pval :
        the p-value

    References
    ----------
    https://en.wikipedia.org/wiki/Resampling_(statistics)
    """
    x = np.array(x, dtype='float')
    y = np.array(y, dtype='float')

    obs_0, _ = stats.pearsonr(x, y)
    k = 0
    z = np.concatenate([x, y])
    for _ in range(num_samples):
        np.random.shuffle(z)
        k += abs(obs_0) < abs(stats.pearsonr(z[:len(x)], z[len(x):])[0])
    pval = (1.0*k) / num_samples
    return max(pval, 1.0/num_samples)


def permutation_test_ct_cond(data, stat, num_samples=10000):
    """
    Monte-Carlo permutation test for a 3-way contingency table

    Parameters
    ----------
    data :
        the contingency table

    stat :
        the statistic to apply to each permuted table

    num_samples :
        the number of random permutations to perform

    Returns
    -------
    pval :
        the p-value

    References
    ----------
    https://en.wikipedia.org/wiki/Resampling_(statistics)
    """
    data = np.array([ct.values if isinstance(ct, pd.DataFrame)
                     else ct for ct in data])

    dim = data.shape
    data_x = [[] for _ in data]
    data_y = [[] for _ in data]
    stat_0 = stat(data)

    # convert the table into an array of data points
    for e in range(0, dim[0]):
        for x in range(0, dim[1]):
            for y in range(0, dim[2]):
                data_x[e] += [x]*data[e, x, y]
                data_y[e] += [y]*data[e, x, y]

    k = 0
    for _ in range(num_samples):
        temp = np.zeros(dim)
        # permute each sub-group individually
        for e in range(0, dim[0]):
            np.random.shuffle(data_x[e])

            # re-form a contingency table
            counter = Counter(zip(data_x[e], data_y[e]))
            for x in range(0, dim[1]):
                for y in range(0, dim[2]):
                    temp[e, x, y] = counter.get((x, y), 0)

        temp_stat = stat(temp)
        k += stat_0 < temp_stat

    pval = (1.0*k) / num_samples
    return max(pval, 1.0/num_samples)