python source code of tableone

"""
The tableone package is used for creating "Table 1" summary statistics for
research papers.
"""

import warnings

import numpy as np
from numpy.linalg import LinAlgError
import pandas as pd
from scipy import stats
from statsmodels.stats import multitest
from tabulate import tabulate

from tableone.modality import hartigan_diptest

# display deprecation warnings
warnings.simplefilter('always', DeprecationWarning)


def load_dataset(name):
    """
    Load an example dataset from the online repository (requires internet).

    These datasets are useful for documentation and testing.

    Parameters
    ----------
    name : str
        Name of the dataset.

    Returns
    -------
    df : :class:`pandas.DataFrame`
        Tabular data.
    """
    path = ("https://raw.githubusercontent.com/"
            "tompollard/tableone/master/datasets/{}.csv")
    full_path = path.format(name)

    df = pd.read_csv(full_path)

    return df


class InputError(Exception):
    """
    Exception raised for errors in the input.
    """
    pass


class TableOne(object):
    """

    If you use the tableone package, please cite:

    Pollard TJ, Johnson AEW, Raffa JD, Mark RG (2018). tableone: An open source
    Python package for producing summary statistics for research papers.
    JAMIA Open, Volume 1, Issue 1, 1 July 2018, Pages 26-31.
    https://doi.org/10.1093/jamiaopen/ooy012

    Create an instance of the tableone summary table.

    Parameters
    ----------
    data : pandas DataFrame
        The dataset to be summarised. Rows are observations, columns are
        variables.
    columns : list, optional
        List of columns in the dataset to be included in the final table.
    categorical : list, optional
        List of columns that contain categorical variables.
    groupby : str, optional
        Optional column for stratifying the final table (default: None).
    nonnormal : list, optional
        List of columns that contain non-normal variables (default: None).
    min_max: list, optional
        List of variables that should report minimum and maximum, instead of
        standard deviation (for normal) or Q1-Q3 (for non-normal).
    pval : bool, optional
        Display computed P-Values (default: False).
    pval_adjust : str, optional
        Method used to adjust P-Values for multiple testing.
        The P-values from the unadjusted table (default when pval=True)
        are adjusted to account for the number of total tests that were performed.
        These adjustments would be useful when many variables are being screened
        to assess if their distribution varies by the variable in the groupby argument.
        For a complete list of methods, see documentation for statsmodels multipletests.
        Available methods include ::

        `None` : no correction applied.
        `bonferroni` : one-step correction
        `sidak` : one-step correction
        `holm-sidak` : step down method using Sidak adjustments
        `simes-hochberg` : step-up method (independent)
        `hommel` : closed method based on Simes tests (non-negative)

    htest_name : bool, optional
        Display a column with the names of hypothesis tests (default: False).
    htest : dict, optional
        Dictionary of custom hypothesis tests. Keys are variable names and
        values are functions. Functions must take a list of Numpy Arrays as
        the input argument and must return a test result.
        e.g. htest = {'age': myfunc}
    missing : bool, optional
        Display a count of null values (default: True).
    ddof : int, optional
        Degrees of freedom for standard deviation calculations (default: 1).
    rename : dict, optional
        Dictionary of alternative names for variables.
        e.g. `rename = {'sex':'gender', 'trt':'treatment'}`
    sort : bool or str, optional
        If `True`, sort the variables alphabetically. If a string
        (e.g. `'P-Value'`), sort by the specified column in ascending order.
        Default (`False`) retains the sequence specified in the `columns`
        argument. Currently the only columns supported are: `'Missing'`,
        `'P-Value'`, `'P-Value (adjusted)'`, and `'Test'`.
    limit : int or dict, optional
        Limit to the top N most frequent categories. If int, apply to all
        categorical variables. If dict, apply to the key (e.g. {'sex': 1}).
    order : dict, optional
        Specify an order for categorical variables. Key is the variable, value
        is a list of values in order.  {e.g. 'sex': ['f', 'm', 'other']}
    remarks : bool, optional
        Add remarks on the appropriateness of the summary measures and the
        statistical tests (default: True).
    label_suffix : bool, optional
        Append summary type (e.g. "mean (SD); median [Q1,Q3], n (%); ") to the
        row label (default: True).
    decimals : int or dict, optional
        Number of decimal places to display. An integer applies the rule to all
        variables (default: 1). A dictionary (e.g. `decimals = {'age': 0)`)
        applies the rule per variable, defaulting to 1 place for unspecified
        variables. For continuous variables, applies to all summary statistics
        (e.g. mean and standard deviation). For categorical variables, applies
        to percentage only.
    overall : bool:
        If True, add an "overall" column to the table. Smd and p-value
        calculations are performed only using stratified columns.
    display_all : bool:
        If True, set pd. display_options to display all columns and rows.
        (default: False)

    Attributes
    ----------
    tableone : dataframe
        Summary of the data (i.e., the "Table 1").

    Examples
    --------
    >>> df = pd.DataFrame({'size': [1, 2, 60, 1, 1],
    ...                   'fruit': ['peach', 'orange', 'peach', 'peach', 'orange'],
    ...                   'tasty': ['yes', 'yes', 'no', 'yes', 'no']})

    >>> df
       size   fruit tasty
    0     1   peach   yes
    1     2  orange   yes
    2    60   peach    no
    3     1   peach   yes
    4     1  orange    no

    >>> TableOne(df, overall=False, groupby="fruit", pval=True)

                        Grouped by fruit
                                 Missing     orange        peach P-Value
    n                                             2            3
    size, mean (SD)                    0  1.5 (0.7)  20.7 (34.1)   0.433
    tasty, n (%)    no                 0   1 (50.0)     1 (33.3)   1.000
                    yes                    1 (50.0)     2 (66.7)

    ...
    """

    def __init__(self, data, columns=None, categorical=None, groupby=None,
                 nonnormal=None, min_max=None, pval=False, pval_adjust=None,
                 htest_name=False, pval_test_name=False, htest=None,
                 isnull=None, missing=True, ddof=1, labels=None, rename=None,
                 sort=False, limit=None, order=None, remarks=True,
                 label_suffix=True, decimals=1, smd=False, overall=True,
                 display_all=False):

        # labels is now rename
        if labels is not None and rename is not None:
            raise TypeError("TableOne received both labels and rename.")
        elif labels is not None:
            warnings.warn("The labels argument is deprecated; use "
                          "rename instead.", DeprecationWarning)
            self._alt_labels = labels
        else:
            self._alt_labels = rename

        # isnull is now missing
        if isnull is not None:
            warnings.warn("The isnull argument is deprecated; use "
                          "missing instead.", DeprecationWarning)
            self._isnull = isnull
        else:
            self._isnull = missing

        # pval_test_name is now htest_name
        if pval_test_name:
            warnings.warn("The pval_test_name argument is deprecated; use "
                          "htest_name instead.", DeprecationWarning)
            self._pval_test_name = pval_test_name
        else:
            self._pval_test_name = htest_name

        # groupby should be a string
        if not groupby:
            groupby = ''
        elif groupby and type(groupby) == list:
            groupby = groupby[0]

        # nonnormal should be a string
        if not nonnormal:
            nonnormal = []
        elif nonnormal and type(nonnormal) == str:
            nonnormal = [nonnormal]

        # min_max should be a list
        if min_max and isinstance(min_max, bool):
            warnings.warn("min_max should specify a list of variables.")
            min_max = None

        # if the input dataframe is empty, raise error
        if data.empty:
            raise InputError("The input dataframe is empty.")

        # if columns are not specified, use all columns
        if not columns:
            columns = data.columns.values

        # check that the columns exist in the dataframe
        if not set(columns).issubset(data.columns):
            notfound = list(set(columns) - set(data.columns))
            raise InputError("""Columns not found in
                                dataset: {}""".format(notfound))

        # check for duplicate columns
        dups = data[columns].columns[data[columns].columns.duplicated()].unique()
        if not dups.empty:
            raise InputError("""Input contains duplicate
                                columns: {}""".format(dups))

        # if categorical not specified, try to identify categorical
        if not categorical and type(categorical) != list:
            categorical = self._detect_categorical_columns(data[columns])
            # omit categorical row if it is specified in groupby
            if groupby:
                categorical = [x for x in categorical if x != groupby]

        if isinstance(pval_adjust, bool) and pval_adjust:
            msg = ("pval_adjust expects a string, but a boolean was specified."
                   " Defaulting to the 'bonferroni' correction.")
            warnings.warn(msg)
            pval_adjust = "bonferroni"

        # if custom order is provided, ensure that values are strings
        if order:
            order = {k: ["{}".format(v) for v in order[k]] for k in order}

        # if input df has ordered categorical variables, get the order.
        order_cats = [x for x in data.select_dtypes("category")
                      if data[x].dtype.ordered]
        if any(order_cats):
            d_order_cats = {v: data[v].cat.categories for v in order_cats}
            d_order_cats = {k: ["{}".format(v) for v in d_order_cats[k]]
                            for k in d_order_cats}

        # combine the orders. custom order takes precedence.
        if order_cats and order:
            new = {**order, **d_order_cats}
            for k in order:
                new[k] = order[k] + [x for x in new[k] if x not in order[k]]
            order = new
        elif order_cats:
            order = d_order_cats

        if pval and not groupby:
            raise InputError("If pval=True then groupby must be specified.")

        self._columns = list(columns)
        self._continuous = [c for c in columns
                            if c not in categorical + [groupby]]
        self._categorical = categorical
        self._nonnormal = nonnormal
        self._min_max = min_max
        self._pval = pval
        self._pval_adjust = pval_adjust
        self._htest = htest
        self._sort = sort
        self._groupby = groupby
        # degrees of freedom for standard deviation
        self._ddof = ddof
        self._limit = limit
        self._order = order
        self._remarks = remarks
        self._label_suffix = label_suffix
        self._decimals = decimals
        self._smd = smd
        self._overall = overall

        # display notes and warnings below the table
        self._warnings = {}

        # output column names that cannot be contained in a groupby
        self._reserved_columns = ['Missing', 'P-Value', 'Test',
                                  'P-Value (adjusted)', 'SMD', 'Overall']

        if self._groupby:
            self._groupbylvls = sorted(data.groupby(groupby).groups.keys())

            # reorder the groupby levels if order is provided
            if self._order and self._groupby in self._order:
                unordered = [x for x in self._groupbylvls
                             if x not in self._order[self._groupby]]
                self._groupbylvls = self._order[self._groupby] + unordered

            # check that the group levels do not include reserved words
            for level in self._groupbylvls:
                if level in self._reserved_columns:
                    raise InputError("""Group level contains '{}', a reserved
                                        keyword.""".format(level))
        else:
            self._groupbylvls = ['Overall']

        # forgive me jraffa
        if self._pval:
            self._htest_table = self._create_htest_table(data)

        # correct for multiple testing
        if self._pval and self._pval_adjust:
            alpha = 0.05
            adjusted = multitest.multipletests(self._htest_table['P-Value'],
                                               alpha=alpha,
                                               method=self._pval_adjust)
            self._htest_table['P-Value (adjusted)'] = adjusted[1]
            self._htest_table['adjust method'] = self._pval_adjust

        # create overall tables if required
        if self._categorical and self._groupby and overall:
            self.cat_describe_all = self._create_cat_describe(data, False,
                                                              ['Overall'])

        if self._continuous and self._groupby and overall:
            self.cont_describe_all = self._create_cont_describe(data, False)

        # create descriptive tables
        if self._categorical:
            self.cat_describe = self._create_cat_describe(data, self._groupby,
                                                          self._groupbylvls)

        if self._continuous:
            self.cont_describe = self._create_cont_describe(data,
                                                            self._groupby)

        # compute standardized mean differences
        if self._smd:
            self.smd_table = self._create_smd_table(data)

        # create continuous and categorical tables
        if self._categorical:
            self.cat_table = self._create_cat_table(data, overall)

        if self._continuous:
            self.cont_table = self._create_cont_table(data, overall)

        # combine continuous variables and categorical variables into table 1
        self.tableone = self._create_tableone(data)

        # wrap dataframe methods
        self.head = self.tableone.head
        self.tail = self.tableone.tail
        self.to_csv = self.tableone.to_csv
        self.to_excel = self.tableone.to_excel
        self.to_html = self.tableone.to_html
        self.to_json = self.tableone.to_json
        self.to_latex = self.tableone.to_latex

        # set display options
        if display_all:
            self._set_display_options()

    def __str__(self):
        return self.tableone.to_string() + self._generate_remarks('\n')

    def __repr__(self):
        return self.tableone.to_string() + self._generate_remarks('\n')

    def _repr_html_(self):
        return self.tableone._repr_html_() + self._generate_remarks('<br />')

    def _set_display_options(self):
        """
        Set pandas display options. Display all rows and columns by default.
        """
        display_options = {'display.max_rows': None,
                           'display.max_columns': None,
                           'display.width': None,
                           'display.max_colwidth': None}

        for k in display_options:
            try:
                pd.set_option(k, display_options[k])
            except ValueError:
                msg = """Newer version of Pandas required to set the '{}'
                         option.""".format(k)
                warnings.warn(msg)

    def tabulate(self, headers=None, tablefmt='grid', **kwargs):
        """
        Pretty-print tableone data. Wrapper for the Python 'tabulate' library.

        Args:
            headers (list): Defines a list of column headers to be used.
            tablefmt (str): Defines how the table is formatted. Table formats
                include: 'plain','simple','github','grid','fancy_grid','pipe',
                'orgtbl','jira','presto','psql','rst','mediawiki','moinmoin',
                'youtrack','html','latex','latex_raw','latex_booktabs',
                and 'textile'.

        Examples:
            To output tableone in github syntax, call tabulate with the
                'tablefmt="github"' argument.

            >>> print(tableone.tabulate(tablefmt='fancy_grid'))
        """
        # reformat table for tabulate
        df = self.tableone

        if not headers:
            try:
                headers = df.columns.levels[1]
            except AttributeError:
                headers = df.columns

        df = df.reset_index()
        df = df.set_index('level_0')
        isdupe = df.index.duplicated()
        df.index = df.index.where(~isdupe, '')
        df = df.rename_axis(None).rename(columns={'level_1': ''})

        return tabulate(df, headers=headers, tablefmt=tablefmt, **kwargs)

    def _generate_remarks(self, newline='\n'):
        """
        Generate a series of remarks that the user should consider
        when interpreting the summary statistics.
        """
        # generate warnings for continuous variables
        if self._continuous:
            # highlight far outliers
            outlier_mask = self.cont_describe.far_outliers > 1
            outlier_vars = list(self.cont_describe.far_outliers[outlier_mask].
                                dropna(how='all').index)
            if outlier_vars:
                self._warnings["""Tukey test indicates far outliers
                                  in"""] = outlier_vars

            # highlight possible multimodal distributions using hartigan's dip
            # test -1 values indicate NaN
            modal_mask = ((self.cont_describe.diptest >= 0) &
                          (self.cont_describe.diptest <= 0.05))
            modal_vars = list(self.cont_describe.diptest[modal_mask].
                              dropna(how='all').index)
            if modal_vars:
                self._warnings["""Hartigan's Dip Test reports possible
                                  multimodal distributions for"""] = modal_vars

            # highlight non normal distributions
            # -1 values indicate NaN
            modal_mask = ((self.cont_describe.normaltest >= 0) &
                          (self.cont_describe.normaltest <= 0.001))
            modal_vars = list(self.cont_describe.normaltest[modal_mask].
                              dropna(how='all').index)
            if modal_vars:
                self._warnings["""Normality test reports non-normal
                                  distributions for"""] = modal_vars

        # create the warning string
        msg = '{}'.format(newline)
        for n, k in enumerate(sorted(self._warnings)):
            msg += '[{}] {}: {}.{}'.format(n+1, k,
                                           ', '.join(self._warnings[k]),
                                           newline)

        return msg

    def _detect_categorical_columns(self, data):
        """
        Detect categorical columns if they are not specified.

        Parameters
        ----------
            data : pandas DataFrame
                The input dataset.

        Returns
        ----------
            likely_cat : list
                List of variables that appear to be categorical.
        """
        # assume all non-numerical and date columns are categorical
        numeric_cols = set(data._get_numeric_data().columns.values)
        date_cols = set(data.select_dtypes(include=[np.datetime64]).columns)
        likely_cat = set(data.columns) - numeric_cols
        likely_cat = list(likely_cat - date_cols)
        # check proportion of unique values if numerical
        for var in data._get_numeric_data().columns:
            likely_flag = 1.0 * data[var].nunique()/data[var].count() < 0.005
            if likely_flag:
                likely_cat.append(var)
        return likely_cat

    def _cont_smd(self, data1=None, data2=None, mean1=None, mean2=None,
                  sd1=None, sd2=None, n1=None, n2=None, unbiased=False):
        """
        Compute the standardized mean difference (regular or unbiased) using
        either raw data or summary measures.

        Parameters
        ----------
        data1 : list
            List of values in dataset 1 (control).
        data2 : list
            List of values in dataset 2 (treatment).
        mean1 : float
            Mean of dataset 1 (control).
        mean2 : float
            Mean of dataset 2 (treatment).
        sd1 : float
            Standard deviation of dataset 1 (control).
        sd2 : float
            Standard deviation of dataset 2 (treatment).
        n1 : int
            Sample size of dataset 1 (control).
        n2 : int
            Sample size of dataset 2 (treatment).
        unbiased : bool
            Return an unbiased estimate using Hedges' correction. Correction
            factor approximated using the formula proposed in Hedges 2011.
            (default = False)

        Returns
        -------
        smd : float
            Estimated standardized mean difference.
        se : float
            Standard error of the estimated standardized mean difference.
        """
        if (data1 and not data2) or (data2 and not data1):
            raise InputError('Two sets of data must be provided.')
        elif data1 and data2:
            if any([mean1, mean2, sd1, sd2, n1, n2]):
                warnings.warn("""Mean, n, and sd were computed from the data.
                                 These input args were ignored.""")
            mean1 = np.mean(data1)
            mean2 = np.mean(data2)
            sd1 = np.std(data1)
            sd2 = np.std(data2)
            n1 = len(data1)
            n2 = len(data2)

        # if (mean1 and not mean2) or (mean2 and not mean1):
        #     raise InputError('mean1 and mean2 must both be provided.')

        # if (sd1 and not sd2) or (sd2 and not sd1):
        #     raise InputError('sd1 and sd2 must both be provided.')

        # if (n1 and not n2) or (n2 and not n1):
        #     raise InputError('n1 and n2 must both be provided.')

        # cohens_d
        smd = (mean2 - mean1) / np.sqrt((sd1 ** 2 + sd2 ** 2) / 2)

        # standard error
        v_d = ((n1+n2) / (n1*n2)) + ((smd ** 2) / (2*(n1+n2)))
        se = np.sqrt(v_d)

        if unbiased:
            # Hedges correction (J. Hedges, 1981)
            # Approximation for the the correction factor from:
            # Introduction to Meta-Analysis. Michael Borenstein,
            # L. V. Hedges, J. P. T. Higgins and H. R. Rothstein
            # Wiley (2011). Chapter 4. Effect Sizes Based on Means.
            j = 1 - (3/(4*(n1+n2-2)-1))
            smd = j * smd
            v_g = (j ** 2) * v_d
            se = np.sqrt(v_g)

        return smd, se

    def _cat_smd(self, prop1=None, prop2=None, n1=None, n2=None,
                 unbiased=False):
        """
        Compute the standardized mean difference (regular or unbiased) using
        either raw data or summary measures.

        Parameters
        ----------
        prop1 : list
            Proportions (range 0-1) for each categorical value in dataset 1
            (control).
        prop2 : list
            Proportions (range 0-1) for each categorical value in dataset 2
            (treatment).
        n1 : int
            Sample size of dataset 1 (control).
        n2 : int
            Sample size of dataset 2 (treatment).
        unbiased : bool
            Return an unbiased estimate using Hedges' correction. Correction
            factor approximated using the formula proposed in Hedges 2011.
            (default = False)

        Returns
        -------
        smd : float
            Estimated standardized mean difference.
        se : float
            Standard error of the estimated standardized mean difference.
        """
        # Categorical SMD Yang & Dalton 2012
        # https://support.sas.com/resources/papers/proceedings12/335-2012.pdf
        prop1 = np.asarray(prop1)
        prop2 = np.asarray(prop2)

        # Drop first level for consistency with R tableone
        # "to eliminate dependence if more than two levels"
        prop1 = prop1[1:]
        prop2 = prop2[1:]

        lst_cov = []
        for p in [prop1, prop2]:
            variance = p * (1 - p)
            covariance = - np.outer(p, p)
            covariance[np.diag_indices_from(covariance)] = variance
            lst_cov.append(covariance)

        mean_diff = np.matrix(prop2 - prop1)
        mean_cov = (lst_cov[0] + lst_cov[1])/2

        # TODO: add steps to deal with nulls

        try:
            sq_md = mean_diff * np.linalg.inv(mean_cov) * mean_diff.T
        except LinAlgError:
            sq_md = np.nan

        try:
            smd = np.asarray(np.sqrt(sq_md))[0][0]
        except IndexError:
            smd = np.nan

        # standard error
        v_d = ((n1+n2) / (n1*n2)) + ((smd ** 2) / (2*(n1+n2)))
        se = np.sqrt(v_d)

        if unbiased:
            # Hedges correction (J. Hedges, 1981)
            # Approximation for the the correction factor from:
            # Introduction to Meta-Analysis. Michael Borenstein,
            # L. V. Hedges, J. P. T. Higgins and H. R. Rothstein
            # Wiley (2011). Chapter 4. Effect Sizes Based on Means.
            j = 1 - (3/(4*(n1+n2-2)-1))
            smd = j * smd
            v_g = (j ** 2) * v_d
            se = np.sqrt(v_g)

        return smd, se

    def _q25(self, x):
        """
        Compute percentile (25th)
        """
        return np.nanpercentile(x.values, 25)

    def _q75(self, x):
        """
        Compute percentile (75th)
        """
        return np.nanpercentile(x.values, 75)

    def _std(self, x):
        """
        Compute standard deviation with ddof degrees of freedom
        """
        if len(x) == 1:
            return 0.0
        else:
            return np.nanstd(x.values, ddof=self._ddof)

    def _diptest(self, x):
        """
        Compute Hartigan Dip Test for modality.

        p < 0.05 suggests possible multimodality.
        """
        p = hartigan_diptest(x.values)
        # dropna=False argument in pivot_table does not function as expected
        # https://github.com/pandas-dev/pandas/issues/22159
        # return -1 instead of None
        if pd.isnull(p):
            return -1
        return p

    def _normaltest(self, x):
        """
        Compute test for normal distribution.

        Null hypothesis: x comes from a normal distribution
        p < alpha suggests the null hypothesis can be rejected.
        """
        if len(x.values[~np.isnan(x.values)]) >= 20:
            stat, p = stats.normaltest(x.values, nan_policy='omit')
        else:
            p = None
        # dropna=False argument in pivot_table does not function as expected
        # return -1 instead of None
        if pd.isnull(p):
            return -1
        return p

    def _tukey(self, x, threshold):
        """
        Count outliers according to Tukey's rule.

        Where Q1 is the lower quartile and Q3 is the upper quartile,
        an outlier is an observation outside of the range:

        [Q1 - k(Q3 - Q1), Q3 + k(Q3 - Q1)]

        k = 1.5 indicates an outlier
        k = 3.0 indicates an outlier that is "far out"
        """
        vals = x.values[~np.isnan(x.values)]

        try:
            q1, q3 = np.percentile(vals, [25, 75])
            iqr = q3 - q1
            low_bound = q1 - (iqr * threshold)
            high_bound = q3 + (iqr * threshold)
            outliers = np.where((vals > high_bound) | (vals < low_bound))
        except IndexError:
            outliers = []

        return outliers

    def _outliers(self, x):
        """
        Compute number of outliers
        """
        outliers = self._tukey(x, threshold=1.5)
        return np.size(outliers)

    def _far_outliers(self, x):
        """
        Compute number of "far out" outliers
        """
        outliers = self._tukey(x, threshold=3.0)
        return np.size(outliers)

    def _t1_summary(self, x):
        """
        Compute median [IQR] or mean (Std) for the input series.

        Parameters
        ----------
            x : pandas Series
                Series of values to be summarised.
        """
        # set decimal places
        if isinstance(self._decimals, int):
            n = self._decimals
        elif isinstance(self._decimals, dict):
            try:
                n = self._decimals[x.name]
            except KeyError:
                n = 1
        else:
            n = 1
            msg = """The decimals arg must be an int or dict.
                     Defaulting to {} d.p.""".format(n)
            warnings.warn(msg)

        if x.name in self._nonnormal:
            f = "{{:.{}f}} [{{:.{}f}},{{:.{}f}}]".format(n, n, n)
            if self._min_max and x.name in self._min_max:
                return f.format(
                    np.nanmedian(x.values), np.nanmin(x.values),
                    np.nanmax(x.values),
                )
            else:
                return f.format(
                    np.nanmedian(x.values),
                    np.nanpercentile(x.values, 25),
                    np.nanpercentile(x.values, 75),
                )
        else:
            if self._min_max and x.name in self._min_max:
                f = "{{:.{}f}} [{{:.{}f}},{{:.{}f}}]".format(n, n, n)
                return f.format(
                    np.nanmean(x.values), np.nanmin(x.values),
                    np.nanmax(x.values),
                )
            else:
                f = '{{:.{}f}} ({{:.{}f}})'.format(n, n)
                return f.format(np.nanmean(x.values), self._std(x))

    def _create_cont_describe(self, data, groupby):
        """
        Describe the continuous data.

        Parameters
        ----------
            data : pandas DataFrame
                The input dataset.

        Returns
        ----------
            df_cont : pandas DataFrame
                Summarise the continuous variables.
        """
        aggfuncs = [pd.Series.count, np.mean, np.median, self._std,
                    self._q25, self._q75, min, max, self._t1_summary,
                    self._diptest, self._outliers, self._far_outliers,
                    self._normaltest]

        # coerce continuous data to numeric
        cont_data = data[self._continuous].apply(pd.to_numeric,
                                                 errors='coerce')
        # check all data in each continuous column is numeric
        bad_cols = cont_data.count() != data[self._continuous].count()
        bad_cols = cont_data.columns[bad_cols]
        if len(bad_cols) > 0:
            msg = ("The following continuous column(s) have "
                   "non-numeric values: {variables}. Either specify the "
                   "column(s) as categorical or remove the "
                   "non-numeric values.").format(variables=bad_cols.values)
            raise InputError(msg)

        # check for coerced column containing all NaN to warn user
        for column in cont_data.columns[cont_data.count() == 0]:
            self._non_continuous_warning(column)

        if groupby:
            # add the groupby column back
            cont_data = cont_data.merge(data[[groupby]],
                                        left_index=True,
                                        right_index=True)

            # group and aggregate data
            df_cont = pd.pivot_table(cont_data,
                                     columns=[groupby],
                                     aggfunc=aggfuncs)
        else:
            # if no groupby, just add single group column
            df_cont = cont_data.apply(aggfuncs).T
            df_cont.columns.name = 'Overall'
            df_cont.columns = pd.MultiIndex.from_product([df_cont.columns,
                                                         ['Overall']])

        df_cont.index = df_cont.index.rename('variable')

        # remove prefix underscore from column names (e.g. _std -> std)
        agg_rename = df_cont.columns.levels[0]
        agg_rename = [x[1:] if x[0] == '_' else x for x in agg_rename]
        df_cont.columns = df_cont.columns.set_levels(agg_rename, level=0)

        return df_cont

    def _format_cat(self, row):
        var = row.name[0]
        if var in self._decimals:
            n = self._decimals[var]
        else:
            n = 1
        f = '{{:.{}f}}'.format(n)
        return f.format(row.percent)

    def _create_cat_describe(self, data, groupby, groupbylvls):
        """
        Describe the categorical data.

        Parameters
        ----------
            data : pandas DataFrame
                The input dataset.

        Returns
        ----------
            df_cat : pandas DataFrame
                Summarise the categorical variables.
        """
        group_dict = {}

        for g in groupbylvls:
            if groupby:
                d_slice = data.loc[data[groupby] == g, self._categorical]
            else:
                d_slice = data[self._categorical].copy()

            # create a dataframe with freq, proportion
            df = d_slice.copy()

            # convert to str to handle int converted to boolean. Avoid nans.
            for column in df.columns:
                df[column] = [str(row) if not pd.isnull(row)
                              else None for row in df[column].values]

            df = df.melt().groupby(['variable',
                                    'value']).size().to_frame(name='freq')

            df['percent'] = df['freq'].div(df.freq.sum(level=0),
                                           level=0).astype(float) * 100

            # set number of decimal places for percent
            if isinstance(self._decimals, int):
                n = self._decimals
                f = '{{:.{}f}}'.format(n)
                df['percent_str'] = df['percent'].astype(float).map(f.format)
            elif isinstance(self._decimals, dict):
                df.loc[:, 'percent_str'] = df.apply(self._format_cat, axis=1)
            else:
                n = 1
                f = '{{:.{}f}}'.format(n)
                df['percent_str'] = df['percent'].astype(float).map(f.format)

            # add n column, listing total non-null values for each variable
            ct = d_slice.count().to_frame(name='n')
            ct.index.name = 'variable'
            df = df.join(ct)

            # add null count
            nulls = d_slice.isnull().sum().to_frame(name='Missing')
            nulls.index.name = 'variable'
            # only save null count to the first category for each variable
            # do this by extracting the first category from the df row index
            levels = df.reset_index()[['variable',
                                       'value']].groupby('variable').first()
            # add this category to the nulls table
            nulls = nulls.join(levels)
            nulls = nulls.set_index('value', append=True)
            # join nulls to categorical
            df = df.join(nulls)

            # add summary column
            df['t1_summary'] = (df.freq.map(str) + ' ('
                                + df.percent_str.map(str)+')')

            # add to dictionary
            group_dict[g] = df

        df_cat = pd.concat(group_dict, axis=1)
        # ensure the groups are the 2nd level of the column index
        if df_cat.columns.nlevels > 1:
            df_cat = df_cat.swaplevel(0, 1, axis=1).sort_index(axis=1, level=0)

        return df_cat

    def _create_htest_table(self, data):
        """
        Create a table containing P-Values for significance tests. Add features
        of the distributions and the P-Values to the dataframe.

        Parameters
        ----------
            data : pandas DataFrame
                The input dataset.

        Returns
        ----------
            df : pandas DataFrame
                A table containing the P-Values, test name, etc.
        """
        # list features of the variable e.g. matched, paired, n_expected
        df = pd.DataFrame(index=self._continuous+self._categorical,
                          columns=['continuous', 'nonnormal',
                                   'min_observed', 'P-Value', 'Test'])

        df.index = df.index.rename('variable')
        df['continuous'] = np.where(df.index.isin(self._continuous),
                                    True, False)

        df['nonnormal'] = np.where(df.index.isin(self._nonnormal),
                                   True, False)

        # list values for each variable, grouped by groupby levels
        for v in df.index:
            is_continuous = df.loc[v]['continuous']
            is_categorical = ~df.loc[v]['continuous']
            is_normal = ~df.loc[v]['nonnormal']

            # if continuous, group data into list of lists
            if is_continuous:
                catlevels = None
                grouped_data = {}
                for s in self._groupbylvls:
                    lvl_data = data.loc[data[self._groupby] == s, v]
                    # coerce to numeric and drop non-numeric data
                    lvl_data = lvl_data.apply(pd.to_numeric,
                                              errors='coerce').dropna()
                    # append to overall group data
                    grouped_data[s] = lvl_data.values
                min_observed = min([len(x) for x in grouped_data.values()])
            # if categorical, create contingency table
            elif is_categorical:
                catlevels = sorted(data[v].astype('category').cat.categories)
                cross_tab = pd.crosstab(data[self._groupby].
                                        rename('_groupby_var_'), data[v])
                min_observed = cross_tab.sum(axis=1).min()
                grouped_data = cross_tab.T.to_dict('list')

            # minimum number of observations across all levels
            df.loc[v, 'min_observed'] = min_observed

            # compute pvalues
            (df.loc[v, 'P-Value'],
                df.loc[v, 'Test']) = self._p_test(v, grouped_data,
                                                  is_continuous,
                                                  is_categorical, is_normal,
                                                  min_observed, catlevels)

        return df

    def _create_smd_table(self, data):
        """
        Create a table containing pairwise Standardized Mean Differences
        (SMDs).

        Parameters
        ----------
            data : pandas DataFrame
                The input dataset.

        Returns
        ----------
            df : pandas DataFrame
                A table containing pairwise standardized mean differences
                (SMDs).
        """
        # create the SMD table
        permutations = [sorted((x, y),
                        key=lambda f: self._groupbylvls.index(f))
                        for x in self._groupbylvls
                        for y in self._groupbylvls if x is not y]

        p_set = set(tuple(x) for x in permutations)

        colname = 'SMD ({0},{1})'
        columns = [colname.format(x[0], x[1]) for x in p_set]
        df = pd.DataFrame(index=self._continuous+self._categorical,
                          columns=columns)
        df.index = df.index.rename('variable')

        for p in p_set:
            try:
                for v in self.cont_describe.index:
                    smd, _ = self._cont_smd(
                                mean1=self.cont_describe['mean'][p[0]].loc[v],
                                mean2=self.cont_describe['mean'][p[1]].loc[v],
                                sd1=self.cont_describe['std'][p[0]].loc[v],
                                sd2=self.cont_describe['std'][p[1]].loc[v],
                                n1=self.cont_describe['count'][p[0]].loc[v],
                                n2=self.cont_describe['count'][p[1]].loc[v],
                                unbiased=False)
                    df[colname.format(p[0], p[1])].loc[v] = smd
            except AttributeError:
                pass

            try:
                for v, _ in self.cat_describe.groupby(level=0):
                    smd, _ = self._cat_smd(
                        prop1=self.cat_describe.loc[[v]]['percent'][p[0]].values/100,
                        prop2=self.cat_describe.loc[[v]]['percent'][p[1]].values/100,
                        n1=self.cat_describe.loc[[v]]['freq'][p[0]].sum(),
                        n2=self.cat_describe.loc[[v]]['freq'][p[1]].sum(),
                        unbiased=False)
                    df[colname.format(p[0], p[1])].loc[v] = smd
            except AttributeError:
                pass

        return df

    def _p_test(self, v, grouped_data, is_continuous, is_categorical,
                is_normal, min_observed, catlevels):
        """
        Compute P-Values.

        Parameters
        ----------
            v : str
                Name of the variable to be tested.
            grouped_data : dict
                Dictionary of Numpy Arrays to be tested.
            is_continuous : bool
                True if the variable is continuous.
            is_categorical : bool
                True if the variable is categorical.
            is_normal : bool
                True if the variable is normally distributed.
            min_observed : int
                Minimum number of values across groups for the variable.
            catlevels : list
                Sorted list of levels for categorical variables.

        Returns
        ----------
            pval : float
                The computed P-Value.
            ptest : str
                The name of the test used to compute the P-Value.
        """

        # no test by default
        pval = np.nan
        ptest = 'Not tested'

        # apply user defined test
        if self._htest and v in self._htest:
            pval = self._htest[v](*grouped_data.values())
            ptest = self._htest[v].__name__
            return pval, ptest

        # do not test if the variable has no observations in a level
        if min_observed == 0:
            msg = ("No P-Value was computed for {variable} due to the low "
                   "number of observations.""").format(variable=v)
            warnings.warn(msg)
            return pval, ptest

        # continuous
        if (is_continuous and is_normal and len(grouped_data) == 2
            and min_observed >= 2):
            ptest = 'Two Sample T-test'
            test_stat, pval = stats.ttest_ind(*grouped_data.values(),
                                              equal_var=False,
                                              nan_policy="omit")
        elif is_continuous and is_normal:
            # normally distributed
            ptest = 'One-way ANOVA'
            test_stat, pval = stats.f_oneway(*grouped_data.values())
        elif is_continuous and not is_normal:
            # non-normally distributed
            ptest = 'Kruskal-Wallis'
            test_stat, pval = stats.kruskal(*grouped_data.values())
        # categorical
        elif is_categorical:
            # default to chi-squared
            ptest = 'Chi-squared'
            grouped_val_list = [x for x in grouped_data.values()]
            chi2, pval, dof, expected = stats.chi2_contingency(grouped_val_list)
            # if any expected cell counts are < 5, chi2 may not be valid
            # if this is a 2x2, switch to fisher exact
            if expected.min() < 5 or min_observed < 5:
                if np.shape(grouped_val_list) == (2, 2):
                    ptest = "Fisher's exact"
                    odds_ratio, pval = stats.fisher_exact(grouped_val_list)
                else:
                    ptest = "Chi-squared (warning: expected count < 5)"
                    chi_warn = ("Chi-squared tests for the following "
                                "variables may be invalid due to the low "
                                "number of observations")
                    try:
                        self._warnings[chi_warn].append(v)
                    except KeyError:
                        self._warnings[chi_warn] = [v]

        return pval, ptest

    def _create_cont_table(self, data, overall):
        """
        Create tableone for continuous data.

        Returns
        ----------
        table : pandas DataFrame
            A table summarising the continuous variables.
        """
        # remove the t1_summary level
        table = self.cont_describe[['t1_summary']].copy()
        table.columns = table.columns.droplevel(level=0)

        # add a column of null counts as 1-count() from previous function
        nulltable = data[self._continuous].isnull().sum().to_frame(name='Missing')
        try:
            table = table.join(nulltable)
        # if columns form a CategoricalIndex, need to convert to string first
        except TypeError:
            table.columns = table.columns.astype(str)
            table = table.join(nulltable)

        # add an empty value column, for joining with cat table
        table['value'] = ''
        table = table.set_index([table.index, 'value'])

        # add pval column
        if self._pval and self._pval_adjust:
            table = table.join(self._htest_table[['P-Value (adjusted)',
                                                  'Test']])
        elif self._pval:
            table = table.join(self._htest_table[['P-Value', 'Test']])

        # add standardized mean difference (SMD) column/s
        if self._smd:
            table = table.join(self.smd_table)

        # join the overall column if needed
        if self._groupby and overall:
            table = table.join(pd.concat([self.cont_describe_all['t1_summary'].
                                          Overall], axis=1, keys=["Overall"]))

        return table

    def _create_cat_table(self, data, overall):
        """
        Create table one for categorical data.

        Returns
        ----------
        table : pandas DataFrame
            A table summarising the categorical variables.
        """
        table = self.cat_describe['t1_summary'].copy()

        # add the total count of null values across all levels
        isnull = data[self._categorical].isnull().sum().to_frame(name='Missing')
        isnull.index = isnull.index.rename('variable')
        try:
            table = table.join(isnull)
        # if columns form a CategoricalIndex, need to convert to string first
        except TypeError:
            table.columns = table.columns.astype(str)
            table = table.join(isnull)

        # add pval column
        if self._pval and self._pval_adjust:
            table = table.join(self._htest_table[['P-Value (adjusted)',
                                                  'Test']])
        elif self._pval:
            table = table.join(self._htest_table[['P-Value', 'Test']])

        # add standardized mean difference (SMD) column/s
        if self._smd:
            table = table.join(self.smd_table)

        # join the overall column if needed
        if self._groupby and overall:
            table = table.join(pd.concat([self.cat_describe_all['t1_summary'].
                                          Overall], axis=1, keys=["Overall"]))

        return table

    def _create_tableone(self, data):
        """
        Create table 1 by combining the continuous and categorical tables.

        Returns
        ----------
        table : pandas DataFrame
            The complete table one.
        """
        if self._continuous and self._categorical:
            # support pandas<=0.22
            try:
                table = pd.concat([self.cont_table, self.cat_table],
                                  sort=False)
            except TypeError:
                table = pd.concat([self.cont_table, self.cat_table])
        elif self._continuous:
            table = self.cont_table
        elif self._categorical:
            table = self.cat_table

        # ensure column headers are strings before reindexing
        table = table.reset_index().set_index(['variable', 'value'])
        table.columns = table.columns.values.astype(str)

        # sort the table rows
        sort_columns = ['Missing', 'P-Value', 'P-Value (adjusted)', 'Test']
        if self._smd:
            sort_columns = sort_columns + list(self.smd_table.columns)

        if self._sort and isinstance(self._sort, bool):
            new_index = sorted(table.index.values, key=lambda x: x[0].lower())
        elif self._sort and isinstance(self._sort, str) and (self._sort in
                                                             sort_columns):
            try:
                new_index = table.sort_values(self._sort).index
            except KeyError:
                new_index = sorted(table.index.values,
                                   key=lambda x: self._columns.index(x[0]))
                warnings.warn('Sort variable not found: {}'.format(self._sort))
        elif self._sort and isinstance(self._sort, str) and (self._sort not in
                                                             sort_columns):
            new_index = sorted(table.index.values,
                               key=lambda x: self._columns.index(x[0]))
            warnings.warn('Sort must be in the following ' +
                          'list: {}.'.format(self._sort))
        else:
            # sort by the columns argument
            new_index = sorted(table.index.values,
                               key=lambda x: self._columns.index(x[0]))
        table = table.reindex(new_index)

        # round pval column and convert to string
        if self._pval and self._pval_adjust:
            table['P-Value (adjusted)'] = table['P-Value (adjusted)'].apply(
                                                '{:.3f}'.format).astype(str)
            table.loc[table['P-Value (adjusted)'] == '0.000',
                      'P-Value (adjusted)'] = '<0.001'
        elif self._pval:
            table['P-Value'] = table['P-Value'].apply(
                                     '{:.3f}'.format).astype(str)
            table.loc[table['P-Value'] == '0.000', 'P-Value'] = '<0.001'

        # round smd columns and convert to string
        if self._smd:
            for c in list(self.smd_table.columns):
                table[c] = table[c].apply('{:.3f}'.format).astype(str)
                table.loc[table[c] == '0.000', c] = '<0.001'

        # if an order is specified, apply it
        if self._order:
            for k in self._order:

                # Skip if the variable isn't present
                try:
                    all_var = table.loc[k].index.unique(level='value')
                except KeyError:
                    if k not in self._groupby:
                        warnings.warn("Order variable not found: {}".format(k))
                    continue

                # Remove value from order if it is not present
                if [i for i in self._order[k] if i not in all_var]:
                    rm_var = [i for i in self._order[k] if i not in all_var]
                    self._order[k] = [i for i in self._order[k]
                                      if i in all_var]
                    warnings.warn(("Order value not found: "
                                   "{}: {}").format(k, rm_var))

                new_seq = [(k, '{}'.format(v)) for v in self._order[k]]
                new_seq += [(k, '{}'.format(v)) for v in all_var
                            if v not in self._order[k]]

                # restructure to match the original idx
                new_idx_array = np.empty((len(new_seq),), dtype=object)
                new_idx_array[:] = [tuple(i) for i in new_seq]
                orig_idx = table.index.values.copy()
                orig_idx[table.index.get_loc(k)] = new_idx_array
                table = table.reindex(orig_idx)

        # set the limit on the number of categorical variables
        if self._limit:
            levelcounts = data[self._categorical].nunique()
            for k, _ in levelcounts.iteritems():

                # set the limit for the variable
                if (isinstance(self._limit, int)
                        and levelcounts[k] >= self._limit):
                    limit = self._limit
                elif isinstance(self._limit, dict) and k in self._limit:
                    limit = self._limit[k]
                else:
                    continue

                if not self._order or (self._order and k not in self._order):
                    # re-order the variables by frequency
                    count = data[k].value_counts().sort_values(ascending=False)
                    new_idx = [(k, '{}'.format(i)) for i in count.index]
                else:
                    # apply order
                    all_var = table.loc[k].index.unique(level='value')
                    new_idx = [(k, '{}'.format(v)) for v in self._order[k]]
                    new_idx += [(k, '{}'.format(v)) for v in all_var
                                if v not in self._order[k]]

                # restructure to match the original idx
                new_idx_array = np.empty((len(new_idx),), dtype=object)
                new_idx_array[:] = [tuple(i) for i in new_idx]
                orig_idx = table.index.values.copy()
                orig_idx[table.index.get_loc(k)] = new_idx_array
                table = table.reindex(orig_idx)

                # drop the rows > the limit
                table = table.drop(new_idx_array[limit:])

        # insert n row
        n_row = pd.DataFrame(columns=['variable', 'value', 'Missing'])
        n_row = n_row.set_index(['variable', 'value'])
        n_row.loc['n', 'Missing'] = None

        # support pandas<=0.22
        try:
            table = pd.concat([n_row, table], sort=False)
        except TypeError:
            table = pd.concat([n_row, table])

        if self._groupbylvls == ['Overall']:
            table.loc['n', 'Overall'] = len(data.index)
        else:
            if self._overall:
                table.loc['n', 'Overall'] = len(data.index)
            for g in self._groupbylvls:
                ct = data[self._groupby][data[self._groupby] == g].count()
                table.loc['n', '{}'.format(g)] = ct

        # only display data in first level row
        dupe_mask = table.groupby(level=[0]).cumcount().ne(0)
        dupe_columns = ['Missing']
        optional_columns = ['P-Value', 'P-Value (adjusted)', 'Test']
        if self._smd:
            optional_columns = optional_columns + list(self.smd_table.columns)
        for col in optional_columns:
            if col in table.columns.values:
                dupe_columns.append(col)

        table[dupe_columns] = table[dupe_columns].mask(dupe_mask).fillna('')

        # remove Missing column if not needed
        if not self._isnull:
            table = table.drop('Missing', axis=1)

        if self._pval and not self._pval_test_name:
            table = table.drop('Test', axis=1)

        # replace nans with empty strings
        table = table.fillna('')

        # add column index
        if not self._groupbylvls == ['Overall']:
            # rename groupby variable if requested
            c = self._groupby
            if self._alt_labels:
                if self._groupby in self._alt_labels:
                    c = self._alt_labels[self._groupby]

            c = 'Grouped by {}'.format(c)
            table.columns = pd.MultiIndex.from_product([[c], table.columns])

        # display alternative labels if assigned
        table = table.rename(index=self._create_row_labels(), level=0)

        # ensure the order of columns is consistent
        if self._groupby and self._order and (self._groupby in self._order):
            header = ['{}'.format(v) for v in table.columns.levels[1].values]
            cols = self._order[self._groupby] + ['{}'.format(v)
                                                 for v in header
                                                 if v not in
                                                 self._order[self._groupby]]
        elif self._groupby:
            cols = ['{}'.format(v) for v in table.columns.levels[1].values]
        else:
            cols = ['{}'.format(v) for v in table.columns.values]

        if self._groupby and self._overall:
            cols = ['Overall'] + [x for x in cols if x != 'Overall']

        if 'Missing' in cols:
            cols = ['Missing'] + [x for x in cols if x != 'Missing']

        # move optional_columns to the end of the dataframe
        for col in optional_columns:
            if col in cols:
                cols = [x for x in cols if x != col] + [col]

        if self._groupby:
            table = table.reindex(cols, axis=1, level=1)
        else:
            table = table.reindex(cols, axis=1)

        try:
            if 'Missing' in self._alt_labels or 'Overall' in self._alt_labels:
                table = table.rename(columns=self._alt_labels)
        except TypeError:
            pass

        # remove the 'variable, value' column names in the index
        table = table.rename_axis([None, None])

        return table

    def _create_row_labels(self):
        """
        Take the original labels for rows. Rename if alternative labels are
        provided. Append label suffix if label_suffix is True.

        Returns
        ----------
        labels : dictionary
            Dictionary, keys are original column name, values are final label.

        """
        # start with the original column names
        labels = {}
        for c in self._columns:
            labels[c] = c

        # replace column names with alternative names if provided
        if self._alt_labels:
            for k in self._alt_labels.keys():
                labels[k] = self._alt_labels[k]

        # append the label suffix
        if self._label_suffix:
            for k in labels.keys():
                if k in self._nonnormal:
                    if self._min_max and k in self._min_max:
                        labels[k] = "{}, {}".format(labels[k], "median [min,max]")
                    else:
                        labels[k] = "{}, {}".format(labels[k], "median [Q1,Q3]")
                elif k in self._categorical:
                    labels[k] = "{}, {}".format(labels[k], "n (%)")
                else:
                    if self._min_max and k in self._min_max:
                        labels[k] = "{}, {}".format(labels[k], "mean [min,max]")
                    else:
                        labels[k] = "{}, {}".format(labels[k], "mean (SD)")

        return labels

    # warnings
    def _non_continuous_warning(self, c):
        msg = ("'{}' has all non-numeric values. Consider including "
               "it in the list of categorical variables.").format(c)
        warnings.warn(msg, RuntimeWarning, stacklevel=2)