"""
Test functions for GEE

External comparisons are to R and Stata.  The statsmodels GEE
implementation should generally agree with the R GEE implementation
for the independence and exchangeable correlation structures.  For
other correlation structures, the details of the correlation
estimation differ among implementations and the results will not agree
exactly.
"""

from statsmodels.compat import lrange
from statsmodels.compat.testing import skipif

import numpy as np
import os

from numpy.testing import (assert_almost_equal, assert_equal, assert_allclose,
                           assert_array_less, assert_raises, assert_)
from statsmodels.genmod.generalized_estimating_equations import (
    GEE, OrdinalGEE, NominalGEE, NominalGEEResults, OrdinalGEEResults,
    NominalGEEResultsWrapper, OrdinalGEEResultsWrapper)
from statsmodels.genmod.families import Gaussian, Binomial, Poisson
from statsmodels.genmod.cov_struct import (Exchangeable, Independence,
                                           GlobalOddsRatio, Autoregressive,
                                           Nested, Stationary)
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy.stats.distributions import norm
import warnings

try:
    import matplotlib.pyplot as plt  # makes plt available for test functions
    have_matplotlib = True
except ImportError:
    have_matplotlib = False

pdf_output = False

if pdf_output:
    from matplotlib.backends.backend_pdf import PdfPages
    pdf = PdfPages("test_gee.pdf")
else:
    pdf = None


def close_or_save(pdf, fig):
    if pdf_output:
        pdf.savefig(fig)
    plt.close(fig)


def teardown_module():
    if have_matplotlib:
        plt.close('all')
        if pdf_output:
            pdf.close()


def load_data(fname, icept=True):
    """
    Load a data set from the results directory.  The data set should
    be a CSV file with the following format:

    Column 0: Group indicator
    Column 1: endog variable
    Columns 2-end: exog variables

    If `icept` is True, an intercept is prepended to the exog
    variables.
    """

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    Z = np.genfromtxt(os.path.join(cur_dir, 'results', fname),
                      delimiter=",")

    group = Z[:, 0]
    endog = Z[:, 1]
    exog = Z[:, 2:]

    if icept:
        exog = np.concatenate((np.ones((exog.shape[0], 1)), exog),
                              axis=1)

    return endog, exog, group


def check_wrapper(results):
    # check wrapper
    assert_(isinstance(results.params, pd.Series))
    assert_(isinstance(results.fittedvalues, pd.Series))
    assert_(isinstance(results.resid, pd.Series))
    assert_(isinstance(results.centered_resid, pd.Series))

    assert_(isinstance(results._results.params, np.ndarray))
    assert_(isinstance(results._results.fittedvalues, np.ndarray))
    assert_(isinstance(results._results.resid, np.ndarray))
    assert_(isinstance(results._results.centered_resid, np.ndarray))


class TestGEE(object):

    def test_margins_gaussian(self):
        # Check marginal effects for a Gaussian GEE fit.  Marginal
        # effects and ordinary effects should be equal.
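        # (For a Gaussian family with the identity link, dE[y]/dx is the
        # regression coefficient itself, so the two sets of effects coincide.)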

        n = 40
        np.random.seed(34234)
        exog = np.random.normal(size=(n, 3))
        exog[:, 0] = 1

        groups = np.kron(np.arange(n / 4), np.r_[1, 1, 1, 1])
        endog = exog[:, 1] + np.random.normal(size=n)

        model = sm.GEE(endog, exog, groups)
        result = model.fit(
            start_params=[-4.88085602e-04, 1.18501903, 4.78820100e-02])

        marg = result.get_margeff()

        assert_allclose(marg.margeff, result.params[1:])
        assert_allclose(marg.margeff_se, result.bse[1:])

        # smoke test
        marg.summary()

    def test_margins_logistic(self):
        # Check marginal effects for a binomial GEE fit.  Comparison
        # comes from Stata.

        np.random.seed(34234)
        endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2]

        groups = np.arange(8)

        model = sm.GEE(endog, exog, groups, family=sm.families.Binomial())
        result = model.fit(
            cov_type='naive', start_params=[-3.29583687,  2.19722458])

        marg = result.get_margeff()

        assert_allclose(marg.margeff, np.r_[0.4119796])
        assert_allclose(marg.margeff_se, np.r_[0.1379962], rtol=1e-6)

    def test_margins_multinomial(self):
        # Check marginal effects for a 2-class multinomial GEE fit,
        # which should be equivalent to logistic regression.  Comparison
        # comes from Stata.

        np.random.seed(34234)
        endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2]

        groups = np.arange(8)

        model = sm.NominalGEE(endog, exog, groups)
        result = model.fit(cov_type='naive', start_params=[
                           3.295837, -2.197225])

        marg = result.get_margeff()

        assert_allclose(marg.margeff, np.r_[-0.41197961], rtol=1e-5)
        assert_allclose(marg.margeff_se, np.r_[0.1379962], rtol=1e-6)

    @skipif(not have_matplotlib, reason='matplotlib not available')
    def test_nominal_plot(self):
        np.random.seed(34234)
        endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2]

        groups = np.arange(8)

        model = sm.NominalGEE(endog, exog, groups)
        result = model.fit(cov_type='naive',
                           start_params=[3.295837, -2.197225])

        # Smoke test for figure
        fig = result.plot_distribution()
        assert_equal(isinstance(fig, plt.Figure), True)
        plt.close(fig)

    def test_margins_poisson(self):
        # Check marginal effects for a Poisson GEE fit.

        np.random.seed(34234)
        endog = np.r_[10, 15, 12, 13, 20, 18, 26, 29]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[0, 0, 0, 0, 1, 1, 1, 1]

        groups = np.arange(8)

        model = sm.GEE(endog, exog, groups, family=sm.families.Poisson())
        result = model.fit(cov_type='naive', start_params=[
                           2.52572864, 0.62057649])

        marg = result.get_margeff()

        assert_allclose(marg.margeff, np.r_[11.0928], rtol=1e-6)
        assert_allclose(marg.margeff_se, np.r_[3.269015], rtol=1e-6)

    def test_multinomial(self):
        """
        Check the 2-class multinomial (nominal) GEE fit against
        logistic regression.
        """

        np.random.seed(34234)
        endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2]

        groups = np.arange(8)

        model = sm.NominalGEE(endog, exog, groups)
        results = model.fit(cov_type='naive', start_params=[
                            3.295837, -2.197225])

        logit_model = sm.GEE(endog, exog, groups,
                             family=sm.families.Binomial())
        logit_results = logit_model.fit(cov_type='naive')

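        # The nominal and binomial fits use opposite reference categories,
        # so the coefficients should agree up to sign while the standard
        # errors should match.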
        assert_allclose(results.params, -logit_results.params, rtol=1e-5)
        assert_allclose(results.bse, logit_results.bse, rtol=1e-5)

    def test_weighted(self):

        # Simple check where the answer can be computed by hand.
        exog = np.ones(20)
        weights = np.ones(20)
        weights[0:10] = 2
        endog = np.zeros(20)
        endog[0:10] += 1
        groups = np.kron(np.arange(10), np.r_[1, 1])
        model = GEE(endog, exog, groups, weights=weights)
        result = model.fit()
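        # The model is intercept-only, so the estimate is the weighted mean
        # of endog: (10*2*1 + 10*1*0) / (10*2 + 10*1) = 20/30 = 2/3.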
        assert_allclose(result.params, np.r_[2 / 3.])

        # Comparison against stata using groups with different sizes.
        weights = np.ones(20)
        weights[10:] = 2
        endog = np.r_[1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6,
                      7, 8, 7, 8]
        exog1 = np.r_[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4,
                      3, 3, 3, 3]
        groups = np.r_[1, 1, 2, 2, 2, 2, 4, 4, 5, 5, 6, 6, 6, 6,
                       8, 8, 9, 9, 10, 10]
        exog = np.column_stack((np.ones(20), exog1))

        # Comparison using independence model
        model = GEE(endog, exog, groups, weights=weights,
                    cov_struct=sm.cov_struct.Independence())
        g = np.mean([2, 4, 2, 2, 4, 2, 2, 2])
        fac = 20 / float(20 - g)
        result = model.fit(ddof_scale=0, scaling_factor=fac)

        assert_allclose(result.params, np.r_[1.247573, 1.436893], atol=1e-6)
        assert_allclose(result.scale, 1.808576)

        # Stata multiplies robust SE by sqrt(N / (N - g)), where N is
        # the total sample size and g is the average group size.
        assert_allclose(result.bse, np.r_[0.895366, 0.3425498], atol=1e-5)

        # Comparison using exchangeable model
        # Smoke test for now
        model = GEE(endog, exog, groups, weights=weights,
                    cov_struct=sm.cov_struct.Exchangeable())
        result = model.fit(ddof_scale=0)

    # This is in the release announcement for version 0.6.
    def test_poisson_epil(self):

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(cur_dir, "results", "epil.csv")
        data = pd.read_csv(fname)

        fam = Poisson()
        ind = Independence()
        mod1 = GEE.from_formula("y ~ age + trt + base", data["subject"],
                                data, cov_struct=ind, family=fam)
        rslt1 = mod1.fit(cov_type='naive')

        # Coefficients should agree with GLM
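        # (With an independence working correlation the GEE estimating
        # equations reduce to the GLM score equations, so the point
        # estimates coincide.)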
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families

        mod2 = GLM.from_formula("y ~ age + trt + base", data,
                                family=families.Poisson())
        rslt2 = mod2.fit()

        # compare the unwrapped results; assert_xxx doesn't handle wrappers
        rslt1 = rslt1._results
        rslt2 = rslt2._results

        assert_allclose(rslt1.params, rslt2.params, rtol=1e-6, atol=1e-6)
        assert_allclose(rslt1.bse, rslt2.bse, rtol=1e-6, atol=1e-6)

    def test_missing(self):
        # Test missing data handling when endog, exog and groups are passed
        # as arrays through the api.  Formula-based missing data handling
        # is tested separately in test_missing_formula.

        endog = np.random.normal(size=100)
        exog = np.random.normal(size=(100, 3))
        exog[:, 0] = 1
        groups = np.kron(lrange(20), np.ones(5))

        endog[0] = np.nan
        endog[5:7] = np.nan
        exog[10:12, 1] = np.nan

        mod1 = GEE(endog, exog, groups, missing='drop')
        rslt1 = mod1.fit()

        assert_almost_equal(len(mod1.endog), 95)
        assert_almost_equal(np.asarray(mod1.exog.shape), np.r_[95, 3])

        ii = np.isfinite(endog) & np.isfinite(exog).all(1)

        mod2 = GEE(endog[ii], exog[ii, :], groups[ii], missing='none')
        rslt2 = mod2.fit()

        assert_almost_equal(rslt1.params, rslt2.params)
        assert_almost_equal(rslt1.bse, rslt2.bse)

    def test_missing_formula(self):
        # Test missing data handling for formulas.

        endog = np.random.normal(size=100)
        exog1 = np.random.normal(size=100)
        exog2 = np.random.normal(size=100)
        exog3 = np.random.normal(size=100)
        groups = np.kron(lrange(20), np.ones(5))

        endog[0] = np.nan
        endog[5:7] = np.nan
        exog2[10:12] = np.nan

        data = pd.DataFrame({"endog": endog, "exog1": exog1, "exog2": exog2,
                             "exog3": exog3, "groups": groups})

        mod1 = GEE.from_formula("endog ~ exog1 + exog2 + exog3",
                                groups, data, missing='drop')
        rslt1 = mod1.fit()

        assert_almost_equal(len(mod1.endog), 95)
        assert_almost_equal(np.asarray(mod1.exog.shape), np.r_[95, 4])

        data = data.dropna()
        groups = groups[data.index.values]

        mod2 = GEE.from_formula("endog ~ exog1 + exog2 + exog3",
                                groups, data, missing='none')
        rslt2 = mod2.fit()

        assert_almost_equal(rslt1.params.values, rslt2.params.values)
        assert_almost_equal(rslt1.bse.values, rslt2.bse.values)

    def test_default_time(self):
        # Check that the time defaults work correctly.
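        # By default the within-group time index is 0, 1, 2, ..., so passing
        # an explicitly constructed T of that form should not change the fit.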

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model
        T = np.zeros(len(endog))
        idx = set(group)
        for ii in idx:
            jj = np.flatnonzero(group == ii)
            T[jj] = lrange(len(jj))

        family = Binomial()
        va = Autoregressive()

        md1 = GEE(endog, exog, group, family=family, cov_struct=va)
        mdf1 = md1.fit()

        md2 = GEE(endog, exog, group, time=T, family=family,
                  cov_struct=va)
        mdf2 = md2.fit()

        assert_almost_equal(mdf1.params, mdf2.params, decimal=6)
        assert_almost_equal(mdf1.standard_errors(),
                            mdf2.standard_errors(), decimal=6)

    def test_logistic(self):
        # R code for comparing results:

        # library(gee)
        # Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
        # Y = Z[,2]
        # Id = Z[,1]
        # X1 = Z[,3]
        # X2 = Z[,4]
        # X3 = Z[,5]

        # mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #         corstr="independence")
        # smi = summary(mi)
        # u = coefficients(smi)
        # cfi = paste(u[,1], collapse=",")
        # sei = paste(u[,4], collapse=",")

        # me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #         corstr="exchangeable")
        # sme = summary(me)
        # u = coefficients(sme)
        # cfe = paste(u[,1], collapse=",")
        # see = paste(u[,4], collapse=",")

        # ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #         corstr="AR-M")
        # sma = summary(ma)
        # u = coefficients(sma)
        # cfa = paste(u[,1], collapse=",")
        # sea = paste(u[,4], collapse=",")

        # sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
        # sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model
        T = np.zeros(len(endog))
        idx = set(group)
        for ii in idx:
            jj = np.flatnonzero(group == ii)
            T[jj] = lrange(len(jj))

        family = Binomial()
        ve = Exchangeable()
        vi = Independence()
        va = Autoregressive()

        # From R gee
        cf = [[0.0167272965285882, 1.13038654425893,
               -1.86896345082962, 1.09397608331333],
              [0.0178982283915449, 1.13118798191788,
               -1.86133518416017, 1.08944256230299],
              [0.0109621937947958, 1.13226505028438,
               -1.88278757333046, 1.09954623769449]]
        se = [[0.127291720283049, 0.166725808326067,
               0.192430061340865, 0.173141068839597],
              [0.127045031730155, 0.165470678232842,
               0.192052750030501, 0.173174779369249],
              [0.127240302296444, 0.170554083928117,
               0.191045527104503, 0.169776150974586]]

        for j, v in enumerate((vi, ve, va)):
            md = GEE(endog, exog, group, T, family, v)
            mdf = md.fit()
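            # Only the independence and exchangeable fits are compared to R;
            # the AR correlation estimation details differ across packages
            # (see the module docstring).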
            if v is not va:
                assert_almost_equal(mdf.params, cf[j], decimal=6)
                assert_almost_equal(mdf.standard_errors(), se[j],
                                    decimal=6)

        # Test with formulas
        D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                           axis=1)
        D = pd.DataFrame(D)
        D.columns = ["Y", "Id", ] + ["X%d" % (k + 1)
                                     for k in range(exog.shape[1] - 1)]
        for j, v in enumerate((vi, ve)):
            md = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", D,
                                  family=family, cov_struct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=6)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=6)

        # Check for run-time exceptions in summary
        mdf.summary()

    def test_autoregressive(self):

        dep_params_true = [0, 0.589208623896, 0.559823804948]

        params_true = [[1.08043787, 1.12709319, 0.90133927],
                       [0.9613677, 1.05826987, 0.90832055],
                       [1.05370439, 0.96084864, 0.93923374]]

        np.random.seed(342837482)

        num_group = 100
        ar_param = 0.5
        k = 3

        ga = Gaussian()

        for gsize in 1, 2, 3:

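            # AR(1) correlation matrix: corr(e_i, e_j) = ar_param**|i - j|.
            # Multiplying iid normal draws by its Cholesky factor produces
            # errors with exactly this within-group correlation.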
            ix = np.arange(gsize)[:, None] - np.arange(gsize)[None, :]
            ix = np.abs(ix)
            cmat = ar_param ** ix
            cmat_r = np.linalg.cholesky(cmat)

            endog = []
            exog = []
            groups = []
            for i in range(num_group):
                x = np.random.normal(size=(gsize, k))
                exog.append(x)
                expval = x.sum(1)
                errors = np.dot(cmat_r, np.random.normal(size=gsize))
                endog.append(expval + errors)
                groups.append(i * np.ones(gsize))

            endog = np.concatenate(endog)
            groups = np.concatenate(groups)
            exog = np.concatenate(exog, axis=0)

            ar = Autoregressive()
            md = GEE(endog, exog, groups, family=ga, cov_struct=ar)
            mdf = md.fit()
            assert_almost_equal(ar.dep_params, dep_params_true[gsize - 1])
            assert_almost_equal(mdf.params, params_true[gsize - 1])

    def test_post_estimation(self):

        family = Gaussian()
        endog, exog, group = load_data("gee_linear_1.csv")

        ve = Exchangeable()

        md = GEE(endog, exog, group, None, family, ve)
        mdf = md.fit()

        assert_almost_equal(np.dot(exog, mdf.params),
                            mdf.fittedvalues)
        assert_almost_equal(endog - np.dot(exog, mdf.params),
                            mdf.resid)

    def test_scoretest(self):
        # Regression tests

        np.random.seed(6432)
        n = 200  # Must be divisible by 4
        exog = np.random.normal(size=(n, 4))
        endog = exog[:, 0] + exog[:, 1] + exog[:, 2]
        endog += 3 * np.random.normal(size=n)
        group = np.kron(np.arange(n / 4), np.ones(4))

        # Test under the null.
        L = np.array([[1., -1, 0, 0]])
        R = np.array([0., ])
        family = Gaussian()
        va = Independence()
        mod1 = GEE(endog, exog, group, family=family,
                   cov_struct=va, constraint=(L, R))
        mod1.fit()
        assert_almost_equal(mod1.score_test_results["statistic"],
                            1.08126334)
        assert_almost_equal(mod1.score_test_results["p-value"],
                            0.2984151086)

        # Test under the alternative.
        L = np.array([[1., -1, 0, 0]])
        R = np.array([1.0, ])
        family = Gaussian()
        va = Independence()
        mod2 = GEE(endog, exog, group, family=family,
                   cov_struct=va, constraint=(L, R))
        mod2.fit()
        assert_almost_equal(mod2.score_test_results["statistic"],
                            3.491110965)
        assert_almost_equal(mod2.score_test_results["p-value"],
                            0.0616991659)

        # Compare to Wald tests
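        # The score and Wald tests are asymptotically equivalent, so their
        # p-values should be close.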
        exog = np.random.normal(size=(n, 2))
        L = np.array([[1, -1]])
        R = np.array([0.])
        f = np.r_[1, -1]
        for i in range(10):
            endog = exog[:, 0] + (0.5 + i / 10.) * exog[:, 1] +\
                np.random.normal(size=n)
            family = Gaussian()
            va = Independence()
            mod0 = GEE(endog, exog, group, family=family,
                       cov_struct=va)
            rslt0 = mod0.fit()
            family = Gaussian()
            va = Independence()
            mod1 = GEE(endog, exog, group, family=family,
                       cov_struct=va, constraint=(L, R))
            mod1.fit()
            se = np.sqrt(np.dot(f, np.dot(rslt0.cov_params(), f)))
            wald_z = np.dot(f, rslt0.params) / se
            wald_p = 2 * norm.cdf(-np.abs(wald_z))
            score_p = mod1.score_test_results["p-value"]
            assert_array_less(np.abs(wald_p - score_p), 0.02)

    def test_constraint_covtype(self):
        # Test constraints with different cov types
        np.random.seed(6432)
        n = 200
        exog = np.random.normal(size=(n, 4))
        endog = exog[:, 0] + exog[:, 1] + exog[:, 2]
        endog += 3 * np.random.normal(size=n)
        group = np.kron(np.arange(n / 4), np.ones(4))
        L = np.array([[1., -1, 0, 0]])
        R = np.array([0., ])
        family = Gaussian()
        va = Independence()
        for cov_type in "robust", "naive", "bias_reduced":
            model = GEE(endog, exog, group, family=family,
                        cov_struct=va, constraint=(L, R))
            result = model.fit(cov_type=cov_type)
            result.standard_errors(cov_type=cov_type)
            assert_allclose(result.cov_robust.shape, np.r_[4, 4])
            assert_allclose(result.cov_naive.shape, np.r_[4, 4])
            if cov_type == "bias_reduced":
                assert_allclose(result.cov_robust_bc.shape, np.r_[4, 4])

    def test_linear(self):
        # library(gee)

        # Z = read.csv("results/gee_linear_1.csv", header=FALSE)
        # Y = Z[,2]
        # Id = Z[,1]
        # X1 = Z[,3]
        # X2 = Z[,4]
        # X3 = Z[,5]
        # mi = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
        #         corstr="independence", tol=1e-8, maxit=100)
        # smi = summary(mi)
        # u = coefficients(smi)

        # cfi = paste(u[,1], collapse=",")
        # sei = paste(u[,4], collapse=",")

        # me = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
        #         corstr="exchangeable", tol=1e-8, maxit=100)
        # sme = summary(me)
        # u = coefficients(sme)

        # cfe = paste(u[,1], collapse=",")
        # see = paste(u[,4], collapse=",")

        # sprintf("cf = [[%s],[%s]]", cfi, cfe)
        # sprintf("se = [[%s],[%s]]", sei, see)

        family = Gaussian()

        endog, exog, group = load_data("gee_linear_1.csv")

        vi = Independence()
        ve = Exchangeable()

        # From R gee
        cf = [[-0.01850226507491, 0.81436304278962,
               -1.56167635393184, 0.794239361055003],
              [-0.0182920577154767, 0.814898414022467,
               -1.56194040106201, 0.793499517527478]]
        se = [[0.0440733554189401, 0.0479993639119261,
               0.0496045952071308, 0.0479467597161284],
              [0.0440369906460754, 0.0480069787567662,
               0.049519758758187, 0.0479760443027526]]

        for j, v in enumerate((vi, ve)):
            md = GEE(endog, exog, group, None, family, v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=10)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=10)

        # Test with formulas
        D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                           axis=1)
        D = pd.DataFrame(D)
        D.columns = ["Y", "Id", ] + ["X%d" % (k + 1)
                                     for k in range(exog.shape[1] - 1)]
        for j, v in enumerate((vi, ve)):
            md = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", D,
                                  family=family, cov_struct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=10)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=10)

    def test_linear_constrained(self):

        family = Gaussian()

        exog = np.random.normal(size=(300, 4))
        exog[:, 0] = 1
        endog = np.dot(exog, np.r_[1, 1, 0, 0.2]) +\
            np.random.normal(size=300)
        group = np.kron(np.arange(100), np.r_[1, 1, 1])

        vi = Independence()
        ve = Exchangeable()

        L = np.r_[[[0, 0, 0, 1]]]
        R = np.r_[0, ]

        for j, v in enumerate((vi, ve)):
            md = GEE(endog, exog, group, None, family, v,
                     constraint=(L, R))
            mdf = md.fit()
            assert_almost_equal(mdf.params[3], 0, decimal=10)

    def test_nested_linear(self):

        family = Gaussian()

        endog, exog, group = load_data("gee_nested_linear_1.csv")

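        # The nested (sub-group) labels split each block of 10 observations
        # into two sub-clusters of 5 observations each.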
        group_n = []
        for i in range(endog.shape[0] // 10):
            group_n.extend([0, ] * 5)
            group_n.extend([1, ] * 5)
        group_n = np.array(group_n)[:, None]

        dp = Independence()
        md = GEE(endog, exog, group, None, family, dp)
        mdf1 = md.fit()

        # From statsmodels.GEE (not an independent test)
        cf = np.r_[-0.1671073,  1.00467426, -2.01723004,  0.97297106]
        se = np.r_[0.08629606,  0.04058653,  0.04067038,  0.03777989]
        assert_almost_equal(mdf1.params, cf, decimal=6)
        assert_almost_equal(mdf1.standard_errors(), se,
                            decimal=6)

        ne = Nested()
        md = GEE(endog, exog, group, None, family, ne,
                 dep_data=group_n)
        mdf2 = md.fit(start_params=mdf1.params)

        # From statsmodels.GEE (not an independent test)
        cf = np.r_[-0.16655319,  1.02183688, -2.00858719,  1.00101969]
        se = np.r_[0.08632616,  0.02913582,  0.03114428,  0.02893991]
        assert_almost_equal(mdf2.params, cf, decimal=6)
        assert_almost_equal(mdf2.standard_errors(), se,
                            decimal=6)

    def test_ordinal(self):

        family = Binomial()

        endog, exog, groups = load_data("gee_ordinal_1.csv",
                                        icept=False)

        va = GlobalOddsRatio("ordinal")

        mod = OrdinalGEE(endog, exog, groups, None, family, va)
        rslt = mod.fit()

        # Regression test
        cf = np.r_[1.09250002, 0.0217443, -0.39851092, -0.01812116,
                   0.03023969, 1.18258516, 0.01803453, -1.10203381]
        assert_almost_equal(rslt.params, cf, decimal=5)

        # Regression test
        se = np.r_[0.10883461, 0.10330197, 0.11177088, 0.05486569,
                   0.05997153, 0.09168148, 0.05953324, 0.0853862]
        assert_almost_equal(rslt.bse, se, decimal=5)

        # Check that we get the correct results type
        assert_equal(type(rslt), OrdinalGEEResultsWrapper)
        assert_equal(type(rslt._results), OrdinalGEEResults)


    def test_ordinal_formula(self):

        np.random.seed(434)
        n = 40
        y = np.random.randint(0, 3, n)
        groups = np.arange(n)
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)

        df = pd.DataFrame({"y": y, "groups": groups, "x1": x1, "x2": x2})

        # smoke test
        model = OrdinalGEE.from_formula("y ~ 0 + x1 + x2", groups, data=df)
        model.fit()

        # smoke test
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            model = NominalGEE.from_formula("y ~ 0 + x1 + x2", groups, data=df)
            model.fit()

    def test_ordinal_independence(self):

        np.random.seed(434)
        n = 40
        y = np.random.randint(0, 3, n)
        groups = np.kron(np.arange(n / 2), np.r_[1, 1])
        x = np.random.normal(size=(n, 1))

        # smoke test
        odi = sm.cov_struct.OrdinalIndependence()
        model1 = OrdinalGEE(y, x, groups, cov_struct=odi)
        model1.fit()

    def test_nominal_independence(self):

        np.random.seed(434)
        n = 40
        y = np.random.randint(0, 3, n)
        groups = np.kron(np.arange(n / 2), np.r_[1, 1])
        x = np.random.normal(size=(n, 1))

        # smoke test
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            nmi = sm.cov_struct.NominalIndependence()
            model1 = NominalGEE(y, x, groups, cov_struct=nmi)
            model1.fit()

    @skipif(not have_matplotlib, reason='matplotlib not available')
    def test_ordinal_plot(self):
        family = Binomial()

        endog, exog, groups = load_data("gee_ordinal_1.csv",
                                        icept=False)

        va = GlobalOddsRatio("ordinal")

        mod = OrdinalGEE(endog, exog, groups, None, family, va)
        rslt = mod.fit()

        # Smoke test for figure
        fig = rslt.plot_distribution()
        assert_equal(isinstance(fig, plt.Figure), True)
        plt.close(fig)

    def test_nominal(self):

        endog, exog, groups = load_data("gee_nominal_1.csv",
                                        icept=False)

        # Test with independence correlation
        va = Independence()
        mod1 = NominalGEE(endog, exog, groups, cov_struct=va)
        rslt1 = mod1.fit()

        # Regression test
        cf1 = np.r_[0.450009, 0.451959, -0.918825, -0.468266]
        se1 = np.r_[0.08915936, 0.07005046, 0.12198139, 0.08281258]
        assert_allclose(rslt1.params, cf1, rtol=1e-5, atol=1e-5)
        assert_allclose(rslt1.standard_errors(), se1, rtol=1e-5, atol=1e-5)

        # Test with global odds ratio dependence
        va = GlobalOddsRatio("nominal")
        mod2 = NominalGEE(endog, exog, groups, cov_struct=va)
        rslt2 = mod2.fit(start_params=rslt1.params)

        # Regression test
        cf2 = np.r_[0.455365, 0.415334, -0.916589, -0.502116]
        se2 = np.r_[0.08803614, 0.06628179, 0.12259726, 0.08411064]
        assert_allclose(rslt2.params, cf2, rtol=1e-5, atol=1e-5)
        assert_allclose(rslt2.standard_errors(), se2, rtol=1e-5, atol=1e-5)

        # Make sure we get the correct results type
        assert_equal(type(rslt1), NominalGEEResultsWrapper)
        assert_equal(type(rslt1._results), NominalGEEResults)

    def test_poisson(self):
        # library(gee)
        # Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
        # Y = Z[,2]
        # Id = Z[,1]
        # X1 = Z[,3]
        # X2 = Z[,4]
        # X3 = Z[,5]
        # X4 = Z[,6]
        # X5 = Z[,7]

        # mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
        #        corstr="independence", scale.fix=TRUE)
        # smi = summary(mi)
        # u = coefficients(smi)
        # cfi = paste(u[,1], collapse=",")
        # sei = paste(u[,4], collapse=",")

        # me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
        #        corstr="exchangeable", scale.fix=TRUE)
        # sme = summary(me)

        # u = coefficients(sme)
        # cfe = paste(u[,1], collapse=",")
        # see = paste(u[,4], collapse=",")

        # sprintf("cf = [[%s],[%s]]", cfi, cfe)
        # sprintf("se = [[%s],[%s]]", sei, see)

        family = Poisson()

        endog, exog, group_n = load_data("gee_poisson_1.csv")

        vi = Independence()
        ve = Exchangeable()

        # From R gee
        cf = [[-0.0364450410793481, -0.0543209391301178,
               0.0156642711741052, 0.57628591338724,
               -0.00465659951186211, -0.477093153099256],
              [-0.0315615554826533, -0.0562589480840004,
               0.0178419412298561, 0.571512795340481,
               -0.00363255566297332, -0.475971696727736]]
        se = [[0.0611309237214186, 0.0390680524493108,
               0.0334234174505518, 0.0366860768962715,
               0.0304758505008105, 0.0316348058881079],
              [0.0610840153582275, 0.0376887268649102,
               0.0325168379415177, 0.0369786751362213,
               0.0296141014225009, 0.0306115470200955]]

        for j, v in enumerate((vi, ve)):
            md = GEE(endog, exog, group_n, None, family, v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=5)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=6)

        # Test with formulas
        D = np.concatenate((endog[:, None], group_n[:, None],
                            exog[:, 1:]), axis=1)
        D = pd.DataFrame(D)
        D.columns = ["Y", "Id", ] + ["X%d" % (k + 1)
                                     for k in range(exog.shape[1] - 1)]
        for j, v in enumerate((vi, ve)):
            md = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id",
                                  D, family=family, cov_struct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=5)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=6)

    def test_groups(self):
        # Test various group structures (nonconsecutive, different
        # group sizes, not ordered, string labels)

        n = 40
        x = np.random.normal(size=(n, 2))
        y = np.random.normal(size=n)

        # groups with unequal group sizes
        groups = np.kron(np.arange(n / 4), np.ones(4))
        groups[8:12] = 3
        groups[34:36] = 9

        model1 = GEE(y, x, groups=groups)
        result1 = model1.fit()

        # Unordered groups
        ix = np.random.permutation(n)
        y1 = y[ix]
        x1 = x[ix, :]
        groups1 = groups[ix]

        model2 = GEE(y1, x1, groups=groups1)
        result2 = model2.fit()

        assert_allclose(result1.params, result2.params)
        assert_allclose(result1.tvalues, result2.tvalues)

        # group labels are strings
        mp = {}
        import string
        for j, g in enumerate(set(groups)):
            mp[g] = string.ascii_letters[j:j + 4]
        groups2 = [mp[g] for g in groups]

        model3 = GEE(y, x, groups=groups2)
        result3 = model3.fit()

        assert_allclose(result1.params, result3.params)
        assert_allclose(result1.tvalues, result3.tvalues)

    def test_compare_OLS(self):
        # Gaussian GEE with independence correlation should agree
        # exactly with OLS for parameter estimates and standard errors
        # derived from the naive covariance estimate.
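        # (The independence estimating equations are the OLS normal
        # equations, so the point estimates match; the naive standard
        # errors likewise agree with the OLS standard errors.)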

        vs = Independence()
        family = Gaussian()

        Y = np.random.normal(size=100)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.kron(lrange(20), np.ones(5))

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        md = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                              family=family, cov_struct=vs)
        mdf = md.fit()

        ols = smf.ols("Y ~ X1 + X2 + X3", data=D).fit()

        # compare the unwrapped results; assert_xxx doesn't handle wrappers
        ols = ols._results

        assert_almost_equal(ols.params, mdf.params, decimal=10)

        se = mdf.standard_errors(cov_type="naive")
        assert_almost_equal(ols.bse, se, decimal=10)

        naive_tvalues = mdf.params / \
            np.sqrt(np.diag(mdf.cov_naive))
        assert_almost_equal(naive_tvalues, ols.tvalues, decimal=10)

    def test_formulas(self):
        # Check formulas, especially passing groups and time as either
        # variable names or arrays.

        n = 100
        Y = np.random.normal(size=n)
        X1 = np.random.normal(size=n)
        mat = np.concatenate((np.ones((n, 1)), X1[:, None]), axis=1)
        Time = np.random.uniform(size=n)
        groups = np.kron(lrange(20), np.ones(5))

        data = pd.DataFrame({"Y": Y, "X1": X1, "Time": Time, "groups": groups})

        va = Autoregressive()
        family = Gaussian()

        mod1 = GEE(Y, mat, groups, time=Time, family=family,
                   cov_struct=va)
        rslt1 = mod1.fit()

        mod2 = GEE.from_formula("Y ~ X1", groups, data, time=Time,
                                family=family, cov_struct=va)
        rslt2 = mod2.fit()

        mod3 = GEE.from_formula("Y ~ X1", groups, data, time="Time",
                                family=family, cov_struct=va)
        rslt3 = mod3.fit()

        mod4 = GEE.from_formula("Y ~ X1", "groups", data, time=Time,
                                family=family, cov_struct=va)
        rslt4 = mod4.fit()

        mod5 = GEE.from_formula("Y ~ X1", "groups", data, time="Time",
                                family=family, cov_struct=va)
        rslt5 = mod5.fit()

        assert_almost_equal(rslt1.params, rslt2.params, decimal=8)
        assert_almost_equal(rslt1.params, rslt3.params, decimal=8)
        assert_almost_equal(rslt1.params, rslt4.params, decimal=8)
        assert_almost_equal(rslt1.params, rslt5.params, decimal=8)

        check_wrapper(rslt2)

    def test_compare_logit(self):

        vs = Independence()
        family = Binomial()

        Y = 1 * (np.random.normal(size=100) < 0)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                                family=family, cov_struct=vs)
        rslt1 = mod1.fit()

        mod2 = smf.logit("Y ~ X1 + X2 + X3", data=D)
        rslt2 = mod2.fit(disp=False)

        assert_almost_equal(rslt1.params.values, rslt2.params.values,
                            decimal=10)

    def test_compare_poisson(self):

        vs = Independence()
        family = Poisson()

        Y = np.ceil(-np.log(np.random.uniform(size=100)))
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                                family=family, cov_struct=vs)
        rslt1 = mod1.fit()

        mod2 = smf.poisson("Y ~ X1 + X2 + X3", data=D)
        rslt2 = mod2.fit(disp=False)

        assert_almost_equal(rslt1.params.values, rslt2.params.values,
                            decimal=10)

    def test_predict(self):

        n = 50
        np.random.seed(4324)
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(n / 2), np.r_[1, 1])
        offset = np.random.uniform(1, 2, size=n)
        Y = np.random.normal(0.1 * (X1 + X2) + offset, size=n)
        data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                             "offset": offset})

        fml = "Y ~ X1 + X2"
        model = GEE.from_formula(fml, groups, data, family=Gaussian(),
                                 offset="offset")
        result = model.fit(start_params=[0, 0.1, 0.1])
        assert_equal(result.converged, True)

        pred1 = result.predict()
        pred2 = result.predict(offset=data.offset)
        pred3 = result.predict(exog=data[["X1", "X2"]], offset=data.offset)
        pred4 = result.predict(exog=data[["X1", "X2"]], offset=0 * data.offset)
        pred5 = result.predict(offset=0 * data.offset)

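        # With the identity link the offset enters additively, so predictions
        # made with a zero offset differ from the default predictions by
        # exactly the offset.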
        assert_allclose(pred1, pred2)
        assert_allclose(pred1, pred3)
        assert_allclose(pred1, pred4 + data.offset)
        assert_allclose(pred1, pred5 + data.offset)

        x1_new = np.random.normal(size=10)
        x2_new = np.random.normal(size=10)
        new_exog = pd.DataFrame({"X1": x1_new, "X2": x2_new})
        pred6 = result.predict(exog=new_exog)
        params = result.params
        pred6_correct = params[0] + params[1] * x1_new + params[2] * x2_new
        assert_allclose(pred6, pred6_correct)

    def test_stationary_grid(self):

        endog = np.r_[4, 2, 3, 1, 4, 5, 6, 7, 8, 3, 2, 4.]
        exog = np.r_[2, 3, 1, 4, 3, 2, 5, 4, 5, 6, 3, 2]
        group = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
        exog = sm.add_constant(exog)

        cs = Stationary(max_lag=2, grid=True)
        model = sm.GEE(endog, exog, group, cov_struct=cs)
        result = model.fit()
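        # Stata scales the robust SE by sqrt(N / (N - g)); here N = 12
        # observations and the average group size is g = 3, giving the
        # 12/9 factor below.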
        se = result.bse * np.sqrt(12 / 9.)  # Stata adjustment

        assert_allclose(cs.covariance_matrix(np.r_[1, 1, 1], 0)[0].sum(),
                        6.4633538285149452)

        # Obtained from Stata using:
        # xtgee y x, i(g) vce(robust) corr(Stationary2)
        assert_allclose(result.params, np.r_[
                        4.463968, -0.0386674], rtol=1e-5, atol=1e-5)
        assert_allclose(se, np.r_[0.5217202, 0.2800333], rtol=1e-5, atol=1e-5)

    def test_stationary_nogrid(self):

        # First test special case where the data follow a grid but we
        # fit using nogrid
        endog = np.r_[4, 2, 3, 1, 4, 5, 6, 7, 8, 3, 2, 4.]
        exog = np.r_[2, 3, 1, 4, 3, 2, 5, 4, 5, 6, 3, 2]
        time = np.r_[0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
        group = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]

        exog = sm.add_constant(exog)

        model = sm.GEE(endog, exog, group,
                       cov_struct=Stationary(max_lag=2, grid=False))
        result = model.fit()
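        # Same sqrt(N / (N - g)) adjustment as in test_stationary_grid
        # (N = 12, g = 3).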
        se = result.bse * np.sqrt(12 / 9.)  # Stata adjustment

        # Obtained from Stata using:
        # xtgee y x, i(g) vce(robust) corr(Stationary2)
        assert_allclose(result.params, np.r_[
                        4.463968, -0.0386674], rtol=1e-5, atol=1e-5)
        assert_allclose(se, np.r_[0.5217202, 0.2800333], rtol=1e-5, atol=1e-5)

        # Smoke test for no grid
        time = np.r_[0, 1, 3, 0, 2, 3, 0, 2, 3, 0, 1, 2][:, None]
        model = sm.GEE(endog, exog, group, time=time,
                       cov_struct=Stationary(max_lag=4, grid=False))
        result = model.fit()

    def test_predict_exposure(self):

        n = 50
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(25), np.r_[1, 1])
        offset = np.random.uniform(1, 2, size=n)
        exposure = np.random.uniform(1, 2, size=n)
        Y = np.random.poisson(0.1 * (X1 + X2) + offset +
                              np.log(exposure), size=n)
        data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                             "offset": offset, "exposure": exposure})

        fml = "Y ~ X1 + X2"
        model = GEE.from_formula(fml, groups, data, family=Poisson(),
                                 offset="offset", exposure="exposure")
        result = model.fit()
        assert_equal(result.converged, True)

        pred1 = result.predict()
        pred2 = result.predict(offset=data["offset"])
        pred3 = result.predict(exposure=data["exposure"])
        pred4 = result.predict(
            offset=data["offset"], exposure=data["exposure"])
        pred5 = result.predict(exog=data[-10:],
                               offset=data["offset"][-10:],
                               exposure=data["exposure"][-10:])
        # without patsy
        pred6 = result.predict(exog=result.model.exog[-10:],
                               offset=data["offset"][-10:],
                               exposure=data["exposure"][-10:],
                               transform=False)
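        # By default, predict() uses the offset and exposure stored on the
        # model, so passing the same values explicitly should not change
        # the predictions.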
        assert_allclose(pred1, pred2)
        assert_allclose(pred1, pred3)
        assert_allclose(pred1, pred4)
        assert_allclose(pred1[-10:], pred5)
        assert_allclose(pred1[-10:], pred6)

    def test_offset_formula(self):
        # Test various ways of passing offset and exposure to `from_formula`.

        n = 50
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(25), np.r_[1, 1])
        offset = np.random.uniform(1, 2, size=n)
        exposure = np.exp(offset)
        Y = np.random.poisson(0.1 * (X1 + X2) + 2 * offset, size=n)
        data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                             "offset": offset, "exposure": exposure})

        fml = "Y ~ X1 + X2"
        model1 = GEE.from_formula(fml, groups, data, family=Poisson(),
                                  offset="offset")
        result1 = model1.fit()
        assert_equal(result1.converged, True)

        model2 = GEE.from_formula(fml, groups, data, family=Poisson(),
                                  offset=offset)
        result2 = model2.fit(start_params=result1.params)
        assert_allclose(result1.params, result2.params)
        assert_equal(result2.converged, True)

        model3 = GEE.from_formula(fml, groups, data, family=Poisson(),
                                  exposure=exposure)
        result3 = model3.fit(start_params=result1.params)
        assert_allclose(result1.params, result3.params)
        assert_equal(result3.converged, True)

        model4 = GEE.from_formula(fml, groups, data, family=Poisson(),
                                  exposure="exposure")
        result4 = model4.fit(start_params=result1.params)
        assert_allclose(result1.params, result4.params)
        assert_equal(result4.converged, True)

        model5 = GEE.from_formula(fml, groups, data, family=Poisson(),
                                  exposure="exposure", offset="offset")
        result5 = model5.fit()
        assert_equal(result5.converged, True)

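        # Since exposure = exp(offset), exposure enters the linear predictor
        # as log(exposure) = offset, so using an offset of 2*offset (model6)
        # should reproduce the offset + exposure fit (model5).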
        model6 = GEE.from_formula(fml, groups, data, family=Poisson(),
                                  offset=2 * offset)
        result6 = model6.fit(start_params=result5.params)
        assert_allclose(result5.params, result6.params)
        assert_equal(result6.converged, True)

    def test_sensitivity(self):

        va = Exchangeable()
        family = Gaussian()

        np.random.seed(34234)
        n = 100
        Y = np.random.normal(size=n)
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(50), np.r_[1, 1])

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2})

        mod = GEE.from_formula("Y ~ X1 + X2", groups, D,
                               family=family, cov_struct=va)
        rslt = mod.fit()
        ps = rslt.params_sensitivity(0, 0.5, 2)
        assert_almost_equal(len(ps), 2)
        assert_almost_equal([x.cov_struct.dep_params for x in ps],
                            [0.0, 0.5])

        # Regression test
        assert_almost_equal([x.params[0] for x in ps],
                            [0.1696214707458818, 0.17836097387799127])

    def test_equivalence(self):
        """
        The Equivalence covariance structure can represent an
        exchangeable covariance structure.  Here we check that the two
        approaches give essentially the same results.
        """

        np.random.seed(3424)
        endog = np.random.normal(size=20)
        exog = np.random.normal(size=(20, 2))
        exog[:, 0] = 1
        groups = np.kron(np.arange(5), np.ones(4))
        groups[12:] = 3  # Create unequal size groups

        # Set up an Equivalence covariance structure to mimic an
        # Exchangeable covariance structure.
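        # For each group, `pairs` maps a label to a pair of aligned index
        # arrays of data positions; label 0 collects the diagonal positions
        # (a shared variance parameter) and label 1 all within-group
        # off-diagonal positions (a shared covariance parameter), which is
        # the exchangeable pattern.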
        pairs = {}
        start = [0, 4, 8, 12]
        for k in range(4):
            pairs[k] = {}

            # Diagonal values (variance parameters)
            if k < 3:
                pairs[k][0] = (start[k] + np.r_[0, 1, 2, 3],
                               start[k] + np.r_[0, 1, 2, 3])
            else:
                pairs[k][0] = (start[k] + np.r_[0, 1, 2, 3, 4, 5, 6, 7],
                               start[k] + np.r_[0, 1, 2, 3, 4, 5, 6, 7])

            # Off-diagonal pairs (covariance parameters)
            if k < 3:
                a, b = np.tril_indices(4, -1)
                pairs[k][1] = (start[k] + a, start[k] + b)
            else:
                a, b = np.tril_indices(8, -1)
                pairs[k][1] = (start[k] + a, start[k] + b)

        ex = sm.cov_struct.Exchangeable()
        model1 = sm.GEE(endog, exog, groups, cov_struct=ex)
        result1 = model1.fit()

        for return_cov in False, True:

            ec = sm.cov_struct.Equivalence(pairs, return_cov=return_cov)
            model2 = sm.GEE(endog, exog, groups, cov_struct=ec)
            result2 = model2.fit()

            # Use large atol/rtol for the correlation case since there
            # are some small differences in the results due to degree
            # of freedom differences.
            if return_cov is True:
                atol, rtol = 1e-6, 1e-6
            else:
                atol, rtol = 1e-3, 1e-3
            assert_allclose(result1.params, result2.params,
                            atol=atol, rtol=rtol)
            assert_allclose(result1.bse, result2.bse, atol=atol, rtol=rtol)
            assert_allclose(result1.scale, result2.scale, atol=atol, rtol=rtol)

    def test_equivalence_from_pairs(self):

        np.random.seed(3424)
        endog = np.random.normal(size=50)
        exog = np.random.normal(size=(50, 2))
        exog[:, 0] = 1
        groups = np.kron(np.arange(5), np.ones(10))
        groups[30:] = 3  # Create unequal size groups

        # Set up labels.
        labels = np.kron(np.arange(5), np.ones(10)).astype(np.int32)
        labels = labels[np.random.permutation(len(labels))]

        eq = sm.cov_struct.Equivalence(labels=labels, return_cov=True)
        model1 = sm.GEE(endog, exog, groups, cov_struct=eq)

        # Call this directly instead of letting init do it to get the
        # result before reindexing.
        eq._pairs_from_labels()

        # Make sure the size is correct to hold every element.
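        # A group of size m has m * (m + 1) / 2 distinct (i, j) pairs with
        # i <= j (the diagonal plus the lower triangle).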
        for g in model1.group_labels:
            p = eq.pairs[g]
            vl = [len(x[0]) for x in p.values()]
            m = sum(groups == g)
            assert_allclose(sum(vl), m * (m + 1) / 2)

        # Check for duplicates.
        ixs = set([])
        for g in model1.group_labels:
            for v in eq.pairs[g].values():
                for a, b in zip(v[0], v[1]):
                    ky = (a, b)
                    assert ky not in ixs
                    ixs.add(ky)

        # Smoke test
        eq = sm.cov_struct.Equivalence(labels=labels, return_cov=True)
        model1 = sm.GEE(endog, exog, groups, cov_struct=eq)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            model1.fit(maxiter=2)


class CheckConsistency(object):

    start_params = None

    def test_cov_type(self):
        mod = self.mod
        res_robust = mod.fit(start_params=self.start_params)
        res_naive = mod.fit(start_params=self.start_params,
                            cov_type='naive')
        res_robust_bc = mod.fit(start_params=self.start_params,
                                cov_type='bias_reduced')

        # call summary to make sure it doesn't change cov_type
        res_naive.summary()
        res_robust_bc.summary()

        # check cov_type
        assert_equal(res_robust.cov_type, 'robust')
        assert_equal(res_naive.cov_type, 'naive')
        assert_equal(res_robust_bc.cov_type, 'bias_reduced')

        # check bse and cov_params
        # we are comparing different runs of the optimization
        # bse in ordinal and multinomial have an atol around 5e-10 for two
        # consecutive calls to fit.
        rtol = 1e-8
        for (res, cov_type, cov) in [
                (res_robust, 'robust', res_robust.cov_robust),
                (res_naive, 'naive', res_robust.cov_naive),
                (res_robust_bc, 'bias_reduced', res_robust_bc.cov_robust_bc)
        ]:
            bse = np.sqrt(np.diag(cov))
            assert_allclose(res.bse, bse, rtol=rtol)
            if cov_type != 'bias_reduced':
                # cov_type=naive shortcuts calculation of bias reduced
                # covariance for efficiency
                bse = res_naive.standard_errors(cov_type=cov_type)
                assert_allclose(res.bse, bse, rtol=rtol)
            assert_allclose(res.cov_params(), cov, rtol=rtol, atol=1e-10)
            assert_allclose(res.cov_params_default, cov, rtol=rtol, atol=1e-10)

        # assert that we don't have a copy
        assert_(res_robust.cov_params_default is res_robust.cov_robust)
        assert_(res_naive.cov_params_default is res_naive.cov_naive)
        assert_(res_robust_bc.cov_params_default is
                res_robust_bc.cov_robust_bc)

        # check exception for misspelled cov_type
        assert_raises(ValueError, mod.fit, cov_type='robust_bc')


class TestGEEPoissonCovType(CheckConsistency):

    @classmethod
    def setup_class(cls):

        endog, exog, group_n = load_data("gee_poisson_1.csv")

        family = Poisson()
        vi = Independence()

        cls.mod = GEE(endog, exog, group_n, None, family, vi)

        cls.start_params = np.array([-0.03644504, -0.05432094,  0.01566427,
                                     0.57628591, -0.0046566,  -0.47709315])

    def test_wrapper(self):

        endog, exog, group_n = load_data("gee_poisson_1.csv",
                                         icept=False)
        endog = pd.Series(endog)
        exog = pd.DataFrame(exog)
        group_n = pd.Series(group_n)

        family = Poisson()
        vi = Independence()

        mod = GEE(endog, exog, group_n, None, family, vi)
        rslt2 = mod.fit()

        check_wrapper(rslt2)


class TestGEEPoissonFormulaCovType(CheckConsistency):

    @classmethod
    def setup_class(cls):

        endog, exog, group_n = load_data("gee_poisson_1.csv")

        family = Poisson()
        vi = Independence()
        # Test with formulas
        D = np.concatenate((endog[:, None], group_n[:, None],
                            exog[:, 1:]), axis=1)
        D = pd.DataFrame(D)
        D.columns = ["Y", "Id", ] + ["X%d" % (k + 1)
                                     for k in range(exog.shape[1] - 1)]

        cls.mod = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id",
                                   D, family=family, cov_struct=vi)

        cls.start_params = np.array([-0.03644504, -0.05432094,  0.01566427,
                                     0.57628591, -0.0046566,  -0.47709315])


class TestGEEOrdinalCovType(CheckConsistency):

    @classmethod
    def setup_class(cls):

        family = Binomial()

        endog, exog, groups = load_data("gee_ordinal_1.csv",
                                        icept=False)

        va = GlobalOddsRatio("ordinal")

        cls.mod = OrdinalGEE(endog, exog, groups, None, family, va)
        cls.start_params = np.array([1.09250002, 0.0217443, -0.39851092,
                                     -0.01812116, 0.03023969, 1.18258516,
                                     0.01803453, -1.10203381])

    def test_wrapper(self):

        endog, exog, groups = load_data("gee_ordinal_1.csv",
                                        icept=False)

        endog = pd.Series(endog, name='yendog')
        exog = pd.DataFrame(exog)
        groups = pd.Series(groups, name='the_group')

        family = Binomial()
        va = GlobalOddsRatio("ordinal")
        mod = OrdinalGEE(endog, exog, groups, None, family, va)
        rslt2 = mod.fit()

        check_wrapper(rslt2)


class TestGEEMultinomialCovType(CheckConsistency):

    @classmethod
    def setup_class(cls):

        endog, exog, groups = load_data("gee_nominal_1.csv",
                                        icept=False)

        # Test with independence correlation
        va = Independence()
        cls.mod = NominalGEE(endog, exog, groups, cov_struct=va)
        cls.start_params = np.array([0.44944752,  0.45569985, -0.92007064,
                                     -0.46766728])

    def test_wrapper(self):

        endog, exog, groups = load_data("gee_nominal_1.csv",
                                        icept=False)
        endog = pd.Series(endog, name='yendog')
        exog = pd.DataFrame(exog)
        groups = pd.Series(groups, name='the_group')

        va = Independence()
        mod = NominalGEE(endog, exog, groups, cov_struct=va)
        rslt2 = mod.fit()

        check_wrapper(rslt2)


@skipif(not have_matplotlib, reason='matplotlib not available')
def test_plots():

    np.random.seed(378)
    endog = np.random.normal(size=100)
    exog = np.random.normal(size=(100, 2))
    groups = np.kron(np.arange(50), np.r_[1, 1])

    model = sm.GEE(endog, exog, groups)
    result = model.fit()

    # Smoke tests
    fig = result.plot_added_variable(1)
    assert_equal(isinstance(fig, plt.Figure), True)
    plt.close(fig)
    fig = result.plot_partial_residuals(1)
    assert_equal(isinstance(fig, plt.Figure), True)
    plt.close(fig)
    fig = result.plot_ceres_residuals(1)
    assert_equal(isinstance(fig, plt.Figure), True)
    plt.close(fig)
    fig = result.plot_isotropic_dependence()
    assert_equal(isinstance(fig, plt.Figure), True)
    plt.close(fig)



def test_missing():
    # gh-1877
    data = [['id', 'al', 'status', 'fake', 'grps'],
            ['4A', 'A', 1, 1, 0],
            ['5A', 'A', 1, 2.0, 1],
            ['6A', 'A', 1, 3, 2],
            ['7A', 'A', 1, 2.0, 3],
            ['8A', 'A', 1, 1, 4],
            ['9A', 'A', 1, 2.0, 5],
            ['11A', 'A', 1, 1, 6],
            ['12A', 'A', 1, 2.0, 7],
            ['13A', 'A', 1, 1, 8],
            ['14A', 'A', 1, 1, 9],
            ['15A', 'A', 1, 1, 10],
            ['16A', 'A', 1, 2.0, 11],
            ['17A', 'A', 1, 3.0, 12],
            ['18A', 'A', 1, 3.0, 13],
            ['19A', 'A', 1, 2.0, 14],
            ['20A', 'A', 1, 2.0, 15],
            ['2C', 'C', 0, 3.0, 0],
            ['3C', 'C', 0, 1, 1],
            ['4C', 'C', 0, 1, 2],
            ['5C', 'C', 0, 2.0, 3],
            ['6C', 'C', 0, 1, 4],
            ['9C', 'C', 0, 1, 5],
            ['10C', 'C', 0, 3, 6],
            ['12C', 'C', 0, 3, 7],
            ['14C', 'C', 0, 2.5, 8],
            ['15C', 'C', 0, 1, 9],
            ['17C', 'C', 0, 1, 10],
            ['22C', 'C', 0, 1, 11],
            ['23C', 'C', 0, 1, 12],
            ['24C', 'C', 0, 1, 13],
            ['32C', 'C', 0, 2.0, 14],
            ['35C', 'C', 0, 1, 15]]

    df = pd.DataFrame(data[1:], columns=data[0])
    df.loc[df.fake == 1, 'fake'] = np.nan
    mod = smf.gee('status ~ fake', data=df, groups='grps',
                  cov_struct=sm.cov_struct.Independence(),
                  family=sm.families.Binomial())

    df = df.dropna().copy()
    df['constant'] = 1

    mod2 = GEE(df.status, df[['constant', 'fake']], groups=df.grps,
               cov_struct=sm.cov_struct.Independence(),
               family=sm.families.Binomial())

    assert_equal(mod.endog, mod2.endog)
    assert_equal(mod.exog, mod2.exog)
    assert_equal(mod.groups, mod2.groups)

    res = mod.fit()
    res2 = mod2.fit()

    assert_almost_equal(res.params.values, res2.params.values)


if __name__ == "__main__":
    import pytest
    pytest.main([__file__, '-vvs', '-x', '--pdb'])