from typing import List import pandas as pd import numpy as np import numpy.linalg as la # scipy.linalg yields slightly diff results (tsls) from scipy.linalg import sqrtm # notin `numpy.linalg` import scipy.stats as stats from econtools.util.gentools import force_list, force_df from econtools.metrics.regutil import (unpack_shac_args, flag_sample, flag_nonsingletons, set_sample, find_colinear_columns) from econtools.metrics.results import Results # Workhorse classes class RegBase(object): def __init__(self, df, y_name, x_name, **kwargs): self.df = df self.y_name = y_name self.__dict__.update(kwargs) self.sample_cols_labels = ( 'y_name', 'x_name', 'fe_name', 'cluster', 'shac_x', 'shac_y', 'awt_name' ) self.sample_store_labels = ( 'y', 'x', 'A', 'cluster_id', 'shac_x', 'shac_y', 'AWT' ) self.vars_in_reg = ('y', 'x') self.add_constant_to = 'x' # Set `vce_type` self.vce_type = _set_vce_type(self.vce_type, self.cluster, self.shac) # Unpack spatial HAC args sp_args = unpack_shac_args(self.shac) self.shac_x = sp_args[0] self.shac_y = sp_args[1] self.shac_band = sp_args[2] self.shac_kern = sp_args[3] # Force variable names to lists self.x_name = force_list(x_name) def main(self): self.set_sample() self.estimate() self.get_vce() self.set_dof() self.inference() self.results.pull_metadata(self) return self.results def set_sample(self): sample_cols = tuple( [self.__dict__[x] for x in self.sample_cols_labels]) self.sample = flag_sample(self.df, *sample_cols) if self.nosingles and self.fe_name: self.sample &= flag_nonsingletons(self.df, self.fe_name, self.sample) sample_vars = set_sample(self.df, self.sample, sample_cols) self.__dict__.update(dict(zip(self.sample_store_labels, sample_vars))) self.x = force_df(self.x) self.y = self.y.squeeze() # Force regression variables to float64 for var in self.vars_in_reg: self.__dict__[var] = self.__dict__[var].astype(np.float64) # Demean or add constant if self.fe_name is not None: self._demean_sample() elif self.addcons: _cons = np.ones(self.y.shape[0]) x = self.__dict__[self.add_constant_to] if x.empty: x = pd.DataFrame(_cons, columns=['_cons'], index=self.y.index) else: x['_cons'] = _cons self.__dict__[self.add_constant_to] = x # Re-weight sample if self.AWT is not None: self._weight_sample() def _demean_sample(self): self.y_raw = self.y.copy() for var in self.vars_in_reg: self.__dict__[var] = _demean(self.A, self.__dict__[var]) def _weight_sample(self): row_wt = _calc_aweights(self.AWT) for var in self.vars_in_reg: self.__dict__[var] = self.__dict__[var].multiply(row_wt, axis=0) def estimate(self): """ Defined by Implementation. Initializes self.Results object """ raise NotImplementedError def get_vce(self): """ Add estimates of Variance-Covariance matrix (VCE), yhat, and residuals to `results`. """ X_inner_sum, X_for_resid = self._prep_inference_mats() yhat = np.dot(X_for_resid, self.results.beta) resid = self.y - yhat # Check through VCE types xpx_inv = self.results.xpx_inv if self.vce_type is None: vce = vce_homosk(xpx_inv, resid) elif self.vce_type in ('robust', 'hc1'): vce = vce_robust(xpx_inv, resid, X_inner_sum) elif self.vce_type in ('hc2', 'hc3'): vce = vce_hc23(xpx_inv, resid, X_inner_sum, hctype=self.vce_type) elif self.vce_type == 'cluster': vce = vce_cluster(xpx_inv, resid, X_inner_sum, self.cluster_id) elif self.vce_type == 'shac': vce = vce_shac(xpx_inv, resid, X_inner_sum, self.shac_x, self.shac_y, self.shac_kern, self.shac_band) else: raise ValueError # Make sure it's symmetric (floating point error) vce = _wrapSigma((vce + vce.T) / 2, X_for_resid.columns) self.results._add_stat('vce', vce) if not self.save_mem: self.results._add_stat('yhat', yhat) self.results._add_stat('resid', resid) # Not VCE, but needs to go somewhere self.results._add_stat('sample', self.sample) def _prep_inference_mats(self): """ Set matrices for Sandwich estimator. Note: Keep as separate method for sub-classes to override when necessary. """ X_for_inner_sum = self.x X_for_residual = self.x return X_for_inner_sum, X_for_residual def set_dof(self): """ Set degrees of freedom used in hypothesis tests and do DoF correction on VCE matrix. """ N, K = self._set_NK() vce_type = self.vce_type if vce_type in (None, 'robust', 'hc1'): df, vce_correct = df_std(N, K) elif vce_type in ('hc2', 'hc3'): df, vce_correct = df_hc23(N, K) elif vce_type == 'cluster': df, vce_correct, g = df_cluster(N, K, self.cluster_id) self.results._add_stat('g', g) elif vce_type == 'shac': df, vce_correct = df_shac(N, K) self.results._add_stat('N', N) self.results._add_stat('K', K) self.results._add_stat('df_t', df) self.results._add_stat('_df_r', df) self.results._vce_correct = vce_correct self.results.vce *= vce_correct def _set_NK(self): """ Do this in a separate method so `IVReg` can tweak `K` """ # Set `N` and `K` N, K = self.x.shape if self.A is not None: self.fe_count = len(self.A.unique()) if not _fe_nested_in_cluster(self.cluster_id, self.A): K += self.fe_count # Adjust dof's for group means self.results.sst = self.y_raw self.results._nocons = True else: self.results._nocons = self.nocons return N, K def inference(self): vce = self.results.vce beta = self.results.beta t_df = self.results.df_t se = pd.Series(np.sqrt(np.diagonal(vce)), index=vce.columns) t_stat = beta.div(se) p_values = pd.Series( stats.t.cdf(-np.abs(t_stat), t_df)*2, # `t.cdf` is P(x<X) index=vce.columns ) self.results._add_stat('se', se) self.results._add_stat('t_stat', t_stat) self.results._add_stat('pt', p_values) conf_level = .95 crit_value = stats.t.ppf(conf_level + (1 - conf_level)/2, t_df) ci_lo = beta - crit_value*se ci_hi = beta + crit_value*se self.results._add_stat('ci_lo', ci_lo) self.results._add_stat('ci_hi', ci_hi) def _set_vce_type(vce_type, cluster, shac): """ Check for argument conflicts, then set `vce_type` if needed. """ if vce_type not in (None, 'robust', 'hc1', 'hc2', 'hc3', 'cluster', 'shac'): raise ValueError("VCE type '{}' is not supported".format(vce_type)) # Check for conflicts if cluster and shac: raise ValueError("Cannot use `cluster` and `shac` together.") elif cluster and vce_type != 'cluster' and vce_type is not None: raise ValueError("Cannot pass argument to `cluster` and set `vce_type`" " to something other than 'cluster'") elif shac and vce_type != 'shac' and vce_type is not None: raise ValueError("Cannot pass argument to `shac` and set `vce_type`" " to something other than 'shac'") # Set `vce_type` new_vce = 'cluster' if cluster else 'shac' if shac else vce_type return new_vce def _demean(A, df): """ Demean a matrix/DataFrame within group `A` """ # Ignore empty `df` (e.g. empty list of exogenous included regressors) if df is None or df.empty: return df else: group_name = A.name mean = df.groupby(A).mean() large_mean = force_df(A).join(mean, on=group_name).drop(group_name, axis=1) if df.ndim == 1: large_mean = large_mean.squeeze() demeaned = df - large_mean return demeaned def _calc_aweights(aw): scaled_total = aw.sum() / len(aw) row_weights = np.sqrt(aw / scaled_total) return row_weights def _fe_nested_in_cluster(cluster_id, A): """ Check if FE's are nested within clusters (affects DOF correction). """ if (cluster_id is None) or (A is None): return False elif (cluster_id.name == A.name): return True else: joint = pd.concat((cluster_id, A), axis=1) names = [cluster_id.name, A.name] pair_counts = joint.groupby(names)[A.name].count() num_of_clusters = pair_counts.groupby(level=A.name).count() return num_of_clusters.max() == 1 def _wrapSigma(Sigma, cols): return pd.DataFrame(Sigma, index=cols, columns=cols) class Regression(RegBase): def __init__(self, *args, **kwargs): super(Regression, self).__init__(*args, **kwargs) def estimate(self): if self.check_colinear: check_colinear_cols(self.x) beta, xpx_inv = fitguts(self.y, self.x) self.results = Results(beta=beta, xpx_inv=xpx_inv) self.results.sst = self.y class IVReg(RegBase): def __init__(self, df, y_name, x_name, z_name, w_name, **kwargs): super(IVReg, self).__init__(df, y_name, x_name, **kwargs) # Handle extra variable stuff for IV self.z_name = force_list(z_name) self.w_name = force_list(w_name) self.sample_cols_labels += ('z_name', 'w_name') self.sample_store_labels += ('z', 'w') self.vars_in_reg += ('z', 'w') self.add_constant_to = 'w' def estimate(self): y = self.y x = self.x w = self.w z = self.z # TODO: Generalize Z and X creation above _first_stage and _liml if self.iv_method == '2sls': self.Xhat, self.Xtrue = self._first_stage(x, w, z) beta, xpx_inv = fitguts(self.y, self.Xhat) elif self.iv_method == 'liml': beta, xpx_inv, self.Xhat, self.Xtrue, kappa = self._liml( y, x, z, w, self._kappa_debug, self.vce_type ) else: raise ValueError( "IV method '{}' not supported".format(self.method)) self.results = Results(beta=beta, xpx_inv=xpx_inv) self.results.sst = self.y self.results._r2 = np.nan self.results._r2_a = np.nan self.results._add_stat('iv_method', self.iv_method) if self.iv_method == 'liml': self.results._add_stat('kappa', kappa) def _first_stage(self, x, w, z): X = pd.concat((x, w), axis=1) Xhat = X.copy() Z = pd.concat((z, w), axis=1) if self.check_colinear: check_colinear_cols(Z) for an_x in x.columns: this_x = x[an_x] pi_hat, __ = fitguts(this_x, Z) Xhat[an_x] = np.dot(Z, pi_hat) if self.check_colinear: check_colinear_cols(Xhat) return Xhat, X def _liml(self, y, x, z, w, _kappa_debug, vce_type): Z = pd.concat((z, w), axis=1) kappa, ZZ_inv = self._liml_kappa(y, x, w, Z) X = pd.concat((x, w), axis=1) if self.check_colinear: check_colinear_cols(Z) check_colinear_cols(X) # Solve system XX = X.T.dot(X) XZ = X.T.dot(Z) Xy = X.T.dot(y) Zy = Z.T.dot(y) # When `kappa` = 1 is 2sls, `kappa` = 0 is OLS if _kappa_debug is not None: kappa = _kappa_debug # If exactly identified, same as 2sls, make it so elif x.shape[1] == z.shape[1]: kappa = 1 xpx_inv = la.inv( (1-kappa)*XX + kappa*np.dot(XZ.dot(ZZ_inv), XZ.T) ) xpy = (1-kappa)*Xy + kappa*np.dot(XZ.dot(ZZ_inv), Zy) beta = pd.Series(xpx_inv.dot(xpy).squeeze(), index=X.columns) # LIML uses non-standard 'bread' in the sandwich estimator if vce_type is None: se_xpx_inv = xpx_inv else: se_xpx_inv = xpx_inv.dot(XZ).dot(ZZ_inv) return beta, se_xpx_inv, Z, X, kappa def _liml_kappa(self, y, x, w, Z): Y = pd.concat((y, x), axis=1).astype(np.float64) YY = Y.T.dot(Y) YZ = Y.T.dot(Z) ZZ_inv = la.inv(Z.T.dot(Z)) bread = la.inv(sqrtm( YY - np.dot(YZ.dot(ZZ_inv), YZ.T) )) if not w.empty: Yw = Y.T.dot(w) ww_inv = la.inv(w.T.dot(w)) meat = YY - np.dot(Yw.dot(ww_inv), Yw.T) else: meat = YY eigs = la.eigvalsh(bread.dot(meat).dot(bread)) kappa = np.min(eigs) return kappa, ZZ_inv def _prep_inference_mats(self): """ In 2SLS, true X is used to calculate residuals, Xhat used in sandwich estimator (because it is used in calculating beta-hat). """ X_for_inner_sum = self.Xhat X_for_residual = self.Xtrue return X_for_inner_sum, X_for_residual def _set_NK(self): N, K = super(IVReg, self)._set_NK() if self.w is not None: K += self.w.shape[1] return N, K def fitguts(y, x): """ Checks dimensions, inverts, returns beta estimate and (X'X)^-1 """ # Y should be 1D assert y.ndim == 1 # X should be 2D assert x.ndim == 2 xpx_inv = la.inv(np.dot(x.T, x)) xpy = np.dot(x.T, y) beta = pd.Series(np.dot(xpx_inv, xpy).squeeze(), index=x.columns) return beta, xpx_inv def check_colinear_cols(matrix: pd.DataFrame) -> None: colinear_cols = _get_colinear_cols(matrix) _raise_colinear_cols(colinear_cols) def _get_colinear_cols(matrix: pd.DataFrame) -> List[str]: K = matrix.shape[1] rank = la.matrix_rank(matrix) if rank < K: colinear_idx = find_colinear_columns(matrix.values, arr_rank=rank) colinear_cols = matrix.columns[colinear_idx] return colinear_cols.tolist() elif rank > K: raise ValueError("Something has gone very, very wrong.") else: return [] def _raise_colinear_cols(colinear_cols: list) -> None: if len(colinear_cols) == 0: return else: colinear_col_str = '\n' + '\n'.join(colinear_cols) raise ValueError(f"Colinear variables: {colinear_col_str}") # VCE estimators def vce_homosk(xpx_inv, resid): """ Standard OLS VCE with spherical errors. """ s2 = np.dot(resid, resid) / resid.shape[0] vce = s2 * xpx_inv return vce def vce_robust(xpx_inv, resid, x): xu = x.mul(resid, axis=0).values B = xu.T.dot(xu) vce = sandwich(xpx_inv, B, xpx_inv.T) return vce def vce_hc23(xpx_inv, resid, x, hctype='hc2'): xu = x.mul(resid, axis=0).values h = _get_h(x, xpx_inv)[:, np.newaxis] if hctype == 'hc2': xu /= np.sqrt(1 - h) elif hctype == 'hc3': xu /= 1 - h else: raise ValueError B = xu.T.dot(xu) vce = sandwich(xpx_inv, B, xpx_inv.T) return vce def _get_h(x, xpx_inv): n = x.shape[0] h = np.zeros(n) for i in range(n): x_row = x.iloc[i, :] h[i] = x_row.dot(xpx_inv).dot(x_row) return h def vce_cluster(xpx_inv, resid, x, cluster): raw_xu = x.mul(resid, axis=0).values int_cluster = pd.factorize(cluster)[0] xu = np.array([np.bincount(int_cluster, weights=raw_xu[:, col]) for col in range(raw_xu.shape[1])]).T B = xu.T.dot(xu) vce = sandwich(xpx_inv, B, xpx_inv.T) return vce def vce_shac(xpx_inv, resid, x, shac_x, shac_y, shac_kern, shac_band): xu = x.mul(resid, axis=0).values Wxu = _shac_weights(xu, shac_x, shac_y, shac_kern, shac_band) B = xu.T.dot(Wxu) vce = sandwich(xpx_inv, B, xpx_inv.T) return vce def _shac_weights(xu, lon, lat, kernel, band): N, K = xu.shape Wxu = np.zeros((N, K)) lon_arr = lon.squeeze().values.astype(float) lat_arr = lat.squeeze().values.astype(float) kern_func = _shac_kernels(kernel, band) for i in range(N): dist = np.sqrt((lon_arr[i] - lon_arr)**2 + (lat_arr[i] - lat_arr)**2) w_i = kern_func(dist).astype(np.float64) Wxu[i, :] = w_i.dot(xu) return Wxu def _shac_kernels(kernel, band): def unif(x): return x <= band def tria(x): return (1 - x/band)*(x <= band) if kernel == 'unif': return unif elif kernel == 'tria': return tria def sandwich(left, B, right): return left.dot(B).dot(right) # DOF definitions def df_std(n, k): df = n - k vce_correct = n / df return df, vce_correct def df_hc23(n, k): df = n - k vce_correct = 1 return df, vce_correct def df_cluster(n, k, cluster_id): g = len(pd.value_counts(cluster_id)) df = g - 1 vce_correct = ((n - 1) / (n - k)) * (g / (g - 1)) return df, vce_correct, g def df_shac(n, k): df = n - k vce_correct = 1 return df, vce_correct