from __future__ import division import ldscore.sumstats as s import ldscore.parse as ps import unittest import numpy as np import pandas as pd from pandas.util.testing import assert_series_equal, assert_frame_equal from nose.tools import * from numpy.testing import assert_array_equal, assert_array_almost_equal, assert_allclose from nose.plugins.attrib import attr import os from ldsc import parser DIR = os.path.dirname(__file__) N_REP = 200 s._N_CHR = 2 # having to mock 22 files is annoying class Mock(object): ''' Dumb object for mocking args and log ''' def __init__(self): pass def log(self, x): # pass print x log = Mock() args = Mock() t = lambda attr: lambda obj: getattr(obj, attr, float('nan')) def test_check_condnum(): x = np.ones((2, 2)) x[1, 1] += 1e-5 args.invert_anyway = False assert_raises(ValueError, s._check_ld_condnum, args, log, x) args.invert_anyway = True s._check_ld_condnum(args, log, x) # no error def test_check_variance(): ld = pd.DataFrame({'SNP': ['a', 'b', 'c'], 'LD1': np.ones(3).astype(float), 'LD2': np.arange(3).astype(float)}) ld = ld[['SNP', 'LD1', 'LD2']] M_annot = np.array([[1, 2]]) M_annot, ld, novar_col = s._check_variance(log, M_annot, ld) assert_array_equal(M_annot.shape, (1, 1)) assert_array_equal(M_annot, [[2]]) assert_allclose(ld.iloc[:, 1], [0, 1, 2]) assert_array_equal(novar_col, [True, False]) def test_align_alleles(): beta = pd.Series(np.ones(6)) alleles = pd.Series(['ACAC', 'TGTG', 'GTGT', 'AGCT', 'AGTC', 'TCTC']) beta = s._align_alleles(beta, alleles) assert_series_equal(beta, pd.Series([1.0, 1, 1, -1, 1, 1])) def test_filter_bad_alleles(): alleles = pd.Series(['ATAT', 'ATAG', 'DIID', 'ACAC']) bad_alleles = s._filter_alleles(alleles) print bad_alleles assert_series_equal(bad_alleles, pd.Series([False, False, False, True])) def test_read_annot(): ref_ld_chr = None ref_ld = os.path.join(DIR, 'annot_test/test') overlap_matrix, M_tot = s._read_chr_split_files(ref_ld_chr, ref_ld, log, 'annot matrix', ps.annot, frqfile=None) assert_array_equal(overlap_matrix, [[1, 0, 0], [0, 2, 2], [0, 2, 2]]) assert_array_equal(M_tot, 3) frqfile = os.path.join(DIR, 'annot_test/test1') overlap_matrix, M_tot = s._read_chr_split_files(ref_ld_chr, ref_ld, log, 'annot matrix', ps.annot, frqfile=frqfile) assert_array_equal(overlap_matrix, [[1, 0, 0], [0, 1, 1], [0, 1, 1]]) assert_array_equal(M_tot, 2) def test_valid_snps(): x = {'AC', 'AG', 'CA', 'CT', 'GA', 'GT', 'TC', 'TG'} assert_equal(x, s.VALID_SNPS) def test_bases(): x = set(['A', 'T', 'G', 'C']) assert_equal(x, set(s.BASES)) def test_complement(): assert_equal(s.COMPLEMENT, {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}) def test_warn_len(): # nothing to test except that it doesn't throw an error at runtime s._warn_length(log, [1]) def test_match_alleles(): m = {'ACAC', 'ACCA', 'ACGT', 'ACTG', 'AGAG', 'AGCT', 'AGGA', 'AGTC', 'CAAC', 'CACA', 'CAGT', 'CATG', 'CTAG', 'CTCT', 'CTGA', 'CTTC', 'GAAG', 'GACT', 'GAGA', 'GATC', 'GTAC', 'GTCA', 'GTGT', 'GTTG', 'TCAG', 'TCCT', 'TCGA', 'TCTC', 'TGAC', 'TGCA', 'TGGT', 'TGTG'} assert_equal(m, s.MATCH_ALLELES) def test_flip_alleles(): m = {'ACAC': False, 'ACCA': True, 'ACGT': True, 'ACTG': False, 'AGAG': False, 'AGCT': True, 'AGGA': True, 'AGTC': False, 'CAAC': True, 'CACA': False, 'CAGT': False, 'CATG': True, 'CTAG': True, 'CTCT': False, 'CTGA': False, 'CTTC': True, 'GAAG': True, 'GACT': False, 'GAGA': False, 'GATC': True, 'GTAC': True, 'GTCA': False, 'GTGT': False, 'GTTG': True, 'TCAG': False, 'TCCT': True, 'TCGA': True, 'TCTC': False, 'TGAC': False, 'TGCA': True, 'TGGT': True, 'TGTG': False} assert_equal(m, s.FLIP_ALLELES) def test_strand_ambiguous(): m = {'AC': False, 'AG': False, 'AT': True, 'CA': False, 'CG': True, 'CT': False, 'GA': False, 'GC': True, 'GT': False, 'TA': True, 'TC': False, 'TG': False} assert_equal(m, s.STRAND_AMBIGUOUS) @attr('rg') @attr('slow') class Test_RG_Statistical(): @classmethod def setUpClass(cls): args = parser.parse_args('') args.ref_ld = DIR + '/simulate_test/ldscore/twold_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.rg = ','.join( (DIR + '/simulate_test/sumstats/' + str(i) for i in xrange(N_REP))) args.out = DIR + '/simulate_test/1' x = s.estimate_rg(args, log) args.intercept_gencov = ','.join(('0' for _ in xrange(N_REP))) args.intercept_h2 = ','.join(('1' for _ in xrange(N_REP))) y = s.estimate_rg(args, log) cls.rg = x cls.rg_noint = y def test_rg_ratio(self): assert_allclose(np.nanmean(map(t('rg_ratio'), self.rg)), 0, atol=0.02) def test_rg_ratio_noint(self): assert_allclose( np.nanmean(map(t('rg_ratio'), self.rg_noint)), 0, atol=0.02) def test_rg_se(self): assert_allclose(np.nanmean(map(t('rg_se'), self.rg)), np.nanstd( map(t('rg_ratio'), self.rg)), atol=0.02) def test_rg_se_noint(self): assert_allclose(np.nanmean(map(t('rg_se'), self.rg_noint)), np.nanstd( map(t('rg_ratio'), self.rg_noint)), atol=0.02) def test_gencov_tot(self): assert_allclose( np.nanmean(map(t('tot'), map(t('gencov'), self.rg))), 0, atol=0.02) def test_gencov_tot_noint(self): assert_allclose( np.nanmean(map(t('tot'), map(t('gencov'), self.rg_noint))), 0, atol=0.02) def test_gencov_tot_se(self): assert_allclose(np.nanstd(map(t('tot'), map(t('gencov'), self.rg))), np.nanmean( map(t('tot_se'), map(t('gencov'), self.rg))), atol=0.02) def test_gencov_tot_se_noint(self): assert_allclose(np.nanstd(map(t('tot'), map(t('gencov'), self.rg_noint))), np.nanmean( map(t('tot_se'), map(t('gencov'), self.rg_noint))), atol=0.02) def test_gencov_cat(self): assert_allclose( np.nanmean(map(t('cat'), map(t('gencov'), self.rg))), [0, 0], atol=0.02) def test_gencov_cat_noint(self): assert_allclose( np.nanmean(map(t('cat'), map(t('gencov'), self.rg_noint))), [0, 0], atol=0.02) def test_gencov_cat_se(self): assert_allclose(np.nanstd(map(t('cat'), map(t('gencov'), self.rg))), np.nanmean( map(t('cat_se'), map(t('gencov'), self.rg))), atol=0.02) def test_gencov_cat_se_noint(self): assert_allclose(np.nanstd(map(t('cat'), map(t('gencov'), self.rg_noint))), np.nanmean( map(t('cat_se'), map(t('gencov'), self.rg_noint))), atol=0.02) def test_gencov_int(self): assert_allclose( np.nanmean(map(t('intercept'), map(t('gencov'), self.rg))), 0, atol=0.1) def test_gencov_int_se(self): assert_allclose(np.nanmean(map(t('intercept_se'), map(t('gencov'), self.rg))), np.nanstd( map(t('intercept'), map(t('gencov'), self.rg))), atol=0.1) def test_hsq_int(self): assert_allclose( np.nanmean(map(t('intercept'), map(t('hsq2'), self.rg))), 1, atol=0.1) def test_hsq_int_se(self): assert_allclose(np.nanmean(map(t('intercept_se'), map(t('hsq2'), self.rg))), np.nanstd( map(t('intercept'), map(t('hsq2'), self.rg))), atol=0.1) @attr('h2') @attr('slow') class Test_H2_Statistical(unittest.TestCase): @classmethod def setUpClass(cls): args = parser.parse_args('') args.ref_ld = DIR + '/simulate_test/ldscore/twold_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.chisq_max = 99999 h2 = [] h2_noint = [] for i in xrange(N_REP): args.intercept_h2 = None args.h2 = DIR + '/simulate_test/sumstats/' + str(i) args.out = DIR + '/simulate_test/1' h2.append(s.estimate_h2(args, log)) args.intercept_h2 = 1 h2_noint.append(s.estimate_h2(args, log)) cls.h2 = h2 cls.h2_noint = h2_noint def test_tot(self): assert_allclose(np.nanmean(map(t('tot'), self.h2)), 0.9, atol=0.05) def test_tot_noint(self): assert_allclose( np.nanmean(map(t('tot'), self.h2_noint)), 0.9, atol=0.05) def test_tot_se(self): assert_allclose(np.nanmean(map(t('tot_se'), self.h2)), np.nanstd( map(t('tot'), self.h2)), atol=0.05) def test_tot_se_noint(self): assert_allclose(np.nanmean(map(t('tot_se'), self.h2_noint)), np.nanstd( map(t('tot'), self.h2_noint)), atol=0.05) def test_cat(self): x = np.nanmean(map(t('cat'), self.h2_noint), axis=0) y = np.array((0.3, 0.6)).reshape(x.shape) assert_allclose(x, y, atol=0.05) def test_cat_noint(self): x = np.nanmean(map(t('cat'), self.h2_noint), axis=0) y = np.array((0.3, 0.6)).reshape(x.shape) assert_allclose(x, y, atol=0.05) def test_cat_se(self): x = np.nanmean(map(t('cat_se'), self.h2), axis=0) y = np.nanstd(map(t('cat'), self.h2), axis=0).reshape(x.shape) assert_allclose(x, y, atol=0.05) def test_cat_se_noint(self): x = np.nanmean(map(t('cat_se'), self.h2_noint), axis=0) y = np.nanstd(map(t('cat'), self.h2_noint), axis=0).reshape(x.shape) assert_allclose(x, y, atol=0.05) def test_coef(self): # should be h^2/M = [[0.3, 0.9]] / M coef = np.array(((0.3, 0.9))) / self.h2[0].M for h in [self.h2, self.h2_noint]: assert np.all(np.abs(np.nanmean(map(t('coef'), h), axis=0) - coef) < 1e6) def test_coef_se(self): for h in [self.h2, self.h2_noint]: assert_array_almost_equal(np.nanmean(map(t('coef_se'), h), axis=0), np.nanstd(map(t('coef'), h), axis=0)) def test_prop(self): for h in [self.h2, self.h2_noint]: assert np.all(np.nanmean(map(t('prop'), h), axis=0) - [1/3, 2/3] < 0.02) def test_prop_se(self): for h in [self.h2, self.h2_noint]: assert np.all(np.nanmean(map(t('prop_se'), h), axis=0) - np.nanstd(map(t('prop'), h), axis=0) < 0.02) def test_int(self): assert_allclose(np.nanmean(map(t('intercept'), self.h2)), 1, atol=0.1) def test_int_se(self): assert_allclose(np.nanstd(map(t('intercept'), self.h2)), np.nanmean( map(t('intercept_se'), self.h2)), atol=0.1) class Test_Estimate(unittest.TestCase): def test_h2_M(self): # check --M works args = parser.parse_args('') args.ref_ld = DIR + '/simulate_test/ldscore/oneld_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.h2 = DIR + '/simulate_test/sumstats/1' args.out = DIR + '/simulate_test/1' args.print_cov = True # right now just check no runtime errors args.print_delete_vals = True x = s.estimate_h2(args, log) args.M = str( float(open(DIR + '/simulate_test/ldscore/oneld_onefile.l2.M_5_50').read())) y = s.estimate_h2(args, log) assert_array_almost_equal(x.tot, y.tot) assert_array_almost_equal(x.tot_se, y.tot_se) args.M = '1,2' assert_raises(ValueError, s.estimate_h2, args, log) args.M = 'foo_bar' assert_raises(ValueError, s.estimate_h2, args, log) def test_h2_ref_ld(self): # test different ways of reading ref ld args = parser.parse_args('') args.ref_ld_chr = DIR + '/simulate_test/ldscore/twold_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.h2 = DIR + '/simulate_test/sumstats/555' args.out = DIR + '/simulate_test/' x = s.estimate_h2(args, log) args.ref_ld = DIR + '/simulate_test/ldscore/twold_firstfile,' + \ DIR + '/simulate_test/ldscore/twold_secondfile' y = s.estimate_h2(args, log) args.ref_ld_chr = DIR + '/simulate_test/ldscore/twold_firstfile,' + \ DIR + '/simulate_test/ldscore/twold_secondfile' z = s.estimate_h2(args, log) assert_almost_equal(x.tot, y.tot) assert_array_almost_equal(y.cat, z.cat) assert_array_almost_equal(x.prop, y.prop) assert_array_almost_equal(y.coef, z.coef) assert_array_almost_equal(x.tot_se, y.tot_se) assert_array_almost_equal(y.cat_se, z.cat_se) assert_array_almost_equal(x.prop_se, y.prop_se) assert_array_almost_equal(y.coef_se, z.coef_se) # test statistical properties (constrain intercept here) def test_rg_M(self): args = parser.parse_args('') args.ref_ld = DIR + '/simulate_test/ldscore/oneld_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.rg = ','.join( [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)]) args.out = DIR + '/simulate_test/1' x = s.estimate_rg(args, log)[0] args.M = open( DIR + '/simulate_test/ldscore/oneld_onefile.l2.M_5_50', 'rb').read().rstrip('\n') y = s.estimate_rg(args, log)[0] assert_array_almost_equal(x.rg_ratio, y.rg_ratio) assert_array_almost_equal(x.rg_se, y.rg_se) args.M = '1,2' assert_raises(ValueError, s.estimate_rg, args, log) args.M = 'foo_bar' assert_raises(ValueError, s.estimate_rg, args, log) def test_rg_ref_ld(self): args = parser.parse_args('') args.ref_ld_chr = DIR + '/simulate_test/ldscore/twold_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.rg = ','.join( [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)]) args.out = DIR + '/simulate_test/1' args.print_cov = True # right now just check no runtime errors args.print_delete_vals = True x = s.estimate_rg(args, log)[0] args.ref_ld = DIR + '/simulate_test/ldscore/twold_firstfile,' + \ DIR + '/simulate_test/ldscore/twold_secondfile' y = s.estimate_rg(args, log)[0] args.ref_ld_chr = DIR + '/simulate_test/ldscore/twold_firstfile,' + \ DIR + '/simulate_test/ldscore/twold_secondfile' z = s.estimate_rg(args, log)[0] assert_almost_equal(x.rg_ratio, y.rg_ratio) assert_almost_equal(y.rg_jknife, z.rg_jknife) assert_almost_equal(x.rg_se, y.rg_se) def test_no_check_alleles(self): args = parser.parse_args('') args.ref_ld = DIR + '/simulate_test/ldscore/oneld_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.rg = ','.join( [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)]) args.out = DIR + '/simulate_test/1' x = s.estimate_rg(args, log)[0] args.no_check_alleles = True y = s.estimate_rg(args, log)[0] assert_equal(x.rg_ratio, y.rg_ratio) assert_almost_equal(x.rg_jknife, y.rg_jknife) assert_equal(x.rg_se, y.rg_se) def test_twostep_h2(self): # make sure two step isn't going crazy args = parser.parse_args('') args.ref_ld = DIR + '/simulate_test/ldscore/oneld_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.h2 = DIR + '/simulate_test/sumstats/1' args.out = DIR + '/simulate_test/1' args.chisq_max = 9999999 args.two_step = 999 x = s.estimate_h2(args, log) args.chisq_max = 9999 args.two_step = 99999 y = s.estimate_h2(args, log) assert_allclose(x.tot, y.tot, atol=1e-5) def test_twostep_rg(self): # make sure two step isn't going crazy args = parser.parse_args('') args.ref_ld_chr = DIR + '/simulate_test/ldscore/oneld_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.rg = ','.join( [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)]) args.out = DIR + '/simulate_test/rg' args.two_step = 999 x = s.estimate_rg(args, log)[0] args.two_step = 99999 y = s.estimate_rg(args, log)[0] assert_allclose(x.rg_ratio, y.rg_ratio, atol=1e-5) assert_allclose(x.gencov.tot, y.gencov.tot, atol=1e-5)