"""Unit tests for ``edgePy.DGEList`` and the data-import helpers it exposes."""

import os
import pkgutil
import tempfile

import numpy as np
import pytest
from smart_open import smart_open  # type: ignore

from edgePy.DGEList import DGEList
from edgePy.data_import.data_import import get_dataset_path
from edgePy.data_import.ensembl.ensembl_flat_file_reader import CanonicalDataStore

TEST_GENE_SET_DATA = "transcripts_homo_sapiens_core_75_37.tsv"
TEST_GENE_SYMBOLS = "symbols_homo_sapiens_core_75_37.tsv"
TEST_DATASET = "GSE49712_HTSeq.txt.gz"
TEST_DATASET_NPZ = "GSE49712_HTSeq.txt.npz"
TEST_GROUPS = "groups.json"


@pytest.fixture
def dge_list():
    """Build a ``DGEList`` from the packaged count data and group definitions.

    Tests must request this fixture as a parameter; pytest >= 4.0 raises an
    error when a fixture function is called directly.
    """
    with smart_open(get_dataset_path(TEST_DATASET), 'r') as data_handle, smart_open(
        get_dataset_path(TEST_GROUPS), 'r'
    ) as group_handle:
        return DGEList.create_DGEList_handle(data_handle, group_handle)


def test_sample_by_group():
    """``_sample_group_dict`` maps a per-sample group list to group -> samples."""
    samples = ["A", "B", "C", "D", "E"]
    expected_output = {'group1': ["A", "B"], 'group2': ["C", "D", "E"]}
    group_by_sample = ['group1', 'group1', 'group2', 'group2', 'group2']
    output = DGEList._sample_group_dict(group_by_sample, samples)
    assert output == expected_output


def test_sample_group_list():
    """``_sample_group_list`` maps group -> samples to a per-sample group array."""
    samples = ["A", "B", "C", "D", "E"]
    sample_by_group = {'group1': ["A", "B"], 'group2': ["C", "D", "E"]}
    expected_output = np.array(['group1', 'group1', 'group2', 'group2', 'group2'])
    output = DGEList._sample_group_list(sample_by_group, samples)
    assert np.array_equal(output, expected_output)


def test_minimal_init():
    """A ``DGEList`` built from in-memory arrays reports its dimensions in repr."""
    dge_list = DGEList(
        to_remove_zeroes=False,
        counts=np.ones(shape=(5, 5)),
        samples=["A", "B", "C", "D", "E"],
        genes=["ENSG001", "ENSG002", "ENSG003", "ENSG004", "ENSG005"],
        groups_in_dict={'group1': ["A", "B"], 'group2': ["C", "D", "E"]},
    )
    assert repr(dge_list) == "DGEList(num_samples=5, num_genes=5)"


def test_too_much(dge_list):
    """The packaged dataset loads with the expected sample and gene counts."""
    # TODO: Refactor into smaller units.
    # - Test blank non-parameterized `DGEList()`
    # - Test opening handles, both gzipped or not
    # - Test samples and genes are set, validated, typed right
    assert len(dge_list.samples) == 10
    assert len(dge_list.genes) == 21711


def test_too_many_options():
    """Passing both ``counts`` (zeros) and ``filename`` must be rejected."""
    with pytest.raises(Exception):
        DGEList(counts=np.zeros(shape=(5, 10)), filename=str(get_dataset_path(TEST_DATASET_NPZ)))


def test_too_many_options2():
    """Passing both ``counts`` (ones) and ``filename`` must be rejected."""
    with pytest.raises(Exception):
        DGEList(counts=np.ones(shape=(5, 10)), filename=str(get_dataset_path(TEST_DATASET_NPZ)))


def test_library_size():
    """``library_size`` of the packaged NPZ dataset matches the known values."""
    dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))
    assert np.array_equal(
        dge_list.library_size,
        np.array(
            [
                63_579_607,
                58_531_933,
                39_138_521,
                78_565_885,
                48_667_119,
                62_799_917,
                66_032_107,
                66_194_776,
                55_085_875,
                37_760_315,
            ]
        ),
    )


def test_setting_DGElist_counts():
    """Valid counts are stored; invalid counts raise the documented errors."""
    dge_list = DGEList(
        counts=np.zeros(shape=(5, 10)),
        groups_in_list=['A', 'A', 'B', 'B', 'B'],
        samples=['S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9'],
    )
    assert dge_list.counts.shape[0] == 5
    assert dge_list.counts.shape[1] == 10

    # Arrays are built outside the ``raises`` blocks so that only the call
    # under test can satisfy the expectation.

    # Negative counts.
    negative_counts = np.array([[1, 1, 1], [-1, 1, 1]])
    with pytest.raises(ValueError):
        DGEList(counts=negative_counts, groups_in_list=["a", "b"])

    # NaN counts.
    nan_counts = np.array([[1, 1, 1], [np.nan, 1, 1]])
    with pytest.raises(ValueError):
        DGEList(counts=nan_counts, groups_in_list=["a", "b"])

    # One-dimensional counts.
    flat_counts = np.array([1, 1, 1])
    with pytest.raises(ValueError):
        DGEList(counts=flat_counts, groups_in_list=["a", "b"])

    # Counts that are not a numpy array.
    list_counts = [1, 1, 1]
    with pytest.raises(TypeError):
        dge_list.counts = list_counts


def test_cycle_dge_npz(dge_list):
    """Writing a ``DGEList`` to NPZ and reading it back preserves its arrays."""
    # TemporaryDirectory removes the directory and its contents even when an
    # assertion below fails, and avoids the private
    # ``tempfile._get_candidate_names`` helper.
    with tempfile.TemporaryDirectory(prefix="edgePy_tmp") as tempdir:
        file_name = os.path.join(tempdir, "dge_list")
        dge_list.write_npz_file(filename=file_name)

        # The file is read back with an explicit ".npz" suffix appended to
        # the name that was passed to ``write_npz_file``.
        dge_list_second = DGEList(filename=file_name + ".npz")

        assert np.array_equal(dge_list.counts, dge_list_second.counts)
        assert np.array_equal(dge_list.genes, dge_list_second.genes)
        assert np.array_equal(dge_list.samples, dge_list_second.samples)
        assert np.array_equal(dge_list.norm_factors, dge_list_second.norm_factors)
        assert np.array_equal(dge_list.groups_list, dge_list_second.groups_list)


def testing_setting_samples_and_counts():
    """Constructor accepts matching samples/counts and rejects incomplete input."""
    # Empty list should fail
    with pytest.raises(Exception):
        DGEList(
            to_remove_zeroes=False,
            groups_in_list=['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'],
        )

    # Lists with just counts should fail
    with pytest.raises(ValueError):
        DGEList(counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]), groups_in_list=['A', 'A', 'B'])

    # Lists with samples and counts and groups should pass:
    DGEList(
        counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]),
        groups_in_list=['A', 'A', 'B'],
        samples=["S1", 'S2', 'S3'],
    )

    # Lists with just samples should fail
    with pytest.raises(Exception):
        DGEList(
            samples=np.array(["1", "2", "3"]),
            to_remove_zeroes=False,
            groups_in_list=['A', 'A', 'B'],
        )

    # Properly formed samples and counts should pass
    DGEList(
        samples=np.array(["1", "2", "3"]),
        counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]),
        groups_in_list=['A', 'A', 'B'],
    )

    # Lists with ill-matched samples and counts should fail.
    # (Formerly the string form ``pytest.raises(ValueError, "<code>")``, which
    # pytest 5.0 removed; the same call now runs inside a context manager.)
    with pytest.raises(ValueError):
        DGEList(
            samples=np.array(['2', '3']),
            counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]),
        )


def test_repr(dge_list):
    """repr of the packaged dataset shows a thousands-separated gene count."""
    assert repr(dge_list) == "DGEList(num_samples=10, num_genes=21,711)"


def test_broken_dge_caGENE_SYMBOL_SQLll():
    """Conflicting or missing constructor arguments raise an exception."""
    with pytest.raises(Exception):
        DGEList(filename="fake_filename", counts=np.array([[1, 1, 1], [1, 1, 1]]))
    with pytest.raises(Exception):
        DGEList(counts=None)


def test_cpm():
    """``cpm`` scales a raw count by 1e6 over its column sum."""
    dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))
    first_pos = dge_list.counts[0][0]
    col_sum = np.sum(dge_list.counts, axis=0)
    assert isinstance(first_pos, np.integer)

    new_dge_list = dge_list.cpm()
    assert new_dge_list.counts[0][0] == first_pos * 1e6 / col_sum[0]


def test_rpkm():
    """``rpkm`` normalises by canonical transcript length and column sum."""
    dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))
    icd = CanonicalDataStore(
        get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS)
    )
    first_pos = dge_list.counts[0][0]
    first_gene = dge_list.genes[0]

    col_sum = np.sum(dge_list.counts, axis=0)
    assert isinstance(first_pos, np.integer)

    rpm_dge = dge_list.rpkm(icd)
    ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene))
    gene_len = icd.get_length_of_canonical_transcript(ensg_gene)

    # RPKM = numReads / ((geneLength / 1,000) * (totalNumReads / 1,000,000))
    assert rpm_dge.counts[0][0] == (first_pos / ((gene_len / 1e3) * (col_sum[0] / 1e6)))


def test_tpm():
    """``tpm`` reproduces a hand-calculated example and equalises column sums."""
    # example hand calculated as in https://www.youtube.com/watch?time_continue=611&v=TTUrtCY2k-w
    counts = np.array([[10, 12, 30], [20, 25, 60], [5, 8, 15], [0, 0, 1]])
    gene_lengths = np.array([2000, 4000, 1000, 10000])

    expected = np.array(
        [
            [333_333.333_333_33, 296_296.296_296_3, 332_594.235_033_26],
            [333_333.333_333_33, 308_641.975_308_64, 332_594.235_033_26],
            [333_333.333_333_33, 395_061.728_395_06, 332_594.235_033_26],
            [0.0, 0.0, 2217.294_900_22],
        ]
    )

    dge_list = DGEList(
        counts=counts,
        samples=np.array(['a', 'b', 'c']),
        genes=np.array(['a', 'b', 'c', 'd']),
        groups_in_dict={'group1': ['a', 'c'], 'group2': ['b', 'd']},
    )
    assert isinstance(dge_list.counts[0][0], np.integer)
    new_dge_list = dge_list.tpm(gene_lengths)

    assert np.allclose(new_dge_list.counts, expected, atol=1e-1)

    # Make sure the per-sample sums over all genes are identical across
    # samples (an important property of TPM).
    gene_sums = new_dge_list.counts.sum(axis=0)
    assert np.allclose(gene_sums, [gene_sums[0]] * len(gene_sums))


# Unit tests for ``edgePy.data_import.Importer``.
def test_init():
    """Both the file-path and the file-handle constructors load the dataset."""
    dge_list = DGEList.create_DGEList_data_file(
        data_file=get_dataset_path(TEST_DATASET), group_file=get_dataset_path(TEST_GROUPS)
    )
    assert repr(dge_list) == "DGEList(num_samples=10, num_genes=21,711)"

    # Open the handles in a ``with`` block so they are closed after use.
    with smart_open(get_dataset_path(TEST_DATASET)) as data_handle, smart_open(
        get_dataset_path(TEST_GROUPS)
    ) as group_handle:
        dge_list = DGEList.create_DGEList_handle(
            data_handle=data_handle, group_handle=group_handle
        )
    assert repr(dge_list) == "DGEList(num_samples=10, num_genes=21,711)"


# TestGroupImporter.
def test_create_DGEList_handle_init():
    """Group definitions read from a handle populate both group views."""
    # Open the handles in a ``with`` block so they are closed after use.
    with smart_open(get_dataset_path(TEST_DATASET)) as data_handle, smart_open(
        get_dataset_path(TEST_GROUPS)
    ) as group_handle:
        dge_list = DGEList.create_DGEList_handle(
            data_handle=data_handle, group_handle=group_handle
        )

    assert len(dge_list.groups_dict) == 2
    assert len(dge_list.groups_dict["Group 1"]) == 5
    assert len(dge_list.groups_dict["Group 2"]) == 5

    assert dge_list.samples.shape == dge_list.groups_list.shape


# Unit tests for packaged (optionally zipped during install) data.
def test_get_data_stream():
    """Tests finding packaged data with ``pkgutil.get_data()``"""
    data = pkgutil.get_data("edgePy", "data/GSE49712_HTSeq.txt.gz")
    # ``get_data`` returns None (rather than raising) when the package cannot
    # be located or its loader lacks ``get_data``; without this assertion the
    # test would pass silently in that case.
    assert data is not None


def test_create_DGEList():
    """Tests the function that converts data into a DGE_List object"""
    samples = ["AAA", "BBB", "CCC"]
    genes = ["ENSG001", "ENSG002"]

    data_set = {
        "AAA": {"ENSG001": 10, "ENSG002": 20},
        "BBB": {"ENSG001": 15, "ENSG002": 40},
        "CCC": {"ENSG001": 20, "ENSG002": 80},
    }
    categories = {"One": ["AAA", "BBB"], "Two": ["CCC"]}
    dge_list = DGEList.create_DGEList(
        sample_list=samples, data_set=data_set, gene_list=genes, category_to_samples=categories
    )

    assert np.array_equal(dge_list.samples, np.array(samples))
    # 2 rows (genes), 3 columns(samples)
    assert np.array_equal(dge_list.counts, np.array([[10, 15, 20], [20, 40, 80]]))

    assert np.array_equal(dge_list.groups_list, np.array(["One", "One", "Two"]))
    # Previously ``assert dge_list.groups_dict, {"One:"}``, which only checked
    # truthiness (the set was the assertion *message*). Compare the group
    # names and member counts against the input categories instead.
    assert set(dge_list.groups_dict) == set(categories)
    for group, members in categories.items():
        assert len(dge_list.groups_dict[group]) == len(members)

    assert np.array_equal(dge_list.genes, np.array(genes))