Python scipy.stats.fisher_exact() Examples
The following are 30
code examples of scipy.stats.fisher_exact().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
scipy.stats
, or try the search function
.
Example #1
Source File: test_differential.py From methplotlib with MIT License | 6 votes |
def old_method(a_range, b_range, bed): from scipy.stats import fisher_exact def meth_counts(interval): if not interval.empty: all, methylated = interval.as_df()[['calls', "methylated"]].sum() return [all - methylated, methylated] else: return [0, 0] rowdicts = [] for chr, begin, end in bed.merge().as_df().itertuples(index=False, name=None): [d1, n1], [d2, n2] = meth_counts( a_range[chr, begin:end]), meth_counts(b_range[chr, begin:end]) rowdicts.append({"Chromosome": chr, "Start": begin, "End": end, "d1": d1, "d2": d2, "n1": n1, "n2": n2}) # print(c1, c2) # print(a, b, c, d) # odr, pv = fisher_exact([c1, c2]) # print(odr, pv) # rowdicts.append({"Chromosome": chr, "Start": begin, "End": end, "OR": odr, "P": pv}) return pd.DataFrame.from_dict(rowdicts)
Example #2
Source File: PipelineGSEnrichment.py From CGATPipelines with MIT License | 6 votes |
def run(self): A = self.GenesWith & self.foreground B = self.GenesWithout & self.foreground C = self.GenesWith & self.background D = self.GenesWithout & self.background fishlist = ((len(A), len(B)), (len(C), len(D))) FT = stats.fisher_exact(fishlist) OR, p = FT padj = self.correct(p) if padj > 1: padj = 1 significant = self.sig(padj) return OR, p, padj, significant, fishlist, A, C # functions below here correspond to specific steps in the # pipeline_enrichment pipeline - they are written as functions # so the cluster_runnable decorater can be used.
Example #3
Source File: test_stats.py From Computable with MIT License | 6 votes |
def test_precise(self): # results from R # # R defines oddsratio differently (see Notes section of fisher_exact # docstring), so those will not match. We leave them in anyway, in # case they will be useful later on. We test only the p-value. tablist = [ ([[100, 2], [1000, 5]], (2.505583993422285e-001, 1.300759363430016e-001)), ([[2, 7], [8, 2]], (8.586235135736206e-002, 2.301413756522114e-002)), ([[5, 1], [10, 10]], (4.725646047336584e+000, 1.973244147157190e-001)), ([[5, 15], [20, 20]], (3.394396617440852e-001, 9.580440012477637e-002)), ([[5, 16], [20, 25]], (3.960558326183334e-001, 1.725864953812994e-001)), ([[10, 5], [10, 1]], (2.116112781158483e-001, 1.973244147157190e-001)), ([[10, 5], [10, 0]], (0.000000000000000e+000, 6.126482213438734e-002)), ([[5, 0], [1, 4]], (np.inf, 4.761904761904762e-002)), ([[0, 5], [1, 4]], (0.000000000000000e+000, 1.000000000000000e+000)), ([[5, 1], [0, 4]], (np.inf, 4.761904761904758e-002)), ([[0, 1], [3, 2]], (0.000000000000000e+000, 1.000000000000000e+000)) ] for table, res_r in tablist: res = stats.fisher_exact(np.asarray(table)) np.testing.assert_almost_equal(res[1], res_r[1], decimal=11, verbose=True)
Example #4
Source File: chicDifferentialTest.py From HiCExplorer with GNU General Public License v3.0 | 6 votes |
def fisher_exact_test(pDataFile1, pDataFile2, pAlpha): test_result = [] accepted = [] rejected = [] for i, (group1, group2) in enumerate(zip(pDataFile1, pDataFile2)): try: odds, p_value = stats.fisher_exact(np.ceil([group1, group2])) if p_value <= pAlpha: test_result.append(p_value) rejected.append([i, p_value]) else: test_result.append(p_value) accepted.append([i, p_value]) except ValueError: test_result.append(np.nan) accepted.append([i, 1.0]) return test_result, accepted, rejected
Example #5
Source File: predict_enriched_two_libraries_decision_tree.py From PIDGINv2 with MIT License | 6 votes |
def doTargetPrediction(pickled_model_name): if os.name == 'nt': sep = '\\' else: sep = '/' mod = pickled_model_name.split(sep)[-1].split('.')[0] clf = open_Model(mod) probs1 = map(int, clf.predict_proba(querymatrix1)[:,1] > threshold) preds1 = sum(probs1) probs2 = map(int, clf.predict_proba(querymatrix2)[:,1] > threshold) preds2 = sum(probs2) oddsratio, pvalue = stats.fisher_exact([[preds2,len(querymatrix2)-preds2],[preds1,len(querymatrix1)-preds1]]) try: ratio, preds1_percentage, preds2_percentage = calcPredictionRatio(preds1,preds2) return ratio, mod, preds1, preds1_percentage, preds2, preds2_percentage, oddsratio, pvalue, probs1 + probs2 except TypeError: return None #prediction runner
Example #6
Source File: test_stats.py From GraphicDesignPatternByPython with MIT License | 6 votes |
def test_precise(self): # results from R # # R defines oddsratio differently (see Notes section of fisher_exact # docstring), so those will not match. We leave them in anyway, in # case they will be useful later on. We test only the p-value. tablist = [ ([[100, 2], [1000, 5]], (2.505583993422285e-001, 1.300759363430016e-001)), ([[2, 7], [8, 2]], (8.586235135736206e-002, 2.301413756522114e-002)), ([[5, 1], [10, 10]], (4.725646047336584e+000, 1.973244147157190e-001)), ([[5, 15], [20, 20]], (3.394396617440852e-001, 9.580440012477637e-002)), ([[5, 16], [20, 25]], (3.960558326183334e-001, 1.725864953812994e-001)), ([[10, 5], [10, 1]], (2.116112781158483e-001, 1.973244147157190e-001)), ([[10, 5], [10, 0]], (0.000000000000000e+000, 6.126482213438734e-002)), ([[5, 0], [1, 4]], (np.inf, 4.761904761904762e-002)), ([[0, 5], [1, 4]], (0.000000000000000e+000, 1.000000000000000e+000)), ([[5, 1], [0, 4]], (np.inf, 4.761904761904758e-002)), ([[0, 1], [3, 2]], (0.000000000000000e+000, 1.000000000000000e+000)) ] for table, res_r in tablist: res = stats.fisher_exact(np.asarray(table)) np.testing.assert_almost_equal(res[1], res_r[1], decimal=11, verbose=True)
Example #7
Source File: test_stats.py From GraphicDesignPatternByPython with MIT License | 5 votes |
def test_large_numbers(self): # Test with some large numbers. Regression test for #1401 pvals = [5.56e-11, 2.666e-11, 1.363e-11] # from R for pval, num in zip(pvals, [75, 76, 77]): res = stats.fisher_exact([[17704, 496], [1065, num]])[1] assert_approx_equal(res, pval, significant=4) res = stats.fisher_exact([[18000, 80000], [20000, 90000]])[1] assert_approx_equal(res, 0.2751, significant=4)
Example #8
Source File: association.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def fisher(cls, *marginals): """Scores bigrams using Fisher's Exact Test (Pedersen 1996). Less sensitive to small counts than PMI or Chi Sq, but also more expensive to compute. Requires scipy. """ n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals) (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less') return pvalue
Example #9
Source File: association.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def fisher_exact(*_args, **_kwargs): raise NotImplementedError ### Indices to marginals arguments:
Example #10
Source File: predict_enriched.py From PIDGINv3 with GNU General Public License v3.0 | 5 votes |
def calcProportionPredsPercentageFisher(model_name,p1,p2): #calculate proportion of nan (outside ad) compounds in each set propnanp1 = round((float(sum(np.isnan(p1))) / float(len(p1))),3) propnanp2 = round((float(sum(np.isnan(p2))) / float(len(p2))),3) #filter out nans p1 = p1[np.isfinite(p1)] p2 = p2[np.isfinite(p2)] #calculate active and inactive predictions for first file p1_0, p1_1 = len(p1)-sum(p1), sum(p1) #calculate active and inactive predictions for second file p2_0, p2_1 = len(p2)-sum(p2), sum(p2) #return if no compounds in either set due to ad filter if p1_0+p1_1 ==0 or p2_0+p2_1 ==0: return None #if no actives in either set return none if p1_1 == 0 and p2_1 == 0: return None oddsratio, pvalue = fisher_exact([[p1_1,p1_0],[p2_1,p2_0]]) #if no inactives in either set then set risk & odds to 1.0 and pval to NA if p1_0 == 0 and p2_0 == 0: rr, oddsratio, pvalue = 1.0, 'NA' else: #calculate risk ratio try: rr = (float(p1_1)/(float(p1_1)+float(p1_0)))/(float(p2_1)/(float(p2_1)+float(p2_0))) except ZeroDivisionError: rr = np.inf #calculate percentages pcp1_0, pcp1_1 = round(float(p1_0)/float(len(p1)),3), round(float(p1_1)/float(len(p1)),3) pcp2_0, pcp2_1 = round(float(p2_0)/float(len(p2)),3), round(float(p2_1)/float(len(p2)),3) return oddsratio, model_name, p1_0, pcp1_0, p1_1, pcp1_1, p2_0, pcp2_0, \ p2_1, pcp2_1, propnanp1, propnanp2, rr, pvalue
Example #11
Source File: stats.py From audit-ai with MIT License | 5 votes |
def fisher_exact_test(labels, results, threshold=None): """ Returns odds ratio and p-value of Fisher's exact test Uses scipy.stats.fisher_exact, which only works for 2x2 contingency tables https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html Parameters ---------- labels : array_like categorical labels for each corresponding value of `result` ie. M/F results : array_like binary decision values, if continuous values are supplied then the `threshold` must also be supplied to generate binary decisions threshold : numeric value dividing scores into True/False, where result>=threshold == True Returns ------- oddsratio : float This is prior odds ratio and not a posterior estimate. pvalue : float P-value, the probability of obtaining a distribution at least as extreme as the one that was actually observed, assuming that the null hypothesis is true. """ check_consistent_length(labels, results) results = np.array(results) # convert the results to True/False results = boolean_array(results, threshold=threshold) # get crosstab for two groups ctab = top_bottom_crosstab(labels, results) oddsratio, pvalue = fisher_exact(ctab) return oddsratio, pvalue
Example #12
Source File: multilinear.py From Splunking-Crime with GNU Affero General Public License v3.0 | 5 votes |
def _test_group(pvalues, group_name, group, exact=True): """test if the objects in the group are different from the general set. The test is performed on the pvalues set (ad a pandas series) over the group specified via a fisher exact test. """ from scipy.stats import fisher_exact, chi2_contingency totals = 1.0 * len(pvalues) total_significant = 1.0 * np.sum(pvalues) cross_index = [c for c in group if c in pvalues.index] missing = [c for c in group if c not in pvalues.index] if missing: s = ('the test is not well defined if the group ' 'has elements not presents in the significativity ' 'array. group name: {}, missing elements: {}') logging.warning(s.format(group_name, missing)) # how many are significant and not in the group group_total = 1.0 * len(cross_index) group_sign = 1.0 * len([c for c in cross_index if pvalues[c]]) group_nonsign = 1.0 * (group_total - group_sign) # how many are significant and not outside the group extern_sign = 1.0 * (total_significant - group_sign) extern_nonsign = 1.0 * (totals - total_significant - group_nonsign) # make the fisher test or the chi squared test = fisher_exact if exact else chi2_contingency table = [[extern_nonsign, extern_sign], [group_nonsign, group_sign]] pvalue = test(np.array(table))[1] # is the group more represented or less? part = group_sign, group_nonsign, extern_sign, extern_nonsign #increase = (group_sign / group_total) > (total_significant / totals) increase = np.log((totals * group_sign) / (total_significant * group_total)) return pvalue, increase, part
Example #13
Source File: MPileUpVariantCaller.py From cDNA_Cupcake with BSD 3-Clause Clear License | 5 votes |
def call_variant(self): """ mirrors AminoAcidCaller::CallVariants() in https://github.com/PacificBiosciences/minorseq/blob/develop/src/AminoAcidCaller.cpp For each position (that has sufficient coverage), do Fisher exact test w/ correction if p-val < threshold, then store it. Stores results in self.variant as: self.variant[position] = desc list of (base, count). NOTE: base must be either all in lower case (which means - strand) or call upper case (+ strand). If - strand and ('a', 10), it means the ref base in A on the + strand, and the transcript should be T on the - strand. Only positions with more than the ref base is stored. """ for pos in self.positions_to_call: r = self.record_by_pos[pos] alt_variant = [] for base, count in r.clean_counts.most_common()[1:]: assert not base.startswith('+') and not base.startswith('-') # clean counts should NOT have indels exp = r.clean_cov * self.err_sub odds, pval = stats.fisher_exact([[count, r.clean_cov-count], [exp, r.clean_cov-exp]], alternative='greater') pval *= self.number_of_tests if pval < self.pval_cutoff: # store variant if below cutoff alt_variant.append((base, count)) if len(alt_variant) > 0: # only record this variant if there's at least two haps self.variant[pos] = [r.clean_counts.most_common()[0]] + alt_variant self.ref_base[pos] = r.ref
Example #14
Source File: vectorizer.py From scan with GNU Affero General Public License v3.0 | 5 votes |
def get_vocab(self, vectorizer, input_text, input_scores): train_mat = vectorizer.transform(input_text) input_score_med = np.median(input_scores) new_scores = [0 if i <= input_score_med else 1 for i in input_scores] pvalues = [] for i in range(0, train_mat.shape[1]): lcol = np.asarray(train_mat.getcol(i).todense().transpose())[0] good_lcol = lcol[[n for n in range(0, len(new_scores)) if new_scores[n] == 1]] bad_lcol = lcol[[n for n in range(0, len(new_scores)) if new_scores[n] == 0]] good_lcol_present = len(good_lcol[good_lcol > 0]) good_lcol_missing = len(good_lcol[good_lcol == 0]) bad_lcol_present = len(bad_lcol[bad_lcol > 0]) bad_lcol_missing = len(bad_lcol[bad_lcol == 0]) oddsratio, pval = fisher_exact([[good_lcol_present, bad_lcol_present], [good_lcol_missing, bad_lcol_missing]]) pvalues.append(pval) col_inds = range(0, train_mat.shape[1]) p_frame = np.array([col_inds, pvalues]).transpose() p_frame = p_frame[p_frame[:, 1].argsort()] rows = p_frame.shape[0] selection = self.max_features if rows < selection: selection = rows getVar = lambda searchList, ind: [searchList[int(i)] for i in ind] vocab = getVar(vectorizer.get_feature_names(), p_frame[:, 0][-selection:]) return vocab
Example #15
Source File: test_stats.py From GraphicDesignPatternByPython with MIT License | 5 votes |
def test_gh3014(self): # check if issue #3014 has been fixed. # before, this would have risen a ValueError odds, pvalue = stats.fisher_exact([[1, 2], [9, 84419233]])
Example #16
Source File: test_stats.py From GraphicDesignPatternByPython with MIT License | 5 votes |
def test_row_or_col_zero(self): tables = ([[0, 0], [5, 10]], [[5, 10], [0, 0]], [[0, 5], [0, 10]], [[5, 0], [10, 0]]) for table in tables: oddsratio, pval = stats.fisher_exact(table) assert_equal(pval, 1.0) assert_equal(oddsratio, np.nan)
Example #17
Source File: test_stats.py From GraphicDesignPatternByPython with MIT License | 5 votes |
def test_raises(self): # test we raise an error for wrong shape of input. assert_raises(ValueError, stats.fisher_exact, np.arange(6).reshape(2, 3))
Example #18
Source File: association.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def fisher_exact(*_args, **_kwargs): raise NotImplementedError ### Indices to marginals arguments:
Example #19
Source File: test_stats.py From GraphicDesignPatternByPython with MIT License | 5 votes |
def test_basic(self): fisher_exact = stats.fisher_exact res = fisher_exact([[14500, 20000], [30000, 40000]])[1] assert_approx_equal(res, 0.01106, significant=4) res = fisher_exact([[100, 2], [1000, 5]])[1] assert_approx_equal(res, 0.1301, significant=4) res = fisher_exact([[2, 7], [8, 2]])[1] assert_approx_equal(res, 0.0230141, significant=6) res = fisher_exact([[5, 1], [10, 10]])[1] assert_approx_equal(res, 0.1973244, significant=6) res = fisher_exact([[5, 15], [20, 20]])[1] assert_approx_equal(res, 0.0958044, significant=6) res = fisher_exact([[5, 16], [20, 25]])[1] assert_approx_equal(res, 0.1725862, significant=6) res = fisher_exact([[10, 5], [10, 1]])[1] assert_approx_equal(res, 0.1973244, significant=6) res = fisher_exact([[5, 0], [1, 4]])[1] assert_approx_equal(res, 0.04761904, significant=6) res = fisher_exact([[0, 1], [3, 2]])[1] assert_approx_equal(res, 1.0) res = fisher_exact([[0, 2], [6, 4]])[1] assert_approx_equal(res, 0.4545454545) res = fisher_exact([[2, 7], [8, 2]]) assert_approx_equal(res[1], 0.0230141, significant=6) assert_approx_equal(res[0], 4.0 / 56)
Example #20
Source File: predict_enriched_decision_tree.py From PIDGINv2 with MIT License | 5 votes |
def doTargetPrediction(pickled_model_name): if os.name == 'nt': sep = '\\' else: sep = '/' mod = pickled_model_name.split(sep)[-1].split('.')[0] clf = open_Model(mod) probs1 = map(int, clf.predict_proba(querymatrix1)[:,1] > threshold) preds1 = sum(probs1) preds2 = bg_preds[mod] oddsratio, pvalue = stats.fisher_exact([[preds2,2000000-preds2],[preds1,len(querymatrix1)-preds1]]) try: ratio, preds1_percentage, preds2_percentage = calcPredictionRatio(preds1,preds2) return ratio, mod, preds1, preds1_percentage, preds2, preds2_percentage, oddsratio, pvalue, probs1 except TypeError: return None #prediction runner
Example #21
Source File: test_stats.py From Computable with MIT License | 5 votes |
def test_row_or_col_zero(self): tables = ([[0, 0], [5, 10]], [[5, 10], [0, 0]], [[0, 5], [0, 10]], [[5, 0], [10, 0]]) for table in tables: oddsratio, pval = stats.fisher_exact(table) assert_equal(pval, 1.0) assert_equal(oddsratio, np.nan)
Example #22
Source File: test_stats.py From Computable with MIT License | 5 votes |
def test_raises(self): # test we raise an error for wrong shape of input. assert_raises(ValueError, stats.fisher_exact, np.arange(6).reshape(2, 3))
Example #23
Source File: test_stats.py From Computable with MIT License | 5 votes |
def test_large_numbers(self): # Test with some large numbers. Regression test for #1401 pvals = [5.56e-11, 2.666e-11, 1.363e-11] # from R for pval, num in zip(pvals, [75, 76, 77]): res = stats.fisher_exact([[17704, 496], [1065, num]])[1] assert_approx_equal(res, pval, significant=4) res = stats.fisher_exact([[18000, 80000], [20000, 90000]])[1] assert_approx_equal(res, 0.2751, significant=4)
Example #24
Source File: test_stats.py From Computable with MIT License | 5 votes |
def test_basic(self): fisher_exact = stats.fisher_exact res = fisher_exact([[14500, 20000], [30000, 40000]])[1] assert_approx_equal(res, 0.01106, significant=4) res = fisher_exact([[100, 2], [1000, 5]])[1] assert_approx_equal(res, 0.1301, significant=4) res = fisher_exact([[2, 7], [8, 2]])[1] assert_approx_equal(res, 0.0230141, significant=6) res = fisher_exact([[5, 1], [10, 10]])[1] assert_approx_equal(res, 0.1973244, significant=6) res = fisher_exact([[5, 15], [20, 20]])[1] assert_approx_equal(res, 0.0958044, significant=6) res = fisher_exact([[5, 16], [20, 25]])[1] assert_approx_equal(res, 0.1725862, significant=6) res = fisher_exact([[10, 5], [10, 1]])[1] assert_approx_equal(res, 0.1973244, significant=6) res = fisher_exact([[5, 0], [1, 4]])[1] assert_approx_equal(res, 0.04761904, significant=6) res = fisher_exact([[0, 1], [3, 2]])[1] assert_approx_equal(res, 1.0) res = fisher_exact([[0, 2], [6, 4]])[1] assert_approx_equal(res, 0.4545454545) res = fisher_exact([[2, 7], [8, 2]]) assert_approx_equal(res[1], 0.0230141, significant=6) assert_approx_equal(res[0], 4.0 / 56)
Example #25
Source File: multilinear.py From vnpy_crypto with MIT License | 5 votes |
def _test_group(pvalues, group_name, group, exact=True): """test if the objects in the group are different from the general set. The test is performed on the pvalues set (ad a pandas series) over the group specified via a fisher exact test. """ from scipy.stats import fisher_exact, chi2_contingency totals = 1.0 * len(pvalues) total_significant = 1.0 * np.sum(pvalues) cross_index = [c for c in group if c in pvalues.index] missing = [c for c in group if c not in pvalues.index] if missing: s = ('the test is not well defined if the group ' 'has elements not presents in the significativity ' 'array. group name: {}, missing elements: {}') logging.warning(s.format(group_name, missing)) # how many are significant and not in the group group_total = 1.0 * len(cross_index) group_sign = 1.0 * len([c for c in cross_index if pvalues[c]]) group_nonsign = 1.0 * (group_total - group_sign) # how many are significant and not outside the group extern_sign = 1.0 * (total_significant - group_sign) extern_nonsign = 1.0 * (totals - total_significant - group_nonsign) # make the fisher test or the chi squared test = fisher_exact if exact else chi2_contingency table = [[extern_nonsign, extern_sign], [group_nonsign, group_sign]] pvalue = test(np.array(table))[1] # is the group more represented or less? part = group_sign, group_nonsign, extern_sign, extern_nonsign #increase = (group_sign / group_total) > (total_significant / totals) increase = np.log((totals * group_sign) / (total_significant * group_total)) return pvalue, increase, part
Example #26
Source File: TermDocMatrix.py From scattertext with Apache License 2.0 | 5 votes |
def _get_fisher_scores_from_counts(self, cat_word_counts, not_cat_word_counts): cat_not_word_counts = cat_word_counts.sum() - cat_word_counts not_cat_not_word_counts = not_cat_word_counts.sum() - not_cat_word_counts def do_fisher_exact(x): return fisher_exact([[x[0], x[1]], [x[2], x[3]]], alternative='greater') odds_ratio, p_values = np.apply_along_axis( do_fisher_exact, 0, np.array([cat_word_counts, cat_not_word_counts, not_cat_word_counts, not_cat_not_word_counts])) return odds_ratio, p_values
Example #27
Source File: association.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def fisher(cls, *marginals): """Scores bigrams using Fisher's Exact Test (Pedersen 1996). Less sensitive to small counts than PMI or Chi Sq, but also more expensive to compute. Requires scipy. """ n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals) (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less') return pvalue
Example #28
Source File: utils.py From dynamo-release with BSD 3-Clause "New" or "Revised" License | 4 votes |
def TF_link_gene_chip(raw_glmnet_res_var, df_gene_TF_link_ENCODE, var, cor_thresh=0.02): """Filter the raw lasso regression links via chip-seq data based on a Fisher exact test""" glmnet_res_var_filtered = raw_glmnet_res_var.query("abs(corcoef) > @cor_thresh") print(f"\n Number of possible links: glmnet_res_var_filtered.shape[0]") # source - target gene pairs df_gene_TF_link_ENCODE['id_gene'] = df_gene_TF_link_ENCODE['id'].astype('str') + '_' + \ df_gene_TF_link_ENCODE['linked_gene_name'].astype('str') glmnet_res_var_filtered['id_gene'] = glmnet_res_var_filtered['id'].astype('str') + '_' + \ glmnet_res_var_filtered['linked_gene_name'].astype('str') df_gene_TF_link_ENCODE['glmnet_chip_links'] = df_gene_TF_link_ENCODE['id_gene'].isin(glmnet_res_var_filtered['id_gene']) unique_TFs = glmnet_res_var_filtered.id[glmnet_res_var_filtered.id.isin(df_gene_TF_link_ENCODE['id'])].unique() df_tb = pd.crosstab(df_gene_TF_link_ENCODE['glmnet_chip_links'], df_gene_TF_link_ENCODE['peak']) oddsratio, pvalue = stats.fisher_exact(df_tb) print(f'odd ratio and pvalue of Fisher exact test for regression identified regulations were validated by target ' f'TF-binding sites near gene promoters from ENCODE, limiting to TFs from lasso regression and also ' f'characterized in ENCODE are: {oddsratio}, {pvalue}') # Only gene sets with significant enrichment of the correct TF ChIP–seq binding sites were retained unique_TF_pvalue = [None] * len(unique_TFs) for i, tf in enumerate(unique_TFs): df_tmp = df_gene_TF_link_ENCODE.query("id == @tf") df_tb = pd.crosstab(df_tmp['glmnet_chip_links'], df_tmp['peak']) if df_tb.shape == (2, 2): _, pvalue = stats.fisher_exact(df_tb) unique_TF_pvalue[i] = pvalue else: unique_TF_pvalue[i] = 1 df_unique_TF = pd.DataFrame({"id": unique_TFs, "pvalue": unique_TF_pvalue}) df_unique_TF['qval'] = fdr(df_unique_TF['pvalue']) df_unique_TF = df_unique_TF.query("qval < 0.05") print(f"Number of positive TFs: {df_unique_TF.shape[0]}") validated_TF = df_unique_TF['id'].unique() df_gene_TF_link_chip = df_gene_TF_link_ENCODE.loc[df_gene_TF_link_ENCODE.id.isin(validated_TF)] df_gene_TF_link_chip = df_gene_TF_link_chip.merge(glmnet_res_var_filtered[['id_gene', 'corcoef', 'r_squre']]) df_gene = var[['gene_id', 'gene_short_name', 'gene_type']] df_gene.columns = ['linked_gene_id', 'linked_gene_name', df_gene.columns[2]] df_gene_TF_link_chip = df_gene_TF_link_chip.merge(df_gene) df_gene_TF_link_chip['Conf'] = "Chip_peak" df_gene_TF_link_chip['group'] = ["has_link" if i else "no_link" for i in df_gene_TF_link_chip['glmnet_chip_links']] df_gene_TF_link_chip = df_gene_TF_link_chip.query("group == 'has_link'") df_gene_TF_link_chip = df_gene_TF_link_chip[['id', 'linked_gene_name', 'Conf', 'linked_gene_id', 'gene_type', 'corcoef', 'r_squre', 'id_gene']] df_gene_TF_link_chip.columns = ['TF', 'linked_gene', 'Conf', 'linked_gene_id', 'linked_gene_type', 'corcoef', 'r_2', 'TF_link'] return df_gene_TF_link_chip
Example #29
Source File: mappfinder.py From altanalyze with Apache License 2.0 | 4 votes |
def FishersExactTest(r,n,R,N): """ N is the total number of genes measured (Ensembl linked from denom) (total number of ) (number of exons evaluated) R is the total number of genes meeting the criterion (Ensembl linked from input) (number of exonic/intronic regions overlaping with any CLIP peeks) n is the total number of genes in this specific MAPP (Ensembl denom in MAPP) (number of exonic/intronic regions associated with the SF) r is the number of genes meeting the criterion in this MAPP (Ensembl input in MAPP) (number of exonic/intronic regions with peeks overlapping with the SF) With these values, we must create a 2x2 contingency table for a Fisher's Exact Test that reports: +---+---+ a is the # of IDs in the term regulated | a | b | b is the # of IDs in the term not-regulated +---+---+ c is the # of IDs not-in-term and regulated | c | d | d is the # of IDs not-in-term and not-regulated +---+---+ If we know r=20, R=80, n=437 and N=14480 +----+-----+ | 20 | 417 | 437 +----+-----+ | 65 |13978| 14043 +----+-----+ 85 14395 14480 """ a = r; b = n-r; c=R-r; d=N-R-b table = [[int(a),int(b)], [int(c),int(d)]] """ print a,b; print c,d from stats_scripts import fishers_exact_test; table = [[a,b], [c,d]] ft = fishers_exact_test.FishersExactTest(table) print ft.probability_of_table(table); print ft.two_tail_p() print ft.right_tail_p(); print ft.left_tail_p() """ try: ### Scipy version - cuts down rutime by ~1/3rd the time with warnings.catch_warnings(): warnings.filterwarnings("ignore",category=RuntimeWarning) ### hides import warnings oddsratio, pvalue = stats.fisher_exact(table) return pvalue except Exception: ft = fishers_exact_test.FishersExactTest(table) return ft.two_tail_p()
Example #30
Source File: find_enriched_node.py From adage with BSD 3-Clause "New" or "Revised" License | 4 votes |
def find_enriched_node(geneList_folder = None, data_file= None, gold_std = None, out_file = None): ''' geneList_folder: the folder stores high-weight gene files for each node data_file: the microarray file with gene genes gold_std: gold standard file contains a list of genes out_file: the output file that stores each node and its corresponding q value ''' datasets = PCLfile(data_file, skip_col=0) gene_id = datasets.id_list gold_fh = open(gold_std,'r') gold_set = [] for line in gold_fh: gene = line.strip().split('\t')[0] gold_set.append(gene) p_all_node = [] geneList_files = glob.glob(geneList_folder + '/Node*.txt') #Get all the high-weight gene files under the geneList_folder for i in xrange(len(geneList_files)): gene_fh = open(geneList_folder + '/Node'+str(i+1)+'.txt','r') gene_fh.next() geneset = [] #geneset stores the high-weight gene for a node for line in gene_fh: gene = line.strip().split('\t')[0] geneset.append(gene) #Build the contengency table all_overlap_genes = set(gold_set).intersection(set(gene_id)) selected_overlap_genes = set(gold_set).intersection(set(geneset)) a = len(selected_overlap_genes) b = len(all_overlap_genes) - len(selected_overlap_genes) c = len(geneset) - len(selected_overlap_genes) d = len(gene_id) -a -b -c table = [[a,b],[c,d]] #Calculate p-value using fisher exact test oddsratio, pvalue = stats.fisher_exact(table) p_all_node.append(pvalue) #multiple hypothesis correction result_adj_pvalue = multipletests(p_all_node, alpha=0.05, method='fdr_bh')[1] all_node = ['Node'+str(x+1) for x in xrange(50)] #find the node with lowest q value, write the output qvalue_small = 1 node_small = None out_fh = open(out_file, 'w') for node, qvalue in zip(all_node, result_adj_pvalue): out_fh.write(node+'\t'+str(qvalue)+'\n') if qvalue < qvalue_small: qvalue_small = qvalue node_small = node print node_small+' is most significantly associated with this gene set with a q value of '+str(qvalue_small)