Python scipy.stats.fisher_exact() Examples

The following are 30 code examples of scipy.stats.fisher_exact(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scipy.stats , or try the search function .
Example #1
Source File: test_differential.py    From methplotlib with MIT License 6 votes vote down vote up
def old_method(a_range, b_range, bed):

    from scipy.stats import fisher_exact

    def meth_counts(interval):
        if not interval.empty:
            all, methylated = interval.as_df()[['calls', "methylated"]].sum()
            return [all - methylated, methylated]
        else:
            return [0, 0]

    rowdicts = []

    for chr, begin, end in bed.merge().as_df().itertuples(index=False, name=None):
        [d1, n1], [d2, n2] = meth_counts(
            a_range[chr, begin:end]), meth_counts(b_range[chr, begin:end])
        rowdicts.append({"Chromosome": chr, "Start": begin, "End": end,
                         "d1": d1, "d2": d2, "n1": n1, "n2": n2})
        # print(c1, c2)
        # print(a, b, c, d)
        # odr, pv = fisher_exact([c1, c2])
        # print(odr, pv)
        # rowdicts.append({"Chromosome": chr, "Start": begin, "End": end, "OR": odr, "P": pv})

    return pd.DataFrame.from_dict(rowdicts) 
Example #2
Source File: PipelineGSEnrichment.py    From CGATPipelines with MIT License 6 votes vote down vote up
def run(self):
        A = self.GenesWith & self.foreground
        B = self.GenesWithout & self.foreground
        C = self.GenesWith & self.background
        D = self.GenesWithout & self.background

        fishlist = ((len(A), len(B)), (len(C), len(D)))

        FT = stats.fisher_exact(fishlist)
        OR, p = FT
        padj = self.correct(p)
        if padj > 1:
            padj = 1
        significant = self.sig(padj)

        return OR, p, padj, significant, fishlist, A, C

# functions below here correspond to specific steps in the
# pipeline_enrichment pipeline - they are written as functions
# so the cluster_runnable decorater can be used. 
Example #3
Source File: test_stats.py    From Computable with MIT License 6 votes vote down vote up
def test_precise(self):
        # results from R
        #
        # R defines oddsratio differently (see Notes section of fisher_exact
        # docstring), so those will not match.  We leave them in anyway, in
        # case they will be useful later on. We test only the p-value.
        tablist = [
            ([[100, 2], [1000, 5]], (2.505583993422285e-001, 1.300759363430016e-001)),
            ([[2, 7], [8, 2]], (8.586235135736206e-002, 2.301413756522114e-002)),
            ([[5, 1], [10, 10]], (4.725646047336584e+000, 1.973244147157190e-001)),
            ([[5, 15], [20, 20]], (3.394396617440852e-001, 9.580440012477637e-002)),
            ([[5, 16], [20, 25]], (3.960558326183334e-001, 1.725864953812994e-001)),
            ([[10, 5], [10, 1]], (2.116112781158483e-001, 1.973244147157190e-001)),
            ([[10, 5], [10, 0]], (0.000000000000000e+000, 6.126482213438734e-002)),
            ([[5, 0], [1, 4]], (np.inf, 4.761904761904762e-002)),
            ([[0, 5], [1, 4]], (0.000000000000000e+000, 1.000000000000000e+000)),
            ([[5, 1], [0, 4]], (np.inf, 4.761904761904758e-002)),
            ([[0, 1], [3, 2]], (0.000000000000000e+000, 1.000000000000000e+000))
            ]
        for table, res_r in tablist:
            res = stats.fisher_exact(np.asarray(table))
            np.testing.assert_almost_equal(res[1], res_r[1], decimal=11,
                                           verbose=True) 
Example #4
Source File: chicDifferentialTest.py    From HiCExplorer with GNU General Public License v3.0 6 votes vote down vote up
def fisher_exact_test(pDataFile1, pDataFile2, pAlpha):

    test_result = []
    accepted = []
    rejected = []
    for i, (group1, group2) in enumerate(zip(pDataFile1, pDataFile2)):
        try:
            odds, p_value = stats.fisher_exact(np.ceil([group1, group2]))
            if p_value <= pAlpha:
                test_result.append(p_value)
                rejected.append([i, p_value])
            else:
                test_result.append(p_value)
                accepted.append([i, p_value])
        except ValueError:
            test_result.append(np.nan)
            accepted.append([i, 1.0])
    return test_result, accepted, rejected 
Example #5
Source File: predict_enriched_two_libraries_decision_tree.py    From PIDGINv2 with MIT License 6 votes vote down vote up
def doTargetPrediction(pickled_model_name):
	if os.name == 'nt': sep = '\\'
	else: sep = '/'
	mod = pickled_model_name.split(sep)[-1].split('.')[0]
	clf = open_Model(mod)
	probs1 = map(int, clf.predict_proba(querymatrix1)[:,1] > threshold)
	preds1 = sum(probs1)
	probs2 = map(int, clf.predict_proba(querymatrix2)[:,1] > threshold)
	preds2 = sum(probs2)
	oddsratio, pvalue = stats.fisher_exact([[preds2,len(querymatrix2)-preds2],[preds1,len(querymatrix1)-preds1]])
	try:
		ratio, preds1_percentage, preds2_percentage = calcPredictionRatio(preds1,preds2)
		return ratio, mod, preds1, preds1_percentage, preds2, preds2_percentage, oddsratio, pvalue, probs1 + probs2
	except TypeError: return None

#prediction runner 
Example #6
Source File: test_stats.py    From GraphicDesignPatternByPython with MIT License 6 votes vote down vote up
def test_precise(self):
        # results from R
        #
        # R defines oddsratio differently (see Notes section of fisher_exact
        # docstring), so those will not match.  We leave them in anyway, in
        # case they will be useful later on. We test only the p-value.
        tablist = [
            ([[100, 2], [1000, 5]], (2.505583993422285e-001, 1.300759363430016e-001)),
            ([[2, 7], [8, 2]], (8.586235135736206e-002, 2.301413756522114e-002)),
            ([[5, 1], [10, 10]], (4.725646047336584e+000, 1.973244147157190e-001)),
            ([[5, 15], [20, 20]], (3.394396617440852e-001, 9.580440012477637e-002)),
            ([[5, 16], [20, 25]], (3.960558326183334e-001, 1.725864953812994e-001)),
            ([[10, 5], [10, 1]], (2.116112781158483e-001, 1.973244147157190e-001)),
            ([[10, 5], [10, 0]], (0.000000000000000e+000, 6.126482213438734e-002)),
            ([[5, 0], [1, 4]], (np.inf, 4.761904761904762e-002)),
            ([[0, 5], [1, 4]], (0.000000000000000e+000, 1.000000000000000e+000)),
            ([[5, 1], [0, 4]], (np.inf, 4.761904761904758e-002)),
            ([[0, 1], [3, 2]], (0.000000000000000e+000, 1.000000000000000e+000))
            ]
        for table, res_r in tablist:
            res = stats.fisher_exact(np.asarray(table))
            np.testing.assert_almost_equal(res[1], res_r[1], decimal=11,
                                           verbose=True) 
Example #7
Source File: test_stats.py    From GraphicDesignPatternByPython with MIT License 5 votes vote down vote up
def test_large_numbers(self):
        # Test with some large numbers. Regression test for #1401
        pvals = [5.56e-11, 2.666e-11, 1.363e-11]  # from R
        for pval, num in zip(pvals, [75, 76, 77]):
            res = stats.fisher_exact([[17704, 496], [1065, num]])[1]
            assert_approx_equal(res, pval, significant=4)

        res = stats.fisher_exact([[18000, 80000], [20000, 90000]])[1]
        assert_approx_equal(res, 0.2751, significant=4) 
Example #8
Source File: association.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 5 votes vote down vote up
def fisher(cls, *marginals):
        """Scores bigrams using Fisher's Exact Test (Pedersen 1996).  Less
        sensitive to small counts than PMI or Chi Sq, but also more expensive
        to compute. Requires scipy.
        """

        n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

        (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
        return pvalue 
Example #9
Source File: association.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 5 votes vote down vote up
def fisher_exact(*_args, **_kwargs):
        raise NotImplementedError


### Indices to marginals arguments: 
Example #10
Source File: predict_enriched.py    From PIDGINv3 with GNU General Public License v3.0 5 votes vote down vote up
def calcProportionPredsPercentageFisher(model_name,p1,p2):
	#calculate proportion of nan (outside ad) compounds in each set
	propnanp1 = round((float(sum(np.isnan(p1))) / float(len(p1))),3)
	propnanp2 = round((float(sum(np.isnan(p2))) / float(len(p2))),3)
	#filter out nans
	p1 = p1[np.isfinite(p1)]
	p2 = p2[np.isfinite(p2)]
	#calculate active and inactive predictions for first file
	p1_0, p1_1 = len(p1)-sum(p1), sum(p1)
	#calculate active and inactive predictions for second file
	p2_0, p2_1 = len(p2)-sum(p2), sum(p2)
	#return if no compounds in either set due to ad filter
	if p1_0+p1_1 ==0 or p2_0+p2_1 ==0: return None
	#if no actives in either set return none
	if p1_1 == 0 and p2_1 == 0: return None
	oddsratio, pvalue = fisher_exact([[p1_1,p1_0],[p2_1,p2_0]])
	#if no inactives in either set then set risk & odds to 1.0 and pval to NA
	if p1_0 == 0 and p2_0 == 0: rr, oddsratio, pvalue = 1.0, 'NA'
	else:
		#calculate risk ratio
		try: rr = (float(p1_1)/(float(p1_1)+float(p1_0)))/(float(p2_1)/(float(p2_1)+float(p2_0)))
		except ZeroDivisionError: rr = np.inf
	#calculate percentages
	pcp1_0, pcp1_1 = round(float(p1_0)/float(len(p1)),3), round(float(p1_1)/float(len(p1)),3)
	pcp2_0, pcp2_1 = round(float(p2_0)/float(len(p2)),3), round(float(p2_1)/float(len(p2)),3)
	return oddsratio, model_name, p1_0, pcp1_0, p1_1, pcp1_1, p2_0, pcp2_0, \
	p2_1, pcp2_1, propnanp1, propnanp2, rr, pvalue 
Example #11
Source File: stats.py    From audit-ai with MIT License 5 votes vote down vote up
def fisher_exact_test(labels, results, threshold=None):
    """
    Returns odds ratio and p-value of Fisher's exact test
    Uses scipy.stats.fisher_exact, which only works for 2x2 contingency tables
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html

    Parameters
    ----------
    labels : array_like
        categorical labels for each corresponding value of `result` ie. M/F

    results : array_like
        binary decision values, if continuous values are supplied then
        the `threshold` must also be supplied to generate binary decisions

    threshold : numeric
        value dividing scores into True/False, where result>=threshold == True

    Returns
    -------
    oddsratio : float
        This is prior odds ratio and not a posterior estimate.
    pvalue : float
        P-value, the probability of obtaining a distribution at least
        as extreme as the one that was actually observed, assuming that
        the null hypothesis is true.
    """

    check_consistent_length(labels, results)
    results = np.array(results)

    # convert the results to True/False
    results = boolean_array(results, threshold=threshold)
    # get crosstab for two groups
    ctab = top_bottom_crosstab(labels, results)

    oddsratio, pvalue = fisher_exact(ctab)
    return oddsratio, pvalue 
Example #12
Source File: multilinear.py    From Splunking-Crime with GNU Affero General Public License v3.0 5 votes vote down vote up
def _test_group(pvalues, group_name, group, exact=True):
    """test if the objects in the group are different from the general set.

    The test is performed on the pvalues set (ad a pandas series) over
    the group specified via a fisher exact test.
    """
    from scipy.stats import fisher_exact, chi2_contingency

    totals = 1.0 * len(pvalues)
    total_significant = 1.0 * np.sum(pvalues)
    cross_index = [c for c in group if c in pvalues.index]
    missing = [c for c in group if c not in pvalues.index]
    if missing:
        s = ('the test is not well defined if the group '
             'has elements not presents in the significativity '
             'array. group name: {}, missing elements: {}')
        logging.warning(s.format(group_name, missing))
    # how many are significant and not in the group
    group_total = 1.0 * len(cross_index)
    group_sign = 1.0 * len([c for c in cross_index if pvalues[c]])
    group_nonsign = 1.0 * (group_total - group_sign)
    # how many are significant and not outside the group
    extern_sign = 1.0 * (total_significant - group_sign)
    extern_nonsign = 1.0 * (totals - total_significant - group_nonsign)
    # make the fisher test or the chi squared
    test = fisher_exact if exact else chi2_contingency
    table = [[extern_nonsign, extern_sign], [group_nonsign, group_sign]]
    pvalue = test(np.array(table))[1]
    # is the group more represented or less?
    part = group_sign, group_nonsign, extern_sign, extern_nonsign
    #increase = (group_sign / group_total) > (total_significant / totals)
    increase = np.log((totals * group_sign)
                      / (total_significant * group_total))
    return pvalue, increase, part 
Example #13
Source File: MPileUpVariantCaller.py    From cDNA_Cupcake with BSD 3-Clause Clear License 5 votes vote down vote up
def call_variant(self):
        """
        mirrors AminoAcidCaller::CallVariants() in
        https://github.com/PacificBiosciences/minorseq/blob/develop/src/AminoAcidCaller.cpp

        For each position (that has sufficient coverage),
         do Fisher exact test w/ correction
         if p-val < threshold, then store it.

        Stores results in self.variant as:

        self.variant[position] = desc list of (base, count).
        NOTE: base must be either all in lower case (which means - strand)
              or call upper case (+ strand).
              If - strand and ('a', 10), it means the ref base in A on the + strand,
              and the transcript should be T on the - strand.

        Only positions with more than the ref base is stored.
        """
        for pos in self.positions_to_call:
            r = self.record_by_pos[pos]
            alt_variant = []
            for base, count in r.clean_counts.most_common()[1:]:
                assert not base.startswith('+') and not base.startswith('-') # clean counts should NOT have indels
                exp = r.clean_cov * self.err_sub
                odds, pval = stats.fisher_exact([[count, r.clean_cov-count], [exp, r.clean_cov-exp]], alternative='greater')
                pval *= self.number_of_tests
                if pval < self.pval_cutoff: # store variant if below cutoff
                    alt_variant.append((base, count))
            if len(alt_variant) > 0: # only record this variant if there's at least two haps
                self.variant[pos] = [r.clean_counts.most_common()[0]] + alt_variant
                self.ref_base[pos] = r.ref 
Example #14
Source File: vectorizer.py    From scan with GNU Affero General Public License v3.0 5 votes vote down vote up
def get_vocab(self, vectorizer, input_text, input_scores):
        train_mat = vectorizer.transform(input_text)
        input_score_med = np.median(input_scores)
        new_scores = [0 if i <= input_score_med else 1 for i in input_scores]

        pvalues = []
        for i in range(0, train_mat.shape[1]):
            lcol = np.asarray(train_mat.getcol(i).todense().transpose())[0]
            good_lcol = lcol[[n for n in range(0, len(new_scores)) if new_scores[n] == 1]]
            bad_lcol = lcol[[n for n in range(0, len(new_scores)) if new_scores[n] == 0]]
            good_lcol_present = len(good_lcol[good_lcol > 0])
            good_lcol_missing = len(good_lcol[good_lcol == 0])
            bad_lcol_present = len(bad_lcol[bad_lcol > 0])
            bad_lcol_missing = len(bad_lcol[bad_lcol == 0])
            oddsratio, pval = fisher_exact([[good_lcol_present, bad_lcol_present], [good_lcol_missing, bad_lcol_missing]])
            pvalues.append(pval)

        col_inds = range(0, train_mat.shape[1])
        p_frame = np.array([col_inds, pvalues]).transpose()
        p_frame = p_frame[p_frame[:, 1].argsort()]

        rows = p_frame.shape[0]
        selection = self.max_features
        if rows < selection:
            selection = rows

        getVar = lambda searchList, ind: [searchList[int(i)] for i in ind]
        vocab = getVar(vectorizer.get_feature_names(), p_frame[:, 0][-selection:])
        return vocab 
Example #15
Source File: test_stats.py    From GraphicDesignPatternByPython with MIT License 5 votes vote down vote up
def test_gh3014(self):
        # check if issue #3014 has been fixed.
        # before, this would have risen a ValueError
        odds, pvalue = stats.fisher_exact([[1, 2], [9, 84419233]]) 
Example #16
Source File: test_stats.py    From GraphicDesignPatternByPython with MIT License 5 votes vote down vote up
def test_row_or_col_zero(self):
        tables = ([[0, 0], [5, 10]],
                  [[5, 10], [0, 0]],
                  [[0, 5], [0, 10]],
                  [[5, 0], [10, 0]])
        for table in tables:
            oddsratio, pval = stats.fisher_exact(table)
            assert_equal(pval, 1.0)
            assert_equal(oddsratio, np.nan) 
Example #17
Source File: test_stats.py    From GraphicDesignPatternByPython with MIT License 5 votes vote down vote up
def test_raises(self):
        # test we raise an error for wrong shape of input.
        assert_raises(ValueError, stats.fisher_exact,
                      np.arange(6).reshape(2, 3)) 
Example #18
Source File: association.py    From razzy-spinner with GNU General Public License v3.0 5 votes vote down vote up
def fisher_exact(*_args, **_kwargs):
        raise NotImplementedError

### Indices to marginals arguments: 
Example #19
Source File: test_stats.py    From GraphicDesignPatternByPython with MIT License 5 votes vote down vote up
def test_basic(self):
        fisher_exact = stats.fisher_exact

        res = fisher_exact([[14500, 20000], [30000, 40000]])[1]
        assert_approx_equal(res, 0.01106, significant=4)
        res = fisher_exact([[100, 2], [1000, 5]])[1]
        assert_approx_equal(res, 0.1301, significant=4)
        res = fisher_exact([[2, 7], [8, 2]])[1]
        assert_approx_equal(res, 0.0230141, significant=6)
        res = fisher_exact([[5, 1], [10, 10]])[1]
        assert_approx_equal(res, 0.1973244, significant=6)
        res = fisher_exact([[5, 15], [20, 20]])[1]
        assert_approx_equal(res, 0.0958044, significant=6)
        res = fisher_exact([[5, 16], [20, 25]])[1]
        assert_approx_equal(res, 0.1725862, significant=6)
        res = fisher_exact([[10, 5], [10, 1]])[1]
        assert_approx_equal(res, 0.1973244, significant=6)
        res = fisher_exact([[5, 0], [1, 4]])[1]
        assert_approx_equal(res, 0.04761904, significant=6)
        res = fisher_exact([[0, 1], [3, 2]])[1]
        assert_approx_equal(res, 1.0)
        res = fisher_exact([[0, 2], [6, 4]])[1]
        assert_approx_equal(res, 0.4545454545)
        res = fisher_exact([[2, 7], [8, 2]])
        assert_approx_equal(res[1], 0.0230141, significant=6)
        assert_approx_equal(res[0], 4.0 / 56) 
Example #20
Source File: predict_enriched_decision_tree.py    From PIDGINv2 with MIT License 5 votes vote down vote up
def doTargetPrediction(pickled_model_name):
	if os.name == 'nt': sep = '\\'
	else: sep = '/'
	mod = pickled_model_name.split(sep)[-1].split('.')[0]
	clf = open_Model(mod)
	probs1 = map(int, clf.predict_proba(querymatrix1)[:,1] > threshold)
	preds1 = sum(probs1)
	preds2 = bg_preds[mod]
	oddsratio, pvalue = stats.fisher_exact([[preds2,2000000-preds2],[preds1,len(querymatrix1)-preds1]])
	try:
		ratio, preds1_percentage, preds2_percentage = calcPredictionRatio(preds1,preds2)
		return ratio, mod, preds1, preds1_percentage, preds2, preds2_percentage, oddsratio, pvalue, probs1
	except TypeError: return None

#prediction runner 
Example #21
Source File: test_stats.py    From Computable with MIT License 5 votes vote down vote up
def test_row_or_col_zero(self):
        tables = ([[0, 0], [5, 10]],
                  [[5, 10], [0, 0]],
                  [[0, 5], [0, 10]],
                  [[5, 0], [10, 0]])
        for table in tables:
            oddsratio, pval = stats.fisher_exact(table)
            assert_equal(pval, 1.0)
            assert_equal(oddsratio, np.nan) 
Example #22
Source File: test_stats.py    From Computable with MIT License 5 votes vote down vote up
def test_raises(self):
        # test we raise an error for wrong shape of input.
        assert_raises(ValueError, stats.fisher_exact,
                      np.arange(6).reshape(2, 3)) 
Example #23
Source File: test_stats.py    From Computable with MIT License 5 votes vote down vote up
def test_large_numbers(self):
        # Test with some large numbers. Regression test for #1401
        pvals = [5.56e-11, 2.666e-11, 1.363e-11]  # from R
        for pval, num in zip(pvals, [75, 76, 77]):
            res = stats.fisher_exact([[17704, 496], [1065, num]])[1]
            assert_approx_equal(res, pval, significant=4)

        res = stats.fisher_exact([[18000, 80000], [20000, 90000]])[1]
        assert_approx_equal(res, 0.2751, significant=4) 
Example #24
Source File: test_stats.py    From Computable with MIT License 5 votes vote down vote up
def test_basic(self):
        fisher_exact = stats.fisher_exact

        res = fisher_exact([[14500, 20000], [30000, 40000]])[1]
        assert_approx_equal(res, 0.01106, significant=4)
        res = fisher_exact([[100, 2], [1000, 5]])[1]
        assert_approx_equal(res, 0.1301, significant=4)
        res = fisher_exact([[2, 7], [8, 2]])[1]
        assert_approx_equal(res, 0.0230141, significant=6)
        res = fisher_exact([[5, 1], [10, 10]])[1]
        assert_approx_equal(res, 0.1973244, significant=6)
        res = fisher_exact([[5, 15], [20, 20]])[1]
        assert_approx_equal(res, 0.0958044, significant=6)
        res = fisher_exact([[5, 16], [20, 25]])[1]
        assert_approx_equal(res, 0.1725862, significant=6)
        res = fisher_exact([[10, 5], [10, 1]])[1]
        assert_approx_equal(res, 0.1973244, significant=6)
        res = fisher_exact([[5, 0], [1, 4]])[1]
        assert_approx_equal(res, 0.04761904, significant=6)
        res = fisher_exact([[0, 1], [3, 2]])[1]
        assert_approx_equal(res, 1.0)
        res = fisher_exact([[0, 2], [6, 4]])[1]
        assert_approx_equal(res, 0.4545454545)
        res = fisher_exact([[2, 7], [8, 2]])
        assert_approx_equal(res[1], 0.0230141, significant=6)
        assert_approx_equal(res[0], 4.0 / 56) 
Example #25
Source File: multilinear.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def _test_group(pvalues, group_name, group, exact=True):
    """test if the objects in the group are different from the general set.

    The test is performed on the pvalues set (ad a pandas series) over
    the group specified via a fisher exact test.
    """
    from scipy.stats import fisher_exact, chi2_contingency

    totals = 1.0 * len(pvalues)
    total_significant = 1.0 * np.sum(pvalues)
    cross_index = [c for c in group if c in pvalues.index]
    missing = [c for c in group if c not in pvalues.index]
    if missing:
        s = ('the test is not well defined if the group '
             'has elements not presents in the significativity '
             'array. group name: {}, missing elements: {}')
        logging.warning(s.format(group_name, missing))
    # how many are significant and not in the group
    group_total = 1.0 * len(cross_index)
    group_sign = 1.0 * len([c for c in cross_index if pvalues[c]])
    group_nonsign = 1.0 * (group_total - group_sign)
    # how many are significant and not outside the group
    extern_sign = 1.0 * (total_significant - group_sign)
    extern_nonsign = 1.0 * (totals - total_significant - group_nonsign)
    # make the fisher test or the chi squared
    test = fisher_exact if exact else chi2_contingency
    table = [[extern_nonsign, extern_sign], [group_nonsign, group_sign]]
    pvalue = test(np.array(table))[1]
    # is the group more represented or less?
    part = group_sign, group_nonsign, extern_sign, extern_nonsign
    #increase = (group_sign / group_total) > (total_significant / totals)
    increase = np.log((totals * group_sign)
                      / (total_significant * group_total))
    return pvalue, increase, part 
Example #26
Source File: TermDocMatrix.py    From scattertext with Apache License 2.0 5 votes vote down vote up
def _get_fisher_scores_from_counts(self, cat_word_counts, not_cat_word_counts):
        cat_not_word_counts = cat_word_counts.sum() - cat_word_counts
        not_cat_not_word_counts = not_cat_word_counts.sum() - not_cat_word_counts

        def do_fisher_exact(x):
            return fisher_exact([[x[0], x[1]], [x[2], x[3]]], alternative='greater')

        odds_ratio, p_values = np.apply_along_axis(
            do_fisher_exact,
            0,
            np.array([cat_word_counts, cat_not_word_counts, not_cat_word_counts, not_cat_not_word_counts]))
        return odds_ratio, p_values 
Example #27
Source File: association.py    From razzy-spinner with GNU General Public License v3.0 5 votes vote down vote up
def fisher(cls, *marginals):
        """Scores bigrams using Fisher's Exact Test (Pedersen 1996).  Less
        sensitive to small counts than PMI or Chi Sq, but also more expensive
        to compute. Requires scipy.
        """

        n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)

        (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
        return pvalue 
Example #28
Source File: utils.py    From dynamo-release with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def TF_link_gene_chip(raw_glmnet_res_var, df_gene_TF_link_ENCODE, var, cor_thresh=0.02):
    """Filter the raw lasso regression links via chip-seq data based on a Fisher exact test"""

    glmnet_res_var_filtered = raw_glmnet_res_var.query("abs(corcoef) > @cor_thresh")
    print(f"\n Number of possible links: glmnet_res_var_filtered.shape[0]")

    # source - target gene pairs
    df_gene_TF_link_ENCODE['id_gene'] = df_gene_TF_link_ENCODE['id'].astype('str') + '_' + \
                                        df_gene_TF_link_ENCODE['linked_gene_name'].astype('str')
    glmnet_res_var_filtered['id_gene'] = glmnet_res_var_filtered['id'].astype('str') + '_' + \
                                       glmnet_res_var_filtered['linked_gene_name'].astype('str')

    df_gene_TF_link_ENCODE['glmnet_chip_links'] = df_gene_TF_link_ENCODE['id_gene'].isin(glmnet_res_var_filtered['id_gene'])
    unique_TFs = glmnet_res_var_filtered.id[glmnet_res_var_filtered.id.isin(df_gene_TF_link_ENCODE['id'])].unique()

    df_tb = pd.crosstab(df_gene_TF_link_ENCODE['glmnet_chip_links'], df_gene_TF_link_ENCODE['peak'])
    oddsratio, pvalue = stats.fisher_exact(df_tb)
    print(f'odd ratio and pvalue of Fisher exact test for regression identified regulations were validated by target '
          f'TF-binding sites near gene promoters from ENCODE, limiting to TFs from lasso regression and also '
          f'characterized in ENCODE are: {oddsratio}, {pvalue}')

    # Only gene sets with significant enrichment of the correct TF ChIP–seq binding sites were retained
    unique_TF_pvalue = [None] * len(unique_TFs)
    for i, tf in enumerate(unique_TFs):
        df_tmp = df_gene_TF_link_ENCODE.query("id == @tf")
        df_tb = pd.crosstab(df_tmp['glmnet_chip_links'], df_tmp['peak'])
        if df_tb.shape == (2, 2):
            _, pvalue = stats.fisher_exact(df_tb)
            unique_TF_pvalue[i] = pvalue
        else:
            unique_TF_pvalue[i] = 1

    df_unique_TF = pd.DataFrame({"id": unique_TFs, "pvalue": unique_TF_pvalue})
    df_unique_TF['qval'] = fdr(df_unique_TF['pvalue'])
    df_unique_TF = df_unique_TF.query("qval < 0.05")

    print(f"Number of positive TFs: {df_unique_TF.shape[0]}")

    validated_TF = df_unique_TF['id'].unique()
    df_gene_TF_link_chip = df_gene_TF_link_ENCODE.loc[df_gene_TF_link_ENCODE.id.isin(validated_TF)]
    df_gene_TF_link_chip = df_gene_TF_link_chip.merge(glmnet_res_var_filtered[['id_gene', 'corcoef', 'r_squre']])
    df_gene = var[['gene_id', 'gene_short_name', 'gene_type']]
    df_gene.columns = ['linked_gene_id', 'linked_gene_name', df_gene.columns[2]]

    df_gene_TF_link_chip = df_gene_TF_link_chip.merge(df_gene)
    df_gene_TF_link_chip['Conf'] = "Chip_peak"
    df_gene_TF_link_chip['group'] = ["has_link" if i else "no_link" for i in df_gene_TF_link_chip['glmnet_chip_links']]

    df_gene_TF_link_chip = df_gene_TF_link_chip.query("group == 'has_link'")
    df_gene_TF_link_chip = df_gene_TF_link_chip[['id', 'linked_gene_name', 'Conf', 'linked_gene_id',
                                                 'gene_type', 'corcoef', 'r_squre', 'id_gene']]
    df_gene_TF_link_chip.columns = ['TF', 'linked_gene', 'Conf', 'linked_gene_id', 'linked_gene_type',
                                    'corcoef', 'r_2', 'TF_link']

    return df_gene_TF_link_chip 
Example #29
Source File: mappfinder.py    From altanalyze with Apache License 2.0 4 votes vote down vote up
def FishersExactTest(r,n,R,N):
    """
    N is the total number of genes measured (Ensembl linked from denom) (total number of ) (number of exons evaluated)
    R is the total number of genes meeting the criterion (Ensembl linked from input) (number of exonic/intronic regions overlaping with any CLIP peeks)
    n is the total number of genes in this specific MAPP (Ensembl denom in MAPP) (number of exonic/intronic regions associated with the SF)
    r is the number of genes meeting the criterion in this MAPP (Ensembl input in MAPP) (number of exonic/intronic regions with peeks overlapping with the SF)
    
    With these values, we must create a 2x2 contingency table for a Fisher's Exact Test
    that reports:
     
    +---+---+    a is the # of IDs in the term regulated
    | a | b |    b is the # of IDs in the term not-regulated 
    +---+---+    c is the # of IDs not-in-term and regulated
    | c | d |    d is the # of IDs not-in-term and not-regulated
    +---+---+
    
    If we know r=20, R=80, n=437 and N=14480
    +----+-----+    
    | 20 | 417 |  437   
    +----+-----+    
    | 65 |13978| 14043
    +----+-----+
      85  14395  14480
    """

    a = r; b = n-r; c=R-r; d=N-R-b
    table = [[int(a),int(b)], [int(c),int(d)]]

    """
    print a,b; print c,d
    from stats_scripts import fishers_exact_test; table = [[a,b], [c,d]]
    ft = fishers_exact_test.FishersExactTest(table)
    print ft.probability_of_table(table); print ft.two_tail_p()
    print ft.right_tail_p(); print ft.left_tail_p()
    """
    
    try: ### Scipy version - cuts down rutime by ~1/3rd the time
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore",category=RuntimeWarning) ### hides import warnings
            oddsratio, pvalue = stats.fisher_exact(table)
        return pvalue
    except Exception:
        ft = fishers_exact_test.FishersExactTest(table)
        return ft.two_tail_p() 
Example #30
Source File: find_enriched_node.py    From adage with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def find_enriched_node(geneList_folder = None, data_file= None, gold_std = None, out_file = None):
    '''
    geneList_folder: the folder stores high-weight gene files for each node
    data_file: the microarray file with gene genes
    gold_std: gold standard file contains a list of genes
    out_file: the output file that stores each node and its corresponding q value
    '''

    datasets = PCLfile(data_file, skip_col=0)
    gene_id = datasets.id_list  

    gold_fh = open(gold_std,'r')
    gold_set = []
    for line in gold_fh:
        gene = line.strip().split('\t')[0]
        gold_set.append(gene)

    p_all_node = []
    geneList_files =  glob.glob(geneList_folder + '/Node*.txt') #Get all the high-weight gene files under the geneList_folder
    for i in xrange(len(geneList_files)):
        gene_fh = open(geneList_folder + '/Node'+str(i+1)+'.txt','r')
        gene_fh.next()
        geneset = [] #geneset stores the high-weight gene for a node
        for line in gene_fh:
            gene = line.strip().split('\t')[0]
            geneset.append(gene)
        #Build the contengency table
        all_overlap_genes = set(gold_set).intersection(set(gene_id))
        selected_overlap_genes = set(gold_set).intersection(set(geneset))
        a = len(selected_overlap_genes)
        b = len(all_overlap_genes) - len(selected_overlap_genes)
        c = len(geneset) - len(selected_overlap_genes)
        d = len(gene_id) -a -b -c
        table = [[a,b],[c,d]]
        #Calculate p-value using fisher exact test
        oddsratio, pvalue = stats.fisher_exact(table)
        p_all_node.append(pvalue)   

    #multiple hypothesis correction
    result_adj_pvalue = multipletests(p_all_node, alpha=0.05, method='fdr_bh')[1] 
    all_node = ['Node'+str(x+1) for x in xrange(50)]
    #find the node with lowest q value, write the output
    qvalue_small = 1 
    node_small = None
    out_fh = open(out_file, 'w')
    for node, qvalue in zip(all_node, result_adj_pvalue):
        out_fh.write(node+'\t'+str(qvalue)+'\n')
        if qvalue < qvalue_small:
            qvalue_small = qvalue
            node_small = node
    print node_small+' is most significantly associated with this gene set with a q value of '+str(qvalue_small)