Python scipy.stats.f_oneway() Examples

The following are 21 code examples of scipy.stats.f_oneway(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scipy.stats, or try the search function.
Example #1
Source File: TargetAnalysisCategorical.py    From exploripy with MIT License 8 votes vote down vote up
def Anova(self):
		"""
		Run a one-way ANOVA of every continuous feature against the target column.

		For each name in ``self.ContinuousFeatures`` the rows of ``self.df`` are
		grouped by the distinct values of ``self.target`` (rows with missing
		values in either column are dropped first) and the groups are passed to
		``stats.f_oneway``.

		Output --> DataFrame with columns ``Continuous`` and ``PValue`` holding the
		features whose p-value is <= 0.05, sorted by ascending p-value. Features
		for which the test cannot be computed are skipped. The frame is empty
		when no test could be computed at all.
		"""
		target = self.target
		AnovaList = []
		for ContinuousVar in self.ContinuousFeatures:
			temp_df = self.df[[ContinuousVar, target]].dropna()
			try:
				groups = [list(temp_df[temp_df[target]==name][ContinuousVar]) for name in set(temp_df[target])]
				_, p = stats.f_oneway(*groups)
			except Exception:
				# Best effort: a feature that cannot be tested is skipped on purpose.
				# Only ``Exception`` is caught so that KeyboardInterrupt/SystemExit
				# still propagate (the former bare ``except`` swallowed them too).
				continue
			AnovaList.append(dict(Continuous = ContinuousVar, PValue = p))

		Anova_df = pd.DataFrame(AnovaList)
		if Anova_df.shape[0]>0:
			# Filter and sort into a new frame instead of sorting a filtered
			# slice in place, which triggers pandas' chained-assignment warnings.
			Anova_df = Anova_df[Anova_df['PValue']<=0.05].sort_values(['PValue'], ascending = True)

		return Anova_df
Example #2
Source File: TargetAnalysisContinuous.py    From exploripy with MIT License 6 votes vote down vote up
def Anova(self):
		"""
		Run a one-way ANOVA of the target column against every categorical feature.

		For each name in ``self.CategoricalFeatures`` the values of ``self.target``
		are grouped by the distinct values of that feature (rows with missing
		values in either column are dropped first) and the groups are passed to
		``stats.f_oneway``.

		Output --> DataFrame with columns ``Categorical`` and ``PValue`` holding the
		categorical features whose p-value is <= 0.05, sorted by ascending
		p-value. Features for which the test cannot be computed are skipped. The
		frame is empty when no test could be computed at all.
		"""
		target = self.target
		AnovaList = []
		print('Performing ANOVA...')
		for CategoricalVar in tqdm(self.CategoricalFeatures):
			temp_df = self.df[[CategoricalVar, target]].dropna()
			try:
				groups = [list(temp_df[temp_df[CategoricalVar]==name][target]) for name in set(temp_df[CategoricalVar])]
				_, p = stats.f_oneway(*groups)
			except Exception:
				# Best effort: a feature that cannot be tested is skipped on purpose.
				# Only ``Exception`` is caught so that KeyboardInterrupt/SystemExit
				# still propagate (the former bare ``except`` swallowed them too).
				continue
			AnovaList.append(dict(Categorical = CategoricalVar, PValue = p))

		Anova_df = pd.DataFrame(AnovaList)
		if Anova_df.shape[0]>0:
			# Filter and sort into a new frame instead of sorting a filtered
			# slice in place, which triggers pandas' chained-assignment warnings.
			Anova_df = Anova_df[Anova_df['PValue']<=0.05].sort_values(['PValue'], ascending = True)

		return Anova_df
Example #3
Source File: test_correct.py    From abagen with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test__batch():
    """Check ``correct._batch_correct`` on three groups, one shifted by +5."""
    rng = np.random.RandomState(1234)
    groups = []
    for offset in (5, 0, 0):
        groups.append(rng.normal(size=(100, 1000)) + offset)

    # Before correction the group offset is large, so every ANOVA p-value
    # should be ~0.
    _, p_before = sstats.f_oneway(*groups)
    assert np.allclose(p_before, 0)

    # After correction there should be no group differences, i.e. F ~ 0.
    # The p-values are sometimes NaN here, so they are not checked.
    corrected = correct._batch_correct(groups)
    f_after = sstats.f_oneway(*corrected)[0]
    assert np.allclose(f_after, 0)

    # Every corrected group should share (approximately) the same mean.
    group_means = [arr.mean() for arr in corrected]
    assert np.allclose(group_means, 1.24871965683026)

    # A single group cannot be batch-corrected.
    with pytest.raises(ValueError):
        correct._batch_correct(groups[:1])
Example #4
Source File: test_stats.py    From GraphicDesignPatternByPython with MIT License 5 votes vote down vote up
def test_result_attributes(self):
        """The f_oneway result must expose ``statistic`` and ``pvalue``."""
        first = np.array([655, 788], dtype=np.uint16)
        second = np.array([789, 772], dtype=np.uint16)
        expected_fields = ('statistic', 'pvalue')
        check_named_results(stats.f_oneway(first, second), expected_fields)
Example #5
Source File: EDA.py    From exploripy with MIT License 5 votes vote down vote up
def Anova(self):
		"""
		One-way ANOVA of every continuous feature against every categorical feature.

		Rows of ``self.df`` containing any null value are dropped first. For each
		(categorical, continuous) pair the continuous values are grouped by the
		distinct values of the categorical feature and passed to ``stats.f_oneway``.
		When p < 0.05 and the categorical feature is not listed in
		``self.BinaryCategoricalFeatures``, ``self.Tukey`` is called as a follow-up.

		Returns a tuple ``(AnovaList, summary)``: a list with one dict per pair
		(keys Categorical, Continuous, f, p, Binary, Insight, TukeyResult) and a
		DataFrame with the columns Categorical, Continuous and PValue.
		"""
		started = time.time()
		# Work only on complete records
		clean_df = self.df.dropna()

		significant_msg = "With Confidence interval of 0.05, the variable - \"{0}\" is influenced by the categorical variable - \"{1}\". "
		binary_msg = "As the Categorical variable - \"{0}\" is binary, Tukey's HSD test is not necessary. "
		not_significant_msg = "As the p-Value is higher than the Confidence Interval 0.05, the variable - \"{0}\" is not influenced by the categorical variable - \"{1}\". "

		AnovaList = []
		for CategoricalVar in self.CategoricalFeatures:
			is_binary = CategoricalVar in self.BinaryCategoricalFeatures
			Binary = 'Yes' if is_binary else 'No'
			for ContinuousVar in self.ContinuousFeatures:
				# One sample per distinct category value
				samples = []
				for level in set(clean_df[CategoricalVar]):
					level_rows = clean_df[clean_df[CategoricalVar]==level]
					samples.append(list(level_rows[ContinuousVar]))
				f, p = stats.f_oneway(*samples)

				TukeyResult = None
				if p<0.05:
					Insight = significant_msg.format(ContinuousVar, CategoricalVar)
					if is_binary:
						Insight = Insight + binary_msg.format(CategoricalVar)
					else:
						# More than two categories: run the Tukey follow-up
						TukeyResult = self.Tukey(CategoricalVar, ContinuousVar)
				else:
					Insight = not_significant_msg.format(ContinuousVar, CategoricalVar)

				AnovaList.append(dict(Categorical = CategoricalVar, Continuous = ContinuousVar, f = f, p = p,
				Binary = Binary, Insight = Insight, TukeyResult = TukeyResult))

		# Compact view holding only the pair names and the p-value
		SummaryAnovaList = [
			dict(Categorical=row['Categorical'], Continuous=row['Continuous'], PValue=row['p'])
			for row in AnovaList
		]

		finished = time.time()
		if self.debug == 'YES':
			print('Anova',finished-started)
		return AnovaList,pd.DataFrame(SummaryAnovaList)
Example #6
Source File: test_feature_select.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_f_oneway_ints():
    """Smoke test: ``f_oneway`` accepts integer input and matches the float result."""
    # The integer call must not raise casting errors with recent NumPy versions.
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(10, 10))
    y = np.arange(10)
    fint, pint = f_oneway(X, y)

    # The integer result must equal the one computed from the same data as floats.
    # ``np.float`` (an alias of the builtin ``float``) was removed in NumPy 1.24;
    # ``np.float64`` is the explicit equivalent dtype.
    f, p = f_oneway(X.astype(np.float64), y)
    assert_array_almost_equal(f, fint, decimal=4)
    assert_array_almost_equal(p, pint, decimal=4)
Example #7
Source File: test_feature_select.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_f_oneway_vs_scipy_stats():
    """The local ``f_oneway`` must give the same result as ``scipy.stats.f_oneway``."""
    rng = np.random.RandomState(0)
    X1 = rng.randn(10, 3)
    X2 = 1 + rng.randn(10, 3)
    f, pv = stats.f_oneway(X1, X2)
    f2, pv2 = f_oneway(X1, X2)
    # Plain asserts replace the nose-style ``assert_true`` helper, so the test
    # no longer depends on a retired testing utility.
    assert np.allclose(f, f2)
    assert np.allclose(pv, pv2)
Example #8
Source File: misc.py    From audit-ai with MIT License 5 votes vote down vote up
def anova(labels, results, subset_labels=None):
    """
    Run a one-way ANOVA on numeric results grouped by categorical label.

    Parameters
    ----------
    labels : array_like
        categorical values such as ['M', 'F'], one per observation
    results : array_like
        real numbers, one per observation
    subset_labels : list of strings, optional
        when given, only observations whose label is in this list are used

    Returns
    -------
    F_onewayResult : scipy.stats object (essentially a 2-tuple)
        the value returned by ``f_oneway``: the one-way F-statistic and the
        p-value, indicating whether the groups share the same sample mean
    """
    check_consistent_length(labels, results)

    # Pair every label with its result, one row per observation.
    pairs = list(zip(labels, results))
    frame = pd.DataFrame(pairs, columns=['label', 'result'])

    if subset_labels is not None:
        wanted = frame['label'].isin(subset_labels)
        frame = frame.loc[wanted]

    # One vector of results per distinct non-null label.
    groups = []
    for value in frame['label'].dropna().unique():
        groups.append(frame.loc[frame['label'] == value, 'result'])
    return f_oneway(*groups)
Example #9
Source File: plot.py    From SCALE with MIT License 5 votes vote down vote up
def feature_specifity(feature, ref, classes, figsize=(6,6), save=None):
    """
    Plot how specific each latent feature is to each cluster.

    For every cluster in ``classes`` and every feature column, a one-way ANOVA
    compares the feature values inside the cluster (``ref == cluster``) with
    those outside it (``ref != cluster``). The resulting p-values are drawn as
    a heatmap of ``-log10(p)`` with features on the y axis and clusters on the
    x axis.

    Input:
        feature: latent features; indexed with ``.iloc`` and ``.shape[1]``,
            so presumably a pandas DataFrame with one column per feature
        ref: cluster assignment of each row of ``feature``
        classes: cluster classes to test, also used as x tick labels
        figsize: size of the matplotlib figure
        save: path of the PDF to write; when falsy the figure is shown instead
    """
    from scipy.stats import f_oneway
    n_cluster = len(classes)
    dim = feature.shape[1] # feature dimension
    pvalue_mat = np.zeros((dim, n_cluster))
    for i,cluster in enumerate(classes):
        for feat in range(dim):
            a = feature.iloc[:, feat][ref == cluster]
            b = feature.iloc[:, feat][ref != cluster]
            pvalue = f_oneway(a,b)[1]
            pvalue_mat[feat, i] = pvalue

    # One label per feature row. This was hard-coded to ten labels, which does
    # not match the heatmap unless the feature dimension happens to be 10.
    feature_labels = np.arange(dim)+1

    plt.figure(figsize=figsize)
    grid = sns.heatmap(-np.log10(pvalue_mat), cmap='RdBu_r', 
                       vmax=20,
                       yticklabels=feature_labels, 
                       xticklabels=classes[:n_cluster],
                       )
    grid.set_ylabel('Feature', fontsize=18)
    grid.set_xticklabels(labels=classes[:n_cluster], rotation=45, fontsize=18)
    grid.set_yticklabels(labels=feature_labels, fontsize=16)
    cbar = grid.collections[0].colorbar
    cbar.set_label('-log10 (Pvalue)', fontsize=18) #, rotation=0, x=-0.9, y=0)
    
    if save:
        plt.savefig(save, format='pdf', bbox_inches='tight')
    else:
        plt.show()
Example #10
Source File: test_stats.py    From GraphicDesignPatternByPython with MIT License 5 votes vote down vote up
def test_nist(self):
        """Compare ``stats.f_oneway`` with the certified F value of each NIST file.

        For every file, the certified values are read from text lines 41-48
        and the observations from row 61 onward. The last token of the first
        non-blank certified line is used as the reference F statistic.
        """
        # These are the nist ANOVA files. They can be found at:
        # http://www.itl.nist.gov/div898/strd/anova/anova.html
        filenames = ['SiRstv.dat', 'SmLs01.dat', 'SmLs02.dat', 'SmLs03.dat',
                     'AtmWtAg.dat', 'SmLs04.dat', 'SmLs05.dat', 'SmLs06.dat',
                     'SmLs07.dat', 'SmLs08.dat', 'SmLs09.dat']
        # With the hard test cases we relax the tolerance a bit.
        hard_tc = ('SmLs07.dat', 'SmLs08.dat', 'SmLs09.dat')
        data_dir = os.path.join(os.path.dirname(__file__), 'data/nist_anova')

        for test_case in filenames:
            fname = os.path.abspath(os.path.join(data_dir, test_case))
            with open(fname, 'r') as handle:
                content = handle.read().split('\n')
            certified = [row.split() for row in content[40:48]
                         if row.strip()]

            # First column holds the group label, second the observation.
            dataf = np.loadtxt(fname, skiprows=60)
            labels, values = dataf.T
            labels = labels.astype(int)
            levels = np.unique(labels)
            expected_f = float(certified[0][-1])

            samples = [values[labels == level] for level in levels]
            res = stats.f_oneway(*samples)

            rtol = 1e-4 if test_case in hard_tc else 1e-7
            assert_allclose(res[0], expected_f, rtol=rtol,
                            err_msg='Failing testcase: %s' % test_case)
Example #11
Source File: pancreas_tests.py    From scanorama with MIT License 5 votes vote down vote up
def print_oneway(X, genes, ds_labels):
    """
    Print a one-way ANOVA result for every gene, one line per gene.

    Each output line is ``gene<TAB>F<TAB>p``. Column ``i`` of ``X`` holds the
    values of ``genes[i]``; the rows are split into groups by the distinct
    values of ``ds_labels``, taken in sorted order.

    ``ds_labels == ds`` is used as a row mask for ``X``, so ``ds_labels`` is
    presumably a NumPy array with one label per row of ``X``.
    """
    # The grouping does not depend on the gene, so the row masks are built
    # once instead of re-sorting the labels for every column.
    masks = [ds_labels == ds for ds in sorted(set(ds_labels))]
    for gene_idx, gene in enumerate(genes):
        dist = [X[mask, gene_idx] for mask in masks]
        print('{}\t{}\t{}'.format(gene, *f_oneway(*dist)))
Example #12
Source File: test_stats.py    From GraphicDesignPatternByPython with MIT License 5 votes vote down vote up
def test_large_integer_array(self):
        a = np.array([655, 788], dtype=np.uint16)
        b = np.array([789, 772], dtype=np.uint16)
        F, p = stats.f_oneway(a, b)
        assert_almost_equal(F, 0.77450216931805538) 
Example #13
Source File: test_stats.py    From GraphicDesignPatternByPython with MIT License 5 votes vote down vote up
def test_basic(self):
        # Despite being a floating point calculation, this data should
        # result in F being exactly 2.0.
        F, p = stats.f_oneway([0,2], [2,4])
        assert_equal(F, 2.0) 
Example #14
Source File: test_stats.py    From GraphicDesignPatternByPython with MIT License 5 votes vote down vote up
def test_trivial(self):
        # A trivial test of stats.f_oneway, with F=0.
        F, p = stats.f_oneway([0,2], [0,2])
        assert_equal(F, 0.0) 
Example #15
Source File: eda.py    From xam with MIT License 5 votes vote down vote up
def feature_importance_regression(features, target, n_neighbors=3, random_state=None):
    """
    Score the features of a regression problem against the target.

    Floating point columns of ``features`` are treated as continuous and get a
    Pearson correlation (with p-value) and a mutual information score.
    Integer and boolean columns are treated as discrete and get a one-way
    ANOVA F-test (target grouped by feature value) and a mutual information
    score. Columns of any other dtype are ignored.

    Parameters
    ----------
    features : pandas.DataFrame
        candidate features, one column each
    target : array_like
        regression target; indexed with the row labels of ``features`` when
        grouping, so presumably a Series sharing the index of ``features``
    n_neighbors : int, optional
        forwarded to ``mutual_info_regression``
    random_state : optional
        forwarded to ``mutual_info_regression``

    Returns
    -------
    (cont_imp, disc_imp) : tuple of pandas.DataFrame
        scores indexed by continuous and by discrete feature name. A frame
        has no columns when there is no feature of that kind.
    """
    cont = features.select_dtypes(include=[np.floating])
    # ``np.bool`` was only an alias of the builtin ``bool`` and was removed in
    # NumPy 1.24, so the builtin is used directly.
    disc = features.select_dtypes(include=[np.integer, bool])

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:

        # Pearson correlation
        # ``DataFrame.iteritems`` was removed in pandas 2.0; ``items`` yields
        # the same (name, column) pairs.
        pearson = np.array([stats.pearsonr(feature, target) for _, feature in cont.items()])
        cont_imp['pearson_r'] = pearson[:, 0]
        cont_imp['pearson_r_p_value'] = pearson[:, 1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_regression(cont, target, discrete_features=False,
                                                           n_neighbors=n_neighbors,
                                                           random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:

        # F-test: does the target mean differ between the values of the feature?
        f_tests = defaultdict(dict)

        for feature in disc.columns:
            groups = [target[idxs] for idxs in disc.groupby(feature).groups.values()]
            statistic, p_value = stats.f_oneway(*groups)
            f_tests[feature]['f_statistic'] = statistic
            f_tests[feature]['f_p_value'] = p_value

        f_tests_df = pd.DataFrame.from_dict(f_tests, orient='index')
        disc_imp['f_statistic'] = f_tests_df['f_statistic']
        disc_imp['f_p_value'] = f_tests_df['f_p_value']

        # Mutual information
        mut_inf = feature_selection.mutual_info_regression(disc, target, discrete_features=True,
                                                           n_neighbors=n_neighbors,
                                                           random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp
Example #16
Source File: test_feature_select.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_f_oneway_ints():
    """Smoke test: ``f_oneway`` accepts integer input and matches the float result."""
    # The integer call must not raise casting errors with recent NumPy versions.
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(10, 10))
    y = np.arange(10)
    fint, pint = f_oneway(X, y)

    # The integer result must equal the one computed from the same data as floats.
    # ``np.float`` (an alias of the builtin ``float``) was removed in NumPy 1.24;
    # ``np.float64`` is the explicit equivalent dtype.
    f, p = f_oneway(X.astype(np.float64), y)
    assert_array_almost_equal(f, fint, decimal=4)
    assert_array_almost_equal(p, pint, decimal=4)
Example #17
Source File: test_feature_select.py    From Mastering-Elasticsearch-7.0 with MIT License 5 votes vote down vote up
def test_f_oneway_vs_scipy_stats():
    """The local ``f_oneway`` must agree with ``scipy.stats.f_oneway``."""
    rng = np.random.RandomState(0)
    group_a = rng.randn(10, 3)
    group_b = rng.randn(10, 3) + 1

    # Reference result from scipy, then the implementation under test.
    f_ref, p_ref = stats.f_oneway(group_a, group_b)
    f_got, p_got = f_oneway(group_a, group_b)

    assert np.allclose(f_ref, f_got)
    assert np.allclose(p_ref, p_got)
Example #18
Source File: test_stats.py    From Computable with MIT License 5 votes vote down vote up
def test_basic(self):
        # A test of stats.f_oneway, with F=2.
        F, p = stats.f_oneway([0,2], [2,4])
        # Despite being a floating point calculation, this data should
        # result in F being exactly 2.0.
        assert_equal(F, 2.0) 
Example #19
Source File: test_stats.py    From Computable with MIT License 5 votes vote down vote up
def test_trivial(self):
        # A trivial test of stats.f_oneway, with F=0.
        F, p = stats.f_oneway([0,2], [0,2])
        assert_equal(F, 0.0) 
Example #20
Source File: ANOVA.py    From TabPy with MIT License 5 votes vote down vote up
def anova(_arg1, _arg2, *_argN):
    """
    Return the p-value of a one-way ANOVA across two or more groups.

    ANOVA is a statistical hypothesis test that is used to compare
    two or more group means for equality. For more information on
    the function and how to use it please refer to tabpy-tools.md

    Only the first element of each group is type-checked; it must be an
    ``int`` or a ``float``.

    Raises
    ------
    ValueError
        If a group is empty or its first element is not numeric.
    """
    cols = [_arg1, _arg2, *_argN]
    for col in cols:
        if len(col) == 0:
            # Report an empty group explicitly instead of letting ``col[0]``
            # fail with an opaque IndexError.
            raise ValueError("each group must contain at least one value")
        if not isinstance(col[0], (int, float)):
            # The message travels with the exception so that callers see it;
            # it used to be printed while a bare ValueError was raised.
            raise ValueError("values must be numeric")
    _, p_value = stats.f_oneway(*cols)
    return p_value
Example #21
Source File: numerical_comparison.py    From DIVE-backend with GNU General Public License v3.0 4 votes vote down vote up
def get_valid_tests(equal_var, independent, normal, num_samples):
    '''
    Get valid tests given number of samples and statistical characterization of
    samples:

    Equal variance
    Independence
    Normality

    Returns a dict mapping a test name to the matching ``scipy.stats``
    function. The dict is empty when ``num_samples`` is less than 1.
    '''
    # Start from an empty mapping so that every branch, including the ones
    # that only add entries, works on an existing dict.
    valid_tests = {}

    if num_samples == 1:
        valid_tests = {
            'chisquare': stats.chisquare,
            'power_divergence': stats.power_divergence,
            'kstest': stats.kstest
        }
        if normal:
            # Stored at the top level like every other test; the former
            # ``valid_tests['input'][...]`` raised KeyError.
            valid_tests['one_sample_ttest'] = stats.ttest_1samp

    elif num_samples == 2:
        if independent:
            valid_tests = {
                'mannwhitneyu': stats.mannwhitneyu,
                'kruskal': stats.kruskal,
                'ks_2samp': stats.ks_2samp
            }
            if normal:
                valid_tests['two_sample_ttest'] = stats.ttest_ind
                if equal_var:
                    valid_tests['f_oneway'] = stats.f_oneway
        else:
            valid_tests = {
                'two_sample_ks': stats.ks_2samp,
                'wilcoxon': stats.wilcoxon
            }
            if normal:
                valid_tests['two_sample_related_ttest'] = stats.ttest_rel

    elif num_samples >= 3:
        if independent:
            valid_tests = {
                'kruskal': stats.kruskal
            }
            if normal and equal_var:
                valid_tests['f_oneway'] = stats.f_oneway

        else:
            # This branch used to assign into an unbound ``valid_tests``.
            valid_tests['friedmanchisquare'] = stats.friedmanchisquare

    return valid_tests