Python scipy.stats.chisquare() Examples

The following are code examples showing how to use scipy.stats.chisquare(). They are taken from open-source Python projects. You can vote up the examples you like or vote down the ones you don't like.

Example 1
Project: DataComp   Author: Cojabi   File: test_stats.py    Apache License 2.0 6 votes vote down vote up
def test_test_cat_feats(self):
        """Sparse categorical features should raise warnings, and the
        chi-square p-values should match a direct scipy computation."""
        with warnings.catch_warnings(record=True) as caught:
            # Cause all warnings to always be triggered.
            warnings.simplefilter("always")
            # Trigger a warning.
            p_vals = cat_test(self.zipper, feat_subset=self.datacol.categorical_feats)

            messages = sorted(str(warning.message) for warning in caught)
            # 4 warnings expected: one of them is a divide-by-zero
            self.assertEqual(len(caught), 4)
            self.assertTrue("cat1" in messages[0])

        # check categorical count-table creation
        counts = _categorical_table(["A", "A", "C", "A", "A"])
        self.assertTrue((counts == [4, 1]).all())

        # check a single categorical p-value against scipy directly
        expected = chisquare([1, 3, 1], [2, 1, 3])
        self.assertEqual(p_vals["cat3"][(1, 2)], expected.pvalue)
Example 2
Project: LaserTOF   Author: kyleuckert   File: test_stats.py    MIT License 6 votes vote down vote up
def check_power_divergence(self, f_obs, f_exp, ddof, axis, lambda_,
                               expected_stat):
        """Check power_divergence (and, for the Pearson case, chisquare)
        against an expected statistic and its chi-squared p-value."""
        f_obs = np.asarray(f_obs)
        # number of observations along the reduced axis
        if axis is None:
            num_obs = f_obs.size
        else:
            num_obs = np.broadcast(f_obs, f_exp).shape[axis]

        stat, p = stats.power_divergence(f_obs=f_obs, f_exp=f_exp, ddof=ddof,
                                         axis=axis, lambda_=lambda_)
        assert_allclose(stat, expected_stat)

        if lambda_ in (1, "pearson"):
            # stats.chisquare is power_divergence with the Pearson lambda.
            stat, p = stats.chisquare(f_obs=f_obs, f_exp=f_exp, ddof=ddof,
                                      axis=axis)
            assert_allclose(stat, expected_stat)

        expected_p = stats.distributions.chi2.sf(expected_stat,
                                                 num_obs - 1 - np.asarray(ddof))
        assert_allclose(p, expected_p)
Example 3
Project: att   Author: Centre-Alt-Rendiment-Esportiu   File: test_stats.py    GNU General Public License v3.0 6 votes vote down vote up
def check_power_divergence(self, f_obs, f_exp, ddof, axis, lambda_,
                               expected_stat):
        """Check the statistic and p-value of stats.power_divergence (and of
        stats.chisquare when lambda_ selects the Pearson statistic)."""
        f_obs = np.asarray(f_obs)
        if axis is None:
            num_obs = f_obs.size
        else:
            b = np.broadcast(f_obs, f_exp)
            num_obs = b.shape[axis]
        stat, p = stats.power_divergence(f_obs=f_obs, f_exp=f_exp, ddof=ddof,
                                         axis=axis, lambda_=lambda_)
        assert_allclose(stat, expected_stat)

        if lambda_ == 1 or lambda_ == "pearson":
            # Also test stats.chisquare.
            stat, p = stats.chisquare(f_obs=f_obs, f_exp=f_exp, ddof=ddof,
                                      axis=axis)
            assert_allclose(stat, expected_stat)

        ddof = np.asarray(ddof)
        # BUG FIX: stats.chisqprob was deprecated in scipy 0.17 and removed
        # in scipy 1.0; chi2.sf is the supported equivalent survival function.
        expected_p = stats.distributions.chi2.sf(expected_stat,
                                                 num_obs - 1 - ddof)
        assert_allclose(p, expected_p)
Example 4
Project: tgirt-dna-seq   Author: wckdouglas   File: plot_reinitiate.py    MIT License 6 votes vote down vote up
def main():
    """Build per-sample fragment-count tables, compare them against the
    simulated re-initiation model with a chi-square contingency test, and
    plot the result.

    NOTE(review): `map`/`filter` results are consumed as sequences (e.g.
    `pd.concat(df, ...)` and assigning `map(rename, ...)` to a column),
    which implies Python 2 semantics — under Python 3 these are lazy
    iterators. Confirm the target interpreter.
    """
    # input/output locations (hard-coded cluster paths)
    data_path = '/stor/work/Lambowitz/cdw2854/ecoli_genome/rawData/k12/umi2id_30/frag_counts'
    figurename = data_path + '/reinitiate_model.pdf'
    bar_figurename = figurename.replace('.pdf','_bar.pdf')
    # simulated re-initiation distribution to compare against
    sim_Df = make_simulation_data()
    files = glob.glob(data_path + '/*UMI_1*unique.tsv')
    # keep only the two-error-corrected count files
    files = filter(lambda x: 'two_error' in x, files)
    df = map(read_file, files )
    df = pd.concat(df, axis = 0)
    # append the simulation and express counts as percentages
    df = pd.concat([df,sim_Df],axis=0) \
            .assign(normalized_count = lambda d: d.normalized_count * 100)

    # chi-square contingency test restricted to low fragment counts
    how_many = 5
    chi_df = df.query('fragment_counts < %i' %how_many)
    sim_vector = chi_df[chi_df.samplename.str.contains('sim')]['counts']
    sample_vector = chi_df[~chi_df.samplename.str.contains('sim')]['counts']
    #k, pv = chisquare(chi_df[~chi_df.samplename.str.contains('sim')].normalized_count, 
    #                  chi_df[chi_df.samplename.str.contains('sim')].normalized_count)
    k, pv, dof, t = chi2_contingency([sample_vector,sim_vector])

    # keep only the UMI_1 samples plus the simulation for plotting
    d = df[df.samplename.str.contains('UMI_1|sim')]\
        .assign(samplename = lambda d: map(rename, d.samplename))

    plot_bar(d, k, pv, bar_figurename)
    return 0 
Example 5
Project: statistics_introduction   Author: dhwang99   File: chi_square_test.py    GNU General Public License v3.0 6 votes vote down vote up
def chi_for_pea():
    """Chi-square goodness-of-fit test for Mendel's pea data.

    Compares the observed counts of the four phenotype classes against the
    expected 9:3:3:1 ratio, first by the manual formula and then directly
    via scipy.stats.chisquare; both results are printed.
    """
    pmf = np.array([9./16, 3./16, 3./16, 1./16])   # expected class probabilities
    test_data = np.array([315, 101, 108, 32])      # observed counts
    test_sum = test_data.sum() * 1.

    df = len(pmf) - 1                 # degrees of freedom: k - 1
    npi = test_sum * pmf              # expected counts n * p_i
    X2 = ((test_data - npi) ** 2) * 1./npi
    chisq = X2.sum()
    p = 1 - chi2.cdf(chisq, df)

    # BUG FIX: Python-2 print statements are a syntax error under Python 3;
    # converted to print() calls (message text unchanged).
    print("Detail  : chisq for pea: %.4f; p value for pea: %.4f" % (chisq, p))

    # The same result can be obtained directly from scipy.
    f_obs = test_data
    f_exp = npi
    chisq, p = chisquare(f_obs, f_exp)
    print("Directed: chisq for pea: %.4f; p value for pea: %.4f" % (chisq, p))
Example 6
Project: Computable   Author: ktraunmueller   File: test_stats.py    MIT License 6 votes vote down vote up
def check_power_divergence(self, f_obs, f_exp, ddof, axis, lambda_,
                               expected_stat):
        """Verify the power_divergence statistic (and the chisquare special
        case for Pearson lambda) and the corresponding chi-squared p-value."""
        f_obs = np.asarray(f_obs)
        if axis is None:
            num_obs = f_obs.size
        else:
            b = np.broadcast(f_obs, f_exp)
            num_obs = b.shape[axis]
        stat, p = stats.power_divergence(f_obs=f_obs, f_exp=f_exp, ddof=ddof,
                                         axis=axis, lambda_=lambda_)
        assert_allclose(stat, expected_stat)

        if lambda_ == 1 or lambda_ == "pearson":
            # Also test stats.chisquare.
            stat, p = stats.chisquare(f_obs=f_obs, f_exp=f_exp, ddof=ddof,
                                      axis=axis)
            assert_allclose(stat, expected_stat)

        ddof = np.asarray(ddof)
        # BUG FIX: stats.chisqprob was deprecated in scipy 0.17 and removed
        # in scipy 1.0; chi2.sf is the supported equivalent survival function.
        expected_p = stats.distributions.chi2.sf(expected_stat,
                                                 num_obs - 1 - ddof)
        assert_allclose(p, expected_p)
Example 7
Project: poker   Author: surgebiswas   File: test_stats.py    MIT License 6 votes vote down vote up
def check_power_divergence(self, f_obs, f_exp, ddof, axis, lambda_,
                               expected_stat):
        """Assert the divergence statistic and its chi-squared p-value."""
        observed = np.asarray(f_obs)
        if axis is None:
            n = observed.size
        else:
            n = np.broadcast(observed, f_exp).shape[axis]

        stat, pval = stats.power_divergence(f_obs=observed, f_exp=f_exp,
                                            ddof=ddof, axis=axis,
                                            lambda_=lambda_)
        assert_allclose(stat, expected_stat)

        pearson = lambda_ == 1 or lambda_ == "pearson"
        if pearson:
            # chisquare is the Pearson special case of power_divergence
            stat, pval = stats.chisquare(f_obs=observed, f_exp=f_exp,
                                         ddof=ddof, axis=axis)
            assert_allclose(stat, expected_stat)

        dof = n - 1 - np.asarray(ddof)
        assert_allclose(pval, stats.distributions.chi2.sf(expected_stat, dof))
Example 8
Project: PES-Learn   Author: CCQC   File: data_sampler.py    BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def smart_random(self):
        """
        Choose a random training set whose energy distribution most
        resembles that of the full dataset.

        500 candidate train/test splits (seeds 0..499) are scored by the
        chi-squared statistic between the candidate's training-energy
        histogram and the full dataset's histogram; the seed with the
        smallest statistic wins and its split indices are stored.
        """
        data = self.dataset.values
        X = data[:, :-1]
        y = data[:, -1].reshape(-1, 1)
        full_dataset_dist, binedges = np.histogram(y, bins=10, density=True)

        pvalues = []
        chi = []
        for seed in range(500):
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, train_size=self.ntrain, random_state=seed)
            train_dist, _ = np.histogram(y_train, bins=binedges, density=True)
            chisq, p = stats.chisquare(train_dist, f_exp=full_dataset_dist)
            chi.append(chisq)
            pvalues.append(p)

        # smallest chi-square statistic == most similar energy distribution
        best_seed = np.argmin(chi)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=self.ntrain, random_state=best_seed)
        train_dist, _ = np.histogram(y_train, bins=binedges, density=True)

        indices = np.arange(self.dataset_size)
        train_indices, test_indices = train_test_split(
            indices, train_size=self.ntrain, random_state=best_seed)
        self.set_indices(train_indices, test_indices)
Example 9
Project: surveyhelper   Author: cwade   File: question.py    MIT License 6 votes vote down vote up
def compare_groups(self, groupby,
                       remove_exclusions=True,
                       pval = .05):
        """Chi-square test per answer choice across the groups of `groupby`.

        For each choice, the observed per-group frequencies are compared to
        the frequencies expected if every group picked that choice at the
        overall rate.  Returns a list of booleans, one per choice, True when
        the difference is significant at `pval`.
        """
        groupnames = groupby.groups.keys()
        obs_by_cut = []
        ct_by_cut = []
        for k, df in groupby:
            freqs, tot_resp, tot_nonresp = self.tally(df, remove_exclusions)
            obs_by_cut.append(freqs)
            ct_by_cut.append(tot_resp)
        # BUG FIX: this guard used to run BEFORE the tallying loop, so
        # sum(ct_by_cut) was always 0 and the method always returned early
        # with [False]*len(groupnames).  It now runs after tallying.
        if sum(ct_by_cut) == 0:
            return([False]*len(groupnames))
        choice_totals = [sum(x) for x in zip(*obs_by_cut)]
        exp_prop_per_choice = [t/sum(ct_by_cut) for t in choice_totals]
        sigs = []
        for f_obs, choice_tot, p_choice in zip(zip(*obs_by_cut),
            choice_totals, exp_prop_per_choice):
            # expected count per group: overall choice rate * group size
            f_exp = [p_choice * ct for ct in ct_by_cut]
            chisq, p = chisquare(f_obs, f_exp)
            sigs.append(p < pval)
        return(sigs)
Example 10
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_stats.py    Apache License 2.0 6 votes vote down vote up
def check_power_divergence(self, f_obs, f_exp, ddof, axis, lambda_,
                               expected_stat):
        """Compare power_divergence output (and chisquare, in the Pearson
        case) with the expected statistic and chi2 survival p-value."""
        f_obs = np.asarray(f_obs)
        if axis is None:
            num_obs = f_obs.size
        else:
            broadcast_shape = np.broadcast(f_obs, f_exp).shape
            num_obs = broadcast_shape[axis]

        stat, p = stats.power_divergence(f_obs=f_obs, f_exp=f_exp, ddof=ddof,
                                         axis=axis, lambda_=lambda_)
        assert_allclose(stat, expected_stat)

        if lambda_ == 1 or lambda_ == "pearson":
            # chisquare must agree with the Pearson power divergence
            chisq_stat, chisq_p = stats.chisquare(f_obs=f_obs, f_exp=f_exp,
                                                  ddof=ddof, axis=axis)
            assert_allclose(chisq_stat, expected_stat)
            stat, p = chisq_stat, chisq_p

        expected_p = stats.distributions.chi2.sf(expected_stat,
                                                 num_obs - 1 - np.asarray(ddof))
        assert_allclose(p, expected_p)
Example 11
Project: phenotypeXpression   Author: NCBI-Hackathons   File: batcheffect.py    MIT License 6 votes vote down vote up
def _generate_chisq_test(self, total_dist: List, clust_stats: Dict) -> Dict:
        """
        Chi Squared test for cluster distribution independence from total dist
        :param total_dist: list distribution of gds metric for total gds set
        :param clust_stats: pass mutable dict for recursive additions
            key=cluster_ids, value=ordered list of stats for all gds metrics
            (KS/chi-stat, p-value, mean/mode, s.d./category_n)
        :return: None -- results are appended into the mutable clust_stats dict
        """
        # total distribution to counts of categories
        total_frame = pd.DataFrame.from_dict(Counter(total_dist), orient='index')
        # overall row: no test statistic/p-value, modal category, category count
        clust_stats['Overall'].extend([None, None, total_frame[0].idxmax(), len(total_frame.index)]) 
        
        # per cluster counts
        for cluster_id, cluster_list in self.clusters.items():
            cluster_dist = [self.meta_dict[gds][2] for gds in cluster_list] # GPL index in meta_list is 2
            clust_frame = pd.DataFrame.from_dict(Counter(cluster_dist), orient='index')
            # right-merge keeps every category seen overall; categories the
            # cluster lacks get a 0 count via fillna.
            # NOTE(review): after the merge the integer column label 0 is
            # suffixed to the string labels '0_x' (cluster) and '0_y'
            # (overall) -- verify with the installed pandas version.
            chi_frame = clust_frame.merge(total_frame, how='right', right_index=True, left_index=True).fillna(0)
            chi_stat, p_val = stats.chisquare(f_obs=chi_frame['0_x'], f_exp=chi_frame['0_y'])
            clust_stats[cluster_id].extend([chi_stat, p_val, clust_frame[0].idxmax(), len(clust_frame.index)])
            if p_val <= 0.01:
                print('{}\'s platform distribution is significantly different from the overall distribution'.format(cluster_id))
        return 
Example 12
Project: dictances   Author: LucaCappelletti94   File: normal_chi_square_reference.py    MIT License 6 votes vote down vote up
def normal_chi_square_distance(repr1: np.ndarray, repr2: np.ndarray) -> float:
    """Return the summed pairwise chi-square distance between two
    normalized representations.

    Each input is first normalized to sum to 1; positions missing from the
    shorter input are treated as 0 (matching the original IndexError
    handling).
    """
    from itertools import zip_longest
    # BUG FIX: the original called repr1.tolist()/repr2.tolist() and
    # discarded the results -- dead no-op statements, removed here.
    # Normalization of list adapted from https://stackoverflow.com/a/26785464/2941352
    norm1 = [float(x) / sum(repr1) for x in repr1]
    norm2 = [float(x) / sum(repr2) for x in repr2]

    dist_sum = 0.0
    # zip_longest pads the shorter list with 0.0, replacing the manual
    # index loop with try/except IndexError.
    for val1, val2 in zip_longest(norm1, norm2, fillvalue=0.0):
        dist_sum += chisquare([val1, val2])[0]
    return dist_sum
Example 13
Project: dictances   Author: LucaCappelletti94   File: chi_square_reference.py    MIT License 6 votes vote down vote up
def chi_square_distance(repr1: np.ndarray, repr2: np.ndarray) -> float:
    """Return the summed pairwise chi-square distance between two raw
    (unnormalized) representations; positions missing from the shorter
    input count as 0."""
    from itertools import zip_longest
    # BUG FIX: the original called repr1.tolist()/repr2.tolist() and
    # discarded the results -- dead no-op statements, removed here.
    dist_sum = 0.0
    # zip_longest pads the shorter input with 0.0, replacing the manual
    # index loop with try/except IndexError.
    for val1, val2 in zip_longest(repr1, repr2, fillvalue=0.0):
        dist_sum += chisquare([val1, val2])[0]
    return dist_sum
Example 14
Project: edaHelper   Author: gibsondanield   File: edaHelper.py    BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def significance_test(self, target_var=None, p=.05,
                          multivariate_correction=None):
        '''Run chi-square tests of categorical/boolean columns against a
        boolean target variable.

        For each categorical/boolean column, the fraction of target-True
        rows per value is compared to the overall target proportion.
        Results are stored in self.chi_2_results and returned.

        NOTE(review): `multivariate_correction` is accepted but unused, and
        self.chi_2_results is only assigned when the target column is
        boolean -- a non-boolean target raises AttributeError at the final
        return. Confirm intended behavior.
        '''
        self.log.append('')

        if target_var is None:
            target_var = self.y

        if self.df[target_var].dtype == bool:
            target_proportion = sum(
                self.df[target_var] == 1) / float(self.df[target_var].size)
            self.chi_2_results = {}
            columns = self._return_categorical_and_boolean_columns()
            for col in columns:
                # BUG FIX: the original divided by `p.map(float, ...)` --
                # `p` is the significance level (a float) and has no .map;
                # the intent was an elementwise float conversion.
                overall_counts = self.df[col].value_counts().values.astype(float)
                column_fraction = (
                    self.df[self.df[target_var]][col].value_counts().values
                    / overall_counts)
                chi_2 = chisquare(column_fraction, [
                                  target_proportion for _ in column_fraction])

                if self.verbose:
                    # BUG FIX: Python-2 print statements converted to
                    # Python 3 print() calls.
                    print(col, 'column fraction = ', column_fraction)
                    print('chi_2 = ', chi_2)
                    pprint.pprint(chi_2)
                self.chi_2_results[col] = chi_2
        return self.chi_2_results.values()
Example 15
Project: facebook-message-analysis   Author: szheng17   File: conversation.py    MIT License 6 votes vote down vote up
def n_messages_chi_square(self, time_interval):
        """
        Chi-square test of the null hypothesis that the number of messages
        is uniformly distributed across the values of the given time
        interval. Only meaningful for intervals with a fixed number of
        values: 'minute in hour', 'minute in day', 'hour'.

        Args:
            time_interval: One of 'minute in hour', 'minute in day', 'hour'.

        Returns:
            A (chisq, p) tuple: the chi-square statistic over the per-value
            message counts, and the p-value of the test.
        """
        valid_time_intervals = ['minute in hour', 'minute in day', 'hour']
        if time_interval not in valid_time_intervals:
            raise ValueError('time_interval must be in {}'.format(valid_time_intervals))
        outcome = chisquare(self.get_n_messages_in_time_interval(time_interval))
        return (outcome.statistic, outcome.pvalue)
Example 16
Project: LaserTOF   Author: kyleuckert   File: test_stats.py    MIT License 5 votes vote down vote up
def test_jarque_bera_stats(self):
        np.random.seed(987654321)
        x = np.random.normal(0, 1, 100000)
        y = np.random.chisquare(10000, 100000)
        z = np.random.rayleigh(1, 100000)

        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(y)[1])
        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(z)[1])
        assert_(stats.jarque_bera(y)[1] > stats.jarque_bera(z)[1]) 
Example 17
Project: att   Author: Centre-Alt-Rendiment-Esportiu   File: test_stats.py    GNU General Public License v3.0 5 votes vote down vote up
def test_jarque_bera_stats(self):
        np.random.seed(987654321)
        x = np.random.normal(0, 1, 100000)
        y = np.random.chisquare(10000, 100000)
        z = np.random.rayleigh(1, 100000)

        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(y)[1])
        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(z)[1])
        assert_(stats.jarque_bera(y)[1] > stats.jarque_bera(z)[1]) 
Example 18
Project: att   Author: Centre-Alt-Rendiment-Esportiu   File: test_discrete_basic.py    GNU General Public License v3.0 5 votes vote down vote up
def test_discrete_basic():
    """Nose-style generator test: yield one check per discrete distribution.

    For every distribution in `distdiscrete`, draws a 2000-point sample and
    yields cdf/ppf, pmf/cdf, moment, support-edge and chi-square checks; a
    second pass yields once-per-distribution API checks (named args, scale
    docstring, entropy).
    """
    for distname, arg in distdiscrete:
        distfn = getattr(stats, distname)
        np.random.seed(9765456)  # fixed seed keeps the chisquare check deterministic
        rvs = distfn.rvs(size=2000, *arg)
        supp = np.unique(rvs)
        m, v = distfn.stats(*arg)
        yield check_cdf_ppf, distfn, arg, supp, distname + ' cdf_ppf'

        yield check_pmf_cdf, distfn, arg, distname
        yield check_oth, distfn, arg, supp, distname + ' oth'
        yield check_edge_support, distfn, arg

        alpha = 0.01  # significance level for the goodness-of-fit check
        yield check_discrete_chisquare, distfn, arg, rvs, alpha, \
                      distname + ' chisquare'

    # second pass: run the API checks only once per distribution name
    seen = set()
    for distname, arg in distdiscrete:
        if distname in seen:
            continue
        seen.add(distname)
        distfn = getattr(stats,distname)
        locscale_defaults = (0,)
        meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf,
                 distfn.logsf]
        # make sure arguments are within support
        spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0, }
        k = spec_k.get(distname, 1)
        yield check_named_args, distfn, k, arg, locscale_defaults, meths
        yield check_scale_docstring, distfn

        # Entropy
        yield check_entropy, distfn, arg, distname
        if distfn.__class__._entropy != stats.rv_discrete._entropy:
            yield check_private_entropy, distfn, arg, stats.rv_discrete 
Example 19
Project: AdOps   Author: TeddyCr   File: clearing_win_price_ratio.py    GNU General Public License v3.0 5 votes vote down vote up
def calculateChiSquare(observed_data, expected_ratio):
    """
    Evaluate significance of the test by computing the chi-square statistic
    for the observed data against a flat 1%-per-bucket expectation.

    Args:
        observed_data: iterable of observed counts, one per bucket.
        expected_ratio: currently unused (see note).
    Returns:
        The scipy.stats.chisquare result (statistic, pvalue).

    NOTE(review): `expected_ratio` is accepted but never used -- every
    expected frequency is hard-coded to 1% of the total. Confirm whether
    it should drive the expected frequencies instead.
    """
    tot_observed_data = int(sum(observed_data))

    # one expected entry per observed bucket, each 1% of the total
    expected_data = [tot_observed_data * 0.01] * len(observed_data)

    return chisquare(observed_data, f_exp=expected_data)
Example 20
Project: python-mle   Author: ibab   File: __init__.py    MIT License 5 votes vote down vote up
def chisquare(dist, fit_result, data, bins=None, range=None):
    """Perform a Chi^2 test for goodness of fit.

    Tests the H0 hypothesis if the distances between fit result and
    data are compatible with random fluctuations.

    Args:
        dist:         A mle.Distribution instance
        fit_result:   The solution dict, returned by the Distribution.fit method
        data:         The data used in Distribution.fit
        bins:         Number of bins for the histogram
                      (default: ceil(2 * N**(1/3)), the Rice rule)
        range:        Range for the histogram (default: min(data), max(data))
    Returns:
        chisquare:    the test statistic, chi^2/ndf
        p-value:      the p-value, probability that differences between dist
                      and data are compatible with random fluctuation
    """
    variables = dist.get_vars()
    if len(variables) > 1:
        raise ValueError("This is a 1d only chisquare test")
    var = variables[0]

    # Rice rule of thumb for the number of bins if not provided.
    # BUG FIX: np.histogram requires an integer bin count; np.ceil returns
    # a float, which modern numpy rejects with a TypeError.
    if bins is None:
        bins = int(np.ceil(2*len(data[var.name])**(1.0/3.0)))

    entries, edges = np.histogram(data[var.name], bins=bins, range=range)

    # get expected frequencies from the cdf of the fitted distribution
    cdf = dist.cdf(edges, **fit_result["x"])
    exp_entries = np.round(len(data[var.name]) * (cdf[1:]-cdf[:-1]))

    # use only bins where at least 5 entries are expected
    # (standard validity rule for the chi-square approximation)
    mask = exp_entries >= 5

    chisq, pvalue = stats.chisquare(entries[mask], exp_entries[mask],
                                    ddof=len(fit_result["x"]))
    # reduce to chi^2 per degree of freedom
    chisq = chisq/(np.sum(mask) - len(fit_result["x"]) - 1)
    return chisq, pvalue
Example 21
Project: partycrasher   Author: naturalness   File: fake_data_generator.py    GNU General Public License v3.0 5 votes vote down vote up
def test(self):
        """Chi-square goodness-of-fit of the observed histogram against a
        Poisson(self.lambda_) model, then plot observed vs expected.

        Bins with fewer than 10 observed or expected entries are dropped so
        the chi-square approximation stays valid.  Opens an interactive
        matplotlib window (Qt5Agg backend) and blocks until it is closed.
        Returns the scipy chisquare result.
        """
        nr_observations = sum(self.histogram)
        observed_frequencies = []
        expected_frequencies = []
        frequencies_of = []
        thresh = 10  # minimum count for a bin to enter the test
        for i in range(0, len(self.histogram)):
            observed = self.histogram[i]
            expected = stats.poisson.pmf(i, self.lambda_) * nr_observations
            if (
                (observed >= thresh)
                and (expected >= thresh)):
                observed_frequencies.append(observed)
                expected_frequencies.append(expected)
                frequencies_of.append(i)
        results = stats.chisquare(observed_frequencies,
                                  expected_frequencies)
        print("expected: mean %f variance %f" % (
                      self.expected_mean(),
                      self.expected_variance()))
        print("actual: mean %f variance %f" % (
                      self.mean(),
                      self.variance()))
        print(len(expected_frequencies))
        print(results)
        from matplotlib import pyplot
        import matplotlib
        pyplot.switch_backend('Qt5Agg')
        actual_plot, = pyplot.plot(frequencies_of, observed_frequencies, label='actual')
        expected_plot, = pyplot.plot(frequencies_of, expected_frequencies, 'r', linewidth=1, label='expected')
        matplotlib.interactive(True)
        #pyplot.ylabel("People at Table")
        #pyplot.xlabel("Table Number")
        #pyplot.title("Chinese Restaurant Process Unit Test")
        pyplot.legend()
        pyplot.show(block=True)
        return results 
Example 22
Project: Biodemography   Author: skchronicles   File: pipeline.py    MIT License 5 votes vote down vote up
def apriori_v3(q, insig, sex_file_dict, countries_list, age):
    """ This function implements a modified version of the apriori algorithm which can be used for speeding up
    an otherwise exhaustive high-performance computing problem. The apriori algorithm is commonly used in mining
    frequent itemsets for boolean association rules. It uses a monotonic "bottom up" approach, where frequent subsets
    are extended one item at a time.

    :param q: queue of candidate ICD codes (ints); reformatted to a nested list
    :param insig: codes already known to be insignificant
    :param sex_file_dict: nested dict country -> age -> icd-code -> rate
    :param countries_list: countries whose summed rates form the observed vector
    :param age: age-group key into sex_file_dict
    :return: list of item-sets whose chisquare p-value stayed >= 0.01

    NOTE(review): a HIGH p-value (>= 0.01, i.e. counts consistent with a
    uniform distribution across countries) is what this code keeps as
    "significant" -- confirm this inversion of the usual terminology.
    """

    q = [[int(num)] for num in q]  # queue is formatted as a nested list
    insignificant = [[int(num)] for num in insig]
    significant = []

    #print("\nInsig", insignificant)
    #print("Sig", significant)
    #print("Queue\n", q)

    while len(q) > 0:
        element = q[0]
        obs_freqs = []

        # observed frequency of this item-set in each country: sum of the
        # member codes' rates, scaled to integer counts
        for country in countries_list:
            icd_freq = 0
            for freq in element:
                icd_freq += round(float(sex_file_dict[country][age][str(freq)]) * 1000000)
            obs_freqs.append(icd_freq)

        # chisquare with no f_exp tests against a uniform distribution
        chisq, pvalue = chisquare(obs_freqs)

        if pvalue >= 0.01:
            significant.append(element)

            # apriori growth step: extend the item-set one code at a time
            for i in range(int(element[-1])+1,36):
                if [i] not in insignificant:
                    tentativeCandidate = sorted(list(element)+[i])  # add the two lists together (element is a list)
                    if tentativeCandidate not in q and tentativeCandidate not in significant: # then add it to the queue

                        q.append(tentativeCandidate)
            q.pop(0) #remove it from the queue after we have created/tried all the tentativeCandidates

        else:        # when the p-value not significant
            q.pop(0)
            insignificant.append(element)
    return significant  # grab the last values before breaking out of the while loop 
Example 23
Project: Biodemography   Author: skchronicles   File: pipeline.py    MIT License 5 votes vote down vote up
def bottom_up_trim(q, sex_file_dict, countries_list, age):
    """ Prune size-one item-sets before the apriori pass: codes whose
    per-country frequency profile contains a zero count, or whose
    chi-square p-value against a uniform profile is <= 0.01, are moved to
    the insignificant list.
    Returns: (pruned queue of significant codes, insignificant codes). """
    insig = []
    sig = []

    for num in q:
        # scaled observed frequency of this code in every country
        obs_freqs = [
            round(float(sex_file_dict[country][age][str(num)]) * 1000000)
            for country in countries_list
        ]
        print("Tested values for {}\t{}".format(num, obs_freqs))

        # Calculate chi-square test here
        if 0 in obs_freqs:
            # chi-square requires strictly positive counts
            insig.append(num)
        else:
            chisq, pvalue = chisquare(obs_freqs)
            if pvalue <= 0.01:
                insig.append(num)
            else:
                sig.append(num)
    return sig, insig
Example 24
Project: vnpy_crypto   Author: birforce   File: proportion.py    MIT License 5 votes vote down vote up
def proportions_chisquare_allpairs(count, nobs, multitest_method='hs'):
    '''chisquare test of proportions for all pairs of k samples

    Performs a chisquare test for proportions for all pairwise comparisons.
    The alternative is two-sided

    Parameters
    ----------
    count : integer or array_like
        the number of successes in nobs trials.
    nobs : integer
        the number of trials or observations.
    multitest_method : string
        This chooses the method for the multiple testing p-value correction,
        that is used as default in the results.
        It can be any method that is available in  ``multipletesting``.
        The default is Holm-Sidak 'hs'.

    Returns
    -------
    result : AllPairsResults instance
        The returned results instance has several statistics, such as p-values,
        attached, and additional methods for using a non-default
        ``multitest_method``.

    Notes
    -----
    Yates continuity correction is not available.
    The docstring previously listed a ``prop`` parameter that does not
    exist in the signature; it has been removed.
    '''
    #all_pairs = lmap(list, lzip(*np.triu_indices(4, 1)))
    # all index pairs (i, j) with i < j over the k samples
    all_pairs = lzip(*np.triu_indices(len(count), 1))
    # p-value of the pairwise two-sample proportions chisquare test
    pvals = [proportions_chisquare(count[list(pair)], nobs[list(pair)])[1]
               for pair in all_pairs]
    return AllPairsResults(pvals, all_pairs, multitest_method=multitest_method) 
Example 25
Project: vnpy_crypto   Author: birforce   File: test_discrete.py    MIT License 5 votes vote down vote up
def test_predict_prob(self):
        """predict(which='prob') should reproduce the generalized-Poisson
        pmf and roughly match the observed outcome frequencies."""
        res = self.res
        endog = res.model.endog
        freq = np.bincount(endog.astype(int))  # observed outcome counts

        pr = res.predict(which='prob')
        # pmf evaluated at outcomes 0..5 for each observation, transposed
        # to shape (nobs, 6) to match `pr`
        pr2 = sm.distributions.genpoisson_p.pmf(np.arange(6)[:, None],
                                        res.predict(), res.params[-1], 1).T
        assert_allclose(pr, pr2, rtol=1e-10, atol=1e-10)

        from scipy import stats
        # goodness of fit of observed frequencies vs summed predicted
        # probabilities; reference values are regression numbers from a
        # previous run
        chi2 = stats.chisquare(freq, pr.sum(0))
        assert_allclose(chi2[:], (0.64628806058715882, 0.98578597726324468),
                        rtol=0.01) 
Example 26
Project: statistics_introduction   Author: dhwang99   File: chi_square_test.py    GNU General Public License v3.0 5 votes vote down vote up
def chi_for_war():
    """Chi-square goodness-of-fit test of yearly war counts against a
    Poisson model whose rate is estimated from the data.

    Prints a manual computation ("Detail") and the scipy computation
    ("Directed"); the last two cells are merged because the final expected
    count is below 5.
    """
    alpha = 0.05
    years = np.linspace(0, 4, 5)
    wars = np.array([223, 142, 48, 15, 4])
    wars_sum = wars.sum()

    # Poisson MLE: lambda_hat = (1/n) * sum(x_i)
    lambda_hat = 1./wars_sum * np.dot(wars, years)

    # observed frequencies; merge the last cell into the previous one
    # because its expected count n*p_i < 5
    fis = np.array(wars)
    fis[-2] += fis[-1]

    p_of_years = poisson.pmf(years, lambda_hat)
    npi = p_of_years * wars_sum
    npi[-2] += npi[-1]

    # renamed from `stats` to avoid shadowing the scipy.stats module name
    stat_terms = np.power(fis - npi, 2) / npi
    chisq = stat_terms[:-1].sum()

    # degrees of freedom: 5 cells - 1, minus 1 for the estimated
    # lambda_hat, minus 1 for the merged cell
    df = 5 - 1 - 1 - 1
    alpha_ppf = chi2.ppf(1-alpha, df=df)
    p = 1 - chi2.cdf(chisq, df=df)
    # BUG FIX: Python-2 print statements converted to print() calls.
    print("Detail  : chisq: %.4f; p: %.4f; alpha: %.4f; alpha_ppf: %.4f" % (chisq, p, alpha, alpha_ppf))

    # Same test via scipy; ddof=1 accounts for the estimated lambda_hat.
    f_obs = fis[:-1]
    f_exp = npi[:-1]
    # BUG FIX: scipy.stats.chisquare requires sum(f_obs) == sum(f_exp)
    # (within a relative tolerance of 1e-8) and raises ValueError
    # otherwise; the truncated Poisson expectations fall slightly short of
    # the observed total, so rescale them to it.
    f_exp = f_exp * (f_obs.sum() / f_exp.sum())
    chisq, p = chisquare(f_obs, f_exp, ddof=1)
    print("Directed: chisq: %.4f; p: %.4f; alpha: %.4f; alpha_ppf: %.4f" % (chisq, p, alpha, alpha_ppf))
Example 27
Project: ble5-nrf52-mac   Author: tomasero   File: test_stats.py    MIT License 5 votes vote down vote up
def check_power_divergence(self, f_obs, f_exp, ddof, axis, lambda_,
                               expected_stat):
        """Check the statistic and p-value of power_divergence (and of
        chisquare when lambda_ selects the Pearson statistic)."""
        f_obs = np.asarray(f_obs)
        num_obs = (f_obs.size if axis is None
                   else np.broadcast(f_obs, f_exp).shape[axis])

        with suppress_warnings() as sup:
            # empty slices trigger a harmless mean-of-empty-slice warning
            sup.filter(RuntimeWarning, "Mean of empty slice")
            stat, p = stats.power_divergence(
                                f_obs=f_obs, f_exp=f_exp, ddof=ddof,
                                axis=axis, lambda_=lambda_)
            assert_allclose(stat, expected_stat)

            if lambda_ == 1 or lambda_ == "pearson":
                # chisquare is the Pearson special case
                stat, p = stats.chisquare(f_obs=f_obs, f_exp=f_exp,
                                          ddof=ddof, axis=axis)
                assert_allclose(stat, expected_stat)

        expected_p = stats.distributions.chi2.sf(
            expected_stat, num_obs - 1 - np.asarray(ddof))
        assert_allclose(p, expected_p)
Example 28
Project: ble5-nrf52-mac   Author: tomasero   File: test_stats.py    MIT License 5 votes vote down vote up
def test_jarque_bera_stats(self):
        np.random.seed(987654321)
        x = np.random.normal(0, 1, 100000)
        y = np.random.chisquare(10000, 100000)
        z = np.random.rayleigh(1, 100000)

        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(y)[1])
        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(z)[1])
        assert_(stats.jarque_bera(y)[1] > stats.jarque_bera(z)[1]) 
Example 29
Project: ble5-nrf52-mac   Author: tomasero   File: test_discrete_basic.py    MIT License 5 votes vote down vote up
def test_discrete_basic(distname, arg, first_case):
    """Basic sanity checks for one discrete distribution.

    Runs cdf/ppf, pmf/cdf, support and chi-square consistency checks; the
    generic infrastructure checks run only for the first parameter case.
    """
    try:
        distfn = getattr(stats, distname)
    except TypeError:
        # distname is already a distribution object, not a name
        distfn = distname
        distname = 'sample distribution'

    np.random.seed(9765456)
    rvs = distfn.rvs(size=2000, *arg)
    supp = np.unique(rvs)
    m, v = distfn.stats(*arg)

    check_cdf_ppf(distfn, arg, supp, distname + ' cdf_ppf')
    check_pmf_cdf(distfn, arg, distname)
    check_oth(distfn, arg, supp, distname + ' oth')
    check_edge_support(distfn, arg)

    alpha = 0.01
    check_discrete_chisquare(distfn, arg, rvs, alpha,
                             distname + ' chisquare')

    if not first_case:
        return

    # Generic distribution-infrastructure checks, once per distribution.
    locscale_defaults = (0,)
    meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf,
             distfn.logsf]
    # make sure arguments are within support
    spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0}
    k = spec_k.get(distname, 1)
    check_named_args(distfn, k, arg, locscale_defaults, meths)
    if distname != 'sample distribution':
        check_scale_docstring(distfn)
    check_random_state_property(distfn, arg)
    check_pickling(distfn, arg)

    # Entropy
    check_entropy(distfn, arg, distname)
    if distfn.__class__._entropy != stats.rv_discrete._entropy:
        check_private_entropy(distfn, arg, stats.rv_discrete)
Example 30
Project: qso_lya_detection_pipeline   Author: davidparks21   File: absorption.py    MIT License 5 votes vote down vote up
def generate_voigt_model(sightline, absorber):
    """ Generate a continuum-scaled Voigt profile for the absorber

    The Voigt profile is generated at the absorber's redshift and column
    density, then multiplicatively rescaled so that it best matches the
    observed flux at the selected peak pixels (chi-square minimized scale).

    :param sightline: sightline object providing ``loglam``, ``z_qso`` and ``flux``
    :param absorber: dict with keys ``z_dla`` and ``column_density``
    :return: tuple ``(voigt_wave, scaled_voigt_flux, ixs_mypeaks)``
    """
    from dla_cnn.data_loader import get_lam_data
    #from dla_cnn.data_loader import generate_voigt_profile
    #from dla_cnn.data_loader import get_peaks_for_voigt_scaling
    from scipy.stats import chisquare
    from scipy.optimize import minimize
    #if REST_RANGE is None:
    #    from dla_cnn.data_loader import REST_RANGE
    # Wavelengths
    # NOTE(review): REST_RANGE is not defined in this function and its import
    # above is commented out -- presumably a module-level constant of this
    # file; confirm it is in scope at call time.
    full_lam, full_lam_rest, full_ix_dla_range = get_lam_data(sightline.loglam, sightline.z_qso, REST_RANGE)
    # Generate the voigt model using astropy, linetools, etc.
    voigt_flux, voigt_wave = generate_voigt_profile(absorber['z_dla'],
                                                    absorber['column_density'], full_lam)
    # get peaks
    ixs_mypeaks = get_peaks_for_voigt_scaling(sightline, voigt_flux)
    # observed vs. modeled flux at the selected peak pixels
    observed_values = sightline.flux[ixs_mypeaks]
    expected_values = voigt_flux[ixs_mypeaks]
    # Minimize scale variable using chi square measure
    opt = minimize(lambda scale: chisquare(observed_values, expected_values * scale)[0], 1)
    opt_scale = opt.x[0]
    #from IPython import embed; embed()

    # Return
    return voigt_wave, voigt_flux*opt_scale, ixs_mypeaks
Example 31
Project: Computable   Author: ktraunmueller   File: test_stats.py    MIT License 5 votes vote down vote up
def test_jarque_bera_stats(self):
        np.random.seed(987654321)
        x = np.random.normal(0, 1, 100000)
        y = np.random.chisquare(10000, 100000)
        z = np.random.rayleigh(1, 100000)

        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(y)[1])
        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(z)[1])
        assert_(stats.jarque_bera(y)[1] > stats.jarque_bera(z)[1]) 
Example 32
Project: poker   Author: surgebiswas   File: test_stats.py    MIT License 5 votes vote down vote up
def test_jarque_bera_stats(self):
        np.random.seed(987654321)
        x = np.random.normal(0, 1, 100000)
        y = np.random.chisquare(10000, 100000)
        z = np.random.rayleigh(1, 100000)

        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(y)[1])
        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(z)[1])
        assert_(stats.jarque_bera(y)[1] > stats.jarque_bera(z)[1]) 
Example 33
Project: P3_image_processing   Author: latedude2   File: test_stats.py    MIT License 5 votes vote down vote up
def check_power_divergence(self, f_obs, f_exp, ddof, axis, lambda_,
                               expected_stat):
        f_obs = np.asarray(f_obs)
        if axis is None:
            num_obs = f_obs.size
        else:
            b = np.broadcast(f_obs, f_exp)
            num_obs = b.shape[axis]

        with suppress_warnings() as sup:
            sup.filter(RuntimeWarning, "Mean of empty slice")
            stat, p = stats.power_divergence(
                                f_obs=f_obs, f_exp=f_exp, ddof=ddof,
                                axis=axis, lambda_=lambda_)
            assert_allclose(stat, expected_stat)

            if lambda_ == 1 or lambda_ == "pearson":
                # Also test stats.chisquare.
                stat, p = stats.chisquare(f_obs=f_obs, f_exp=f_exp, ddof=ddof,
                                          axis=axis)
                assert_allclose(stat, expected_stat)

        ddof = np.asarray(ddof)
        expected_p = stats.distributions.chi2.sf(expected_stat,
                                                 num_obs - 1 - ddof)
        assert_allclose(p, expected_p) 
Example 34
Project: P3_image_processing   Author: latedude2   File: test_stats.py    MIT License 5 votes vote down vote up
def test_jarque_bera_stats(self):
        np.random.seed(987654321)
        x = np.random.normal(0, 1, 100000)
        y = np.random.chisquare(10000, 100000)
        z = np.random.rayleigh(1, 100000)

        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(y)[1])
        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(z)[1])
        assert_(stats.jarque_bera(y)[1] > stats.jarque_bera(z)[1]) 
Example 35
Project: P3_image_processing   Author: latedude2   File: test_discrete_basic.py    MIT License 5 votes vote down vote up
def test_discrete_basic(distname, arg, first_case):
    """Basic sanity checks for one discrete distribution.

    Runs cdf/ppf, pmf/cdf, support and chi-square consistency checks; the
    generic infrastructure checks run only for the first parameter case.
    """
    try:
        distfn = getattr(stats, distname)
    except TypeError:
        # distname is already a distribution object, not a name
        distfn = distname
        distname = 'sample distribution'

    np.random.seed(9765456)
    rvs = distfn.rvs(size=2000, *arg)
    supp = np.unique(rvs)
    m, v = distfn.stats(*arg)

    check_cdf_ppf(distfn, arg, supp, distname + ' cdf_ppf')
    check_pmf_cdf(distfn, arg, distname)
    check_oth(distfn, arg, supp, distname + ' oth')
    check_edge_support(distfn, arg)

    alpha = 0.01
    check_discrete_chisquare(distfn, arg, rvs, alpha,
                             distname + ' chisquare')

    if not first_case:
        return

    # Generic distribution-infrastructure checks, once per distribution.
    locscale_defaults = (0,)
    meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf,
             distfn.logsf]
    # make sure arguments are within support
    spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0}
    k = spec_k.get(distname, 1)
    check_named_args(distfn, k, arg, locscale_defaults, meths)
    if distname != 'sample distribution':
        check_scale_docstring(distfn)
    check_random_state_property(distfn, arg)
    check_pickling(distfn, arg)

    # Entropy
    check_entropy(distfn, arg, distname)
    if distfn.__class__._entropy != stats.rv_discrete._entropy:
        check_private_entropy(distfn, arg, stats.rv_discrete)
Example 36
Project: chainer   Author: chainer   File: test_orthogonal.py    MIT License 5 votes vote down vote up
def check_initializer_statistics(self, backend_config, n):
        """Statistically verify the initializer's output distribution.

        Draws ``n`` independent initializations of ``self.shape`` and tests
        each flattened coordinate's samples against the distribution expected
        for a scaled orthogonal initializer.
        """
        from scipy import stats
        xp = backend_config.xp
        # n independent draws of the initialized array, allocated via the
        # backend (CPU or GPU) selected by backend_config.
        ws = numpy.empty((n,) + self.shape, dtype=self.dtype)
        ws = backend_config.get_array(ws)
        for i in range(n):
            initializer = self.target(**self.target_kwargs)
            # The initializer fills the squeezed view of ws[i] in place.
            initializer(xp.squeeze(ws[i:i+1], axis=0))

        # 1.1 appears to be the initializer's default scale when
        # self.scale is None -- TODO confirm against the target's default.
        expected_scale = self.scale or 1.1
        # One row per coordinate, n samples each (copied back to the host).
        sampless = cuda.to_cpu(ws.reshape(n, -1).T)
        # Bonferroni correction: each coordinate is tested separately.
        alpha = 0.01 / len(sampless)

        larger_dim = max(self.dim_out, self.dim_in)

        ab = 0.5 * (larger_dim - 1)

        for samples in sampless:
            if larger_dim == 1:
                # Degenerate case: every entry must be +-expected_scale;
                # also run a chi-square check on the 0/1 sign indicators.
                numpy.testing.assert_allclose(abs(samples), expected_scale)
                _, p = stats.chisquare((numpy.sign(samples) + 1) // 2)
            else:
                # Otherwise entries are tested (Kolmogorov-Smirnov) against a
                # Beta(ab, ab) distribution rescaled to
                # [-expected_scale, expected_scale].
                _, p = stats.kstest(
                    samples,
                    stats.beta(
                        ab, ab,
                        loc=-expected_scale,
                        scale=2*expected_scale
                    ).cdf
                )
            assert p >= alpha
Example 37
Project: GraphicDesignPatternByPython   Author: Relph1119   File: test_stats.py    MIT License 5 votes vote down vote up
def check_power_divergence(self, f_obs, f_exp, ddof, axis, lambda_,
                               expected_stat):
        f_obs = np.asarray(f_obs)
        if axis is None:
            num_obs = f_obs.size
        else:
            b = np.broadcast(f_obs, f_exp)
            num_obs = b.shape[axis]

        with suppress_warnings() as sup:
            sup.filter(RuntimeWarning, "Mean of empty slice")
            stat, p = stats.power_divergence(
                                f_obs=f_obs, f_exp=f_exp, ddof=ddof,
                                axis=axis, lambda_=lambda_)
            assert_allclose(stat, expected_stat)

            if lambda_ == 1 or lambda_ == "pearson":
                # Also test stats.chisquare.
                stat, p = stats.chisquare(f_obs=f_obs, f_exp=f_exp, ddof=ddof,
                                          axis=axis)
                assert_allclose(stat, expected_stat)

        ddof = np.asarray(ddof)
        expected_p = stats.distributions.chi2.sf(expected_stat,
                                                 num_obs - 1 - ddof)
        assert_allclose(p, expected_p) 
Example 38
Project: GraphicDesignPatternByPython   Author: Relph1119   File: test_stats.py    MIT License 5 votes vote down vote up
def test_jarque_bera_stats(self):
        np.random.seed(987654321)
        x = np.random.normal(0, 1, 100000)
        y = np.random.chisquare(10000, 100000)
        z = np.random.rayleigh(1, 100000)

        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(y)[1])
        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(z)[1])
        assert_(stats.jarque_bera(y)[1] > stats.jarque_bera(z)[1]) 
Example 39
Project: GraphicDesignPatternByPython   Author: Relph1119   File: test_discrete_basic.py    MIT License 5 votes vote down vote up
def test_discrete_basic(distname, arg, first_case):
    """Basic sanity checks for one discrete distribution.

    Runs cdf/ppf, pmf/cdf, support and chi-square consistency checks; the
    generic infrastructure checks run only for the first parameter case.
    """
    try:
        distfn = getattr(stats, distname)
    except TypeError:
        # distname is already a distribution object, not a name
        distfn = distname
        distname = 'sample distribution'

    np.random.seed(9765456)
    rvs = distfn.rvs(size=2000, *arg)
    supp = np.unique(rvs)
    m, v = distfn.stats(*arg)

    check_cdf_ppf(distfn, arg, supp, distname + ' cdf_ppf')
    check_pmf_cdf(distfn, arg, distname)
    check_oth(distfn, arg, supp, distname + ' oth')
    check_edge_support(distfn, arg)

    alpha = 0.01
    check_discrete_chisquare(distfn, arg, rvs, alpha,
                             distname + ' chisquare')

    if not first_case:
        return

    # Generic distribution-infrastructure checks, once per distribution.
    locscale_defaults = (0,)
    meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf,
             distfn.logsf]
    # make sure arguments are within support
    spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0}
    k = spec_k.get(distname, 1)
    check_named_args(distfn, k, arg, locscale_defaults, meths)
    if distname != 'sample distribution':
        check_scale_docstring(distfn)
    check_random_state_property(distfn, arg)
    check_pickling(distfn, arg)

    # Entropy
    check_entropy(distfn, arg, distname)
    if distfn.__class__._entropy != stats.rv_discrete._entropy:
        check_private_entropy(distfn, arg, stats.rv_discrete)
Example 40
Project: pdftabextract   Author: WZBSocialScienceCenter   File: test_clustering.py    Apache License 2.0 5 votes vote down vote up
def test_adjust_bad_positions():
    """adjust_bad_positions() must repair pages whose column positions are
    implausible (negative width, wrong count, invalid last position) while
    keeping widths statistically compatible with the good pages."""
    good_pages = {
        0: [8, 28, 33, 38],
        1: [10, 30, 35, 40],
        2: [10, 30, 35, 40],
        3: [0, 20, 25, 32],
        4: [3, 21, 25, 31],
        5: [3, 21, 25, 31],
    }

    # Reference widths computed from the good pages only.
    mean_widths = np.diff([np.mean(ps) for ps in zip(*good_pages.values())])

    pages_positions = dict(good_pages)
    pages_positions[6] = [3, 21, 20, 31]       # bad: neg. width
    pages_positions[7] = [3, 21, 25, 28, 31]   # bad: too many positions
    pages_positions[8] = [3, 21, 25, 70]       # bad: invalid last position

    alpha = 0.05
    adj_positions = adjust_bad_positions(pages_positions,
                                         pos_check_signif_level=alpha)

    assert pages_positions.keys() == adj_positions.keys()

    for p_num, orig in pages_positions.items():
        adj = adj_positions[p_num]

        assert len(adj) == 4
        assert adj[0] == orig[0]

        # Adjusted widths must not deviate significantly from the mean widths.
        adj_widths = np.diff(adj)
        _, p_val = chisquare(adj_widths, mean_widths)
        assert p_val >= alpha
Example 41
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_stats.py    Apache License 2.0 5 votes vote down vote up
def test_jarque_bera_stats(self):
        np.random.seed(987654321)
        x = np.random.normal(0, 1, 100000)
        y = np.random.chisquare(10000, 100000)
        z = np.random.rayleigh(1, 100000)

        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(y)[1])
        assert_(stats.jarque_bera(x)[1] > stats.jarque_bera(z)[1])
        assert_(stats.jarque_bera(y)[1] > stats.jarque_bera(z)[1]) 
Example 42
Project: failing-loudly   Author: steverab   File: shift_tester.py    MIT License 5 votes vote down vote up
def test_chi2_shift(self, X_tr, X_te, nb_classes):

        # Calculate observed and expected counts
        freq_exp = np.zeros(nb_classes)
        freq_obs = np.zeros(nb_classes)

        unique_tr, counts_tr = np.unique(X_tr, return_counts=True)
        total_counts_tr = np.sum(counts_tr)
        unique_te, counts_te = np.unique(X_te, return_counts=True)
        total_counts_te = np.sum(counts_te)

        for i in range(len(unique_tr)):
            val = counts_tr[i]
            freq_exp[unique_tr[i]] = val
            
        for i in range(len(unique_te)):
            freq_obs[unique_te[i]] = counts_te[i]

        if np.amin(freq_exp) == 0 or np.amin(freq_obs) == 0:
            # The chi-squared test using contingency tables is not well defined if zero-element classes exist, which
            # might happen in the low-sample regime. In this case, we calculate the standard chi-squared test.
            for i in range(len(unique_tr)):
                val = counts_tr[i] / total_counts_tr * total_counts_te
                freq_exp[unique_tr[i]] = val
            _, p_val = chisquare(freq_obs, f_exp=freq_exp)
        else:
            # In almost all cases, we resort to obtaining a p-value from the chi-squared test's contingency table.
            freq_conc = np.array([freq_exp, freq_obs])
            _, p_val, _, _ = chi2_contingency(freq_conc)
        
        return p_val 
Example 43
Project: Splunking-Crime   Author: nccgroup   File: proportion.py    GNU Affero General Public License v3.0 5 votes vote down vote up
def proportions_chisquare_allpairs(count, nobs, multitest_method='hs'):
    '''chisquare test of proportions for all pairs of k samples

    Performs a chisquare test for proportions for all pairwise comparisons.
    The alternative is two-sided.

    Parameters
    ----------
    count : integer or array_like
        the number of successes in nobs trials.
    nobs : integer
        the number of trials or observations.
    multitest_method : string
        This chooses the method for the multiple testing p-value correction,
        that is used as default in the results.
        It can be any method that is available in ``multipletesting``.
        The default is Holm-Sidak 'hs'.

    Returns
    -------
    result : AllPairsResults instance
        The returned results instance has several statistics, such as p-values,
        attached, and additional methods for using a non-default
        ``multitest_method``.

    Notes
    -----
    Yates continuity correction is not available.
    '''
    # Upper-triangle indices enumerate every unordered pair of samples.
    pairs = lzip(*np.triu_indices(len(count), 1))
    pvals = [proportions_chisquare(count[list(pair)], nobs[list(pair)])[1]
             for pair in pairs]
    return AllPairsResults(pvals, pairs, multitest_method=multitest_method)
Example 44
Project: senior-design   Author: james-tate   File: test_discrete_basic.py    GNU General Public License v2.0 5 votes vote down vote up
def test_discrete_basic():
    """Nose-style generator test: yield the basic checks (sample moments,
    cdf/ppf round trip, pmf/cdf consistency, chi-square goodness of fit)
    for every discrete distribution listed in ``distdiscrete``."""
    for distname, arg in distdiscrete:
        distfn = getattr(stats,distname)
        #assert stats.dlaplace.rvs(0.8) is not None
        np.random.seed(9765456)
        rvs = distfn.rvs(size=2000,*arg)
        supp = np.unique(rvs)
        m,v = distfn.stats(*arg)
        #yield npt.assert_almost_equal(rvs.mean(), m, decimal=4,err_msg='mean')
        #yield npt.assert_almost_equal, rvs.mean(), m, 2, 'mean' # does not work
        yield check_sample_meanvar, rvs.mean(), m, distname + ' sample mean test'
        yield check_sample_meanvar, rvs.var(), v, distname + ' sample var test'
        yield check_cdf_ppf, distfn, arg, distname + ' cdf_ppf'
        yield check_cdf_ppf2, distfn, arg, supp, distname + ' cdf_ppf'
        yield check_pmf_cdf, distfn, arg, distname + ' pmf_cdf'

        # zipf doesn't fail, but generates floating point warnings.
        # Should be checked.
        if not distname in ['zipf']:
            yield check_oth, distfn, arg, distname + ' oth'
            skurt = stats.kurtosis(rvs)
            sskew = stats.skew(rvs)
            yield check_sample_skew_kurt, distfn, arg, skurt, sskew, \
                          distname + ' skew_kurt'

        # dlaplace doesn't fail, but generates lots of floating point warnings.
        # Should be checked.
        if not distname in ['dlaplace']: #['logser']:  #known failure, fixed
            alpha = 0.01
            yield check_discrete_chisquare, distfn, arg, rvs, alpha, \
                          distname + ' chisquare'
Example 45
Project: mchmm   Author: maximtrp   File: _mc.py    GNU General Public License v3.0 5 votes vote down vote up
def chisquare(self, obs=None, exp=None, **kwargs):
    '''Wrapper function for carrying out a chi-squared test using
    `scipy.stats.chisquare` method.

    Parameters
    ----------
    obs : numpy ndarray
        Observed transition frequency matrix. Defaults to
        ``self.observed_matrix``.

    exp : numpy ndarray
        Expected transition frequency matrix. Defaults to
        ``self.expected_matrix``.

    kwargs : optional
        Keyword arguments passed to `scipy.stats.chisquare` method.

    Returns
    -------
    chisq : float or numpy ndarray
        Chi-squared test statistic.

    p : float or numpy ndarray
        P value of the test.
    '''
    # Fall back to the fitted model's matrices when not supplied.
    if obs is None:
        obs = self.observed_matrix
    if exp is None:
        exp = self.expected_matrix
    return ss.chisquare(f_obs=obs, f_exp=exp, **kwargs)
Example 46
Project: DataComp   Author: Cojabi   File: stats.py    Apache License 2.0 4 votes vote down vote up
def test_cat_feats(zipper, feat_subset=None, method=None, print_data=False):
    """
    Performs hypothesis testing to identify significantly deviating categorical features. A chi-squared test is used.

    :param zipper: Dictionary storing the feature values of the datasets in a list. Feature name is used as the key.
    :param feat_subset: A list containing feature names. If given, analysis will only be performed for the contained \
    features. If not given all features will be considered.
    :param method: "chi" (default) for a chi-squared test or "fisher" for Fisher's exact test.
    :param print_data: If True, print each feature's contingency data before testing.
    :return: Dictionary mapping feature name -> {(dataset_i, dataset_j): p-value}.
    :raises ValueError: If ``method`` is neither "chi" nor "fisher".
    """

    p_values = dict()

    # consider all features if no feature subset was specified
    if feat_subset is None:
        feat_subset = zipper.keys()

    # set default method to chi-square test
    if method is None:
        method = "chi"
    if method not in ("chi", "fisher"):
        # Previously an unknown method left ``p_val`` unbound and raised a
        # confusing NameError further down; fail fast instead.
        raise ValueError("method must be 'chi' or 'fisher', got %r" % (method,))

    for feat in feat_subset:
        # initiate dict in dict for dataset1 vs dataset2, d1 vs d3 etc. per feature
        p_values[feat] = dict()

        for i in range(len(zipper[feat]) - 1):  # select dataset1
            for j in range(i + 1, len(zipper[feat])):  # select dataset2
                # count occurences of categorical features like in a confusion matrix for Chi2 tests
                test_data = [_categorical_table(zipper[feat][i]), _categorical_table(zipper[feat][j])]

                # fill missing keys in test data:
                test_data = _non_present_values_to_zero(test_data)

                # sort testing data by index(categories) to align the counts for the categories
                test_data = [data.sort_index() for data in test_data]

                # print testing data if specified
                if print_data:
                    print(feat)
                    print(pd.DataFrame(test_data))
                    print()

                if method == "chi":
                    # warn (not fail) when the chi-squared test's expected
                    # count assumption is violated
                    if (test_data[0] < 5).any() or (test_data[1] < 5).any():
                        warnings.warn(feat + " has under 5 observations in one or more groups.", UserWarning)
                    # calculate chi-square statistic and keep the p-value
                    p_val = chisquare(*test_data).pvalue
                else:  # method == "fisher"
                    p_val = fisher_exact(test_data)[1]

                p_values[feat][i + 1, j + 1] = p_val

    return p_values
Example 47
Project: LaserTOF   Author: kyleuckert   File: test_discrete_basic.py    MIT License 4 votes vote down vote up
def test_discrete_basic():
    """Nose-style generator test over ``distdiscrete``: yields per-distribution
    sanity checks, then one round of generic infrastructure checks per unique
    distribution name."""
    for distname, arg in distdiscrete:
        try:
            distfn = getattr(stats, distname)
        except TypeError:
            # entry is already a distribution instance, not a name
            distfn = distname
            distname = 'sample distribution'
        np.random.seed(9765456)
        rvs = distfn.rvs(size=2000, *arg)
        supp = np.unique(rvs)
        m, v = distfn.stats(*arg)
        yield check_cdf_ppf, distfn, arg, supp, distname + ' cdf_ppf'

        yield check_pmf_cdf, distfn, arg, distname
        yield check_oth, distfn, arg, supp, distname + ' oth'
        yield check_edge_support, distfn, arg

        alpha = 0.01
        yield (check_discrete_chisquare, distfn, arg, rvs, alpha,
               distname + ' chisquare')

    # Second pass: run the generic checks only once per distribution name.
    seen = set()
    for distname, arg in distdiscrete:
        if distname in seen:
            continue
        seen.add(distname)
        try:
            distfn = getattr(stats, distname)
        except TypeError:
            distfn = distname
            distname = 'sample distribution'
        locscale_defaults = (0,)
        meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf,
                 distfn.logsf]
        # make sure arguments are within support
        spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0, }
        k = spec_k.get(distname, 1)
        yield check_named_args, distfn, k, arg, locscale_defaults, meths
        if distname != 'sample distribution':
            yield check_scale_docstring, distfn
        yield check_random_state_property, distfn, arg
        yield check_pickling, distfn, arg

        # Entropy
        yield check_entropy, distfn, arg, distname
        if distfn.__class__._entropy != stats.rv_discrete._entropy:
            yield check_private_entropy, distfn, arg, stats.rv_discrete
Example 48
Project: LaserTOF   Author: kyleuckert   File: test_discrete_basic.py    MIT License 4 votes vote down vote up
def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
    """Chi-square goodness-of-fit test of a random sample against ``distfn``.

    The support is binned into left-half-open intervals of probability mass
    at least ``wsupp`` each; the test asserts that the chi-square p-value of
    the observed bin frequencies exceeds ``alpha``.

    Parameters
    ----------
    distfn : distribution object
        discrete distribution under test
    arg : sequence
        shape parameters of the distribution
    rvs : ndarray
        random sample drawn from the distribution
    alpha : float
        significance level, threshold for p-value
    msg : str
        prefix for the assertion message
    """
    wsupp = 0.05

    # Scan the (truncated) support, closing an interval whenever it has
    # accumulated at least wsupp of cdf mass.
    lo = int(max(distfn.a, -1000))
    hi = int(min(distfn.b, 1000))
    last = 0
    distsupp = [lo]
    distmass = []
    for ii in xrange(lo, hi + 1):
        current = distfn.cdf(ii, *arg)
        if current - last >= wsupp - 1e-14:
            distsupp.append(ii)
            distmass.append(current - last)
            last = current
            if current > (1 - wsupp):
                break
    # Fold any remaining tail mass into one final interval.
    if distsupp[-1] < distfn.b:
        distsupp.append(distfn.b)
        distmass.append(1 - last)
    distsupp = np.array(distsupp)
    distmass = np.array(distmass)

    # np.histogram uses right-half-open bins, so nudge the edges right.
    histsupp = distsupp + 1e-8
    histsupp[0] = distfn.a

    # Observed bin frequencies vs. expected counts.
    freq, hsupp = np.histogram(rvs, histsupp)
    chis, pval = stats.chisquare(np.array(freq), len(rvs)*distmass)

    npt.assert_(pval > alpha,
                'chisquare - test for %s at arg = %s with pval = %s' %
                (msg, str(arg), str(pval)))
Example 49
Project: att   Author: Centre-Alt-Rendiment-Esportiu   File: test_discrete_basic.py    GNU General Public License v3.0 4 votes vote down vote up
def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
    """Perform chisquare test for random sample of a discrete distribution

    Parameters
    ----------
    distfn : distribution object
        discrete distribution under test
    arg : sequence
        parameters of distribution
    rvs : ndarray
        random sample drawn from the distribution
    alpha : float
        significance level, threshold for p-value
    msg : str
        prefix for the assertion message

    Returns
    -------
    None
        raises an AssertionError when the chisquare p-value is not
        greater than ``alpha``

    uses global variable debug for printing results

    """
    n = len(rvs)
    nsupp = 20
    wsupp = 1.0/nsupp

    # construct intervals with minimum mass 1/nsupp
    # intervals are left-half-open as in a cdf difference;
    # the support scan is truncated to [-1000, 1000] to stay finite.
    # NOTE(review): `xrange` presumably comes from a py2/py3 compat import
    # at module level -- confirm this runs on Python 3.
    distsupport = xrange(max(distfn.a, -1000), min(distfn.b, 1000) + 1)
    last = 0
    distsupp = [max(distfn.a, -1000)]
    distmass = []
    for ii in distsupport:
        current = distfn.cdf(ii,*arg)
        # close the current interval once it holds >= wsupp of cdf mass
        if current - last >= wsupp-1e-14:
            distsupp.append(ii)
            distmass.append(current - last)
            last = current
            if current > (1-wsupp):
                break
    # fold any remaining tail mass into a final interval
    if distsupp[-1] < distfn.b:
        distsupp.append(distfn.b)
        distmass.append(1-last)
    distsupp = np.array(distsupp)
    distmass = np.array(distmass)

    # convert intervals to right-half-open as required by histogram
    histsupp = distsupp+1e-8
    histsupp[0] = distfn.a

    # find sample frequencies and perform chisquare test
    freq,hsupp = np.histogram(rvs,histsupp)
    cdfs = distfn.cdf(distsupp,*arg)  # (unused in this block)
    (chis,pval) = stats.chisquare(np.array(freq),n*distmass)

    npt.assert_(pval > alpha, 'chisquare - test for %s'
           ' at arg = %s with pval = %s' % (msg,str(arg),str(pval)))
Example 50
Project: vnpy_crypto   Author: birforce   File: gof.py    MIT License 4 votes vote down vote up
def chisquare(f_obs, f_exp=None, value=0, ddof=0, return_basic=True):
    '''chisquare goodness-of-fit test

    The null hypothesis is that the distance between the expected distribution
    and the observed frequencies is ``value``. The alternative hypothesis is
    that the distance is larger than ``value``. ``value`` is normalized in
    terms of effect size.

    The standard chisquare test has the null hypothesis that ``value=0``, that
    is the distributions are the same.

    Parameters
    ----------
    f_obs : array_like
        observed frequencies; if 2-dim, each column is treated as a sample
    f_exp : array_like, optional
        expected frequencies; defaults to the uniform distribution with the
        same total count as ``f_obs``
    value : float
        hypothesized (effect-size normalized) distance under the Null
    ddof : int
        delta degrees of freedom
    return_basic : bool
        kept for backward compatibility; both settings currently return
        ``(chisq, pvalue)``

    Returns
    -------
    chisq : float or ndarray
    pvalue : float or ndarray

    Notes
    -----
    The case with value greater than zero is similar to an equivalence test,
    that the exact null hypothesis is replaced by an approximate hypothesis.
    However, TOST "reverses" null and alternative hypothesis, while here the
    alternative hypothesis is that the distance (divergence) is larger than a
    threshold.

    References
    ----------
    McLaren, ...
    Drost,...

    See Also
    --------
    powerdiscrepancy
    scipy.stats.chisquare

    '''

    f_obs = np.asarray(f_obs)
    n_bins = len(f_obs)
    nobs = f_obs.sum(0)
    if f_exp is None:
        # uniform distribution; broadcasting also handles 2-dim f_obs, where
        # ``nobs`` is a per-column total (the previous
        # ``np.empty(n_bins).fill(...)`` raised in that case).
        f_exp = np.ones(f_obs.shape) * (nobs / float(n_bins))

    f_exp = np.asarray(f_exp, float)

    chisq = ((f_obs - f_exp)**2 / f_exp).sum(0)
    if value == 0:
        pvalue = stats.chi2.sf(chisq, n_bins - 1 - ddof)
    else:
        # approximate null "distance == value": noncentral chi2
        pvalue = stats.ncx2.sf(chisq, n_bins - 1 - ddof, value**2 * nobs)

    # both branches of the old ``return_basic`` check returned the same
    # tuple; kept as a single return.
    return chisq, pvalue    #TODO: replace with TestResults when return_basic is False
Example 51
Project: vnpy_crypto   Author: birforce   File: gof.py    MIT License 4 votes vote down vote up
def chisquare_power(effect_size, nobs, n_bins, alpha=0.05, ddof=0):
    '''power of chisquare goodness of fit test

    effect size is sqrt of chisquare statistic divided by nobs

    Parameters
    ----------
    effect_size : float
        This is the deviation from the Null of the normalized chi_square
        statistic. This follows Cohen's definition (sqrt).
    nobs : int or float
        number of observations
    n_bins : int (or float)
        number of bins, or points in the discrete distribution
    alpha : float in (0,1)
        significance level of the test, default alpha=0.05
    ddof : int
        delta degrees of freedom

    Returns
    -------
    power : float
        power of the test at given significance level at effect size

    Notes
    -----
    This function also works vectorized if all arguments broadcast.

    This can also be used to calculate the power for power divergence test.
    However, for the range of more extreme values of the power divergence
    parameter, this power is not a very good approximation for samples of
    small to medium size (Drost et al. 1989)

    References
    ----------
    Drost, ...

    See Also
    --------
    chisquare_effectsize
    statsmodels.stats.GofChisquarePower

    '''
    df = n_bins - 1 - ddof
    # critical value of the central chi2 at the requested level
    crit = stats.chi2.isf(alpha, df)
    # under the alternative the statistic is noncentral chi2 with
    # noncentrality nobs * effect_size**2
    return stats.ncx2.sf(crit, df, effect_size**2 * nobs)
Example 52
Project: vnpy_crypto   Author: birforce   File: gof.py    MIT License 4 votes vote down vote up
def chisquare_effectsize(probs0, probs1, correction=None, cohen=True, axis=0):
    '''effect size for a chisquare goodness-of-fit test

    Parameters
    ----------
    probs0 : array_like
        probabilities or cell frequencies under the Null hypothesis
    probs1 : array_like
        probabilities or cell frequencies under the Alternative hypothesis
        probs0 and probs1 need to have the same length in the ``axis`` dimension.
        and broadcast in the other dimensions
        Both probs0 and probs1 are normalized to add to one (in the ``axis``
        dimension).
    correction : None or tuple
        If None, then the effect size is the chisquare statistic divide by
        the number of observations.
        If the correction is a tuple (nobs, df), then the effectsize is
        corrected to have less bias and a smaller variance. However, the
        correction can make the effectsize negative. In that case, the
        effectsize is set to zero.
        Pederson and Johnson (1990) as referenced in McLaren et all. (1994)
    cohen : bool
        If True, then the square root is returned as in the definition of the
        effect size by Cohen (1977), If False, then the original effect size
        is returned.
    axis : int
        If the probability arrays broadcast to more than 1 dimension, then
        this is the axis over which the sums are taken.

    Returns
    -------
    effectsize : float
        effect size of chisquare test

    '''
    # normalize both hypotheses to probability distributions along `axis`
    probs0 = np.asarray(probs0, float)
    probs1 = np.asarray(probs1, float)
    probs0 = probs0 / probs0.sum(axis)
    probs1 = probs1 / probs1.sum(axis)

    delta = probs1 - probs0
    d2 = (delta**2 / probs0).sum(axis)

    if correction is not None:
        # bias/variance correction; clipped at zero since it can go negative
        nobs, df = correction
        diff = (delta / probs0).sum(axis)
        d2 = np.maximum((d2 * nobs - diff - df) / (nobs - 1.), 0)

    return np.sqrt(d2) if cohen else d2
Example 53
Project: vnpy_crypto   Author: birforce   File: proportion.py    MIT License 4 votes vote down vote up
def proportions_chisquare(count, nobs, value=None):
    '''test for proportions based on chisquare test

    Parameters
    ----------
    count : integer or array_like
        the number of successes in nobs trials. If this is array_like, then
        the assumption is that this represents the number of successes for
        each independent sample
    nobs : integer
        the number of trials or observations, with the same length as
        count.
    value : None or float or array_like
        hypothesized value of the proportion(s); if None, the null is that
        all samples share a common (unspecified) proportion

    Returns
    -------
    chi2stat : float
        test statistic for the chisquare test
    p-value : float
        p-value for the chisquare test
    (table, expected)
        table is a (k, 2) contingency table, ``expected`` is the corresponding
        table of counts that are expected under independence with given
        margins

    Notes
    -----
    Recent versions of scipy.stats have a chisquare test for independence in
    contingency tables.

    This function provides a similar interface to chisquare tests as
    ``prop.test`` in R, however without the option for Yates continuity
    correction.

    count can be the count for the number of events for a single proportion,
    or the counts for several independent proportions. If value is given, then
    all proportions are jointly tested against this value. If value is not
    given and count and nobs are not scalar, then the null hypothesis is
    that all samples have the same proportion.

    '''
    nobs = np.atleast_1d(nobs)
    table, expected, n_rows = _table_proportion(count, nobs)

    # one extra restriction when testing against an explicit value
    ddof = n_rows if value is None else n_rows - 1
    if value is not None:
        # expected successes/failures under the hypothesized proportion
        expected = np.column_stack((nobs * value, nobs * (1 - value)))

    chi2stat, pval = stats.chisquare(table.ravel(), expected.ravel(),
                                     ddof=ddof)
    return chi2stat, pval, (table, expected)
Example 54
Project: vnpy_crypto   Author: birforce   File: runs.py    MIT License 4 votes vote down vote up
def median_test_ksample(x, groups):
    '''chisquare test for equality of median/location

    Tests the null hypothesis that every group has the same fraction of
    observations above the pooled median.

    Parameters
    ----------
    x : array_like
        data values stacked for all groups
    groups : array_like
        group labels or indicator

    Returns
    -------
    stat : float
       test statistic
    pvalue : float
       pvalue from the chisquare distribution
    others ????
       currently some test output, table and expected

    '''
    data = np.asarray(x)
    labels = np.unique(groups)
    pooled_median = np.median(data)

    # per-group counts of observations above / not above the pooled median
    group_values = [data[groups == lab] for lab in labels]
    n_above = np.array([(vals > pooled_median).sum() for vals in group_values])
    group_sizes = np.array([len(vals) for vals in group_values])
    n_below = group_sizes - n_above

    nobs = group_sizes.sum()
    total_above = (data > pooled_median).sum()
    total_below = nobs - total_above
    table = np.vstack((n_below, n_above))

    # the following should be replaced by a chisquare contingency table helper
    expected = np.vstack((group_sizes * 1. / nobs * total_below,
                          group_sizes * 1. / nobs * total_above))

    if (expected < 5).any():
        print('Warning: There are cells with less than 5 expected' \
        'observations. The chisquare distribution might not be a good' \
        'approximation for the true distribution.')

    # check ddof
    return stats.chisquare(table.ravel(), expected.ravel(), ddof=1), table, expected
Example 55
Project: vnpy_crypto   Author: birforce   File: runs.py    MIT License 4 votes vote down vote up
def cochrans_q(x):
    '''Cochran's Q test for identical effect of k treatments

    Cochran's Q is a k-sample extension of the McNemar test. If there are only
    two treatments, then Cochran's Q test and McNemar test are equivalent.

    Tests that the probability of success is the same for each treatment.
    The alternative is that at least two treatments have a different
    probability of success.

    Parameters
    ----------
    x : array_like, 2d (N,k)
        data with N cases and k variables

    Returns
    -------
    q_stat : float
       test statistic
    pvalue : float
       pvalue from the chisquare distribution

    Notes
    -----
    In Wikipedia terminology, rows are blocks and columns are treatments.
    The number of rows N should be large for the chisquare distribution to be
    a good approximation.
    The Null hypothesis of the test is that all treatments have the
    same effect.

    References
    ----------
    http://en.wikipedia.org/wiki/Cochran_test
    SAS Manual for NPAR TESTS

    '''

    warnings.warn("Deprecated, use stats.cochrans_q instead", DeprecationWarning)

    data = np.asarray(x)
    levels = np.unique(data)
    n_cases, k = data.shape

    # "success" is the largest observed level; count per row and per column
    success = data == levels[-1]
    row_success = success.sum(1, float)
    col_success = success.sum(0, float)
    row_total = row_success.sum()
    col_total = col_success.sum()
    assert row_total == col_total  # just a calculation check

    # SAS manual formula; the denominator is k times the column variance
    numer = (k - 1) * (k * np.sum(col_success ** 2) - col_total ** 2)
    denom = k * row_total - np.sum(row_success ** 2)
    q_stat = numer / denom

    # Wikipedia uses a different, but equivalent expression

    return q_stat, stats.chi2.sf(q_stat, k - 1)
Example 56
Project: vnpy_crypto   Author: birforce   File: runs.py    MIT License 4 votes vote down vote up
def mcnemar(x, y=None, exact=True, correction=True):
    '''McNemar test

    Parameters
    ----------
    x, y : array_like
        two paired data samples. If y is None, then x can be a 2 by 2
        contingency table. x and y can have more than one dimension, then
        the results are calculated under the assumption that axis zero
        contains the observation for the samples.
    exact : bool
        If exact is true, then the binomial distribution will be used.
        If exact is false, then the chisquare distribution will be used, which
        is the approximation to the distribution of the test statistic for
        large sample sizes.
    correction : bool
        If true, then a continuity correction is used for the chisquare
        distribution (if exact is false.)

    Returns
    -------
    stat : float or int, array
        The test statistic is the chisquare statistic if exact is false. If the
        exact binomial distribution is used, then this contains min(n1, n2),
        where n1, n2 are the discordant-pair counts (zero in one sample but
        one in the other).
    pvalue : float or array
        p-value of the null hypothesis of equal effects.

    Notes
    -----
    This is a special case of Cochran's Q test. The results when the chisquare
    distribution is used are identical, except for continuity correction.

    '''

    warnings.warn("Deprecated, use stats.TableSymmetry instead", DeprecationWarning)

    x = np.asarray(x)
    if y is None and x.shape[0] == x.shape[1]:
        if x.shape[0] != 2:
            raise ValueError('table needs to be 2 by 2')
        n1, n2 = x[1, 0], x[0, 1]
    else:
        # discordant counts in each direction; inputs are not checked for
        # being binary — this also resembles a paired sign test
        n1 = np.sum(x < y, 0)
        n2 = np.sum(x > y, 0)

    if exact:
        stat = np.minimum(n1, n2)
        # binomial with p=0.5 is symmetric; two-sided p, capped at 1 (n1==n2)
        pval = np.minimum(stats.binom.cdf(stat, n1 + n2, 0.5) * 2, 1)
        return stat, pval

    corr = int(correction)  # convert bool to 0 or 1
    stat = (np.abs(n1 - n2) - corr) ** 2 / (1. * (n1 + n2))
    pval = stats.chi2.sf(stat, 1)
    return stat, pval
Example 57
Project: vnpy_crypto   Author: birforce   File: runs.py    MIT License 4 votes vote down vote up
def symmetry_bowker(table):
    '''Test for symmetry of a (k, k) square contingency table

    This is an extension of the McNemar test to test the Null hypothesis
    that the contingency table is symmetric around the main diagonal, that is

    n_{i, j} = n_{j, i}  for all i, j

    Parameters
    ----------
    table : array_like, 2d, (k, k)
        a square contingency table that contains the count for k categories
        in rows and columns.

    Returns
    -------
    statistic : float
        chisquare test statistic
    p-value : float
        p-value of the test statistic based on chisquare distribution
    df : int
        degrees of freedom of the chisquare distribution

    Notes
    -----
    Implementation is based on the SAS documentation, R includes it in
    `mcnemar.test` if the table is not 2 by 2.

    The pvalue is based on the chisquare distribution which requires that the
    sample size is not very small to be a good approximation of the true
    distribution. For 2x2 contingency tables exact distribution can be
    obtained with `mcnemar`

    See Also
    --------
    mcnemar

    '''

    warnings.warn("Deprecated, use stats.TableSymmetry instead", DeprecationWarning)

    arr = np.asarray(table)
    n_rows, n_cols = arr.shape
    if n_rows != n_cols:
        raise ValueError('table needs to be square')

    # pair each upper-triangle cell with its mirrored lower-triangle cell
    upper = np.triu_indices(n_rows, 1)
    above_diag = arr[upper]       # upper triangle in row order
    below_diag = arr.T[upper]     # lower triangle in column order

    # small epsilon guards against division by zero for empty cell pairs
    stat = np.sum((below_diag - above_diag) ** 2
                  / (below_diag + above_diag + 1e-20))
    df = n_rows * (n_rows - 1) / 2.
    pval = stats.chi2.sf(stat, df)

    return stat, pval, df
Example 58
Project: ble5-nrf52-mac   Author: tomasero   File: test_discrete_basic.py    MIT License 4 votes vote down vote up
def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
    """Perform chisquare test for random sample of a discrete distribution

    Parameters
    ----------
    distfn : rv_discrete instance
        distribution to test the sample against
    arg : sequence
        shape parameters of the distribution
    rvs : array_like
        random sample assumed to be drawn from ``distfn``
    alpha : float
        significance level, threshold for p-value
    msg : str
        label included in the failure message

    Raises
    ------
    AssertionError
        if the goodness-of-fit test rejects at level ``alpha``

    """
    wsupp = 0.05

    # construct intervals with minimum mass `wsupp`.
    # intervals are left-half-open as in a cdf difference
    lo = int(max(distfn.a, -1000))
    # BUG FIX: `xrange` is Python 2 only (NameError on Python 3); use `range`
    distsupport = range(lo, int(min(distfn.b, 1000)) + 1)
    last = 0
    distsupp = [lo]
    distmass = []
    for ii in distsupport:
        current = distfn.cdf(ii, *arg)
        if current - last >= wsupp - 1e-14:
            distsupp.append(ii)
            distmass.append(current - last)
            last = current
            # stop once less than one interval's worth of mass remains
            if current > (1 - wsupp):
                break
    if distsupp[-1] < distfn.b:
        # lump the remaining tail mass into one final interval
        distsupp.append(distfn.b)
        distmass.append(1 - last)
    distsupp = np.array(distsupp)
    distmass = np.array(distmass)

    # convert intervals to right-half-open as required by histogram
    histsupp = distsupp + 1e-8
    histsupp[0] = distfn.a

    # find sample frequencies and perform chisquare test
    freq, hsupp = np.histogram(rvs, histsupp)
    chis, pval = stats.chisquare(np.array(freq), len(rvs)*distmass)

    npt.assert_(pval > alpha,
                'chisquare - test for %s at arg = %s with pval = %s' %
                (msg, str(arg), str(pval)))
Example 59
Project: qso_lya_detection_pipeline   Author: davidparks21   File: absorption.py    MIT License 4 votes vote down vote up
def get_s2n_for_absorbers(sightline, lam, absorbers, nsamp=20):
    """Estimate a signal-to-noise value for each absorber and store it in place.

    For every absorber dict in ``absorbers`` this fits a scale factor of the
    Voigt profile to the observed flux at selected peak pixels and divides it
    by a rough noise estimate; the result is written into ``isys['s2n']``.
    Returns None — the function mutates the absorber dicts in place.

    Parameters are largely project types:
    - ``sightline``: object with at least ``flux`` and ``sig`` array
      attributes (presumably flux and error spectra — TODO confirm)
    - ``lam``: wavelength array passed through to ``generate_voigt_profile``
    - ``absorbers``: list of dicts with keys 'z_dla' and 'column_density'
    - ``nsamp``: only used by the commented-out alternative algorithm below
    """
    from scipy.stats import chisquare
    from scipy.optimize import minimize
    if len(absorbers) == 0:
        return
    # Loop on the DLAs
    for jj in range(len(absorbers)):
        # Find the peak
        isys = absorbers[jj]
        # Get the Voigt (to avoid it)
        voigt_flux, voigt_wave = generate_voigt_profile(isys['z_dla'], isys['column_density'], lam)
        # get peaks
        ixs_mypeaks = get_peaks_for_voigt_scaling(sightline, voigt_flux)
        if len(ixs_mypeaks) < 2:
            s2n = 1.  # KLUDGE
        else:
            # get indexes where voigt profile is between 0.2 and 0.95
            observed_values = sightline.flux[ixs_mypeaks]
            expected_values = voigt_flux[ixs_mypeaks]
            # Minimize scale variable using chi square measure for signal
            opt = minimize(lambda scale: chisquare(observed_values, expected_values * scale)[0], 1)
            opt_scale = opt.x[0]
            # Noise: median error inside the absorption core of the profile
            core = voigt_flux < 0.8
            rough_noise = np.median(sightline.sig[core])
            if rough_noise == 0:  # Occasional bad data in error array
                s2n = 0.1
            else:
                s2n = opt_scale/rough_noise
        isys['s2n'] = s2n
        # NOTE(review): the triple-quoted string below is dead code kept as
        # documentation of an alternative S/N algorithm.
        '''  Another algorithm
        # Core
        core = np.where(voigt_flux < 0.8)[0]
        # Fluxes -- Take +/-nsamp away from core
        flux_for_stats = np.concatenate([sightline.flux[core[0]-nsamp:core[0]], sightline.flux[core[1]:core[1]+nsamp]])
        # Sort
        asrt = np.argsort(flux_for_stats)
        rough_signal = flux_for_stats[asrt][int(0.9*len(flux_for_stats))]
        rough_noise = np.median(sightline.sig[core])
        #
        s2n = rough_signal/rough_noise
        '''
    return
Example 60
Project: Computable   Author: ktraunmueller   File: test_discrete_basic.py    MIT License 4 votes vote down vote up
def test_discrete_basic():
    """Generator-style (nose) smoke test for all discrete distributions.

    Iterates over the module-level ``distdiscrete`` list and yields
    ``(check_function, args...)`` tuples which the test runner executes.
    The first pass checks sample moments and cdf/ppf/pmf consistency per
    entry; the second pass runs signature and docstring checks once per
    distinct distribution name.
    """
    for distname, arg in distdiscrete:
        distfn = getattr(stats,distname)
        # npt.assert_(stats.dlaplace.rvs(0.8) is not None)
        np.random.seed(9765456)
        rvs = distfn.rvs(size=2000,*arg)
        supp = np.unique(rvs)
        m,v = distfn.stats(*arg)
        # yield npt.assert_almost_equal(rvs.mean(), m, decimal=4,err_msg='mean')
        # yield npt.assert_almost_equal, rvs.mean(), m, 2, 'mean' # does not work
        yield check_sample_meanvar, rvs.mean(), m, distname + ' sample mean test'
        yield check_sample_meanvar, rvs.var(), v, distname + ' sample var test'
        yield check_cdf_ppf, distfn, arg, distname + ' cdf_ppf'
        yield check_cdf_ppf2, distfn, arg, supp, distname + ' cdf_ppf'
        yield check_pmf_cdf, distfn, arg, distname + ' pmf_cdf'

        # zipf doesn't fail, but generates floating point warnings.
        # Should be checked.
        if not distname in ['zipf']:
            yield check_oth, distfn, arg, distname + ' oth'
            skurt = stats.kurtosis(rvs)
            sskew = stats.skew(rvs)
            yield check_sample_skew_kurt, distfn, arg, skurt, sskew, \
                          distname + ' skew_kurt'

        # dlaplace doesn't fail, but generates lots of floating point warnings.
        # Should be checked.
        if not distname in ['dlaplace']:  # ['logser']:  #known failure, fixed
            alpha = 0.01
            yield check_discrete_chisquare, distfn, arg, rvs, alpha, \
                          distname + ' chisquare'

    # second pass: once per distribution name, skipping duplicates
    seen = set()
    for distname, arg in distdiscrete:
        if distname in seen:
            continue
        seen.add(distname)
        distfn = getattr(stats,distname)
        locscale_defaults = (0,)
        meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf,
                 distfn.logsf]
        # make sure arguments are within support
        spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0, }
        k = spec_k.get(distname, 1)
        yield check_named_args, distfn, k, arg, locscale_defaults, meths
        yield check_scale_docstring, distfn
Example 61
Project: Computable   Author: ktraunmueller   File: test_discrete_basic.py    MIT License 4 votes vote down vote up
def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
    '''perform chisquare test for random sample of a discrete distribution

    Parameters
    ----------
    distfn : rv_discrete instance
        distribution to test the sample against
    arg : sequence
        shape parameters of distribution
    rvs : array_like
        random sample assumed to be drawn from ``distfn``
    alpha : float
        significance level, threshold for p-value
    msg : str
        label included in the failure message

    Raises
    ------
    AssertionError
        if the goodness-of-fit test rejects at level ``alpha``
    '''

    # define parameters for test
    n = len(rvs)
    nsupp = 20
    wsupp = 1.0/nsupp

    # construct intervals with minimum mass 1/nsupp
    # intervals are left-half-open as in a cdf difference
    # BUG FIX: `xrange` is Python 2 only; use `range` with int() bounds
    # (the support attributes may be floats, which `range` rejects)
    lo = int(max(distfn.a, -1000))
    distsupport = range(lo, int(min(distfn.b, 1000)) + 1)
    last = 0
    distsupp = [lo]
    distmass = []
    for ii in distsupport:
        current = distfn.cdf(ii,*arg)
        if current - last >= wsupp-1e-14:
            distsupp.append(ii)
            distmass.append(current - last)
            last = current
            # stop once less than one interval's worth of mass remains
            if current > (1-wsupp):
                break
    if distsupp[-1] < distfn.b:
        # lump the remaining tail mass into one final interval
        distsupp.append(distfn.b)
        distmass.append(1-last)
    distsupp = np.array(distsupp)
    distmass = np.array(distmass)

    # convert intervals to right-half-open as required by histogram
    histsupp = distsupp+1e-8
    histsupp[0] = distfn.a

    # find sample frequencies and perform chisquare test
    # (removed an unused `cdfs` intermediate that was never read)
    freq,hsupp = np.histogram(rvs,histsupp)
    (chis,pval) = stats.chisquare(np.array(freq),n*distmass)

    npt.assert_(pval > alpha, 'chisquare - test for %s'
           ' at arg = %s with pval = %s' % (msg,str(arg),str(pval)))
Example 62
Project: poker   Author: surgebiswas   File: test_discrete_basic.py    MIT License 4 votes vote down vote up
def test_discrete_basic():
    """Generator test: yield the generic discrete-distribution checks for
    every entry of the module-level ``distdiscrete`` list."""
    for distname, arg in distdiscrete:
        try:
            distfn = getattr(stats, distname)
        except TypeError:
            # the entry is already a distribution instance, not a name
            distfn = distname
            distname = 'sample distribution'
        np.random.seed(9765456)
        sample = distfn.rvs(size=2000, *arg)
        support_pts = np.unique(sample)
        mean_, var_ = distfn.stats(*arg)
        yield check_cdf_ppf, distfn, arg, support_pts, distname + ' cdf_ppf'

        yield check_pmf_cdf, distfn, arg, distname
        yield check_oth, distfn, arg, support_pts, distname + ' oth'
        yield check_edge_support, distfn, arg

        alpha = 0.01
        yield (check_discrete_chisquare, distfn, arg, sample, alpha,
               distname + ' chisquare')

    # second pass: run once per distribution name, skipping duplicates
    already_done = set()
    for distname, arg in distdiscrete:
        if distname in already_done:
            continue
        already_done.add(distname)
        try:
            distfn = getattr(stats, distname)
        except TypeError:
            distfn = distname
            distname = 'sample distribution'
        locscale_defaults = (0,)
        methods = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf,
                   distfn.logsf]
        # make sure arguments are within support
        spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0, }
        k = spec_k.get(distname, 1)
        yield check_named_args, distfn, k, arg, locscale_defaults, methods
        if distname != 'sample distribution':
            yield check_scale_docstring, distfn
        yield check_random_state_property, distfn, arg
        yield check_pickling, distfn, arg

        # Entropy
        yield check_entropy, distfn, arg, distname
        if distfn.__class__._entropy != stats.rv_discrete._entropy:
            yield check_private_entropy, distfn, arg, stats.rv_discrete
Example 63
Project: poker   Author: surgebiswas   File: test_discrete_basic.py    MIT License 4 votes vote down vote up
def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
    """Perform chisquare test for random sample of a discrete distribution

    Parameters
    ----------
    distfn : rv_discrete instance
        distribution to test the sample against
    arg : sequence
        parameters of distribution
    rvs : array_like
        random sample assumed to be drawn from ``distfn``
    alpha : float
        significance level, threshold for p-value
    msg : str
        label included in the failure message

    Raises
    ------
    AssertionError
        if the goodness-of-fit test rejects at level ``alpha``

    """
    wsupp = 0.05

    # construct intervals with minimum mass `wsupp`.
    # intervals are left-half-open as in a cdf difference
    lo = int(max(distfn.a, -1000))
    # BUG FIX: `xrange` is Python 2 only (NameError on Python 3); use `range`
    distsupport = range(lo, int(min(distfn.b, 1000)) + 1)
    last = 0
    distsupp = [lo]
    distmass = []
    for ii in distsupport:
        current = distfn.cdf(ii, *arg)
        if current - last >= wsupp - 1e-14:
            distsupp.append(ii)
            distmass.append(current - last)
            last = current
            # stop once less than one interval's worth of mass remains
            if current > (1 - wsupp):
                break
    if distsupp[-1] < distfn.b:
        # lump the remaining tail mass into one final interval
        distsupp.append(distfn.b)
        distmass.append(1 - last)
    distsupp = np.array(distsupp)
    distmass = np.array(distmass)

    # convert intervals to right-half-open as required by histogram
    histsupp = distsupp + 1e-8
    histsupp[0] = distfn.a

    # find sample frequencies and perform chisquare test
    freq, hsupp = np.histogram(rvs, histsupp)
    chis, pval = stats.chisquare(np.array(freq), len(rvs)*distmass)

    npt.assert_(pval > alpha,
                'chisquare - test for %s at arg = %s with pval = %s' %
                (msg, str(arg), str(pval)))
Example 64
Project: P3_image_processing   Author: latedude2   File: test_discrete_basic.py    MIT License 4 votes vote down vote up
def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
    """Perform chisquare test for random sample of a discrete distribution

    Parameters
    ----------
    distfn : rv_discrete instance
        distribution to test the sample against
    arg : sequence
        parameters of distribution
    rvs : array_like
        random sample assumed to be drawn from ``distfn``
    alpha : float
        significance level, threshold for p-value
    msg : str
        label included in the failure message

    Raises
    ------
    AssertionError
        if the goodness-of-fit test rejects at level ``alpha``

    """
    wsupp = 0.05

    # construct intervals with minimum mass `wsupp`.
    # intervals are left-half-open as in a cdf difference
    _a, _b = distfn.support(*arg)
    lo = int(max(_a, -1000))
    high = int(min(_b, 1000)) + 1
    # BUG FIX: `xrange` is Python 2 only (NameError on Python 3); use `range`
    distsupport = range(lo, high)
    last = 0
    distsupp = [lo]
    distmass = []
    for ii in distsupport:
        current = distfn.cdf(ii, *arg)
        if current - last >= wsupp - 1e-14:
            distsupp.append(ii)
            distmass.append(current - last)
            last = current
            # stop once less than one interval's worth of mass remains
            if current > (1 - wsupp):
                break
    if distsupp[-1] < _b:
        # lump the remaining tail mass into one final interval
        distsupp.append(_b)
        distmass.append(1 - last)
    distsupp = np.array(distsupp)
    distmass = np.array(distmass)

    # convert intervals to right-half-open as required by histogram
    histsupp = distsupp + 1e-8
    histsupp[0] = _a

    # find sample frequencies and perform chisquare test
    freq, hsupp = np.histogram(rvs, histsupp)
    chis, pval = stats.chisquare(np.array(freq), len(rvs)*distmass)

    npt.assert_(pval > alpha,
                'chisquare - test for %s at arg = %s with pval = %s' %
                (msg, str(arg), str(pval)))
Example 65
Project: GraphicDesignPatternByPython   Author: Relph1119   File: test_discrete_basic.py    MIT License 4 votes vote down vote up
def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
    """Perform chisquare test for random sample of a discrete distribution

    Parameters
    ----------
    distfn : rv_discrete instance
        distribution to test the sample against
    arg : sequence
        parameters of distribution
    rvs : array_like
        random sample assumed to be drawn from ``distfn``
    alpha : float
        significance level, threshold for p-value
    msg : str
        label included in the failure message

    Raises
    ------
    AssertionError
        if the goodness-of-fit test rejects at level ``alpha``

    """
    wsupp = 0.05

    # construct intervals with minimum mass `wsupp`.
    # intervals are left-half-open as in a cdf difference
    lo = int(max(distfn.a, -1000))
    # BUG FIX: `xrange` is Python 2 only (NameError on Python 3); use `range`
    distsupport = range(lo, int(min(distfn.b, 1000)) + 1)
    last = 0
    distsupp = [lo]
    distmass = []
    for ii in distsupport:
        current = distfn.cdf(ii, *arg)
        if current - last >= wsupp - 1e-14:
            distsupp.append(ii)
            distmass.append(current - last)
            last = current
            # stop once less than one interval's worth of mass remains
            if current > (1 - wsupp):
                break
    if distsupp[-1] < distfn.b:
        # lump the remaining tail mass into one final interval
        distsupp.append(distfn.b)
        distmass.append(1 - last)
    distsupp = np.array(distsupp)
    distmass = np.array(distmass)

    # convert intervals to right-half-open as required by histogram
    histsupp = distsupp + 1e-8
    histsupp[0] = distfn.a

    # find sample frequencies and perform chisquare test
    freq, hsupp = np.histogram(rvs, histsupp)
    chis, pval = stats.chisquare(np.array(freq), len(rvs)*distmass)

    npt.assert_(pval > alpha,
                'chisquare - test for %s at arg = %s with pval = %s' %
                (msg, str(arg), str(pval)))
Example 66
Project: DIVE-backend   Author: MacroConnections   File: numerical_comparison.py    GNU General Public License v3.0 4 votes vote down vote up
def get_valid_tests(equal_var, independent, normal, num_samples):
    '''
    Get valid tests given number of samples and statistical characterization of
    samples:

    Equal variance
    Independence
    Normality

    Parameters
    ----------
    equal_var : bool
        whether the samples have equal variance
    independent : bool
        whether the samples are independent
    normal : bool
        whether the samples are normally distributed
    num_samples : int
        number of samples being compared

    Returns
    -------
    dict
        mapping from test name to the scipy.stats test function; empty when
        no test applies to the given combination
    '''
    # start with a defined result so every branch (and unexpected
    # num_samples values) can safely return
    valid_tests = {}

    if num_samples == 1:
        valid_tests = {
            'chisquare': stats.chisquare,
            'power_divergence': stats.power_divergence,
            'kstest': stats.kstest
        }
        if normal:
            # BUG FIX: was valid_tests['input']['one_sample_ttest'], which
            # raised KeyError because 'input' is not a key of valid_tests
            valid_tests['one_sample_ttest'] = stats.ttest_1samp

    elif num_samples == 2:
        if independent:
            valid_tests = {
                'mannwhitneyu': stats.mannwhitneyu,
                'kruskal': stats.kruskal,
                'ks_2samp': stats.ks_2samp
            }
            if normal:
                valid_tests['two_sample_ttest'] = stats.ttest_ind
                if equal_var:
                    valid_tests['f_oneway'] = stats.f_oneway
        else:
            valid_tests = {
                'two_sample_ks': stats.ks_2samp,
                'wilcoxon': stats.wilcoxon
            }
            if normal:
                valid_tests['two_sample_related_ttest'] = stats.ttest_rel

    elif num_samples >= 3:
        if independent:
            valid_tests = {
                'kruskal': stats.kruskal
            }
            if normal and equal_var:
                valid_tests['f_oneway'] = stats.f_oneway
        else:
            # BUG FIX: valid_tests was previously unbound in this branch,
            # raising UnboundLocalError
            valid_tests['friedmanchisquare'] = stats.friedmanchisquare

    return valid_tests
Example 67
Project: pdftabextract   Author: WZBSocialScienceCenter   File: clustering.py    Apache License 2.0 4 votes vote down vote up
def adjust_bad_positions(positions_per_page, good_positions=None, pos_check_signif_level=0.05):
    """
    Try to adjust the positions in the dict `positions_per_page` (page number -> positions mapping).

    The mean column widths of "good positions" are computed and each page's
    individual column widths are compared against them with a Chi squared
    test. Pages whose test p-value falls below `pos_check_signif_level`
    (or whose position count/ordering is wrong) are considered bad and are
    rebuilt from the page's first position plus the cumulative mean widths.
    If `good_positions` is None, the good positions are those whose length
    matches the median number of positions over all pages.
    """

    if not 0 < pos_check_signif_level <= 1:
        raise ValueError('`signif_level` must be in range (0,1]')

    if not positions_per_page or not isinstance(positions_per_page, dict):
        raise ValueError('`positions_per_page` must be a non-empty dict')

    all_positions = positions_per_page.values()
    n_expected = int(np.median([len(pos) for pos in all_positions]))

    if not good_positions:
        # no "good" positions list given -> use the pages whose number of
        # positions matches the median count
        good_positions = [pos for pos in all_positions if len(pos) == n_expected]

    if not good_positions:   # still none -> nothing we can do
        return positions_per_page

    col_means = [np.mean(col) for col in zip(*good_positions)]
    mean_widths = np.diff(col_means)
    if min(mean_widths) < 0:
        raise ValueError('invalid positions: got negative mean width')

    adjusted_positions = {}
    for p_num, positions in positions_per_page.items():
        wrong_layout = (len(positions) != n_expected
                        or min(np.diff(positions)) < 0)
        if wrong_layout:
            # wrong number of positions or negative widths -> force adjustment
            p_val = 0
        else:
            _, p_val = chisquare(np.diff(positions), mean_widths)

        if p_val < pos_check_signif_level:
            # rebuild: first given position as offset plus cumulative mean
            # widths derived from the "good" positions
            offset = positions[0]
            positions = np.concatenate([[offset], offset + np.cumsum(mean_widths)])

        adjusted_positions[p_num] = positions

    return adjusted_positions


#%% Helper functions 
Example 68
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_discrete_basic.py    Apache License 2.0 4 votes vote down vote up
def test_discrete_basic():
    """Generator-style (nose) smoke test for all discrete distributions.

    Iterates over the module-level ``distdiscrete`` list and yields
    ``(check_function, args...)`` tuples which the test runner executes.
    The first pass checks cdf/ppf/pmf consistency and goodness of fit per
    entry; the second pass runs per-distribution signature, pickling and
    entropy checks once per distinct distribution name.
    """
    for distname, arg in distdiscrete:
        try:
            distfn = getattr(stats, distname)
        except TypeError:
            # entry is already a distribution instance, not a name
            distfn = distname
            distname = 'sample distribution'
        np.random.seed(9765456)
        rvs = distfn.rvs(size=2000, *arg)
        supp = np.unique(rvs)
        m, v = distfn.stats(*arg)
        yield check_cdf_ppf, distfn, arg, supp, distname + ' cdf_ppf'

        yield check_pmf_cdf, distfn, arg, distname
        yield check_oth, distfn, arg, supp, distname + ' oth'
        yield check_edge_support, distfn, arg

        alpha = 0.01
        yield (check_discrete_chisquare, distfn, arg, rvs, alpha,
               distname + ' chisquare')

    # second pass: once per distribution name, skipping duplicates
    seen = set()
    for distname, arg in distdiscrete:
        if distname in seen:
            continue
        seen.add(distname)
        try:
            distfn = getattr(stats, distname)
        except TypeError:
            distfn = distname
            distname = 'sample distribution'
        locscale_defaults = (0,)
        meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf,
                 distfn.logsf]
        # make sure arguments are within support
        spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0, }
        k = spec_k.get(distname, 1)
        yield check_named_args, distfn, k, arg, locscale_defaults, meths
        if distname != 'sample distribution':
            yield check_scale_docstring, distfn
        yield check_random_state_property, distfn, arg
        yield check_pickling, distfn, arg

        # Entropy
        yield check_entropy, distfn, arg, distname
        if distfn.__class__._entropy != stats.rv_discrete._entropy:
            yield check_private_entropy, distfn, arg, stats.rv_discrete
Example 69
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_discrete_basic.py    Apache License 2.0 4 votes vote down vote up
def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
    """Perform chisquare test for random sample of a discrete distribution

    Parameters
    ----------
    distfn : rv_discrete instance
        distribution to test the sample against
    arg : sequence
        parameters of distribution
    rvs : array_like
        random sample assumed to be drawn from ``distfn``
    alpha : float
        significance level, threshold for p-value
    msg : str
        label included in the failure message

    Raises
    ------
    AssertionError
        if the goodness-of-fit test rejects at level ``alpha``

    """
    wsupp = 0.05

    # construct intervals with minimum mass `wsupp`.
    # intervals are left-half-open as in a cdf difference
    lo = int(max(distfn.a, -1000))
    # BUG FIX: `xrange` is Python 2 only (NameError on Python 3); use `range`
    distsupport = range(lo, int(min(distfn.b, 1000)) + 1)
    last = 0
    distsupp = [lo]
    distmass = []
    for ii in distsupport:
        current = distfn.cdf(ii, *arg)
        if current - last >= wsupp - 1e-14:
            distsupp.append(ii)
            distmass.append(current - last)
            last = current
            # stop once less than one interval's worth of mass remains
            if current > (1 - wsupp):
                break
    if distsupp[-1] < distfn.b:
        # lump the remaining tail mass into one final interval
        distsupp.append(distfn.b)
        distmass.append(1 - last)
    distsupp = np.array(distsupp)
    distmass = np.array(distmass)

    # convert intervals to right-half-open as required by histogram
    histsupp = distsupp + 1e-8
    histsupp[0] = distfn.a

    # find sample frequencies and perform chisquare test
    freq, hsupp = np.histogram(rvs, histsupp)
    chis, pval = stats.chisquare(np.array(freq), len(rvs)*distmass)

    npt.assert_(pval > alpha,
                'chisquare - test for %s at arg = %s with pval = %s' %
                (msg, str(arg), str(pval)))
Example 70
Project: Splunking-Crime   Author: nccgroup   File: gof.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def chisquare(f_obs, f_exp=None, value=0, ddof=0, return_basic=True):
    '''chisquare goodness-of-fit test

    Tests the null hypothesis that the normalized distance between the
    expected distribution and the observed frequencies equals ``value``,
    against the alternative that the distance is larger.  With the default
    ``value=0`` this is the standard chisquare test that the distributions
    are the same.

    Parameters
    ----------
    f_obs : array_like
        observed frequencies
    f_exp : None or array_like
        expected frequencies; if None, a uniform distribution over the
        bins is assumed
    value : float
        hypothesized normalized distance under the null; a nonzero value
        switches the p-value to a noncentral chi-square distribution
    ddof : int
        adjustment to the degrees of freedom
    return_basic : bool
        currently both settings return ``(chisq, pvalue)``

    Returns
    -------
    chisq : float
        chisquare test statistic
    pvalue : float
        p-value of the test

    Notes
    -----
    The case with value greater than zero is similar to an equivalence
    test, where the exact null hypothesis is replaced by an approximate
    one.  Unlike TOST, the alternative here is that the distance
    (divergence) is larger than the threshold.

    References
    ----------
    McLaren, ...
    Drost,...

    See Also
    --------
    powerdiscrepancy
    scipy.stats.chisquare

    '''
    f_obs = np.asarray(f_obs)
    n_bins = len(f_obs)
    nobs = f_obs.sum(0)

    if f_exp is None:
        # default: spread the total count uniformly over all bins
        f_exp = np.empty(n_bins, float)
        f_exp.fill(nobs / float(n_bins))
    f_exp = np.asarray(f_exp, float)

    chisq = ((f_obs - f_exp) ** 2 / f_exp).sum(0)

    df = n_bins - 1 - ddof
    if value == 0:
        pvalue = stats.chi2.sf(chisq, df)
    else:
        # noncentrality grows with sample size at fixed effect size
        pvalue = stats.ncx2.sf(chisq, df, value ** 2 * nobs)

    # TODO: replace with TestResults when return_basic is False
    return chisq, pvalue
Example 71
Project: Splunking-Crime   Author: nccgroup   File: gof.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def chisquare_power(effect_size, nobs, n_bins, alpha=0.05, ddof=0):
    '''power of the chisquare goodness-of-fit test

    The effect size is the square root of the chisquare statistic divided
    by nobs, following Cohen's definition.

    Parameters
    ----------
    effect_size : float
        deviation from the Null of the normalized chi_square statistic
        (Cohen's definition, i.e. the square root)
    nobs : int or float
        number of observations
    n_bins : int (or float)
        number of bins, or points in the discrete distribution
    alpha : float in (0,1)
        significance level of the test, default alpha=0.05
    ddof : int
        adjustment to the degrees of freedom

    Returns
    -------
    power : float
        power of the test at the given significance level and effect size

    Notes
    -----
    Works vectorized if all arguments broadcast.

    Can also be used to compute power for the power divergence test,
    although for more extreme values of the power divergence parameter it
    is not a very good approximation for small to medium samples
    (Drost et al. 1989).

    References
    ----------
    Drost, ...

    See Also
    --------
    chisquare_effectsize
    statsmodels.stats.GofChisquarePower

    '''
    df = n_bins - 1 - ddof
    # power = P(noncentral chi2 > central-chi2 critical value)
    crit = stats.chi2.isf(alpha, df)
    return stats.ncx2.sf(crit, df, nobs * effect_size ** 2)
Example 72
Project: Splunking-Crime   Author: nccgroup   File: gof.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def chisquare_effectsize(probs0, probs1, correction=None, cohen=True, axis=0):
    '''effect size for a chisquare goodness-of-fit test

    Parameters
    ----------
    probs0 : array_like
        probabilities or cell frequencies under the Null hypothesis
    probs1 : array_like
        probabilities or cell frequencies under the Alternative hypothesis;
        must match probs0 in length along ``axis`` and broadcast in the
        other dimensions.  Both arrays are normalized to sum to one along
        ``axis``.
    correction : None or tuple (nobs, df)
        If None, the effect size is the chisquare statistic divided by the
        number of observations.  If a tuple (nobs, df), the effect size is
        bias- and variance-corrected (Pederson and Johnson 1990, as
        referenced in McLaren et al. 1994); a negative corrected value is
        clipped to zero.
    cohen : bool
        If True, return the square root as in Cohen's (1977) definition;
        otherwise return the raw effect size.
    axis : int
        axis over which sums are taken when the arrays broadcast to more
        than one dimension

    Returns
    -------
    effectsize : float
        effect size of the chisquare test

    '''
    p0 = np.asarray(probs0, float)
    p1 = np.asarray(probs1, float)
    # normalize both to proper probability distributions
    p0 = p0 / p0.sum(axis)
    p1 = p1 / p1.sum(axis)

    delta = p1 - p0
    d2 = (delta ** 2 / p0).sum(axis)

    if correction is not None:
        nobs, df = correction
        diff = (delta / p0).sum(axis)
        # corrected effect size can go negative; clip at zero
        d2 = np.maximum((d2 * nobs - diff - df) / (nobs - 1.), 0)

    return np.sqrt(d2) if cohen else d2
Example 73
Project: Splunking-Crime   Author: nccgroup   File: proportion.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def proportions_chisquare(count, nobs, value=None):
    '''chisquare test for proportions

    Parameters
    ----------
    count : integer or array_like
        the number of successes in nobs trials; if array_like, each entry
        is the number of successes for an independent sample
    nobs : integer
        the number of trials or observations, with the same length as
        count
    value : None or float or array_like
        if given, all proportions are jointly tested against this value;
        if None and count/nobs are not scalar, the null hypothesis is that
        all samples have the same proportion

    Returns
    -------
    chi2stat : float
        test statistic for the chisquare test
    p-value : float
        p-value for the chisquare test
    (table, expected)
        ``table`` is a (k, 2) contingency table, ``expected`` is the
        corresponding table of counts expected under independence with the
        given margins

    Notes
    -----
    Provides an interface similar to ``prop.test`` in R, but without the
    option for Yates continuity correction.  Recent versions of
    scipy.stats also have a chisquare test for independence in
    contingency tables.

    '''
    nobs = np.atleast_1d(nobs)
    table, expected, n_rows = _table_proportion(count, nobs)

    if value is None:
        ddof = n_rows
    else:
        # jointly test all proportions against the hypothesized value
        expected = np.column_stack((nobs * value, nobs * (1 - value)))
        ddof = n_rows - 1

    chi2stat, pval = stats.chisquare(table.ravel(), expected.ravel(),
                                     ddof=ddof)
    return chi2stat, pval, (table, expected)
Example 74
Project: Splunking-Crime   Author: nccgroup   File: runs.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def median_test_ksample(x, groups):
    '''chisquare test for equality of median/location across groups

    Tests whether all groups have the same fraction of observations above
    the overall median.

    Parameters
    ----------
    x : array_like
        data values stacked for all groups
    groups : array_like
        group labels or indicator

    Returns
    -------
    stat : float
       test statistic
    pvalue : float
       pvalue from the chisquare distribution
    others ????
       currently some test output, table and expected

    '''
    x = np.asarray(x)
    labels = np.unique(groups)
    samples = [x[groups == lab] for lab in labels]
    grand_median = np.median(x)

    # per-group counts above/below the grand median
    counts_above = np.array([(sample > grand_median).sum()
                             for sample in samples])
    group_sizes = np.array([len(sample) for sample in samples])
    counts_below = group_sizes - counts_above

    nobs = group_sizes.sum()
    n_above = (x > grand_median).sum()
    n_below = nobs - n_above

    table = np.vstack((counts_below, counts_above))

    # expected counts under independence; should be replaced by a
    # chisquare contingency-table helper
    expected = np.vstack((group_sizes * 1. / nobs * n_below,
                          group_sizes * 1. / nobs * n_above))

    if (expected < 5).any():
        print('Warning: There are cells with less than 5 expected' \
        'observations. The chisquare distribution might not be a good' \
        'approximation for the true distribution.')

    #check ddof
    return stats.chisquare(table.ravel(), expected.ravel(), ddof=1), table, expected
Example 75
Project: Splunking-Crime   Author: nccgroup   File: runs.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def cochrans_q(x):
    '''Cochran's Q test for identical effect of k treatments

    A k-sample extension of the McNemar test; with only two treatments
    the two tests are equivalent.  The Null hypothesis is that the
    probability of success is the same for every treatment; the
    alternative is that at least two treatments differ.

    Parameters
    ----------
    x : array_like, 2d (N,k)
        data with N cases and k variables

    Returns
    -------
    q_stat : float
       test statistic
    pvalue : float
       pvalue from the chisquare distribution

    Notes
    -----
    In Wikipedia terminology, rows are blocks and columns are treatments.
    N should be large for the chisquare distribution to be a good
    approximation.

    References
    ----------
    http://en.wikipedia.org/wiki/Cochran_test
    SAS Manual for NPAR TESTS

    '''
    warnings.warn("Deprecated, use stats.cochrans_q instead", DeprecationWarning)

    x = np.asarray(x)
    levels = np.unique(x)
    n_cases, k = x.shape

    # "success" is the largest observed level
    success = (x == levels[-1])
    row_success = success.sum(1, float)
    col_success = success.sum(0, float)
    total_row = row_success.sum()
    total_col = col_success.sum()
    assert total_row == total_col  # just a calculation check

    # SAS manual formulation; the denominator looks just like k times the
    # variance of the columns.  Wikipedia uses a different but equivalent
    # expression with rows and columns swapped.
    q_stat = (k - 1) * (k * np.sum(col_success ** 2) - total_col ** 2) \
             / (k * total_row - np.sum(row_success ** 2))

    return q_stat, stats.chi2.sf(q_stat, k - 1)
Example 76
Project: Splunking-Crime   Author: nccgroup   File: runs.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def mcnemar(x, y=None, exact=True, correction=True):
    '''McNemar test

    Parameters
    ----------
    x, y : array_like
        two paired data samples.  If y is None, x can be a 2 by 2
        contingency table.  x and y can have more than one dimension;
        results are then computed assuming axis zero holds the
        observations.
    exact : bool
        if true, use the exact binomial distribution; otherwise use the
        chisquare approximation for large samples
    correction : bool
        if true, apply a continuity correction for the chisquare
        distribution (only when exact is false)

    Returns
    -------
    stat : float or int, array
        the chisquare statistic if exact is false; otherwise min(n1, n2),
        where n1, n2 count cases that are zero in one sample but one in
        the other
    pvalue : float or array
        p-value of the null hypothesis of equal effects

    Notes
    -----
    This is a special case of Cochran's Q test; results with the
    chisquare distribution are identical except for continuity
    correction.

    '''
    warnings.warn("Deprecated, use stats.TableSymmetry instead", DeprecationWarning)

    x = np.asarray(x)
    if y is None and x.shape[0] == x.shape[1]:
        if x.shape[0] != 2:
            raise ValueError('table needs to be 2 by 2')
        n1, n2 = x[1, 0], x[0, 1]
    else:
        # treated as paired samples: count discordant pairs each direction
        # (binary inputs are not checked; this also resembles a paired
        # sign test)
        n1 = np.sum(x < y, 0)
        n2 = np.sum(x > y, 0)

    if not exact:
        shift = int(correction)  # bool -> 0 or 1
        stat = (np.abs(n1 - n2) - shift) ** 2 / (1. * (n1 + n2))
        return stat, stats.chi2.sf(stat, 1)

    # exact binomial test; binom is symmetric with p=0.5, and the
    # doubled cdf is capped at 1 for the n1 == n2 case
    stat = np.minimum(n1, n2)
    pval = np.minimum(stats.binom.cdf(stat, n1 + n2, 0.5) * 2, 1)
    return stat, pval
Example 77
Project: Splunking-Crime   Author: nccgroup   File: runs.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def symmetry_bowker(table):
    '''Test for symmetry of a (k, k) square contingency table

    An extension of the McNemar test: the Null hypothesis is that the
    table is symmetric around the main diagonal, i.e.
    n_{i, j} = n_{j, i} for all i, j.

    Parameters
    ----------
    table : array_like, 2d, (k, k)
        square contingency table with counts for k categories in rows and
        columns

    Returns
    -------
    statistic : float
        chisquare test statistic
    p-value : float
        p-value of the test statistic based on chisquare distribution
    df : int
        degrees of freedom of the chisquare distribution

    Notes
    -----
    Based on the SAS documentation; R includes this in `mcnemar.test`
    when the table is not 2 by 2.  The chisquare-based p-value requires a
    not-too-small sample size; for 2x2 tables an exact distribution is
    available via `mcnemar`.

    See Also
    --------
    mcnemar

    '''
    warnings.warn("Deprecated, use stats.TableSymmetry instead", DeprecationWarning)

    table = np.asarray(table)
    n_rows, n_cols = table.shape
    if n_rows != n_cols:
        raise ValueError('table needs to be square')

    # np.tril_indices(k, -1) would give the lower triangle but not in
    # Fortran order, so index both triangles through the upper indices
    upper = np.triu_indices(n_rows, 1)
    above = table[upper]        # upper triangle in row order
    below = table.T[upper]      # lower triangle in column order

    # tiny constant guards against 0/0 for empty symmetric cell pairs
    stat = ((below - above) ** 2 / (below + above + 1e-20)).sum()
    df = n_rows * (n_rows - 1) / 2.
    pval = stats.chi2.sf(stat, df)

    return stat, pval, df
Example 78
Project: senior-design   Author: james-tate   File: test_discrete_basic.py    GNU General Public License v2.0 4 votes vote down vote up
def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
    '''perform chisquare test for random sample of a discrete distribution

    Parameters
    ----------
    distfn : distribution
        scipy.stats discrete distribution; shape parameters are passed
        separately via ``arg``
    arg : sequence
        parameters of distribution
    rvs : array_like
        random sample to test against the distribution
    alpha : float
        significance level, threshold for p-value
    msg : str
        label used in the assertion message

    Raises
    ------
    AssertionError
        if the chisquare p-value is not larger than ``alpha``
    '''

    # define parameters for test
    n = len(rvs)
    nsupp = 20
    wsupp = 1.0/nsupp

    # construct intervals with minimum mass 1/nsupp
    # intervals are left-half-open as in a cdf difference
    # int() + range() keep this Python 3 compatible: `xrange` is Python 2
    # only, and distribution support bounds may be floats
    lo = int(max(distfn.a, -1000))
    distsupport = range(lo, int(min(distfn.b, 1000)) + 1)
    last = 0
    distsupp = [lo]
    distmass = []
    for ii in distsupport:
        current = distfn.cdf(ii, *arg)
        if current - last >= wsupp - 1e-14:
            distsupp.append(ii)
            distmass.append(current - last)
            last = current
            if current > (1 - wsupp):
                break
    if distsupp[-1] < distfn.b:
        # put all remaining mass into one final interval up to the support end
        distsupp.append(distfn.b)
        distmass.append(1 - last)
    distsupp = np.array(distsupp)
    distmass = np.array(distmass)

    # convert intervals to right-half-open as required by histogram
    histsupp = distsupp + 1e-8
    histsupp[0] = distfn.a

    # find sample frequencies and perform chisquare test
    # (a previous version also computed distfn.cdf(distsupp, *arg) here,
    # but the result was never used)
    freq, hsupp = np.histogram(rvs, histsupp)
    (chis, pval) = stats.chisquare(np.array(freq), n*distmass)

    npt.assert_(pval > alpha, 'chisquare - test for %s'
           ' at arg = %s with pval = %s' % (msg, str(arg), str(pval)))
Example 79
Project: py-dbclasd   Author: spalaciob   File: dbclasd.py    GNU General Public License v2.0 4 votes vote down vote up
def is_distribution_stable(pts, pt, nnfinder, thresh0=None):
    """
    Perform a chi-square test on pts and compare the results to pts with pt added to it.
    If the second test stays below the first one, the distribution hasn't changed.
    :param pts: NxM 2d-array with N points of M dimensions.
    :param pt: point to be inserted into pts to test whether adding it, changes the distribution significantly.
    :param nnfinder: instance of scipy's NearestNeighbor fit with all points in the dataset.
    :param thresh0: an external fixed threshold to compare against instead of the statistic from pts. This is useful
    when setting an upper/lower bound for points that are being tested against the same set of reference points.
    :return: whether the distribution remains stable with respect to the one of pts and the threshold that was used.
    """
    def _poisson_chisq(points):
        # Chi-square goodness-of-fit of the occupied grid cells against a
        # Poisson distribution with the estimated mean occupancy.
        area, gl, whole_grid = cluster_area(points, nnfinder)
        occupied = whole_grid[whole_grid >= 1]  # Assume they are connected (they are, at least diagonally)
        bins = np.unique(occupied)
        lam = occupied.mean()
        probs = poisson.pmf(bins, lam)
        # Last probability expressed as P(x >= p_n) instead of P(x == p_n)
        probs[-1] = 1 - probs[:-1].sum()
        observed = [occupied[occupied == b].sum() for b in bins] / occupied.sum()
        return sci_chisquare(observed, f_exp=probs, ddof=bins.size - 2)

    if thresh0 is None:
        chisq, p = _poisson_chisq(pts)
    else:
        # Reuse the externally supplied threshold; p-value is a sentinel.
        chisq, p = thresh0, -1.

    chisq_2, p_2 = _poisson_chisq(np.vstack((pts, pt)))

    new_thresh = chisq if thresh0 is None else thresh0
    return chisq >= chisq_2, new_thresh