Python scipy.stats.ks_2samp() Examples

The following are code examples showing how to use scipy.stats.ks_2samp(), collected from open source Python projects.

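For orientation, here is a minimal, self-contained sketch of a ks_2samp call on two synthetic samples (the arrays and seed are illustrative, not taken from any of the projects below):

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.RandomState(0)
sample_a = rng.normal(loc=0.0, scale=1.0, size=500)
sample_b = rng.normal(loc=0.5, scale=1.0, size=500)

# ks_2samp returns the KS statistic (the maximum distance between the two
# empirical CDFs) and a p-value; a small p-value suggests the two samples
# were drawn from different distributions.
statistic, pvalue = ks_2samp(sample_a, sample_b)
print(statistic, pvalue)
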
Example 1
Project: GeoPy   Author: aerler   File: stats.py    GNU General Public License v3.0
def ks_2samp_wrapper(data, size1=None, ignoreNaN=True):
  ''' Apply the Kolmogorov-Smirnov test to check whether two samples are drawn from the same
      underlying (continuous) distribution. This is a wrapper for the SciPy function that
      removes NaN's, allows application over a field, and only returns the p-value. '''
  data1 = data[:size1]; data2 = data[size1:]
  if ignoreNaN:
    nonans1 = np.invert(np.isnan(data1)) # test for NaN's
    nonans2 = np.invert(np.isnan(data2))
    if np.sum(nonans1) < 3 or np.sum(nonans2) < 3: return np.nan # require at least 3 non-NaN values in each sample
    data1 = data1[nonans1]; data2 = data2[nonans2] # remove NaN's
  # apply test
  D, pval = ss.ks_2samp(data1, data2); del D
  return pval
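
A hypothetical call, concatenating two samples and passing the length of the first as size1 (the arrays are illustrative; numpy as np and scipy.stats as ss are assumed to be imported as in GeoPy's stats.py):

import numpy as np

a = np.array([1.0, 2.0, np.nan, 3.0, 4.0])
b = np.array([1.5, 2.5, 3.5, 4.5])
# The wrapper expects both samples concatenated into a single array,
# plus the length of the first sample as the split point.
pval = ks_2samp_wrapper(np.concatenate([a, b]), size1=len(a))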


# Student's t-test for two independent samples
Example 2
Project: ble5-nrf52-mac   Author: tomasero   File: test_multivariate.py    MIT License
def test_pairwise_distances(self):
        # Test that the distribution of pairwise distances is close to correct.
        np.random.seed(514)

        def random_ortho(dim):
            u, _s, v = np.linalg.svd(np.random.normal(size=(dim, dim)))
            return np.dot(u, v)

        for dim in range(2, 6):
            def generate_test_statistics(rvs, N=1000, eps=1e-10):
                stats = np.array([
                    np.sum((rvs(dim=dim) - rvs(dim=dim))**2)
                    for _ in range(N)
                ])
                # Add a bit of noise to account for numeric accuracy.
                stats += np.random.uniform(-eps, eps, size=stats.shape)
                return stats

            expected = generate_test_statistics(random_ortho)
            actual = generate_test_statistics(scipy.stats.ortho_group.rvs)

            _D, p = scipy.stats.ks_2samp(expected, actual)

            assert_array_less(.05, p) 
Example 3
Project: P3_image_processing   Author: latedude2   File: test_stats.py    MIT License
def testMiddlingBoth(self):
        # 500, 600
        n1, n2 = 500, 600
        delta = 1.0/n1/n2/2/2
        x = np.linspace(1, 200, n1) - delta
        y = np.linspace(2, 200, n2)
        self._testOne(x, y, 'two-sided', 2000.0 / n1 / n2, 1.0, mode='auto')
        self._testOne(x, y, 'two-sided', 2000.0 / n1 / n2, 1.0, mode='asymp')
        self._testOne(x, y, 'greater', 2000.0 / n1 / n2, 0.9697596024683929, mode='asymp')
        self._testOne(x, y, 'less', 500.0 / n1 / n2, 0.9968735843165021, mode='asymp')
        with suppress_warnings() as sup:
            sup.filter(RuntimeWarning, "ks_2samp: Exact calculation overflowed. Switching to mode=asymp")
            self._testOne(x, y, 'greater', 2000.0 / n1 / n2, 0.9697596024683929, mode='exact')
            self._testOne(x, y, 'less', 500.0 / n1 / n2, 0.9968735843165021, mode='exact')
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            self._testOne(x, y, 'less', 500.0 / n1 / n2, 0.9968735843165021, mode='exact')
            _check_warnings(w, RuntimeWarning, 1) 
Example 4
Project: P3_image_processing   Author: latedude2   File: test_stats.py    MIT License
def testMediumBoth(self):
        # 1000, 1100
        n1, n2 = 1000, 1100
        delta = 1.0/n1/n2/2/2
        x = np.linspace(1, 200, n1) - delta
        y = np.linspace(2, 200, n2)
        self._testOne(x, y, 'two-sided', 6600.0 / n1 / n2, 1.0, mode='asymp')
        self._testOne(x, y, 'two-sided', 6600.0 / n1 / n2, 1.0, mode='auto')
        self._testOne(x, y, 'greater', 6600.0 / n1 / n2, 0.9573185808092622, mode='asymp')
        self._testOne(x, y, 'less', 1000.0 / n1 / n2, 0.9982410869433984, mode='asymp')

        with suppress_warnings() as sup:
            sup.filter(RuntimeWarning, "ks_2samp: Exact calculation overflowed. Switching to mode=asymp")
            self._testOne(x, y, 'greater', 6600.0 / n1 / n2, 0.9573185808092622, mode='exact')
            self._testOne(x, y, 'less', 1000.0 / n1 / n2, 0.9982410869433984, mode='exact')
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            self._testOne(x, y, 'less', 1000.0 / n1 / n2, 0.9982410869433984, mode='exact')
            _check_warnings(w, RuntimeWarning, 1) 
Example 5
Project: P3_image_processing   Author: latedude2   File: test_stats.py    MIT License
def testLargeBoth(self):
        # 10000, 11000
        n1, n2 = 10000, 11000
        lcm = n1*11.0
        delta = 1.0/n1/n2/2/2
        x = np.linspace(1, 200, n1) - delta
        y = np.linspace(2, 200, n2)
        self._testOne(x, y, 'two-sided', 563.0 / lcm, 0.99915729949018561, mode='asymp')
        self._testOne(x, y, 'two-sided', 563.0 / lcm, 1.0, mode='exact')
        self._testOne(x, y, 'two-sided', 563.0 / lcm, 0.99915729949018561, mode='auto')
        self._testOne(x, y, 'greater', 563.0 / lcm, 0.7561851877420673)
        self._testOne(x, y, 'less', 10.0 / lcm, 0.9998239693191724)
        with suppress_warnings() as sup:
            sup.filter(RuntimeWarning, "ks_2samp: Exact calculation overflowed. Switching to mode=asymp")
            self._testOne(x, y, 'greater', 563.0 / lcm, 0.7561851877420673, mode='exact')
            self._testOne(x, y, 'less', 10.0 / lcm, 0.9998239693191724, mode='exact') 
Example 6
Project: P3_image_processing   Author: latedude2   File: test_multivariate.py    MIT License
def test_pairwise_distances(self):
        # Test that the distribution of pairwise distances is close to correct.
        np.random.seed(514)

        def random_ortho(dim):
            u, _s, v = np.linalg.svd(np.random.normal(size=(dim, dim)))
            return np.dot(u, v)

        for dim in range(2, 6):
            def generate_test_statistics(rvs, N=1000, eps=1e-10):
                stats = np.array([
                    np.sum((rvs(dim=dim) - rvs(dim=dim))**2)
                    for _ in range(N)
                ])
                # Add a bit of noise to account for numeric accuracy.
                stats += np.random.uniform(-eps, eps, size=stats.shape)
                return stats

            expected = generate_test_statistics(random_ortho)
            actual = generate_test_statistics(scipy.stats.ortho_group.rvs)

            _D, p = scipy.stats.ks_2samp(expected, actual)

            assert_array_less(.05, p) 
Example 7
Project: ssbio   Author: SBRG   File: atlas3.py    MIT License
def get_pca_ks_stats(self, maxrange=5):
        """Get a dictionary of PC#: K-S test stat for each """
        pc_to_phenotype_pairs = {}
        num_components = self.principal_observations_df.shape[1]
        if num_components < maxrange:
            maxrange = num_components

        phenotypes = self.principal_observations_df.phenotype.unique().tolist()
        for i in range(0, maxrange):
            phenotype_pair_to_ks = {}
            for p1, p2 in combinations(phenotypes, 2):
                p1_pc = self.principal_observations_df[self.principal_observations_df.phenotype == p1].iloc[:,i].values
                p2_pc = self.principal_observations_df[self.principal_observations_df.phenotype == p2].iloc[:,i].values
                phenotype_pair_to_ks[(p1, p2)] = ks_2samp(p1_pc, p2_pc)
            pc_to_phenotype_pairs[i + 1] = phenotype_pair_to_ks

        return pc_to_phenotype_pairs 
Example 8
Project: GraphicDesignPatternByPython   Author: Relph1119   File: test_multivariate.py    MIT License
def test_pairwise_distances(self):
        # Test that the distribution of pairwise distances is close to correct.
        np.random.seed(514)

        def random_ortho(dim):
            u, _s, v = np.linalg.svd(np.random.normal(size=(dim, dim)))
            return np.dot(u, v)

        for dim in range(2, 6):
            def generate_test_statistics(rvs, N=1000, eps=1e-10):
                stats = np.array([
                    np.sum((rvs(dim=dim) - rvs(dim=dim))**2)
                    for _ in range(N)
                ])
                # Add a bit of noise to account for numeric accuracy.
                stats += np.random.uniform(-eps, eps, size=stats.shape)
                return stats

            expected = generate_test_statistics(random_ortho)
            actual = generate_test_statistics(scipy.stats.ortho_group.rvs)

            _D, p = scipy.stats.ks_2samp(expected, actual)

            assert_array_less(.05, p) 
Example 9
Project: failing-loudly   Author: steverab   File: shift_tester.py    MIT License
def one_dimensional_test(self, X_tr, X_te):
        p_vals = []

        # For each dimension we conduct a separate KS test
        for i in range(X_tr.shape[1]):
            feature_tr = X_tr[:, i]
            feature_te = X_te[:, i]

            t_val, p_val = None, None

            if self.ot == OnedimensionalTest.KS:

                # Compute KS statistic and p-value
                t_val, p_val = ks_2samp(feature_tr, feature_te)
            elif self.ot == OnedimensionalTest.AD:
                t_val, _, p_val = anderson_ksamp([feature_tr.tolist(), feature_te.tolist()])

            p_vals.append(p_val)

        # Apply the Bonferroni correction to bound the family-wise error rate: take the minimum
        # p-value over all individual tests and compare it against a significance level divided
        # by the number of tests.
        p_vals = np.array(p_vals)
        p_val = min(np.min(p_vals), 1.0)

        return p_val, p_vals 
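
For context, the Bonferroni decision rule described in the comment compares this minimum p-value against a significance level divided by the number of tests. A minimal standalone sketch of that rule, not part of the project itself (the function name and alpha are illustrative):

import numpy as np
from scipy.stats import ks_2samp

def reject_shift(X_tr, X_te, alpha=0.05):
    # One two-sample KS test per feature column.
    p_vals = np.array([ks_2samp(X_tr[:, i], X_te[:, i]).pvalue
                       for i in range(X_tr.shape[1])])
    # Bonferroni: flag a shift if the smallest p-value falls below
    # alpha divided by the number of tests.
    return p_vals.min() < alpha / X_tr.shape[1]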
Example 10
Project: default-credit-card-prediction   Author: alexpnt   File: feature_selection.py    MIT License
def kolmogorov_smirnov_two_sample_test(X,y):
	"""
	Performs the two-sample Kolmogorov-Smirnov test, testing whether the feature values of each class are drawn from identical distributions

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	"""

	kolmogorov_smirnov=[(0,0)]*len(X[0])
	# print(kolmogorov_smirnov)
	for feature_col in range(len(X[0])):
			ks_test_statistic,p_value=stats.ks_2samp(X[y==0,feature_col],X[y==1,feature_col])
			kolmogorov_smirnov[feature_col]=(ks_test_statistic,p_value)

	#debug
	for f in range(len(X[0])):
		print(kolmogorov_smirnov[f])

	return kolmogorov_smirnov 
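
A hypothetical call with a small synthetic feature matrix (data and shapes are made up; numpy as np and scipy.stats as stats are assumed to be imported as in the project):

import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 3))    # 100 samples, 3 features
y = rng.randint(0, 2, size=100)  # binary class labels
results = kolmogorov_smirnov_two_sample_test(X, y)  # one (statistic, p-value) per feature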
Example 11
Project: causallib   Author: IBM   File: stat_utils.py    Apache License 2.0
def calc_weighted_ks2samp(x, y, wx, wy):
    """
    Weighted Kolmogorov-Smirnov

    References:
        [1] https://stackoverflow.com/a/40059727
    """
    x_ix = np.argsort(x)
    y_ix = np.argsort(y)
    x, wx = x[x_ix], wx[x_ix]
    y, wy = y[y_ix], wy[y_ix]
    data = np.concatenate((x, y))
    wx_cum = np.hstack([0, wx.cumsum() / wx.sum()])
    wy_cum = np.hstack([0, wy.cumsum() / wy.sum()])
    # Align the "steps" between the two distribution so the differences will be well defined:
    x_align = wx_cum[[np.searchsorted(x, data, side="right")]]
    y_align = wy_cum[[np.searchsorted(y, data, side="right")]]
    stat = np.max(np.abs(x_align - y_align))
    # stat = ks_2samp(wx * x, wy * y)
    return stat 
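
A minimal usage sketch with synthetic samples (the arrays are illustrative); with uniform weights the statistic should agree closely with the unweighted statistic from scipy's ks_2samp:

import numpy as np

rng = np.random.RandomState(0)
x = rng.normal(size=200)
y = rng.normal(loc=0.3, size=250)
wx = np.ones_like(x)  # uniform weights reduce to the ordinary KS statistic
wy = np.ones_like(y)
stat = calc_weighted_ks2samp(x, y, wx, wy)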
Example 12
Project: GeoPy   Author: aerler   File: stats.py    GNU General Public License v3.0
def ks_2samp(sample1, sample2, lstatistic=False, ignoreNaN=True, **kwargs):
  ''' Apply the Kolmogorov-Smirnov test to check whether two samples are drawn from the same
      underlying (continuous) distribution; a high p-value means the two samples are likely
      drawn from the same distribution.
      The Kolmogorov-Smirnov test is a non-parametric test that works well for all types of
      distributions (normal and non-normal). '''
  if lstatistic: raise NotImplementedError("Return of test statistic is not yet implemented; only p-values are returned.")
  testfct = functools.partial(ks_2samp_wrapper, ignoreNaN=ignoreNaN)
  pvar = apply_stat_test_2samp(sample1, sample2, fct=testfct, laax=True, 
                               lpval=True, lrho=False, **kwargs)
  return pvar 
Example 13
Project: LaserTOF   Author: kyleuckert   File: test_stats.py    MIT License
def test_ks_2samp():
    # exact small sample solution
    data1 = np.array([1.0,2.0])
    data2 = np.array([1.0,2.0,3.0])
    assert_almost_equal(np.array(stats.ks_2samp(data1+0.01,data2)),
                np.array((0.33333333333333337, 0.99062316386915694)))
    assert_almost_equal(np.array(stats.ks_2samp(data1-0.01,data2)),
                np.array((0.66666666666666674, 0.42490954988801982)))
    # these can also be verified graphically
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2+0.1)),
        np.array((0.030000000000000027, 0.99999999996005062)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2-0.1)),
        np.array((0.020000000000000018, 0.99999999999999933)))
    # these are just regression tests
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20.1)),
        np.array((0.21090909090909091, 0.015880386730710221)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20-0.1)),
        np.array((0.20818181818181825, 0.017981441789762638)))

    # test for namedtuple attribute results
    attributes = ('statistic', 'pvalue')
    res = stats.ks_2samp(data1 - 0.01, data2)
    check_named_results(res, attributes) 
Example 14
Project: LaserTOF   Author: kyleuckert   File: test_multivariate.py    MIT License
def test_haar(self):
        # Test that the distribution is constant under rotation
        # Every column should have the same distribution
        # Additionally, the distribution should be invariant under another rotation

        # Generate samples
        dim = 5
        samples = 1000  # Not too many, or the test takes too long
        ks_prob = 0.39  # ...so don't expect much precision
        np.random.seed(514)
        xs = special_ortho_group.rvs(dim, size=samples)

        # Dot a few rows (0, 1, 2) with unit vectors (0, 2, 4, 3),
        #   effectively picking off entries in the matrices of xs.
        #   These projections should all have the same distribution,
        #     establishing rotational invariance. We use the two-sided
        #     KS test to confirm this.
        #   We could instead test that angles between random vectors
        #     are uniformly distributed, but the below is sufficient.
        #   It is not feasible to consider all pairs, so pick a few.
        els = ((0,0), (0,2), (1,4), (2,3))
        #proj = {(er, ec): [x[er][ec] for x in xs] for er, ec in els}
        proj = dict(((er, ec), sorted([x[er][ec] for x in xs])) for er, ec in els)
        pairs = [(e0, e1) for e0 in els for e1 in els if e0 > e1]
        ks_tests = [ks_2samp(proj[p0], proj[p1])[1] for (p0, p1) in pairs]
        assert_array_less([ks_prob]*len(pairs), ks_tests) 
Example 15
Project: LaserTOF   Author: kyleuckert   File: test_multivariate.py    MIT License
def test_haar(self):
        # Test that the distribution is constant under rotation
        # Every column should have the same distribution
        # Additionally, the distribution should be invariant under another rotation

        # Generate samples
        dim = 5
        samples = 1000  # Not too many, or the test takes too long
        ks_prob = 0.39  # ...so don't expect much precision
        np.random.seed(518)  # Note that the test is sensitive to seed too
        xs = ortho_group.rvs(dim, size=samples)

        # Dot a few rows (0, 1, 2) with unit vectors (0, 2, 4, 3),
        #   effectively picking off entries in the matrices of xs.
        #   These projections should all have the same distribution,
        #     establishing rotational invariance. We use the two-sided
        #     KS test to confirm this.
        #   We could instead test that angles between random vectors
        #     are uniformly distributed, but the below is sufficient.
        #   It is not feasible to consider all pairs, so pick a few.
        els = ((0,0), (0,2), (1,4), (2,3))
        #proj = {(er, ec): [x[er][ec] for x in xs] for er, ec in els}
        proj = dict(((er, ec), sorted([x[er][ec] for x in xs])) for er, ec in els)
        pairs = [(e0, e1) for e0 in els for e1 in els if e0 > e1]
        ks_tests = [ks_2samp(proj[p0], proj[p1])[1] for (p0, p1) in pairs]
        assert_array_less([ks_prob]*len(pairs), ks_tests) 
Example 16
Project: att   Author: Centre-Alt-Rendiment-Esportiu   File: test_stats.py    GNU General Public License v3.0
def test_ks_2samp():
    # exact small sample solution
    data1 = np.array([1.0,2.0])
    data2 = np.array([1.0,2.0,3.0])
    assert_almost_equal(np.array(stats.ks_2samp(data1+0.01,data2)),
                np.array((0.33333333333333337, 0.99062316386915694)))
    assert_almost_equal(np.array(stats.ks_2samp(data1-0.01,data2)),
                np.array((0.66666666666666674, 0.42490954988801982)))
    # these can also be verified graphically
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2+0.1)),
        np.array((0.030000000000000027, 0.99999999996005062)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2-0.1)),
        np.array((0.020000000000000018, 0.99999999999999933)))
    # these are just regression tests
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20.1)),
        np.array((0.21090909090909091, 0.015880386730710221)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20-0.1)),
        np.array((0.20818181818181825, 0.017981441789762638))) 
Example 17
Project: 3DCT   Author: coraxx   File: beadPos.py    GNU General Public License v3.0
def gaussfit(data,parent=None,hold=False):
	## Fitting gaussian to data
	data[1] = data[1]-data[1].min()
	p0 = [data[1].max(), data[1].argmax(), 1]
	popt, pcov = curve_fit(gauss, data[0], data[1], p0=p0)

	if parent is not None:
		## Draw graphs in GUI
		x = []
		y = []
		for i in np.arange(len(data[0])):
			x.append(i)
			y.append(gauss(i,*popt))
		if hold is False:
			parent.widget_matplotlib.setupScatterCanvas(width=4,height=4,dpi=52,toolbar=False)
		parent.widget_matplotlib.xyPlot(data[0], data[1], label='z data',clear=True)
		parent.widget_matplotlib.xyPlot(x, y, label='gaussian fit',clear=False)

	## DEBUG
	if clrmsg and debug is True:
		from scipy.stats import ks_2samp
		## Get std from the diagonal of the covariance matrix
		std_height, std_mean, std_sigma = np.sqrt(np.diag(pcov))
		print(clrmsg.DEBUG + '='*15, 'GAUSS FIT', '='*25)
		print(clrmsg.DEBUG + 'Amplitude		:', popt[0])
		print(clrmsg.DEBUG + 'Location		:', popt[1])
		## http://mathworld.wolfram.com/GaussianFunction.html -> sigma * 2 * sqrt(2 * ln(2))
		print(clrmsg.DEBUG + 'FWHM			:', popt[2] * 2 * math.sqrt(2 * math.log(2,math.e)))
		print(clrmsg.DEBUG + 'Std. Amplitude	:', std_height)
		print(clrmsg.DEBUG + 'Std. Location	:', std_mean)
		print(clrmsg.DEBUG + 'Std. FWHM		:', std_sigma * 2 * math.sqrt(2 * math.log(2,math.e)))
		print(clrmsg.DEBUG + 'Mean dy		:', np.absolute(y-data[1]).mean())
		print(clrmsg.DEBUG + str(ks_2samp(y, data[1])))
	return popt, pcov


## Gaussian 2D fit from http://scipy.github.io/old-wiki/pages/Cookbook/FittingData 
Example 18
Project: ble5-nrf52-mac   Author: tomasero   File: test_stats.py    MIT License
def test_ks_2samp():
    # exact small sample solution
    data1 = np.array([1.0,2.0])
    data2 = np.array([1.0,2.0,3.0])
    assert_almost_equal(np.array(stats.ks_2samp(data1+0.01,data2)),
                np.array((0.33333333333333337, 0.99062316386915694)))
    assert_almost_equal(np.array(stats.ks_2samp(data1-0.01,data2)),
                np.array((0.66666666666666674, 0.42490954988801982)))
    # these can also be verified graphically
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2+0.1)),
        np.array((0.030000000000000027, 0.99999999996005062)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2-0.1)),
        np.array((0.020000000000000018, 0.99999999999999933)))
    # these are just regression tests
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20.1)),
        np.array((0.21090909090909091, 0.015880386730710221)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20-0.1)),
        np.array((0.20818181818181825, 0.017981441789762638)))

    # test for namedtuple attribute results
    attributes = ('statistic', 'pvalue')
    res = stats.ks_2samp(data1 - 0.01, data2)
    check_named_results(res, attributes) 
Example 19
Project: ble5-nrf52-mac   Author: tomasero   File: test_multivariate.py    MIT License
def test_haar(self):
        # Test that the distribution is constant under rotation
        # Every column should have the same distribution
        # Additionally, the distribution should be invariant under another rotation

        # Generate samples
        dim = 5
        samples = 1000  # Not too many, or the test takes too long
        ks_prob = .05
        np.random.seed(514)
        xs = special_ortho_group.rvs(dim, size=samples)

        # Dot a few rows (0, 1, 2) with unit vectors (0, 2, 4, 3),
        #   effectively picking off entries in the matrices of xs.
        #   These projections should all have the same distribution,
        #     establishing rotational invariance. We use the two-sided
        #     KS test to confirm this.
        #   We could instead test that angles between random vectors
        #     are uniformly distributed, but the below is sufficient.
        #   It is not feasible to consider all pairs, so pick a few.
        els = ((0,0), (0,2), (1,4), (2,3))
        #proj = {(er, ec): [x[er][ec] for x in xs] for er, ec in els}
        proj = dict(((er, ec), sorted([x[er][ec] for x in xs])) for er, ec in els)
        pairs = [(e0, e1) for e0 in els for e1 in els if e0 > e1]
        ks_tests = [ks_2samp(proj[p0], proj[p1])[1] for (p0, p1) in pairs]
        assert_array_less([ks_prob]*len(pairs), ks_tests) 
Example 20
Project: ble5-nrf52-mac   Author: tomasero   File: test_multivariate.py    MIT License
def test_haar(self):
        # Test that the distribution is constant under rotation
        # Every column should have the same distribution
        # Additionally, the distribution should be invariant under another rotation

        # Generate samples
        dim = 5
        samples = 1000  # Not too many, or the test takes too long
        ks_prob = .05
        np.random.seed(518)  # Note that the test is sensitive to seed too
        xs = ortho_group.rvs(dim, size=samples)

        # Dot a few rows (0, 1, 2) with unit vectors (0, 2, 4, 3),
        #   effectively picking off entries in the matrices of xs.
        #   These projections should all have the same distribution,
        #     establishing rotational invariance. We use the two-sided
        #     KS test to confirm this.
        #   We could instead test that angles between random vectors
        #     are uniformly distributed, but the below is sufficient.
        #   It is not feasible to consider all pairs, so pick a few.
        els = ((0,0), (0,2), (1,4), (2,3))
        #proj = {(er, ec): [x[er][ec] for x in xs] for er, ec in els}
        proj = dict(((er, ec), sorted([x[er][ec] for x in xs])) for er, ec in els)
        pairs = [(e0, e1) for e0 in els for e1 in els if e0 > e1]
        ks_tests = [ks_2samp(proj[p0], proj[p1])[1] for (p0, p1) in pairs]
        assert_array_less([ks_prob]*len(pairs), ks_tests) 
Example 21
Project: Computable   Author: ktraunmueller   File: test_stats.py    MIT License
def test_ks_2samp():
    # exact small sample solution
    data1 = np.array([1.0,2.0])
    data2 = np.array([1.0,2.0,3.0])
    assert_almost_equal(np.array(stats.ks_2samp(data1+0.01,data2)),
                np.array((0.33333333333333337, 0.99062316386915694)))
    assert_almost_equal(np.array(stats.ks_2samp(data1-0.01,data2)),
                np.array((0.66666666666666674, 0.42490954988801982)))
    # these can also be verified graphically
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2+0.1)),
        np.array((0.030000000000000027, 0.99999999996005062)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2-0.1)),
        np.array((0.020000000000000018, 0.99999999999999933)))
    # these are just regression tests
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20.1)),
        np.array((0.21090909090909091, 0.015880386730710221)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20-0.1)),
        np.array((0.20818181818181825, 0.017981441789762638))) 
Example 22
Project: poker   Author: surgebiswas   File: test_stats.py    MIT License
def test_ks_2samp():
    # exact small sample solution
    data1 = np.array([1.0,2.0])
    data2 = np.array([1.0,2.0,3.0])
    assert_almost_equal(np.array(stats.ks_2samp(data1+0.01,data2)),
                np.array((0.33333333333333337, 0.99062316386915694)))
    assert_almost_equal(np.array(stats.ks_2samp(data1-0.01,data2)),
                np.array((0.66666666666666674, 0.42490954988801982)))
    # these can also be verified graphically
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2+0.1)),
        np.array((0.030000000000000027, 0.99999999996005062)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2-0.1)),
        np.array((0.020000000000000018, 0.99999999999999933)))
    # these are just regression tests
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20.1)),
        np.array((0.21090909090909091, 0.015880386730710221)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20-0.1)),
        np.array((0.20818181818181825, 0.017981441789762638)))

    # test for namedtuple attribute results
    attributes = ('statistic', 'pvalue')
    res = stats.ks_2samp(data1 - 0.01, data2)
    check_named_results(res, attributes) 
Example 23
Project: poker   Author: surgebiswas   File: test_multivariate.py    MIT License
def test_haar(self):
        # Test that the distribution is constant under rotation
        # Every column should have the same distribution
        # Additionally, the distribution should be invariant under another rotation

        # Generate samples
        dim = 5
        samples = 1000  # Not too many, or the test takes too long
        ks_prob = 0.39  # ...so don't expect much precision
        np.random.seed(514)
        xs = special_ortho_group.rvs(dim, size=samples)

        # Dot a few rows (0, 1, 2) with unit vectors (0, 2, 4, 3),
        #   effectively picking off entries in the matrices of xs.
        #   These projections should all have the same distribution,
        #     establishing rotational invariance. We use the two-sided
        #     KS test to confirm this.
        #   We could instead test that angles between random vectors
        #     are uniformly distributed, but the below is sufficient.
        #   It is not feasible to consider all pairs, so pick a few.
        els = ((0,0), (0,2), (1,4), (2,3))
        #proj = {(er, ec): [x[er][ec] for x in xs] for er, ec in els}
        proj = dict(((er, ec), sorted([x[er][ec] for x in xs])) for er, ec in els)
        pairs = [(e0, e1) for e0 in els for e1 in els if e0 > e1]
        ks_tests = [ks_2samp(proj[p0], proj[p1])[1] for (p0, p1) in pairs]
        assert_array_less([ks_prob]*len(pairs), ks_tests) 
Example 24
Project: poker   Author: surgebiswas   File: test_multivariate.py    MIT License
def test_haar(self):
        # Test that the distribution is constant under rotation
        # Every column should have the same distribution
        # Additionally, the distribution should be invariant under another rotation

        # Generate samples
        dim = 5
        samples = 1000  # Not too many, or the test takes too long
        ks_prob = 0.39  # ...so don't expect much precision
        np.random.seed(518)  # Note that the test is sensitive to seed too
        xs = ortho_group.rvs(dim, size=samples)

        # Dot a few rows (0, 1, 2) with unit vectors (0, 2, 4, 3),
        #   effectively picking off entries in the matrices of xs.
        #   These projections should all have the same distribution,
        #     establishing rotational invariance. We use the two-sided
        #     KS test to confirm this.
        #   We could instead test that angles between random vectors
        #     are uniformly distributed, but the below is sufficient.
        #   It is not feasible to consider all pairs, so pick a few.
        els = ((0,0), (0,2), (1,4), (2,3))
        #proj = {(er, ec): [x[er][ec] for x in xs] for er, ec in els}
        proj = dict(((er, ec), sorted([x[er][ec] for x in xs])) for er, ec in els)
        pairs = [(e0, e1) for e0 in els for e1 in els if e0 > e1]
        ks_tests = [ks_2samp(proj[p0], proj[p1])[1] for (p0, p1) in pairs]
        assert_array_less([ks_prob]*len(pairs), ks_tests) 
Example 25
Project: vm-reliability-tester   Author: icclab   File: model_validator.py    MIT License
def fit_model(data, model_name, ratings):
    n = len(data)
    if model_name == 'expon':
        params = convert_string_to_tuple(ratings['expon'][0][0])
        print(params)
        std_func = stats.expon.rvs(size=n,
                                   loc=params[0],
                                   scale=params[1])
    elif model_name == 'dweibull':
        params = convert_string_to_tuple(ratings['dweibull'][0][0])
        print(params)
        std_func = stats.dweibull.rvs(params[0],
                                      loc=params[1],
                                      scale=params[2],
                                      size=n)

    elif model_name == 'norm':
        params = convert_string_to_tuple(ratings['norm'][0][0])
        print(params)
        std_func = stats.norm.rvs(loc=params[0],
                                  scale=params[1], 
                                  size=n)
    elif model_name == 'lognorm':
        params = convert_string_to_tuple(ratings['lognorm'][0][0])
        print(params)
        std_func = stats.lognorm.rvs(params[0],
                                     loc=params[1],
                                     scale=params[2],
                                     size=n)
    std_func.sort()
    test_vals = stats.ks_2samp(data, std_func)
    result = [test for test in test_vals]
    result.append(params)
    result.append(std_func)
    return result 
Example 26
Project: vm-reliability-tester   Author: icclab   File: model_fitter.py    MIT License
def fit_model(data, model_name):
    n = len(data)
    if(model_name == 'expon'):
        params = stats.expon.fit(data)
        std_model = stats.expon.rvs(size=n,
                                    loc=params[0],
                                    scale=params[1])
    elif(model_name == 'dweibull'):
        params = stats.dweibull.fit(data)
        std_model = stats.dweibull.rvs(params[0],
                                       loc=params[1],
                                       scale=params[2],
                                       size=n)
    elif(model_name == 'norm'):
        params = stats.norm.fit(data)
        std_model = stats.norm.rvs(loc=params[0],
                                   scale=params[1], 
                                   size=n)
    elif(model_name == 'lognorm'):
        params = stats.lognorm.fit(data)
        std_model = stats.lognorm.rvs(params[0],
                                      loc=params[1],
                                      scale=params[2],
                                      size=n)
    std_model.sort()
    test_vals = stats.ks_2samp(data, std_model)
    result = [test for test in test_vals]
    result.append(params)
    result.append(std_model)
    return result 
Example 27
Project: P3_image_processing   Author: latedude2   File: test_stats.py    MIT License
def _testOne(self, x1, x2, alternative, expected_statistic, expected_prob, mode='auto'):
        result = stats.ks_2samp(x1, x2, alternative, mode=mode)
        expected = np.array([expected_statistic, expected_prob])
        assert_array_almost_equal(np.array(result), expected) 
Example 28
Project: P3_image_processing   Author: latedude2   File: test_stats.py    MIT License
def testNamedAttributes(self):
        # test for namedtuple attribute results
        attributes = ('statistic', 'pvalue')
        res = stats.ks_2samp([1, 2], [3])
        check_named_results(res, attributes) 
Example 29
Project: P3_image_processing   Author: latedude2   File: test_multivariate.py    MIT License
def test_haar(self):
        # Test that the distribution is constant under rotation
        # Every column should have the same distribution
        # Additionally, the distribution should be invariant under another rotation

        # Generate samples
        dim = 5
        samples = 1000  # Not too many, or the test takes too long
        ks_prob = .05
        np.random.seed(514)
        xs = special_ortho_group.rvs(dim, size=samples)

        # Dot a few rows (0, 1, 2) with unit vectors (0, 2, 4, 3),
        #   effectively picking off entries in the matrices of xs.
        #   These projections should all have the same distribution,
        #     establishing rotational invariance. We use the two-sided
        #     KS test to confirm this.
        #   We could instead test that angles between random vectors
        #     are uniformly distributed, but the below is sufficient.
        #   It is not feasible to consider all pairs, so pick a few.
        els = ((0,0), (0,2), (1,4), (2,3))
        #proj = {(er, ec): [x[er][ec] for x in xs] for er, ec in els}
        proj = dict(((er, ec), sorted([x[er][ec] for x in xs])) for er, ec in els)
        pairs = [(e0, e1) for e0 in els for e1 in els if e0 > e1]
        ks_tests = [ks_2samp(proj[p0], proj[p1])[1] for (p0, p1) in pairs]
        assert_array_less([ks_prob]*len(pairs), ks_tests) 
Example 30
Project: P3_image_processing   Author: latedude2   File: test_multivariate.py    MIT License
def test_haar(self):
        # Test that the distribution is constant under rotation
        # Every column should have the same distribution
        # Additionally, the distribution should be invariant under another rotation

        # Generate samples
        dim = 5
        samples = 1000  # Not too many, or the test takes too long
        ks_prob = .05
        np.random.seed(518)  # Note that the test is sensitive to seed too
        xs = ortho_group.rvs(dim, size=samples)

        # Dot a few rows (0, 1, 2) with unit vectors (0, 2, 4, 3),
        #   effectively picking off entries in the matrices of xs.
        #   These projections should all have the same distribution,
        #     establishing rotational invariance. We use the two-sided
        #     KS test to confirm this.
        #   We could instead test that angles between random vectors
        #     are uniformly distributed, but the below is sufficient.
        #   It is not feasible to consider all pairs, so pick a few.
        els = ((0,0), (0,2), (1,4), (2,3))
        #proj = {(er, ec): [x[er][ec] for x in xs] for er, ec in els}
        proj = dict(((er, ec), sorted([x[er][ec] for x in xs])) for er, ec in els)
        pairs = [(e0, e1) for e0 in els for e1 in els if e0 > e1]
        ks_tests = [ks_2samp(proj[p0], proj[p1])[1] for (p0, p1) in pairs]
        assert_array_less([ks_prob]*len(pairs), ks_tests) 
Example 31
Project: GraphicDesignPatternByPython   Author: Relph1119   File: test_stats.py    MIT License
def test_ks_2samp():
    # exact small sample solution
    data1 = np.array([1.0,2.0])
    data2 = np.array([1.0,2.0,3.0])
    assert_almost_equal(np.array(stats.ks_2samp(data1+0.01,data2)),
                np.array((0.33333333333333337, 0.99062316386915694)))
    assert_almost_equal(np.array(stats.ks_2samp(data1-0.01,data2)),
                np.array((0.66666666666666674, 0.42490954988801982)))
    # these can also be verified graphically
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2+0.1)),
        np.array((0.030000000000000027, 0.99999999996005062)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2-0.1)),
        np.array((0.020000000000000018, 0.99999999999999933)))
    # these are just regression tests
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20.1)),
        np.array((0.21090909090909091, 0.015880386730710221)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20-0.1)),
        np.array((0.20818181818181825, 0.017981441789762638)))

    # test for namedtuple attribute results
    attributes = ('statistic', 'pvalue')
    res = stats.ks_2samp(data1 - 0.01, data2)
    check_named_results(res, attributes) 
Example 32
Project: GraphicDesignPatternByPython   Author: Relph1119   File: test_multivariate.py    MIT License
def test_haar(self):
        # Test that the distribution is constant under rotation
        # Every column should have the same distribution
        # Additionally, the distribution should be invariant under another rotation

        # Generate samples
        dim = 5
        samples = 1000  # Not too many, or the test takes too long
        ks_prob = .05
        np.random.seed(514)
        xs = special_ortho_group.rvs(dim, size=samples)

        # Dot a few rows (0, 1, 2) with unit vectors (0, 2, 4, 3),
        #   effectively picking off entries in the matrices of xs.
        #   These projections should all have the same distribution,
        #     establishing rotational invariance. We use the two-sided
        #     KS test to confirm this.
        #   We could instead test that angles between random vectors
        #     are uniformly distributed, but the below is sufficient.
        #   It is not feasible to consider all pairs, so pick a few.
        els = ((0,0), (0,2), (1,4), (2,3))
        #proj = {(er, ec): [x[er][ec] for x in xs] for er, ec in els}
        proj = dict(((er, ec), sorted([x[er][ec] for x in xs])) for er, ec in els)
        pairs = [(e0, e1) for e0 in els for e1 in els if e0 > e1]
        ks_tests = [ks_2samp(proj[p0], proj[p1])[1] for (p0, p1) in pairs]
        assert_array_less([ks_prob]*len(pairs), ks_tests) 
Example 33
Project: GraphicDesignPatternByPython   Author: Relph1119   File: test_multivariate.py    MIT License
def test_haar(self):
        # Test that the distribution is constant under rotation
        # Every column should have the same distribution
        # Additionally, the distribution should be invariant under another rotation

        # Generate samples
        dim = 5
        samples = 1000  # Not too many, or the test takes too long
        ks_prob = .05
        np.random.seed(518)  # Note that the test is sensitive to seed too
        xs = ortho_group.rvs(dim, size=samples)

        # Dot a few rows (0, 1, 2) with unit vectors (0, 2, 4, 3),
        #   effectively picking off entries in the matrices of xs.
        #   These projections should all have the same distribution,
        #     establishing rotational invariance. We use the two-sided
        #     KS test to confirm this.
        #   We could instead test that angles between random vectors
        #     are uniformly distributed, but the below is sufficient.
        #   It is not feasible to consider all pairs, so pick a few.
        els = ((0,0), (0,2), (1,4), (2,3))
        #proj = {(er, ec): [x[er][ec] for x in xs] for er, ec in els}
        proj = dict(((er, ec), sorted([x[er][ec] for x in xs])) for er, ec in els)
        pairs = [(e0, e1) for e0 in els for e1 in els if e0 > e1]
        ks_tests = [ks_2samp(proj[p0], proj[p1])[1] for (p0, p1) in pairs]
        assert_array_less([ks_prob]*len(pairs), ks_tests) 
Example 34
Project: drifter_ml   Author: EricSchles   File: columnar_tests.py    MIT License
def ks_2samp_similar_distribution(self, column,
                                       pvalue_threshold=0.05,
                                       num_rounds=3):
        p_value = permutation_test(
            self.new_data[column],
            self.historical_data[column],
            method="approximate",
            num_rounds=num_rounds,
            func=lambda x, y: stats.ks_2samp(x, y).statistic,
            seed=0)
        if p_value < pvalue_threshold:
            return False
        return True 
Example 35
Project: drifter_ml   Author: EricSchles   File: prototype_test_framework.py    MIT License
def similiar_distribution(new_data, historical_data, column_names, pvalue_threshold=0.05):
        for column_name in column_names:
            distribution_info = stats.ks_2samp(new_data[column_name], historical_data[column_name])
            if distribution_info.pvalue < pvalue_threshold:
                return False
        return True

# does the preprocessing break?
# does the model build?
# does the model meet some threshold?
# add memoryful tests for measures over time (like over several days) 
Example 36
Project: aurum-datadiscovery   Author: mitdbg   File: dataanalysis.py    MIT License
def compare_num_columns_dist_ks(columnA, columnB):
    ''' 
        Kolmogorov-Smirnov test
    '''
    return ks_2samp(columnA, columnB) 
Example 37
Project: anesthetic   Author: williamjameshandley   File: test_samples.py    MIT License
def test_ns_output():
    numpy.random.seed(3)
    pc = NestedSamples(root='./tests/example_data/pc')
    PC = pc.ns_output(1000)
    assert abs(pc.logZ() - PC['logZ'].mean()) < PC['logZ'].std()
    assert PC['d'].mean() < 5
    assert PC.cov()['D']['logZ'] < 0
    assert(abs(PC.logZ.mean() - pc.logZ()) < PC.logZ.std())
    assert(abs(PC.D.mean() - pc.D()) < PC.D.std())
    assert(abs(PC.d.mean() - pc.d()) < PC.d.std())
    assert(ks_2samp(pc.logZ(100), PC.logZ).pvalue > 0.05)
    assert(ks_2samp(pc.D(100), PC.D).pvalue > 0.05)
    assert(ks_2samp(pc.d(100), PC.d).pvalue > 0.05) 
Example 38
Project: cINNamon   Author: hagabbar   File: Radynversion_copy.py    GNU General Public License v3.0
def result_stat_tests(inn_samps, mcmc_samps, cnt, parnames):
    """
    Record and print ks and AD test statistics
    """
    
    ks_mcmc_arr = []
    ks_inn_arr = []
    ad_mcmc_arr = []
    ad_inn_arr = []

    # iterate through each parameter
    for i in range(inn_samps.shape[1]):
        # get the ideal Bayesian number; we want the two-tailed p-value from the KS test
        ks_mcmc_result = ks_2samp(mcmc_samps[:int(mcmc_samps.shape[0]/2.0), i], mcmc_samps[int(mcmc_samps.shape[0]/2.0):, i])
        ad_mcmc_result = anderson_ksamp([mcmc_samps[:int(mcmc_samps.shape[0]/2.0), i], mcmc_samps[int(mcmc_samps.shape[0]/2.0):, i]])

        # get predicted vs. true number
        ks_inn_result = ks_2samp(inn_samps[:,i],mcmc_samps[:,i])
        ad_inn_result = anderson_ksamp([inn_samps[:,i],mcmc_samps[:,i]])
        #print('Test Case %d, Parameter(%s) k-s result: [Ideal(%.6f), Predicted(%.6f)]' % (int(cnt),parnames[i],np.array(ks_mcmc_result[1]),np.array(ks_inn_result[1])))
        #print('Test Case %d, Parameter(%s) A-D result: [Ideal(%.6f), Predicted(%.6f)]' % (int(cnt),parnames[i],np.array(ad_mcmc_result[0]),np.array(ad_inn_result[0])))

        # store result stats
        ks_mcmc_arr.append(ks_mcmc_result[1])
        ks_inn_arr.append(ks_inn_result[1])
        ad_mcmc_arr.append(ad_mcmc_result[0])
        ad_inn_arr.append(ad_inn_result[0])

    return ks_mcmc_arr, ks_inn_arr, ad_mcmc_arr, ad_inn_arr 
Example 39
Project: kepler   Author: jaidevd   File: utils.py    MIT License
def is_uniform(x):
    marker = np.random.uniform(x.min(), x.max(), size=(100,))
    _, pval = ks_2samp(x.ravel(), marker)
    return pval > 1e-3 
Example 40
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_stats.py    Apache License 2.0
def test_ks_2samp():
    # exact small sample solution
    data1 = np.array([1.0,2.0])
    data2 = np.array([1.0,2.0,3.0])
    assert_almost_equal(np.array(stats.ks_2samp(data1+0.01,data2)),
                np.array((0.33333333333333337, 0.99062316386915694)))
    assert_almost_equal(np.array(stats.ks_2samp(data1-0.01,data2)),
                np.array((0.66666666666666674, 0.42490954988801982)))
    # these can also be verified graphically
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2+0.1)),
        np.array((0.030000000000000027, 0.99999999996005062)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2-0.1)),
        np.array((0.020000000000000018, 0.99999999999999933)))
    # these are just regression tests
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20.1)),
        np.array((0.21090909090909091, 0.015880386730710221)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20-0.1)),
        np.array((0.20818181818181825, 0.017981441789762638)))

    # test for namedtuple attribute results
    attributes = ('statistic', 'pvalue')
    res = stats.ks_2samp(data1 - 0.01, data2)
    check_named_results(res, attributes) 
Example 41
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_multivariate.py    Apache License 2.0
def test_haar(self):
        # Test that the distribution is constant under rotation
        # Every column should have the same distribution
        # Additionally, the distribution should be invariant under another rotation

        # Generate samples
        dim = 5
        samples = 1000  # Not too many, or the test takes too long
        ks_prob = 0.39  # ...so don't expect much precision
        np.random.seed(514)
        xs = special_ortho_group.rvs(dim, size=samples)

        # Dot a few rows (0, 1, 2) with unit vectors (0, 2, 4, 3),
        #   effectively picking off entries in the matrices of xs.
        #   These projections should all have the same distribution,
        #     establishing rotational invariance. We use the two-sided
        #     KS test to confirm this.
        #   We could instead test that angles between random vectors
        #     are uniformly distributed, but the below is sufficient.
        #   It is not feasible to consider all pairs, so pick a few.
        els = ((0,0), (0,2), (1,4), (2,3))
        #proj = {(er, ec): [x[er][ec] for x in xs] for er, ec in els}
        proj = dict(((er, ec), sorted([x[er][ec] for x in xs])) for er, ec in els)
        pairs = [(e0, e1) for e0 in els for e1 in els if e0 > e1]
        ks_tests = [ks_2samp(proj[p0], proj[p1])[1] for (p0, p1) in pairs]
        assert_array_less([ks_prob]*len(pairs), ks_tests) 
Example 42
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_multivariate.py    Apache License 2.0
def test_haar(self):
        # Test that the distribution is constant under rotation
        # Every column should have the same distribution
        # Additionally, the distribution should be invariant under another rotation

        # Generate samples
        dim = 5
        samples = 1000  # Not too many, or the test takes too long
        ks_prob = 0.39  # ...so don't expect much precision
        np.random.seed(518)  # Note that the test is sensitive to seed too
        xs = ortho_group.rvs(dim, size=samples)

        # Dot a few rows (0, 1, 2) with unit vectors (0, 2, 4, 3),
        #   effectively picking off entries in the matrices of xs.
        #   These projections should all have the same distribution,
        #     establishing rotational invariance. We use the two-sided
        #     KS test to confirm this.
        #   We could instead test that angles between random vectors
        #     are uniformly distributed, but the below is sufficient.
        #   It is not feasible to consider all pairs, so pick a few.
        els = ((0,0), (0,2), (1,4), (2,3))
        #proj = {(er, ec): [x[er][ec] for x in xs] for er, ec in els}
        proj = dict(((er, ec), sorted([x[er][ec] for x in xs])) for er, ec in els)
        pairs = [(e0, e1) for e0 in els for e1 in els if e0 > e1]
        ks_tests = [ks_2samp(proj[p0], proj[p1])[1] for (p0, p1) in pairs]
        assert_array_less([ks_prob]*len(pairs), ks_tests) 
Example 43
Project: phenotypeXpression   Author: NCBI-Hackathons   File: batcheffect.py    MIT License
def _generate_ks_test(self, meta_value: int, total_dist: List, clust_stats=None) -> Dict:
        """
        Kolmogorov-Smirnov test for cluster vals and total vals from same dist
        :param meta_value: integer index for gds metric
        :param total_dist: list distribution of gds metric for ONE gds set
        :param clust_stats: pass mutable dict for recursive additions
            key=cluster_ids, value=ordered list of stats for all gds metrics
            (KS/chi-stat, p-value, mean/mode, s.d./category_n)
        :return:
        """
        # Create if empty
        if clust_stats is None:
            clust_stats = defaultdict(list)
        
        #total output values
        total_array = np.array(total_dist)
        clust_stats['Overall'].extend([None, None, np.mean(total_array), np.std(total_array, ddof=1)])    
        # print(total_dist)
        
        #per cluster outputs
        for cluster_id, cluster_list in self.clusters.items():
            cluster_dist = [self.meta_dict[gds][meta_value] for gds in cluster_list]
            # print(cluster_list)
            clust_array = np.array(cluster_dist)
            KS_stat, p_val = stats.ks_2samp(clust_array, total_array)
            clust_stats[cluster_id].extend([KS_stat, p_val, np.mean(clust_array), np.std(clust_array, ddof=1)])
            if p_val <= 0.01:
                print('{}\'s {} sample is drawn from a significantly different distribution than the overall sample'.format(cluster_id, self.meta_list[meta_value]))
        return clust_stats
    
    # For chi-squared test of platform types different from overall 
Example 44
Project: FAVITES   Author: niemasd   File: distribution_distance.py    GNU General Public License v3.0
def d_ks(p_vec, q_vec):
    return tuple(ks_2samp(p_vec,q_vec))

# Jensen-Shannon Divergence 
Example 45
Project: iswc-2016-semantic-labeling   Author: minhptx   File: numeric.py    Apache License 2.0
def kolmogorov_smirnov_test(train_examples, test_examples, num1, num2):
    if len(train_examples) > 1 and len(test_examples) > 1:
        result = ks_2samp(train_examples, test_examples)[1]
        return balance_result(num1, num2, True, result)
    return 0 
Example 46
Project: edgePy   Author: r-bioinformatics   File: edgepy.py    MIT License
def ks_2_samples(self):
        """Run a 2-tailed Kolmogorov-Smirnov test on the DGEList object.

        Args:
            None.

        Returns:
            gene_details: a dictionary of dictionary (key, gene), holding mean1 and mean2 for the two groups
            gene_likelihood: a dictionary (key, gene), holding the p-value of the separation of the two groups
            group_types: list of the groups in order.

        """
        gene_likelihood1: Dict[Hashable, float] = {}
        group_types = set(self.dge_list.groups_list)
        group_types = list(group_types)
        group_filters: Dict[Hashable, Any] = {}
        gene_details: Dict[Hashable, Dict[Hashable, Any]] = {}
        for group in group_types:
            group_filters[group] = [g == group for g in self.dge_list.groups_list]
        for gene_idx, gene in enumerate(self.dge_list.genes):
            gene_row = self.dge_list.counts[gene_idx]
            if len(group_types) == 2:
                group_data1 = gene_row.compress(group_filters[group_types[0]])
                mean1 = np.mean(group_data1)

                group_data2 = gene_row.compress(group_filters[group_types[1]])
                mean2 = np.mean(group_data2)

                gene_likelihood1[gene] = ks_2samp(group_data1, group_data2)[1]

                gene_details[gene] = {'mean1': mean1, 'mean2': mean2}
        return gene_details, gene_likelihood1, group_types 
Example 47
Project: numerai   Author: gansanay   File: univariateselection.py    MIT License
def kolmogorov_smirnov(x_train, x_test):
    r = []
    p = []
    for c in x_train.columns:
        r_, p_ = ks_2samp(x_train[c], x_test[c])
        r.append(r_)
        p.append(p_)
    dfks = pd.DataFrame(index=range(1, 1 + len(x_train.columns)))
    dfks['KS'] = r
    dfks['KS_p'] = p
    return dfks 
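
A hedged usage sketch, assuming two DataFrames with matching columns (names and data are illustrative; pandas as pd and ks_2samp are imported as in the project):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
x_train = pd.DataFrame({'f1': rng.normal(size=100), 'f2': rng.uniform(size=100)})
x_test = pd.DataFrame({'f1': rng.normal(size=80), 'f2': rng.uniform(size=80)})
dfks = kolmogorov_smirnov(x_train, x_test)  # one KS statistic and p-value per column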
Example 48
Project: bnn-analysis   Author: myshkov   File: metrics.py    MIT License
def ks_distance(p_samples, q_samples):
    if isinstance(p_samples, tuple):
        idx, p_samples = p_samples

    return sc.ks_2samp(p_samples, q_samples)[0] 
Example 49
Project: pbase   Author: Impavidity   File: visualizer.py    MIT License
def ks_significance(self):
        dis1 = np.array(self.dis1)
        dis2 = np.array(self.dis2)
        return stats.ks_2samp(dis1, dis2) 
Example 50
Project: GCIT   Author: alexisbellot   File: utils.py    MIT License
def kolmogorov(X,Y):

    X = X.reshape((len(X)))
    Y = Y.reshape((len(Y)))
    return ks_2samp(X, Y)[0] 
Example 51
Project: TeaML   Author: TianjinMouth   File: tea_utils.py    Apache License 2.0
def compute_ks(prob, target):
    """
    target: numpy array of shape (1,)
    prob: numpy array of shape (1,), predicted probability of the sample being positive
    returns:
    ks: float, ks score estimation
    """
    get_ks = lambda prob, target: ks_2samp(prob[target == 1], prob[target != 1]).statistic

    return get_ks(prob, target) 
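
A hypothetical call with a handful of predicted probabilities and binary labels (the values are made up; ks_2samp is assumed to be imported as in the snippet):

import numpy as np

prob = np.array([0.9, 0.8, 0.7, 0.4, 0.3, 0.2])
target = np.array([1, 1, 0, 1, 0, 0])
# The KS statistic measures the separation between the score distributions
# of positives and negatives; a higher value means better separation.
print(compute_ks(prob, target))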
Example 52
Project: senior-design   Author: james-tate   File: test_stats.py    GNU General Public License v2.0
def test_ks_2samp():
    #exact small sample solution
    data1 = np.array([1.0,2.0])
    data2 = np.array([1.0,2.0,3.0])
    assert_almost_equal(np.array(stats.ks_2samp(data1+0.01,data2)),
                np.array((0.33333333333333337, 0.99062316386915694)))
    assert_almost_equal(np.array(stats.ks_2samp(data1-0.01,data2)),
                np.array((0.66666666666666674, 0.42490954988801982)))
    #these can also be verified graphically
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2+0.1)),
        np.array((0.030000000000000027, 0.99999999996005062)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,100)+2-0.1)),
        np.array((0.020000000000000018, 0.99999999999999933)))
    #these are just regression tests
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20.1)),
        np.array((0.21090909090909091, 0.015880386730710221)))
    assert_almost_equal(
        np.array(stats.ks_2samp(np.linspace(1,100,100),
                              np.linspace(1,100,110)+20-0.1)),
        np.array((0.20818181818181825, 0.017981441789762638))) 
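The first "exact small sample" assertion can be checked by hand: with data1 + 0.01 = [1.01, 2.01] and data2 = [1, 2, 3], the largest gap between the two empirical CDFs is 1/3 (for example just after x = 1, where F1 = 0 and F2 = 1/3). A short sketch confirming this numerically:

import numpy as np

def ecdf(sample, x):
    # Empirical CDF of `sample`, evaluated at the points `x`.
    return np.searchsorted(np.sort(sample), x, side='right') / len(sample)

data1 = np.array([1.0, 2.0]) + 0.01
data2 = np.array([1.0, 2.0, 3.0])

# D is the largest |F1 - F2|, attained at one of the observed points.
pts = np.concatenate([data1, data2])
D = np.max(np.abs(ecdf(data1, pts) - ecdf(data2, pts)))
print(D)  # 0.3333..., matching the asserted statistic above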
Example 53
Project: default-credit-card-prediction   Author: alexpnt   File: feature_selection.py    MIT License 5 votes vote down vote up
def kolmogorov_smirnov_two_sample_test(sample_a,sample_b):
	"""
	Performs the two-sample Kolmogorov-Smirnov test, testing whether two samples are drawn from identical distributions

	Keyword arguments:
	sample_a -- The first sample
	sample_b -- The second sample
	"""

	return stats.ks_2samp(sample_a,sample_b) 
Example 54
Project: causallib   Author: IBM   File: filters.py    Apache License 2.0 5 votes vote down vote up
def compute_pvals(self, X, y):
        # TODO: export to stats_utils?
        is_y_binary = (len(np.unique(y)) == 2)
        # is_binary_feature = np.sum(((X != np.nanmin(X, axis=0)[np.newaxis, :]) &
        #                             (X != np.nanmax(X, axis=0)[np.newaxis, :])), axis=0) == 0
        is_binary_feature = areColumnsBinary(X)
        p_vals = np.zeros(X.shape[1])
        if is_y_binary:
            # Process non-binary columns:
            for i in np.where(~is_binary_feature)[0]:
                x0 = X.loc[y == 0, i]
                x1 = X.loc[y == 1, i]
                if self.is_linear:
                    _, p_vals[i] = stats.ttest_ind(x0, x1)
                else:
                    _, p_vals[i] = stats.ks_2samp(x0, x1)

            # Process binary features:
            _, p_vals[is_binary_feature] = feature_selection.chi2(X.loc[:, is_binary_feature], y)

        else:
            # Process non-binary features:
            _, p_vals[~is_binary_feature] = feature_selection.f_regression(X.loc[:, ~is_binary_feature], y)

            # Process binary features:
            y_mat = np.row_stack(y)
            for i in np.where(is_binary_feature)[0]:
                _, p_vals[i] = feature_selection.f_regression(y_mat, X.loc[:, i])
        return p_vals 
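The helper areColumnsBinary is not shown in this snippet; the commented-out expression in the body suggests it marks columns that take no value strictly between their column-wise min and max. A plausible standalone sketch along those lines (the name and exact semantics are assumptions inferred from that comment):

import numpy as np

def areColumnsBinary(X):
    # Boolean mask: True for columns with at most two distinct non-NaN
    # values, i.e. nothing strictly between the column min and max.
    X = np.asarray(X, dtype=float)
    return np.array([len(np.unique(col[~np.isnan(col)])) <= 2
                     for col in X.T])

print(areColumnsBinary(np.array([[0., 1.5, 1.],
                                 [1., 2.5, 1.],
                                 [0., 3.5, 1.]])))  # [ True False  True]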
Example 55
Project: numerox   Author: numerai   File: metrics.py    GNU General Public License v3.0 5 votes vote down vote up
def concordance(data, prediction, split_pairs=True):
    "Concordance; less than 0.12 is passing; data should be the full dataset."

    pairs = prediction.pairs(as_str=False)
    concords = pd.DataFrame(columns=['concord'], index=pairs)

    # fit clusters
    kmeans = MiniBatchKMeans(n_clusters=5, random_state=1337)
    kmeans.fit(data.x)

    # yhats and clusters for each region
    yhats = []
    clusters = []
    for region in ['validation', 'test', 'live']:
        d = data[region]
        cluster = kmeans.predict(d.x)
        clusters.append(cluster)
        yh = prediction.df.loc[d.df.index].values  # align
        yhats.append(yh)

    # cross cluster distance (KS distance)
    for i in range(len(pairs)):
        ks = []
        for j in set(clusters[0]):
            yhat0 = yhats[0][:, i][clusters[0] == j]
            yhat1 = yhats[1][:, i][clusters[1] == j]
            yhat2 = yhats[2][:, i][clusters[2] == j]
            d = [ks_2samp(yhat0, yhat1)[0],
                 ks_2samp(yhat0, yhat2)[0],
                 ks_2samp(yhat2, yhat1)[0]]
            ks.append(max(d))
        concord = np.mean(ks)
        concords.iloc[i] = concord

    concords = concords.sort_values('concord')

    if split_pairs:
        concords = add_split_pairs(concords)

    return concords 
Example 56
Project: dc_stat_think   Author: justinbois   File: test_dc_stat_think.py    MIT License 5 votes vote down vote up
def test_ks_stat(x):
    theor_data = np.random.normal(0, 1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct)

    theor_data = np.random.exponential(1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct)

    theor_data = np.random.logistic(0, 1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct) 
Example 57
Project: dc_stat_think   Author: justinbois   File: test_dc_stat_think.py    MIT License 5 votes vote down vote up
def test_pandas_conversion(seed):
    df = pd.DataFrame({'a': [3, 2, 1, 4],
                       'b': [8, 6, 7, 5],
                       'c': [9.1, 10.1, 11.1, np.nan]})

    x, y = dcst.ecdf(df.loc[:, 'a'])
    assert (x == np.array([1, 2, 3, 4])).all()
    assert (y == np.array([0.25, 0.5, 0.75, 1.0])).all()

    x, y = dcst.ecdf(df.loc[:, 'c'])
    assert np.allclose(x, np.array([9.1, 10.1, 11.1]))
    assert np.allclose(y, np.array([1/3, 2/3, 1.0]))

    df = pd.DataFrame({
        'a': np.concatenate((np.random.normal(0, 1, size=10), [np.nan]*990)),
        'b': np.random.normal(0, 1, size=1000)})
    correct, _ = st.ks_2samp(df['a'].dropna(), df['b'])
    assert np.isclose(dcst.ks_stat(df['a'], df['b']), correct)

    df = pd.DataFrame({
        'a': np.concatenate((np.random.normal(0, 1, size=80), [np.nan]*20)),
        'b': np.random.normal(0, 1, size=100)})
    dcst_private._seed_numba(seed)
    correct = dcst.draw_bs_reps(df['a'].values, np.mean, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_bs_reps(df['a'], np.mean, size=100), correct,
                       atol=atol)

    dcst_private._seed_numba(seed)
    correct = dcst.draw_bs_reps(df['b'].values, np.mean, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_bs_reps(df['b'], np.mean, size=100), correct,
                       atol=atol)

    dcst_private._seed_numba(seed)
    correct = dcst.draw_perm_reps(df['a'].values, df['b'].values,
                                  dcst.diff_of_means, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_perm_reps(df['a'], df['b'],
                       dcst.diff_of_means, size=100), correct, atol=atol) 
Example 58
Project: DIVE-backend   Author: MacroConnections   File: numerical_comparison.py    GNU General Public License v3.0 4 votes vote down vote up
def get_valid_tests(equal_var, independent, normal, num_samples):
    '''
    Get valid tests given number of samples and statistical characterization of
    samples:

    Equal variance
    Independence
    Normality
    '''
    if num_samples == 1:
        valid_tests = {
            'chisquare': stats.chisquare,
            'power_divergence': stats.power_divergence,
            'kstest': stats.kstest
        }
        if normal:
            valid_tests['one_sample_ttest'] = stats.ttest_1samp

    elif num_samples == 2:
        if independent:
            valid_tests = {
                'mannwhitneyu': stats.mannwhitneyu,
                'kruskal': stats.kruskal,
                'ks_2samp': stats.ks_2samp
            }
            if normal:
                valid_tests['two_sample_ttest'] = stats.ttest_ind
                if equal_var:
                    valid_tests['f_oneway'] = stats.f_oneway
        else:
            valid_tests = {
                'two_sample_ks': stats.ks_2samp,
                'wilcoxon': stats.wilcoxon
            }
            if normal:
                valid_tests['two_sample_related_ttest'] = stats.ttest_rel

    elif num_samples >= 3:
        if independent:
            valid_tests = {
                'kruskal': stats.kruskal
            }
            if normal and equal_var:
                valid_tests['f_oneway'] = stats.f_oneway

        else:
            valid_tests = {
                'friedmanchisquare': stats.friedmanchisquare
            }

    return valid_tests 
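Each entry in the returned dictionary is a SciPy test function that can be applied directly to the samples. A small usage sketch (assuming get_valid_tests above is in scope) for the two-sample independent case:

import numpy as np
from scipy import stats

rng = np.random.default_rng(3)
x = rng.normal(0.0, 1, 200)
y = rng.normal(0.2, 1, 200)

tests = get_valid_tests(equal_var=True, independent=True,
                        normal=True, num_samples=2)
for name, test in tests.items():
    statistic, p = test(x, y)
    print(f'{name}: statistic={statistic:.3f}, p={p:.3f}')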
Example 59
Project: frbpoppy   Author: davidgardenier   File: frbpoppy_frbcat.py    MIT License 4 votes vote down vote up
def plot_dists(surv_pop, telescope):
    """Plot the fluence and DM distribution of a surveyed population.

    Args:
        surv_pop (Population): Population from which to plot
        telescope (str): Name of the telescope with which to compare the
            distribution. Necessary for Frbcat.

    """
    plot_aa_style(cols=2)

    # Plot dm distribution
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

    dm_frbpoppy = surv_pop.frbs.dm
    pprint(f'Number of detected FRBs: {len(dm_frbpoppy)}')
    ax1.step(*hist(dm_frbpoppy), where='mid', linestyle='dashed')

    df = Frbcat().df
    dm_frbcat = df[df.telescope == telescope].dm
    ax1.step(*hist(dm_frbcat), where='mid')

    # Compare distributions
    ks = ks_2samp(dm_frbpoppy, dm_frbcat)
    text = fr'$p={round(ks[1], 2)}$'
    anchored_text = AnchoredText(text, loc=1, borderpad=1., frameon=False)
    ax1.add_artist(anchored_text)

    ax1.set_xlabel(r'DM ($\textrm{pc}\ \textrm{cm}^{-3}$)')
    ax1.set_ylabel('Fraction')
    ax1.set_ylim([0, 1.1])
    ax1.set_xlim([0, 3500])

    # Plot fluence distributions
    fluence_frbpoppy = surv_pop.frbs.fluence
    ax2.step(*hist(fluence_frbpoppy, bin_type='log'), where='mid',
             label='frbpoppy', linestyle='dashed')

    fluence_frbcat = df[df.telescope == telescope].fluence
    ax2.step(*hist(fluence_frbcat, bin_type='log'), where='mid',
             label='frbcat')

    # Compare distributions
    ks = ks_2samp(fluence_frbpoppy, fluence_frbcat)
    text = fr'$p={round(ks[1], 2)}$'
    anchored_text = AnchoredText(text, loc=1, borderpad=1., frameon=False)
    ax2.add_artist(anchored_text)

    ax2.set_xlabel(r'Fluence (Jy ms)')
    ax2.set_ylim([0, 1.1])
    ax2.set_xlim([5e-1, 1e4])
    plt.xscale('log')

    plt.figlegend(loc='upper center', ncol=2, framealpha=1)

    plt.tight_layout()
    plt.savefig(rel_path(f'plots/frbpoppy_{telescope}.pdf'))
    plt.clf() 
Example 60
Project: cINNamon   Author: hagabbar   File: gw_INN.py    GNU General Public License v3.0 4 votes vote down vote up
def overlap_tests(pred_samp,lalinf_samp,true_vals,kernel_cnn,kernel_lalinf):
    """ Perform Anderson-Darling, K-S, and overlap tests
    to get quantifiable values for accuracy of GAN
    PE method
    Parameters
    ----------
    pred_samp: numpy array
        predicted PE samples from CNN
    lalinf_samp: numpy array
        predicted PE samples from lalinference
    true_vals:
        true scalar point values for parameters to be estimated (taken from GW event paper)
    kernel_cnn: scipy kde instance
        gaussian kde of CNN results
    kernel_lalinf: scipy kde instance
        gaussian kde of lalinference results
    Returns
    -------
    ks_score:
        k-s test score
    ad_score:
        anderson-darling score
    beta_score:
        overlap score. used to determine goodness of CNN PE estimates
    """

    # do k-s test
    ks_mc_score = ks_2samp(pred_samp[:,0].reshape(pred_samp[:,0].shape[0],),lalinf_samp[0][:])
    ks_q_score = ks_2samp(pred_samp[:,1].reshape(pred_samp[:,1].shape[0],),lalinf_samp[1][:])
    ks_score = np.array([ks_mc_score,ks_q_score])

    # do anderson-darling test
    ad_mc_score = anderson_ksamp([pred_samp[:,0].reshape(pred_samp[:,0].shape[0],),lalinf_samp[0][:]])
    ad_q_score = anderson_ksamp([pred_samp[:,1].reshape(pred_samp[:,1].shape[0],),lalinf_samp[1][:]])
    ad_score = [ad_mc_score,ad_q_score]

    # compute overlap statistic
    comb_mc = np.concatenate((pred_samp[:,0].reshape(pred_samp[:,0].shape[0],1),lalinf_samp[0][:].reshape(lalinf_samp[0][:].shape[0],1)))
    comb_q = np.concatenate((pred_samp[:,1].reshape(pred_samp[:,1].shape[0],1),lalinf_samp[1][:].reshape(lalinf_samp[1][:].shape[0],1)))
    X, Y = np.mgrid[np.min(comb_mc):np.max(comb_mc):100j, np.min(comb_q):np.max(comb_q):100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    #cnn_pdf = np.reshape(kernel_cnn(positions).T, X.shape)
    #print(positions.shape,pred_samp.shape)
    cnn_pdf = kernel_cnn.pdf(positions)

    #X, Y = np.mgrid[np.min(lalinf_samp[0][:]):np.max(lalinf_samp[0][:]):100j, np.min(lalinf_samp[1][:]):np.max(lalinf_samp[1][:]):100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    #lalinf_pdf = np.reshape(kernel_lalinf(positions).T, X.shape)
    lalinf_pdf = kernel_lalinf.pdf(positions)

    beta_score = np.divide(np.sum( cnn_pdf*lalinf_pdf ),
                              np.sqrt(np.sum( cnn_pdf**2 ) * 
                              np.sum( lalinf_pdf**2 )))
    

    return ks_score, ad_score, beta_score 
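The overlap (beta) statistic computed at the end is a normalised inner product of the two KDE surfaces: it equals 1 when the densities coincide and falls towards 0 as they separate. A minimal standalone sketch of just that statistic for two 2-D sample sets (shapes and grid resolution here are illustrative):

import numpy as np
from scipy.stats import gaussian_kde

rng = np.random.default_rng(4)
samp_a = rng.normal(0.0, 1, size=(2, 500))   # gaussian_kde expects (dim, n)
samp_b = rng.normal(0.5, 1, size=(2, 500))

kde_a, kde_b = gaussian_kde(samp_a), gaussian_kde(samp_b)

# Evaluate both densities on one shared grid covering both samples.
lo = min(samp_a.min(), samp_b.min())
hi = max(samp_a.max(), samp_b.max())
X, Y = np.mgrid[lo:hi:100j, lo:hi:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
pdf_a, pdf_b = kde_a.pdf(positions), kde_b.pdf(positions)

beta = np.sum(pdf_a * pdf_b) / np.sqrt(np.sum(pdf_a**2) * np.sum(pdf_b**2))
print(beta)  # close to 1 for overlapping densities, towards 0 otherwise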
Example 61
Project: cINNamon   Author: hagabbar   File: gaussian_INN.py    GNU General Public License v3.0 4 votes vote down vote up
def overlap_tests(pred_samp,lalinf_samp,true_vals,kernel_cnn,kernel_lalinf):
    """ Perform Anderson-Darling, K-S, and overlap tests
    to get quantifiable values for accuracy of GAN
    PE method
    Parameters
    ----------
    pred_samp: numpy array
        predicted PE samples from CNN
    lalinf_samp: numpy array
        predicted PE samples from lalinference
    true_vals:
        true scalar point values for parameters to be estimated (taken from GW event paper)
    kernel_cnn: scipy kde instance
        gaussian kde of CNN results
    kernel_lalinf: scipy kde instance
        gaussian kde of lalinference results
    Returns
    -------
    ks_score:
        k-s test score
    ad_score:
        anderson-darling score
    beta_score:
        overlap score. used to determine goodness of CNN PE estimates
    """

    # do k-s test
    ks_mc_score = ks_2samp(pred_samp[:,0].reshape(pred_samp[:,0].shape[0],),lalinf_samp[0][:])
    ks_q_score = ks_2samp(pred_samp[:,1].reshape(pred_samp[:,1].shape[0],),lalinf_samp[1][:])
    ks_score = np.array([ks_mc_score,ks_q_score])

    # do anderson-darling test
    ad_mc_score = anderson_ksamp([pred_samp[:,0].reshape(pred_samp[:,0].shape[0],),lalinf_samp[0][:]])
    ad_q_score = anderson_ksamp([pred_samp[:,1].reshape(pred_samp[:,1].shape[0],),lalinf_samp[1][:]])
    ad_score = [ad_mc_score,ad_q_score]

    # compute overlap statistic
    comb_mc = np.concatenate((pred_samp[:,0].reshape(pred_samp[:,0].shape[0],1),lalinf_samp[0][:].reshape(lalinf_samp[0][:].shape[0],1)))
    comb_q = np.concatenate((pred_samp[:,1].reshape(pred_samp[:,1].shape[0],1),lalinf_samp[1][:].reshape(lalinf_samp[1][:].shape[0],1)))
    X, Y = np.mgrid[np.min(comb_mc):np.max(comb_mc):100j, np.min(comb_q):np.max(comb_q):100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    #cnn_pdf = np.reshape(kernel_cnn(positions).T, X.shape)
    #print(positions.shape,pred_samp.shape)
    cnn_pdf = kernel_cnn.pdf(positions)

    #X, Y = np.mgrid[np.min(lalinf_samp[0][:]):np.max(lalinf_samp[0][:]):100j, np.min(lalinf_samp[1][:]):np.max(lalinf_samp[1][:]):100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    #lalinf_pdf = np.reshape(kernel_lalinf(positions).T, X.shape)
    lalinf_pdf = kernel_lalinf.pdf(positions)

    beta_score = np.divide(np.sum( cnn_pdf*lalinf_pdf ),
                              np.sqrt(np.sum( cnn_pdf**2 ) * 
                              np.sum( lalinf_pdf**2 )))
    

    return ks_score, ad_score, beta_score 
Example 62
Project: cINNamon   Author: hagabbar   File: Radynversion.py    GNU General Public License v3.0 4 votes vote down vote up
def overlap_tests(pred_samp,lalinf_samp,true_vals,kernel_cnn,kernel_lalinf):
    """ Perform Anderson-Darling, K-S, and overlap tests
    to get quantifiable values for accuracy of GAN
    PE method
    Parameters
    ----------
    pred_samp: numpy array
        predicted PE samples from CNN
    lalinf_samp: numpy array
        predicted PE samples from lalinference
    true_vals:
        true scalar point values for parameters to be estimated (taken from GW event paper)
    kernel_cnn: scipy kde instance
        gaussian kde of CNN results
    kernel_lalinf: scipy kde instance
        gaussian kde of lalinference results
    Returns
    -------
    ks_score:
        k-s test score
    ad_score:
        anderson-darling score
    beta_score:
        overlap score. used to determine goodness of CNN PE estimates
    """

    # do k-s test
    ks_mc_score = ks_2samp(pred_samp[:,0].reshape(pred_samp[:,0].shape[0],),lalinf_samp[0][:])
    ks_q_score = ks_2samp(pred_samp[:,1].reshape(pred_samp[:,1].shape[0],),lalinf_samp[1][:])
    ks_score = np.array([ks_mc_score,ks_q_score])

    # do anderson-darling test
    ad_mc_score = anderson_ksamp([pred_samp[:,0].reshape(pred_samp[:,0].shape[0],),lalinf_samp[0][:]])
    ad_q_score = anderson_ksamp([pred_samp[:,1].reshape(pred_samp[:,1].shape[0],),lalinf_samp[1][:]])
    ad_score = [ad_mc_score,ad_q_score]

    # compute overlap statistic
    comb_mc = np.concatenate((pred_samp[:,0].reshape(pred_samp[:,0].shape[0],1),lalinf_samp[0][:].reshape(lalinf_samp[0][:].shape[0],1)))
    comb_q = np.concatenate((pred_samp[:,1].reshape(pred_samp[:,1].shape[0],1),lalinf_samp[1][:].reshape(lalinf_samp[1][:].shape[0],1)))
    X, Y = np.mgrid[np.min(comb_mc):np.max(comb_mc):100j, np.min(comb_q):np.max(comb_q):100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    #cnn_pdf = np.reshape(kernel_cnn(positions).T, X.shape)
    #print(positions.shape,pred_samp.shape)
    cnn_pdf = kernel_cnn.pdf(positions)

    #X, Y = np.mgrid[np.min(lalinf_samp[0][:]):np.max(lalinf_samp[0][:]):100j, np.min(lalinf_samp[1][:]):np.max(lalinf_samp[1][:]):100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    #lalinf_pdf = np.reshape(kernel_lalinf(positions).T, X.shape)
    lalinf_pdf = kernel_lalinf.pdf(positions)

    beta_score = np.divide(np.sum( cnn_pdf*lalinf_pdf ),
                              np.sqrt(np.sum( cnn_pdf**2 ) *
                              np.sum( lalinf_pdf**2 )))


    return ks_score, ad_score, beta_score

Example 63
Project: gwin   Author: gwastro   File: burn_in.py    GNU General Public License v3.0 4 votes vote down vote up
def ks_test(sampler, fp, threshold=0.9):
    """Burn in based on whether the p-value of the KS test between the samples
    at the last iteration and the samples midway along the chain for each
    parameter is > ``threshold``.

    Parameters
    ----------
    sampler : gwin.sampler
        Sampler to determine burn in for. May be either an instance of a
        `gwin.sampler`, or the class itself.
    fp : InferenceFile
        Open inference hdf file containing the samples to load for determining
        burn in.
    threshold : float
        The threshold to use for the p-value. Default is 0.9.

    Returns
    -------
    burn_in_idx : array
        Array of indices giving the burn-in index for each chain.
    is_burned_in : array
        Array of booleans indicating whether each chain is burned in.
    """
    nwalkers = fp.nwalkers
    niterations = fp.niterations
    # Create a dictionary whose keys are the variable params and whose
    # values are booleans indicating whether the p-value for that
    # parameter satisfies the KS test
    is_burned_in_param = {}
    # iterate over the parameters
    for param in fp.variable_params:
        # read samples for the parameter from the last iteration of the chain
        samples_last_iter = sampler.read_samples(fp, param, iteration=-1,
                                                 flatten=True)[param]
        # read samples for the parameter from the iteration midway
        # along the chain
        samples_chain_midpt = sampler.read_samples(
            fp, param, iteration=int(niterations/2), flatten=True)[param]
        _, p_value = ks_2samp(samples_last_iter, samples_chain_midpt)
        # check if p_value exceeds the desired threshold
        is_burned_in_param[param] = p_value > threshold

    # The chains are burned in if the p-value of the KS test exceeds
    # the threshold for all the parameters.
    # If the KS test is passed, the chains have burned in at their
    # mid-way point.
    if all(is_burned_in_param.values()):
        is_burned_in = numpy.ones(nwalkers, dtype=bool)
        burn_in_idx = numpy.repeat(niterations/2, nwalkers).astype(int)
    else:
        is_burned_in = numpy.zeros(nwalkers, dtype=bool)
        burn_in_idx = numpy.repeat(niterations, nwalkers).astype(int)
    return burn_in_idx, is_burned_in 
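Stripped of the sampler and HDF file machinery, the burn-in criterion reduces to a KS test between the samples at the chain's midpoint and at its final iteration. A minimal sketch with a plain array of samples (shapes and threshold here are illustrative):

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(5)
niterations, nwalkers = 1000, 64
chain = rng.normal(0, 1, size=(niterations, nwalkers))  # stand-in for one parameter

samples_midpt = chain[niterations // 2]
samples_last = chain[-1]
_, p_value = ks_2samp(samples_last, samples_midpt)

# Burned in at the midpoint if the two draws look like the same distribution.
print(p_value, p_value > 0.9)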
Example 64
Project: SamPy   Author: sniemi   File: AnalyseFieldEs.py    BSD 2-Clause "Simplified" License 4 votes vote down vote up
def main(saved):
    import scipy.stats as S
    import numpy as N

    #Constants
    save = saved
    bmag = 45
    volume = 5.*194.**3.
    type = 9

    #Reads data
    ET4 = N.loadtxt("EllipticalsT4.out")
    fEsall = N.loadtxt("FieldEllipticals.out")
    
    #marks only the ones which fulfil criterion
    indices = N.where(ET4[:,bmag] <= -19.) 
    ET4magb = ET4[indices]

    #Takes away three that are actually subhaloes...
    ind = N.where(fEsall[:,type] == 0)
    fEs = fEsall[ind]

    hists = histograms()

    print('\nNumber of fEs and Es:\n %i %i' % (len(fEs), len(ET4magb)))
    
    for plot in hists:   
        column = hists[plot]['column']
        file = hists[plot]['file']
        bins = hists[plot]['bins']
        min =  hists[plot]['min']
        max =  hists[plot]['max']

        if (file == 'colour'):
            temp = fEs[:,45] - fEs[:,47]
            temp1 = ET4magb[:,45] - ET4magb[:,47]
        elif (file == 'companions'):
            temp = fEs[:,63] + fEs[:,64]
            temp1 = fEs[:,64]
        else:
            temp = fEs[:,column]
            temp1 = ET4magb[:,column]

        SMNhistogram(file, temp, temp1, bins, min, max, save, volume)
        
        D, p = S.ks_2samp(temp, temp1)
        print('\nKolmogorov-Smirnov statistics for %s:' % file)
        print('D-value = %e \np-value = %e' % (D, p))


        print('\nStatistics 1st fEs and then Es:')
        print(("%20s" + "%15s"*7) % ("Name", "1st Quart", "Median", "3rd Quart", "Max", "Min", "Mean", "Stdev"))
        frmt = "%14s & %12.2f & %12.2f & %12.2f & %12.2f & %12.2f & %12.2f & %12.2f"
        frmt1 = "IfEs &" + frmt
        frmt2 = "Es   &" + frmt
        print(frmt1 % (file, S.scoreatpercentile(temp, 25),
                       N.median(temp), S.scoreatpercentile(temp, 75),
                       N.max(temp), N.min(temp), N.mean(temp), N.std(temp)))
        print(frmt2 % (file, S.scoreatpercentile(temp1, 25),
                       N.median(temp1), S.scoreatpercentile(temp1, 75),
                       N.max(temp1), N.min(temp1), N.mean(temp1), N.std(temp1)))