# Python source code of threshold

"""Framework for empirical estimation of thresholds for flaky tests.

The main entry points for this module are
`compute_sufficiently_stringent_threshold` and `check_generator`.
`compute_sufficiently_stringent_threshold` is run once to derive threshold
values, which are then used in unit tests which call `check_generator` to
verify that the thresholds are satisfied.

A `generator` is expected to look after its own stochastic state. You can
conveniently make such a generator using functools.partial. E.g.

    >>> import math, functools, numpy.random, scipy.stats
    >>> r = numpy.random.RandomState(0)
    >>> test_statistic = scipy.stats.norm(0, 1).rvs
    >>> seeded_test_statistic = functools.partial(
                test_statistic, random_state=r)
    >>> t = compute_sufficiently_stringent_threshold(
                seeded_test_statistic, 10, 1e-6)
    ...
    >>> check_generator(seeded_test_statistic, 10, t.threshold, t.failprob)
    0  # Number of times the test ran before success

    # Force a failure by lying about the number of times the test needs to run
    >>> for _ in range(100):
            check_generator(seeded_test_statistic, 2, t.threshold, t.failprob)
    ...
    MultipleTestStatisticFailures: For 2 times in a row, <functools.partial
    object at 0x116dd7fc8> has returned a value less than -0.708. The
    probability of this was empirically estimated to be less than 1e-06,
    suggesting that the distribution of return values has changed.

"""

from collections import namedtuple
import math
import numbers


def lbeta(m, n):
    """Return the natural logarithm of the Beta function at (m, n).

    Evaluated through the identity
    log B(m, n) = lgamma(m) + lgamma(n) - lgamma(m + n).
    """
    log_gamma = math.lgamma
    numerator = log_gamma(m) + log_gamma(n)
    return numerator - log_gamma(m + n)


def isnumber(n):
    """Tell whether `n` is an instance of `numbers.Number`.

    Note that `bool` values count as numbers, because `bool` subclasses
    `int`.
    """
    is_numeric = isinstance(n, numbers.Number)
    return is_numeric


class FailProbThreshold(namedtuple('FailProbThreshold',
                                   ['fprob', 'fthreshold'])):

    """Result pair produced by `failprob_threshold`.

    The symbols `ns`, `D`, `observed` and `threshold` used below are the ones
    defined in the docstring of `failprob_threshold`.

    `fprob`: Estimate of the probability that `ns` samples drawn from `D` all
    fall below `fthreshold`.

    `fthreshold`: Maximum-Likelihood estimate, based on `observed`, of the
    `threshold`**(1/ns) quantile of `D`. In other words it is picked so that
    P(X < fthreshold | D) is roughly `threshold`**(1/ns).

    """


def failprob_threshold(observed, ns, threshold):
    """Estimate a per-sample bound and the chance of `ns` draws all below it.

    Takes ([float] observed, int ns, float threshold)

    `observed`: non-empty collection of iid numeric samples of some test
    statistic following some distribution `D`. It is copied, never mutated.

    `ns`: number of iid samples to be drawn when the test statistic is being
          used in a test of a statistical method.

    `threshold`: desired maximum probability that `ns` iid samples from `D` are
    all less than some bound to be determined, `fthreshold`.

    Returns a `FailProbThreshold`, which contains the computed `fthreshold` and
    an estimated probability `fprob`. `fprob` is aimed at `threshold` but is an
    estimate from finitely many samples, so it is NOT guaranteed to be below
    `threshold`; callers that need that must check it themselves. See
    `FailProbThreshold`'s docstring for more details.

    Raises `ValueError` if `observed` is empty or holds a non-numeric value, if
    `threshold` is not in [0, 1], if `ns` is not a natural number, or if the
    selected quantile value occurs more than once in `observed`.

    """

    # Materialise once, so that a one-shot iterable is not exhausted by the
    # validation pass before it gets sorted.
    observed = list(observed)

    # Type checking
    if not all(isnumber(d) for d in observed):
        raise ValueError('observed is not a list of numeric values')
    if (not isnumber(threshold)) or (threshold > 1) or (threshold < 0):
        raise ValueError('threshold is not a probability')
    if (not isnumber(ns)) or (round(ns) != ns) or (ns < 1):
        raise ValueError('ns is not a natural number')
    if not observed:
        # Without this the quantile lookup below fails with an IndexError.
        raise ValueError('observed is empty')

    # Compute the quantile which should be tested for in each subtest
    observed.sort()
    sub_threshold = threshold**(1. / ns)
    # Clamp to the last valid index: when sub_threshold is (or rounds to) 1.0
    # the raw index equals len(observed), which is out of range.
    mlxidx = min(int(len(observed) * sub_threshold), len(observed) - 1)
    # ...this is the estimated `threshold**(1/ns)` quantile for `D`.
    mlx = observed[mlxidx]
    if observed.count(mlx) > 1:
        # If mlx occurs more than once, it's likely that it contains
        # non-trivial probability mass. If the target quantile lies within that
        # mass then the key assumption of this approach (i.e. that P(y<mlx) is
        # approximately sub_threshold) has broken down and a different
        # threshold should be chosen.
        raise ValueError('''Requested quantile may lie in Dirac delta fn.

        Note: This may happen because of accidentally repeated "random" state
        in your test statistic.''')

    # Compute the observed counts below and above the threshold mlx
    # NOTE(review): mlx is unique at this point, so exactly `mlxidx` samples
    # sort strictly before it; `mlxidx - 1` therefore looks like an undercount
    # by one. Left unchanged because altering it would shift every threshold
    # derived so far -- confirm the intended counting.
    below = max(0, mlxidx - 1)
    above = len(observed) - mlxidx - 1

    # We have observed "below" samples less than or equal to mlx, "above"
    # samples above it. If we treat these as observations of a binomial, the
    # posterior on P(y<mlx) is a Beta(below+1,above+1) distribution, call it
    # PB. The posterior probability of "ns" iid samples less than or equal to
    # mlx is the integral over the unit interval of (q**ns)*PB(q,1-q), which
    # is, modulo a constant, the integrand of the density
    # Beta(below+ns+1,above+1). I.e.
    lpfail = lbeta(below + ns + 1, above + 1) - lbeta(below + 1, above + 1)
    return FailProbThreshold(fprob=math.exp(lpfail), fthreshold=mlx)

# Field names of `TestThreshold`, in positional order.
test_threshold_fields = ['threshold', 'failprob', 'sample_size']


class TestThreshold(namedtuple('TestThreshold', test_threshold_fields)):

    """Container for the return value of
    `compute_sufficiently_stringent_threshold`. See its docstring for notation
    unspecified here.

    `threshold`: Threshold value, below which a single instance of the
                 `generator` test statistic is to be deemed failed.

    `failprob`: Estimated maximum probability that `ns` draws from
                `generator` will all be less than `threshold`

    `sample_size`: How many samples we had to draw to be confident that
                   `threshold` meets the requirements.

    """

    # The class name starts with "Test", so test collectors (pytest, nose)
    # would try to collect this plain data container whenever it is imported
    # into a test module. Opt out explicitly.
    __test__ = False


def compute_sufficiently_stringent_threshold(generator, ns, maxprob):
    """Compute a failure threshold for return values of `generator`.

    `generator`: A function which takes no arguments and returns a float. Its
                 return values are assumed to be iid.

    `ns`: Number of samples to be drawn when the generator is being used as a
          test of a statistical method.

    `maxprob`: Desired maximum probability of failure in `ns` samples in a row.

    Returns a `TestThreshold`. See its docstring for details, but the value you
    care most about is `threshold`. The estimated probability of `generator`
    getting a value less than this `ns` times in a row is less than `maxprob`.

    Samples are drawn from `generator` in batches and accumulated; the loop
    only ends once the estimated failure probability drops below `maxprob`.
    Any `ValueError` raised by `failprob_threshold` propagates to the caller.

    """
    # XXX: The batch size can be computed exactly ahead of time. This is good
    # enough for now, though.
    batchsize = int(maxprob**(-1. / ns)) + 1
    observed = []
    while True:
        # `range` (not `xrange`) so that this runs on both Python 2 and 3.
        observed.extend(generator() for _ in range(batchsize))
        # Aim slightly below maxprob (presumably to leave headroom for the
        # estimate), then verify the estimate against maxprob itself.
        probfail, x = failprob_threshold(observed, ns, 0.9 * maxprob)
        if probfail < maxprob:
            return TestThreshold(x, probfail, len(observed))


class MultipleTestStatisticFailures(RuntimeError):
    """Raised when a test statistic is too low too many times.

    The constructor arguments are kept as attributes for later inspection:

    `generator`: the test statistic that was being checked.

    `ns`: how many consecutive low values were observed.

    `threshold`: the bound the values fell below.

    `probfail`: the estimated probability bound quoted in the message.

    `statistics`: the values passed in by the raiser.
    """

    def __init__(self, generator, ns, threshold, probfail, statistics):
        self.generator = generator
        self.ns = ns
        self.threshold = threshold
        self.probfail = probfail
        self.statistics = statistics
        # Set args so that raising this gives a meaningful message.
        # '%i' rather than '% i': the space flag would put an extra blank in
        # front of a positive count ("For  2 times").
        failure_template = '''For %i times in a row, %s has returned a value
less than %.3g. The probability of this was empirically estimated to be less
than %.3g, suggesting that the distribution of return values has changed.'''
        super(MultipleTestStatisticFailures, self).__init__(
            failure_template % (ns, generator, threshold, probfail))


def check_generator(generator, ns, threshold, probfail):
    """Verify that "generator" does not keep returning absurdly low values.

    `generator`: the test statistic, called with no arguments.

    `ns`: How many times in a row "generator" may return a value less than
        "threshold" before a failure is reported.

    `threshold`: The value a draw must reach (>=) to count as a success.

    `probfail`: A bound on the probability of a failure being reported.

    These values should come from `compute_sufficiently_stringent_threshold`.

    Returns the number of failing draws that preceded the first success.
    Raises `MultipleTestStatisticFailures` if all `ns` draws fall below
    "threshold".

    """

    drawn = []
    for attempt in range(ns):
        latest = generator()
        drawn.append(latest)
        if latest >= threshold:
            # `attempt` draws failed before this successful one.
            return attempt
    raise MultipleTestStatisticFailures(generator, ns, threshold, probfail,
                                        drawn)