# Python source code of anomaly

# Copyright 2017 LinkedIn Corporation. All rights reserved. Licensed under the BSD-2 Clause license.
# See LICENSE in the project root for license information.

import math
import logging
import statistics


# Module-level logger; the checks below write their details to it at DEBUG level.
log = logging.getLogger(__name__)

# Share of values expected within X sigmas of the mean, used in various standard 3 sigma checks.
# Despite the _PERCENT suffix these are fractions in the 0-1 range (0.68 means 68%).
ONE_SIGMA_PERCENT = 0.68
TWO_SIGMA_PERCENT = .9545
THREE_SIGMA_PERCENT = .9973


def within_stdev_percent(values, x_stdev, percent_threshold, min_values=100):
    '''
    Return True if more than percent_threshold of values are within x_stdev standard deviations of the mean.

    values: sized collection of numbers to check.
    x_stdev: how many sample standard deviations away from the mean a value may be (inclusive) and still count.
    percent_threshold: fraction in the 0-1 range; the share of in-range values must be strictly greater than it.
    min_values: with fewer values than this the check is skipped and True is returned.

    True is also returned when there are fewer than two values, because a sample
    standard deviation cannot be computed from them.
    '''
    value_count = len(values)
    # Too little data to judge. The "< 2" guard keeps statistics.stdev from raising
    # StatisticsError when a caller lowers min_values below two.
    if value_count < min_values or value_count < 2:
        return True

    mean = statistics.mean(values)
    stdev = statistics.stdev(values)
    allowed_diff = stdev * x_stdev

    # Only the number of in-range values matters, so count them instead of collecting them.
    found = sum(1 for v in values if abs(mean - v) <= allowed_diff)
    percent_found = found / value_count
    result = percent_found > percent_threshold

    # Both shares are 0-1 fractions; the "%" format type scales them by 100 for display.
    log.debug(f"Within {x_stdev} sigma check was {result}. {percent_found:.2%}/{percent_threshold:.2%} within stdev*{x_stdev}. "
              f"Mean: {mean:.2f}. Stdev: {stdev:.2f}. Acceptable range was: {mean - allowed_diff:.2f} - {mean + allowed_diff:.2f}")
    return result


def within_one_sigma(values, percent_threshold=ONE_SIGMA_PERCENT, min_values=100):
    '''
    Return True if more than percent_threshold of values are within one standard deviation of the mean.
    Always returns True when there are fewer than min_values values.
    '''
    return within_stdev_percent(values, 1, percent_threshold, min_values)


def within_two_sigma(values, percent_threshold=TWO_SIGMA_PERCENT, min_values=100):
    '''
    Return True if more than percent_threshold of values are within two standard deviations of the mean.
    Always returns True when there are fewer than min_values values.
    '''
    return within_stdev_percent(
        values=values,
        x_stdev=2,
        percent_threshold=percent_threshold,
        min_values=min_values,
    )


def within_three_sigma(values, percent_threshold=THREE_SIGMA_PERCENT, min_values=100):
    '''
    Return True if more than percent_threshold of values are within three standard deviations of the mean.
    Always returns True when there are fewer than min_values values.
    '''
    return within_stdev_percent(
        values=values,
        x_stdev=3,
        percent_threshold=percent_threshold,
        min_values=min_values,
    )


def within_all_three_sigma(values, min_values=100):
    '''
    Return False if values do not pass a standard three sigma test.

    The one, two, and three sigma checks are run in that order with their default
    thresholds, stopping at the first failure; all three must pass for True.
    If the number of values is less than min_values, every individual check passes,
    so this will always return True.
    '''
    # Forward min_values so the caller's setting, not each check's own default, is applied.
    return (within_one_sigma(values, min_values=min_values)
            and within_two_sigma(values, min_values=min_values)
            and within_three_sigma(values, min_values=min_values))


def abnormal_distribution(values, ignore_zero=False, probability=1e-30):
    '''
    Return True if too many values fall within the same sigma band consecutively, meaning the distribution is not normal according to the 3 sigma rule.
    Probability is loosely the likelihood of this test triggering during a normal distribution. A lower value means fewer false positives.

    values: sized collection of numbers, examined in order.
    ignore_zero: when True, values equal to 0 are dropped before anything is computed.
    probability: used to derive the longest tolerated run for each band; see above.

    The bands are: closer than one sigma to the mean, between one and two sigma, and
    between two and three sigma. A value three sigma or more away belongs to no band
    and ends the current run. Returns False when fewer than two values remain, because
    a sample standard deviation cannot be computed from them.
    '''
    if ignore_zero:
        values = [value for value in values if value != 0]

    # statistics.stdev needs at least two data points; with fewer there is nothing to judge.
    if len(values) < 2:
        return False

    # Longest tolerated run per band. Each is derived from the chance of a value landing
    # outside that sigma range, so it is a loose heuristic rather than an exact run probability.
    log_probability = math.log10(probability)
    one_sigma_threshold = log_probability / math.log10(1 - ONE_SIGMA_PERCENT)
    two_sigma_threshold = log_probability / math.log10(1 - TWO_SIGMA_PERCENT)
    three_sigma_threshold = log_probability / math.log10(1 - THREE_SIGMA_PERCENT)
    thresholds = (one_sigma_threshold, two_sigma_threshold, three_sigma_threshold)

    mean = statistics.mean(values)
    stdev = statistics.stdev(values)

    # Exclusive upper distance from the mean for bands 0, 1 and 2.
    band_limits = (stdev * 1, stdev * 2, stdev * 3)

    longest_runs = [0, 0, 0]
    current_band = None
    current_run = 0
    for v in values:
        diff = abs(mean - v)
        # Index of the first band whose limit the value is under, or None for an outlier.
        band = next((index for index, limit in enumerate(band_limits) if diff < limit), None)
        if band is None:
            # Beyond three sigma: the value is in no band, so it breaks whatever run was in progress.
            current_band = None
            current_run = 0
            continue
        if band != current_band:
            current_band = band
            current_run = 0
        current_run += 1
        longest_runs[band] = max(longest_runs[band], current_run)

    # A run exceeded its threshold at some point exactly when the longest run did.
    result = any(longest > threshold for longest, threshold in zip(longest_runs, thresholds))

    one_sigma_count_max, two_sigma_count_max, three_sigma_count_max = longest_runs
    log.debug(f"Abnormal Distribution: {result}. Max consecutive values within one, two, and three sigma in a row: "
              f"{one_sigma_count_max}/{one_sigma_threshold:.2f}, "
              f"{two_sigma_count_max}/{two_sigma_threshold:.2f}, "
              f"{three_sigma_count_max}/{three_sigma_threshold:.2f}. "
              f"Mean: {mean:.2f}. Stdev: {stdev:.2f}.")
    return result