python source code of measures

"""
Module measures contains: 
    function for calculating information gain
    function for calculating minimum description length
    heuristic for searching best binary split of nominal values
    function for equal frequency label discretization of numerical values
    function for random discretization of numerical values
"""

import math
from collections import Counter

import numpy as np


def mdl(x, y, ft, accuracy, separate_max):
    return mdl_nominal(x, y, separate_max) if ft == "d" else mdl_numeric(x, y, accuracy)


def info_gain(x, y, ft, accuracy, separate_max):
    return info_gain_nominal(x, y, separate_max) if ft == "d" else info_gain_numeric(x, y, accuracy)


def nominal_splits(x, y, x_vals, y_dist, separate_max):
    """
    Function uses heuristic to find best binary split of nominal values. Heuristic is described in (1) and it is
    originally defined for binary classes. We extend it to work with multiple classes by comparing label with least
    samples to others.

    x: numpy array - nominal feature
    y: numpy array - label
    x_vals: numpy array - unique nominal values of x
    y_dist: dictionary - distribution of labels

    Reference:
    (1) Classification and Regression Trees by Breiman, Friedman, Olshen, and Stone, pages 101- 102. 
    """
    # select a label with least samples
    y_val = max(y_dist, key=y_dist.get) if separate_max else min(y_dist, key=y_dist.get)

    prior = y_dist[y_val] / float(len(y))  # prior distribution of selected label

    values, dist, splits = [], [], []
    for x_val in x_vals:  # for every unique nominal value
        dist.append(Counter(y[x == x_val]))  # distribution of labels at selected nominal value
        splits.append(x_val)
        suma = sum([prior * dist[-1][y_key] for y_key in y_dist.keys()])
        # estimate probability
        values.append(prior * dist[-1][y_val] / float(suma))
    indices = np.array(values).argsort()[::-1]

    # distributions and splits are sorted according to probabilities
    return np.array(dist)[indices], np.array(splits)[indices].tolist()


def h(values):
    """
    Function calculates entropy.

    values: list of integers
    """
    ent = np.true_divide(values, np.sum(values))
    return -np.sum(np.multiply(ent, np.log2(ent)))


def info_gain_nominal(x, y, separate_max):
    """
    Function calculates information gain for discrete features. If feature is continuous it is firstly discretized.

    x: numpy array - numerical or discrete feature
    y: numpy array - labels
    ft: string - feature type ("c" - continuous, "d" - discrete)
    split_fun: function - function for discretization of numerical features
    """
    x_vals = np.unique(x)  # unique values
    if len(x_vals) < 3:  # if there is just one unique value
        return None
    y_dist = Counter(y)  # label distribution
    h_y = h(y_dist.values())  # class entropy

    # calculate distributions and splits in accordance with feature type

    dist, splits = nominal_splits(x, y, x_vals, y_dist, separate_max)

    indices, repeat = (range(1, len(dist)), 1) if len(dist) < 50 else (range(1, len(dist), len(dist) / 10), 3)
    interval = len(dist) / 10

    max_ig, max_i, iteration = 0, 1, 0
    while iteration < repeat:
        for i in indices:
            dist0 = np.sum([el for el in dist[:i]])  # iter 0: take first distribution
            dist1 = np.sum([el for el in dist[i:]])  # iter 0: take the other distributions without first
            coef = np.true_divide([np.sum(dist0.values()), np.sum(dist1.values())], len(y))
            ig = h_y - np.dot(coef, [h(dist0.values()), h(dist1.values())])  # calculate information gain
            if ig > max_ig:
                max_ig, max_i = ig, i  # store index and value of maximal information gain
        iteration += 1
        if repeat > 1:
            interval = int(interval * 0.5)
            if max_i in indices and interval > 0:
                middle_index = indices.index(max_i)
            else:
                break
            min_index = middle_index if middle_index == 0 else middle_index - 1
            max_index = middle_index if middle_index == len(indices) - 1 else middle_index + 1
            indices = range(indices[min_index], indices[max_index], interval)

    # store splits of maximal information gain in accordance with feature type
    return float(max_ig), [splits[:max_i], splits[max_i:]]


def info_gain_numeric(x, y, accuracy):
    x_unique = list(np.unique(x))
    if len(x_unique) == 1:
        return None
    indices = x.argsort()  # sort numeric attribute
    x, y = x[indices], y[indices]  # save sorted features with sorted labels

    right_dist = np.bincount(y)
    dummy_class = np.array([len(right_dist)])
    class_indices = right_dist.nonzero()[0]
    right_dist = right_dist[class_indices]
    left_dist = np.zeros(len(class_indices))

    diffs = np.nonzero(y[:-1] != y[1:])[0] + 1  # different neighbor classes have value True
    if accuracy > 0:
        diffs = np.array([diffs[i] for i in range(1, len(diffs)) if diffs[i] - diffs[i - 1] > accuracy],
                         dtype=np.int32) if len(diffs) > 15 else diffs
    intervals = np.array((np.concatenate(([0], diffs[:-1])), diffs)).T
    if len(diffs) < 2:
        return None

    max_ig, max_i, max_j = 0, 0, 0
    prior_h = h(right_dist)  # calculate prior entropy

    for i, j in intervals:
        dist = np.bincount(np.concatenate((dummy_class, y[i:j])))[class_indices]
        left_dist += dist
        right_dist -= dist
        coef = np.true_divide((np.sum(left_dist), np.sum(right_dist)), len(y))
        ig = prior_h - np.dot(coef, [h(left_dist[left_dist.nonzero()]), h(right_dist[right_dist.nonzero()])])
        if ig > max_ig:
            max_ig, max_i, max_j = ig, i, j

    if x[max_i] == x[max_j]:
        ind = x_unique.index(x[max_i])
        mean = np.float32(np.mean((x_unique[1 if ind == 0 else ind - 1], x_unique[ind])))
    else:
        mean = np.float32(np.mean((x[max_i], x[max_j])))

    return float(max_ig), [mean, mean]


def multinomLog2(selectors):
    """
    Function calculates logarithm 2 of a kind of multinom.

    selectors: list of integers
    """

    ln2 = 0.69314718055994528622
    noAll = sum(selectors)
    lgNf = math.lgamma(noAll + 1.0) / ln2  # log2(N!)

    lgnFac = []
    for selector in selectors:
        if selector == 0 or selector == 1:
            lgnFac.append(0.0)
        elif selector == 2:
            lgnFac.append(1.0)
        elif selector == noAll:
            lgnFac.append(lgNf)
        else:
            lgnFac.append(math.lgamma(selector + 1.0) / ln2)
    return lgNf - sum(lgnFac)


def calc_mdl(yx_dist, y_dist):
    """
    Function calculates mdl with given label distributions. 

    yx_dist: list of dictionaries - for every split it contains a dictionary with label distributions
    y_dist: dictionary - all label distributions
    
    Reference:
    Igor Kononenko. On biases in estimating multi-valued attributes. In IJCAI, volume 95, pages 1034-1040, 1995.
    """
    prior = multinomLog2(y_dist.values())
    prior += multinomLog2([len(y_dist.keys()) - 1, sum(y_dist.values())])

    post = 0
    for x_val in yx_dist:
        post += multinomLog2([x_val.get(c, 0) for c in y_dist.keys()])
        post += multinomLog2([len(y_dist.keys()) - 1, sum(x_val.values())])
    return (prior - post) / float(sum(y_dist.values()))


def mdl_nominal(x, y, separate_max):
    """
    Function calculates minimum description length for discrete features. If feature is continuous it is firstly discretized.

    x: numpy array - numerical or discrete feature
    y: numpy array - labels
    """
    x_vals = np.unique(x)  # unique values
    if len(x_vals) == 1:  # if there is just one unique value
        return None

    y_dist = Counter(y)  # label distribution
    # calculate distributions and splits in accordance with feature type
    dist, splits = nominal_splits(x, y, x_vals, y_dist, separate_max)
    prior_mdl = calc_mdl(dist, y_dist)

    max_mdl, max_i = 0, 1
    for i in range(1, len(dist)):
        # iter 0: take first distribution
        dist0_x = [el for el in dist[:i]]
        dist0_y = np.sum(dist0_x)
        post_mdl0 = calc_mdl(dist0_x, dist0_y)

        # iter 0: take the other distributions without first
        dist1_x = [el for el in dist[i:]]
        dist1_y = np.sum(dist1_x)
        post_mdl1 = calc_mdl(dist1_x, dist1_y)

        coef = np.true_divide([sum(dist0_y.values()), sum(dist1_y.values())], len(x))
        mdl_val = prior_mdl - np.dot(coef, [post_mdl0, post_mdl1])  # calculate mdl
        if mdl_val > max_mdl:
            max_mdl, max_i = mdl_val, i

            # store splits of maximal mdl in accordance with feature type
    split = [splits[:max_i], splits[max_i:]]
    return (max_mdl, split)


def mdl_numeric(x, y, accuracy):
    x_unique = list(np.unique(x))
    if len(x_unique) == 1:
        return None
    indices = x.argsort()  # sort numeric attribute
    x, y = x[indices], y[indices]  # save sorted features with sorted labels

    right_dist = np.bincount(y)
    dummy_class = np.array([len(right_dist)])
    class_indices = right_dist.nonzero()[0]
    right_dist = right_dist[class_indices]
    left_dist = np.zeros(len(class_indices))
    y_dist = Counter(dict(zip(class_indices, right_dist)))

    diffs = np.nonzero(y[:-1] != y[1:])[0] + 1  # different neighbor classes have value True
    if accuracy > 0:
        diffs = np.array([diffs[i] for i in range(1, len(diffs)) if diffs[i] - diffs[i - 1] > accuracy],
                         dtype=np.int32) if len(diffs) > 15 else diffs
    intervals = np.array((np.concatenate(([0], diffs[:-1])), diffs)).T

    dist = [Counter(dict(zip(class_indices, np.bincount(np.concatenate((dummy_class, y[i:j])))[class_indices]))) for
            i, j in intervals]

    prior_mdl = calc_mdl(dist, y_dist)
    max_mdl, max_i = 0, 0

    for i in range(1, len(dist)):
        # iter 0: take first distribution
        dist0_x = dist[:i]
        dist0_y = np.sum(dist0_x)
        post_mdl0 = calc_mdl(dist0_x, dist0_y)

        # iter 0: take the other distributions without first
        dist1_x = dist[i:]
        dist1_y = np.sum(dist1_x)
        post_mdl1 = calc_mdl(dist1_x, dist1_y)
        coef = np.true_divide([sum(dist0_y.values()), sum(dist1_y.values())], len(x))

        mdl_val = prior_mdl - np.dot(coef, [post_mdl0, post_mdl1])  # calculate mdl
        if mdl_val > max_mdl:
            max_mdl, max_i = mdl_val, i

    max_i, max_j = intervals[max_i]
    if x[max_i] == x[max_j]:
        ind = x_unique.index(x[max_i])
        mean = np.mean((x[1 if ind == 0 else ind - 1], x[ind]))
    else:
        mean = np.mean((x[max_i], x[max_j]))
    return (max_mdl, [mean, mean])