```"""
Module measures contains:
function for calculating information gain
function for calculating minimum description length
heuristic for searching best binary split of nominal values
function for equal frequency label discretization of numerical values
function for random discretization of numerical values
"""

import math
from collections import Counter

import numpy as np

def mdl(x, y, ft, accuracy, separate_max):
    """Dispatch the MDL calculation by feature type ("d" - discrete, otherwise numeric)."""
    return mdl_nominal(x, y, separate_max) if ft == "d" else mdl_numeric(x, y, accuracy)

def info_gain(x, y, ft, accuracy, separate_max):
    """Dispatch the information gain calculation by feature type ("d" - discrete, otherwise numeric)."""
    return info_gain_nominal(x, y, separate_max) if ft == "d" else info_gain_numeric(x, y, accuracy)
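
# A minimal, assumed usage of the two dispatchers above (not from the original
# module). Labels must be small non-negative integers, since the numeric
# routines rely on np.bincount; both calls return None when no useful split
# exists, otherwise a (score, split) pair.
#
#   x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], dtype=np.float32)
#   y = np.array([0, 0, 1, 1, 0, 0])
#   info_gain(x, y, "c", 0, False)  # numeric path, accuracy=0
#   mdl(x, y, "c", 0, False)        # numeric path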

def nominal_splits(x, y, x_vals, y_dist, separate_max):
    """
    Use a heuristic to find the best binary split of nominal values. The heuristic is described in (1) and is
    originally defined for binary classes. We extend it to multiple classes by comparing the label with the fewest
    (or, if separate_max is set, the most) samples against the others.

    x: numpy array - nominal feature
    y: numpy array - labels
    x_vals: numpy array - unique nominal values of x
    y_dist: dictionary - distribution of labels
    separate_max: boolean - if True, separate the majority label instead of the minority one

    Reference:
    (1) Breiman, Friedman, Olshen, and Stone. Classification and Regression Trees, pages 101-102.
    """
    # select the label with the most (or fewest) samples
    y_val = max(y_dist, key=y_dist.get) if separate_max else min(y_dist, key=y_dist.get)

    prior = y_dist[y_val] / float(len(y))  # prior probability of the selected label

    values, dist, splits = [], [], []
    for x_val in x_vals:  # for every unique nominal value
        dist.append(Counter(y[x == x_val]))  # distribution of labels at the selected nominal value
        splits.append(x_val)
        suma = sum([prior * dist[-1][y_key] for y_key in y_dist.keys()])
        # estimate the probability of the selected label given this nominal value
        values.append(prior * dist[-1][y_val] / float(suma))
    indices = np.array(values).argsort()[::-1]

    # distributions and splits are sorted by decreasing probability
    return np.array(dist)[indices], np.array(splits)[indices].tolist()
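
# Illustrative behavior of nominal_splits() on assumed data: nominal values
# are ordered by the estimated probability of the selected (minority) label,
# so values dominated by that label come first.
#
#   x = np.array(["a", "b", "c", "a", "b", "c"])
#   y = np.array([0, 0, 1, 0, 0, 1])
#   dist, splits = nominal_splits(x, y, np.unique(x), Counter(y), False)
#   # splits -> ["c", "b", "a"]; "c" holds every sample of the minority class 1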

def h(values):
    """
    Calculate the entropy of a distribution.

    values: list or array of positive integers - class counts (callers must
    filter out zero counts, since log2(0) is undefined)
    """
    ent = np.true_divide(values, np.sum(values))
    return -np.sum(np.multiply(ent, np.log2(ent)))
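
# Illustrative sanity checks for h() (assumed values, not from the original
# module): a balanced binary distribution carries one bit of entropy, a skewed
# one carries less, and a pure node carries none.
#
#   h([5, 5])  # -> 1.0
#   h([9, 1])  # -> ~0.469
#   h([10])    # -> -0.0 (pure node)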

def info_gain_nominal(x, y, separate_max):
    """
    Calculate information gain for a discrete (nominal) feature and find its best binary split.

    x: numpy array - discrete feature
    y: numpy array - labels
    separate_max: boolean - if True, separate the majority label instead of the minority one
    """
    x_vals = np.unique(x)  # unique values
    if len(x_vals) < 3:  # not enough unique values for the split search
        return None
    y_dist = Counter(y)  # label distribution
    h_y = h(list(y_dist.values()))  # class entropy

    # calculate label distributions and candidate splits, ordered by the heuristic
    dist, splits = nominal_splits(x, y, x_vals, y_dist, separate_max)

    # with many distinct values, scan every tenth split point first, then refine around the best one
    indices, repeat = (range(1, len(dist)), 1) if len(dist) < 50 else (range(1, len(dist), len(dist) // 10), 3)
    interval = len(dist) // 10

    max_ig, max_i, iteration = 0, 1, 0
    while iteration < repeat:
        for i in indices:
            dist0 = np.sum([el for el in dist[:i]])  # merged label distribution of the left branch
            dist1 = np.sum([el for el in dist[i:]])  # merged label distribution of the right branch
            coef = np.true_divide([sum(dist0.values()), sum(dist1.values())], len(y))
            ig = h_y - np.dot(coef, [h(list(dist0.values())), h(list(dist1.values()))])  # calculate information gain
            if ig > max_ig:
                max_ig, max_i = ig, i  # store index and value of maximal information gain
        iteration += 1
        if repeat > 1:
            # halve the step and rescan around the current best split point
            interval = int(interval * 0.5)
            if max_i in indices and interval > 0:
                middle_index = indices.index(max_i)
            else:
                break
            min_index = middle_index if middle_index == 0 else middle_index - 1
            max_index = middle_index if middle_index == len(indices) - 1 else middle_index + 1
            indices = range(indices[min_index], indices[max_index], interval)

    # return the splits of maximal information gain as two lists: left and right branch
    return float(max_ig), [splits[:max_i], splits[max_i:]]
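
# Illustrative info_gain_nominal() call on assumed data: "a" is pure in
# class 0, so the best binary split separates it from "b" and "c".
#
#   x = np.array(["a", "a", "b", "b", "c", "c"])
#   y = np.array([0, 0, 1, 1, 0, 1])
#   info_gain_nominal(x, y, False)
#   # -> (~0.459, [["a"], ["c", "b"]])  (tie-breaking may vary the grouping)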

def info_gain_numeric(x, y, accuracy):
    """
    Calculate information gain for a numerical feature and find its best binary split.

    x: numpy array - numerical feature
    y: numpy array - labels (non-negative integers)
    accuracy: int - minimal distance between two class changes for a candidate split point (0 checks every change)
    """
    x_unique = list(np.unique(x))
    if len(x_unique) == 1:
        return None
    indices = x.argsort()  # sort the numeric attribute
    x, y = x[indices], y[indices]  # keep features and labels aligned

    right_dist = np.bincount(y)
    # a dummy class one past the largest label pads later bincounts to a fixed length
    dummy_class = np.array([len(right_dist)])
    class_indices = right_dist.nonzero()[0]
    right_dist = right_dist[class_indices]
    left_dist = np.zeros(len(class_indices))

    diffs = np.nonzero(y[:-1] != y[1:])[0] + 1  # indices where the class label changes between neighbors
    if accuracy > 0:
        # with many change points, keep only those at least `accuracy` samples apart
        diffs = np.array([diffs[i] for i in range(1, len(diffs)) if diffs[i] - diffs[i - 1] > accuracy],
                         dtype=np.int32) if len(diffs) > 15 else diffs
    if len(diffs) < 2:  # too few class changes to split on
        return None
    intervals = np.array((np.concatenate(([0], diffs[:-1])), diffs)).T

    max_ig, max_i, max_j = 0, 0, 0
    prior_h = h(right_dist)  # prior entropy

    for i, j in intervals:
        # move the samples of interval [i, j) from the right branch to the left branch
        dist = np.bincount(np.concatenate((dummy_class, y[i:j])))[class_indices]
        left_dist += dist
        right_dist -= dist
        coef = np.true_divide((np.sum(left_dist), np.sum(right_dist)), len(y))
        ig = prior_h - np.dot(coef, [h(left_dist[left_dist.nonzero()]), h(right_dist[right_dist.nonzero()])])
        if ig > max_ig:
            max_ig, max_i, max_j = ig, i, j

    # the split threshold is the mean of the values around the best boundary;
    # equal endpoints fall back to neighboring unique values
    if x[max_i] == x[max_j]:
        ind = x_unique.index(x[max_i])
        mean = np.float32(np.mean((x_unique[1 if ind == 0 else ind - 1], x_unique[ind])))
    else:
        mean = np.float32(np.mean((x[max_i], x[max_j])))

    return float(max_ig), [mean, mean]
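
# Illustrative info_gain_numeric() call on assumed data: the class label
# changes twice in the sorted order, and the split threshold is returned
# twice, as a [mean, mean] pair.
#
#   x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], dtype=np.float32)
#   y = np.array([0, 0, 1, 1, 0, 0])
#   info_gain_numeric(x, y, 0)
#   # -> (~0.252, [2.0, 2.0])  (both boundaries tie; the first one wins)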

def multinomLog2(selectors):
    """
    Calculate the base-2 logarithm of a multinomial coefficient:
    log2(N! / (n1! * n2! * ...)), where N is the sum of the selectors.

    selectors: list of integers
    """
    ln2 = 0.69314718055994528622
    noAll = sum(selectors)
    lgNf = math.lgamma(noAll + 1.0) / ln2  # log2(N!)

    lgnFac = []
    for selector in selectors:
        if selector == 0 or selector == 1:
            lgnFac.append(0.0)  # log2(0!) = log2(1!) = 0
        elif selector == 2:
            lgnFac.append(1.0)  # log2(2!) = 1
        elif selector == noAll:
            lgnFac.append(lgNf)
        else:
            lgnFac.append(math.lgamma(selector + 1.0) / ln2)
    return lgNf - sum(lgnFac)
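
# Illustrative check for multinomLog2() (assumed values): arranging two items
# of one kind and two of another gives C(4; 2, 2) = 6 arrangements, i.e.
# log2(6) bits.
#
#   multinomLog2([2, 2])  # -> ~2.585
#   multinomLog2([4, 0])  # -> 0.0 (only one arrangement)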

def calc_mdl(yx_dist, y_dist):
    """
    Calculate MDL from the given label distributions.

    yx_dist: list of dictionaries - label distribution of every split
    y_dist: dictionary - overall label distribution

    Reference:
    Igor Kononenko. On biases in estimating multi-valued attributes. In IJCAI, volume 95, pages 1034-1040, 1995.
    """
    prior = multinomLog2(y_dist.values())
    prior += multinomLog2([len(y_dist.keys()) - 1, sum(y_dist.values())])

    post = 0
    for x_val in yx_dist:
        post += multinomLog2([x_val.get(c, 0) for c in y_dist.keys()])
        post += multinomLog2([len(y_dist.keys()) - 1, sum(x_val.values())])
    return (prior - post) / float(sum(y_dist.values()))
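
# Illustrative calc_mdl() calls on assumed distributions: a split that
# separates the classes perfectly compresses the labels well and scores
# clearly positive, while an uninformative split scores slightly negative.
#
#   y_dist = Counter({0: 4, 1: 4})
#   calc_mdl([Counter({0: 4}), Counter({1: 4})], y_dist)               # -> ~0.58
#   calc_mdl([Counter({0: 2, 1: 2}), Counter({0: 2, 1: 2})], y_dist)   # -> ~-0.06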

def mdl_nominal(x, y, separate_max):
    """
    Calculate minimum description length for a discrete (nominal) feature and find its best binary split.

    x: numpy array - discrete feature
    y: numpy array - labels
    separate_max: boolean - if True, separate the majority label instead of the minority one
    """
    x_vals = np.unique(x)  # unique values
    if len(x_vals) == 1:  # a single unique value cannot be split
        return None

    y_dist = Counter(y)  # label distribution
    # calculate label distributions and candidate splits, ordered by the heuristic
    dist, splits = nominal_splits(x, y, x_vals, y_dist, separate_max)
    prior_mdl = calc_mdl(dist, y_dist)

    max_mdl, max_i = 0, 1
    for i in range(1, len(dist)):
        # left branch: first i distributions
        dist0_x = [el for el in dist[:i]]
        dist0_y = np.sum(dist0_x)
        post_mdl0 = calc_mdl(dist0_x, dist0_y)

        # right branch: remaining distributions
        dist1_x = [el for el in dist[i:]]
        dist1_y = np.sum(dist1_x)
        post_mdl1 = calc_mdl(dist1_x, dist1_y)

        coef = np.true_divide([sum(dist0_y.values()), sum(dist1_y.values())], len(x))
        mdl_val = prior_mdl - np.dot(coef, [post_mdl0, post_mdl1])  # calculate MDL
        if mdl_val > max_mdl:
            max_mdl, max_i = mdl_val, i

    # return the splits of maximal MDL as two lists: left and right branch
    split = [splits[:max_i], splits[max_i:]]
    return (max_mdl, split)
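
# Illustrative mdl_nominal() call on the same assumed data as the
# nominal_splits() example: "c" carries the whole minority class, so it is
# split off from "a" and "b".
#
#   x = np.array(["a", "b", "c", "a", "b", "c"])
#   y = np.array([0, 0, 1, 0, 0, 1])
#   mdl_nominal(x, y, False)  # -> (~0.33, [["c"], ["b", "a"]])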

def mdl_numeric(x, y, accuracy):
    """
    Calculate minimum description length for a numerical feature and find its best binary split.

    x: numpy array - numerical feature
    y: numpy array - labels (non-negative integers)
    accuracy: int - minimal distance between two class changes for a candidate split point (0 checks every change)
    """
    x_unique = list(np.unique(x))
    if len(x_unique) == 1:
        return None
    indices = x.argsort()  # sort the numeric attribute
    x, y = x[indices], y[indices]  # keep features and labels aligned

    right_dist = np.bincount(y)
    # a dummy class one past the largest label pads later bincounts to a fixed length
    dummy_class = np.array([len(right_dist)])
    class_indices = right_dist.nonzero()[0]
    right_dist = right_dist[class_indices]
    left_dist = np.zeros(len(class_indices))
    y_dist = Counter(dict(zip(class_indices, right_dist)))

    diffs = np.nonzero(y[:-1] != y[1:])[0] + 1  # indices where the class label changes between neighbors
    if accuracy > 0:
        # with many change points, keep only those at least `accuracy` samples apart
        diffs = np.array([diffs[i] for i in range(1, len(diffs)) if diffs[i] - diffs[i - 1] > accuracy],
                         dtype=np.int32) if len(diffs) > 15 else diffs
    if len(diffs) < 2:  # too few class changes to split on (mirrors info_gain_numeric)
        return None
    intervals = np.array((np.concatenate(([0], diffs[:-1])), diffs)).T

    # label distribution of every interval between class changes
    dist = [Counter(dict(zip(class_indices, np.bincount(np.concatenate((dummy_class, y[i:j])))[class_indices])))
            for i, j in intervals]

    prior_mdl = calc_mdl(dist, y_dist)
    max_mdl, max_i = 0, 0

    for i in range(1, len(dist)):
        # left branch: first i interval distributions
        dist0_x = dist[:i]
        dist0_y = np.sum(dist0_x)
        post_mdl0 = calc_mdl(dist0_x, dist0_y)

        # right branch: remaining interval distributions
        dist1_x = dist[i:]
        dist1_y = np.sum(dist1_x)
        post_mdl1 = calc_mdl(dist1_x, dist1_y)
        coef = np.true_divide([sum(dist0_y.values()), sum(dist1_y.values())], len(x))

        mdl_val = prior_mdl - np.dot(coef, [post_mdl0, post_mdl1])  # calculate MDL
        if mdl_val > max_mdl:
            max_mdl, max_i = mdl_val, i

    # the split threshold is the mean of the values around the best boundary;
    # equal endpoints fall back to neighboring unique values
    max_i, max_j = intervals[max_i]
    if x[max_i] == x[max_j]:
        ind = x_unique.index(x[max_i])
        mean = np.mean((x_unique[1 if ind == 0 else ind - 1], x_unique[ind]))
    else:
        mean = np.mean((x[max_i], x[max_j]))
    return (max_mdl, [mean, mean])
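
# Illustrative mdl_numeric() call, mirroring the info_gain_numeric() example
# (assumed data): here the MDL criterion selects the second class boundary.
#
#   x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], dtype=np.float32)
#   y = np.array([0, 0, 1, 1, 0, 0])
#   mdl_numeric(x, y, 0)  # -> (~0.59, [4.0, 4.0])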
```