```"""
Naive Bayes with MapReduce

The algorithm fits a multinomial distribution to discrete features and a Gaussian distribution to numerical features.
Its output is consistent with the Naive Bayes classifiers implemented in Orange and scikit-learn.

Reference:
The MapReduce version of the algorithm was proposed by Cheng-Tao Chu, Sang Kyun Kim, Yi-An Lin, YuanYuan Yu, Gary Bradski,
Andrew Ng, and Kunle Olukotun. "Map-Reduce for Machine Learning on Multicore". NIPS 2006.
"""

def simple_init(interface, params):
    """
    Initialise the state of a pipeline stage.

    The job parameters are handed back untouched, so the value returned here is
    the very same object that was passed in as ``params``.

    Parameters
    ----------
    interface - stage interface (not used)
    params - job parameters

    Returns
    -------
    The ``params`` object itself
    """
    state = params
    return state

def map_fit(interface, state, label, inp):
    """
    Collect per-chunk statistics needed to fit the Naive Bayes model.

    Every non-empty row of the chunk is split by ``state["delimiter"]``. Values
    listed in ``state["missing_vals"]`` are skipped. Intermediate results are
    joined in a local combiner and written to output 0 once the whole chunk
    has been read. Three kinds of pairs are emitted:

    * ``"<label><delimiter><feature index>"`` -> ``(n, mean, variance)``
      for continuous features (``state["X_meta"][i] == "c"``),
    * ``"<label><delimiter><feature index><delimiter><value>"`` -> count
      for all other (discrete) features,
    * ``"<label>"`` -> number of rows with that label.

    Parameters
    ----------
    interface - stage interface; only ``interface.output(0)`` is used
    state - job parameters (delimiter, X_indices, X_meta, y_index, missing_vals)
    label - input label (not used)
    inp - iterable of text rows

    Returns
    -------
    None; the pairs are added to output 0
    """
    import numpy as np

    combiner = {}  # combiner used for joining of intermediate pairs
    out = interface.output(0)  # all outputted pairs have the same output label

    delimiter = state["delimiter"]
    missing_vals = state["missing_vals"]
    y_index = state["y_index"]

    for row in inp:  # for every row in data chunk
        row = row.strip().split(delimiter)  # split row
        if len(row) <= 1:  # skip empty rows
            continue

        y = row[y_index]
        for i, j in enumerate(state["X_indices"]):  # for defined features
            if row[j] in missing_vals:  # missing values do not contribute
                continue

            # creates a pair - label, feature index
            pair = y + delimiter + str(j)

            if state["X_meta"][i] == "c":  # continuous features
                # convert to float and store value
                combiner.setdefault(pair, []).append(np.float32(row[j]))
            else:  # discrete features
                # add feature value to pair and increase its count
                pair += delimiter + row[j]
                combiner[pair] = combiner.get(pair, 0) + 1

        # increase label counts (once per row, needed for prior probabilities)
        combiner[y] = combiner.get(y, 0) + 1

    # items() instead of iteritems() keeps the function usable on Python 2 and 3
    for k, v in combiner.items():  # all pairs in combiner are output
        if len(k.split(delimiter)) == 2:  # continuous features
            # number of elements, partial mean and variance
            out.add(k, (np.size(v), np.mean(v, dtype=np.float32), np.var(v, dtype=np.float32)))
        else:  # discrete features and labels
            # plain occurrence counts are passed on unchanged
            out.add(k, v)

def reduce_fit(interface, state, label, inp):
    """
    Aggregate the pairs produced by ``map_fit`` into a Naive Bayes model.

    Keys are told apart by the number of parts obtained when splitting them by
    ``state["delimiter"]``:

    * three parts (label, feature index, feature value) - discrete feature;
      counts are summed and output under the key ``(label, index, value)``,
    * two parts (label, feature index) - continuous feature; partial
      ``(n, mean, variance)`` triples are merged into one mean and variance,
    * one part (label) - label occurrences, used for prior probabilities.

    The input has to be sorted by key so that all keys of one label are
    adjacent; statistics of continuous features are flushed whenever the label
    part of the key changes.

    Besides the discrete counts, the following entries are written to
    output 0: ``mean``, ``var``, ``var_log`` (only when continuous statistics of
    more than one label were seen), ``prior``, ``prior_log``, ``iv`` and
    ``y_labels``.

    Parameters
    ----------
    interface - stage interface; only ``interface.output(0)`` is used
    state - job parameters; only ``state["delimiter"]`` is read
    label - input label (not used)
    inp - iterable of (key, value) pairs sorted by key

    Returns
    -------
    None; the model entries are added to output 0
    """
    from disco.util import kvgroup  # function for grouping values by key
    import numpy as np

    out = interface.output(0)  # all outputted pairs have the same output label
    delimiter = state["delimiter"]

    # Label names and their counts are kept apart from any other bookkeeping,
    # so a label can never clash with an internal key.
    y_labels, y_counts = [], []
    # pairs (feature index, feature value) of discrete features; they are
    # needed to optimize the predict phase
    iv = set()
    combiner = {}  # feature index -> (mean, variance); keeps feature order
    means, variances = [], []
    k_prev = ""  # label of the previously processed continuous key

    for key, value in kvgroup(inp):  # input pairs are sorted and grouped by key
        k_split = key.split(delimiter)  # pair is split

        if len(k_split) == 3:  # discrete features
            # store pair (feature index, feature value)
            iv.add(tuple(k_split[1:]))
            # aggregate and output occurrences of a pair
            out.add(tuple(k_split), sum(value))

        elif len(k_split) == 2:  # continuous features
            y, index = k_split

            # If the label differs from the previous one, all statistics of
            # the previous label are complete and can be stored.
            if y != k_prev and k_prev != "":
                mean, var = zip(*[combiner[idx] for idx in sorted(combiner)])
                means.append(mean)
                variances.append(var)

            # number of elements, partial mean, partial variance
            n_a = mean_a = var_a = 0
            # pairwise merge of partially calculated means and variances
            for n_b, mean_b, var_b in value:
                n_ab = n_a + n_b
                var_a = ((n_a * var_a + n_b * var_b) / float(n_ab)) + (
                    n_a * n_b * ((mean_b - mean_a) / float(n_ab)) ** 2)
                mean_a = (n_a * mean_a + n_b * mean_b) / float(n_ab)
                n_a = n_ab

            # Keyed by the integer feature index so that sorting restores the
            # feature order; the small constant keeps the variance non-zero.
            combiner[int(index)] = (mean_a, var_a + 1e-9)
            k_prev = y

        else:  # aggregates label occurrences
            # builtin sum: value is an iterator and np.sum on iterators is deprecated
            y_labels.append(key)
            y_counts.append(sum(value))

    # statistics of the last label were not stored inside the loop
    if means:
        mean, var = zip(*[combiner[idx] for idx in sorted(combiner)])
        out.add("mean", np.array(means + [mean], dtype=np.float32))
        variances = np.array(variances + [var], dtype=np.float32)
        out.add("var", variances)
        # log(2 * pi * var) is the normalisation term of the Gaussian
        # log-density; it is precomputed once for the predict phase
        out.add("var_log", np.log(2 * np.pi * variances))

    # calculation of prior probabilities
    y_sum = float(sum(y_counts))  # sum of all label occurrences
    prior = [count / y_sum for count in y_counts]
    out.add("prior", np.array(prior, dtype=np.float32))
    out.add("prior_log", np.log(prior))
    out.add("iv", list(iv))
    out.add("y_labels", y_labels)

def map_predict(interface, state, label, inp):
    """
    Predict a label for every row of a data chunk with a fitted model.

    Starting from the logarithm of the prior probabilities, the function adds
    Gaussian log-likelihoods for continuous features (``"c"`` in
    ``state["X_meta"]``) and the precomputed log terms stored in the model
    under ``(str(feature index), feature value)`` for discrete features
    (``"d"``). The result is normalised to posterior probabilities.

    For every non-empty row the pair
    ``(sample id, (predicted label, list of probabilities))`` is added to
    output 0. The sample id is ``""`` when ``state["id_index"]`` is -1.

    Parameters
    ----------
    interface - stage interface; only ``interface.output(0)`` is used
    state - job parameters including ``state["fit_model"]``
    label - input label (not used)
    inp - iterable of text rows

    Returns
    -------
    None; predictions are added to output 0
    """
    import numpy as np

    out = interface.output(0)

    fit_model = state["fit_model"]
    delimiter = state["delimiter"]
    missing_vals = state["missing_vals"]
    id_index = state["id_index"]
    y_labels = fit_model["y_labels"]
    prior_log = np.asarray(fit_model["prior_log"], dtype=float)

    # indices of continuous features
    continuous = [j for i, j in enumerate(state["X_indices"]) if state["X_meta"][i] == "c"]
    # indices of discrete features
    discrete = [j for i, j in enumerate(state["X_indices"]) if state["X_meta"][i] == "d"]

    if continuous:
        # NOTE(review): assumes the columns of mean/var follow the order of the
        # continuous entries of X_indices - confirm against the fit phase.
        mean = fit_model["mean"]
        var = fit_model["var"]
        var_log = fit_model["var_log"]

    for row in inp:
        row = row.strip().split(delimiter)
        if len(row) <= 1:  # skip empty rows
            continue

        # set id of a sample
        x_id = "" if id_index == -1 else row[id_index]
        # initialize with prior probability for all labels
        probs = prior_log

        if continuous:
            # selected features of the sample; missing values are replaced by 0
            x = np.array([(0 if row[j] in missing_vals else float(row[j])) for j in continuous])
            # Gaussian distribution
            probs = probs - 0.5 * np.sum(np.true_divide((x - mean) ** 2, var) + var_log, axis=1)

        # Multinomial distribution. Terms are added one by one: missing or
        # unseen values contribute nothing, and no ragged list of scalars and
        # arrays has to be turned into an array.
        for j in discrete:
            if row[j] not in missing_vals:
                probs = probs + fit_model.get((str(j), row[j]), 0.0)

        # Normalize by P(x) = P(f_1, ..., f_n). The maximum is subtracted
        # first, so exp() cannot underflow to zero for every label at once.
        probs = np.exp(probs - np.max(probs))
        probs = probs / np.sum(probs)

        # Predicted label is the one with highest probability
        y_predicted = max(zip(probs, y_labels))[1]
        out.add(x_id, (y_predicted, probs.tolist()))

def fit(dataset, save_results=True, show=False):
    """
    Build a Naive Bayes model with a two-stage MapReduce job.

    The job runs ``map_fit`` on every split of the input and a single
    ``reduce_fit`` over all sorted intermediate pairs.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with the urls of fit model results under the key
    "naivebayes_fitmodel"
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    params = dataset.params

    # the worker decides whether results are saved to ddfs
    worker = Worker(save_results=save_results)
    job = Job(worker=worker)

    # mappers run in parallel; their sorted output is joined by one reducer
    map_stage = Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)
    job.pipeline = [("split", map_stage), ("group_all", reduce_stage)]

    # the dataset parameters become the job parameters
    job.params = params

    # start the job under its name on the input data urls and wait for it
    job.run(name="naivebayes_fit", input=params["data_tag"])
    model_urls = job.wait(show=show)

    return {"naivebayes_fitmodel": model_urls}

def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
    """
    Start a job that makes predictions for the input data with a given model.

    For discrete features the logarithms of the m-estimated conditional
    probabilities are computed here once, instead of in every mapper.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    m - m estimate is used with discrete features
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of predictions on ddfs, or an empty list when the model contains
    fewer than two classes

    Raises
    ------
    Exception - if ``m`` is not numerical or ``fitmodel_url`` does not contain
    the key "naivebayes_fitmodel"
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    try:
        m = float(m)
    except (TypeError, ValueError):  # float(None) raises TypeError, float("x") ValueError
        raise Exception("Parameter m should be numerical.")

    if "naivebayes_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    # fit model is loaded from ddfs
    fit_model = dict(result_iterator(fitmodel_url["naivebayes_fitmodel"]))
    if len(fit_model["y_labels"]) < 2:
        # the call form of print works on Python 2 and Python 3
        print("There is only one class in training data.")
        return []

    if dataset.params["X_meta"].count("d") > 0:  # if there are discrete features in the model
        # log(0) is expected for unseen (label, value) combinations; the warning
        # is silenced only inside this block instead of for the whole process
        with np.errstate(divide="ignore"):
            for iv in fit_model["iv"]:
                # occurrences of the pair (feature index, value) for every label
                dist = [fit_model.pop((y,) + iv, 0) for y in fit_model["y_labels"]]
                fit_model[iv] = np.nan_to_num(
                    np.log(np.true_divide(np.array(dist) + m * fit_model["prior"], np.sum(dist) + m))
                ) - fit_model["prior_log"]
        del fit_model["iv"]

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes execution of mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    # Job parameters are a copy of the dataset parameters, so the (possibly
    # large) model is not left behind in the caller's dataset object.
    # Assumes dataset.params is a mapping - it is indexed by key above.
    params = dict(dataset.params)
    params["fit_model"] = fit_model
    job.params = params

    # define name of a job and input data urls
    job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results
```