python source code of naivebayes

"""
Naive Bayes with MapReduce

Algorithm calculates multinomial distribution for discrete features and Gaussian distribution for numerical features.
The output of algorithm is consistent with implementation of Naive Bayes classifier in Orange and scikit-learn.

Reference:
MapReduce version of algorithm is proposed by Cheng-Tao Chu; Sang Kyun Kim, Yi-An Lin, YuanYuan Yu, Gary Bradski,
Andrew Ng, and Kunle Olukotun. "Map-Reduce for Machine Learning on Multicore". NIPS 2006.
"""


def simple_init(interface, params):
    return params


def map_fit(interface, state, label, inp):
    """
    Function counts occurrences of feature values for every row in given data chunk. For continuous features it returns
    number of values and it calculates mean and variance for every feature.
    For discrete features it counts occurrences of labels and values for every feature. It returns occurrences of pairs:
    label, feature index, feature values.
    """
    import numpy as np
    combiner = {}  # combiner used for joining of intermediate pairs
    out = interface.output(0)  # all outputted pairs have the same output label

    for row in inp:  # for every row in data chunk
        row = row.strip().split(state["delimiter"])  # split row
        if len(row) > 1:  # check if row is empty
            for i, j in enumerate(state["X_indices"]):  # for defined features
                if row[j] not in state["missing_vals"]:  # check missing values
                    # creates a pair - label, feature index
                    pair = row[state["y_index"]] + state["delimiter"] + str(j)

                    if state["X_meta"][i] == "c":  # continuous features
                        if pair in combiner:
                            # convert to float and store value
                            combiner[pair].append(np.float32(row[j]))
                        else:
                            combiner[pair] = [np.float32(row[j])]

                    else:  # discrete features
                        # add feature value to pair
                        pair += state["delimiter"] + row[j]
                        # increase counts of current pair
                        combiner[pair] = combiner.get(pair, 0) + 1

                    # increase label counts
                    combiner[row[state["y_index"]]] = combiner.get(row[state["y_index"]], 0) + 1

    for k, v in combiner.iteritems():  # all pairs in combiner are output
        if len(k.split(state["delimiter"])) == 2:  # continous features
            # number of elements, partial mean and variance
            out.add(k, (np.size(v), np.mean(v, dtype=np.float32), np.var(v, dtype=np.float32)))
        else:  # discrete features and labels
            out.add(k, v)


def reduce_fit(interface, state, label, inp):
    """
    Function separates aggregation of continuous and discrete features.
    For continuous features it aggregates partially calculated means and variances and returns them. For discrete
    features it aggregates pairs and returns them. Pairs with label occurrences are used to calculate prior probabilities
    """
    from disco.util import kvgroup  # function for grouping values by key
    import numpy as np

    out = interface.output(0)  # all outputted pairs have the same output label

    # model of naive Bayes stores label names, sum of all label occurrences and pairs
    # (feature index, feature values) for discrete features which are needed to optimize predict phase.
    fit_model = {"y_labels": [], "y_sum": 0, "iv": set()}
    combiner = {}  # combiner maintains correct order of means and variances.
    means, variances = [], []
    k_prev = ""

    for key, value in kvgroup(inp):  # input pairs are sorted and grouped by key
        k_split = key.split(state["delimiter"])  # pair is split

        if len(k_split) == 3:  # discrete features
            # store pair (feature index, feature value)
            fit_model["iv"].add(tuple(k_split[1:]))
            # aggregate and output occurrences of a pair
            out.add(tuple(k_split), sum(value))

        elif len(k_split) == 2:  # continuous features

            # if label is different than previous.
            # This enables calculation of all variances and means for every feature for current label.
            if k_split[0] != k_prev and k_prev != "":
                mean, var = zip(*[combiner[key] for key in sorted(combiner.keys())])
                means.append(mean)
                variances.append(var)

            # number of elements, partial mean, partial variance.
            n_a = mean_a = var_a = 0
            # code aggregates partially calculated means and variances
            for n_b, mean_b, var_b in value:
                n_ab = n_a + n_b
                var_a = ((n_a * var_a + n_b * var_b) / float(n_ab)) + (
                    n_a * n_b * ((mean_b - mean_a) / float(n_ab)) ** 2)
                mean_a = (n_a * mean_a + n_b * mean_b) / float(n_ab)
                n_a = n_ab

            # maintains correct order of statistics for every feature
            combiner[int(k_split[1])] = (mean_a, var_a + 1e-9)
            k_prev = k_split[0]

        else:  # aggregates label occurrences
            fit_model[key] = np.sum(value)
            fit_model["y_sum"] += fit_model[key]  # sum of all label occurrences
            fit_model["y_labels"].append(key)

    # if statistics for continuous features were not output in last iteration
    if len(means) > 0:
        mean, var = zip(*[combiner[key] for key in sorted(combiner.keys())])
        out.add("mean", np.array(means + [mean], dtype=np.float32))
        variances = np.array(variances + [var], dtype=np.float32)
        out.add("var", variances)
        out.add("var_log", np.log(np.pi * variances))

    # calculation of prior probabilities
    prior = [fit_model[y_label] / float(fit_model["y_sum"]) for y_label in fit_model["y_labels"]]
    out.add("prior", np.array(prior, dtype=np.float32))
    out.add("prior_log", np.log(prior))
    out.add("iv", list(fit_model["iv"]))
    out.add("y_labels", fit_model["y_labels"])


def map_predict(interface, state, label, inp):
    """
    Function makes a predictions of samples with given model. It calculates probabilities with multinomial and Gaussian distribution.
    """
    import numpy as np
    out = interface.output(0)

    continuous = [j for i, j in enumerate(state["X_indices"]) if
                  state["X_meta"][i] == "c"]  # indices of continuous features
    discrete = [j for i, j in enumerate(state["X_indices"]) if
                state["X_meta"][i] == "d"]  # indices of discrete features

    cont = True if len(continuous) > 0 else False  # enables calculation of Gaussian probabilities
    disc = True if len(discrete) > 0 else False  # enables calculation of multinomial probabilities.

    for row in inp:
        row = row.strip().split(state["delimiter"])
        if len(row) > 1:  # if row is empty
            # set id of a sample
            x_id = "" if state["id_index"] == -1 else row[state["id_index"]]
            # initialize prior probability for all labels
            probs = state["fit_model"]["prior_log"]

            if cont:  # continuous features
                x = np.array([(0 if row[j] in state["missing_vals"] else float(row[j])) for j in
                              continuous])  # sets selected features of the sample
                # Gaussian distribution
                probs = probs - 0.5 * np.sum(
                    np.true_divide((x - state["fit_model"]["mean"]) ** 2, state["fit_model"]["var"]) +
                    state["fit_model"]["var_log"], axis=1)

            if disc:  # discrete features
                # multinomial distribution
                probs = probs + np.sum(
                    [(0 if row[i] in state["missing_vals"] else state["fit_model"].get((str(i), row[i]), np.zeros(1)))
                     for i in discrete], axis=0)

            # normalize by P(x) = P(f_1, ..., f_n)
            log_prob_x = np.log(np.sum(np.exp(probs)))
            probs = np.exp(np.array(probs) - log_prob_x)
            # Predicted label is the one with highest probability
            y_predicted = max(zip(probs, state["fit_model"]["y_labels"]))[1]
            out.add(x_id, (y_predicted, probs.tolist()))


def fit(dataset, save_results=True, show=False):
    """
    Function builds a model for Naive Bayes. It executes multiple map functions and one reduce function which aggregates intermediate results and returns a model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers, sorts intermediate pairs and joins them with one reducer
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True))]

    job.params = dataset.params  # job parameters (dataset object)
    # define name of a job and input data urls
    job.run(name="naivebayes_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"naivebayes_fitmodel": fitmodel_url}  # return results url


def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    m - m estimate is used with discrete features
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    try:
        m = float(m)
    except ValueError:
        raise Exception("Parameter m should be numerical.")

    if "naivebayes_fitmodel" in fitmodel_url:
        # fit model is loaded from ddfs
        fit_model = dict((k, v) for k, v in result_iterator(fitmodel_url["naivebayes_fitmodel"]))
        if len(fit_model["y_labels"]) < 2:
            print "There is only one class in training data."
            return []
    else:
        raise Exception("Incorrect fit model.")

    if dataset.params["X_meta"].count("d") > 0:  # if there are discrete features in the model
        # code calculates logarithms to optimize predict phase as opposed to calculation by every mapped.
        np.seterr(divide='ignore')
        for iv in fit_model["iv"]:
            dist = [fit_model.pop((y,) + iv, 0) for y in fit_model["y_labels"]]
            fit_model[iv] = np.nan_to_num(
                np.log(np.true_divide(np.array(dist) + m * fit_model["prior"], np.sum(dist) + m))) - fit_model[
                                "prior_log"]
        del (fit_model["iv"])

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes execution of mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["fit_model"] = fit_model
    # define name of a job and input data urls
    job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results