python source code of utils

"""Utility functions used throughout Distributed Keras."""

## BEGIN Import. ###############################################################

from keras import backend as K

from keras.models import model_from_json

from keras import backend as K

from pyspark.mllib.linalg import DenseVector
from pyspark.sql import Row
from pyspark.sql.functions import rand

import pickle

import json

import numpy as np

import os

import pwd

## END Import. #################################################################


def get_os_username():
    """Returns the username of user on the operating system.

    From: http://stackoverflow.com/questions/842059/is-there-a-portable-way-to-get-the-current-username-in-python
    """
    return pwd.getpwuid(os.getuid())[0]


def set_keras_base_directory(base_dir='/tmp/' + get_os_username()):
    """Sets the base directory of Keras."""
    K._keras_base_dir = base_dir


def to_one_hot_encoded_dense(value, n_dim=2):
    """Converts the value to a one-hot encoded vector.

    # Arguments
        value: float. Value of the single "hot" value.
        n_dim: int. Dimension of the output vector.
    """
    value = int(value)
    vector = np.zeros(n_dim)
    vector[value] = 1.0

    return vector


def new_dataframe_row(old_row, column_name, column_value):
    """Constructs a new Spark Row based on the old row, and a new column name and value."""
    row = Row(*(old_row.__fields__ + [column_name]))(*(old_row + (column_value, )))

    return row


def json_to_dataframe_row(string):
    """Converts a JSON String to a Spark Dataframe row."""
    dictionary = json.loads(string)
    row = Row(**dictionary)

    return row


def pickle_object(o):
    """Pickles the specified model and its weights."""
    return pickle.dumps(o, -1)


def unpickle_object(string):
    """Unpickles the specified string into a model."""
    return pickle.loads(string)


def serialize_keras_model(model):
    """Serializes the specified Keras model into a dictionary."""
    dictionary = {}
    dictionary['model'] = model.to_json()
    dictionary['weights'] = model.get_weights()

    return dictionary


def history_executors_average(history):
    """Returns the averaged training metrics for all the executors."""
    max_iteration = max(history, key=lambda x: x['iteration'])['iteration']
    max_executor = max(history, key=lambda x: x['worker_id'])['worker_id']
    histories = []
    averaged_history = []
    # Fetch the histories of the individual executors.
    for i in range(0, max_executor):
        histories.append(history_executor(history, i))
    # Construct the averaged history.
    for i in range(0, max_iteration):
        num_executors = 0
        sum = np.zeros(2)
        for j in range(0, max_executor):
            if len(histories[j]) - 1 >= i:
                num_executors += 1
                sum += histories[j][i]['history']
        # Average the history.
        sum /= num_executors
        averaged_history.append(sum)

    return averaged_history


def history_executor(history, id):
    """Returns the history of a specific executor."""
    executor_history = [h for h in history if h['worker_id'] == id]
    executor_history.sort(key=lambda x: x['iteration'])

    return executor_history


def deserialize_keras_model(dictionary):
    """Deserialized the Keras model using the specified dictionary."""
    architecture = dictionary['model']
    weights = dictionary['weights']
    model = model_from_json(architecture)
    model.set_weights(weights)

    return model


def uniform_weights(model, constraints=[-0.5, 0.5]):
    """Initializes the parameters of the specified Keras model with uniform
    weights between the specified ranges.

    # Arguments
        model: Keras model.
        constraints: array. An array with two elements which defines the range
                     of the uniform initalization.
    """
    # We assume the following: Keras will return a list of weight matrices.
    # All layers, even the activiation layers, will be randomly initialized.
    weights = model.get_weights()
    for layer in weights:
        shape = layer.shape
        if len(shape) > 1:
            # Fill the matrix with random numbers.
            n_rows = shape[0]
            n_columns = shape[1]
            for i in range(0, n_rows):
                for j in range(0, n_columns):
                    layer[i][j] = np.random.uniform(low=constraints[0], high=constraints[1])
        else:
            # Fill the vector with random numbers.
            n_elements = shape[0]
            for i in range(0, n_elements):
                layer[i] = np.random.uniform(low=constraints[0], high=constraints[1])
    # Set the new weights in the model.
    model.set_weights(weights)


def shuffle(dataset):
    """Shuffles the rows in the specified Spark Dataframe.

    # Arguments
        dataset: dataframe. A Spark Dataframe.
    """
    dataset = dataset.orderBy(rand())
    dataset.cache()

    return dataset


def precache(dataset, num_workers):
    """Precaches the specified dataset.

    Make sure the specified dataframe has the desired partitioning scheme.

    # Arguments
        dataset: dataframe. A Spark Dataframe.
        num_workers: int. Number of workers you are going to use.
    """
    dataset = dataset.repartition(num_workers)
    dataset.cache()
    dataset.count()

    return dataset