# -*- coding: utf-8 -*-
"""
Created on Sun Feb 28 09:12:36 2016

@module:    choice_calcs.py
@name:      Choice Calculations
@author:    Timothy Brathwaite
@summary:   Contains generic functions necessary for calculating choice
            probabilities and for estimating the choice models.
"""
import numpy as np
import scipy.stats
import scipy.linalg
from scipy.linalg import block_diag
from scipy.sparse import hstack

try:
    # Python 3.x does not natively support xrange
    from past.builtins import xrange
except ImportError:
    pass

# Define the boundary values which are not to be exceeded during computation
min_exponent_val = -700
max_exponent_val = 700

max_comp_value = 1e300
min_comp_value = 1e-300
# The value below was determined since its the smallest value, x, for which
# 1 - x != 1
# min_comp_value = 1e-16


def calc_probabilities(beta,
                       design,
                       alt_IDs,
                       rows_to_obs,
                       rows_to_alts,
                       utility_transform,
                       intercept_params=None,
                       shape_params=None,
                       chosen_row_to_obs=None,
                       return_long_probs=False):
    """
    Parameters
    ----------
    beta : 1D or 2D ndarray.
        All elements should by ints, floats, or longs. If 1D, should have 1
        element for each utility coefficient being estimated (i.e.
        num_features). If 2D, should have 1 column for each set of coefficients
        being used to predict the probabilities of each alternative being
        chosen. There should be one row per index coefficient.
    design : 2D or 3D ndarray.
        There should be one row per observation per available alternative.
        There should be one column per utility coefficient being estimated. All
        elements should be ints, floats, or longs. If `len(design.shape) == 3`,
        then beta MUST be 1D.
    alt_IDs : 1D ndarray.
        All elements should be ints. There should be one row per obervation per
        available alternative for the given observation. Elements denote the
        alternative corresponding to the given row of the design matrix.
    rows_to_obs : 2D ndarray.
        There should be one row per observation per available alternative and
        one column per observation. This matrix maps the rows of the design
        matrix to the unique observations (on the columns).
    rows_to_alts : 2D ndarray.
        There should be one row per observation per available alternative and
        one column per possible alternative. This matrix maps the rows of the
        design matrix to the possible alternatives for this dataset.
    utility_transform : callable.
        Should accept a 1D array of systematic utility values, a 1D array of
        alternative IDs, and miscellaneous args and kwargs. Should return a 1D
        array whose elements contain the appropriately transformed systematic
        utility values, based on the current model being evaluated.
    intercept_params : 1D ndarray, or None, optional.
        If an array, each element should be an int, float, or long. For
        identifiability, there should be J- 1 elements where J is the total
        number of observed alternatives for this dataset. Default == None.
    shape_params : 1D ndarray, or None, optional.
        If an array, each element should be an int, float, or long. There
        should be one value per shape parameter of the model being used.
        Default == None.
    chosen_row_to_obs :  2D scipy sparse array, or None, optional.
        There should be one row per observation per available alternative and
        one column per observation. This matrix indicates, for each observation
        (on the columns), which rows of the design matrix were the realized
        outcome. If an array is passed then an array of shape
        (num_observations,) will be returned and each element will be the
        probability of the realized outcome of the given observation.
        Default == None.
    return_long_probs :  bool, optional.
        Indicates whether or not the long format probabilites (a 1D numpy array
        with one element per observation per available alternative) should be
        returned. Default == False.

    Returns
    -------
    numpy array or tuple of two numpy arrays.
        If `chosen_row_to_obs` is passed AND `return_long_probs is True`, then
        the tuple `(chosen_probs, long_probs)` is returned. If
        `return_long_probs is True` and `chosen_row_to_obs is None`, then
        `long_probs` is returned. If `chosen_row_to_obs` is passed and
        `return_long_probs is False` then `chosen_probs` is returned.

        `chosen_probs` is a 1D numpy array of shape (num_observations,). Each
        element is the probability of the corresponding observation being
        associated with its realized outcome.

        `long_probs` is a 1D numpy array with one element per observation per
        available alternative for that observation. Each element is the
        probability of the corresponding observation being associated with that
        rows corresponding alternative.

        If `beta` is a 2D array, `chosen_probs` and `long_probs` will also be
        2D arrays, with as many columns as there are sets of parameters being
        used to calculate probabilities with.

        It is NOT valid to have `chosen_row_to_obs == None` and
        `return_long_probs == False`.
    """
    # Check argument validity
    if (len(beta.shape) >= 2) and (len(design.shape) >= 3):
        msg_1 = "Cannot calculate probabilities with both 3D design matrix AND"
        msg_2 = " 2D coefficient array."
        raise ValueError(msg_1 + msg_2)
    if chosen_row_to_obs is None and return_long_probs is False:
        msg = "chosen_row_to_obs is None AND return_long_probs is False"
        raise ValueError(msg)

    # Calculate the systematic utility for each alternative for each individual
    sys_utilities = design.dot(beta)

    # Calculate the probability from the transformed utilities
    # The transformed utilities will be of shape (num_rows, 1)
    transformed_utilities = utility_transform(sys_utilities,
                                              alt_IDs,
                                              rows_to_alts,
                                              shape_params,
                                              intercept_params)

    # The following commands are to guard against numeric under/over-flow
    too_small_idx = transformed_utilities < min_exponent_val
    too_large_idx = transformed_utilities > max_exponent_val

    transformed_utilities[too_small_idx] = min_exponent_val
    transformed_utilities[too_large_idx] = max_exponent_val

    # Exponentiate the transformed utilities
    long_exponentials = np.exp(transformed_utilities)

    # long_probs will be of shape (num_rows,) Each element will provide the
    # probability of the observation associated with that row having the
    # alternative associated with that row as the observation's outcome
    individual_denominators = np.asarray(rows_to_obs.transpose().dot(
                                                    long_exponentials))
    long_denominators = np.asarray(rows_to_obs.dot(individual_denominators))
    if len(long_exponentials.shape) > 1 and long_exponentials.shape[1] > 1:
        long_probs = (long_exponentials / long_denominators)
    else:
        long_probs = (long_exponentials / long_denominators).ravel()

    # Guard against underflow
    long_probs[long_probs == 0] = min_comp_value

    if chosen_row_to_obs is None:
        chosen_probs = None
    else:
        # chosen_probs will be of shape (num_observations,)
        chosen_exponentials = np.asarray(
                         chosen_row_to_obs.transpose().dot(long_exponentials))
        if len(long_exponentials.shape) > 1 and long_exponentials.shape[1] > 1:
            chosen_probs = chosen_exponentials / individual_denominators
        else:
            chosen_probs = (chosen_exponentials /
                            individual_denominators).ravel()

    # Return the long form and chosen probabilities if desired
    if return_long_probs and chosen_probs is not None:
        return chosen_probs, long_probs
    # If working with predictions, return just the long form probabilities
    elif return_long_probs and chosen_probs is None:
        return long_probs
    # If estimating the model and storing fitted probabilities or testing the
    # model on data for which we know the chosen alternative, just return the
    # chosen probabilities.
    elif chosen_probs is not None:
        return chosen_probs


def calc_log_likelihood(beta,
                        design,
                        alt_IDs,
                        rows_to_obs,
                        rows_to_alts,
                        choice_vector,
                        utility_transform,
                        intercept_params=None,
                        shape_params=None,
                        ridge=None,
                        weights=None):
    """
    Parameters
    ----------
    beta : 1D ndarray.
        All elements should by ints, floats, or longs. Should have 1 element
        for each utility coefficient being estimated (i.e. num_features).
    design : 2D ndarray.
        There should be one row per observation per available alternative.
        There should be one column per utility coefficient being estimated. All
        elements should be ints, floats, or longs.
    alt_IDs : 1D ndarray.
        All elements should be ints. There should be one row per obervation per
        available alternative for the given observation. Elements denote the
        alternative corresponding to the given row of the design matrix.
    rows_to_obs : 2D ndarray.
        There should be one row per observation per available alternative and
        one column per observation. This matrix maps the rows of the design
        matrix to the unique observations (on the columns).
    rows_to_alts : 2D ndarray.
        There should be one row per observation per available alternative and
        one column per possible alternative. This matrix maps the rows of the
        design matrix to the possible alternatives for this dataset.
    choice_vector : 1D ndarray.
        All elements should be either ones or zeros. There should be one row
        per observation per available alternative for the given observation.
        Elements denote the alternative which is chosen by the given
        observation with a 1 and a zero otherwise.
    utility_transform:  callable.
        Should accept a 1D array of systematic utility values, a 1D array of
        alternative IDs, and miscellaneous args and kwargs. Should return a 1D
        array whose elements contain the appropriately transformed systematic
        utility values, based on the current model being evaluated.
    intercept_params:   1D ndarray, or None, optional.
        If an array, each element should be an int, float, or long. For
        identifiability, there should be J- 1 elements where J is the total
        number of observed alternatives for this dataset. Default == None.
    shape_params : 1D ndarray, or None, optional.
        If an array, each element should be an int, float, or long. There
        should be one value per shape parameter of the model being used.
        Default == None.
    ridge : int, float, long, or None, optional.
        Determines whether or not ridge regression is performed. If an int,
        float or long is passed, then that scalar determines the ridge penalty
        for the optimization. Default = None.
    weights : 1D ndarray or None, optional.
        Allows for the calculation of weighted log-likelihoods. The weights can
        represent various things. In stratified samples, the weights may be
        the proportion of the observations in a given strata for a sample in
        relation to the proportion of observations in that strata in the
        population. In latent class models, the weights may be the probability
        of being a particular class.

    Returns
    -------
    log_likelihood : float. The log likelihood of the multinomial choice model.
    """
    # Calculate the probability of each individual choosing each available
    # alternative for that individual.
    long_probs = calc_probabilities(beta,
                                    design,
                                    alt_IDs,
                                    rows_to_obs,
                                    rows_to_alts,
                                    utility_transform,
                                    intercept_params=intercept_params,
                                    shape_params=shape_params,
                                    return_long_probs=True)

    # Calculate the weights for the sample
    if weights is None:
        weights = 1

    # Calculate the log likelihood
    log_likelihood = choice_vector.dot(weights * np.log(long_probs))

    if ridge is None:
        return log_likelihood
    else:
        param_list = [x for x in [shape_params, intercept_params, beta]
                      if x is not None]
        if len(param_list) > 1:
            params = np.concatenate(param_list, axis=0)
        else:
            params = param_list[0]
        return log_likelihood - ridge * np.square(params).sum()


def calc_gradient(beta,
                  design,
                  alt_IDs,
                  rows_to_obs,
                  rows_to_alts,
                  choice_vector,
                  utility_transform,
                  transform_first_deriv_c,
                  transform_first_deriv_v,
                  transform_deriv_alpha,
                  intercept_params,
                  shape_params,
                  ridge,
                  weights):
    """
    Parameters
    ----------
    beta : 1D ndarray.
        All elements should by ints, floats, or longs. Should have 1 element
        for each utility coefficient being estimated (i.e. num_features).
    design : 2D ndarray.
        Tjere should be one row per observation per available alternative.
        There should be one column per utility coefficient being estimated. All
        elements should be ints, floats, or longs.
    alt_IDs : 1D ndarray.
        All elements should be ints. There should be one row per obervation per
        available alternative for the given observation. Elements denote the
        alternative corresponding to the given row of the design matrix.
    rows_to_obs : 2D scipy sparse array.
        There should be one row per observation per available alternative and
        one column per observation. This matrix maps the rows of the design
        matrix to the unique observations (on the columns).
    rows_to_alts : 2D scipy sparse array
        There should be one row per observation per available alternative and
        one column per possible alternative. This matrix maps the rows of the
        design matrix to the possible alternatives for this dataset.
    choice_vector : 1D ndarray.
        All elements should be either ones or zeros. There should be one row
        per observation per available alternative for the given observation.
        Elements denote the alternative which is chosen by the given
        observation with a 1 and a zero otherwise.
    utility_transform : callable.
        Must accept a 1D array of systematic utility values, a 1D array of
        alternative IDs, and miscellaneous args and kwargs. Should return a 1D
        array whose elements contain the appropriately transformed systematic
        utility values, based on the current model being evaluated.
    transform_first_deriv_c : callable.
        Must accept a 1D array of systematic utility values, a 1D array of
        alternative IDs, the `rows_to_alts` array, (shape parameters if there
        are any) and miscellaneous args and kwargs. Should return a 2D matrix
        or sparse array whose elements contain the derivative of the tranformed
        utility vector with respect to the vector of shape parameters. The
        dimensions of the returned vector should be
        `(design.shape[0], num_alternatives)`. If there are no shape parameters
        then the callable should return None.
    transform_first_deriv_v : callable.
        Must accept a 1D array of systematic utility values, a 1D array of
        alternative IDs, (shape parameters if there are any) and miscellaneous
        args and kwargs. Should return a 2D array whose elements contain the
        derivative of the tranformed utility vector with respect to the vector
        of systematic utilities. The dimensions of the returned vector should
        be `(design.shape[0], design.shape[0])`.
    transform_deriv_alpha : callable.
        Must accept a 1D array of systematic utility values, a 1D array of
        alternative IDs, the `rows_to_alts` array, (intercept parameters if
        there are any) and miscellaneous args and kwargs. Should return a 2D
        array whose elements contain the derivative of the tranformed utility
        vector with respect to the vector of shape parameters. The dimensions
        of the returned vector should be
        `(design.shape[0], num_alternatives - 1)`. If there are no intercept
        parameters, the callable should return None.
    intercept_params : 1D numpy array or None.
        If an array, each element should be an int, float, or long. For
        identifiability, there should be J- 1 elements where J is the total
        number of observed alternatives for this dataset. Default == None.
    shape_params : 1D ndarray or None.
       If an array, each element should be an int, float, or long. There should
       be one value per shape parameter of the model being used.
       Default == None.
    ridge : int, float, long, or None.
        Determines whether or not ridge regression is performed. If an int,
        float or long is passed, then that scalar determines the ridge penalty
        for the optimization. Default = None.
    weights : 1D ndarray or None.
        Allows for the calculation of weighted log-likelihoods. The weights can
        represent various things. In stratified samples, the weights may be
        the proportion of the observations in a given strata for a sample in
        relation to the proportion of observations in that strata in the
        population. In latent class models, the weights may be the probability
        of being a particular class.

    Returns
    -------
    gradient : 1D ndarray.
       It's shape is (beta.shape[0], ). It is the second derivative of the log-
       likelihood with respect to beta.
    """
    # Calculate the systematic utility for each alternative for each individual
    sys_utilities = design.dot(beta)

    # Calculate the probability of each individual choosing each available
    # alternative for that individual.
    long_probs = calc_probabilities(beta,
                                    design,
                                    alt_IDs,
                                    rows_to_obs,
                                    rows_to_alts,
                                    utility_transform,
                                    intercept_params=intercept_params,
                                    shape_params=shape_params,
                                    return_long_probs=True)

    # Calculate the weights for the sample
    if weights is None:
        weights = 1

    ##########
    # Get the required matrices
    ##########
    # Differentiate the transformed utilities with respect to the shape params
    # Note that dh_dc should be a sparse array
    dh_dc = transform_first_deriv_c(sys_utilities, alt_IDs,
                                    rows_to_alts, shape_params)
    # Differentiate the transformed utilities by the intercept params
    # Note that dh_d_alpha should be a sparse array
    dh_d_alpha = transform_deriv_alpha(sys_utilities, alt_IDs,
                                       rows_to_alts, intercept_params)
    # Differentiate the transformed utilities with respect to the systematic
    # utilities. Note that dh_dv should be a sparse matrix
    dh_dv = transform_first_deriv_v(sys_utilities, alt_IDs,
                                    rows_to_alts, shape_params)
    # Differentiate the transformed utilities with respect to the utility
    # coefficients. Note that dh_db should be a dense **matrix**, not a dense
    # 2D array. This is because the dot product of a 2D scipy sparse array and
    # a 2D dense numpy array yields a 2D dense numpy matrix
    dh_db = dh_dv.dot(design)
    # Differentiate the log likelihood w/ respect to the transformed utilities
    # Note that d_ll_dh will be a dense 2D numpy array.
    d_ll_dh = np.multiply(weights, choice_vector - long_probs)[np.newaxis, :]

    # Calculate the gradient of the log-likelihood with respect to the betas
    d_ll_d_beta = d_ll_dh.dot(dh_db)

    ##########
    # Form and return the gradient
    ##########
    if shape_params is not None and intercept_params is not None:
        # Note that we use d_ll_dh * dh_dc and d_ll_dh * dh_d_alpha because
        # that is how one computes the dot product between a dense 2D numpy
        # array and a 2D sparse matrix. This is due to numpy ndarrays and
        # scipy sparse matrices not playing nicely together. However, numpy
        # ndarrays and numpy matrices can be dot producted together,
        # hence d_ll_dh.dot(dh_db).

        # Note that the 'np.asarray' is because dll_dh * dh_dc will be a row
        # matrix, but we want a 1D numpy array.
        gradient = np.concatenate((np.asarray(d_ll_dh * hstack((dh_dc,
                                                                dh_d_alpha),
                                                               format='csr')),
                                   d_ll_d_beta), axis=1).ravel()
        params = np.concatenate((shape_params, intercept_params, beta),
                                axis=0)

    elif shape_params is not None and intercept_params is None:
        # Note that we use d_ll_dh * dh_dc because that is how one computes
        # the dot product between a dense 2D numpy array and a 2D sparse matrix
        # This is due to numpy ndarrays and scipy sparse matrices not playing
        # nicely together. However, numpy ndarrays and numpy matrices can be
        # dot producted together, hence d_ll_dh.dot(dh_db).

        # Note that the 'np.asarray' is because dll_dh * dh_dc will be a row
        # matrix, but we want a 1D numpy array.
        gradient = np.concatenate((np.asarray(d_ll_dh * dh_dc), d_ll_d_beta),
                                  axis=1).ravel()
        params = np.concatenate((shape_params, beta), axis=0)

    elif shape_params is None and intercept_params is not None:
        # Note that we use d_ll_dh * dh_d_alpha because that's how one computes
        # the dot product between a dense 2D numpy array and a 2D sparse matrix
        # This is due to numpy ndarrays and scipy sparse matrices not playing
        # nicely together. However, numpy ndarrays and numpy matrices can be
        # dot producted together, hence d_ll_dh.dot(dh_db).

        # Note 'np.asarray' is used because dll_dh * dh_d_alpha will be a row
        # matrix, but we want a 1D numpy array.
        gradient = np.concatenate((np.asarray(d_ll_dh * dh_d_alpha),
                                   d_ll_d_beta), axis=1).ravel()
        params = np.concatenate((intercept_params, beta), axis=0)

    else:
        gradient = d_ll_d_beta.ravel()
        params = beta

    if ridge is not None:
        gradient -= 2 * ridge * params

    return gradient


##########
# The three functions below are used, jointly, to construct dP_dV, the block
# diagonal matrix that is the derivative of the long probability vector with
# respect to the long vector of index values = X*beta.
##########
def create_matrix_block_indices(row_to_obs):
    """
    Parameters
    ----------
    row_to_obs: 2D ndarray.
        There should be one row per observation per available alternative and
        one column per observation. This matrix maps the rows of the design
        matrix to the unique observations (on the columns).

    Returns
    -------
    output_indices : list of arrays.
        There will be one array per column in `row_to_obs`. The array will note
        which rows correspond to which observations.
    """
    # Initialize the list of index arrays to be returned
    output_indices = []
    # Determine the number of observations in the dataset
    num_obs = row_to_obs.shape[1]
    # Get the indices of the non-zero elements and their values
    row_indices, col_indices, values = scipy.sparse.find(row_to_obs)
    # Iterate over each observation, i.e. each column in row_to_obs, and
    # determine which rows belong to that observation (i.e. the rows with ones
    # in them).
    for col in xrange(num_obs):
        # Store the array of row indices belonging to the current observation
        output_indices.append(row_indices[np.where(col_indices == col)])

    return output_indices


def robust_outer_product(vec_1, vec_2):
    """
    Calculates a 'robust' outer product of two vectors that may or may not
    contain very small values.

    Parameters
    ----------
    vec_1 : 1D ndarray
    vec_2 : 1D ndarray

    Returns
    -------
    outer_prod : 2D ndarray. The outer product of vec_1 and vec_2
    """
    mantissa_1, exponents_1 = np.frexp(vec_1)
    mantissa_2, exponents_2 = np.frexp(vec_2)
    new_mantissas = mantissa_1[None, :] * mantissa_2[:, None]
    new_exponents = exponents_1[None, :] + exponents_2[:, None]
    return new_mantissas * np.exp2(new_exponents)


def create_matrix_blocks(long_probs, matrix_block_indices):
    """
    Parameters
    ----------
    long_probs : 1D ndarray.
        There should be one row per observation per available alternative. The
        elements of the array will indicate the probability of the alternative
        being the outcome associated with the corresponding observation.
    matrix_block_indices : list of arrays.
        There will be one array per observation. The arrays will note which
        rows correspond to which observations.

    Returns
    -------
    output_matrices : list of matrices.
        Each matrix will contain the derivative of P_i with respect to H_i, and
        there will be one matrix for each observations i. P_i is the array of
        probabilities of each observation being associated with its available
        alternatives. H_i is the array of transformed index values for each
        alternative that is available to observation i.
    """
    # Initialize the list of matrices that is to be returned.
    output_matrices = []
    # Iterate over each observation, i.e. over each list of rows that
    # corresponds to each observation.
    for indices in matrix_block_indices:
        # Isolate P_i, the vector of probabilities of each alternative that
        # is associated with the current observation
        current_probs = long_probs[indices]
        # Get the outer product of the current probabilities
        # probability_outer_product = np.outer(current_probs, current_probs)
        probability_outer_product = robust_outer_product(current_probs,
                                                         current_probs)

        # Create the desired dP_i/dh_i matrix
        dP_i_dh_i = np.diag(current_probs) - probability_outer_product
        # Ensure that the diagonal is positive and non-zero, since it must be.
        diag_idx = np.diag_indices_from(dP_i_dh_i)
        diag_values = dP_i_dh_i[diag_idx].copy()
        diag_values[diag_values == 0] = min_comp_value
        dP_i_dh_i[diag_idx] = diag_values
        # Guard against underflow on the off-diagonals
        underflow_idxs = np.where(dP_i_dh_i == 0)
        for i in xrange(underflow_idxs[0].size):
            row_idx, col_idx = underflow_idxs[0][i], underflow_idxs[1][i]
            if row_idx != col_idx:
                # Since this type of underflow essentially comes from
                # multiplying two very small numbers, just set the overall
                # result to a small number
                dP_i_dh_i[row_idx,
                          col_idx] = -1 * min_comp_value
        # Store the desired dP_i/dh_i matrix
        output_matrices.append(dP_i_dh_i)

    return output_matrices


def calc_hessian(beta,
                 design,
                 alt_IDs,
                 rows_to_obs,
                 rows_to_alts,
                 utility_transform,
                 transform_first_deriv_c,
                 transform_first_deriv_v,
                 transform_deriv_alpha,
                 block_matrix_idxs,
                 intercept_params,
                 shape_params,
                 ridge,
                 weights):
    """
    Parameters
    ----------
    beta : 1D ndarray.
        All elements should by ints, floats, or longs. Should have 1 element
        for each utility coefficient being estimated (i.e. num_features).
    design : 2D ndarray.
        There should be one row per observation per available alternative.
        There should be one column per utility coefficient being estimated. All
        elements should be ints, floats, or longs.
    alt_IDs : 1D ndarray.
        All elements should be ints. There should be one row per obervation per
        available alternative for the given observation. Elements denote the
        alternative corresponding to the given row of the design matrix.
    rows_to_obs : 2D ndarray.
        There should be one row per observation per available alternative and
        one column per observation. This matrix maps the rows of the design
        matrix to the unique observations (on the columns).
    rows_to_alts: 2D ndarray.
        There should be one row per observation per available alternative and
        one column per possible alternative. This matrix maps the rows of the
        design matrix to the possible alternatives for this dataset.
    utility_transform : callable.
        Must accept a 1D array of systematic utility values, a 1D array of
        alternative IDs, and miscellaneous args and kwargs. Should return a 1D
        array whose elements contain the appropriately transformed systematic
        utility values, based on the current model being evaluated.
    transform_first_deriv_c : callable.
        Must accept a 1D array of systematic utility values, a 1D array of
        alternative IDs, the `rows_to_alts` array, (shape parameters if there
        are any) and miscellaneous args and kwargs. Should return a 2D array
        whose elements contain the derivative of the tranformed utilities with
        respect to the vector of shape parameters. The dimensions of the
        returned vector should be `(design.shape[0], num_alternatives)`.
    transform_first_deriv_v : callable.
        Must accept a 1D array of systematic utility values, a 1D array of
        alternative IDs, (shape parameters if there are any) and miscellaneous
        args and kwargs. Should return a 2D array whose elements contain the
        derivative of the tranformed utility vector with respect to the vector
        of systematic utilities. The dimensions of the returned vector should
        be `(design.shape[0], design.shape[0])`.
    transform_deriv_alpha : callable.
        Must accept a 1D array of systematic utility values, a 1D array of
        alternative IDs, the rows_to_alts array, (intercept parameters if
        there are any) and miscellaneous args and kwargs. Should return a 2D
        array whose elements contain the derivative of the tranformed utilities
        with respect to the vector of shape parameters. The dimensions of the
        returned vector should be `(design.shape[0], num_alternatives - 1)`. If
        `intercept_params == None`, the callable should return None.
    block_matrix_idxs : list of arrays.
        There will be one array per observation. The arrays will note which
        rows correspond to which observations.
    intercept_params : 1D ndarray.
        Each element should be an int, float, or long. For identifiability,
        there should be J- 1 elements where J is the total number of observed
        alternatives in the dataset.
    shape_params: None or 1D ndarray.
        If an array, each element should be an int, float, or long. There
        should be one value per shape parameter of the model being used.
        Default == None.
    ridge : int, float, long, or None.
        Determines whether or not ridge regression is performed. If an int,
        float or long is passed, then that scalar determines the ridge penalty
        for the optimization. Default = None.
    weights : 1D ndarray or None.
        Allows for the calculation of weighted log-likelihoods. The weights can
        represent various things. In stratified samples, the weights may be
        the proportion of the observations in a given strata for a sample in
        relation to the proportion of observations in that strata in the
        population. In latent class models, the weights may be the probability
        of being a particular class.

    Returns
    -------
    hess : 2D ndarray.
        It's shape is `(beta.shape[0], beta.shape[0])`. It is the second
        derivative of the log likelihood with respect to beta.
    """
    # Calculate the systematic utility for each alternative for each individual
    sys_utilities = design.dot(beta)

    # Calculate the probability of each individual choosing each available
    # alternative for that individual.
    long_probs = calc_probabilities(beta,
                                    design,
                                    alt_IDs,
                                    rows_to_obs,
                                    rows_to_alts,
                                    utility_transform,
                                    intercept_params=intercept_params,
                                    shape_params=shape_params,
                                    return_long_probs=True)

    # Calculate the weights for the sample
    if weights is None:
        weights = np.ones(design.shape[0])

    ##########
    # Get the required matrices
    ##########
    # Differentiate the transformed utilities with respect to the shape params
    # Note that dh_dc will be a 2D scipy sparse matrix
    dh_dc = transform_first_deriv_c(sys_utilities, alt_IDs,
                                    rows_to_alts, shape_params)
    # Differentiate the transformed utilities with respect to the systematic
    # utilities. Note that dh_dv will be a 2D scipy sparse matrix.
    dh_dv = transform_first_deriv_v(sys_utilities, alt_IDs,
                                    rows_to_alts, shape_params)
    # Differentiate the transformed utilities by the intercept params
    # Note that dh_d_alpha should be a sparse array
    dh_d_alpha = transform_deriv_alpha(sys_utilities, alt_IDs,
                                       rows_to_alts, intercept_params)
    # Differentiate the transformed utilities with respect to the utility
    # coefficients. Note that dh_db will be a 2D dense numpy matrix
    dh_db = dh_dv.dot(design)

    # Differentiate the probabilities with respect to the transformed utilities
    # Note that dp_dh will be a 2D dense numpy array
    block_matrices = create_matrix_blocks(long_probs, block_matrix_idxs)
    dp_dh = block_diag(*block_matrices) * weights[None, :]

    ##########
    # Calculate the second and mixed partial derivatives within the hessian
    ##########
    # Calculate the second derivative of the log-likelihood with repect to the
    # utility coefficients. Should have shape (design.shape[0],
    # design.shape[0]). Since dp_dh is a 2D dense numpy array and dh_db is a
    # 2D dense numpy matrix, using the .dot() syntax should work to compute
    # the dot product.
    d2_ll_db2 = -1 * dh_db.T.dot(dp_dh.dot(dh_db))

    ##########
    # Form and return the hessian
    ##########
    if shape_params is not None and intercept_params is not None:
        # Calculate the second derivative of the log-likelihood with respect
        # to the shape parameters. Should have shape (shape_params.shape[0],
        # shape_params.shape[0]). Note that since dp_dh is a 2D dense numpy
        # array and dh_dc is a sparse matrix or dense numpy matrix, to
        # compute the dot product we need to use the * operator
        d2_ll_dc2 = -1 * dh_dc.T.dot(dp_dh * dh_dc)

        # Calculate the second derivative of the log-likelihood with respect
        # to the intercept parameters. Should have shape (J - 1, J - 1) where
        # J is the total number of observed alternatives for this dataset.
        # Note that since dp_dh is a 2D dense numpy array and dh_d_alpha is a
        # sparse matrix or dense numpy matrix, to compute the dot product
        # we need to use the * operator
        d2_ll_d_alpha2 = -1 * dh_d_alpha.T.dot(dp_dh * dh_d_alpha)

        # Calculate the mixed second derivative of the log-likelihood with
        # respect to the intercept and shape parameters. Should have shape
        # (dh_d_alpha.shape[1], dh_dc.shape[1]). Note that since dp_dh is a 2D
        # dense numpy array and dh_dc is a sparse matrix or dense numpy
        # matrix, to compute the dot product we need to use the * operator
        d2_ll_dc_d_alpha = -1 * dh_d_alpha.T.dot(dp_dh * dh_dc)

        # Calculate the mixed partial derivative of the log-likelihood with
        # respect to the utility coefficients and then with respect to the
        # shape parameters. Should have shape (design.shape[0],
        # shape_params.shape[0]). Note that since dp_dh is a 2D dense numpy
        # array and dh_dc is a sparse matrix or dense numpy matrix, to
        # compute the dot product we need to use the * operator
        d2_ll_dc_db = -1 * dh_db.T.dot(dp_dh * dh_dc)

        # Calculate the mixed partial derivative of the log-likelihood with
        # respect to the utility coefficients and then with respect to the
        # intercept parameters. Should have shape (design.shape[0],
        # intercept_params.shape[0]). Note that since dp_dh is a 2D dense numpy
        # array and dh_d_alpha is a sparse matrix or dense numpy matrix, to
        # compute the dot product we need to use the * operator
        d2_ll_d_alpha_db = -1 * dh_db.T.dot(dp_dh * dh_d_alpha)

        # Form the 3 by 3 partitioned hessian of 2nd derivatives
        top_row = np.concatenate((d2_ll_dc2,
                                  d2_ll_dc_d_alpha.T,
                                  d2_ll_dc_db.T), axis=1)
        middle_row = np.concatenate((d2_ll_dc_d_alpha,
                                     d2_ll_d_alpha2,
                                     d2_ll_d_alpha_db.T), axis=1)
        last_row = np.concatenate((d2_ll_dc_db,
                                   d2_ll_d_alpha_db,
                                   d2_ll_db2), axis=1)
        hess = np.concatenate((top_row,
                               middle_row,
                               last_row), axis=0)

    elif shape_params is not None and intercept_params is None:
        # Calculate the second derivative of the log-likelihood with respect
        # to the shape parameters. Should have shape (shape_params.shape[0],
        # shape_params.shape[0]). Note that since dp_dh is a 2D dense numpy
        # array and dh_dc is a sparse matrix or dense numpy matrix, to
        # compute the dot product we need to use the * operator
        d2_ll_dc2 = -1 * dh_dc.T.dot(dp_dh * dh_dc)

        # Calculate the mixed partial derivative of the log-likelihood with
        # respect to the utility coefficients and then with respect to the
        # shape parameters. Should have shape (design.shape[0],
        # shape_params.shape[0]). Note that since dp_dh is a 2D dense numpy
        # array and dh_dc is a sparse matrix or dense numpy matrix, to
        # compute the dot product we need to use the * operator
        d2_ll_dc_db = -1 * dh_db.T.dot(dp_dh * dh_dc)

        hess = np.concatenate((np.concatenate((d2_ll_dc2,
                                               d2_ll_dc_db.T), axis=1),
                               np.concatenate((d2_ll_dc_db,
                                               d2_ll_db2), axis=1)), axis=0)

    elif shape_params is None and intercept_params is not None:
        # Calculate the second derivative of the log-likelihood with respect
        # to the intercept parameters. Should have shape (J - 1, J - 1) where
        # J is the total number of observed alternatives for this dataset.
        # Note that since dp_dh is a 2D dense numpy array and dh_d_alpha is a
        # sparse matrix or dense numpy matrix, to compute the dot product
        # we need to use the * operator
        d2_ll_d_alpha2 = -1 * dh_d_alpha.T.dot(dp_dh * dh_d_alpha)

        # Calculate the mixed partial derivative of the log-likelihood with
        # respect to the utility coefficients and then with respect to the
        # intercept parameters. Should have shape (design.shape[0],
        # intercept_params.shape[0]). Note that since dp_dh is a 2D dense numpy
        # array and dh_d_alpha is a sparse matrix or dense numpy matrix, to
        # compute the dot product we need to use the * operator
        d2_ll_d_alpha_db = -1 * dh_db.T.dot(dp_dh * dh_d_alpha)

        hess = np.concatenate((np.concatenate((d2_ll_d_alpha2,
                                               d2_ll_d_alpha_db.T), axis=1),
                               np.concatenate((d2_ll_d_alpha_db,
                                               d2_ll_db2), axis=1)), axis=0)
    else:
        hess = d2_ll_db2

    if ridge is not None:
        hess -= 2 * ridge

    # Make sure we are returning standard numpy arrays
    if isinstance(hess, np.matrixlib.defmatrix.matrix):
        hess = np.asarray(hess)

    return hess


def calc_fisher_info_matrix(beta,
                            design,
                            alt_IDs,
                            rows_to_obs,
                            rows_to_alts,
                            choice_vector,
                            utility_transform,
                            transform_first_deriv_c,
                            transform_first_deriv_v,
                            transform_deriv_alpha,
                            intercept_params,
                            shape_params,
                            ridge,
                            weights):
    """
    Parameters
    ----------
    beta : 1D ndarray.
        All elements should by ints, floats, or longs. Should have 1 element
        for each utility coefficient being estimated (i.e. num_features).
    design : 2D ndarray.
        There should be one row per observation per available alternative.
        There should be one column per utility coefficient being estimated. All
        elements should be ints, floats, or longs.
    alt_IDs : 1D ndarray.
        All elements should be ints. There should be one row per obervation per
        available alternative for the given observation. Elements denote the
        alternative corresponding to the given row of the design matrix.
    rows_to_obs : 2D ndarray.
        There should be one row per observation per available alternative and
        one column per observation. This matrix maps the rows of the design
        matrix to the unique observations (on the columns).
    rows_to_alts : 2D ndarray
        There should be one row per observation per available alternative and
        one column per possible alternative. This matrix maps the rows of the
        design matrix to the possible alternatives for this dataset.
    choice_vector : 1D ndarray.
        All elements should be either ones or zeros. There should be one row
        per observation per available alternative for the given observation.
        Elements denote the alternative which is chosen by the given
        observation with a 1 and a zero otherwise.
    utility_transform : callable.
        Must accept a 1D array of systematic utility values, a 1D array of
        alternative IDs, and miscellaneous args and kwargs. Should return a
        1D array whose elements contain the appropriately transformed
        systematic utility values, based on the current model being evaluated.
    transform_first_deriv_c : callable.
        Must accept a 1D array of systematic utility values, a 1D array of
        alternative IDs, the `rows_to_alts` array, (shape parameters if there
        are any) and miscellaneous args and kwargs. Should return a 2D array
        whose elements contain the derivative of the tranformed utilities with
        respect to the vector of shape parameters. The dimensions of the
        returned vector should be `(design.shape[0], num_alternatives)`.
    transform_first_deriv_v : callable.
        Must accept a 1D array of systematic utility values, a 1D array of
         alternative IDs, (shape parameters if there are any) and miscellaneous
         args and kwargs. Should return a 2D array whose elements contain the
         derivative of the utility tranformation vector with respect to the
         vector of systematic utilities. The dimensions of the returned vector
         should be `(design.shape[0],  design.shape[0])`.
    shape_params : None or 1D ndarray.
        If an array, each element should be an int, float, or long. There
        should be one value per shape parameter of the model being used.
        Default == None.
    ridge : int, float, long, or None.
        Determines whether or not ridge regression is performed. If an int,
        float or long is passed, then that scalar determines the ridge penalty
        for the optimization. Default = None.
    weights : 1D ndarray or None.
        Allows for the calculation of weighted log-likelihoods. The weights can
        represent various things. In stratified samples, the weights may be
        the proportion of the observations in a given strata for a sample in
        relation to the proportion of observations in that strata in the
        population. In latent class models, the weights may be the probability
        of being a particular class.

    Returns
    -------
    fisher_matrix : 2D numpy array.
        It will be a square matrix, with one row and one column for each shape,
        intercept, and index coefficient. Contains the BHHH approximation to
        the Fisher Information matrix of the log likelihood.
    """
    # Calculate the systematic utility for each alternative for each individual
    sys_utilities = design.dot(beta)

    # Calculate the probability that the individual associated with a given row
    # chooses the alternative associated with a given row.
    long_probs = calc_probabilities(beta,
                                    design,
                                    alt_IDs,
                                    rows_to_obs,
                                    rows_to_alts,
                                    utility_transform,
                                    intercept_params=intercept_params,
                                    shape_params=shape_params,
                                    return_long_probs=True)

    # Calculate the weights for the sample
    if weights is None:
        weights = np.ones(design.shape[0])
    weights_per_obs =\
        np.max(rows_to_obs.toarray() * weights[:, None], axis=0)

    ##########
    # Get the required matrices
    ##########
    # Differentiate the transformed utilities with respect to the shape params
    dh_dc = transform_first_deriv_c(sys_utilities, alt_IDs,
                                    rows_to_alts, shape_params)
    # Differentiate the transformed utilities with respect to the systematic
    # utilities
    dh_dv = transform_first_deriv_v(sys_utilities, alt_IDs,
                                    rows_to_alts, shape_params)
    # Differentiate the transformed utilities by the intercept params
    # Note that dh_d_alpha should be a sparse array
    dh_d_alpha = transform_deriv_alpha(sys_utilities, alt_IDs,
                                       rows_to_alts, intercept_params)
    # Differentiate the transformed utilities with respect to the utility
    # coefficients. This should be a dense numpy array.
    dh_db = np.asarray(dh_dv.dot(design))
    # Differentiate the log likelihood w/ respect to the transformed utilities
    d_ll_dh = (choice_vector - long_probs)[np.newaxis, :]

    ##########
    # Create the matrix where each row represents the gradient of a particular
    # observations log-likelihood with respect to the shape parameters and
    # beta, depending on whether there are shape parameters being estimated
    ##########
    if shape_params is not None and intercept_params is not None:
        if isinstance(dh_dc, np.matrixlib.defmatrix.matrix):
            # Note that the '.A' transforms the matrix into a numpy ndarray
            gradient_vec = d_ll_dh.T * np.concatenate((dh_dc.A,
                                                       dh_d_alpha.toarray(),
                                                       dh_db), axis=1)
        else:
            gradient_vec = d_ll_dh.T * np.concatenate((dh_dc.toarray(),
                                                       dh_d_alpha.toarray(),
                                                       dh_db), axis=1)
    elif shape_params is not None and intercept_params is None:
        if isinstance(dh_dc, np.matrixlib.defmatrix.matrix):
            # Note that the '.A' transforms the matrix into a numpy ndarray
            gradient_vec = d_ll_dh.T * np.concatenate((dh_dc.A, dh_db), axis=1)
        else:
            gradient_vec = d_ll_dh.T * np.concatenate((dh_dc.toarray(),
                                                       dh_db), axis=1)
    elif shape_params is None and intercept_params is not None:
        # Note '.to_array()' is used because dh_d_alpha will be a sparse
        # matrix, but we want a 2D numpy array.
        gradient_vec = d_ll_dh.T * np.concatenate((dh_d_alpha.toarray(),
                                                   dh_db), axis=1)
    else:
        gradient_vec = d_ll_dh.T * dh_db

    # Make sure that we calculate the gradient PER OBSERVATION
    # and then take the outer products of those gradients.
    # Note that this is different than taking the outer products of the
    # gradient of the log-likelihood per available alternative per observation
    gradient_vec = rows_to_obs.T.dot(gradient_vec)

    # Compute and return the outer product of each row of the gradient
    # with itself. Then sum these individual matrices together. The line below
    # does the same computation just with less memory and time.
    fisher_matrix =\
        gradient_vec.T.dot(np.multiply(weights_per_obs[:, None], gradient_vec))

    if ridge is not None:
        # The rational behind adding 2 * ridge is that the fisher information
        # matrix should approximate the hessian and in the hessian we add
        # 2 * ridge at the end. I don't know if this is the correct way to
        # calculate the Fisher Information in ridge regression models.
        fisher_matrix -= 2 * ridge

    return fisher_matrix


def calc_asymptotic_covariance(hessian, fisher_info_matrix):
    """
    Parameters
    ----------
    hessian : 2D ndarray.
        It should have shape `(num_vars, num_vars)`. It is the matrix of second
        derivatives of the total loss across the dataset, with respect to each
        pair of coefficients being estimated.
    fisher_info_matrix : 2D ndarray.
        It should have a shape of `(num_vars, num_vars)`.  It is the
        approximation of the negative of the expected hessian formed by taking
        the outer product of (each observation's gradient of the loss function)
        with itself, and then summing across all observations.

    Returns
    -------
    huber_white_matrix : 2D ndarray.
        Will have shape `(num_vars, num_vars)`. The entries in the returned
        matrix are calculated by the following formula:
        `hess_inverse * fisher_info_matrix * hess_inverse`.
    """
    # Calculate the inverse of the hessian
    hess_inv = scipy.linalg.inv(hessian)

    return np.dot(hess_inv, np.dot(fisher_info_matrix, hess_inv))