""" @author: Timothy Brathwaite @name: Bootstrap Sampler @summary: This module provides functions that will perform the stratified resampling needed for the bootstrapping procedure. """ from collections import OrderedDict import numpy as np import pandas as pd def relate_obs_ids_to_chosen_alts(obs_id_array, alt_id_array, choice_array): """ Creates a dictionary that relates each unique alternative id to the set of observations ids that chose the given alternative. Parameters ---------- obs_id_array : 1D ndarray of ints. Should be a long-format array of observation ids. Each element should correspond to the unique id of the unit of observation that corresponds to the given row of the long-format data. Note that each unit of observation may have more than one associated choice situation. alt_id_array : 1D ndarray of ints. Should be a long-format array of alternative ids. Each element should denote the unique id of the alternative that corresponds to the given row of the long format data. choice_array : 1D ndarray of ints. Each element should be either a one or a zero, indicating whether the alternative on the given row of the long format data was chosen or not. Returns ------- chosen_alts_to_obs_ids : dict. Each key will be a unique value from `alt_id_array`. Each key's value will be a 1D ndarray that contains the sorted, unique observation ids of those observational units that chose the given alternative. """ # Figure out which units of observation chose each alternative. chosen_alts_to_obs_ids = {} for alt_id in np.sort(np.unique(alt_id_array)): # Determine which observations chose the current alternative. selection_condition =\ np.where((alt_id_array == alt_id) & (choice_array == 1)) # Store the sorted, unique ids that chose the current alternative. chosen_alts_to_obs_ids[alt_id] =\ np.sort(np.unique(obs_id_array[selection_condition])) # Return the desired dictionary. return chosen_alts_to_obs_ids def get_num_obs_choosing_each_alternative(obs_per_alt_dict): """ Will create an ordered dictionary that records the number of units of observation that have chosen the given alternative (i.e. the associated dictionary key). Will also determine the total number of unique observations in the dataset. Parameters ---------- obs_per_alt_dict : dict. Each key should be a unique alternave id. Each key's value will be 1D ndarray that contains the sorted, unique observation ids of those observational units that chose the given alternative. Returns ------- num_obs_per_group : OrderedDict. Keys will be the alternative ids present in `obs_per_alt_dict`. Values will be the `len(obs_per_alt_dict[alt_id]).` tot_num_obs : int. Denotes the total number of unique observation ids in one's dataset. """ # Initialize the object that is to be returned. num_obs_per_group = OrderedDict() # Determine the number of unique units of observation per group. for alt_id in obs_per_alt_dict: num_obs_per_group[alt_id] = len(obs_per_alt_dict[alt_id]) # Determine the total number of units of observation that will be chosen # for each bootstrap sample. tot_num_obs = sum([num_obs_per_group[g] for g in num_obs_per_group]) # Return the desired objects. return num_obs_per_group, tot_num_obs def create_cross_sectional_bootstrap_samples(obs_id_array, alt_id_array, choice_array, num_samples, seed=None): """ Determines the unique observations that will be present in each bootstrap sample. This function DOES NOT create the new design matrices or a new long-format dataframe for each bootstrap sample. Note that these will be correct bootstrap samples for cross-sectional datasets. This function will not work correctly for panel datasets. Parameters ---------- obs_id_array : 1D ndarray of ints. Each element should denote a unique observation id for the corresponding row of the long format array. alt_id_array : 1D ndarray of ints. Each element should denote a unique alternative id for the corresponding row of the long format array. choice_array : 1D ndarray of ints. Each element should be a one or a zero. The values should denote a whether or not the corresponding alternative in `alt_id_array` was chosen by the observational unit in the corresponding row of `obs_id_array.` num_samples : int. Denotes the number of bootstrap samples that need to be drawn. seed : non-negative int or None, optional. Denotes the random seed to be used in order to ensure reproducibility of the bootstrap sample generation. Default is None. If None, no seed will be used and the generation of the bootstrap samples will (in general) not be reproducible. Returns ------- ids_per_sample : 2D ndarray. Each row represents a complete bootstrap sample. Each column denotes a selected bootstrap observation that comprises the bootstrap sample. The elements of the array denote the observation ids of the chosen observational units. """ # Determine the units of observation that chose each alternative. chosen_alts_to_obs_ids =\ relate_obs_ids_to_chosen_alts(obs_id_array, alt_id_array, choice_array) # Determine the number of unique units of observation per group and overall num_obs_per_group, tot_num_obs =\ get_num_obs_choosing_each_alternative(chosen_alts_to_obs_ids) # Initialize the array that will store the observation ids for each sample ids_per_sample = np.empty((num_samples, tot_num_obs), dtype=float) if seed is not None: # Check the validity of the seed argument. if not isinstance(seed, int): msg = "`boot_seed` MUST be an int." raise ValueError(msg) # If desiring reproducibility, set the random seed within numpy np.random.seed(seed) # Initialize a variable to keep track of what column we're on. col_idx = 0 for alt_id in num_obs_per_group: # Get the set of observations that chose the current alternative. relevant_ids = chosen_alts_to_obs_ids[alt_id] # Determine the number of needed resampled ids. resample_size = num_obs_per_group[alt_id] # Resample, with replacement, observations who chose this alternative. current_ids = (np.random.choice(relevant_ids, size=resample_size * num_samples, replace=True) .reshape((num_samples, resample_size))) # Determine the last column index to use when storing the resampled ids end_col = col_idx + resample_size # Assign the sampled ids to the correct columns of ids_per_sample ids_per_sample[:, col_idx:end_col] = current_ids # Update the column index col_idx += resample_size # Return the resampled observation ids. return ids_per_sample def create_bootstrap_id_array(obs_id_per_sample): """ Creates a 2D ndarray that contains the 'bootstrap ids' for each replication of each unit of observation that is an the set of bootstrap samples. Parameters ---------- obs_id_per_sample : 2D ndarray of ints. Should have one row for each bootsrap sample. Should have one column for each observational unit that is serving as a new bootstrap observational unit. Returns ------- bootstrap_id_array : 2D ndarray of ints. Will have the same shape as `obs_id_per_sample`. Each element will denote the fake observational id in the new bootstrap dataset. """ # Determine the shape of the object to be returned. n_rows, n_cols = obs_id_per_sample.shape # Create the array of bootstrap ids. bootstrap_id_array =\ np.tile(np.arange(n_cols) + 1, n_rows).reshape((n_rows, n_cols)) # Return the desired object return bootstrap_id_array def create_deepcopied_groupby_dict(orig_df, obs_id_col): """ Will create a dictionary where each key corresponds to a unique value in `orig_df[obs_id_col]` and each value corresponds to all of the rows of `orig_df` where `orig_df[obs_id_col] == key`. Parameters ---------- orig_df : pandas DataFrame. Should be long-format dataframe containing the data used to estimate the desired choice model. obs_id_col : str. Should be a column name within `orig_df`. Should denote the original observation id column. Returns ------- groupby_dict : dict. Each key will be a unique value in `orig_df[obs_id_col]` and each value will be the rows of `orig_df` where `orig_df[obs_id_col] == key`. """ # Get the observation id values obs_id_vals = orig_df[obs_id_col].values # Get the unique observation ids unique_obs_ids = np.unique(obs_id_vals) # Initialize the dictionary to be returned. groupby_dict = {} # Populate the dictionary with dataframes for each individual. for obs_id in unique_obs_ids: # Filter out only the rows corresponding to the current observation id. desired_rows = obs_id_vals == obs_id # Add the desired dataframe to the dictionary. groupby_dict[obs_id] = orig_df.loc[desired_rows].copy(deep=True) # Return the desired object. return groupby_dict def check_column_existence(col_name, df, presence=True): """ Checks whether or not `col_name` is in `df` and raises a helpful error msg if the desired condition is not met. Parameters ---------- col_name : str. Should represent a column whose presence in `df` is to be checked. df : pandas DataFrame. The dataframe that will be checked for the presence of `col_name`. presence : bool, optional. If True, then this function checks for the PRESENCE of `col_name` from `df`. If False, then this function checks for the ABSENCE of `col_name` in `df`. Default == True. Returns ------- None. """ if presence: if col_name not in df.columns: msg = "Ensure that `{}` is in `df.columns`." raise ValueError(msg.format(col_name)) else: if col_name in df.columns: msg = "Ensure that `{}` is not in `df.columns`." raise ValueError(msg.format(col_name)) return None def ensure_resampled_obs_ids_in_df(resampled_obs_ids, orig_obs_id_array): """ Checks whether all ids in `resampled_obs_ids` are in `orig_obs_id_array`. Raises a helpful ValueError if not. Parameters ---------- resampled_obs_ids : 1D ndarray of ints. Should contain the observation ids of the observational units that will be used in the current bootstrap sample. orig_obs_id_array : 1D ndarray of ints. Should countain the observation ids of the observational units in the original dataframe containing the data for this model. Returns ------- None. """ if not np.in1d(resampled_obs_ids, orig_obs_id_array).all(): msg =\ "All values in `resampled_obs_ids` MUST be in `orig_obs_id_array`." raise ValueError(msg) return None def create_bootstrap_dataframe(orig_df, obs_id_col, resampled_obs_ids_1d, groupby_dict, boot_id_col="bootstrap_id"): """ Will create the altered dataframe of data needed to estimate a choice model with the particular observations that belong to the current bootstrap sample. Parameters ---------- orig_df : pandas DataFrame. Should be long-format dataframe containing the data used to estimate the desired choice model. obs_id_col : str. Should be a column name within `orig_df`. Should denote the original observation id column. resampled_obs_ids_1d : 1D ndarray of ints. Each value should represent the alternative id of a given bootstrap replicate. groupby_dict : dict. Each key will be a unique value in `orig_df[obs_id_col]` and each value will be the rows of `orig_df` where `orig_df[obs_id_col] == key`. boot_id_col : str, optional. Denotes the new column that will be created to specify the bootstrap observation ids for choice model estimation. Returns ------- bootstrap_df : pandas Dataframe. Will contain all the same columns as `orig_df` as well as the additional `boot_id_col`. For each value in `resampled_obs_ids_1d`, `bootstrap_df` will contain the long format rows from `orig_df` that have the given observation id. """ # Check the validity of the passed arguments. check_column_existence(obs_id_col, orig_df, presence=True) check_column_existence(boot_id_col, orig_df, presence=False) # Alias the observation id column obs_id_values = orig_df[obs_id_col].values # Check the validity of the resampled observation ids. ensure_resampled_obs_ids_in_df(resampled_obs_ids_1d, obs_id_values) # Initialize a list to store the component dataframes that will be # concatenated to form the final bootstrap_df component_dfs = [] # Populate component_dfs for boot_id, obs_id in enumerate(resampled_obs_ids_1d): # Extract the dataframe that we desire. extracted_df = groupby_dict[obs_id].copy() # Add the bootstrap id value. extracted_df[boot_id_col] = boot_id + 1 # Store the component dataframe component_dfs.append(extracted_df) # Create and return the desired dataframe. bootstrap_df = pd.concat(component_dfs, axis=0, ignore_index=True) return bootstrap_df