import itertools
import pandas as pd
import numpy as np
import copy
import traceback

# do not remove: the following imports are required for the globals() lookups below
from cde.density_estimator import LSConditionalDensityEstimation, KernelMixtureNetwork, MixtureDensityNetwork, \
  ConditionalKernelDensityEstimation, NeighborKernelDensityEstimation, NormalizingFlowEstimator
from cde.density_simulation import EconDensity, GaussianMixture, ArmaJump, JumpDiffusionModel, SkewNormal, \
  LinearGaussian, LinearStudentT
from cde.model_fitting.GoodnessOfFitLogProb import GoodnessOfFitLogProb
from cde.model_fitting.GoodnessOfFitResults import GoodnessOfFitResults
from cde.model_fitting.ConfigRunner import load_dumped_estimators, _hash_task_dict, _create_configurations, \
  _add_seeds_to_sim_params
from cde.utils import io
from cde.utils.async_executor import AsyncExecutor
from ml_logger import logger

import tensorflow as tf
import os
import config
import time

EXP_CONFIG_FILE = 'exp_configs.pkl'
RESULTS_FILE = 'results.pkl'


class ConfigRunnerLogProb:
  """
  Runs all combinations of the configured estimators and simulators, fits the estimators
  and evaluates them with a log-probability goodness-of-fit score.

  Args:
    exp_prefix: (str) prefix of the experiment configuration
    est_params: dict containing the estimator parametrization
                example: {'KernelMixtureNetwork': {'center_sampling_method': ["k_means"], 'n_centers': [20], ...},
                          'MixtureDensityNetwork': {...}}
    sim_params: dict containing the simulator parametrization
                example: {'EconDensity': {'std': [1], 'heteroscedastic': [True]},
                          'GaussianMixture': {...}}
    observations: either an array-like or a scalar value that defines the number of observations from the
                  simulation model that are used to train the estimators
    keys_of_interest: list of strings, each representing a column in the dataframe / csv export
    n_test_samples: (int) number of samples used to compute the test score
    n_seeds: (int) number of different seeds for sampling the data
    use_gpu: (bool) whether to use the GPU for fitting the estimators
  """

  def __init__(self, exp_prefix, est_params, sim_params, observations, keys_of_interest,
               n_test_samples=10 ** 5, n_seeds=5, use_gpu=True):

    assert est_params and exp_prefix and sim_params and keys_of_interest
    assert np.all(observations)  # np.all also handles scalar values, unlike ndarray.all()

    # convert dicts to lists of tuples
    if isinstance(est_params, dict):
      est_params = list(est_params.items())
    if isinstance(sim_params, dict):
      sim_params = list(sim_params.items())

    # every simulator configuration will be run multiple times with different randomness seeds
    sim_params = _add_seeds_to_sim_params(n_seeds, sim_params)

    self.observations = observations
    self.n_test_samples = n_test_samples
    self.keys_of_interest = keys_of_interest
    self.exp_prefix = exp_prefix
    self.use_gpu = use_gpu

    logger.configure(log_directory=config.DATA_DIR, prefix=exp_prefix, color='green')

    ''' ---------- Either load or generate the configs ---------- '''
    config_pkl_path = os.path.join(logger.log_directory, logger.prefix, EXP_CONFIG_FILE)

    if os.path.isfile(config_pkl_path):
      logger.log("{:<70s} {:<30s}".format("Loading previous experiment configs from file:", config_pkl_path))
      self.configs = logger.load_pkl(EXP_CONFIG_FILE)
    else:
      logger.log("{:<70s} {:<30s}".format("Generating and storing experiment configs under:", config_pkl_path))
      self.configs = self._generate_configuration_variants(est_params, sim_params)
      logger.dump_pkl(data=self.configs, path=EXP_CONFIG_FILE)

    ''' ---------- Either load already existing results or start a new result collection ---------- '''
    results_pkl_path = os.path.join(logger.log_directory, logger.prefix, RESULTS_FILE)

    if os.path.isfile(results_pkl_path):
      logger.log_line("{:<70s} {:<30s}".format("Continuing with:", results_pkl_path))
      self.gof_single_res_collection = dict(logger.load_pkl_log(RESULTS_FILE))
    else:  # start from scratch
      self.gof_single_res_collection = {}

    self.gof_results = GoodnessOfFitResults(self.gof_single_res_collection)
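  # For orientation, the resulting on-disk layout under config.DATA_DIR/<exp_prefix>
  # (assuming ml_logger resolves paths relative to its configured directory) is:
  #   exp_configs.pkl  - pickled list of task dicts, generated once and reused on restarts
  #   results.pkl      - incrementally appended (task_hash, result) records
  #   model_dumps/     - pickled fitted estimators and simulators, if dump_models=True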
  def _generate_configuration_variants(self, est_params, sim_params):
    """
    Creates all possible combinations of the configured estimators and simulators.

    Args:
      est_params: estimator parameters as dict with two levels
      sim_params: density simulator parameters as dict with two levels

    Returns:
      a list of k task dicts, where k = n * m * o is the size of the Cartesian product of the
      n estimator configurations, the m simulator configurations and the o elements of the
      observations list (o = 1 if observations is a scalar)
    """
    self.est_configs = _create_configurations(est_params)
    self.sim_configs = _create_configurations(sim_params)

    if np.isscalar(self.observations):
      self.observations = [self.observations]

    configs = []
    configured_sims = []

    # since simulator configurations of the same kind require the same X, Y and x_cond,
    # they have to be generated separately from the estimators
    for simulator_name, sim_config_variants in self.sim_configs.items():
      for sim_config in sim_config_variants:
        sim = globals()[simulator_name](**sim_config)

        n_obs_max = max(self.observations)
        X_max, Y_max = sim.simulate(n_obs_max)
        X_max, Y_max = sim._handle_input_dimensionality(X_max, Y_max)

        X_test, Y_test = sim.simulate(self.n_test_samples)

        for obs in self.observations:
          X, Y = X_max[:obs], Y_max[:obs]
          configured_sims.append({"simulator_name": simulator_name, "simulator_config": sim_config,
                                  "n_obs": obs, "X": X, "Y": Y, "X_test": X_test, "Y_test": Y_test})

    # merge simulator variants together with estimator variants
    task_number = 0
    for sim_dict in configured_sims:
      for estimator_name, est_config_variants in self.est_configs.items():
        for est_config in est_config_variants:
          task_dict = copy.deepcopy(sim_dict)
          task_dict['estimator_name'] = estimator_name
          task_dict['estimator_config'] = est_config
          task_dict['task_name'] = '%s_task_%i' % (estimator_name, task_number)
          configs.append(task_dict)
          task_number += 1

    return configs
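  # A single generated task dict has roughly the following shape (illustrative values only;
  # the concrete configs depend on est_params/sim_params, and the seed key is injected by
  # _add_seeds_to_sim_params):
  #
  #   {'simulator_name': 'EconDensity',
  #    'simulator_config': {'std': 1, 'heteroscedastic': True, <seed key>: 23},
  #    'n_obs': 1000,
  #    'X': np.ndarray, 'Y': np.ndarray, 'X_test': np.ndarray, 'Y_test': np.ndarray,
  #    'estimator_name': 'MixtureDensityNetwork',
  #    'estimator_config': {'n_centers': 20},
  #    'task_name': 'MixtureDensityNetwork_task_0'}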
  def run_configurations(self, dump_models=False, multiprocessing=True, n_workers=None):
    """
    Runs the given configurations, i.e. 1) fits the estimator to the simulated training data and
    2) computes the goodness-of-fit score (log-probability on the test data).

    Every successful run yields a result object of type GoodnessOfFitSingleResult, carrying
    information about the estimator and simulator hyperparameters as well as n_obs and the
    computed score. Results are not returned; they are appended to the results pickle
    (RESULTS_FILE) as (task_hash, result) records. Tasks whose hash is already present in the
    results pickle are skipped, so interrupted runs can be resumed.

    Args:
      dump_models: (bool) whether to save/dump the fitted estimators
      multiprocessing: (bool) whether to run the tasks in parallel via an AsyncExecutor
      n_workers: (int) number of parallel workers if multiprocessing is enabled
    """
    self.dump_models = dump_models

    ''' Asserts '''
    assert len(self.configs) > 0

    tasks = self.configs

    ''' Run the configurations '''
    logger.log("{:<70s} {:<30s}".format("Number of total tasks in pipeline:", str(len(self.configs))))
    logger.log("{:<70s} {:<30s}".format("Number of already finished tasks (found in results pickle):",
                                        str(len(self.gof_single_res_collection))))

    iters = range(len(tasks))
    if multiprocessing:
      executor = AsyncExecutor(n_jobs=n_workers)
      executor.run(self._run_single_task, iters, tasks)
    else:
      for i, task in zip(iters, tasks):
        self._run_single_task(i, task)

  def _run_single_task(self, i, task):
    start_time = time.time()
    try:
      task_hash = _hash_task_dict(task)  # generate SHA256 hash of task dict as identifier

      # skip task if it has already been completed
      if task_hash in self.gof_single_res_collection.keys():
        logger.log("Task {:<1} {:<63} {:<10} {:<1} {:<1} {:<1}".format(
          i + 1, "has already been completed:", "Estimator:", task['estimator_name'],
          " Simulator: ", task["simulator_name"]))
        return None

      # run task if it has not been completed yet
      else:
        logger.log("Task {:<1} {:<63} {:<10} {:<1} {:<1} {:<1}".format(
          i + 1, "running:", "Estimator:", task['estimator_name'],
          " Simulator: ", task["simulator_name"]))

        tf.reset_default_graph()

        ''' build simulator and estimator model given the specified configurations '''
        simulator = globals()[task['simulator_name']](**task['simulator_config'])

        t = time.time()
        estimator = globals()[task['estimator_name']](task['task_name'], simulator.ndim_x,
                                                      simulator.ndim_y, **task['estimator_config'])
        time_to_initialize = time.time() - t

        # if desired, hide gpu devices
        if not self.use_gpu:
          os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

        with tf.Session() as sess:
          sess.run(tf.global_variables_initializer())

          ''' train the model '''
          gof = GoodnessOfFitLogProb(estimator=estimator, probabilistic_model=simulator,
                                     X_train=task['X'], Y_train=task['Y'],
                                     X_test=task['X_test'], Y_test=task['Y_test'],
                                     task_name=task['task_name'])

          t = time.time()
          gof.fit_estimator(print_fit_result=True)
          time_to_fit = time.time() - t

          if self.dump_models:
            logger.dump_pkl(data=gof.estimator, path="model_dumps/{}.pkl".format(task['task_name']))
            logger.dump_pkl(data=gof.probabilistic_model,
                            path="model_dumps/{}.pkl".format(task['task_name'] + "_simulator"))

          ''' perform tests with the fitted model '''
          t = time.time()
          gof_results = gof.compute_results()
          time_to_evaluate = time.time() - t

          gof_results.task_name = task['task_name']
          gof_results.hash = task_hash

        logger.log_pkl(data=(task_hash, gof_results), path=RESULTS_FILE)
        logger.flush(file_name=RESULTS_FILE)
        del gof_results

        task_duration = time.time() - start_time
        logger.log(
          "Finished task {:<1} in {:<1.4f} {:<43} {:<10} {:<1} {:<1} {:<2} | {:<1} {:<1.2f} {:<1} {:<1.2f} {:<1} {:<1.2f}".format(
            i + 1, task_duration, "sec:", "Estimator:", task['estimator_name'], " Simulator: ",
            task["simulator_name"], "t_init:", time_to_initialize, "t_fit:", time_to_fit,
            "t_eval:", time_to_evaluate))

    except Exception as e:
      logger.log("error in task: ", str(i + 1))
      logger.log(str(e))
      traceback.print_exc()

  def _dump_current_state(self):
    # Note: relies on self.results_pickle_path, which is set in _setup_file_names
    # if self.export_csv:
    #   self._export_results(task=task, gof_result=gof_single_result, file_handle_results=self.file_handle_results_csv)
    # if self.export_pickle:
    with open(self.results_pickle_path, "wb") as f:
      intermediate_gof_results = GoodnessOfFitResults(single_results_dict=self.gof_single_res_collection)
      io.dump_as_pickle(f, intermediate_gof_results, verbose=False)
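  # To inspect finished results outside of the runner (illustrative sketch; it assumes the
  # same ml_logger configuration that __init__ sets up):
  #
  #   logger.configure(log_directory=config.DATA_DIR, prefix='<exp_prefix>')
  #   results = dict(logger.load_pkl_log(RESULTS_FILE))  # maps task_hash -> result object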
  def _get_results_dataframe(self, results):
    """
    Builds the dataframe for one or more GoodnessOfFitResults objects.

    Args:
      results: a list or single object of type GoodnessOfFitResults

    Returns:
      a pandas dataframe
    """
    n_results = len(results)
    assert n_results > 0, "no results given"

    results_dict = results.report_dict(keys_of_interest=self.keys_of_interest)
    return pd.DataFrame.from_dict(data=results_dict)

  def _export_results(self, task, gof_result, file_handle_results):
    assert len(gof_result) > 0, "no results given"

    # write result to file
    try:
      gof_result_df = self._get_results_dataframe(results=gof_result)
      gof_result.result_df = gof_result_df
      io.append_result_to_csv(file_handle_results, gof_result_df)
    except Exception as e:
      print("appending to file was not successful for task: ", task)
      print(str(e))
      traceback.print_exc()

  def _setup_file_names(self):
    # Note: expects self.prefix_filename, self.export_pickle, self.export_csv,
    # self.results_pickle_file, self.output_dir and self.result_file_name to be set beforehand
    if self.prefix_filename is not None:
      self.result_file_name = self.prefix_filename + "_" + self.result_file_name + "_"

    if self.export_pickle:
      if self.results_pickle_file:  # continue with old file
        self.results_pickle_path = self.results_pickle_file
      else:  # new file name
        self.results_pickle_path = io.get_full_path(output_dir=self.output_dir, suffix=".pickle",
                                                    file_name=self.result_file_name)

    if self.export_csv:
      if self.results_pickle_file:
        self.results_csv_path = self.results_pickle_file.replace("pickle", "csv")
      else:
        self.results_csv_path = io.get_full_path(output_dir=self.output_dir, suffix=".csv",
                                                 file_name=self.result_file_name)
      self.file_handle_results_csv = open(self.results_csv_path, "a+")

    if self.dump_models:
      self.model_dump_dir = os.path.join(self.output_dir, 'model_dumps')
      if not os.path.exists(self.model_dump_dir):
        os.makedirs(self.model_dump_dir)
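

if __name__ == '__main__':
  # Minimal usage sketch (illustrative only): the estimator/simulator names must match the
  # classes imported above; the hyperparameter values, the exp_prefix and the keys_of_interest
  # below are placeholders, not a recommended or verified configuration.
  est_params = {'MixtureDensityNetwork': {'n_centers': [10, 20]}}
  sim_params = {'EconDensity': {'std': [1], 'heteroscedastic': [True]}}

  runner = ConfigRunnerLogProb(exp_prefix='log_prob_benchmark',
                               est_params=est_params,
                               sim_params=sim_params,
                               observations=np.array([100, 1000]),
                               keys_of_interest=['estimator', 'simulator', 'n_observations', 'score'],
                               n_seeds=3)

  # multiprocessing=False runs the tasks sequentially, which is easier to debug
  runner.run_configurations(dump_models=False, multiprocessing=False)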