python source code of extreme_ensemble

# -*- coding: utf-8 -*-
"""
@author: Chenglong Chen <c.chenglong@gmail.com>
@brief: extreme ensemble selection module
@note: 
        - support multiprocessing (set multiprocessing = True and multiprocessing_num_cores = #cores)
        - support random weight in greedy forward model selection (set weight_opt_max_evals = 1)
"""

"""
Implement (extreme) ensemble selection with the following techniques described in [1][2]:
1. Ensemble Initialization
2. Bagged Ensemble Selection
3. Ensemble Selection with Replacement
4. Cross-Validated Ensemble Selection
5. Random Weight in Greedy Forward Model Selection

Reference
[1] Ensemble selection from libraries of models, R. Caruana, A. Niculescu-Mizil, G. Crew, and A. Ksikes.
[2] Getting the Most Out of Ensemble Selection, Rich Caruana, Art Munson, and Alexandru Niculescu-Mizil

"""

import csv
import time
from optparse import OptionParser

import scipy
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.utils.fixes import bincount
from sklearn.cross_validation import check_random_state
from sklearn.cross_validation import BaseShuffleSplit, ShuffleSplit, StratifiedShuffleSplit
# http://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined
# The solution for me was to add the following code in a place 
# that gets read before any other pylab/matplotlib/pyplot import:
import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import config
from utils import dist_utils, np_utils
from utils import logging_utils, os_utils, pkl_utils, time_utils
from get_stacking_feature_conf import get_model_list


splitter_level1 = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
splitter_level2 = pkl_utils._load("%s/splits_level2.pkl"%config.SPLIT_DIR)
splitter_level3 = pkl_utils._load("%s/splits_level3.pkl"%config.SPLIT_DIR)
assert len(splitter_level1) == len(splitter_level2)
assert len(splitter_level1) == len(splitter_level3)
n_iter = len(splitter_level1)


class StratifiedShuffleSplitReplacement(BaseShuffleSplit):

    def __init__(self, y, n_iter=10, test_size=0.1, train_size=None,
                 random_state=None):

        super(StratifiedShuffleSplitReplacement, self).__init__(
            len(y), n_iter, test_size, train_size, random_state)

        self.y = np.array(y)
        self.classes, self.y_indices = np.unique(y, return_inverse=True)
        n_cls = self.classes.shape[0]

    def _iter_indices(self):
        rng = np.random.RandomState(self.random_state)
        cls_count = bincount(self.y_indices)

        for n in range(self.n_iter):
            train = []
            test = []

            for i, cls in enumerate(self.classes):
                sample_size = int(cls_count[i]*(1-self.test_size))
                randint = rng.randint(cls_count[i], size=sample_size)
                aidx = np.where((self.y == cls))[0]
                iidx = aidx[randint]
                oidx = aidx[list(set(range(cls_count[i])).difference(set(randint)))]

                train.extend(iidx)
                test.extend(oidx)

            train = rng.permutation(train)
            test = rng.permutation(test)

            yield train, test

    def __repr__(self):
        return ('%s(labels=%s, n_iter=%d, test_size=%s, '
                'random_state=%s)' % (
                    self.__class__.__name__,
                    self.y,
                    self.n_iter,
                    str(self.test_size),
                    self.random_state,
                ))

    def __len__(self):
        return self.n_iter


class ExtremeEnsembleSelection:
    def __init__(self, model_folder, model_list, subm_prefix, 
                weight_opt_max_evals=10, w_min=-1., w_max=1., 
                inst_subsample=0.5, inst_subsample_replacement=False, 
                inst_splitter=None,
                model_subsample=1.0, model_subsample_replacement=True,
                bagging_size=10, init_top_k=5, epsilon=0.00001, 
                multiprocessing=False, multiprocessing_num_cores=1,
                enable_extreme=True, random_seed=0):

        self.model_folder = model_folder
        self.model_list = model_list
        self.subm_prefix = subm_prefix
        self.weight_opt_max_evals = weight_opt_max_evals
        self.w_min = w_min
        self.w_max = w_max
        assert inst_subsample > 0 and inst_subsample <= 1.
        self.inst_subsample = inst_subsample
        self.inst_subsample_replacement = inst_subsample_replacement
        self.inst_splitter = inst_splitter
        assert model_subsample > 0
        assert (type(model_subsample) == int) or (model_subsample <= 1.)
        self.model_subsample = model_subsample
        self.model_subsample_replacement = model_subsample_replacement
        self.bagging_size = bagging_size
        self.init_top_k = init_top_k
        self.epsilon = epsilon
        self.multiprocessing = multiprocessing
        self.multiprocessing_num_cores = multiprocessing_num_cores
        self.enable_extreme = enable_extreme
        self.random_seed = random_seed
        logname = "ensemble_selection_%s.log"%time_utils._timestamp()
        self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
        self.n_models = len(self.model_list)

    def _merge_pred(self, w1, p1, w2, p2):
        p = (w1 * p1 + w2 * p2) / (w1 + w2)
        # relevance is in [1,3]
        p = np.clip(p, 1., 3.)
        return p

    def _pick_random_models(self, sorted_models, seed):
        num_model = len(sorted_models)
        #### bagging for models
        rng = np.random.RandomState(seed)
        if type(self.model_subsample) == int:
            sample_size = min(num_model, self.model_subsample)
        else:
            sample_size = int(num_model * self.model_subsample)
        if self.model_subsample_replacement:
            index_base = rng.randint(num_model, size=sample_size)
        else:
            index_base = rng.permutation(num_model)[:sample_size]
        this_sorted_models = [sorted_models[i] for i in sorted(index_base)]
        return this_sorted_models

    def _ens_obj_generic(self, weight2, p1_list, weight1, p2_list, 
                        true_label_list, numBSTMatrix, bst_inst_idx):
        rmse_cv = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
        for run in range(config.N_RUNS):
            for fold in range(config.N_FOLDS):
                numBST = numBSTMatrix[run,fold]
                bidx = bst_inst_idx[run,fold,:numBST].tolist()
                p1 = p1_list[run,fold,bidx]
                p2 = p2_list[run,fold,bidx]
                true_label = true_label_list[run,fold,bidx]
                p_ens = self._merge_pred(weight1, p1, weight2, p2)
                rmse_cv[run,fold] = dist_utils._rmse(p_ens, true_label)
        rmse_mean = np.mean(rmse_cv)
        rmse_std = np.std(rmse_cv)
        return rmse_mean, rmse_std

    def _ens_obj_scipy(self, weight2, p1_list, weight1, p2_list, 
                        true_label_list, numBSTMatrix, bst_inst_idx):
        rmse_mean, rmse_std = self._ens_obj_generic(weight2, p1_list, weight1, p2_list, 
            true_label_list, numBSTMatrix, bst_inst_idx)
        return rmse_mean

    ## scipy.optimize.fmin for the best weight
    def _find_optim_weight_scipy(self, p_ens_list_valid_tmp, pred_list_valid, Y_list_valid, 
                                    numBSTMatrix, bst_inst_idx, w_ens, model_index_dict, model):
        this_p_list_valid = pred_list_valid[model_index_dict[model]]
        if w_ens == 0:
            this_w = 1.
            best_rmse_mean, best_rmse_std = self._ens_obj_generic(this_w, p_ens_list_valid_tmp, 
                0., this_p_list_valid, Y_list_valid, numBSTMatrix, bst_inst_idx)
        else:
            obj = lambda weight2: self._ens_obj_scipy(weight2, p_ens_list_valid_tmp, 
                1., this_p_list_valid, Y_list_valid, numBSTMatrix, bst_inst_idx)
            weight2_init = np.random.uniform(self.w_min, self.w_max, size=1)[0]
            xopt = scipy.optimize.fmin(obj, weight2_init, maxiter=self.weight_opt_max_evals, disp=False)
            this_w = xopt[0]
            best_rmse_mean, best_rmse_std = self._ens_obj_generic(this_w, p_ens_list_valid_tmp, 
                1., this_p_list_valid, Y_list_valid, numBSTMatrix, bst_inst_idx)
            this_w *= w_ens

        return best_rmse_mean, best_rmse_std, model, this_w

    def _ens_predict(self, best_bagged_model_list, best_bagged_model_weight):
        bagging_size = len(best_bagged_model_list)
        for bagging_iter in range(bagging_size):
            w_ens = 0.
            iter = 0
            for model,w in zip(best_bagged_model_list[bagging_iter], best_bagged_model_weight[bagging_iter]):
                iter += 1
                pred_file = "%s/All/test.pred.%s.csv" % (self.model_folder, model)
                df = pd.read_csv(pred_file, dtype=float)
                this_p_valid = df["prediction"].values
                this_p_valid = np.clip(this_p_valid, 1., 3.)
                if iter == 1:
                    p_ens_valid = np.zeros((this_p_valid.shape[0]),dtype=float)
                    id_test = df["id"].values.astype(int)
                p_ens_valid = self._merge_pred(w_ens, p_ens_valid, w, this_p_valid)
                w_ens += w
            
            if bagging_iter == 0:
                p_ens_score_bag = np.zeros((p_ens_valid.shape[0], bagging_size), dtype=float)

            p_ens_score_bag[:,bagging_iter] = p_ens_valid
        ##
        p_ens_score_bag = np.mean(p_ens_score_bag, axis=1)

        output = pd.DataFrame({"id": id_test, "relevance": p_ens_score_bag})
        return output

    def go(self):

        ## initialization
        pred_list_valid = np.zeros((self.n_models, config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
        Y_list_valid = np.zeros((config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
        numValidMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)
        p_ens_list_valid = np.zeros((config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)

        bst_inst_idx = np.zeros((config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
        numBSTMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)
        oob_inst_idx = np.zeros((config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
        numOOBMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)

        self.logger.info("Perform Extreme Ensemble Selection...")
        ## model index
        model_index_dict = dict(zip(self.model_list, range(self.n_models)))
        model_rmse_dict = dict(zip(self.model_list, [0]*self.n_models))
        self.logger.info("="*80)
        self.logger.info("Load model...")
        for model in self.model_list:
            self.logger.info("model: %s" % model)
            model_id = model_index_dict[model]
            rmse_cv = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
            ## load model
            for run in range(config.N_RUNS):
                for fold in range(config.N_FOLDS):
                    path = "%s/Run%d" % (self.model_folder, run+1)
                    pred_file = "%s/valid.pred.%s.csv" % (path, model)

                    this_p_valid = pd.read_csv(pred_file, dtype=float)
                    numValidMatrix[run,fold] = this_p_valid.shape[0]
                    numValid = numValidMatrix[run,fold]
                    this_target = this_p_valid["target"].values
                    this_p_valid = this_p_valid["prediction"].values
                    pred_list_valid[model_id,run,fold,:numValid] = np.clip(this_p_valid, 1., 3.)
                    Y_list_valid[run,fold,:numValid] = this_target

                    ##
                    rmse_cv[run,fold] = dist_utils._rmse(pred_list_valid[model_id,run,fold,:numValid], 
                                                            Y_list_valid[run,fold,:numValid])     

            self.logger.info("rmse: %.6f (%.6f)" % (np.mean(rmse_cv), np.std(rmse_cv)))
            model_rmse_dict[model] = (np.mean(rmse_cv), np.std(rmse_cv))
        self.logger.info("%d models in total." % self.n_models)

        sorted_models = sorted(model_rmse_dict.items(), key=lambda x: x[1][0])
            
        # greedy ensemble
        self.logger.info("="*80)
        best_bagged_model_list = [[]]*self.bagging_size
        best_bagged_model_weight = [[]]*self.bagging_size
        score_valid_bag_mean = np.nan * np.zeros((config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX, self.bagging_size), dtype=float)
        rmse_cv_mean_mean_lst = [0]*self.bagging_size
        rmse_cv_mean_std_lst = [0]*self.bagging_size
        for bagging_iter in range(self.bagging_size):
            seed_model = self.random_seed + 100 * bagging_iter
            if not self.enable_extreme:
                this_sorted_models = self._pick_random_models(sorted_models, seed_model)
            #### instance level subsampling
            for run in range(config.N_RUNS):
                for fold in range(config.N_FOLDS):
                    if self.inst_splitter is None:
                        # GENERAL APPROACH
                        seed_inst = self.random_seed + 1000 * bagging_iter + 100 * run + 10 * fold
                        rng_inst = np.random.RandomState(seed_inst)
                        numValid = numValidMatrix[run,fold]
                        if self.inst_subsample_replacement:
                            sss = StratifiedShuffleSplitReplacement(Y_list_valid[run,fold,:numValid], n_iter=1,
                                test_size=1.-self.inst_subsample, random_state=seed_inst)
                            iidx, oidx = list(sss)[0]
                        else:
                            if self.inst_subsample < 1:
                                # Stratified ShuffleSplit
                                sss = ShuffleSplit(len(Y_list_valid[run,fold,:numValid]), n_iter=1,
                                    test_size=1.-self.inst_subsample, random_state=seed_inst)
                                iidx, oidx = list(sss)[0]
                            elif self.inst_subsample == 1:
                                # set iidx (trianing) the same as oidx (validation)
                                iidx = np.arange(numValid)
                                oidx = np.arange(numValid)
                    else:
                        iidx, oidx = self.inst_splitter[run]
                    numBSTMatrix[run,fold] = len(iidx)
                    bst_inst_idx[run,fold,:numBSTMatrix[run,fold]] = iidx
                    numOOBMatrix[run,fold] = len(oidx)
                    oob_inst_idx[run,fold,:numOOBMatrix[run,fold]] = oidx

            #print this_model_list
            best_model_list = []
            best_model_weight = []
            best_model_rmse = []
            best_rmse = 0
            best_rmse_std = 0
            best_model = None
            p_ens_list_valid_tmp = np.zeros((config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
            #### Technique: Ensemble Initialization
            iter = 0
            w_ens, this_w = 0.0, 1.0
            if self.init_top_k > 0:
                # self.logger.info("** Ensemble Initialization **")
                # init_top_k = min(init_top_k, num_model)
                rmse_cv = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
                for cnt in range(self.init_top_k):
                    iter += 1
                    start = time.time()
                    seed_model = self.random_seed + 100 * bagging_iter + 10 * iter
                    if self.enable_extreme:
                        this_sorted_models = self._pick_random_models(sorted_models, seed_model)
                        best_model,(rmse,rmse_std) = this_sorted_models[0]
                    else:
                        best_model,(rmse,rmse_std) = this_sorted_models[cnt]
                    this_p_list_valid = pred_list_valid[model_index_dict[best_model]]
                    for run in range(config.N_RUNS):
                        for fold in range(config.N_FOLDS):
                            numValid = numValidMatrix[run,fold]
                            numBST = numBSTMatrix[run,fold]
                            bidx = bst_inst_idx[run,fold,:numBST].tolist()
                            p_ens_list_valid_tmp[run,fold,:numValid] = self._merge_pred(
                                w_ens, p_ens_list_valid_tmp[run,fold,:numValid], 
                                this_w, this_p_list_valid[run,fold,:numValid])
                            true_label = Y_list_valid[run,fold,bidx]
                            rmse_cv[run,fold] = dist_utils._rmse(p_ens_list_valid_tmp[run,fold,bidx], true_label)
                    end = time.time()
                    best_weight = this_w
                    best_rmse = np.mean(rmse_cv)
                    best_rmse_std = np.std(rmse_cv)

                    self.logger.info("Iter: %d (%.2fs)" % (iter, (end - start)))
                    self.logger.info("     model: %s" % best_model)
                    self.logger.info("     weight: %s" % best_weight)
                    self.logger.info("     rmse: %.6f (%.6f)" % (best_rmse, best_rmse_std))

                    best_model_list.append(best_model)
                    best_model_weight.append(best_weight)
                    w_ens += best_weight

            #### Technique: Ensemble Selection with Replacement
            while True:
                iter += 1
                seed_model = self.random_seed + 100 * bagging_iter + 10 * iter
                if self.enable_extreme:
                    this_sorted_models = self._pick_random_models(sorted_models, seed_model)
                if self.multiprocessing:
                    start = time.time()
                    models_tmp = [model for model,(_,_) in this_sorted_models]
                    best_trial_rmse_mean_lst, best_trial_rmse_std_lst, model_lst, this_w_lst = \
                        zip(*Parallel(n_jobs=self.multiprocessing_num_cores)(
                            delayed(self._find_optim_weight_scipy)(
                                p_ens_list_valid_tmp, pred_list_valid, Y_list_valid, numBSTMatrix, 
                                bst_inst_idx, w_ens, model_index_dict, m
                                ) for m in models_tmp
                            ))
                    ##
                    ind_best = np.argmin(best_trial_rmse_mean_lst)
                    best_trial_rmse_mean = best_trial_rmse_mean_lst[ind_best]
                    best_trial_rmse_std = best_trial_rmse_std_lst[ind_best]
                    model = model_lst[ind_best]
                    this_w = this_w_lst[ind_best]
                    if best_trial_rmse_mean < best_rmse:
                        best_rmse, best_rmse_std = best_trial_rmse_mean, best_trial_rmse_std
                        best_model, best_weight = model, this_w
                    end = time.time()
                else:
                    start = time.time()
                    for model,(_,_) in this_sorted_models:
                        best_trial_rmse_mean, best_trial_rmse_std, model, this_w = \
                            self._find_optim_weight_scipy(
                            p_ens_list_valid_tmp, pred_list_valid, Y_list_valid, numBSTMatrix, 
                            bst_inst_idx, w_ens, model_index_dict, model)
                        if best_trial_rmse_mean < best_rmse:
                            best_rmse, best_rmse_std = best_trial_rmse_mean, best_trial_rmse_std
                            best_model, best_weight = model, this_w
                    end = time.time()
                if best_model is None:
                    break
                if len(best_model_rmse) > 1 and (best_model_rmse[-1] - best_rmse < self.epsilon):
                    break

                ##
                self.logger.info("Iter: %d (%.2fs)" % (iter, (end - start)))
                self.logger.info("     model: %s" % best_model)
                self.logger.info("     weight: %s" % best_weight)
                self.logger.info("     rmse: %.6f (%.6f)" % (best_rmse, best_rmse_std))
                
                # valid
                this_p_list_valid = pred_list_valid[model_index_dict[best_model]]
                pred_raw_list = []
                true_label_list = []
                for run in range(config.N_RUNS):
                    for fold in range(config.N_FOLDS):
                        numValid = numValidMatrix[run,fold]
                        numBST = numBSTMatrix[run,fold]
                        bidx = bst_inst_idx[run,fold,:numBST].tolist()
                        p_ens_list_valid_tmp[run,fold,:numValid] = self._merge_pred(
                            w_ens, p_ens_list_valid_tmp[run,fold,:numValid], 
                            best_weight, this_p_list_valid[run,fold,:numValid])

                        pred_raw_list.append( p_ens_list_valid_tmp[run,fold,bidx] )
                        true_label_list.append( Y_list_valid[run,fold,bidx] )

                best_model_list.append(best_model)
                best_model_weight.append(best_weight)
                best_model_rmse.append(best_rmse)

                best_model = None
                w_ens += best_weight
            
            ## compute OOB score
            rmse_cv_mean = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
            for run in range(config.N_RUNS):
                for fold in range(config.N_FOLDS):
                    numValid = numValidMatrix[run,fold]
                    true_label = Y_list_valid[run,fold,:numValid]
                    numOOB = numOOBMatrix[run,fold]
                    oidx = oob_inst_idx[run,fold,:numOOB].tolist()                
                    pred_raw = p_ens_list_valid_tmp[run,fold,oidx]
                    ## mean
                    score_valid_bag_mean[run,fold,oidx,bagging_iter] = pred_raw
                    pred_mean = np_utils._array_mean(score_valid_bag_mean[run,fold,:numValid,:(bagging_iter+1)])
                    non_nan_idx = pred_mean != config.MISSING_VALUE_NUMERIC
                    rmse_cv_mean[run,fold] = dist_utils._rmse(pred_mean[non_nan_idx], true_label[non_nan_idx])
            self.logger.info("-"*80)
            self.logger.info( "Bag: %d"% (bagging_iter+1))
            self.logger.info( "rmse-mean: %.6f (%.6f)" % (np.mean(rmse_cv_mean), np.std(rmse_cv_mean)))
            self.logger.info("-"*80)

            best_bagged_model_list[bagging_iter] = best_model_list
            best_bagged_model_weight[bagging_iter] = best_model_weight

            ## save the current prediction
            mr = "R" + str(self.model_subsample_replacement).upper()[0]
            ir = "R" + str(self.inst_subsample_replacement).upper()[0]
            ## mean
            best_rmse_mean = np.mean(rmse_cv_mean)
            best_rmse_std = np.std(rmse_cv_mean)
            output = self._ens_predict(best_bagged_model_list[:(bagging_iter+1)], 
                best_bagged_model_weight[:(bagging_iter+1)])
            sub_file = "%s_[MS%.2f_%s]_[IS%.2f_%s]_[Top%d]_[Bag%d]_[Mean%.6f]_[Std%.6f].mean.csv" % (
                self.subm_prefix, self.model_subsample, mr, self.inst_subsample, ir, 
                self.init_top_k, bagging_iter+1, best_rmse_mean, best_rmse_std)
            output.to_csv(sub_file, index=False)
            rmse_cv_mean_mean_lst[bagging_iter] = best_rmse_mean
            rmse_cv_mean_std_lst[bagging_iter] = best_rmse_std

            ## plot OOB score
            x = np.arange(1,bagging_iter+2,1)
            label = "Mean (Best = %.6f, Bag = %d)"%(
                    np.min(rmse_cv_mean_mean_lst[:(bagging_iter+1)]), 
                    np.argmin(rmse_cv_mean_mean_lst[:(bagging_iter+1)])+1)
            plt.errorbar(x, rmse_cv_mean_mean_lst[:(bagging_iter+1)], 
                yerr=rmse_cv_mean_std_lst[:(bagging_iter+1)], 
                fmt='-o', label=label)
            plt.xlim(1, self.bagging_size)
            plt.title("Extreme Ensemble Selection RMSE")
            plt.xlabel("Bag")
            plt.ylabel("CV/OOB RMSE")
            plt.legend(loc="upper right")
            fig_file = "%s/ensemble_selection_%d.pdf"%(config.FIG_DIR, bagging_iter+1)
            plt.savefig(fig_file)
            plt.clf()


#--------------------------- Main ---------------------------
def main(options):

    # create sub folder
    subm_folder = "%s/ensemble_selection"%config.SUBM_DIR
    os_utils._create_dirs( [subm_folder] )
    subm_prefix = "%s/test.pred.[%s]" % (subm_folder, options.outfile)

    # get model list
    log_folder = "%s/level%d_models"%(config.LOG_DIR, options.level-1)
    model_list = get_model_list(log_folder, options.size)

    # get instance splitter
    if options.level not in [2, 3]:
        inst_splitter = None
    elif options.level == 2:
        inst_splitter = splitter_level2
    elif options.level == 3:
        inst_splitter = splitter_level3

    ees = ExtremeEnsembleSelection(
            model_folder=config.OUTPUT_DIR, 
            model_list=model_list, 
            subm_prefix=subm_prefix, 
            weight_opt_max_evals=options.weight_opt_max_evals, 
            w_min=-1., 
            w_max=1., 
            inst_subsample=options.inst_subsample,
            inst_subsample_replacement=options.inst_subsample_replacement,
            inst_splitter=inst_splitter,
            model_subsample=options.model_subsample,
            model_subsample_replacement=options.model_subsample_replacement,
            bagging_size=options.bagging_size, 
            init_top_k=options.init_top_k,
            epsilon=options.epsilon,
            multiprocessing=False, 
            multiprocessing_num_cores=config.NUM_CORES,
            enable_extreme=options.enable_extreme,
            random_seed=config.RANDOM_SEED
        )
    ees.go()


def parse_args(parser):
    parser.add_option("-l", "--level", type="int", dest="level", 
        default=2, help="level of base models")
    parser.add_option("-s", "--size", type="int", dest="size", 
        default=10, help="size of each model")
    parser.add_option("-o", "--outfile", type="string", dest="outfile",
        default="extreme_ensemble_selection", help="output model name")
    parser.add_option("-b", "--bag", type="int", dest="bagging_size",
        default=10, help="bagging_size")
    parser.add_option("-t", "--init", type="int", dest="init_top_k", 
        default=5, help="init_top_k")
    parser.add_option("-i", "--inst_subsample", type="float", dest="inst_subsample", 
        default=0.5, help="inst_subsample")
    parser.add_option("-I", "--inst_subsample_replacement", action="store_true", 
        dest="inst_subsample_replacement", 
        default=False, help="inst_subsample_replacement")
    parser.add_option("-m", "--model_subsample", type="float", dest="model_subsample", 
        default=1.0, help="model_subsample")
    parser.add_option("-M", "--model_subsample_replacement", action="store_true", 
        dest="model_subsample_replacement", 
        default=True, help="model_subsample_replacement")
    parser.add_option("-e", "--epsilon", type="float", dest="epsilon", 
        default=0.0001, help="epsilon")
    parser.add_option("-w", "--weight_opt_max_evals", type="int", dest="weight_opt_max_evals", 
        default=5, help="weight_opt_max_evals")
    parser.add_option("-x", action="store_true", dest="enable_extreme", 
        default=True, help="enable_extreme")
    (options, args) = parser.parse_args()
    return options, args


if __name__ == "__main__":

    parser = OptionParser()
    options, args = parse_args(parser)
    main(options)