python source code of tune

# Jeff Heaton's Kaggle Utilities
# Copyright 2019 by Jeff Heaton, Open Source, Released under the Apache License
# For more information: https://github.com/jeffheaton/jh-kaggle-util
# 
# Tune for XGBoost, based on: https://towardsdatascience.com/fine-tuning-xgboost-in-python-like-a-boss-b4543ed8b1e
#

from util import *
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
import multiprocessing
import itertools
import demjson
import time

import pandas as pd
import numpy as np
import xgboost as xgb
import time
import os
import zipfile
import operator
from sklearn.metrics import log_loss
import scipy
from train_xgboost import TrainXGBoost

# http://stackoverflow.com/questions/2853212/all-possible-permutations-of-a-set-of-lists-in-python

FOLDS = 5
EARLY_STOP = 50
MAX_ROUNDS = 5

PARAMS1 = {
'objective': 'reg:linear',
    'eval_metric': 'rmse',
'silent' : 1,
    'learning_rate':0.0045,'seed':4242
}

train = TrainXGBoost("1",params=PARAMS1,run_single_fold=True)

def modelfit(params,x,y):
    #Fit the algorithm on the data
    print("fit")
    alg = XGBClassifier(**params)
    alg.fit(x,y,verbose=True)
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)

def update(base_dict, update_copy):
    for key in update_copy.keys():
        base_dict[key] = update_copy[key]

def step1_find_depth_and_child(params):
    test1 = {
        'max_depth': list(range(3,12,2)),
        'min_child_weight': list(range(1,10,2))
    }

    return grid_search(params, test1, 1)

def step2_narrow_depth(params):
    max_depth = params['max_depth']
    test2 = {
        'max_depth': [max_depth-1,max_depth,max_depth+1]
    }
    return grid_search(params, test2, 2)

def step3_gamma(params):
    test3 = {
        'gamma': list([i/10.0 for i in range(0,5)])
    }
    return grid_search(params, test3, 3)

def step4_sample(params):
    test4 = {
        'subsample':list([i/10.0 for i in range(6,10)]),
        'colsample_bytree':list([i/10.0 for i in range(6,10)])
    }
    return grid_search(params, test4, 4)

def step5_reg1(params):
    test5 = {
        'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
    }
    return grid_search(params, test5, 5)




def grid_search(params,grid,num):
    keys = set(grid.keys())
    l = [grid[x] for x in keys]
    perm = list(itertools.product(*l))
    jobs = []
    for i in perm:
        jobs.append({k:v for k,v in zip(keys,i)})

    print("Total number of jobs: {}".format(len(jobs)))
    column_step = []
    column_score = []
    column_jobs = []

    for i,job in enumerate(jobs):
        print("** Starting job: {}:{}/{}".format(num,i+1,len(jobs)))
        params2 = dict(params)
        update(params2,job)
        train.params = params2
        train.rounds = MAX_ROUNDS
        train.early_stop = EARLY_STOP
        result = train.run_cv()
        print("Result: {}".format(result))
        column_jobs.append(str(job))
        column_score.append(result[0])
        column_step.append(result[1])

    df = pd.DataFrame({'job':column_jobs,'step':column_step,'score':column_score},columns=['job','score','step'])
    df.sort_values(by=['score'],ascending=[True],inplace=True)
    print(df)
    path_tune = os.path.join(PATH, "tune-{}.csv".format(num))
    df.to_csv(path_tune, index=False)
    j = df.iloc[0]['job']
    j = demjson.decode(j)
    return j


def main():
    print("loading")
    start_time = time.time()
    train._run_startup()
    params = dict(PARAMS1)

    # Step 1
    step1_result = step1_find_depth_and_child(params)
    #step1_result = {'max_depth': 3, 'min_child_weight': 7}
    update(params,step1_result)
    print("Step 1 Result: {}".format(step1_result))

    # Step 2
    step2_result = step2_narrow_depth(params)
    #step2_result = {'max_depth': 4}
    update(params, step2_result)
    print("Step 2 Result: {}".format(step2_result))

    # Step 3
    step3_result = step3_gamma(params)
    #step3_result = {'gamma': 0.4}
    update(params, step3_result)
    print("Step 3 Result: {}".format(step3_result))

    # Step 4
    step4_result = step4_sample(params)
    #step4_result = {'colsample_bytree': 0.9, 'subsample': 0.8}
    update(params, step4_result)
    print("Step 4 Result: {}".format(step4_result))

    # Step 5
    step5_result = step5_reg1(params)
    #step5_result = {'colsample_bytree': 0.9, 'subsample': 0.8}
    update(params, step5_result)
    print("Step 5 Result: {}".format(step5_result))

    print("Final Result: {}".format(params))
    elapsed_time = time.time() - start_time
    print("Elapsed time: {}".format(hms_string(elapsed_time)))





if __name__ == '__main__':
    main()