# -*- encoding:utf-8 -*- """封装grid search相关操作模块""" from __future__ import division from __future__ import print_function from __future__ import absolute_import import logging import matplotlib.pyplot as plt import numpy as np from sklearn.base import ClassifierMixin, RegressorMixin from sklearn.metrics import explained_variance_score, make_scorer from ..CoreBu.ABuFixes import signature, six from ..CoreBu.ABuFixes import GridSearchCV __author__ = '阿布' __weixin__ = 'abu_quant' __all__ = [ 'grid_search_init_kwargs', 'grid_search_mul_init_kwargs', 'grid_search_init_n_estimators', 'grid_search_init_n_components', 'grid_search_init_max_depth', 'grid_search_init_n_neighbors' ] def _scoring_grid(estimator, scoring): """ 只针对有监督学习过滤无监督学习,对scoring未赋予的情况根据 学习器分类器使用accuracy进行度量,回归器使用可释方差值explained_variance_score, 使用make_scorer对函数进行score封装 :param estimator: 学习器对象 :param scoring: 度量使用的方法,未赋予的情况根据 学习器分类器使用accuracy进行度量,回归器使用explained_variance_score进行度量 :return: scoring """ if not isinstance(estimator, (ClassifierMixin, RegressorMixin)): logging.info('only support supervised learning') # TODO 无监督学习的scoring度量以及GridSearchCV return None if scoring is None: if isinstance(estimator, ClassifierMixin): # 分类器使用accuracy return 'accuracy' elif isinstance(estimator, RegressorMixin): # 回归器使用可释方差值explained_variance_score,使用make_scorer对函数进行score封装 """ make_scorer中通过greater_is_better对返回值进行正负分配 eg: sign = 1 if greater_is_better else -1 """ return make_scorer(explained_variance_score, greater_is_better=True) return None return scoring def grid_search_init_kwargs(estimator, x, y, param_name, param_range, cv=10, n_jobs=-1, scoring=None, show=True): """ 对GridSearchCV进行封装,对单个目标关键字参数进行grid search最优参数搜寻 eg:'n_estimators', 'max_depth' eg:param_range=np.arange(100, 500, 50))对最优参数进行寻找 eg: from abupy import AbuML, ml ttn_abu = AbuML.create_test_more_fiter() ttn_abu.estimator.random_forest_classifier() ml.grid_search_init_kwargs(ttn_abu.estimator.clf, ttn_abu.x, ttn_abu.y, param_name='n_estimators', param_range=np.arange(100, 500, 50)) 可找到n_estimators参数最优为:(0.81930415263748602, {'n_estimators': 300}) :param estimator: 学习器对象 :param x: 训练集x矩阵,numpy矩阵 :param y: 训练集y序列,numpy序列 :param param_name: 做为grid的目标关键字参数,eg:'n_estimators', 'max_depth' :param param_range: 做为grid的目标关键字参数的grid序列,eg:param_range=np.arange(100, 500, 50)) :param cv: int,GridSearchCV切割训练集测试集参数,默认10 :param n_jobs: 并行执行的进程任务数量,默认-1, 开启与cpu相同数量的进程数 :param scoring: 测试集的度量方法,默认为None, None的情况下分类器使用accuracy进行度量, 回归器使用可释方差值explained_variance_score,使用make_scorer对函数进行score封装 :param show: 是否进行可视化 :return: eg:(0.81930415263748602, {'n_estimators': 300}) """ if not isinstance(param_name, six.string_types): # param_name参数需要是字符串类型 logging.info('param_name is str, not {}, eg: \'n_estimators\''.format(param_name)) return None, None # 根据分类回归得到非None的score scoring = _scoring_grid(estimator, scoring) if scoring is None: # 如果_scoring_grid返回的结果仍然是None, 说明无监督学习,暂时不支持 return None, None # 获取学习器的init函数,使用getattr init = getattr(estimator.__class__.__init__, 'deprecated_original', estimator.__class__.__init__) # 获取函数签名 init_signature = signature(init) """ eg:init_signature ['self', 'base_estimator', 'n_estimators', 'max_samples', 'max_features', 'bootstrap', 'bootstrap_features', 'oob_score', 'warm_start', 'n_jobs', 'random_state', 'verbose'] """ if param_name not in init_signature.parameters.keys(): # 如果需要grid的参数param_name不在init函数签名中,打log,返回 logging.info('check init signature {} not in **kwargs\ninit_signature:{}'.format( param_name, init_signature.parameters.keys())) return None, None param_grid = {param_name: param_range} grid = GridSearchCV(estimator, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs) print('start grid search please wait...') grid.fit(x, y) if show: if hasattr(grid, 'cv_results_'): # 0.18之后的版本有cv_results_,拿出每一次的训练score的mean形成grid_scores cv_results = grid.cv_results_ grid_scores = cv_results['mean_test_score'] else: # 0.18之前的版本 cv_results = grid.grid_scores_ grid_scores = [result.mean_validation_score for result in cv_results] """ cv_results中每一个元素为_CVScoreTuple namedtuple对象,如下所示: class _CVScoreTuple (namedtuple('_CVScoreTuple', ('parameters', 'mean_validation_score', 'cv_validation_scores'))): """ # FIXME 这里假定了所有param_range的元素类型都是数值类型,需要判定,并且根据情况是否需要排序 plt.plot(param_range, grid_scores) # 把最好的红圈标记出来 plt.plot(grid.best_params_[param_name], grid.best_score_, 'ro', markersize=12, markeredgewidth=1.5, markerfacecolor='None', markeredgecolor='r') plt.title('search {}, best is {}'.format(param_name, grid.best_params_[param_name])) plt.show() return grid.best_score_, grid.best_params_ def grid_search_mul_init_kwargs(estimator, x, y, param_grid, cv=10, n_jobs=-1, scoring=None, show=True): """ 对GridSearchCV进行封装,对多个目标关键字参数进行grid search最优参数搜寻 eg: from abupy import AbuML, ml ttn_abu = AbuML.create_test_more_fiter() ttn_abu.estimator.random_forest_classifier() param_grid = {'max_depth': np.arange(2, 5), 'n_estimators': np.arange(100, 300, 50)} ml.grid_search_mul_init_kwargs(ttn_abu.estimator.clf, ttn_abu.x, ttn_abu.y, param_grid=param_grid) out: (0.81593714927048255, {'max_depth': 4, 'n_estimators': 250}) :param estimator: 学习器对象 :param x: 训练集x矩阵,numpy矩阵 :param y: 训练集y序列,numpy序列 :param param_grid: eg:param_grid = {'max_depth': np.arange(2, 5), 'n_estimators': np.arange(100, 300, 50)} :param cv: int,GridSearchCV切割训练集测试集参数,默认10 :param n_jobs: 并行执行的进程任务数量,默认-1, 开启与cpu相同数量的进程数 :param scoring: 测试集的度量方法,默认为None, None的情况下分类器使用accuracy进行度量, 回归器使用可释方差值explained_variance_score,使用make_scorer对函数进行score封装 :param show: 是否进行可视化 :return: eg: (0.81593714927048255, {'max_depth': 4, 'n_estimators': 250}) """ if not isinstance(param_grid, dict): # param_grid参数是dict对象 logging.info('param_grid is dict object, not {}}'.format(param_grid)) return None, None # 根据分类回归得到非None的score scoring = _scoring_grid(estimator, scoring) if scoring is None: # 如果_scoring_grid返回的结果仍然是None, 说明无监督学习,暂时不支持 return None, None for param_name in param_grid.keys(): # 迭代每一个key,即每一个关键字参数,看init方法中的签名是否存在该关键字 # 获取学习器的init函数,使用getattr init = getattr(estimator.__class__.__init__, 'deprecated_original', estimator.__class__.__init__) # 获取函数签名 init_signature = signature(init) if param_name not in init_signature.parameters.keys(): # 如果需要grid的参数param_name不在init函数签名中,打log,返回 logging.info('check init signature {} not in **kwargs\ninit_signature:{}'.format( param_name, init_signature.parameters.keys())) return None, None grid = GridSearchCV(estimator, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs) print('start grid search please wait...') grid.fit(x, y) if show: if hasattr(grid, 'cv_results_'): # 0.18之后的版本有cv_results_,拿出每一次的训练score的mean形成grid_scores cv_results = grid.cv_results_ grid_scores = cv_results['mean_test_score'] """ eg:grid_scores [0.77890011223344557, 0.79349046015712688, 0.77553310886644222, 0.77441077441077444, 0.80920314253647585, 0.80920314253647585, 0.80808080808080807, 0.81032547699214363, 0.80695847362514028, 0.81144781144781142, 0.80471380471380471, 0.81593714927048255] """ grid_params = cv_results['params'] """ eg:grid_params [{'max_depth': 2, 'n_estimators': 100}, {'max_depth': 2, 'n_estimators': 150}, {'max_depth': 2, 'n_estimators': 200}, {'max_depth': 2, 'n_estimators': 250}, {'max_depth': 3, 'n_estimators': 100}, {'max_depth': 3, 'n_estimators': 150}, {'max_depth': 3, 'n_estimators': 200}, {'max_depth': 3, 'n_estimators': 250}, {'max_depth': 4, 'n_estimators': 100}, {'max_depth': 4, 'n_estimators': 150}, {'max_depth': 4, 'n_estimators': 200}, {'max_depth': 4, 'n_estimators': 250}] """ else: cv_results = grid.grid_scores_ """ cv_results中每一个元素为_CVScoreTuple namedtuple对象,如下所示: class _CVScoreTuple (namedtuple('_CVScoreTuple', ('parameters', 'mean_validation_score', 'cv_validation_scores'))): """ grid_scores = [result.mean_validation_score for result in cv_results] grid_params = [result.parameters for result in cv_results] # 与grid_search_init_kwargs不同可视化grid_scores绘制曲线y,x只使用index plt.plot(grid_scores) cmap = plt.get_cmap('jet', len(grid_scores)) cmap.set_under('gray') for grid_index in np.arange(0, len(grid_scores)): # 迭代每一个分数,绘制点在曲线上根据分数用颜色区分,使用label进行标注 plt.scatter(grid_index, grid_scores[grid_index], s=50, cmap=cmap, vmin=np.min(grid_scores), vmax=np.max(grid_scores), label='{}: {:.2f}'.format(grid_params[grid_index], grid_scores[grid_index])) plt.title('best params is {}'.format(grid.best_params_)) # 将label标注文字绘制在外面 plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) plt.show() return grid.best_score_, grid.best_params_ def grid_search_init_n_estimators(estimator, x, y, n_estimators_range=None, cv=10, n_jobs=-1, scoring=None, show=True): """ 封装grid search特定的'n_estimators'关键字参数最优搜索, 为AbuMLCreater中_estimators_prarms_best提供callback函数, 具体阅读 AbuMLCreater._estimators_prarms_best() + AbuMLCreater.random_forest_classifier_best() eg: from abupy import AbuML, ml ttn_abu = AbuML.create_test_more_fiter() ttn_abu.estimator.random_forest_classifier() ml.grid_search_init_n_estimators(ttn_abu.estimator.clf, ttn_abu.x, ttn_abu.y) :param estimator: 学习器对象 :param x: 训练集x矩阵,numpy矩阵 :param y: 训练集y序列,numpy序列 :param n_estimators_range: 默认None, None则会使用 n_estimators_range = np.arange(50, 500, 10) :param cv: int,GridSearchCV切割训练集测试集参数,默认10 :param n_jobs: 并行执行的进程任务数量,默认-1, 开启与cpu相同数量的进程数 :param scoring: 测试集的度量方法,默认为None, None的情况下分类器使用accuracy进行度量, 回归器使用可释方差值explained_variance_score,使用make_scorer对函数进行score封装 :param show: 是否进行可视化 :return: eg: (0.82154882154882158, {'n_estimators': 310}) """ if n_estimators_range is None: n_estimators_range = np.arange(50, 500, 10) return grid_search_init_kwargs(estimator, x, y, 'n_estimators', n_estimators_range, cv=cv, n_jobs=n_jobs, scoring=scoring, show=show) def grid_search_init_max_depth(estimator, x, y, max_depth_range=None, cv=10, n_jobs=-1, scoring=None, show=True): """ 封装grid search特定的'n_components'关键字参数最优搜索, 为AbuMLCreater中_estimators_prarms_best提供callback函数 具体阅读 AbuMLCreater._estimators_prarms_best() + AbuMLCreater.decision_tree_classifier_best() :param estimator: 学习器对象 :param x: 训练集x矩阵,numpy矩阵 :param y: 训练集y序列,numpy序列 :param max_depth_range: 默认None, None则会使用: max_depth_range = np.arange(2, np.maximum(10, int(x.shape[1]) - 1), 1) :param cv: int,GridSearchCV切割训练集测试集参数,默认10 :param n_jobs: 并行执行的进程任务数量,默认-1, 开启与cpu相同数量的进程数 :param scoring: 测试集的度量方法,默认为None, None的情况下分类器使用accuracy进行度量, 回归器使用可释方差值explained_variance_score,使用make_scorer对函数进行score封装 :param show: 是否进行可视化 :return: eg: (0.82154882154882158, {'max_depth': 3}) """ if max_depth_range is None: max_depth_range = np.arange(2, np.maximum(10, int(x.shape[1]) - 1), 1) return grid_search_init_kwargs(estimator, x, y, 'max_depth', max_depth_range, cv=cv, n_jobs=n_jobs, scoring=scoring, show=show) def grid_search_init_n_neighbors(estimator, x, y, n_neighbors_range=None, cv=10, n_jobs=-1, scoring=None, show=True): """ 封装grid search特定的'n_components'关键字参数最优搜索, 为AbuMLCreater中_estimators_prarms_best提供callback函数 具体阅读 AbuMLCreater._estimators_prarms_best() + AbuMLCreater.knn_classifier_best() :param estimator: 学习器对象 :param x: 训练集x矩阵,numpy矩阵 :param y: 训练集y序列,numpy序列 :param n_neighbors_range: 默认None, None则会使用: n_estimators_range = np.arange(2, np.maximum(10, int(x.shape[1]) - 1), 1) :param cv: int,GridSearchCV切割训练集测试集参数,默认10 :param n_jobs: 并行执行的进程任务数量,默认-1, 开启与cpu相同数量的进程数 :param scoring: 测试集的度量方法,默认为None, None的情况下分类器使用accuracy进行度量, 回归器使用可释方差值explained_variance_score,使用make_scorer对函数进行score封装 :param show: 是否进行可视化 :return: eg: (0.82154882154882158, {'n_components': 10}) """ if n_neighbors_range is None: # 邻居投票者控制在1-np.minimum(26, 总数的1/3) n_neighbors_range = np.arange(1, np.minimum(26, int(x.shape[0] / 3)), 1) return grid_search_init_kwargs(estimator, x, y, 'n_neighbors', n_neighbors_range, cv=cv, n_jobs=n_jobs, scoring=scoring, show=show) def grid_search_init_n_components(estimator, x, y, n_components_range=None, cv=10, n_jobs=-1, scoring=None, show=True): """ 封装grid search特定的'n_components'关键字参数最优搜索, 为AbuMLCreater中_estimators_prarms_best提供callback函数, 具体阅读AbuMLCreater._estimators_prarms_best() :param estimator: 学习器对象 :param x: 训练集x矩阵,numpy矩阵 :param y: 训练集y序列,numpy序列 :param n_components_range: 默认None, None则会使用: n_estimators_range = np.arange(2, np.maximum(10, int(x.shape[1]) - 1), 1) :param cv: int,GridSearchCV切割训练集测试集参数,默认10 :param n_jobs: 并行执行的进程任务数量,默认-1, 开启与cpu相同数量的进程数 :param scoring: 测试集的度量方法,默认为None, None的情况下分类器使用accuracy进行度量,回归器使用 回归器使用可释方差值explained_variance_score,使用make_scorer对函数进行score封装 :param show: 是否进行可视化 :return: eg: (0.82154882154882158, {'n_components': 10}) """ if n_components_range is None: n_components_range = np.arange(2, np.maximum(10, int(x.shape[1]) - 1), 1) return grid_search_init_kwargs(estimator, x, y, 'n_components', n_components_range, cv=cv, n_jobs=n_jobs, scoring=scoring, show=show)