# -*- coding:utf-8 -*-
# &Author  AnFany
"""Grid-search a GBDT regressor over K-fold data and plot the results.

For every (number of trees, tree depth) combination the model is trained on
each fold, the validation MSE is recorded, and the combination with the
lowest mean validation MSE is selected.  Two figures are then written: the
per-fold MSE curves of every combination, and a comparison of true versus
predicted values on the prediction data for the selected combination.
"""

# Load the data set
import pm25_GBDT_Data as data

# GBDT regression model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as mse
import numpy as np

# Plotting of the MSE comparison curves for the different parameters
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['FangSong']  # render Chinese text
mpl.rcParams['axes.unicode_minus'] = False  # render the minus sign
import matplotlib.pyplot as plt

# A good parameter combination is chosen from the K-fold results, then the
# true and predicted values of the prediction data are compared.
# Two kinds of parameters are tuned: those of the boosting framework and
# those of the weak learner.

# Depths of the trees used as weak learners
cengs = [16, 17, 18]

# Numbers of weak learners
models = [55, 56, 57]

# Hyper-parameters shared by every model built in this file
_LEARNING_RATE = 0.12
_SUBSAMPLE = 0.8


def _build_model(modelcount, censhu):
    """Return an unfitted GBDT regressor with the file's fixed settings.

    Args:
        modelcount: number of boosting stages (``n_estimators``).
        censhu: maximum depth of each tree (``max_depth``).
    """
    # ``loss`` is deliberately left at its default, which is least squares.
    # The former explicit spelling 'ls' was removed in scikit-learn 1.2,
    # while the new spelling 'squared_error' is unknown to releases before
    # 1.0; relying on the default selects the same loss on all of them.
    return GradientBoostingRegressor(n_estimators=modelcount,
                                     max_depth=censhu,
                                     learning_rate=_LEARNING_RATE,
                                     subsample=_SUBSAMPLE)


# Training function
def Train(data, modelcount, censhu, yanzhgdata):
    """Fit one model and return its training and validation MSE.

    Args:
        data: 2-D array of training rows; the last column is the target and
            the remaining columns are the features.  (The parameter name
            shadows the imported data module inside this function.)
        modelcount: number of weak learners.
        censhu: maximum depth of each weak learner.
        yanzhgdata: 2-D array of validation rows with the same column layout.

    Returns:
        Tuple ``(train_mse, add_mse)``: MSE on the training rows and MSE on
        the validation rows.
    """
    model = _build_model(modelcount, censhu)
    model.fit(data[:, :-1], data[:, -1])

    # MSE of the predictions on the training data
    train_mse = mse(data[:, -1], model.predict(data[:, :-1]))

    # MSE of the predictions on the validation data
    add_mse = mse(yanzhgdata[:, -1], model.predict(yanzhgdata[:, :-1]))

    print(train_mse, add_mse)
    return train_mse, add_mse


# Function that finally settles on a combination
def Zuhe(datadict, tre=models, tezhen=cengs):
    """Select the parameter combination with the lowest mean validation MSE.

    Args:
        datadict: mapping from fold key to a dict holding the arrays
            ``'train'`` and ``'test'`` for that fold.
        tre: candidate numbers of weak learners (only read, never mutated).
        tezhen: candidate tree depths (only read, never mutated).

    Returns:
        Tuple ``(zuixao, xiao, sacelist)`` where ``zuixao`` is the best
        combination as the string ``'<trees>-<depth>'``, ``xiao`` is the key
        of the fold with the smallest validation MSE for that combination,
        and ``sacelist`` maps every combination string to its list of
        per-fold validation MSEs (in sorted fold-key order).

    Raises:
        ValueError: if ``datadict`` is empty.
    """
    if not datadict:
        raise ValueError('datadict must contain at least one fold')

    # Folds are displayed by number later on, so visit them in sorted order.
    # The order does not depend on the parameters, hence computed only once.
    ordelist = sorted(datadict)

    # Mean validation MSE per combination
    savedict = {}
    # Per-fold validation MSE sequence per combination
    sacelist = {}
    for t in tre:
        for te in tezhen:
            print(t, te)
            sumlist = []
            for jj in ordelist:
                # The best combination is decided by the validation error
                # only; the training error is not needed here.
                _, ya = Train(datadict[jj]['train'], t, te,
                              datadict[jj]['test'])
                sumlist.append(ya)
            zuhe_key = '%s-%s' % (t, te)
            sacelist[zuhe_key] = sumlist
            savedict[zuhe_key] = np.mean(np.array(sumlist))

    # Combination with the smallest mean validation MSE (first one on ties)
    zuixao = min(savedict, key=savedict.get)

    # Fold with the smallest validation MSE for that combination.  The fold
    # KEY is returned rather than its position in the sorted list, because
    # the value is later used to index datadict; both coincide when the keys
    # are 0..k-1.
    best_errors = sacelist[zuixao]
    xiao = ordelist[best_errors.index(min(best_errors))]
    return zuixao, xiao, sacelist


# Draw the curves stored in the dictionary
def duibi(exdict, you, savepath=r'C:\Users\GWT9\Desktop\GBDT_pm25.jpg'):
    """Plot the per-fold validation MSE of every parameter combination.

    Args:
        exdict: mapping from combination string to its list of per-fold MSEs.
        you: the best combination, shown in the figure title.
        savepath: file the figure is written to; defaults to the path that
            was previously hard-coded.

    Returns:
        A fixed status string.
    """
    plt.figure(figsize=(11, 7))
    for ii, errors in exdict.items():
        plt.plot(list(range(len(errors))), errors,
                 label='%s,%d折MSE均值:%.3f'
                       % (ii, len(errors), np.mean(np.array(errors))),
                 lw=2)
    plt.legend()
    plt.title('不同参数的组合MSE对比曲线[最优:%s]' % you)
    plt.savefig(savepath)
    return '不同方法对比完毕'


# Compare true and predicted values for the best parameter combination
def recspre(exstr, predata, datadict, zhe, count=100,
            savepath=r'C:\Users\GWT9\Desktop\duibi.jpg'):
    """Refit the best model on one fold and plot true versus predicted values.

    Args:
        exstr: best combination as the string ``'<trees>-<depth>'``.
        predata: 2-D array of prediction rows; the last column is the true
            target and the remaining columns are the features.
        datadict: mapping from fold key to a dict with a ``'train'`` array.
        zhe: key of the fold whose training data is used for fitting.
        count: number of randomly chosen rows to display; clamped to the
            number of available rows.
        savepath: file the figure is written to; defaults to the path that
            was previously hard-coded.

    Returns:
        A fixed status string.
    """
    tree, te = exstr.split('-')
    model = _build_model(int(tree), int(te))
    train = datadict[zhe]['train']
    model.fit(train[:, :-1], train[:, -1])

    # Predict
    yucede = model.predict(predata[:, :-1])

    # Only a random sample is displayed to keep the figure readable.
    # Sampling without replacement cannot draw more rows than exist, so the
    # requested count is limited to the number of predictions.
    count = min(count, len(yucede))
    randomnum = np.random.choice(np.arange(len(yucede)), count, replace=False)
    yucede_se = list(np.array(yucede)[randomnum])
    yuce_re = list(np.array(predata[:, -1])[randomnum])
    positions = list(range(count))

    # Upper panel: predicted curve against the true points
    plt.figure(figsize=(17, 9))
    plt.subplot(2, 1, 1)
    plt.plot(positions, yucede_se, 'r--', label='预测', lw=2)
    plt.scatter(positions, yuce_re, c='b', marker='.', label='真实', lw=2)
    plt.xlim(-1, count + 1)
    plt.legend()
    plt.title('预测和真实值对比[最大树数%d]' % int(tree))

    # Lower panel: difference between true and predicted values
    plt.subplot(2, 1, 2)
    plt.plot(positions, np.array(yuce_re) - np.array(yucede_se), 'k--',
             marker='s', label='真实-预测', lw=2)
    plt.legend()
    plt.title('预测和真实值相对误差')

    plt.savefig(savepath)
    return '预测真实对比完毕'


# Main entry point
if __name__ == "__main__":
    zijian, zhehsu, xulie = Zuhe(data.dt_data)

    duibi(xulie, zijian)
    recspre(zijian, data.predict_data, data.dt_data, zhehsu)