import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import ui
import scipy.io as sio
import scipy.optimize as opt
import seaborn as sns

def load_data():
    """Load the ex5 dataset and return all six arrays as flat 1-D vectors.

    d['X'] shape = (12, 1)
    pandas has trouble taking this 2d ndarray to construct a dataframe, so I
    ravel the results.

    NOTE(review): the original fragment was missing the ``def`` line and the
    .mat load (its ``return`` sat at module level). Reconstructed here —
    confirm the data file name matches the course material (ex5data1.mat).
    """
    d = sio.loadmat('ex5data1.mat')
    return map(np.ravel, [d['X'], d['y'], d['Xval'], d['yval'], d['Xtest'], d['ytest']])

print('载入训练数据')
X, y, Xval, yval, Xtest, ytest = load_data()
print('可视化 X(水位的变化) 和 y(离开大坝的水流量 )')
df = pd.DataFrame({'water_level': X, 'flow': y})

# seaborn >= 0.12 requires x/y to be keyword arguments, and `size` was
# renamed to `height` in seaborn 0.9 — the old positional/`size` call raises.
sns.lmplot(x='water_level', y='flow', data=df, fit_reg=False, height=7)
plt.show()
print('设置训练、验证、测试集')
# Prepend the intercept column x0 = 1 to each design matrix.
X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]

print('代价函数')
def cost(theta, X, y):
    """Un-regularized squared-error cost for linear regression.

    Args:
        theta: R(n), linear regression parameters
        X: R(m*n), m records, n features (intercept column included)
        y: R(m), target vector

    Returns:
        Scalar J(theta) = (1 / 2m) * sum((X @ theta - y) ** 2).
    """
    m = X.shape[0]

    inner = X @ theta - y  # residual vector, shape (m,)

    # 1*m @ m*1 = 1*1 in matrix multiplication, but numpy does not transpose
    # 1-d arrays, so this is simply the inner product of the residuals with
    # themselves (the sum of squared errors).
    square_sum = inner.T @ inner

    return square_sum / (2 * m)
# Evaluate the un-regularized cost at an all-ones starting parameter vector.
print('用初始化参数计算 cost')
theta = np.ones(X.shape[1])
initial_cost = cost(theta, X, y)
print(initial_cost)
print('定义梯度')


def gradient(theta, X, y):
    """Un-regularized gradient of the linear-regression cost.

    Args:
        theta: R(n) parameter vector
        X: R(m*n) design matrix (intercept column included)
        y: R(m) target vector

    Returns:
        R(n) gradient vector (1 / m) * X.T @ (X @ theta - y).

    NOTE(review): the original fragment was missing the ``def`` line (its
    ``return`` sat at module level); reconstructed with the signature
    implied by the ``gradient(theta, X, y)`` call in regularized_gradient.
    """
    m = X.shape[0]

    inner = X.T @ (X @ theta - y)  # (n, m) @ (m,) -> (n,)

    return inner / m


print('用初始化参数计算梯度')
print('正则化梯度')


def regularized_gradient(theta, X, y, l=1):
    """Gradient of the regularized cost; the intercept term is not penalized.

    Args:
        theta: R(n) parameter vector
        X: R(m*n) design matrix
        y: R(m) target vector
        l: lambda constant for regularization

    NOTE(review): the original fragment was missing the ``def`` line;
    reconstructed with the same signature convention as regularized_cost.
    """
    m = X.shape[0]

    regularized_term = theta.copy()  # same shape as theta
    regularized_term[0] = 0  # don't regularize intercept theta

    regularized_term = (l / m) * regularized_term

    return gradient(theta, X, y) + regularized_term


print('用初始化参数计算正则化梯度')

print('拟合数据，无正则化项')


def linear_regression_np(X, y, l=1):
    """linear regression

    args:
        X: feature matrix, (m, n+1) # with intercept x0=1
        y: target vector, (m, )
        l: lambda constant for regularization

    return: scipy OptimizeResult; trained parameters are in res.x
    """
    # init theta
    theta = np.ones(X.shape[1])

    # train with the TNC solver; no `jac` is supplied, so gradients are
    # estimated numerically by scipy
    res = opt.minimize(fun=regularized_cost,
                       x0=theta,
                       args=(X, y, l),
                       method='TNC',
                       options={'disp': True})
    return res

def regularized_cost(theta, X, y, l=1):
    """Squared-error cost plus an L2 penalty on all parameters but theta[0].

    Args:
        theta: R(n) parameter vector
        X: R(m*n) design matrix
        y: R(m) target vector
        l: lambda constant for regularization
    """
    m = X.shape[0]

    # The penalty skips theta[0] (the intercept) by convention.
    regularized_term = (l / (2 * m)) * np.power(theta[1:], 2).sum()

    return cost(theta, X, y) + regularized_term

# BUG FIX: theta needs one entry per column of X (n+1 parameters), i.e.
# X.shape[1], not one per training example (X.shape[0]).
theta = np.ones(X.shape[1])
final_theta = linear_regression_np(X, y, l=0).get('x')
print('绘制图像')
intercept = final_theta[0]
slope = final_theta[1]  # renamed from `m` to avoid shadowing the sample count

plt.scatter(X[:, 1], y, label="Training data")
plt.plot(X[:, 1], X[:, 1] * slope + intercept, label="Prediction")
plt.legend(loc=2)
plt.show()

training_cost, cv_cost = [], []

print('1.使用训练集的子集来拟合应模型')
print('2.在计算训练代价和交叉验证代价时，没有用正则化')
print('3.记住使用相同的训练集子集来计算训练代价')
m = X.shape[0]
# Learning curve: fit on the first i training examples, then evaluate the
# un-regularized cost (l=0) on that same subset and on the full CV set.
for i in range(1, m + 1):
    res = linear_regression_np(X[:i, :], y[:i], l=0)

    tc = regularized_cost(res.x, X[:i, :], y[:i], l=0)
    cv = regularized_cost(res.x, Xval, yval, l=0)

    training_cost.append(tc)
    cv_cost.append(cv)

plt.plot(np.arange(1, m + 1), training_cost, label='training cost')
plt.plot(np.arange(1, m + 1), cv_cost, label='cv cost')
plt.legend(loc=1)
plt.show()
print('模型欠拟合')
ui.split_line2()
print('创建多项式特征')
def prepare_poly_data(*args, power):
    """Expand, normalize and add an intercept column to each input vector.

    args: keep feeding in X, Xval, or Xtest
    will return in the same order
    """
    def prepare(x):
        # expand feature
        df = poly_features(x, power=power)

        # normalization. DataFrame.as_matrix() was removed in pandas 1.0;
        # to_numpy() is the supported replacement.
        ndarr = normalize_feature(df).to_numpy()

        # add intercept column x0 = 1
        return np.insert(ndarr, 0, np.ones(ndarr.shape[0]), axis=1)

    return [prepare(x) for x in args]

def poly_features(x, power, as_ndarray=False):
    """Build polynomial features f1..f{power} = x**1 .. x**power.

    Args:
        x: 1-D array of raw feature values
        power: highest polynomial degree to generate
        as_ndarray: if True, return a plain ndarray instead of a DataFrame

    Returns:
        DataFrame (or ndarray) with one column per degree, named f1..f{power}.
    """
    data = {'f{}'.format(i): np.power(x, i) for i in range(1, power + 1)}
    df = pd.DataFrame(data)

    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
    # supported replacement.
    return df.to_numpy() if as_ndarray else df

# Reload the raw (un-augmented) data for the polynomial-regression part.
X, y, Xval, yval, Xtest, ytest = load_data()

sample_expansion = poly_features(X, power=3)
print(sample_expansion)
print('准备多项式回归数据')
def normalize_feature(df):
    """Z-score every column: subtract the column mean, divide by its std."""
    # DataFrame arithmetic broadcasts per column, which is exactly what
    # applying the z-score column-by-column did.
    return (df - df.mean()) / df.std()

# Degree-8 polynomial expansion (normalized, with intercept) of all splits.
X_poly, Xval_poly, Xtest_poly = prepare_poly_data(X, Xval, Xtest, power=8)
print(X_poly[:3, :])

print('画出学习曲线')
def plot_learning_curve(X, y, Xval, yval, l=0):
    """Plot training/CV cost against training-set size for a given lambda.

    Args:
        X, y: (already expanded) training design matrix and targets
        Xval, yval: cross-validation design matrix and targets
        l: regularization strength used when *fitting* the parameters
    """
    training_cost, cv_cost = [], []
    m = X.shape[0]

    for i in range(1, m + 1):
        # regularization applies here for fitting parameters
        res = linear_regression_np(X[:i, :], y[:i], l=l)

        # remember, when you compute the cost here, you are computing
        # non-regularized cost. Regularization is used to fit parameters only
        tc = cost(res.x, X[:i, :], y[:i])
        cv = cost(res.x, Xval, yval)

        training_cost.append(tc)
        cv_cost.append(cv)

    plt.plot(np.arange(1, m + 1), training_cost, label='training cost')
    plt.plot(np.arange(1, m + 1), cv_cost, label='cv cost')
    plt.legend(loc=1)

# Learning curves for three regularization strengths, shown one at a time.
# l=0: no regularization — training cost near zero, overfitting.
plot_learning_curve(X_poly, y, Xval_poly, yval, l=0)
plt.show()
print('无正则化，训练代价过低，过拟合')
ui.split_line2()

# l=1: moderate regularization reduces the overfitting.
plot_learning_curve(X_poly, y, Xval_poly, yval, l=1)
plt.show()
print('l=1，减轻了过拟合')

ui.split_line2()

# l=100: regularization is too strong — the model underfits.
plot_learning_curve(X_poly, y, Xval_poly, yval, l=100)
plt.show()
print('l=100， 正则化过了，变成欠拟合')

print('找到最佳的正则化系数')
l_candidate = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
training_cost, cv_cost = [], []
# For each candidate lambda: fit on the training set, then record the
# un-regularized cost on both the training and cross-validation sets.
for l in l_candidate:
    res = linear_regression_np(X_poly, y, l)

    tc = cost(res.x, X_poly, y)
    cv = cost(res.x, Xval_poly, yval)

    training_cost.append(tc)
    cv_cost.append(cv)

plt.plot(l_candidate, training_cost, label='training')
plt.plot(l_candidate, cv_cost, label='cross validation')
plt.legend(loc=2)

plt.xlabel('lambda')

plt.ylabel('cost')
plt.show()

print('分别 计算 Cost')
# Evaluate each candidate lambda's fitted model on the held-out test set.
# (Also removes a stray markdown fence that was fused onto the last line.)
for l in l_candidate:
    theta = linear_regression_np(X_poly, y, l).x
    print('test cost(l={}) = {}'.format(l, cost(theta, Xtest_poly, ytest)))

print('0.3 是最佳系数，这时测试误差最小')