import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import ui
import scipy.io as sio
import scipy.optimize as opt
import seaborn as sb

print('PART 1 异常检测')
ui.split_line2()
print('用高斯模型检测数据是否异常')
print('载入数据')
# BUG FIX: `data` was referenced below without ever being loaded,
# which raises NameError. File name follows the classic ex8 exercise
# layout — TODO confirm the actual path.
data = sio.loadmat('data/ex8data1.mat')
X = data['X']
print(X.shape)
print('查看数据分布')
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X[:, 0], X[:, 1])
plt.show()
print('定义高斯函数')
def estimate_gaussian(X):
    """Estimate per-feature Gaussian parameters of the data matrix X.

    Parameters
    ----------
    X : ndarray of shape (m, n)
        One row per example, one column per feature.

    Returns
    -------
    (mu, sigma) : tuple of ndarrays, each of shape (n,)
        Feature means and population variances (ddof=0, matching
        numpy's default).
    """
    mu = np.mean(X, axis=0)
    # Population variance, computed explicitly from the mean.
    sigma = np.mean((X - mu) ** 2, axis=0)
    return mu, sigma
print('进行高斯预估')
mu, sigma = estimate_gaussian(X)
print(mu, sigma)
print('确定验证集')
Xval = data['Xval']
yval = data['yval']
print(Xval.shape, yval.shape)
print('计算数据点属于正态分布的概率')
from scipy import stats
# BUG FIX: estimate_gaussian returns the *variance*, but the second
# argument of stats.norm is `scale`, the *standard deviation*.
# Passing the variance directly distorts every density value, so take
# the square root. (This changes the printed numbers downstream.)
dist = stats.norm(mu[0], np.sqrt(sigma[0]))
print(dist.pdf(15))
print('每个点的概率密度')
print(dist.pdf(X[:, 0])[0:50])
print('计算数据集中每个点的概率密度')
# Per-feature density of every training point (features are modeled
# as independent univariate Gaussians).
p = np.zeros((X.shape[0], X.shape[1]))
p[:, 0] = stats.norm(mu[0], np.sqrt(sigma[0])).pdf(X[:, 0])
p[:, 1] = stats.norm(mu[1], np.sqrt(sigma[1])).pdf(X[:, 1])
print(p.shape)
print('计算验证集中每个点的概率密度')
pval = np.zeros((Xval.shape[0], Xval.shape[1]))
pval[:, 0] = stats.norm(mu[0], np.sqrt(sigma[0])).pdf(Xval[:, 0])
pval[:, 1] = stats.norm(mu[1], np.sqrt(sigma[1])).pdf(Xval[:, 1])
print(pval.shape)
print('找到判断的最佳阈值')
def select_threshold(pval, yval):
    """Pick the density threshold that maximizes F1 on the validation set.

    Scans 1000 evenly spaced candidate epsilons between min(pval) and
    max(pval); points with density below epsilon are flagged anomalous.

    Parameters
    ----------
    pval : ndarray
        Probability densities of the validation examples.
    yval : ndarray
        Ground-truth labels (1 = anomaly, 0 = normal); broadcast
        against `pval` during comparison.

    Returns
    -------
    (best_epsilon, best_f1) : tuple of floats
    """
    best_epsilon = 0
    best_f1 = 0

    step = (pval.max() - pval.min()) / 1000

    for epsilon in np.arange(pval.min(), pval.max(), step):
        preds = pval < epsilon

        tp = np.sum(np.logical_and(preds == 1, yval == 1)).astype(float)
        fp = np.sum(np.logical_and(preds == 1, yval == 0)).astype(float)
        fn = np.sum(np.logical_and(preds == 0, yval == 1)).astype(float)

        # BUG FIX: with no true positives the original computed 0/0,
        # emitting RuntimeWarnings and a NaN F1 (it only "worked"
        # because NaN > best_f1 is False). Skip explicitly instead —
        # the selected threshold is unchanged.
        if tp == 0:
            continue

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = (2 * precision * recall) / (precision + recall)

        if f1 > best_f1:
            best_f1 = f1
            best_epsilon = epsilon

    return best_epsilon, best_f1

epsilon, f1 = select_threshold(pval, yval)
print(epsilon, f1)
print('将该阈值应用到数据集，并可视化结果')
# Indices of the training points whose density falls below the
# selected threshold — these are the flagged outliers.
outliers = np.where(p < epsilon)
print(outliers)

# Scatter all points, then overlay the outliers in red.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X[:, 0], X[:, 1])
ax.scatter(X[outliers[0], 0], X[outliers[0], 1], s=50, color='r', marker='o')
plt.show()

ui.split_line1()
print('PART 2 协同过滤之电影推荐')
print('载入数据')
# BUG FIX: `data` was never (re)loaded for part 2 — the original
# raised NameError here. File name follows the classic ex8 exercise
# layout — TODO confirm the actual path.
data = sio.loadmat('data/ex8_movies.mat')
print(data)
print('Y是包含从1到5的等级的（数量的电影x数量的用户）数组.R是包含指示用户是否给电影评分的二进制值的“指示符”数组')
print('两者应该具有相同的维度。')
Y = data['Y']
R = data['R']
print(Y.shape, R.shape)
print('电影的平均评级')
# Mean rating of movie 1, taken only over users who actually rated it.
print(Y[1, np.where(R[1, :] == 1)[0]].mean())
print('可视化')
fig, ax = plt.subplots(figsize=(12, 12))
ax.imshow(Y)
ax.set_xlabel('Users')
ax.set_ylabel('Movies')
fig.tight_layout()
plt.show()

print('定义代价函数')
def cost(params, Y, R, num_features):
    """Unregularized collaborative-filtering cost.

    Parameters
    ----------
    params : 1-D ndarray
        Flattened [X (num_movies x num_features); Theta (num_users x
        num_features)].
    Y : ndarray (num_movies, num_users)
        Rating matrix.
    R : ndarray (num_movies, num_users)
        Indicator matrix: 1 where a rating exists.
    num_features : int

    Returns
    -------
    J : float
        Half the sum of squared prediction errors over rated entries.
    """
    # np.matrix is deprecated — use plain ndarrays with .dot() instead.
    Y = np.asarray(Y)
    R = np.asarray(R)
    num_movies = Y.shape[0]
    num_users = Y.shape[1]

    # Reshape the flat parameter vector into the two factor matrices.
    X = np.reshape(params[:num_movies * num_features], (num_movies, num_features))
    Theta = np.reshape(params[num_movies * num_features:], (num_users, num_features))

    # Error only counts where a rating exists (masked by R).
    error = np.multiply(X.dot(Theta.T) - Y, R)
    J = 0.5 * np.sum(error ** 2)

    return J

print('评估一小段数据')
# BUG FIX: `params_data` was never loaded — the original raised
# NameError. Parameter file name follows the classic ex8 exercise
# layout — TODO confirm the actual path.
params_data = sio.loadmat('data/ex8_movieParams.mat')
X = params_data['X']
Theta = params_data['Theta']
# BUG FIX: a bare `X.shape, Theta.shape` expression is a no-op in a
# script (it only echoes in a REPL/notebook) — print it instead.
print(X.shape, Theta.shape)

# Take a tiny slice of the problem for a cheap sanity check.
users = 4
movies = 5
features = 3

X_sub = X[:movies, :features]
Theta_sub = Theta[:users, :features]
Y_sub = Y[:movies, :users]
R_sub = R[:movies, :users]

params = np.concatenate((np.ravel(X_sub), np.ravel(Theta_sub)))
print('计算 Cost')
print(cost(params, Y_sub, R_sub, features))

print('定义梯度计算')
def cost0(params, Y, R, num_features):
    """Collaborative-filtering cost AND gradient (unregularized).

    Parameters
    ----------
    params : 1-D ndarray
        Flattened [X; Theta] as in `cost`.
    Y, R : ndarray (num_movies, num_users)
        Ratings and rated-indicator matrices.
    num_features : int

    Returns
    -------
    (J, grad) : float and 1-D ndarray
        Cost and the flattened gradient [X_grad; Theta_grad], the
        shape scipy.optimize expects when jac=True.
    """
    # np.matrix is deprecated — use plain ndarrays with .dot() instead.
    Y = np.asarray(Y)
    R = np.asarray(R)
    num_movies = Y.shape[0]
    num_users = Y.shape[1]

    # Reshape the flat parameter vector into the two factor matrices.
    X = np.reshape(params[:num_movies * num_features], (num_movies, num_features))
    Theta = np.reshape(params[num_movies * num_features:], (num_users, num_features))

    # Error only counts where a rating exists (masked by R).
    error = np.multiply(X.dot(Theta.T) - Y, R)
    J = 0.5 * np.sum(error ** 2)

    # BUG FIX: the original stopped here and returned None, so the
    # caller's `J, grad = cost0(...)` crashed. Compute the gradients
    # and return them flattened alongside the cost.
    X_grad = error.dot(Theta)
    Theta_grad = error.T.dot(X)
    grad = np.concatenate((np.ravel(X_grad), np.ravel(Theta_grad)))

    return J, grad

print('计算梯度')
# NOTE(review): this unpack relies on cost0 returning a (J, grad)
# tuple; as written above, cost0 has no return statement, so this
# line raises TypeError — cost0 must be completed.
J, grad = cost0(params, Y_sub, R_sub, features)
print('定义正则化 cost')
def cost1(params, Y, R, num_features, learning_rate):
    """Regularized collaborative-filtering cost and gradient.

    Parameters
    ----------
    params : 1-D ndarray
        Flattened [X; Theta] as in `cost`.
    Y, R : ndarray (num_movies, num_users)
        Ratings and rated-indicator matrices.
    num_features : int
    learning_rate : float
        Regularization strength lambda (exercise naming kept for
        interface compatibility).

    Returns
    -------
    (J, grad) : float and 1-D ndarray
        Regularized cost and flattened gradient, as scipy.optimize
        expects when jac=True.
    """
    # np.matrix is deprecated — use plain ndarrays with .dot() instead.
    Y = np.asarray(Y)
    R = np.asarray(R)
    num_movies = Y.shape[0]
    num_users = Y.shape[1]

    # Reshape the flat parameter vector into the two factor matrices.
    X = np.reshape(params[:num_movies * num_features], (num_movies, num_features))
    Theta = np.reshape(params[num_movies * num_features:], (num_users, num_features))

    # Error only counts where a rating exists (masked by R).
    error = np.multiply(X.dot(Theta.T) - Y, R)
    J = 0.5 * np.sum(error ** 2)

    # L2 regularization on both factor matrices.
    J = J + (learning_rate / 2) * np.sum(Theta ** 2)
    J = J + (learning_rate / 2) * np.sum(X ** 2)

    # Gradients with regularization.
    X_grad = error.dot(Theta) + learning_rate * X
    Theta_grad = error.T.dot(X) + learning_rate * Theta

    # BUG FIX: the original computed the gradients but never returned
    # anything, so the caller's unpack (and minimize with jac=True)
    # crashed. Flatten and return them with the cost.
    grad = np.concatenate((np.ravel(X_grad), np.ravel(Theta_grad)))

    return J, grad

print('计算梯度')
# NOTE(review): relies on cost1 returning a (J, grad) tuple.
J, grad = cost1(params, Y_sub, R_sub, features, 1.5)

print('加载字典')
# Build {movie_index (0-based): title} from "idx Title..." lines.
movie_idx = {}
# BUG FIX: the file handle was never closed — use a context manager.
with open('data/movie_ids.txt', encoding='gbk') as f:
    for line in f:
        tokens = line.split(' ')
        tokens[-1] = tokens[-1][:-1]  # strip the trailing newline
        movie_idx[int(tokens[0]) - 1] = ' '.join(tokens[1:])
print('查看第一条')
print(movie_idx[0])

print('进行评分')
# Column vector of this user's ratings over all 1682 movies (0 = unrated).
ratings = np.zeros((1682, 1))

# movie index -> star rating; a dict plus two loops replaces the 22
# copy-pasted assignment/print lines. Dict insertion order preserves
# the original output order exactly.
my_ratings = {0: 4, 6: 3, 11: 5, 53: 4, 63: 5, 65: 3,
              68: 5, 97: 2, 182: 4, 225: 5, 354: 5}
for movie, stars in my_ratings.items():
    ratings[movie] = stars

for movie in my_ratings:
    # ratings[movie, 0] avoids the deprecated int() on a size-1 array.
    print('Rated {0} with {1} stars.'.format(movie_idx[movie], str(int(ratings[movie, 0]))))

print('将评分添加到已有数据集中')
R = data['R']
Y = data['Y']

# Append the new user's ratings as an extra column of Y, and the
# corresponding rated-indicator column to R.
Y = np.append(Y, ratings, axis=1)
R = np.append(R, ratings != 0, axis=1)

print(Y.shape, R.shape, ratings.shape)
print('对变量进行归一化')
movies = Y.shape[0]  # 1682
users = Y.shape[1]  # 944
features = 10
learning_rate = 10.

# Random initialization of the movie-feature and user-preference
# matrices, flattened into one parameter vector for the optimizer.
X = np.random.random(size=(movies, features))
Theta = np.random.random(size=(users, features))
params = np.concatenate((np.ravel(X), np.ravel(Theta)))

print(X.shape, Theta.shape, params.shape)

# Mean-normalize each movie's ratings over its rated entries only;
# unrated cells stay zero in Ynorm.
# NOTE(review): a movie with zero ratings would produce a NaN mean
# here — assumes the data set has none; confirm.
Ymean = np.zeros((movies, 1))
Ynorm = np.zeros((movies, users))

for i in range(movies):
    rated = np.where(R[i, :] == 1)[0]
    Ymean[i] = Y[i, rated].mean()
    Ynorm[i, rated] = Y[i, rated] - Ymean[i]

print(Ynorm.mean())

print('进行模型训练与优化')
from scipy.optimize import minimize

# jac=True tells the optimizer that cost1 returns (cost, gradient).
fmin = minimize(fun=cost1, x0=params, args=(Ynorm, R, features, learning_rate),
                method='CG', jac=True, options={'maxiter': 100})
print(fmin)

# Recover the trained factor matrices from the flat parameter vector.
# np.matrix is deprecated — plain ndarrays with .dot() behave the same here.
X = np.reshape(fmin.x[:movies * features], (movies, features))
Theta = np.reshape(fmin.x[movies * features:], (users, features))

print(X.shape, Theta.shape)
print('进行预测')
predictions = X.dot(Theta.T)
# Column -1 is the user we appended; the `-1:` slice keeps it a
# (movies, 1) column so adding Ymean broadcasts correctly, undoing
# the mean-normalization.
my_preds = predictions[:, -1:] + Ymean
print(my_preds.shape)
print('排序预测')
# Indices of predictions sorted descending (argsort is ascending).
idx = np.argsort(my_preds, axis=0)[::-1]
print(idx)
print("Top 10 movie predictions:")
for i in range(10):
    # Explicit [i, 0] avoids the deprecated int()/float() on size-1 arrays.
    j = int(idx[i, 0])
    print('Predicted rating of {0} for movie {1}.'.format(str(float(my_preds[j, 0])), movie_idx[j]))
