Once the model has been trained, how can we assess its quality at a glance? Comparing the error on the training set with the error on the cross-validation set gives a rough indication of whether the model is underfitting or overfitting.
Plotting the loss curves for the training set and the cross-validation set tells us in which direction to tune the model.
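For reference, the two curves compared are the (unregularized) training error and cross-validation error, computed with the same squared-error cost that the code below implements:

$$J_{\text{train}}(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2 \qquad J_{\text{cv}}(\theta) = \frac{1}{2m_{\text{cv}}}\sum_{i=1}^{m_{\text{cv}}}\left(h_\theta(x_{\text{cv}}^{(i)}) - y_{\text{cv}}^{(i)}\right)^2$$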
Overfitting: add regularization | Underfitting: add more polynomial features |
---|---|
overfitting | underfitting |

Reduce overfitting | lambda too large causes underfitting |
---|---|
lambda=1 | lambda=100 |
In this example the input is the water level and the output is the flow, i.e. we study how the reservoir's water level affects the amount of water flowing out.
import numpy as np
import scipy.io as sio
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
def load_data():
"""for ex5
d['X'] shape = (12, 1)
pandas has trouble taking this 2d ndarray to construct a dataframe, so I ravel
the results
(the raw data comes as (m, 1) column vectors; ravel flattens each one into a 1-d array)
"""
d = sio.loadmat('ex5data1.mat')
return map(np.ravel, [d['X'], d['y'], d['Xval'], d['yval'], d['Xtest'], d['ytest']])
X, y, Xval, yval, Xtest, ytest = load_data()
df = pd.DataFrame({'water_level':X, 'flow':y})
sns.set(context="notebook", style="darkgrid", palette=sns.color_palette("RdBu", 2))
sns.lmplot(x = 'water_level', y = 'flow', data=df, fit_reg=False, height=7)
plt.show()
# add a column of ones, i.e. the intercept term
X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]
Review the cost function.
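For reference, this is the standard squared-error cost that the cost function below computes (X already includes the intercept column):

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2, \qquad h_\theta(x) = \theta^T x$$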
def cost(theta, X, y):
"""
X: R(m*n), m records, n features
y: R(m)
theta : R(n), linear regression parameters
"""
m = X.shape[0]
inner = X @ theta - y # R(m*1)
    # 1*m @ m*1 = 1*1 in matrix multiplication,
    # but numpy does not transpose 1-d arrays, so this is just the
    # inner product of the vector with itself (the .T below is a
    # no-op for a 1-d array and could be dropped)
square_sum = inner.T @ inner
cost = square_sum / (2 * m)
return cost
theta = np.ones(X.shape[1])
cost(theta, X, y)
303.9515255535976
Review the gradient (the partial derivatives of the cost).
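For reference, the partial derivative that the gradient function below computes for each parameter:

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)}$$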
def gradient(theta, X, y):
m = X.shape[0]
inner = X.T @ (X @ theta - y) # (m,n).T @ (m, 1) -> (n, 1)
return inner / m
gradient(theta, X, y)
array([-15.30301567, 598.16741084])
# theta has only two components here, i.e. a straight-line (degree-1) fit
Regularized gradient
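For reference, regularized_gradient below adds the penalty term to every parameter except the intercept θ₀:

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)} + \frac{\lambda}{m}\theta_j \qquad (j \ge 1)$$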
def regularized_gradient(theta, X, y, l=1):
m = X.shape[0]
regularized_term = theta.copy() # same shape as theta
regularized_term[0] = 0 # don't regularize intercept theta
regularized_term = (l / m) * regularized_term
return gradient(theta, X, y) + regularized_term
regularized_gradient(theta, X, y)
array([-15.30301567, 598.25074417])
# with l = 1, the first gradient differs only slightly from the unregularized one
Regularization term
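For reference, regularized_cost below adds the penalty to the unregularized cost, again skipping the intercept θ₀:

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2 + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$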
def linear_regression_np(X, y, l=1):
"""linear regression
args:
        X: feature matrix, (m, n+1) # with intercept x0=1
y: target vector, (m, )
l: lambda constant for regularization
return: trained parameters
"""
# init theta
theta = np.ones(X.shape[1])
# train it
res = opt.minimize(fun=regularized_cost,
x0=theta,
args=(X, y, l),
method='TNC',
jac=regularized_gradient,
options={'disp': True})
return res
def regularized_cost(theta, X, y, l=1):
m = X.shape[0]
regularized_term = (l / (2 * m)) * np.power(theta[1:], 2).sum()
return cost(theta, X, y) + regularized_term
theta = np.ones(X.shape[1])  # (n+1,) parameters; linear_regression_np re-initializes theta internally anyway
final_theta = linear_regression_np(X, y, l=0).get('x')
b = final_theta[0] # intercept
m = final_theta[1] # slope
plt.scatter(X[:,1], y, label="Training data")
plt.plot(X[:, 1], X[:, 1]*m + b, label="Prediction")
plt.legend(loc=2)
plt.show()
线性拟合
1. Use subsets of the training set to fit the model.
2. Do not apply regularization when computing the training cost and the cross-validation cost.
3. Remember to compute the training cost on the same training subset used for fitting.
training_cost, cv_cost = [], []
m = X.shape[0]
for i in range(1, m+1):
# print('i={}'.format(i))
res = linear_regression_np(X[:i, :], y[:i], l=0)
tc = regularized_cost(res.x, X[:i, :], y[:i], l=0)
cv = regularized_cost(res.x, Xval, yval, l=0)
# print('tc={}, cv={}'.format(tc, cv))
training_cost.append(tc)
cv_cost.append(cv)
plt.plot(np.arange(1, m+1), training_cost, label='training cost')
plt.plot(np.arange(1, m+1), cv_cost, label='cv cost')
plt.legend(loc=1)
plt.show()
The fit is poor: the model is underfitting.
Note: `as_matrix` has been removed in newer pandas versions; use `values` instead.
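A minimal sketch of the replacement (the DataFrame here is just a throwaway example):

```python
import pandas as pd

df = pd.DataFrame({'f1': [1.0, 2.0, 3.0]})

# df.as_matrix()     # old API, removed in recent pandas versions
arr = df.values      # still works
arr = df.to_numpy()  # currently the recommended accessor
```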
def prepare_poly_data(*args, power):
"""
args: keep feeding in X, Xval, or Xtest
will return in the same order
"""
def prepare(x):
# expand feature
df = poly_features(x, power=power)
# normalization
ndarr = normalize_feature(df).values
# add intercept term
return np.insert(ndarr, 0, np.ones(ndarr.shape[0]), axis=1)
return [prepare(x) for x in args]
def poly_features(x, power, as_ndarray=False):
data = {'f{}'.format(i): np.power(x, i) for i in range(1, power + 1)}
df = pd.DataFrame(data)
return df.values if as_ndarray else df
X, y, Xval, yval, Xtest, ytest = load_data()
poly_features(X, power=3)
Expanded polynomial features.
def normalize_feature(df):
"""Applies function along input axis(default 0) of DataFrame."""
return df.apply(lambda column: (column - column.mean()) / column.std())
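For reference, normalize_feature applies standard z-score normalization column by column, where μ_j and σ_j are the mean and standard deviation of column j:

$$x_j := \frac{x_j - \mu_j}{\sigma_j}$$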
X_poly, Xval_poly, Xtest_poly = prepare_poly_data(X, Xval, Xtest, power=8)
X_poly[:3, :]
First, plot the learning curve without regularization, i.e. λ = 0.
plt.legend(loc=1): loc sets the position of the "training cost / cv cost" legend (loc=1 is the upper-right corner, loc=2 the upper-left).
def plot_learning_curve(X, y, Xval, yval, l=0):
training_cost, cv_cost = [], []
m = X.shape[0]
for i in range(1, m + 1):
# regularization applies here for fitting parameters
res = linear_regression_np(X[:i, :], y[:i], l=l)
# remember, when you compute the cost here, you are computing
# non-regularized cost. Regularization is used to fit parameters only
tc = cost(res.x, X[:i, :], y[:i])
cv = cost(res.x, Xval, yval)
training_cost.append(tc)
cv_cost.append(cv)
plt.plot(np.arange(1, m + 1), training_cost, label='training cost')
plt.plot(np.arange(1, m + 1), cv_cost, label='cv cost')
plt.legend(loc=1)
plot_learning_curve(X_poly, y, Xval_poly, yval, l=0)
plt.show()
The training cost is unrealistically low; this is overfitting, i.e. the model generalizes poorly.
plot_learning_curve(X_poly, y, Xval_poly, yval, l=1)
plt.show()
The training cost has increased a bit and is no longer 0; in other words, the overfitting has been reduced.
plot_learning_curve(X_poly, y, Xval_poly, yval, l=100)
plt.show()
Too much regularization: the model underfits.
l_candidate = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
training_cost, cv_cost = [], []
for l in l_candidate:
res = linear_regression_np(X_poly, y, l)
tc = cost(res.x, X_poly, y)
cv = cost(res.x, Xval_poly, yval)
training_cost.append(tc)
cv_cost.append(cv)
plt.plot(l_candidate, training_cost, label='training')
plt.plot(l_candidate, cv_cost, label='cross validation')
plt.legend(loc=2)
plt.xlabel('lambda')
plt.ylabel('cost')
plt.show()
# lambda with the lowest cross-validation cost among the candidates
l_candidate[np.argmin(cv_cost)]
1
# use test data to compute the cost
for l in l_candidate:
theta = linear_regression_np(X_poly, y, l).x
print('test cost(l={}) = {}'.format(l, cost(theta, Xtest_poly, ytest)))
test cost(l=0) = 10.804375286491785
test cost(l=0.001) = 10.911365745177878
test cost(l=0.003) = 11.265060784108712
test cost(l=0.01) = 10.879143763702967
test cost(l=0.03) = 10.022378551698187
test cost(l=0.1) = 8.631776100446476
test cost(l=0.3) = 7.3365081011786275
test cost(l=1) = 7.466282452677015
test cost(l=3) = 11.643940740451052
test cost(l=10) = 27.715080273166386
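As a small follow-up sketch (reusing the functions and variables defined above), the same sweep can pick the lambda with the smallest test cost programmatically instead of by reading the printout:

```python
# retrain for each candidate lambda and keep the test-set cost
test_cost = [cost(linear_regression_np(X_poly, y, l).x, Xtest_poly, ytest)
             for l in l_candidate]
print(l_candidate[np.argmin(test_cost)])  # 0.3, matching the printout above
```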
After tuning, λ = 0.3 is the best choice: it gives the smallest test cost.