简单理解梯度下降法

$\eta$ 表示学习率（learning rate），是梯度下降法的一个超参数，其取值影响到达最优解的速度：太小会减慢收敛速度，太大则可能导致不收敛。

模拟梯度下降法

import numpy as np

# Sample the loss curve J(x) = (x - 2.5)^2 - 1 on [-1, 6] for plotting.
plot_x = np.linspace(-1, 6, 141)
plot_y = np.square(plot_x - 2.5) - 1

def dJ(theta):
    """Derivative of the loss: dJ/dtheta = 2 * (theta - 2.5)."""
    return (theta - 2.5) * 2

def J(theta):
    """Loss curve J(theta) = (theta - 2.5)^2 - 1, minimized at theta = 2.5."""
    offset = theta - 2.5
    return offset ** 2 - 1

# Gradient-descent driver: start at theta = 0 and step against the
# gradient of J until the loss stops improving.
theta = 0.0
eta = 0.1          # learning rate
epsilon = 1e-8     # stop when the change in J falls below this

while True:
    last_theta = theta
    # FIX: the original referenced an undefined name `gradient`;
    # it must be recomputed from dJ on every iteration.
    gradient = dJ(last_theta)
    theta = theta - eta * gradient
    if abs(J(theta) - J(last_theta)) < epsilon:
        break

OverflowError                             Traceback (most recent call last)
<ipython-input-6-5bd2217401d5> in <module>
8
9     theta = theta - eta * gradient
---> 10     if (abs(J(theta) - J(last_theta)) < epsilon):
11         break
12

<ipython-input-5-bd41aa589cfc> in J(theta)
1 def J(theta):
----> 2     return (theta - 2.5) ** 2 - 1

OverflowError: (34, 'Result too large')

def J(theta):
    """Loss J(theta) = (theta - 2.5)^2 - 1.

    Returns +inf when theta has diverged so far that squaring it
    overflows a float, so a too-large learning rate yields an infinite
    loss instead of crashing the search.
    """
    try:
        return (theta - 2.5) ** 2 - 1
    except OverflowError:  # FIX: was a bare `except:`; catch only overflow
        return float('inf')

def gradient_descent(initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    """Minimize J by gradient descent; uses module-level J and dJ.

    Parameters
    ----------
    initial_theta : starting point of the search.
    eta : learning rate.
    n_iters : cap on iterations so divergence cannot loop forever.
    epsilon : convergence threshold on the change in J.

    Returns the final theta.
    """
    theta = initial_theta
    i_ters = 0

    while i_ters < n_iters:
        last_theta = theta
        # FIX: the original used an undefined name `gradient`;
        # it must be recomputed from dJ every iteration.
        gradient = dJ(theta)
        theta = theta - eta * gradient
        if abs(J(theta) - J(last_theta)) < epsilon:
            break

        i_ters += 1

    return theta

多元线性回归中的梯度下降法

原理

$$J=\sum_{i=1}^{m}(y^{(i)} - \hat{y}^{(i)})^2$$

$$\nabla J=(\frac{\partial J}{\partial \theta_0},\frac{\partial J}{\partial \theta_1},...,\frac{\partial J}{\partial \theta_n})$$

$$\nabla J(\theta) = \begin{pmatrix} \frac{\partial J}{\partial \theta_0} \\\ \frac{\partial J}{\partial \theta_1} \\\ \frac{\partial J}{\partial \theta_2} \\\ \cdots \\\ \frac{\partial J}{\partial \theta_n} \end{pmatrix} = \begin{pmatrix} \sum_{i=1}^{m}2(y^{(i)} - X_b^{(i)}\theta)·(-1) \\\ \sum_{i=1}^{m}2(y^{(i)} - X_b^{(i)}\theta)·(-X_1^{(i)}) \\\ \sum_{i=1}^{m}2(y^{(i)} - X_b^{(i)}\theta)·(-X_2^{(i)}) \\\ \cdots \\\ \sum_{i=1}^{m}2(y^{(i)} - X_b^{(i)}\theta)·(-X_n^{(i)}) \end{pmatrix} = 2·\begin{pmatrix} \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)}) \\\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)})·X_1^{(i)} \\\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)})·X_2^{(i)} \\\ \cdots \\\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)})·X_n^{(i)} \end{pmatrix}$$

$$\nabla J(\theta) = \frac{2}{m}·\begin{pmatrix} \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)}) \\\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)})·X_1^{(i)} \\\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)})·X_2^{(i)} \\\ \cdots \\\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)})·X_n^{(i)} \end{pmatrix}$$

$$J(\theta) = MSE(y, \hat{y})$$

使用梯度下降法训练模型

import numpy as np

# Synthetic one-feature dataset: y = 3x + 4 plus unit Gaussian noise.
x = np.random.random(size=100)
noise = np.random.normal(size=100)
y = 3.0 * x + 4.0 + noise

# Column-vector view of x as the design matrix (100 samples, 1 feature).
X = x[:, np.newaxis]

def J(theta, X_b, y):
    """Mean squared error of the linear model X_b @ theta against y.

    Returns +inf on numeric overflow so a diverging theta reads as an
    infinitely bad loss instead of raising.
    """
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
    except OverflowError:  # FIX: was a bare `except:`; catch only overflow
        return float('inf')

def dJ(theta, X_b, y):
    """Gradient of the MSE loss, built component by component (loop form)."""
    errors = X_b.dot(theta) - y
    grad = np.empty(len(theta))
    grad[0] = np.sum(errors)  # intercept column X_0 is all ones
    for j in range(1, len(theta)):
        grad[j] = errors.dot(X_b[:, j])
    return grad * 2 / len(X_b)

def dJ(theta, X_b, y):
    """Vectorized MSE gradient: (2/m) * X_b^T (X_b theta - y)."""
    residuals = X_b.dot(theta) - y
    return X_b.T.dot(residuals) * 2 / len(X_b)

def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    """Batch gradient descent for linear regression; uses module-level J/dJ.

    Parameters
    ----------
    X_b : design matrix with a leading column of ones for the intercept.
    y : target vector.
    initial_theta : starting parameter vector.
    eta : learning rate.
    n_iters : iteration cap so divergence cannot loop forever.
    epsilon : convergence threshold on the change in J.

    Returns the fitted theta.
    """
    theta = initial_theta
    i_ters = 0

    while i_ters < n_iters:
        last_theta = theta
        # FIX: the original used an undefined name `gradient`;
        # it must be recomputed via dJ on every iteration.
        gradient = dJ(theta, X_b, y)
        theta = theta - eta * gradient
        if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
            break

        i_ters += 1

    return theta

# Prepend an intercept column of ones and fit theta with batch GD.
X_b = np.column_stack([np.ones(len(X)), X])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01

theta = gradient_descent(X_b, y, initial_theta, eta)

$\theta$ 结果为：

array([4.0269033, 3.0043078])

class LinearRegression:
    # other codes here

    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Fit the model to (X_train, y_train) using batch gradient descent.

        eta is the learning rate; n_iters caps the number of iterations.
        Sets self._theta, self.interception_ and self.coef_; returns self.
        """
        def J(theta, X_b, y):
            # MSE loss; +inf on overflow so a diverging theta does not crash.
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
            except OverflowError:  # FIX: was a bare `except:`
                return float('inf')

        def dJ(theta, X_b, y):
            # Vectorized gradient: (2/m) * X_b^T (X_b theta - y).
            return X_b.T.dot(X_b.dot(theta) - y) * 2 / len(X_b)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=n_iters, epsilon=1e-8):
            theta = initial_theta
            i_ters = 0

            while i_ters < n_iters:
                last_theta = theta
                # FIX: the original referenced an undefined name `gradient`;
                # it must be recomputed from dJ on every iteration.
                gradient = dJ(theta, X_b, y)
                theta = theta - eta * gradient

                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break

                i_ters += 1

            return theta

        # Design matrix with a leading intercept column of ones.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])

        self._theta = gradient_descent(X_b, y_train, initial_theta, eta)
        # NOTE: attribute name `interception_` (sic) kept for caller compatibility.
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

随机梯度下降法

$\nabla J(\theta) = \frac{2}{m}·\begin{pmatrix} \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)}) \\\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)})·X_1^{(i)} \\\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)})·X_2^{(i)} \\\ \cdots \\\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)})·X_n^{(i)} \end{pmatrix}$ 中每一项都要对所有样本进行计算，因此这种梯度下降法称为批量梯度下降法（Batch Gradient Descent）。如果 m 非常大，使用批量梯度下降法计算梯度的计算量就会非常大。

$$2·\begin{pmatrix} (X_b^{(i)}\theta - y^{(i)})·X_0^{(i)} \\\ (X_b^{(i)}\theta - y^{(i)})·X_1^{(i)} \\\ (X_b^{(i)}\theta - y^{(i)})·X_2^{(i)} \\\ \cdots \\\ (X_b^{(i)}\theta - y^{(i)})·X_n^{(i)} \end{pmatrix} = 2·(X_b^{(i)})^T·(X_b^{(i)}\theta-y^{(i)})$$

$$\eta = \frac{t_0}{i\_iter + t_1}$$

def dJ_sgd(theta, X_b_i, y_i):
    """Gradient of the squared error for a single sample (X_b_i, y_i)."""
    residual = X_b_i.dot(theta) - y_i
    return X_b_i.T.dot(residual) * 2.

def sgd(X_b, y, initial_theta, n_iters, t0, t1):
    """Stochastic gradient descent over n_iters full passes (epochs).

    The learning rate decays with the global step count t as t0 / (t + t1);
    samples are reshuffled once per epoch. Uses module-level dJ_sgd.
    """

    def learning_rate(t):
        return t0 / (t + t1)

    theta = initial_theta
    m = len(X_b)

    for epoch in range(n_iters):
        # Shuffle so each epoch visits every sample in a fresh order.
        order = np.random.permutation(m)
        X_shuffled = X_b[order]
        y_shuffled = y[order]

        for i in range(m):
            step = learning_rate(epoch * m + i)
            theta = theta - step * dJ_sgd(theta, X_shuffled[i], y_shuffled[i])

    return theta

class LinearRegression:
    # other codes here

    def fit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50):
        """Fit the model with stochastic gradient descent.

        n_iters counts full passes over the training set; t0 and t1
        control the decaying learning rate eta = t0 / (t + t1).
        Sets self._theta, self.interception_ and self.coef_; returns self.
        """
        def dJ_sgd(theta, X_b_i, y_i):
            # Gradient contribution of one sample.
            return X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2.

        def sgd(X_b, y, initial_theta, n_iters, t0, t1):

            def learning_rate(t):
                return t0 / (t + t1)

            theta = initial_theta
            m = len(X_b)

            for epoch in range(n_iters):
                # Reshuffle each epoch so every sample is visited once.
                order = np.random.permutation(m)
                X_shuffled = X_b[order]
                y_shuffled = y[order]

                for i in range(m):
                    step = learning_rate(epoch * m + i)
                    theta = theta - step * dJ_sgd(theta, X_shuffled[i], y_shuffled[i])

            return theta

        # Design matrix with a leading intercept column of ones.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])

        self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

from sklearn.linear_model import SGDRegressor

源码地址

Github | ML-Algorithms-Action

515 人关注
8 篇文章