反向传播算法详解
反向传播算法详解
反向传播(Backpropagation)是训练神经网络的核心算法,通过链式法则计算损失函数对每个参数的梯度。
反向传播原理
链式法则
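反向传播基于微积分中的链式法则,从输出层向输入层逐层计算梯度。记第 $l$ 层的线性输出为 $z^{(l)} = a^{(l-1)} W^{(l)} + b^{(l)}$,激活值为 $a^{(l)} = f(z^{(l)})$,并令 $\delta^{(l)}$ 表示各样本损失对 $z^{(l)}$ 的梯度。对于 Sigmoid 输出加交叉熵损失,输出层有 $\delta^{(L)} = a^{(L)} - y$;隐藏层则按 $\delta^{(l)} = \left(\delta^{(l+1)} (W^{(l+1)})^{\top}\right) \odot f'(z^{(l)})$ 逐层向前递推。参数梯度为 $\frac{\partial \mathcal{L}}{\partial W^{(l)}} = \frac{1}{m} (a^{(l-1)})^{\top} \delta^{(l)}$,$\frac{\partial \mathcal{L}}{\partial b^{(l)}} = \frac{1}{m} \sum_{i=1}^{m} \delta^{(l)}_i$,其中 $m$ 为样本数。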
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
# Generate a two-class "moons" toy dataset (fixed seed for repeatability)
X, y = make_moons(n_samples=1000, noise=0.2, random_state=42)

# Hold out 20% of the samples as the test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise both splits with statistics learned from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"训练集大小: {X_train_scaled.shape[0]}")
print(f"测试集大小: {X_test_scaled.shape[0]}")
反向传播实现
手动实现反向传播
class NeuralNetworkBackprop:
    """Feed-forward binary classifier trained with hand-written backpropagation.

    Every layer except the last applies ReLU; the last layer applies a
    sigmoid. Training is full-batch gradient descent on binary cross-entropy.
    """

    def __init__(self, layer_sizes, learning_rate=0.01, random_state=None):
        """Create He-initialised weights and zero biases for every layer.

        Args:
            layer_sizes: Number of units per layer, input size first and
                output size last. Needs at least two entries.
            learning_rate: Step size applied by ``backward``.
            random_state: Optional seed for reproducible initial weights.
                When ``None`` the global NumPy random state is used.

        Raises:
            ValueError: If ``layer_sizes`` has fewer than two entries.
        """
        if len(layer_sizes) < 2:
            raise ValueError(
                "layer_sizes must contain at least an input size and an output size"
            )
        self.layer_sizes = layer_sizes
        self.lr = learning_rate

        # A private RandomState keeps seeded runs independent of global state;
        # without a seed we fall back to the global generator.
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        self.weights = []
        self.biases = []
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            # He initialisation: standard normal scaled by sqrt(2 / fan_in)
            self.weights.append(rng.randn(fan_in, fan_out) * np.sqrt(2.0 / fan_in))
            self.biases.append(np.zeros((1, fan_out)))

        # Caches filled by forward() and consumed by backward()
        self.z_values = []  # pre-activation value of each layer
        self.a_values = []  # input followed by each layer's activation

    def _relu(self, x):
        """Return the element-wise ReLU of ``x``."""
        return np.maximum(0, x)

    def _relu_derivative(self, x):
        """Return the ReLU derivative evaluated at pre-activation ``x``."""
        return (x > 0).astype(float)

    def _sigmoid(self, x):
        """Return the element-wise sigmoid of ``x``.

        The input is clipped to [-250, 250] so ``np.exp`` cannot overflow.
        """
        return 1 / (1 + np.exp(-np.clip(x, -250, 250)))

    def _sigmoid_derivative(self, x):
        """Return the sigmoid derivative given the sigmoid OUTPUT ``x``.

        Unlike ``_relu_derivative``, the argument here is the activation
        ``sigmoid(z)``, not the pre-activation ``z``.
        """
        return x * (1 - x)

    def forward(self, X):
        """Run a forward pass and cache intermediate values for ``backward``.

        Args:
            X: Input array whose columns match ``layer_sizes[0]``.

        Returns:
            The sigmoid output of the last layer.
        """
        self.z_values = []
        self.a_values = [X]
        activation = X

        # Hidden layers: affine transform followed by ReLU
        for w, b in zip(self.weights[:-1], self.biases[:-1]):
            z = np.dot(activation, w) + b
            activation = self._relu(z)
            self.z_values.append(z)
            self.a_values.append(activation)

        # Output layer: affine transform followed by sigmoid
        z = np.dot(activation, self.weights[-1]) + self.biases[-1]
        output = self._sigmoid(z)
        self.z_values.append(z)
        self.a_values.append(output)
        return output

    def compute_loss(self, y_true, y_pred):
        """Return the mean binary cross-entropy of ``y_pred`` against ``y_true``.

        ``y_true`` is reshaped to a column; ``y_pred`` is clipped to
        [1e-8, 1 - 1e-8] so the logarithms stay finite.
        """
        y_true = np.asarray(y_true).reshape(-1, 1)
        y_pred = np.clip(y_pred, 1e-8, 1 - 1e-8)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    def backward(self, X, y):
        """Backpropagate from the cached forward pass and update parameters.

        All gradients are computed before any parameter is modified, then one
        gradient-descent step with ``self.lr`` is applied in place.

        Args:
            X: The batch that was passed to the preceding ``forward`` call.
            y: Labels for that batch; reshaped to a column.

        Returns:
            ``(d_weights, d_biases)``: per-layer gradient lists.

        Raises:
            RuntimeError: If no forward pass has been cached.
        """
        if len(self.a_values) != len(self.weights) + 1:
            raise RuntimeError("forward() must be called before backward()")

        m = X.shape[0]
        y = np.asarray(y).reshape(-1, 1)

        d_weights = [None] * len(self.weights)
        d_biases = [None] * len(self.biases)

        # For a sigmoid output with cross-entropy loss the error term with
        # respect to the output pre-activation reduces to (output - y).
        delta = self.a_values[-1] - y
        d_weights[-1] = np.dot(self.a_values[-2].T, delta) / m
        d_biases[-1] = np.sum(delta, axis=0, keepdims=True) / m

        # Walk the hidden layers from last to first
        for i in range(len(self.weights) - 2, -1, -1):
            delta = np.dot(delta, self.weights[i + 1].T) * self._relu_derivative(self.z_values[i])
            d_weights[i] = np.dot(self.a_values[i].T, delta) / m
            d_biases[i] = np.sum(delta, axis=0, keepdims=True) / m

        # Gradient-descent update
        for i in range(len(self.weights)):
            self.weights[i] -= self.lr * d_weights[i]
            self.biases[i] -= self.lr * d_biases[i]

        return d_weights, d_biases

    def fit(self, X, y, epochs=1000, verbose=True):
        """Train with full-batch gradient descent.

        Loss and accuracy for an epoch are measured on the forward pass made
        before that epoch's parameter update.

        Args:
            X: Training inputs.
            y: Training labels.
            epochs: Number of full-batch updates.
            verbose: When true, print progress every 100 epochs.

        Returns:
            ``(losses, accuracies)``: one entry per epoch in each list.
        """
        losses = []
        accuracies = []
        for epoch in range(epochs):
            output = self.forward(X)

            loss = self.compute_loss(y, output)
            losses.append(loss)

            y_pred = (output > 0.5).astype(int).flatten()
            accuracy = accuracy_score(y, y_pred)
            accuracies.append(accuracy)

            self.backward(X, y)

            if verbose and (epoch + 1) % 100 == 0:
                print(f"Epoch {epoch+1}/{epochs}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")
        return losses, accuracies

    def predict(self, X):
        """Return flat 0/1 class predictions, thresholding the output at 0.5."""
        output = self.forward(X)
        return (output > 0.5).astype(int).flatten()
# Build a 2-10-8-1 network (two inputs, two hidden layers, one output)
nn = NeuralNetworkBackprop([2, 10, 8, 1], learning_rate=0.1)

# Train on the standardised training split, printing periodic progress
print("训练神经网络(使用反向传播):")
training_history = nn.fit(X_train_scaled, y_train, epochs=1000, verbose=True)
losses, accuracies = training_history

# Score the trained network on the held-out split
y_pred = nn.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"\n测试集准确率: {test_accuracy:.4f}")
可视化训练过程
# Plot the training history: loss on the left panel, accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# (axes, series, line format, y-axis label, panel title) for each panel
curve_specs = (
    (ax1, losses, 'b-', '损失值', '训练损失曲线'),
    (ax2, accuracies, 'r-', '准确率', '训练准确率曲线'),
)
for panel, series, line_format, y_label, panel_title in curve_specs:
    panel.plot(series, line_format, linewidth=2)
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
def plot_decision_boundary(model, X, y, title="决策边界"):
    """Show the model's predicted class regions with the samples on top.

    A grid with spacing 0.02 is laid over the range of the first two columns
    of ``X`` (padded by 1 on every side), ``model.predict`` is evaluated at
    each grid point, and the result is drawn as filled contours.

    Args:
        model: Object exposing ``predict`` for an (n, 2) array of points.
        X: Samples; columns 0 and 1 are used as the plot axes.
        y: Values used to colour the scattered samples.
        title: Figure title.
    """
    step = 0.02
    first_feature = X[:, 0]
    second_feature = X[:, 1]

    # Grid axes cover the data range plus a margin of 1 on each side
    grid_x = np.arange(first_feature.min() - 1, first_feature.max() + 1, step)
    grid_y = np.arange(second_feature.min() - 1, second_feature.max() + 1, step)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # Predict every grid point, then fold the labels back into grid shape
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    Z = model.predict(grid_points).reshape(xx.shape)

    plt.figure(figsize=(10, 6))
    plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdYlBu)
    plt.scatter(first_feature, second_feature, c=y, cmap=plt.cm.RdYlBu, edgecolors='black')
    plt.xlabel('特征1')
    plt.ylabel('特征2')
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.show()


# Draw the boundary learned by the NumPy network over the test split
plot_decision_boundary(nn, X_test_scaled, y_test, title="反向传播神经网络决策边界")
梯度下降变体
不同优化器比较
class GradientDescentVariants:
    """Compare gradient-descent update rules on a small ReLU/sigmoid network.

    This is a simplified demonstration: only the output layer is trained,
    while the hidden layers keep their random initial weights.
    """

    def __init__(self, layer_sizes):
        """Store the layer sizes (input size first, output size last)."""
        self.layer_sizes = layer_sizes

    def _init_network(self):
        """Return freshly initialised ``(weights, biases)`` lists.

        Weights use He initialisation drawn from the global NumPy random
        state; biases start at zero.
        """
        weights = []
        biases = []
        for fan_in, fan_out in zip(self.layer_sizes[:-1], self.layer_sizes[1:]):
            weights.append(np.random.randn(fan_in, fan_out) * np.sqrt(2.0 / fan_in))
            biases.append(np.zeros((1, fan_out)))
        return weights, biases

    def _forward_with_hidden(self, X, weights, biases):
        """Return ``(hidden, output)`` for one forward pass.

        ``hidden`` is the input to the output layer: the last ReLU activation,
        or ``X`` itself when the network has no hidden layer.
        """
        hidden = X
        for w, b in zip(weights[:-1], biases[:-1]):
            hidden = np.maximum(0, np.dot(hidden, w) + b)  # ReLU
        z = np.dot(hidden, weights[-1]) + biases[-1]
        output = 1 / (1 + np.exp(-np.clip(z, -250, 250)))  # sigmoid
        return hidden, output

    def _forward(self, X, weights, biases):
        """Return the sigmoid output of the network for input ``X``."""
        return self._forward_with_hidden(X, weights, biases)[1]

    def _last_layer_gradients(self, X, y_col, weights, biases):
        """Return ``(loss, dW, dB)`` for the output layer.

        ``loss`` is the mean binary cross-entropy; ``dW`` and ``dB`` are its
        gradients with respect to ``weights[-1]`` and ``biases[-1]``. The
        gradient uses the ReLU activations that actually feed the output
        layer, taken from the same forward pass that produced the loss.
        """
        hidden, output = self._forward_with_hidden(X, weights, biases)
        clipped = np.clip(output, 1e-8, 1 - 1e-8)
        loss = -np.mean(y_col * np.log(clipped) + (1 - y_col) * np.log(1 - clipped))

        m = X.shape[0]
        delta = output - y_col  # sigmoid + cross-entropy error term
        dW = np.dot(hidden.T, delta) / m
        dB = np.sum(delta, axis=0, keepdims=True) / m
        return loss, dW, dB

    @staticmethod
    def _update_step(opt_name, opt_params, state, grad, step):
        """Return the increment to ADD to a parameter for one optimizer step.

        Args:
            opt_name: One of 'SGD', 'Momentum', 'RMSprop', 'Adam'.
            opt_params: Hyper-parameters for that optimizer.
            state: Mutable dict holding this parameter's running statistics;
                missing entries are treated as zero.
            grad: Gradient of the loss with respect to the parameter.
            step: 1-based update counter (used for Adam bias correction).

        Raises:
            ValueError: If ``opt_name`` is not recognised.
        """
        lr = opt_params['lr']
        if opt_name == 'SGD':
            return -lr * grad
        if opt_name == 'Momentum':
            state['v'] = opt_params['momentum'] * state.get('v', 0.0) - lr * grad
            return state['v']
        if opt_name == 'RMSprop':
            decay = opt_params['decay']
            # Running average of squared gradients scales each coordinate
            state['s'] = decay * state.get('s', 0.0) + (1 - decay) * grad ** 2
            return -lr * grad / (np.sqrt(state['s']) + 1e-8)
        if opt_name == 'Adam':
            beta1, beta2 = opt_params['beta1'], opt_params['beta2']
            state['m'] = beta1 * state.get('m', 0.0) + (1 - beta1) * grad
            state['v'] = beta2 * state.get('v', 0.0) + (1 - beta2) * grad ** 2
            # Bias-corrected first and second moment estimates
            m_hat = state['m'] / (1 - beta1 ** step)
            v_hat = state['v'] / (1 - beta2 ** step)
            return -lr * m_hat / (np.sqrt(v_hat) + 1e-8)
        raise ValueError(f"unknown optimizer: {opt_name}")

    def _train_last_layer(self, opt_name, opt_params, X_train, y_train, epochs):
        """Train the output layer of a fresh network with one optimizer.

        Returns:
            ``(weights, biases, losses)`` where ``losses`` holds the loss
            measured before each of the ``epochs`` updates.
        """
        weights, biases = self._init_network()
        y_col = np.asarray(y_train).reshape(-1, 1)
        weight_state, bias_state = {}, {}
        losses = []
        for step in range(1, epochs + 1):
            loss, dW, dB = self._last_layer_gradients(X_train, y_col, weights, biases)
            losses.append(loss)
            weights[-1] += self._update_step(opt_name, opt_params, weight_state, dW, step)
            biases[-1] += self._update_step(opt_name, opt_params, bias_state, dB, step)
        return weights, biases, losses

    def compare_optimizers(self, X_train, y_train, X_test, y_test, epochs=500):
        """Train one fresh network per optimizer and report the results.

        Args:
            X_train, y_train: Training inputs and labels.
            X_test, y_test: Held-out inputs and labels for the final accuracy.
            epochs: Number of full-batch updates per optimizer.

        Returns:
            Dict mapping optimizer name to ``{'losses': [...], 'accuracy': float}``.
        """
        optimizers = {
            'SGD': {'lr': 0.01},
            'Momentum': {'lr': 0.01, 'momentum': 0.9},
            'RMSprop': {'lr': 0.001, 'decay': 0.99},
            'Adam': {'lr': 0.001, 'beta1': 0.9, 'beta2': 0.999},
        }
        results = {}
        for opt_name, opt_params in optimizers.items():
            print(f"\n训练 {opt_name}:")
            weights, biases, losses = self._train_last_layer(
                opt_name, opt_params, X_train, y_train, epochs
            )

            # Evaluate on the held-out split
            test_output = self._forward(X_test, weights, biases)
            test_pred = (test_output > 0.5).astype(int).flatten()
            test_accuracy = accuracy_score(y_test, test_pred)
            results[opt_name] = {
                'losses': losses,
                'accuracy': test_accuracy,
            }

            # With epochs=0 there is no loss to report
            if losses:
                print(f"最终损失: {losses[-1]:.4f}")
            print(f"测试准确率: {test_accuracy:.4f}")
        return results
# Run the optimizer comparison on a 2-10-1 network
gd_variants = GradientDescentVariants(layer_sizes=[2, 10, 1])
results = gd_variants.compare_optimizers(
    X_train_scaled,
    y_train,
    X_test_scaled,
    y_test,
    epochs=300,
)

# Overlay every optimizer's loss curve; the legend carries its test accuracy
plt.figure(figsize=(10, 6))
for opt_name, result in results.items():
    curve_label = f"{opt_name} (Acc: {result['accuracy']:.3f})"
    plt.plot(result['losses'], label=curve_label, linewidth=2)
plt.title('不同优化器的训练损失比较')
plt.xlabel('Epoch')
plt.ylabel('损失值')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
实际应用
使用PyTorch实现反向传播
import torch
import torch.nn as nn
import torch.optim as optim
# Convert the NumPy splits to float32 tensors; labels become column vectors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)

# 2-10-8-1 network: ReLU after each hidden layer, sigmoid on the output
layers = [
    nn.Linear(2, 10),
    nn.ReLU(),
    nn.Linear(10, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
    nn.Sigmoid(),
]
model = nn.Sequential(*layers)

# Binary cross-entropy on the sigmoid probabilities, optimised with Adam
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print("使用PyTorch训练神经网络:")
losses = []
for epoch in range(1000):
    # Clear gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass on the whole training split
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    # Autograd computes the gradients; the optimizer applies them
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

    # Report accuracy only on every 200th epoch
    if (epoch + 1) % 200 != 0:
        continue
    with torch.no_grad():
        train_outputs = model(X_train_tensor)
        train_hits = (train_outputs > 0.5).float() == y_train_tensor
        train_accuracy = train_hits.float().mean()
        test_outputs = model(X_test_tensor)
        test_hits = (test_outputs > 0.5).float() == y_test_tensor
        test_accuracy = test_hits.float().mean()
    print(f'Epoch [{epoch+1}/1000], Loss: {loss.item():.4f}, '
          f'训练准确率: {train_accuracy:.4f}, 测试准确率: {test_accuracy:.4f}')

# Plot the per-epoch training loss
plt.figure(figsize=(10, 6))
plt.plot(losses, 'b-', linewidth=2)
plt.title('PyTorch训练损失曲线')
plt.xlabel('Epoch')
plt.ylabel('损失值')
plt.grid(True, alpha=0.3)
plt.show()
反向传播最佳实践
- 梯度检查:验证梯度计算的正确性
- 梯度裁剪:防止梯度爆炸
- 学习率调度:动态调整学习率
- 批量归一化:加速训练和稳定梯度
- 权重初始化:使用合适的初始化方法
反向传播是神经网络训练的核心,掌握反向传播算法对于理解深度学习至关重要。