卷积神经网络详解
卷积神经网络详解
卷积神经网络(Convolutional Neural Network,CNN)是专门用于处理网格结构数据(如图像)的深度学习模型。
CNN原理
卷积操作
卷积操作通过卷积核在输入数据上滑动:在每个位置,将卷积核与其覆盖的局部窗口逐元素相乘并求和(再加上偏置),从而提取局部特征。
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')
# Load the handwritten digits dataset.
digits = load_digits()
X = digits.data
y = digits.target

# Reshape the flat feature rows into image layout: (samples, height, width, channels).
X_reshaped = X.reshape((-1, 8, 8, 1))

# Hold out 20% of the samples for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_reshaped,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"训练集形状: {X_train.shape}")
print(f"测试集形状: {X_test.shape}")
print(f"类别数量: {np.unique(y).size}")
卷积层实现
class SimpleConv2D:
    """Minimal NumPy 2D convolution layer (forward pass only).

    Attributes:
        in_channels: Number of input channels.
        out_channels: Number of output channels (one kernel per channel).
        kernel_size: Side length of the square kernel.
        stride: Step between neighbouring windows.
        padding: Number of rows/columns padded onto each spatial border.
        weights: Kernels, shape (out_channels, in_channels, kernel_size, kernel_size).
        biases: Per-kernel bias, shape (out_channels, 1).
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        """Store the hyper-parameters and draw small random kernels.

        Args:
            in_channels: Number of input channels.
            out_channels: Number of output channels (number of kernels).
            kernel_size: Side length of the square kernel.
            stride: Step of the sliding window.
            padding: Padding added on every side of both spatial axes.
        """
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        # Kernels are standard-normal samples scaled by 0.01; biases start at zero.
        kernel_shape = (out_channels, in_channels, kernel_size, kernel_size)
        self.weights = np.random.randn(*kernel_shape) * 0.01
        self.biases = np.zeros((out_channels, 1))

    def forward(self, X):
        """Convolve a batch with every kernel.

        Args:
            X: Input array laid out as (batch_size, in_channels, height, width).

        Returns:
            Array of shape (batch_size, out_channels, out_height, out_width).
        """
        n_samples, _, in_h, in_w = X.shape
        k = self.kernel_size
        step = self.stride
        pad = self.padding

        # Output size per spatial axis; floor division drops incomplete windows.
        out_h = (in_h + 2 * pad - k) // step + 1
        out_w = (in_w + 2 * pad - k) // step + 1
        result = np.zeros((n_samples, self.out_channels, out_h, out_w))

        # Pad only the two spatial axes; batch and channel axes stay untouched.
        padded = X
        if pad > 0:
            padded = np.pad(X, ((0, 0), (0, 0), (pad, pad), (pad, pad)))

        # Visit output positions in row-major order.
        for row, col in np.ndindex(out_h, out_w):
            top = row * step
            left = col * step
            patch = padded[:, :, top:top + k, left:left + k]
            for ch in range(self.out_channels):
                # Element-wise product with kernel `ch`, reduced over the
                # channel and spatial axes: one response per sample.
                response = (patch * self.weights[ch]).sum(axis=(1, 2, 3))
                result[:, ch, row, col] = response + self.biases[ch]
        return result
# Exercise the convolution layer on a small batch.
conv_layer = SimpleConv2D(in_channels=1, out_channels=4, kernel_size=3, stride=1, padding=1)
# X_train is stored channels-last (N, H, W, C), but SimpleConv2D.forward unpacks
# its input as (N, C, H, W). Passing the batch unchanged makes the layer treat the
# 8 image rows as channels and silently broadcast the single-channel kernels,
# yielding a meaningless (5, 4, 8, 1) result. Move the channel axis first.
X_batch = X_train[:5].transpose(0, 3, 1, 2)  # take 5 samples, as (N, C, H, W)
output = conv_layer.forward(X_batch)
print(f"输入形状: {X_batch.shape}")
print(f"输出形状: {output.shape}")
池化层
最大池化
class MaxPooling2D:
    """Max-pooling layer over the two trailing axes of a 4-D array (forward only)."""

    def __init__(self, pool_size=2, stride=2):
        """Store the pooling configuration.

        Args:
            pool_size: Side length of the square pooling window.
            stride: Step between neighbouring windows.
        """
        self.pool_size = pool_size
        self.stride = stride

    def forward(self, X):
        """Reduce every pooling window to its maximum.

        Args:
            X: Input array laid out as (batch_size, channels, height, width).

        Returns:
            Array of shape (batch_size, channels, out_height, out_width).
        """
        n_samples, n_channels, in_h, in_w = X.shape
        size = self.pool_size
        step = self.stride

        # Floor division drops windows that would run past the border.
        out_h = (in_h - size) // step + 1
        out_w = (in_w - size) // step + 1
        pooled = np.zeros((n_samples, n_channels, out_h, out_w))

        # Visit output positions in row-major order.
        for row, col in np.ndindex(out_h, out_w):
            top = row * step
            left = col * step
            patch = X[:, :, top:top + size, left:left + size]
            # Maximum over the two spatial axes, kept per sample and channel.
            pooled[:, :, row, col] = patch.max(axis=(2, 3))
        return pooled
# Exercise the pooling layer on `output`, the array assigned by the earlier
# convolution demo, and print the shapes before and after pooling.
pool_layer = MaxPooling2D(pool_size=2, stride=2)
pooled_output = pool_layer.forward(output)
print(f"池化前形状: {output.shape}")
print(f"池化后形状: {pooled_output.shape}")
简单CNN实现
完整CNN网络
class SimpleCNN:
    """Small NumPy CNN: convolution -> ReLU -> max pooling -> two dense layers.

    Forward pass only. The first dense layer expects 8 * 4 * 4 features,
    i.e. 8 feature maps of 4x4, which is what an 8x8 input becomes after
    the padded 3x3 convolution and the 2x2 pooling.
    """

    def __init__(self, input_channels=1, num_classes=10):
        """Create the layers and randomly initialise the dense weights.

        Args:
            input_channels: Number of channels of the input images.
            num_classes: Width of the output layer.
        """
        self.conv1 = SimpleConv2D(input_channels, 8, kernel_size=3, stride=1, padding=1)
        self.pool1 = MaxPooling2D(pool_size=2, stride=2)

        # Dense layers: weights are standard-normal samples scaled by 0.01,
        # biases start at zero.
        flat_features = 8 * 4 * 4
        hidden_units = 64
        self.fc1_weights = np.random.randn(flat_features, hidden_units) * 0.01
        self.fc1_biases = np.zeros((1, hidden_units))
        self.fc2_weights = np.random.randn(hidden_units, num_classes) * 0.01
        self.fc2_biases = np.zeros((1, num_classes))

    def _relu(self, x):
        """Element-wise rectifier: negative entries become zero."""
        return np.maximum(0, x)

    def forward(self, X):
        """Compute the raw class scores for a batch.

        Args:
            X: Batch handed straight to ``self.conv1.forward``.

        Returns:
            Array of shape (batch_size, num_classes) holding unnormalised scores.
        """
        # Convolution -> ReLU -> pooling.
        feature_maps = self.pool1.forward(self._relu(self.conv1.forward(X)))

        # Flatten everything except the batch axis.
        flat = feature_maps.reshape(feature_maps.shape[0], -1)

        # Hidden dense layer with ReLU, then the linear output layer.
        hidden = self._relu(flat.dot(self.fc1_weights) + self.fc1_biases)
        return hidden.dot(self.fc2_weights) + self.fc2_biases

    def predict(self, X):
        """Return the index of the highest score for every sample in ``X``."""
        return self.forward(X).argmax(axis=1)
# Build the CNN model.
cnn = SimpleCNN(input_channels=1, num_classes=10)
# Smoke-test the forward pass.
# X_train is stored channels-last (N, H, W, C), but the NumPy layers index their
# input as (N, C, H, W). Fed unchanged, the width-1 "image" pools down to zero
# columns, the flattened batch becomes (5, 0), and the first dense layer raises
# ValueError. Move the channel axis to position 1 before the forward pass.
X_batch = X_train[:5].transpose(0, 3, 1, 2)
output = cnn.forward(X_batch)
print(f"输入形状: {X_batch.shape}")
print(f"输出形状: {output.shape}")
print(f"预测类别: {cnn.predict(X_batch)}")
使用PyTorch实现CNN
PyTorch CNN
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# Convert the NumPy splits to PyTorch tensors. The transpose moves the channel
# axis from the last position to position 1.
X_train_tensor = torch.FloatTensor(X_train.transpose(0, 3, 1, 2)) # (N, C, H, W)
y_train_tensor = torch.LongTensor(y_train)
X_test_tensor = torch.FloatTensor(X_test.transpose(0, 3, 1, 2))
y_test_tensor = torch.LongTensor(y_test)
# Wrap the training tensors in a DataLoader that yields shuffled batches of 32.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
# Define the CNN model.
class PyTorchCNN(nn.Module):
    """Two conv -> ReLU -> pool stages followed by a two-layer classifier.

    The first dense layer takes 32 * 2 * 2 features, which corresponds to a
    single-channel 8x8 input after the two 2x2 poolings.
    """

    def __init__(self, num_classes=10):
        """Create the layers.

        Args:
            num_classes: Number of output scores produced per sample.
        """
        super().__init__()
        # Convolutional layers: 3x3 kernels with stride 1 and padding 1.
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        # One pooling module shared by both stages.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        # Dense layers (sized for 8x8 input images).
        self.fc1 = nn.Linear(32 * 2 * 2, 128)
        self.fc2 = nn.Linear(128, num_classes)
        # Dropout applied to the hidden dense activations.
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the raw class scores for a batch shaped (N, 1, H, W)."""
        # Both stages apply convolution, ReLU and pooling in that order.
        for conv in (self.conv1, self.conv2):
            x = self.pool(torch.relu(conv(x)))

        # Flatten everything except the batch axis.
        flat = x.view(x.shape[0], -1)

        hidden = self.dropout(torch.relu(self.fc1(flat)))
        return self.fc2(hidden)
# Instantiate the model.
model = PyTorchCNN(num_classes=10)

# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train for 10 epochs.
print("训练PyTorch CNN:")
for epoch in range(10):
    model.train()
    running_loss, correct, total = 0.0, 0, 0

    for batch_X, batch_y in train_loader:
        # Clear stale gradients, then run the forward and backward passes.
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss and the number of correct predictions.
        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += batch_y.shape[0]
        correct += (predicted == batch_y).sum().item()

    # Per-epoch summary: mean batch loss and training accuracy in percent.
    train_accuracy = 100 * correct / total
    print(f'Epoch [{epoch+1}/10], Loss: {running_loss/len(train_loader):.4f}, '
          f'训练准确率: {train_accuracy:.2f}%')

# Evaluate on the held-out test tensors without tracking gradients.
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    _, predicted = torch.max(test_outputs.data, 1)
    n_test = y_test_tensor.size(0)
    test_accuracy = 100 * (predicted == y_test_tensor).sum().item() / n_test
print(f'\n测试准确率: {test_accuracy:.2f}%')

# Confusion matrix of true labels against test predictions.
cm = confusion_matrix(y_test, predicted.numpy())
print(f"\n混淆矩阵:\n{cm}")
可视化卷积核
# Visualize the kernels of the first convolutional layer.
def visualize_filters(model):
    """Plot up to eight kernels of ``model.conv1`` as grayscale images.

    Only input channel 0 of each kernel is drawn, in a 2x4 grid. If the layer
    has fewer than eight kernels, the remaining grid cells are left blank.

    Args:
        model: Object whose ``conv1.weight`` is a torch tensor indexed as
            (kernel, input channel, row, column).
    """
    # detach() yields a gradient-free view without going through the legacy
    # `.data` attribute, and cpu() makes the NumPy conversion work even when
    # the weights live on another device.
    filters = model.conv1.weight.detach().cpu().numpy()

    _, axes = plt.subplots(2, 4, figsize=(10, 5))
    axes = axes.ravel()

    # Never index past the kernels that actually exist.
    shown = min(len(axes), filters.shape[0])
    for i, ax in enumerate(axes):
        if i < shown:
            ax.imshow(filters[i, 0], cmap='gray')
            ax.set_title(f'卷积核 {i+1}')
        ax.axis('off')

    plt.tight_layout()
    plt.show()


visualize_filters(model)
CNN架构
经典CNN架构
# Simplified AlexNet-style network.
class AlexNetSimplified(nn.Module):
    """Three-convolution feature extractor followed by a three-layer classifier.

    The classifier's first dense layer takes 128 * 2 * 2 features, which
    corresponds to a single-channel 8x8 input after the two 2x2 poolings.
    """

    def __init__(self, num_classes=10):
        """Create the feature extractor and the classifier head.

        Args:
            num_classes: Number of output scores produced per sample.
        """
        super().__init__()

        # Every convolution uses a 3x3 kernel with stride 1 and padding 1.
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        feature_layers = [
            nn.Conv2d(1, 32, **conv_kwargs),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, **conv_kwargs),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, **conv_kwargs),
            nn.ReLU(inplace=True),
        ]
        self.features = nn.Sequential(*feature_layers)

        # Dense head with dropout in front of the first two linear layers.
        head_layers = [
            nn.Dropout(0.5),
            nn.Linear(128 * 2 * 2, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the raw class scores for a batch shaped (N, 1, H, W)."""
        feature_maps = self.features(x)
        # Flatten everything except the batch axis before the dense head.
        flat = feature_maps.view(feature_maps.shape[0], -1)
        return self.classifier(flat)
# Instantiate the model and print its total number of parameter elements.
alexnet = AlexNetSimplified(num_classes=10)
print(f"AlexNet参数数量: {sum(p.numel() for p in alexnet.parameters()):,}")
CNN最佳实践
- 数据增强:使用旋转、翻转等增强数据
- 批量归一化:加速训练和稳定梯度
- 学习率调度:动态调整学习率
- 正则化:使用Dropout和L2正则化
- 预训练模型:使用预训练模型进行迁移学习
CNN是计算机视觉的核心技术,掌握CNN对于图像分类、目标检测等任务至关重要。