🤖

卷积神经网络详解

📂 ai ⏱ 5 min 852 words

卷积神经网络详解

卷积神经网络（Convolutional Neural Network，CNN）是专门用于处理网格结构数据（如图像）的深度学习模型。

CNN原理

卷积操作

卷积操作通过卷积核在输入数据上滑动，在每个位置对局部窗口做加权求和，从而提取局部特征。下面先加载并划分手写数字数据集，供后续各节的示例使用。

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Load the scikit-learn handwritten-digits dataset (features and labels).
digits = load_digits()
X = digits.data
y = digits.target

# Unflatten every sample into a channels-last image:
# (num_samples, height, width, channels).
X_reshaped = X.reshape((X.shape[0], 8, 8, 1))

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
split = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

print(f"训练集形状: {X_train.shape}")
print(f"测试集形状: {X_test.shape}")
print(f"类别数量: {len(np.unique(y))}")

卷积层实现

class SimpleConv2D:
    """Naive 2-D convolution layer (forward pass only) built on NumPy."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        """Store the layer configuration and initialise its parameters.

        Args:
            in_channels: Number of input channels.
            out_channels: Number of output channels (i.e. number of kernels).
            kernel_size: Side length of the square kernel.
            stride: Step between neighbouring windows.
            padding: Number of zero rows/columns added on every spatial edge.
        """
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        # Small random kernels, one (in_channels, k, k) stack per output
        # channel, and one zero bias per output channel.
        kernel_shape = (out_channels, in_channels, kernel_size, kernel_size)
        self.weights = 0.01 * np.random.randn(*kernel_shape)
        self.biases = np.zeros((out_channels, 1))

    def forward(self, X):
        """Run the convolution over a batch.

        Args:
            X: Input array of shape (batch_size, in_channels, height, width).

        Returns:
            Array of shape (batch_size, out_channels, out_height, out_width).
        """
        n_samples, _, in_h, in_w = X.shape
        size, step, pad = self.kernel_size, self.stride, self.padding

        # Output extent along each spatial axis.
        out_h = (in_h + 2 * pad - size) // step + 1
        out_w = (in_w + 2 * pad - size) // step + 1
        result = np.zeros((n_samples, self.out_channels, out_h, out_w))

        # Zero-pad only the two spatial axes; batch and channel axes are
        # left untouched.
        padded = X
        if pad > 0:
            spatial = (pad, pad)
            padded = np.pad(X, ((0, 0), (0, 0), spatial, spatial))

        for row in range(out_h):
            top = row * step
            for col in range(out_w):
                left = col * step
                # Window under the kernel for every sample and channel.
                patch = padded[:, :, top:top + size, left:left + size]

                for idx in range(self.out_channels):
                    # Weighted sum over channel and both kernel axes gives
                    # one response per sample.
                    response = np.sum(patch * self.weights[idx], axis=(1, 2, 3))
                    result[:, idx, row, col] = response + self.biases[idx]

        return result

# Smoke-test the convolution layer on a mini-batch of five training images.
conv_layer = SimpleConv2D(in_channels=1, out_channels=4, kernel_size=3, stride=1, padding=1)
# X_train is stored channels-last (N, H, W, C), but SimpleConv2D.forward reads
# its input as (N, C, H, W). Move the channel axis first; otherwise the image
# height is silently treated as the channel count and the output is wrong.
X_batch = X_train[:5].transpose(0, 3, 1, 2)
output = conv_layer.forward(X_batch)

print(f"输入形状: {X_batch.shape}")
print(f"输出形状: {output.shape}")

池化层

最大池化

class MaxPooling2D:
    """Max-pooling layer (forward pass only) built on NumPy."""

    def __init__(self, pool_size=2, stride=2):
        """Store the pooling configuration.

        Args:
            pool_size: Side length of the square pooling window.
            stride: Step between neighbouring windows.
        """
        self.pool_size, self.stride = pool_size, stride

    def forward(self, X):
        """Take the maximum of every pooling window.

        Args:
            X: Input array of shape (batch_size, channels, height, width).

        Returns:
            Array of shape (batch_size, channels, out_height, out_width).
        """
        n_samples, n_channels, in_h, in_w = X.shape
        size, step = self.pool_size, self.stride

        # No padding is applied, so trailing rows/columns that do not fill a
        # whole window are dropped by the floor division.
        out_h = (in_h - size) // step + 1
        out_w = (in_w - size) // step + 1
        pooled = np.zeros((n_samples, n_channels, out_h, out_w))

        for row in range(out_h):
            top = row * step
            for col in range(out_w):
                left = col * step
                patch = X[:, :, top:top + size, left:left + size]
                # Reduce the two spatial axes, keeping batch and channel.
                pooled[:, :, row, col] = patch.max(axis=(2, 3))

        return pooled

# Smoke-test the pooling layer by down-sampling the convolution result
# computed above with a 2x2 window and a step of 2.
pool_layer = MaxPooling2D(stride=2, pool_size=2)
pooled_output = pool_layer.forward(output)

print(f"池化前形状: {output.shape}")
print(f"池化后形状: {pooled_output.shape}")

简单CNN实现

完整CNN网络

class SimpleCNN:
    """Small NumPy CNN (forward pass only).

    Architecture: conv (3x3, 8 filters) -> ReLU -> max-pool (2x2) ->
    fully connected (64) -> ReLU -> fully connected (num_classes).
    """

    def __init__(self, input_channels=1, num_classes=10, input_size=8):
        """Create the layers with small random weights.

        Args:
            input_channels: Number of channels in the input images.
            num_classes: Number of scores produced per sample.
            input_size: Spatial size of the input images, either one int for
                square images or a ``(height, width)`` pair. The default of 8
                reproduces the original fixed 8x8 configuration.
        """
        if np.ndim(input_size) == 0:
            in_height = in_width = int(input_size)
        else:
            in_height, in_width = (int(side) for side in input_size)

        conv_filters = 8
        self.conv1 = SimpleConv2D(input_channels, conv_filters, kernel_size=3, stride=1, padding=1)
        self.pool1 = MaxPooling2D(pool_size=2, stride=2)

        # The 3x3 / stride-1 / padding-1 convolution keeps the spatial size
        # and the 2x2 / stride-2 pooling halves it (rounding down), so an
        # 8x8 input yields 8 * 4 * 4 flattened features.
        self.flat_features = conv_filters * (in_height // 2) * (in_width // 2)

        self.fc1_weights = np.random.randn(self.flat_features, 64) * 0.01
        self.fc1_biases = np.zeros((1, 64))

        self.fc2_weights = np.random.randn(64, num_classes) * 0.01
        self.fc2_biases = np.zeros((1, num_classes))

    def _relu(self, x):
        """Element-wise rectified linear unit."""
        return np.maximum(0, x)

    def forward(self, X):
        """Compute the raw class scores for a batch.

        Args:
            X: Input array of shape (batch_size, channels, height, width).

        Returns:
            Array of shape (batch_size, num_classes) with unnormalised scores
            (no softmax is applied).

        Raises:
            ValueError: If the flattened feature count does not match the
                first fully connected layer, e.g. because the input is
                channels-last or has a different spatial size.
        """
        # Convolution stage.
        x = self.conv1.forward(X)
        x = self._relu(x)

        # Pooling stage.
        x = self.pool1.forward(x)

        # Flatten everything except the batch axis.
        batch_size = x.shape[0]
        x = x.reshape(batch_size, -1)

        expected = self.fc1_weights.shape[0]
        if x.shape[1] != expected:
            raise ValueError(
                f"expected {expected} flattened features per sample but got "
                f"{x.shape[1]}; the input must have shape "
                f"(batch, channels, height, width) and match the configured "
                f"input size"
            )

        # Hidden fully connected layer.
        x = np.dot(x, self.fc1_weights) + self.fc1_biases
        x = self._relu(x)

        # Output layer.
        x = np.dot(x, self.fc2_weights) + self.fc2_biases

        return x

    def predict(self, X):
        """Return the index of the highest-scoring class for each sample."""
        output = self.forward(X)
        return np.argmax(output, axis=1)

# Build the CNN model.
cnn = SimpleCNN(input_channels=1, num_classes=10)

# Smoke-test the forward pass on five training images. X_train is stored
# channels-last (N, H, W, C) while the NumPy layers read (N, C, H, W), so the
# channel axis is moved first; without this the pooled width collapses to 0
# and the first fully connected layer raises a shape error.
X_batch = X_train[:5].transpose(0, 3, 1, 2)
output = cnn.forward(X_batch)

print(f"输入形状: {X_batch.shape}")
print(f"输出形状: {output.shape}")
print(f"预测类别: {cnn.predict(X_batch)}")

使用PyTorch实现CNN

PyTorch CNN

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Convert the NumPy splits to PyTorch tensors. torch.tensor with an explicit
# dtype replaces the legacy FloatTensor/LongTensor constructors and always
# copies the data. Images are moved from channels-last to (N, C, H, W).
X_train_tensor = torch.tensor(X_train.transpose(0, 3, 1, 2), dtype=torch.float32)  # (N, C, H, W)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.transpose(0, 3, 1, 2), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Wrap the training split in a loader that yields shuffled batches of 32.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# CNN classifier definition.
class PyTorchCNN(nn.Module):
    """Two conv -> ReLU -> max-pool stages followed by a two-layer head."""

    def __init__(self, num_classes=10):
        """Create the layers.

        Args:
            num_classes: Number of scores produced per sample.
        """
        super().__init__()

        # Convolution layers: 3x3 kernels with padding 1.
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)

        # A single 2x2 / stride-2 pooling module, reused after each convolution.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Fully connected head; the 32 * 2 * 2 input width assumes 8x8 input
        # images (halved by each of the two pooling steps).
        self.fc1 = nn.Linear(32 * 2 * 2, 128)
        self.fc2 = nn.Linear(128, num_classes)

        # Dropout applied to the hidden fully connected activations.
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the raw class scores for a batch of images."""
        # Both convolution stages share the same conv -> ReLU -> pool shape.
        for conv in (self.conv1, self.conv2):
            x = self.pool(torch.relu(conv(x)))

        # Flatten everything except the batch axis.
        features = x.view(x.size(0), -1)

        # Hidden layer with dropout, then the output layer.
        hidden = self.dropout(torch.relu(self.fc1(features)))
        return self.fc2(hidden)

# Instantiate the model.
model = PyTorchCNN(num_classes=10)

# Cross-entropy loss on the raw class scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model. The epoch count lives in one place so the loop bound and
# the progress message cannot drift apart.
num_epochs = 10
print("训练PyTorch CNN:")
for epoch in range(num_epochs):
    model.train()  # training mode: dropout is active
    running_loss = 0.0
    correct = 0
    total = 0

    for batch_X, batch_y in train_loader:
        # Forward pass.
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running statistics. argmax picks the predicted class directly,
        # without going through the discouraged `.data` attribute.
        running_loss += loss.item()
        predicted = outputs.argmax(dim=1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

    # Report the mean per-batch loss and the training accuracy of the epoch.
    train_accuracy = 100 * correct / total
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, '
          f'训练准确率: {train_accuracy:.2f}%')

# Evaluate on the held-out split: eval mode disables dropout and no_grad
# skips gradient bookkeeping.
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    predicted = test_outputs.argmax(dim=1)
    test_accuracy = 100 * (predicted == y_test_tensor).sum().item() / y_test_tensor.size(0)
    print(f'\n测试准确率: {test_accuracy:.2f}%')

# Confusion matrix of true labels versus predictions on the test split.
cm = confusion_matrix(y_test, predicted.numpy())
print(f"\n混淆矩阵:\n{cm}")

可视化卷积核

# Visualise the kernels of the first convolution layer.
def visualize_filters(model):
    """Plot up to the first eight kernels of ``model.conv1`` in grayscale.

    Args:
        model: A module exposing a ``conv1`` convolution layer; only the
            first input channel of each kernel is drawn.
    """
    # detach() drops autograd tracking and cpu() makes the NumPy conversion
    # work even when the weights do not live in CPU memory.
    filters = model.conv1.weight.detach().cpu().numpy()

    # 2 x 4 grid, flattened so the panels can be indexed linearly.
    _, axes = plt.subplots(2, 4, figsize=(10, 5))
    axes = axes.ravel()

    # Never index past the available kernels; panels without a kernel are
    # left blank instead of raising IndexError.
    num_shown = min(len(axes), filters.shape[0])
    for i, ax in enumerate(axes):
        if i < num_shown:
            ax.imshow(filters[i, 0], cmap='gray')
            ax.set_title(f'卷积核 {i+1}')
        ax.axis('off')

    plt.tight_layout()
    plt.show()


visualize_filters(model)

CNN架构

经典CNN架构

# Scaled-down, AlexNet-style network.
class AlexNetSimplified(nn.Module):
    """Three-convolution feature extractor followed by a three-layer classifier."""

    def __init__(self, num_classes=10):
        """Create the feature extractor and the classifier head.

        Args:
            num_classes: Number of scores produced per sample.
        """
        super().__init__()

        # Two conv -> ReLU -> pool stages, then a final conv -> ReLU.
        feature_layers = [
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
        ]
        self.features = nn.Sequential(*feature_layers)

        # The 128 * 2 * 2 input width assumes 8x8 input images (halved by
        # each of the two pooling steps).
        classifier_layers = [
            nn.Dropout(0.5),
            nn.Linear(128 * 2 * 2, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, num_classes),
        ]
        self.classifier = nn.Sequential(*classifier_layers)

    def forward(self, x):
        """Return the raw class scores for a batch of images."""
        feature_maps = self.features(x)
        # Flatten everything except the batch axis before the classifier.
        flat = feature_maps.view(feature_maps.size(0), -1)
        return self.classifier(flat)


# Instantiate the network and report how many parameters it has.
alexnet = AlexNetSimplified(num_classes=10)
print(f"AlexNet参数数量: {sum(p.numel() for p in alexnet.parameters()):,}")

CNN最佳实践

1. 数据增强：使用旋转、翻转等方式扩充训练数据
2. 批量归一化：加速训练并稳定梯度
3. 学习率调度：在训练过程中动态调整学习率
4. 正则化：使用Dropout和L2正则化抑制过拟合
5. 预训练模型：使用预训练模型进行迁移学习

CNN是计算机视觉的核心技术，掌握CNN对于图像分类、目标检测等任务至关重要。