🤖

目标检测实践详解

📂 ai ⏱ 5 min 915 words

目标检测实践详解

目标检测是计算机视觉中的重要任务，不仅要识别图像中的物体类别，还要定位它们的位置。

目标检测基础

检测流程

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
import cv2
import warnings
warnings.filterwarnings('ignore')

# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"使用设备: {device}")

# Parameters of the simulated detection data. NumPy's global RNG is seeded so
# the random data generated afterwards is reproducible.
np.random.seed(42)
n_samples, img_size, n_classes = 500, 224, 5

class DetectionDataset(Dataset):
    """Synthetic detection dataset: random-noise images with random box labels.

    Each image is Gaussian noise of shape ``(3, img_size, img_size)`` stored as
    float32. Each image carries between one and three objects, every object
    being a list ``[cls, x1, y1, x2, y2]``. All values come from NumPy's global
    random generator, so seeding it beforehand makes the dataset reproducible.
    """

    def __init__(self, n_samples, img_size, n_classes):
        self.n_samples = n_samples
        self.img_size = img_size
        self.n_classes = n_classes

        # Images are drawn first, then the labels image by image.
        noise = np.random.randn(n_samples, 3, img_size, img_size)
        self.images = noise.astype(np.float32)
        self.labels = [
            self._sample_objects(img_size, n_classes) for _ in range(n_samples)
        ]

    @staticmethod
    def _sample_objects(img_size, n_classes):
        """Draw the object list (one to three objects) for a single image."""
        last_pixel = img_size - 1
        count = np.random.randint(1, 4)
        objects = []
        while len(objects) < count:
            cls = np.random.randint(0, n_classes)
            x1 = np.random.randint(0, img_size - 50)
            y1 = np.random.randint(0, img_size - 50)
            # Box extent is 30..79 pixels, clamped so it stays inside the image.
            x2 = min(x1 + np.random.randint(30, 80), last_pixel)
            y2 = min(y1 + np.random.randint(30, 80), last_pixel)
            objects.append([cls, x1, y1, x2, y2])
        return objects

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        # The image is returned as a float tensor; the label stays a plain list.
        return torch.FloatTensor(self.images[idx]), self.labels[idx]

# Build the simulated dataset from the module-level settings
dataset = DetectionDataset(n_samples=n_samples, img_size=img_size, n_classes=n_classes)
print(f"数据集大小: {len(dataset)}")

# Inspect the first sample: image tensor shape and its object labels
sample_image, sample_labels = dataset[0]
print(f"图像形状: {sample_image.shape}")
print(f"标签数量: {len(sample_labels)}")
print(f"示例标签: {sample_labels[0]}")

检测模型

简单检测网络

class SimpleDetector(nn.Module):
    """Small CNN that predicts a fixed number of detections per image.

    Four conv / ReLU / max-pool stages shrink each spatial side by a factor of
    16; a fully connected head then maps the flattened feature map to
    ``num_boxes`` vectors of length ``num_classes + 5``.

    Args:
        num_classes: Number of object classes.
        num_boxes: Number of detection slots predicted per image.
        img_size: Side length of the square input images. The default of 224
            gives the 256 * 14 * 14 head input that used to be hard-coded.

    Raises:
        ValueError: If ``img_size`` is too small to survive the four
            2x2 poolings (i.e. smaller than 16).
    """

    # Total spatial reduction of the four MaxPool2d(2, 2) layers.
    _DOWNSAMPLE = 16

    def __init__(self, num_classes, num_boxes=3, img_size=224):
        super().__init__()

        self.num_classes = num_classes
        self.num_boxes = num_boxes

        # Each pooling floors the size, which matches a single floor division.
        feature_side = img_size // self._DOWNSAMPLE
        if feature_side < 1:
            raise ValueError(
                f"img_size must be at least {self._DOWNSAMPLE}, got {img_size}"
            )

        # Feature extractor
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
        )

        # Detection head. Every box gets num_classes + 5 outputs; the extra 5
        # are presumably 4 box coordinates plus 1 confidence value, but the
        # layout is not fixed anywhere in this class.
        self.detector = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * feature_side * feature_side, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, num_boxes * (num_classes + 5)),
        )

    def forward(self, x):
        """Return detections of shape (batch, num_boxes, num_classes + 5)."""
        features = self.features(x)
        detections = self.detector(features)

        # Split the flat head output into one vector per detection slot.
        return detections.view(x.size(0), self.num_boxes, self.num_classes + 5)

# Instantiate the detector, move it to the selected device, and report its size
model = SimpleDetector(num_classes=n_classes, num_boxes=3)
model = model.to(device)
print(f"模型参数: {sum(p.numel() for p in model.parameters()):,}")

损失函数

class DetectionLoss(nn.Module):
    """Simplified detection loss with index-based target matching.

    The previous implementation always returned the Python number 0, so it
    carried no gradient. This version returns a differentiable scalar tensor.

    Layout assumed for each predicted box vector (length ``num_classes + 5``)::

        [class logits (num_classes) | x1, y1, x2, y2 | objectness]

    Matching is deliberately naive: the j-th target of an image is assigned to
    the j-th prediction slot. Targets beyond the number of slots are ignored.
    A real detector would use IoU- or cost-based assignment instead.

    Per image the loss is the sum of
      * cross-entropy on the class logits of the matched slots,
      * MSE on the box coordinates of the matched slots,
      * MSE on the objectness of all slots (1 for matched, 0 for unmatched).

    Args:
        num_classes: Number of object classes.
        coord_scale: Target coordinates are divided by this value before the
            box regression loss (e.g. the image size, to work in [0, 1]).
            The default 1.0 leaves them untouched.

    Raises:
        ValueError: If ``coord_scale`` is not positive.
    """

    def __init__(self, num_classes, coord_scale=1.0):
        super().__init__()
        if coord_scale <= 0:
            raise ValueError(f"coord_scale must be positive, got {coord_scale}")
        self.num_classes = num_classes
        self.coord_scale = float(coord_scale)
        self.mse_loss = nn.MSELoss()
        self.ce_loss = nn.CrossEntropyLoss()

    def forward(self, predictions, targets):
        """Compute the mean detection loss over the batch.

        Args:
            predictions: Tensor of shape [batch_size, num_boxes, num_classes + 5].
            targets: One list per image; each entry is [cls, x1, y1, x2, y2].

        Returns:
            Scalar tensor connected to ``predictions`` in the autograd graph.

        Raises:
            ValueError: If the last dimension of ``predictions`` is not
                ``num_classes + 5``.
        """
        n_cls = self.num_classes
        if predictions.size(-1) != n_cls + 5:
            raise ValueError(
                f"expected last dimension {n_cls + 5}, got {predictions.size(-1)}"
            )

        batch_size = predictions.size(0)
        if batch_size == 0:
            # Keep the result a tensor tied to the graph even for an empty batch.
            return predictions.sum() * 0.0

        num_boxes = predictions.size(1)
        total_loss = predictions.new_zeros(())

        for i, pred in enumerate(predictions):
            matched = list(targets[i])[:num_boxes]
            n_matched = len(matched)

            objectness_target = pred.new_zeros(num_boxes)
            if n_matched:
                objectness_target[:n_matched] = 1.0

                cls_target = torch.tensor(
                    [int(obj[0]) for obj in matched],
                    dtype=torch.long,
                    device=pred.device,
                )
                box_target = torch.tensor(
                    [[float(v) for v in obj[1:5]] for obj in matched],
                    dtype=pred.dtype,
                    device=pred.device,
                ) / self.coord_scale

                total_loss = total_loss + self.ce_loss(
                    pred[:n_matched, :n_cls], cls_target
                )
                total_loss = total_loss + self.mse_loss(
                    pred[:n_matched, n_cls:n_cls + 4], box_target
                )

            total_loss = total_loss + self.mse_loss(
                pred[:, n_cls + 4], objectness_target
            )

        return total_loss / batch_size

# Instantiate the detection loss for the configured number of classes
criterion = DetectionLoss(n_classes)

训练检测模型

训练循环

def _collate_detection_batch(batch):
    """Stack the images of a batch and keep the per-image label lists as-is."""
    images, labels = zip(*batch)
    return torch.stack(images), list(labels)


def train_detector(model, dataset, epochs=10, batch_size=8, lr=0.001, criterion=None):
    """Train a detection model and return the mean loss of every epoch.

    Args:
        model: The ``nn.Module`` to train. Batches are moved to the device of
            its first parameter.
        dataset: Dataset yielding ``(image_tensor, labels)`` pairs, where
            ``labels`` may have a different length for every image.
        epochs: Number of passes over the dataset.
        batch_size: Number of images per batch.
        lr: Learning rate for the Adam optimizer.
        criterion: Optional callable ``criterion(outputs, labels)`` returning a
            differentiable scalar tensor. When omitted, the original
            placeholder is used: a constant zero loss that sends no gradient to
            the model, so the weights are not updated.

    Returns:
        List with one average loss value per epoch.
    """
    # The default collate function requires equally sized label lists and
    # raises on images with different object counts, so labels are kept as a
    # plain list of per-image lists instead.
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=_collate_detection_batch,
    )
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Follow the model's own device rather than relying on a global variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    n_batches = len(dataloader)
    losses = []

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for images, labels in dataloader:
            images = images.to(model_device)

            # Forward pass
            outputs = model(images)

            if criterion is None:
                # Placeholder loss: not connected to the model outputs.
                loss = torch.zeros((), device=model_device, requires_grad=True)
            else:
                loss = criterion(outputs, labels)

            # Backward pass and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        epoch_loss = running_loss / n_batches if n_batches else 0.0
        losses.append(epoch_loss)

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')

    return losses

# Run the training loop on the simulated dataset
print("训练检测模型:")
losses = train_detector(model, dataset, epochs=10, batch_size=8, lr=0.001)

# Plot the per-epoch training loss
plt.figure(figsize=(10, 6))
plt.plot(losses, 'b-', linewidth=2)
plt.gca().set(xlabel='Epoch', ylabel='损失', title='训练损失曲线')
plt.grid(True, alpha=0.3)
plt.show()

使用预训练检测模型

Faster R-CNN

# Load a Faster R-CNN with pretrained weights and switch it to inference mode.
# torchvision >= 0.13 selects weights through the `weights` argument and
# deprecates `pretrained`. Older releases are expected to reject the unknown
# `weights` keyword with a TypeError, in which case the legacy argument is used.
# NOTE(review): confirm the fallback against the oldest torchvision you support.
try:
    faster_rcnn = models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
except TypeError:
    faster_rcnn = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
faster_rcnn.eval()

# 检测函数
def detect_objects(model, image, threshold=0.5):
    """Run a detection model on one image and keep the confident detections.

    Args:
        model: Callable detection model. For a batch of images it must return a
            list of dicts with 'boxes', 'labels' and 'scores' tensors; only the
            first dict is used. The model should already be in eval mode.
        image: Either a NumPy array (converted with ``transforms.ToTensor``,
            which expects height x width x channel layout) or a tensor. A
            3-D tensor is treated as a single (C, H, W) image and gets a batch
            dimension; other inputs are passed to the model unchanged.
        threshold: Detections whose score is not strictly greater than this
            value are dropped.

    Returns:
        Tuple ``(boxes, labels, scores)`` of NumPy arrays for the kept
        detections.
    """
    if isinstance(image, np.ndarray):
        image = transforms.ToTensor()(image)

    if isinstance(image, torch.Tensor):
        # The model is called with a batch, so a single image needs a batch axis.
        if image.dim() == 3:
            image = image.unsqueeze(0)

        # Put the input on the model's device when the model exposes parameters.
        try:
            target_device = next(model.parameters()).device
        except (AttributeError, StopIteration):
            target_device = None
        if target_device is not None:
            image = image.to(target_device)

    # Inference without gradient tracking
    with torch.no_grad():
        predictions = model(image)

    # Unpack the result for the first (only) image
    first = predictions[0]
    boxes = first['boxes'].detach().cpu().numpy()
    labels = first['labels'].detach().cpu().numpy()
    scores = first['scores'].detach().cpu().numpy()

    # Drop low-confidence detections
    mask = scores > threshold
    return boxes[mask], labels[mask], scores[mask]

# 可视化检测结果
def visualize_detections(image, boxes, labels, scores, class_names=None):
    """Show an image with its detections drawn as labelled red rectangles.

    Args:
        image: Image to display. A tensor is treated as (C, H, W) and converted
            to a height x width x channel NumPy array; anything else is handed
            to ``plt.imshow`` unchanged.
        boxes: Iterable of ``[x1, y1, x2, y2]`` boxes.
        labels: Class index for every box.
        scores: Confidence score for every box.
        class_names: Optional lookup from class index to display name. Without
            it the raw index is shown.
    """
    plt.figure(figsize=(12, 8))

    if isinstance(image, torch.Tensor):
        # Detach and move to the CPU first so GPU / graph tensors convert too.
        image = image.detach().cpu().permute(1, 2, 0).numpy()

    plt.imshow(image)
    axes = plt.gca()

    for box, label, score in zip(boxes, labels, scores):
        x1, y1, x2, y2 = box
        # pyplot has no `rectangle` function; a Rectangle patch takes the
        # corner plus width and height and must be added to the axes.
        axes.add_patch(
            plt.Rectangle((x1, y1), x2 - x1, y2 - y1,
                          fill=False, edgecolor='red', linewidth=2)
        )

        class_name = f'类别{label}' if class_names is None else class_names[label]
        plt.text(x1, y1-10, f'{class_name}: {score:.2f}', 
                color='red', fontsize=12, fontweight='bold')

    plt.title('目标检测结果')
    plt.axis('off')
    plt.show()

# Simulated test image, scaled to the range [0, 1].
# The array uses height x width x channel layout because that is what
# transforms.ToTensor (used for NumPy input in detect_objects) and plt.imshow
# expect; the previous channel-first (3, 224, 224) array would be misread.
sample_image = np.random.randn(224, 224, 3).astype(np.float32)
sample_image = (sample_image - sample_image.min()) / (sample_image.max() - sample_image.min())

# Note: load a real image for actual use
print("预训练检测模型已加载")
print("使用方法：detect_objects(model, image)")

YOLO检测

YOLO原理

class YOLODetector:
    """Simplified YOLO-style detector built around a small convolutional net.

    The class is a plain Python object, not an ``nn.Module``; the trainable
    parameters live in ``self.network``. Every grid cell predicts
    ``num_boxes * 5 + num_classes`` values.

    Args:
        num_classes: Number of object classes.
    """

    def __init__(self, num_classes):
        self.num_classes = num_classes
        self.grid_size = 7  # cells per side of the prediction grid
        self.num_boxes = 2  # boxes predicted by every grid cell

        # Simplified YOLO network: three conv/pool stages (8x downsampling)
        # followed by a 1x1 convolution producing the per-cell outputs.
        self.network = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),

            # Output layer
            nn.Conv2d(128, self.num_boxes * 5 + self.num_classes, kernel_size=1),
        )

    def forward(self, x):
        """Return the raw network output of shape (batch, channels, H/8, W/8)."""
        return self.network(x)

    def predict(self, image, threshold=0.5):
        """Predict the per-cell outputs on a grid_size x grid_size grid.

        Args:
            image: NumPy array (converted with ``transforms.ToTensor``) or a
                tensor, either (C, H, W) or batched (N, C, H, W).
            threshold: Currently unused; kept so the signature stays stable.

        Returns:
            Tensor of shape
            (batch, grid_size, grid_size, num_boxes * 5 + num_classes).
        """
        if isinstance(image, np.ndarray):
            image = transforms.ToTensor()(image)

        # A single image needs a batch dimension for the convolutional net.
        if image.dim() == 3:
            image = image.unsqueeze(0)

        with torch.no_grad():
            output = self.network(image)
            # The net only downsamples by 8 (28x28 for a 224 input), so the map
            # is average-pooled to the grid instead of being force-reshaped,
            # which fails whenever the element counts differ.
            output = nn.functional.adaptive_avg_pool2d(output, self.grid_size)

        # Move channels last with a real permute; reinterpreting the NCHW
        # memory through view() would mix values of different cells.
        return output.permute(0, 2, 3, 1).contiguous()

# Instantiate the YOLO detector and report the size of its network
yolo = YOLODetector(n_classes)
print(f"YOLO网络参数: {sum(p.numel() for p in yolo.network.parameters()):,}")

评估指标

IoU计算

def compute_iou(box1, box2):
    """Return the intersection-over-union of two axis-aligned boxes.

    Both boxes are indexable as ``[x1, y1, x2, y2]``. The result is 0 when the
    union has no positive area.
    """
    # Signed extent of the overlap along each axis; negative means no overlap.
    overlap_w = min(box1[2], box2[2]) - max(box1[0], box2[0])
    overlap_h = min(box1[3], box2[3]) - max(box1[1], box2[1])
    intersection = max(0, overlap_w) * max(0, overlap_h)

    areas = [(b[2] - b[0]) * (b[3] - b[1]) for b in (box1, box2)]
    union = areas[0] + areas[1] - intersection

    if union > 0:
        return intersection / union
    return 0

# Two overlapping squares used to exercise compute_iou
box1, box2 = [50, 50, 150, 150], [100, 100, 200, 200]

iou = compute_iou(box1, box2)
print(f"IoU: {iou:.4f}")

# Draw both boxes to visualise the overlap
fig, ax = plt.subplots(figsize=(8, 8))

rect1 = plt.Rectangle(
    xy=(box1[0], box1[1]), width=box1[2] - box1[0], height=box1[3] - box1[1],
    linewidth=2, edgecolor='red', facecolor='none', label='预测框',
)
rect2 = plt.Rectangle(
    xy=(box2[0], box2[1]), width=box2[2] - box2[0], height=box2[3] - box2[1],
    linewidth=2, edgecolor='green', facecolor='none', label='真实框',
)
ax.add_patch(rect1)
ax.add_patch(rect2)

# Fixed, equally scaled axes so both squares are fully visible
ax.set_xlim(0, 250)
ax.set_ylim(0, 250)
ax.set_aspect('equal')
ax.legend()
ax.set_title(f'IoU: {iou:.4f}')
plt.grid(True, alpha=0.3)
plt.show()

mAP计算

def compute_ap(precision, recall):
    """Compute average precision (AP) as the area under the precision envelope.

    ``precision`` and ``recall`` are equally long sequences describing the
    points of a precision-recall curve.
    """
    # Pad the curve: recall runs from 0 to 1, precision is 0 at both ends.
    mrec = np.concatenate(([0.], recall, [1.]))
    mpre = np.concatenate(([0.], precision, [0.]))

    # Sweep right to left so every precision becomes the maximum of itself and
    # everything to its right.
    for i in reversed(range(len(mpre) - 1)):
        if mpre[i + 1] > mpre[i]:
            mpre[i] = mpre[i + 1]

    # Sum rectangle areas at the positions where recall changes.
    change_points = np.flatnonzero(mrec[1:] != mrec[:-1])
    widths = mrec[change_points + 1] - mrec[change_points]
    return np.sum(widths * mpre[change_points + 1])

# Example precision-recall curve used to exercise compute_ap
precision = np.array([0.9, 0.8, 0.7, 0.6, 0.5])
recall = np.array([0.2, 0.4, 0.6, 0.8, 1.0])

ap = compute_ap(precision=precision, recall=recall)
print(f"AP: {ap:.4f}")

目标检测最佳实践

数据增强：使用随机裁剪、翻转等增强数据
锚框设计：设计合适的锚框尺寸和比例
非极大值抑制：使用NMS去除重复检测
多尺度训练：在不同尺度上训练模型
迁移学习：使用预训练模型进行微调

目标检测是计算机视觉中的核心技术，掌握目标检测对于自动驾驶、视频监控等应用至关重要。