← 返回首页
🤖

计算机视觉详解

📂 ai ⏱ 4 min 751 words

计算机视觉详解

计算机视觉是让计算机理解和解释视觉信息的技术,包括图像分类、目标检测、语义分割等任务。

计算机视觉基础

图像表示

import numpy as np
import matplotlib.pyplot as plt
import cv2
from PIL import Image
import torch
import torchvision.transforms as transforms
from torchvision import models
import warnings
warnings.filterwarnings('ignore')

# Build a synthetic test image.
def create_sample_image(height=100, width=100):
    """Return a black RGB canvas with red, green and blue bars stacked vertically.

    Args:
        height: Number of rows in the canvas.
        width: Number of columns in the canvas.

    Returns:
        A ``(height, width, 3)`` ``uint8`` array. Rows 20-39 are red,
        rows 40-59 green and rows 60-79 blue, each across columns 20-79.
        For smaller canvases the bars are clipped by normal slice semantics.
    """
    canvas = np.zeros((height, width, 3), dtype=np.uint8)

    # (first row, RGB value) for each 20-row band.
    bands = (
        (20, (255, 0, 0)),
        (40, (0, 255, 0)),
        (60, (0, 0, 255)),
    )
    for top, rgb in bands:
        canvas[top:top + 20, 20:80] = rgb

    return canvas

# Build the sample image and its grayscale counterpart.
image = create_sample_image()
gray_image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

# Draw both versions side by side; cmap=None is imshow's default for the RGB panel.
plt.figure(figsize=(10, 5))
panels = (
    (image, '彩色图像', None),
    (gray_image, '灰度图像', 'gray'),
)
for position, (pixels, caption, colormap) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(pixels, cmap=colormap)
    plt.title(caption)
    plt.axis('off')

plt.tight_layout()
plt.show()

# Report array shapes: the color image keeps a channel axis, the gray one does not.
print(f"彩色图像形状: {image.shape}")
print(f"灰度图像形状: {gray_image.shape}")

图像预处理

基本预处理

class ImagePreprocessor:
    """Turn an image into a normalized, batched tensor for a 224x224 model input."""

    def __init__(self):
        # Resize -> tensor in [0, 1] -> per-channel normalization.
        # The mean/std values are the commonly used ImageNet statistics and
        # assume exactly three (RGB) channels.
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

    def preprocess(self, image):
        """Preprocess a single image.

        Args:
            image: A numpy array accepted by ``Image.fromarray`` or a PIL image.

        Returns:
            The transformed image with a leading batch axis of size 1
            (shape ``(1, 3, 224, 224)`` for PIL/ndarray input).
        """
        # numpy arrays are wrapped as PIL images so the transforms apply.
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)

        # Normalize() is configured for three channels; grayscale ('L') or
        # RGBA input would otherwise fail, so force RGB first.
        if isinstance(image, Image.Image) and image.mode != 'RGB':
            image = image.convert('RGB')

        return self.transform(image).unsqueeze(0)

# Create the preprocessor.
preprocessor = ImagePreprocessor()

# Preprocess the sample image and report the resulting tensor shape.
processed_image = preprocessor.preprocess(image)
print(f"预处理后形状: {processed_image.shape}")

# Random data augmentation. There is no ToTensor step in this pipeline,
# so a PIL image goes in and a PIL image comes out.
data_augmentation = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(10),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
])

# Draw four independent random augmentations of the sample image.
augmented_images = [data_augmentation(Image.fromarray(image)) for _ in range(4)]

# Show the original followed by the augmented variants.
fig, axes = plt.subplots(1, 5, figsize=(15, 3))
axes[0].imshow(image)
axes[0].set_title('原始图像')
axes[0].axis('off')

for i, aug_img in enumerate(augmented_images):
    # aug_img is a PIL image (not a tensor), so it is passed to imshow
    # directly; calling .permute() on it would raise AttributeError.
    axes[i+1].imshow(aug_img)
    axes[i+1].set_title(f'增强 {i+1}')
    axes[i+1].axis('off')

plt.tight_layout()
plt.show()

图像分类

使用预训练模型

# Load a ResNet-18 with pretrained weights and switch it to inference mode.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favor of
# `weights=` — confirm against the installed version before changing it.
resnet = models.resnet18(pretrained=True)
resnet.eval()

# Image classification helper.
def classify_image(image, model, top_k=5):
    """Classify one image and return its most probable classes.

    Args:
        image: Image accepted by ``ImagePreprocessor.preprocess``.
        model: Callable model that maps the preprocessed batch to class scores.
        top_k: How many of the highest-probability classes to return.

    Returns:
        A ``(probabilities, class_indices)`` pair of numpy arrays for the
        first (only) item in the batch, ordered as ``torch.topk`` returns them.
    """
    batch = ImagePreprocessor().preprocess(image)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(batch)
        class_probs = torch.softmax(logits, dim=1)
        best_probs, best_indices = class_probs.topk(top_k)

    return best_probs[0].numpy(), best_indices[0].numpy()

# Classify the sample image with the pretrained model.
probs, indices = classify_image(image, resnet)

# Bar chart of the returned probabilities, one bar per predicted class index.
bar_positions = range(len(probs))
tick_labels = [f'类别 {idx}' for idx in indices]

plt.figure(figsize=(10, 6))
plt.bar(bar_positions, probs, color='skyblue', edgecolor='black')
plt.xlabel('类别索引')
plt.ylabel('概率')
plt.title('图像分类结果')
plt.xticks(bar_positions, tick_labels)
plt.grid(True, alpha=0.3)
plt.show()

目标检测

简单目标检测

class SimpleObjectDetector:
    """Small wrapper around torchvision's pretrained Faster R-CNN detector."""

    def __init__(self):
        # Pretrained Faster R-CNN (ResNet-50 FPN), switched to inference mode.
        self.model = models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
        self.model.eval()

        # COCO category names, looked up by the integer label of a detection.
        # 'N/A' entries are placeholders for label ids without a category name.
        self.coco_names = [
            '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
            'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
            'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
            'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
            'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
            'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
            'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
            'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
            'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
            'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
            'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
            'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
        ]

    def detect(self, image):
        """Run the detector on one image.

        Args:
            image: Input accepted by ``transforms.ToTensor``.

        Returns:
            A ``(boxes, labels, scores)`` tuple of numpy arrays taken from
            the model's prediction for the single image in the batch.
        """
        # Convert to a tensor and add a batch axis.
        transform = transforms.ToTensor()
        img_tensor = transform(image).unsqueeze(0)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            predictions = self.model(img_tensor)

        # Unpack the prediction dict of the first (only) image.
        boxes = predictions[0]['boxes'].numpy()
        labels = predictions[0]['labels'].numpy()
        scores = predictions[0]['scores'].numpy()

        return boxes, labels, scores

    def visualize(self, image, boxes, labels, scores, threshold=0.5):
        """Draw detections whose score exceeds ``threshold`` on top of ``image``.

        Args:
            image: Image to display with ``plt.imshow``.
            boxes: Iterable of ``(x1, y1, x2, y2)`` corner coordinates.
            labels: Integer class labels, one per box.
            scores: Confidence scores, one per box.
            threshold: Detections with ``score <= threshold`` are skipped.
        """
        plt.figure(figsize=(12, 8))
        plt.imshow(image)
        ax = plt.gca()

        for box, label, score in zip(boxes, labels, scores):
            if score > threshold:
                # pyplot has no `rectangle` function; add an unfilled
                # Rectangle patch (anchor corner + width/height) instead.
                x1, y1, x2, y2 = box
                ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                           fill=False, edgecolor='red', linewidth=2))

                # Label text; fall back to the raw id for unknown labels.
                class_name = self.coco_names[label] if label < len(self.coco_names) else f'类别{label}'
                plt.text(x1, y1-10, f'{class_name}: {score:.2f}',
                         color='red', fontsize=12, fontweight='bold')

        plt.title('目标检测结果')
        plt.axis('off')
        plt.show()

# Instantiate the detector.
detector = SimpleObjectDetector()

# Detection is not run here on the synthetic sample image.
# Note: load a real image for actual use.
print("目标检测器已创建")
print("使用方法:detector.detect(image)")

语义分割

简单语义分割

class SimpleSegmentation:
    """Small wrapper around torchvision's pretrained DeepLabV3 segmentation model."""

    def __init__(self):
        # Pretrained DeepLabV3 (ResNet-50 backbone), put into inference mode.
        network = models.segmentation.deeplabv3_resnet50(pretrained=True)
        network.eval()
        self.model = network

        # Pascal VOC style color palette: entry i is the RGB color painted
        # for class index i in visualize().
        self.voc_colors = [
            [0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0],
            [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128],
            [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0],
            [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128],
            [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0],
            [0, 64, 128], [128, 64, 128], [0, 192, 128], [128, 192, 128],
            [64, 64, 0], [192, 64, 0], [64, 192, 0], [192, 192, 0],
        ]

    def segment(self, image):
        """Predict a class index for every pixel of ``image``.

        Args:
            image: Input accepted by ``transforms.ToTensor``.

        Returns:
            A numpy array holding the argmax class index over the first axis
            of the squeezed model output.
        """
        # Tensor conversion followed by per-channel normalization.
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        to_model_input = transforms.Compose([transforms.ToTensor(), normalize])
        batch = to_model_input(image).unsqueeze(0)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            scores = self.model(batch)['out']
            class_map = scores.squeeze().argmax(dim=0)

        return class_map.numpy()

    def visualize(self, image, segmentation_mask):
        """Show ``image`` next to a color-coded rendering of ``segmentation_mask``.

        Args:
            image: Original image, displayed as-is.
            segmentation_mask: Array of class indices; indices without a
                palette entry stay black.
        """
        # Paint each class index with its palette color.
        colored = np.zeros(tuple(segmentation_mask.shape) + (3,), dtype=np.uint8)
        for class_id, rgb in enumerate(self.voc_colors):
            colored[segmentation_mask == class_id] = rgb

        # Left panel: input image. Right panel: colored mask.
        _, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(12, 5))
        panels = (
            (left_ax, image, '原始图像'),
            (right_ax, colored, '语义分割结果'),
        )
        for ax, picture, caption in panels:
            ax.imshow(picture)
            ax.set_title(caption)
            ax.axis('off')

        plt.tight_layout()
        plt.show()

# Instantiate the segmenter.
segmentor = SimpleSegmentation()

print("语义分割器已创建")
print("使用方法:segmentor.segment(image)")

实际应用

图像增强管道

class ImageAugmentationPipeline:
    """Named collection of torchvision augmentations, each applied independently."""

    def __init__(self):
        # Augmentation name -> transform. Each transform is applied on its own
        # to the input image, not chained with the others.
        self.augmentations = {
            'horizontal_flip': transforms.RandomHorizontalFlip(p=0.5),
            'vertical_flip': transforms.RandomVerticalFlip(p=0.5),
            'rotation': transforms.RandomRotation(10),
            'color_jitter': transforms.ColorJitter(brightness=0.2, contrast=0.2),
            'random_crop': transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
            'gaussian_blur': transforms.GaussianBlur(kernel_size=3),
        }

    def augment(self, image, augmentation_list=None):
        """Apply the selected augmentations to ``image``.

        Args:
            image: A numpy array accepted by ``Image.fromarray`` or a PIL image.
            augmentation_list: Names of augmentations to apply, in order.
                ``None`` selects every registered augmentation. Names that
                are not registered are skipped.

        Returns:
            A list of ``(name, augmented_image)`` tuples.
        """
        if augmentation_list is None:
            augmentation_list = list(self.augmentations.keys())

        # Convert once instead of once per augmentation; PIL input is used as-is.
        source = Image.fromarray(image) if isinstance(image, np.ndarray) else image

        return [
            (aug_name, self.augmentations[aug_name](source))
            for aug_name in augmentation_list
            if aug_name in self.augmentations
        ]

    def visualize_augmentations(self, image, augmentation_list=None):
        """Plot ``image`` followed by each selected augmentation in a single row.

        Args:
            image: Image passed to ``augment`` and shown in the first panel.
            augmentation_list: Forwarded to ``augment``.
        """
        augmented = self.augment(image, augmentation_list)

        n_images = len(augmented) + 1
        # squeeze=False always yields a 2-D axes array, so indexing also works
        # when only the original image is shown (n_images == 1).
        fig, axes = plt.subplots(1, n_images, figsize=(4*n_images, 4), squeeze=False)
        axes = axes[0]

        # Original image.
        axes[0].imshow(image)
        axes[0].set_title('原始图像')
        axes[0].axis('off')

        # Augmented images, titled with the augmentation name.
        for ax, (name, aug_img) in zip(axes[1:], augmented):
            ax.imshow(aug_img)
            ax.set_title(name)
            ax.axis('off')

        plt.tight_layout()
        plt.show()

# Build the augmentation pipeline and plot every augmentation of the sample image.
pipeline = ImageAugmentationPipeline()
pipeline.visualize_augmentations(image)

计算机视觉最佳实践

  1. 数据预处理:标准化图像,应用数据增强
  2. 模型选择:根据任务选择合适的预训练模型
  3. 迁移学习:使用预训练模型进行微调
  4. 评估指标:使用合适的评估指标(准确率、mAP、IoU等)
  5. 部署优化:采用模型量化、剪枝等技术优化模型

计算机视觉是人工智能的重要分支,掌握计算机视觉技术对于图像处理、目标检测等应用至关重要。