← 返回首页
🧠

AWQ:激活感知量化

📂 llm ⏱ 4 min 628 words

---
title: "AWQ:激活感知量化"
description: "掌握AWQ的原理和实现,实现更高质量的LLM量化"
tags: ["AWQ", "激活感知量化", "模型压缩", "高效推理"]
category: "llm"
icon: "🧠"
---

AWQ:激活感知量化

AWQ简介

AWQ(Activation-aware Weight Quantization)是一种基于激活分布的权重量化方法。与GPTQ不同,AWQ观察到并非所有权重通道都同等重要,通过保护重要通道来提高量化质量。AWQ在保持高质量的同时实现了快速的量化过程。

AWQ的核心优势:通过保护重要的权重通道获得更高的量化质量;量化过程快速;支持层融合,并可与vLLM等推理框架集成,实现高效推理。

工作原理

激活感知原理

import torch
import torch.nn as nn

def analyze_activation_importance(model, calibration_data):
    """Estimate per-channel activation importance for every ``nn.Linear`` layer.

    Forward hooks are attached to each ``nn.Linear`` sub-module, the
    calibration batches are run through the model, and for every layer the
    mean absolute value of its output activations is computed per channel
    (last dimension), averaged over all positions of all batches.

    Args:
        model: Module whose ``nn.Linear`` sub-modules are profiled.
        calibration_data: Iterable of keyword-argument mappings; each one is
            passed to the model as ``model(**batch)``.

    Returns:
        Dict mapping the layer name (as reported by ``named_modules``) to a
        1-D float32 tensor holding one importance value per output channel.
        Layers that produced no activations are absent.
    """
    abs_sums = {}
    position_counts = {}

    def hook_fn(module, inputs, output, name):
        # Collapse every leading dimension so the statistic works for both
        # (batch, features) and (batch, seq, features) outputs.
        flat = output.detach().float().abs().reshape(-1, output.shape[-1])
        if flat.shape[0] == 0:
            return
        batch_sum = flat.sum(dim=0)
        # Accumulate across batches; overwriting here would make the result
        # depend on the last calibration batch only.
        if name in abs_sums:
            abs_sums[name] += batch_sum
            position_counts[name] += flat.shape[0]
        else:
            abs_sums[name] = batch_sum
            position_counts[name] = flat.shape[0]

    # Register one hook per linear layer; ``n=name`` binds the current name
    # at definition time instead of relying on the loop variable.
    hooks = []
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            hook = module.register_forward_hook(
                lambda m, i, o, n=name: hook_fn(m, i, o, n)
            )
            hooks.append(hook)

    try:
        # Forward passes only; gradients are not needed for statistics.
        with torch.no_grad():
            for batch in calibration_data:
                model(**batch)
    finally:
        # Always detach the hooks, even when a forward pass raises, so the
        # model is never left instrumented.
        for hook in hooks:
            hook.remove()

    return {
        name: total / position_counts[name]
        for name, total in abs_sums.items()
    }

AWQ量化算法

def awq_quantize(weight, scales, zeros, bits=4, group_size=128):
    """Simulate group-wise symmetric quantization of a scaled weight matrix.

    The weight is multiplied column-wise by ``scales``, split into groups of
    ``group_size`` consecutive columns, rounded onto a signed ``bits``-bit
    grid using each group's maximum magnitude, and mapped back to floating
    point.

    Args:
        weight: 2-D tensor of shape ``(rows, cols)``.
        scales: 1-D tensor with one scaling factor per column.
        zeros: Unused. The scheme implemented here is symmetric and needs no
            zero point; the parameter is kept for interface compatibility.
        bits: Bit width of the quantization grid (at least 2).
        group_size: Number of columns sharing one quantization step. A value
            ``<= 0`` means one group per row.

    Returns:
        Tensor of shape ``(rows, cols)`` holding the quantize-dequantize
        result of the *scaled* weight (the scaling is not undone).

    Raises:
        ValueError: If ``bits < 2`` or ``cols`` is not a multiple of
            ``group_size``.
    """
    rows, cols = weight.shape

    if bits < 2:
        raise ValueError(f"bits must be >= 2, got {bits}")
    if group_size <= 0:
        # Per-row quantization: the whole row forms a single group.
        group_size = max(cols, 1)
    if cols % group_size != 0:
        raise ValueError(
            f"number of columns ({cols}) must be divisible by "
            f"group_size ({group_size})"
        )

    # Apply the per-column scaling factors.
    scaled_weight = weight * scales.unsqueeze(0)

    # Group consecutive columns.
    weight_groups = scaled_weight.reshape(rows, cols // group_size, group_size)

    # Per-group quantization range.
    max_vals = weight_groups.abs().max(dim=-1, keepdim=True).values
    # An all-zero group would otherwise yield 0 / 0 = NaN; dividing by 1
    # keeps such groups exactly zero.
    safe_max = torch.where(max_vals == 0, torch.ones_like(max_vals), max_vals)
    qrange = 2 ** (bits - 1) - 1

    # Quantize onto the signed integer grid.
    quantized = torch.round(weight_groups * qrange / safe_max)
    quantized = torch.clamp(quantized, -qrange - 1, qrange)

    # Dequantize back to floating point.
    dequantized = quantized * safe_max / qrange

    return dequantized.reshape(rows, cols)

使用AWQ库

安装

pip install autoawq

基本使用

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

def quantize_model_awq(model_name, calibration_data, output_dir):
    """Quantize a causal LM to 4-bit with AutoAWQ and save it to disk.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.
        calibration_data: Sliceable sequence of raw text samples; at most the
            first 128 are used for calibration.
        output_dir: Directory that receives the quantized weights and the
            tokenizer files.

    Returns:
        The quantized ``AutoAWQForCausalLM`` model object.

    Raises:
        ValueError: If ``calibration_data`` is empty.
    """
    # Validate before the expensive model download/load.
    calibration_texts = list(calibration_data[:128])
    if not calibration_texts:
        raise ValueError("calibration_data must contain at least one text sample")

    # Load the full-precision model and its tokenizer.
    model = AutoAWQForCausalLM.from_pretrained(
        model_name,
        safetensors=True
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Quantization settings.
    quant_config = {
        "zero_point": True,
        "q_group_size": 128,
        "w_bit": 4,
        "version": "GEMM"
    }

    # Pass the raw strings: AutoAWQ tokenizes calibration text itself using
    # the tokenizer given to ``quantize``. A list of 2-D ``input_ids``
    # tensors is not one of the calibration formats it accepts.
    # NOTE(review): per-sample length handling is now governed by AutoAWQ's
    # own calibration settings rather than an explicit 512-token truncation —
    # confirm against the installed AutoAWQ version.
    model.quantize(
        tokenizer,
        quant_config=quant_config,
        calib_data=calibration_texts
    )

    # Persist the quantized weights together with the tokenizer.
    model.save_quantized(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model

# Usage
# Placeholder samples: replace them with real, varied calibration texts
# (quantize_model_awq uses at most the first 128 entries). The list must
# contain only strings — a literal ``...`` here would insert an Ellipsis
# object into the calibration data.
calibration_data = ["示例文本1", "示例文本2"]
quantized_model = quantize_model_awq(
    "meta-llama/Llama-2-7b-hf",
    calibration_data,
    "./llama2-7b-awq-4bit"
)

加载量化模型

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

def load_awq_model(model_path):
    """Load an AWQ-quantized model and its tokenizer from ``model_path``.

    Args:
        model_path: Directory (or identifier) holding the quantized
            checkpoint and the tokenizer files.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # fuse_layers=True asks AutoAWQ for fused layers, intended to speed up
    # inference; weights are read from safetensors files.
    load_options = {"fuse_layers": True, "safetensors": True}
    quantized = AutoAWQForCausalLM.from_quantized(model_path, **load_options)
    tok = AutoTokenizer.from_pretrained(model_path)
    return quantized, tok

# Usage
model, tokenizer = load_awq_model("./llama2-7b-awq-4bit")

# Inference
# The tokenizer returns CPU tensors; move them to the GPU so they sit on the
# same device as the quantized model before calling generate().
# NOTE(review): assumes the model was loaded onto the default CUDA device.
inputs = tokenizer("Hello!", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))

量化配置

配置参数详解

# Named AutoAWQ quantization presets. Keys of the inner dicts follow the
# ``quant_config`` format: weight bit width, quantization group size,
# whether a zero point is used, and the kernel version.
quant_configs = {
    # Baseline: 4-bit weights, groups of 128 columns.
    "标准配置": {
        "w_bit": 4,
        "q_group_size": 128,
        "zero_point": True,
        "version": "GEMM"
    },
    # Smaller groups: more scales are stored, intended to improve accuracy.
    "高质量配置": {
        "w_bit": 4,
        "q_group_size": 64,
        "zero_point": True,
        "version": "GEMM"
    },
    # Marlin kernel. Marlin packing is symmetric-only, so the zero point
    # must be disabled for this version.
    "快速推理配置": {
        "w_bit": 4,
        "q_group_size": 128,
        "zero_point": False,
        "version": "Marlin"
    },
    # NOTE(review): 3-bit weights may not be supported by the installed
    # AutoAWQ kernels — confirm before using this preset.
    "3bit配置": {
        "w_bit": 3,
        "q_group_size": 128,
        "zero_point": True,
        "version": "GEMM"
    }
}

自动配置选择

def select_awq_config(model_size, target_memory_gb, priority="quality"):
    """Pick an AWQ quantization config for the given optimization priority.

    Args:
        model_size: Model size in billions of parameters; multiplied by 2 to
            estimate the FP16 footprint in GB.
        target_memory_gb: Desired memory budget in GB. Only consulted when
            ``priority == "compression"``.
        priority: One of ``"quality"``, ``"speed"`` or ``"compression"``.

    Returns:
        Dict with the keys ``w_bit``, ``q_group_size``, ``zero_point`` and
        ``version``.

    Raises:
        ValueError: If ``priority`` is not recognised, or if
            ``target_memory_gb`` is not positive for ``"compression"``.
    """
    if priority == "quality":
        # Smaller groups for better accuracy.
        return {
            "w_bit": 4,
            "q_group_size": 64,
            "zero_point": True,
            "version": "GEMM"
        }

    if priority == "speed":
        # Marlin packing is symmetric-only, so no zero point is used.
        return {
            "w_bit": 4,
            "q_group_size": 128,
            "zero_point": False,
            "version": "Marlin"
        }

    if priority == "compression":
        if target_memory_gb <= 0:
            raise ValueError(
                f"target_memory_gb must be positive, got {target_memory_gb}"
            )
        original_size_gb = model_size * 2  # FP16: 2 bytes per parameter
        # Drop to 3-bit when the required compression ratio exceeds 6x.
        # NOTE(review): 3-bit may not be supported by the installed AutoAWQ
        # kernels — confirm before relying on this branch.
        if original_size_gb / target_memory_gb > 6:
            return {
                "w_bit": 3,
                "q_group_size": 64,
                "zero_point": True,
                "version": "GEMM"
            }
        return {
            "w_bit": 4,
            "q_group_size": 128,
            "zero_point": True,
            "version": "GEMM"
        }

    # Previously an unknown priority fell through and returned None.
    raise ValueError(
        f"unknown priority {priority!r}; expected 'quality', 'speed' or 'compression'"
    )

性能对比

与GPTQ对比

def compare_awq_gptq(model_name, calibration_data, eval_dataset):
    """Quantize one model with AWQ and with GPTQ and collect comparable metrics.

    AWQ is processed first, then GPTQ. For each method the model is
    quantized into its own output directory, evaluated on ``eval_dataset``,
    sized on disk and benchmarked.

    NOTE(review): ``evaluate_model``, ``get_model_size``,
    ``benchmark_inference`` and ``quantize_model_gptq`` are not defined in
    the visible source — they must be supplied elsewhere.

    Returns:
        ``{"AWQ": {...}, "GPTQ": {...}}`` where each inner dict has the keys
        ``quality``, ``size`` and ``speed``.
    """

    def _profile(quantize_fn, save_dir):
        # Quantize, then measure quality, on-disk size and inference speed.
        quantized = quantize_fn(model_name, calibration_data, save_dir)
        quality = evaluate_model(quantized, eval_dataset)
        size = get_model_size(save_dir)
        speed = benchmark_inference(quantized)
        return {"quality": quality, "size": size, "speed": speed}

    awq_report = _profile(quantize_model_awq, "./awq_model")
    gptq_report = _profile(quantize_model_gptq, "./gptq_model")

    return {"AWQ": awq_report, "GPTQ": gptq_report}

内存和速度基准

def benchmark_awq(model_path):
    """Measure GPU memory, latency and throughput of an AWQ-quantized model.

    Loads the model via ``load_awq_model``, runs 5 warm-up generations and
    then times 20 generations of up to 100 new tokens each for a fixed
    prompt.

    Args:
        model_path: Location of the quantized checkpoint.

    Returns:
        Dict with ``memory_gb`` (CUDA memory allocated after loading),
        ``latency_ms`` (mean wall time per generation) and
        ``throughput_tokens_per_sec`` (generated tokens per second).
    """
    import time

    model, tokenizer = load_awq_model(model_path)

    # Memory currently allocated by tensors on the CUDA device, in GiB.
    memory_usage = torch.cuda.memory_allocated() / 1024**3

    # The tokenizer yields CPU tensors; move them to the GPU for generate().
    # NOTE(review): assumes the model was loaded onto the default CUDA device.
    prompt = "What is machine learning?"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    prompt_length = inputs["input_ids"].shape[-1]

    # Warm-up so one-off initialisation cost is excluded from the timing.
    for _ in range(5):
        model.generate(**inputs, max_new_tokens=50)

    runs = 20
    generated_tokens = 0

    # CUDA kernels launch asynchronously: synchronise before starting and
    # before stopping the clock so the measurement covers the actual work.
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(runs):
        output_ids = model.generate(**inputs, max_new_tokens=100)
        # Generation may stop early at EOS, so count what was really produced
        # instead of assuming 100 tokens per run.
        generated_tokens += output_ids.shape[-1] - prompt_length
    torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    avg_time = elapsed / runs
    throughput = generated_tokens / elapsed  # tokens/second

    return {
        "memory_gb": memory_usage,
        "latency_ms": avg_time * 1000,
        "throughput_tokens_per_sec": throughput
    }

与vLLM集成

from vllm import LLM, SamplingParams

# Serve the AWQ-quantized checkpoint through vLLM.
engine_options = {
    "model": "./llama2-7b-awq-4bit",
    "quantization": "awq",
    "max_model_len": 4096,
    "gpu_memory_utilization": 0.9,
    "enforce_eager": False,  # leave CUDA Graph capture enabled
}
llm = LLM(**engine_options)

# Inference: one completion per prompt.
prompts = ["什么是深度学习?", "解释机器学习"]
sampling_params = SamplingParams(temperature=0.7, max_tokens=256)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    first_completion = output.outputs[0]
    print(first_completion.text)

融合层优化

def fuse_awq_layers(model, model_path=None):
    """Return an AWQ model loaded with layer fusion enabled.

    Fusion is requested at load time via ``fuse_layers=True``, so the
    checkpoint is reloaded from ``model_path`` rather than modifying the
    ``model`` argument.

    Args:
        model: Unused; kept for backward compatibility with existing callers.
        model_path: Location of the AWQ-quantized checkpoint to reload.

    Returns:
        The model returned by ``AutoAWQForCausalLM.from_quantized``.

    Raises:
        ValueError: If ``model_path`` is not provided.
    """
    # The function previously referenced an undefined ``model_path`` name and
    # failed with NameError; the path is now an explicit argument.
    if model_path is None:
        raise ValueError(
            "model_path is required: fused layers are enabled by reloading "
            "the quantized checkpoint with fuse_layers=True"
        )

    return AutoAWQForCausalLM.from_quantized(
        model_path,
        fuse_layers=True
    )

常见问题

量化失败

# Troubleshooting table: failure cause -> suggested remedy.
solutions = dict(
    [
        ("校准数据问题", "确保校准数据质量和数量"),
        ("内存不足", "使用CPU卸载或减少batch_size"),
        ("模型兼容性", "检查模型是否支持AWQ量化"),
    ]
)

质量下降

# Remedy for quality loss after quantization.
def improve_awq_quality(model, calibration_data):
    """Return a quantization config aimed at higher AWQ quality.

    Args:
        model: Not used by this function.
        calibration_data: Sliceable sequence of calibration samples. It is
            sliced to its first 256 entries locally; the slice is not
            returned and the caller's object is left untouched.

    Returns:
        Dict with ``w_bit=4``, ``q_group_size=64`` and ``zero_point=True``.
    """
    # Step 1: cap the calibration set at 256 samples (local effect only).
    calibration_data = calibration_data[0:256]

    # Step 2: a smaller group size than the usual 128.
    # Step 3 (not automated here): use diverse, high-quality calibration text.
    return dict(w_bit=4, q_group_size=64, zero_point=True)

AWQ通过激活感知机制,在保持高质量的同时实现了高效的LLM量化,是部署大模型的重要工具。