AWQ:激活感知量化
---
title: "AWQ:激活感知量化"
description: "掌握AWQ的原理和实现,实现更高质量的LLM量化"
tags: ["AWQ", "激活感知量化", "模型压缩", "高效推理"]
category: "llm"
icon: "🧠"
---
AWQ:激活感知量化
AWQ简介
AWQ(Activation-aware Weight Quantization)是一种基于激活分布的权重量化方法。与GPTQ不同,AWQ观察到并非所有权重通道都同等重要:对应激活幅值较大的少数通道对输出影响最大。AWQ根据校准数据的激活统计对这些重要通道做逐通道缩放来保护它们,从而提高量化质量。AWQ在保持高质量的同时实现了快速的量化过程。
AWQ的核心优势:
- 高质量:通过保护重要权重通道保持模型质量
- 快速量化:无需反向传播,量化速度快
- 通用性强:适用于各种模型架构
- 高效推理:量化后的模型推理速度快
工作原理
激活感知原理
import torch
import torch.nn as nn
def analyze_activation_importance(model, calibration_data):
    """Estimate per-channel activation importance for every ``nn.Linear`` layer.

    A forward hook is attached to each linear layer. While the calibration
    batches run, the mean absolute value of every output channel is
    accumulated over all positions of all batches (not just the last batch).

    Args:
        model: Module whose ``nn.Linear`` sub-modules are profiled. It is
            invoked as ``model(**batch)``.
        calibration_data: Iterable of keyword-argument mappings, one per batch.

    Returns:
        Dict mapping each layer name (as produced by ``named_modules``) to a
        1-D float32 tensor holding the mean ``|activation|`` per channel of
        the layer output. Layers that never executed are absent.

    Note:
        NOTE(review): this records layer *outputs*; the AWQ method ranks
        weight channels by *input* activation magnitude. Confirm which
        statistic downstream code expects before relying on these values.
    """
    abs_sums = {}
    row_counts = {}

    def make_hook(layer_name):
        def hook(module, inputs, output):
            # Collapse all leading dimensions so 2-D and 3-D activations are
            # both reduced to exactly one value per channel. Accumulate in
            # float32 so half-precision models do not overflow the sum.
            flat = output.detach().abs().float().reshape(-1, output.shape[-1])
            total = flat.sum(dim=0)
            if layer_name in abs_sums:
                abs_sums[layer_name] += total
                row_counts[layer_name] += flat.shape[0]
            else:
                abs_sums[layer_name] = total
                row_counts[layer_name] = flat.shape[0]

        return hook

    hooks = [
        module.register_forward_hook(make_hook(name))
        for name, module in model.named_modules()
        if isinstance(module, nn.Linear)
    ]

    try:
        with torch.no_grad():
            for batch in calibration_data:
                model(**batch)
    finally:
        # Always detach the hooks, even when a forward pass raises, so the
        # model is never left instrumented.
        for handle in hooks:
            handle.remove()

    return {
        name: abs_sums[name] / max(row_counts[name], 1)
        for name in abs_sums
    }
AWQ量化算法
def awq_quantize(weight, scales, zeros, bits=4, group_size=128):
    """Simulate AWQ group-wise symmetric quantization of a 2-D weight matrix.

    The weight is multiplied column-wise by ``scales``, split into groups of
    ``group_size`` consecutive columns, rounded to a signed ``bits``-bit grid
    using each group's absolute maximum, and immediately de-quantized.

    Args:
        weight: 2-D tensor of shape ``(rows, cols)``; ``cols`` must be a
            multiple of ``group_size`` or the reshape fails.
        scales: Tensor broadcast along the rows via ``unsqueeze(0)``.
        zeros: Unused; kept so existing call sites keep working.
        bits: Bit width of the signed integer grid.
        group_size: Number of consecutive columns sharing one scale.

    Returns:
        Tensor of shape ``(rows, cols)`` with the de-quantized values. The
        result stays in the scaled domain: it is not divided by ``scales``.
    """
    rows, cols = weight.shape

    # Apply the per-channel scaling factors.
    scaled_weight = weight * scales.unsqueeze(0)

    # Group-wise quantization.
    weight_groups = scaled_weight.reshape(rows, cols // group_size, group_size)

    max_vals = weight_groups.abs().max(dim=-1, keepdim=True).values
    # An all-zero group has a maximum of 0; dividing by it would turn the
    # whole group into NaN. Any positive stand-in keeps such groups at 0.
    safe_max = torch.where(max_vals > 0, max_vals, torch.ones_like(max_vals))
    qrange = 2 ** (bits - 1) - 1

    quantized = torch.round(weight_groups * qrange / safe_max)
    quantized = torch.clamp(quantized, -qrange - 1, qrange)

    dequantized = quantized * safe_max / qrange
    return dequantized.reshape(rows, cols)
使用AWQ库
安装
pip install autoawq
基本使用
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
def quantize_model_awq(model_name, calibration_data, output_dir):
    """Quantize a causal LM to 4-bit AWQ and save it next to its tokenizer.

    Args:
        model_name: Model identifier or path handed to ``from_pretrained``.
        calibration_data: Sliceable sequence of raw text samples; at most the
            first 128 are used.
        output_dir: Directory that receives the quantized weights and the
            tokenizer files.

    Returns:
        The quantized ``AutoAWQForCausalLM`` model.

    Raises:
        ValueError: If ``calibration_data`` contains no samples.
    """
    # Validate before loading the model so a bad call fails fast.
    calibration_texts = list(calibration_data[:128])
    if not calibration_texts:
        raise ValueError("calibration_data must contain at least one text sample")

    # Load the model.
    model = AutoAWQForCausalLM.from_pretrained(
        model_name,
        safetensors=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Quantization settings.
    quant_config = {
        "zero_point": True,
        "q_group_size": 128,
        "w_bit": 4,
        "version": "GEMM",
    }

    # AutoAWQ tokenizes calibration text itself: ``calib_data`` takes a
    # dataset name or a list of raw strings, so the texts are passed as-is
    # instead of pre-tokenized ``input_ids`` tensors.
    model.quantize(
        tokenizer,
        quant_config=quant_config,
        calib_data=calibration_texts,
    )

    # Persist the result.
    model.save_quantized(output_dir)
    tokenizer.save_pretrained(output_dir)
    return model
# Usage: extend this list with real, representative calibration texts.
# (The former ``...`` placeholder was an Ellipsis object, not a string, and
# would be passed on as a calibration sample.)
calibration_data = ["示例文本1", "示例文本2"]
quantized_model = quantize_model_awq(
    "meta-llama/Llama-2-7b-hf",
    calibration_data,
    "./llama2-7b-awq-4bit",
)
加载量化模型
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
def load_awq_model(model_path):
    """Load an AWQ-quantized checkpoint together with its tokenizer.

    Args:
        model_path: Directory (or identifier) of the quantized model; also
            used to load the tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Layer fusion is requested at load time to speed up inference.
    load_options = {"fuse_layers": True, "safetensors": True}
    quantized = AutoAWQForCausalLM.from_quantized(model_path, **load_options)
    return quantized, AutoTokenizer.from_pretrained(model_path)
# Usage
model, tokenizer = load_awq_model("./llama2-7b-awq-4bit")
# Inference: the prompt tensors must live on the same device as the model
# weights, otherwise generate() fails with a device mismatch.
device = next(model.parameters()).device
inputs = tokenizer("Hello!", return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))
量化配置
配置参数详解
# Named AWQ quantization presets, keyed by their (Chinese) display name.
quant_configs = {
    # Standard preset: 4-bit weights, groups of 128, GEMM kernel.
    "标准配置": {
        "w_bit": 4,
        "q_group_size": 128,
        "zero_point": True,
        "version": "GEMM",
    },
    # Higher-quality preset: smaller groups (64).
    "高质量配置": {
        "w_bit": 4,
        "q_group_size": 64,
        "zero_point": True,
        "version": "GEMM",
    },
    # Fast-inference preset using the Marlin kernel. Marlin is a symmetric
    # kernel without zero points, so zero-point quantization is disabled.
    "快速推理配置": {
        "w_bit": 4,
        "q_group_size": 128,
        "zero_point": False,
        "version": "Marlin",
    },
    # 3-bit preset.
    # NOTE(review): AutoAWQ's packed kernels appear to support 4-bit only;
    # confirm 3-bit support in the target library version before using this.
    "3bit配置": {
        "w_bit": 3,
        "q_group_size": 128,
        "zero_point": True,
        "version": "GEMM",
    },
}
自动配置选择
def select_awq_config(model_size, target_memory_gb, priority="quality"):
    """Pick an AWQ quantization config for the given priority.

    Args:
        model_size: Model size; doubled to estimate the FP16 footprint in GB
            (presumably the parameter count in billions — confirm with
            callers).
        target_memory_gb: Memory budget in GB. Only consulted when
            ``priority == "compression"``.
        priority: One of ``"quality"``, ``"speed"`` or ``"compression"``.

    Returns:
        A dict of AutoAWQ ``quant_config`` entries.

    Raises:
        ValueError: If ``priority`` is not recognised, or if
            ``target_memory_gb`` is not positive for the compression
            priority.
    """
    original_size_gb = model_size * 2  # FP16: two bytes per parameter

    if priority == "quality":
        # Favour quality: smaller groups.
        return {
            "w_bit": 4,
            "q_group_size": 64,
            "zero_point": True,
        }

    if priority == "speed":
        # Favour speed: Marlin kernel. Marlin is symmetric (no zero points),
        # so zero-point quantization has to be off.
        return {
            "w_bit": 4,
            "q_group_size": 128,
            "zero_point": False,
            "version": "Marlin",
        }

    if priority == "compression":
        if target_memory_gb <= 0:
            raise ValueError("target_memory_gb must be positive")
        # Favour compression: drop to 3-bit only for very tight budgets.
        # NOTE(review): the 6x threshold is kept from the original; 4-bit
        # already gives roughly 4x over FP16, so this cut-off may deserve a
        # second look. 3-bit support also depends on the kernels in use.
        if original_size_gb / target_memory_gb > 6:
            return {"w_bit": 3, "q_group_size": 64}
        return {"w_bit": 4, "q_group_size": 128}

    # Previously an unknown priority silently returned None.
    raise ValueError(
        f"unknown priority {priority!r}; "
        "expected 'quality', 'speed' or 'compression'"
    )
性能对比
与GPTQ对比
def compare_awq_gptq(model_name, calibration_data, eval_dataset):
    """Quantize one model with AWQ and with GPTQ and report both side by side.

    Args:
        model_name: Model identifier passed to both quantization helpers.
        calibration_data: Calibration samples passed to both helpers.
        eval_dataset: Dataset handed to ``evaluate_model``.

    Returns:
        ``{"AWQ": {...}, "GPTQ": {...}}`` where each inner dict has the keys
        ``"quality"``, ``"size"`` and ``"speed"``.
    """

    def _profile(candidate, saved_dir):
        # Dict entries are evaluated in order: quality, then size, then speed.
        return {
            "quality": evaluate_model(candidate, eval_dataset),
            "size": get_model_size(saved_dir),
            "speed": benchmark_inference(candidate),
        }

    # AWQ run.
    awq_model = quantize_model_awq(model_name, calibration_data, "./awq_model")
    awq_report = _profile(awq_model, "./awq_model")

    # GPTQ run.
    gptq_model = quantize_model_gptq(model_name, calibration_data, "./gptq_model")
    gptq_report = _profile(gptq_model, "./gptq_model")

    return {"AWQ": awq_report, "GPTQ": gptq_report}
内存和速度基准
def benchmark_awq(model_path):
    """Benchmark memory use, latency and throughput of an AWQ model.

    Args:
        model_path: Location of the quantized model, forwarded to
            ``load_awq_model``.

    Returns:
        Dict with ``"memory_gb"`` (CUDA memory allocated after loading),
        ``"latency_ms"`` (mean wall time of one timed ``generate`` call) and
        ``"throughput_tokens_per_sec"`` (new tokens actually produced per
        second over the timed runs).
    """
    import time

    warmup_runs, timed_runs = 5, 20

    model, tokenizer = load_awq_model(model_path)

    # Memory footprint right after loading.
    memory_usage = torch.cuda.memory_allocated() / 1024**3

    prompt = "What is machine learning?"
    # The prompt tensors must sit on the same device as the model weights.
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[-1]

    # Warm-up.
    for _ in range(warmup_runs):
        model.generate(**inputs, max_new_tokens=50)

    # Make sure no queued GPU work leaks into, or out of, the timed region.
    torch.cuda.synchronize()
    generated_tokens = 0
    start = time.perf_counter()
    for _ in range(timed_runs):
        output_ids = model.generate(**inputs, max_new_tokens=100)
        # Generation may stop early at EOS, so count what was really produced
        # instead of assuming 100 new tokens per run.
        generated_tokens += output_ids.shape[-1] - prompt_len
    torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    avg_time = elapsed / timed_runs
    throughput = generated_tokens / elapsed  # tokens/second

    return {
        "memory_gb": memory_usage,
        "latency_ms": avg_time * 1000,
        "throughput_tokens_per_sec": throughput,
    }
与vLLM集成
from vllm import LLM, SamplingParams
# Serve the AWQ-quantized checkpoint through vLLM.
engine_options = {
    "model": "./llama2-7b-awq-4bit",
    "quantization": "awq",
    "max_model_len": 4096,
    "gpu_memory_utilization": 0.9,
    "enforce_eager": False,  # keep CUDA Graph enabled
}
llm = LLM(**engine_options)

# Inference
sampling_params = SamplingParams(temperature=0.7, max_tokens=256)
prompts = ["什么是深度学习?", "解释机器学习"]
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    first_candidate = output.outputs[0]
    print(first_candidate.text)
融合层优化
def fuse_awq_layers(model, model_path=None):
    """Return an AWQ model loaded with fused layers for faster inference.

    The model is (re)loaded from disk with ``fuse_layers=True``, so a
    checkpoint path is required.

    Args:
        model: Either the checkpoint path itself, or an already loaded model
            (in which case ``model_path`` must be given).
        model_path: Path of the quantized checkpoint to reload. Defaults to
            ``model`` when that is a string or path-like object.

    Returns:
        The model returned by ``AutoAWQForCausalLM.from_quantized``.

    Raises:
        ValueError: If no checkpoint path can be determined.
    """
    import os

    if model_path is None and isinstance(model, (str, os.PathLike)):
        model_path = model
    if model_path is None:
        # The original body read an undefined ``model_path`` name here.
        raise ValueError(
            "model_path is required: fusion is applied when the quantized "
            "checkpoint is loaded with fuse_layers=True"
        )

    return AutoAWQForCausalLM.from_quantized(
        model_path,
        fuse_layers=True,
    )
常见问题
量化失败
# Troubleshooting table: failure cause -> recommended remedy.
_REMEDIES = (
    ("校准数据问题", "确保校准数据质量和数量"),
    ("内存不足", "使用CPU卸载或减少batch_size"),
    ("模型兼容性", "检查模型是否支持AWQ量化"),
)
solutions = dict(_REMEDIES)
质量下降
# Remedy for quality loss after quantization.
def improve_awq_quality(model, calibration_data):
    """Return a quantization config aimed at higher AWQ quality.

    Only the config is produced here. The other two remedies are the
    caller's responsibility: supply more calibration samples and make sure
    they are diverse and of high quality.

    Args:
        model: Unused; kept for interface compatibility.
        calibration_data: Unused. The original sliced it to 256 samples but
            discarded the result, so the slice had no effect.

    Returns:
        Dict with 4-bit weights, a group size of 64 and zero-point
        quantization enabled.
    """
    return {
        "w_bit": 4,
        "q_group_size": 64,  # smaller groups
        "zero_point": True,
    }
AWQ通过激活感知机制,在保持高质量的同时实现了高效的LLM量化,是部署大模型的重要工具。