GPTQ:训练后量化技术
---
title: "GPTQ:训练后量化技术"
description: "掌握GPTQ的原理和实现,实现LLM的高效量化压缩"
tags: ["GPTQ", "训练后量化", "模型压缩", "INT4量化"]
category: "llm"
icon: "🧠"
---
GPTQ:训练后量化技术
GPTQ简介
GPTQ(GPT Quantization)是一种基于二阶信息的训练后量化方法,可以将大语言模型压缩到INT4精度,同时保持较高的模型质量。GPTQ由Elias Frantar等人提出,是目前最流行的LLM量化方法之一。
GPTQ的核心优势:
- 高压缩率:INT4量化可将权重压缩到FP16原始大小的约1/4
- 快速量化:量化过程高效
- 质量保持:量化后模型性能损失小
- 无需重训练:属于训练后量化,只需少量校准数据做前向计算即可一次性完成量化,不需要反向传播或微调
工作原理
量化基础
import torch
import torch.nn as nn
def quantize_weight(weight, bits=4, group_size=128):
    """Simulate symmetric group-wise quantization of a 2-D weight matrix.

    Every row is split into contiguous groups of ``group_size`` columns. Each
    group gets its own scale (absolute maximum divided by the largest positive
    integer level), is rounded onto the signed ``bits``-bit grid and is then
    mapped back to floating point.

    Args:
        weight: 2-D tensor of shape ``(rows, cols)``. ``cols`` has to be a
            multiple of ``group_size``; the reshape fails otherwise.
        bits: Width of the signed integer grid, at least 2.
        group_size: Number of consecutive columns that share one scale.

    Returns:
        A tuple ``(dequantized, scale)``. ``dequantized`` has the shape of
        ``weight``; ``scale`` has shape ``(rows, cols // group_size, 1)`` and
        is 0 for groups that contain only zeros.

    Raises:
        ValueError: If ``bits`` is smaller than 2, because the symmetric grid
            would have no positive level to scale against.
    """
    if bits < 2:
        raise ValueError("bits must be at least 2 for symmetric quantization")

    rows, cols = weight.shape
    qmax = 2 ** (bits - 1) - 1
    qmin = -(2 ** (bits - 1))

    # Group-wise view: (rows, number of groups, group_size)
    weight_groups = weight.reshape(rows, cols // group_size, group_size)

    # One scale per group, derived from the group's absolute maximum
    max_vals = weight_groups.abs().max(dim=-1, keepdim=True).values
    scale = max_vals / qmax

    # An all-zero group has scale 0; dividing by it would turn the whole
    # group into NaN, so divide by 1 there (the result is still exactly 0).
    safe_scale = torch.where(scale == 0, torch.ones_like(scale), scale)

    quantized = torch.clamp(torch.round(weight_groups / safe_scale), qmin, qmax)

    # Map the integer levels back to floating point
    dequantized = quantized * scale
    return dequantized.reshape(rows, cols), scale
OBS(Optimal Brain Surgeon)
def obs_quantize(weight, hessian_inv, bits=4):
    """Quantize a weight matrix column by column with OBS error compensation.

    Columns are processed from left to right. Each column is rounded to the
    nearest integer and clamped to the signed ``bits``-bit range. The rounding
    error, weighted by the inverse Hessian, is then pushed onto the columns
    that have not been quantized yet, and the processed coordinate is removed
    from the inverse Hessian before the next step.

    Args:
        weight: 2-D tensor ``(rows, cols)`` to quantize; it is not modified.
            The values are rounded onto the integer grid directly, so they
            are assumed to be pre-scaled to that grid -- TODO confirm with
            callers.
        hessian_inv: Inverse Hessian, indexed as ``[col, col]``; it is not
            modified.
        bits: Width of the signed integer grid used for clamping.

    Returns:
        Tensor with the shape of ``weight`` holding the quantized values.
    """
    _, cols = weight.shape
    qmin = -(2 ** (bits - 1))
    qmax = 2 ** (bits - 1) - 1

    # Work on copies: both matrices are updated in place while sweeping
    work = weight.clone()
    h_inv = hessian_inv.clone()
    quantized_weight = torch.empty_like(weight)

    for i in range(cols):
        w = work[:, i]
        q = torch.clamp(w.round(), qmin, qmax)
        quantized_weight[:, i] = q

        # Small constant keeps the division finite for a vanishing diagonal
        diag = h_inv[i, i] + 1e-6
        err = (w - q) / diag

        # Spread the rounding error over the columns still to be quantized
        work[:, i + 1:] -= err.unsqueeze(1) * h_inv[i, i + 1:].unsqueeze(0)

        # Eliminate coordinate i from the inverse Hessian of the remainder
        h_inv[i + 1:, i + 1:] -= (
            torch.outer(h_inv[i + 1:, i], h_inv[i, i + 1:]) / diag
        )

    return quantized_weight
使用GPTQ库
安装和配置
pip install auto-gptq
pip install transformers
基本使用
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer
def quantize_model_gptq(model_name, calibration_data, output_dir):
    """Quantize a causal LM with AutoGPTQ and store the result on disk.

    Args:
        model_name: Model identifier or path handed to ``from_pretrained``.
        calibration_data: Sequence of raw text samples; only the first 128
            are tokenized (truncated to 512 tokens) and used for calibration.
        output_dir: Directory receiving the quantized model and the tokenizer.

    Returns:
        The quantized model object.
    """
    # 4-bit, asymmetric, groups of 128, columns ordered by activation size
    gptq_settings = {
        "bits": 4,
        "group_size": 128,
        "damp_percent": 0.01,
        "desc_act": True,
        "sym": False,
    }
    quantize_config = BaseQuantizeConfig(**gptq_settings)

    # Load the full-precision model together with its tokenizer
    model = AutoGPTQForCausalLM.from_pretrained(
        model_name, quantize_config, device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize at most 128 calibration samples
    calibration_dataset = [
        tokenizer(sample, return_tensors="pt", max_length=512, truncation=True)
        for sample in calibration_data[:128]
    ]

    # Run GPTQ, then persist the model and the tokenizer side by side
    model.quantize(calibration_dataset)
    model.save_quantized(output_dir)
    tokenizer.save_pretrained(output_dir)
    return model
# Usage: replace these placeholder samples with a representative calibration
# corpus. Every list element must be a string, because each one is passed to
# the tokenizer as-is.
calibration_data = ["示例文本1", "示例文本2"]
quantized_model = quantize_model_gptq(
    "meta-llama/Llama-2-7b-hf",
    calibration_data,
    "./llama2-7b-gptq-4bit",
)
加载量化模型
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer
def load_quantized_model(model_path):
    """Load a GPTQ-quantized model and its tokenizer from ``model_path``.

    Args:
        model_path: Directory (or identifier) containing the quantized
            checkpoint and the tokenizer files.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Loader options: automatic device placement, no Triton kernels,
    # safetensors checkpoint format, CUDA FP16 path enabled
    load_options = {
        "device_map": "auto",
        "use_triton": False,
        "use_safetensors": True,
        "use_cuda_fp16": True,
    }
    model = AutoGPTQForCausalLM.from_quantized(model_path, **load_options)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return model, tokenizer
# Usage
model, tokenizer = load_quantized_model("./llama2-7b-gptq-4bit")

# Inference: the prompt tensors are created on the CPU, so move them to the
# model's device before generating
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))
量化配置优化
不同配置对比
# Named GPTQ settings for side-by-side comparison. Each entry combines the
# quantization parameters with a human-readable "description".
configs = {
    # Baseline: 4 bits, groups of 128, activation ordering enabled
    "4bit_128": {
        "bits": 4,
        "group_size": 128,
        "desc_act": True,
        "description": "标准配置,平衡质量和大小"
    },
    # Same bit width with smaller groups of 64
    "4bit_64": {
        "bits": 4,
        "group_size": 64,
        "desc_act": True,
        "description": "更细粒度量化,质量更好"
    },
    # 3 bits with groups of 128
    "3bit_128": {
        "bits": 3,
        "group_size": 128,
        "desc_act": True,
        "description": "更高压缩,质量有损"
    },
    # Symmetric variant: sets "sym" and, unlike the entries above, carries
    # no "desc_act" key
    "4bit_symmetric": {
        "bits": 4,
        "group_size": 128,
        "sym": True,
        "description": "对称量化,推理更快"
    }
}
自动配置选择
def select_quantization_config(model_size, target_memory_gb):
    """Pick the widest bit width whose weights fit into the memory budget.

    The FP16 footprint is estimated as ``model_size * 2`` GB, and a ``b``-bit
    format is treated as compressing it by ``16 / b``. Group-wise scale and
    zero-point overhead is not included in the estimate.

    Args:
        model_size: Model size; presumably the parameter count in billions,
            given the 2-bytes-per-parameter estimate -- verify with callers.
        target_memory_gb: Memory budget for the weights in GB; must be
            positive.

    Returns:
        A dict with the keys ``"bits"`` and ``"group_size"``. If even 2-bit
        weights exceed the budget, the 2-bit configuration is still returned
        as the best effort.

    Raises:
        ValueError: If ``target_memory_gb`` is not positive.
    """
    if target_memory_gb <= 0:
        raise ValueError("target_memory_gb must be positive")

    fp16_bits = 16
    original_size_gb = model_size * 2  # FP16: two bytes per parameter
    compression_ratio = original_size_gb / target_memory_gb

    # Ordered from highest to lowest precision; the first one that fits wins.
    # 3-bit only reaches 16/3 (about 5.33x), so larger ratios need 2-bit.
    candidates = (
        {"bits": 8, "group_size": 128},
        {"bits": 4, "group_size": 128},
        {"bits": 3, "group_size": 64},
    )
    for candidate in candidates:
        if compression_ratio <= fp16_bits / candidate["bits"]:
            return dict(candidate)

    # 2-bit quantization: noticeable quality loss
    return {"bits": 2, "group_size": 128}
性能评估
内存对比
def compare_model_sizes(model_name):
    """Estimate the weight footprint of a model at several precisions.

    The model is loaded once to count its parameters; the sizes are then
    computed from fixed bytes-per-parameter figures.

    Args:
        model_name: Identifier or path handed to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        Dict mapping ``"FP32"``, ``"FP16"``, ``"INT8"`` and ``"INT4"`` to the
        estimated size, computed as bytes divided by ``1024**3``.
    """
    from transformers import AutoModelForCausalLM

    # Load the original model to obtain the parameter count
    model = AutoModelForCausalLM.from_pretrained(model_name)
    param_count = sum(p.numel() for p in model.parameters())

    bytes_per_param = {"FP32": 4, "FP16": 2, "INT8": 1, "INT4": 0.5}
    return {
        precision: param_count * width / 1024**3
        for precision, width in bytes_per_param.items()
    }
# Usage
sizes = compare_model_sizes("meta-llama/Llama-2-7b-hf")
# "fmt" rather than "format": avoids shadowing the builtin at module level
for fmt, size in sizes.items():
    print(f"{fmt}: {size:.2f} GB")
推理速度对比
import time
def benchmark_inference(model, tokenizer, prompts, num_runs=10, max_new_tokens=100):
    """Measure the average generation latency over a set of prompts.

    For each prompt one short warm-up generation is run, followed by
    ``num_runs`` timed generations of ``max_new_tokens`` tokens each.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        tokenizer: Callable turning a prompt into the keyword inputs of
            ``model.generate``.
        prompts: Non-empty sequence of prompt strings.
        num_runs: Number of timed generations per prompt; must be positive.
        max_new_tokens: Tokens requested per timed generation.

    Returns:
        Dict with ``"avg_latency"`` (seconds per generation, averaged over
        the prompts) and ``"throughput"`` (``max_new_tokens`` divided by that
        latency). Throughput is an upper bound when generation stops early.

    Raises:
        ValueError: If ``prompts`` is empty or ``num_runs`` is not positive.
    """
    if not prompts:
        raise ValueError("prompts must not be empty")
    if num_runs <= 0:
        raise ValueError("num_runs must be positive")

    device = getattr(model, "device", None)
    latencies = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt")
        # Keep the inputs on the model's device when both sides support it
        if device is not None and hasattr(inputs, "to"):
            inputs = inputs.to(device)

        # Warm-up run, excluded from the measurement
        model.generate(**inputs, max_new_tokens=10)

        # perf_counter is monotonic and has a higher resolution than time()
        start = time.perf_counter()
        for _ in range(num_runs):
            model.generate(**inputs, max_new_tokens=max_new_tokens)
        latencies.append((time.perf_counter() - start) / num_runs)

    avg_latency = sum(latencies) / len(latencies)
    throughput = max_new_tokens / avg_latency if avg_latency > 0 else float("inf")
    return {
        "avg_latency": avg_latency,
        "throughput": throughput,  # tokens/second
    }
质量评估
def evaluate_quantized_quality(original_model, quantized_model, tokenizer, eval_dataset):
    """Compare the logits of a quantized model with those of the original.

    Every sample is tokenized (truncated to 512 tokens) and fed through both
    models without gradient tracking. The flattened logits are compared with
    cosine similarity and the per-sample scores are averaged.

    Args:
        original_model: Reference model; calling it must return an object
            with a ``logits`` tensor.
        quantized_model: Quantized model with the same call interface.
        tokenizer: Callable producing the keyword inputs for both models.
        eval_dataset: Non-empty iterable of mappings with the keys ``"text"``
            and ``"id"``. Samples sharing an ``"id"`` overwrite each other.

    Returns:
        Mean cosine similarity over the distinct sample ids, as a float.

    Raises:
        ValueError: If ``eval_dataset`` yields no samples.
    """
    import torch.nn.functional as F

    results = {}
    for sample in eval_dataset:
        # max_length only takes effect together with truncation=True
        inputs = tokenizer(
            sample["text"], return_tensors="pt", max_length=512, truncation=True
        )

        with torch.no_grad():
            original_output = original_model(**inputs).logits
            quantized_output = quantized_model(**inputs).logits

        # Compare in float32 so half-precision logits cannot overflow
        cosine_sim = F.cosine_similarity(
            original_output.flatten().float(),
            quantized_output.flatten().float(),
            dim=0,
        ).item()
        results[sample["id"]] = cosine_sim

    if not results:
        raise ValueError("eval_dataset must not be empty")
    return sum(results.values()) / len(results)
与vLLM集成
from vllm import LLM, SamplingParams
# Prompts and sampling settings for the demo
prompts = ["什么是机器学习?", "如何学习Python?"]
sampling_params = SamplingParams(temperature=0.7, max_tokens=256)

# Serve the GPTQ-quantized checkpoint through vLLM
llm = LLM(
    model="./llama2-7b-gptq-4bit",
    quantization="gptq",  # name the quantization method explicitly
    max_model_len=4096,
    gpu_memory_utilization=0.9,
)

# Generate and print the first completion of every prompt
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    first_completion = output.outputs[0]
    print(first_completion.text)
常见问题
量化质量下降
# 解决方案
# Remedies for quality loss after quantization, keyed by a short name.
# In order: add calibration data, reduce group_size, enable desc_act,
# keep sensitive layers at higher precision.
solutions = {
    "增加校准数据": "使用更多、更多样化的校准数据",
    "减小group_size": "使用更小的分组大小(如64)",
    "使用desc_act": "启用激活值排序量化",
    "混合精度": "对敏感层使用更高精度"
}
显存不足
# 解决方案
def quantize_with_cpu_offload(model_name, calibration_data, quantize_config=None):
    """Quantize a model while keeping most of its weights in CPU memory.

    Args:
        model_name: Model identifier or path handed to ``from_pretrained``.
        calibration_data: Calibration examples forwarded unchanged to
            ``model.quantize``; presumably already tokenized -- verify
            against the caller.
        quantize_config: Optional ``BaseQuantizeConfig``. Defaults to 4-bit
            quantization with a group size of 128.

    Returns:
        The quantized model object.
    """
    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

    # from_pretrained needs a quantization config; supply a default one
    if quantize_config is None:
        quantize_config = BaseQuantizeConfig(bits=4, group_size=128)

    # NOTE(review): AutoGPTQ appears to derive the device map from
    # max_memory; confirm whether device_map="cpu" is honored as well.
    model = AutoGPTQForCausalLM.from_pretrained(
        model_name,
        quantize_config,
        device_map="cpu",  # keep the weights on the CPU
        max_memory={0: "20GB", "cpu": "100GB"},
    )
    model.quantize(calibration_data)
    return model
GPTQ作为高效的训练后量化方法,已成为LLM部署的标配技术,能够显著降低模型部署成本。