INT4推理
---
title: "INT4推理"
description: "INT4量化推理技术详解,涵盖GPTQ、AWQ和GGUF格式"
tags: ["INT4推理", "GPTQ", "AWQ", "GGUF", "量化"]
category: "llm"
icon: "🧠"
---
INT4推理
INT4量化是将模型权重从16位浮点数压缩到4位整数的技术,可在保持较好性能的同时大幅降低显存占用和推理延迟。本文介绍GPTQ、AWQ和GGUF三种主流4位量化方案。
GPTQ量化
GPTQ基于最优脑量化(OBQ)方法(其思想源自最优脑外科 OBS),利用近似二阶信息逐层量化权重矩阵:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
# Load a GPTQ-quantized model directly through transformers.
model_id = "TheBloke/Llama-2-7B-Chat-GPTQ"

# The tokenizer and the model are loaded independently from the same repo id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

gptq_config = GPTQConfig(
    bits=4,
    group_size=128,  # quantization group size
    desc_act=True,  # order by activation magnitude
    dataset="c4",  # calibration dataset
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=gptq_config,
)

# Inference: tokenize the prompt, move it to the model's device, generate,
# then decode the first returned sequence.
prompt = "什么是GPTQ量化?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
手动GPTQ量化流程
import torch
import torch.nn as nn
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
def gptq_quantize(model_path, save_path, bits=4, group_size=128,
                  calibration_texts=None):
    """Quantize a model with AutoGPTQ and save the result.

    Args:
        model_path: Model path or repo id passed to ``from_pretrained`` for
            both the model and its tokenizer.
        save_path: Directory the quantized model and tokenizer are written to.
        bits: Bit width stored in the quantization config.
        group_size: Group size stored in the quantization config.
        calibration_texts: Optional string or iterable of strings used as
            calibration samples. When omitted, a single placeholder sample
            (the same text the original example used) is used.

    Raises:
        ValueError: If ``calibration_texts`` is given but contains no samples.
    """
    # Quantization parameters.
    quantize_config = BaseQuantizeConfig(
        bits=bits,
        group_size=group_size,
        desc_act=True,
        damp_percent=0.01,
        true_sequential=True,
    )

    # Load the original (unquantized) model and its tokenizer.
    model = AutoGPTQForCausalLM.from_pretrained(model_path, quantize_config)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Prepare calibration samples.
    if calibration_texts is None:
        calibration_texts = ["这是一个校准文本示例..." * 100]
    elif isinstance(calibration_texts, str):
        # A bare string would otherwise be iterated character by character.
        calibration_texts = [calibration_texts]

    # AutoGPTQ's quantize() takes a list of examples, each a mapping that
    # provides both "input_ids" and "attention_mask"; a bare input_ids tensor
    # is not accepted, so keep the full tokenizer output per sample.
    calibration_data = [
        tokenizer(text, return_tensors="pt") for text in calibration_texts
    ]
    if not calibration_data:
        raise ValueError("calibration_texts must contain at least one sample")

    # Run quantization.
    model.quantize(calibration_data, batch_size=1)

    # Save the quantized model together with its tokenizer.
    model.save_quantized(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"量化完成,保存至 {save_path}")
AWQ量化
AWQ(Activation-aware Weight Quantization)通过分析激活值分布来保护重要权重:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
# Load an AWQ-quantized model.
# AutoAWQ loads already-quantized checkpoints with from_quantized();
# from_pretrained() is its loader for unquantized weights that are about to
# be quantized.
model = AutoAWQForCausalLM.from_quantized(
    "TheBloke/Llama-2-7B-Chat-AWQ",
    safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-7B-Chat-AWQ")

# Inference
# Take the target device from the model's parameters so this does not depend
# on the AutoAWQ wrapper exposing a `.device` attribute.
inputs = tokenizer("AWQ量化的优势是什么?", return_tensors="pt").to(
    next(model.parameters()).device
)
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
AWQ量化流程
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer


def awq_quantize(model_path, save_path, bits=4, group_size=128):
    """Quantize a model with AutoAWQ and save the result.

    Args:
        model_path: Model path or repo id passed to ``from_pretrained`` for
            both the model and its tokenizer.
        save_path: Directory the quantized model and tokenizer are written to.
        bits: Weight bit width, stored as ``w_bit`` in the quant config.
        group_size: Group size, stored as ``q_group_size`` in the quant config.
    """
    model = AutoAWQForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # AWQ quantization settings.
    quant_config = {
        "zero_point": True,
        "q_group_size": group_size,
        "w_bit": bits,
        "version": "GEMM",
    }

    # Run quantization. AutoAWQ exposes this as a method on the loaded model;
    # there is no standalone `awq.quantize.quantize` function to import.
    model.quantize(
        tokenizer,
        quant_config=quant_config,
        calib_data="pileval",  # calibrate on the pileval dataset
    )

    # Save the quantized model together with its tokenizer.
    model.save_quantized(save_path)
    tokenizer.save_pretrained(save_path)
GGUF格式
GGUF是llama.cpp生态的标准格式,支持CPU和混合推理:
from llama_cpp import Llama
# Load a GGUF-quantized model.
llm = Llama(
    model_path="models/llama-2-7b-chat.Q4_K_M.gguf",
    n_ctx=4096,
    n_threads=8,  # number of CPU threads
    n_gpu_layers=35,  # number of layers offloaded to the GPU (hybrid inference)
)

# Inference: build the chat transcript first, then request a completion.
chat_messages = [
    {"role": "system", "content": "你是一个有帮助的助手。"},
    {"role": "user", "content": "解释GGUF格式的优势"},
]
output = llm.create_chat_completion(
    messages=chat_messages,
    temperature=0.7,
    max_tokens=512,
)

# Print the message content of the first returned choice.
first_choice = output["choices"][0]
print(first_choice["message"]["content"])
GGUF量化等级
# GGUF quantization levels: maps each level name to a short description.
# Entries are listed from the lowest bit width to the highest.
quantization_levels = dict(
    Q2_K="2-bit,最高压缩,质量损失较大",
    Q3_K_S="3-bit小,平衡压缩和质量",
    Q3_K_M="3-bit中,较常用",
    Q4_0="4-bit基础,速度最快",
    Q4_K_S="4-bit小,推荐日常使用",
    Q4_K_M="4-bit中,最佳平衡点",
    Q5_0="5-bit基础",
    Q5_K_S="5-bit小,高质量",
    Q5_K_M="5-bit中,接近原始质量",
    Q6_K="6-bit,几乎无损",
    Q8_0="8-bit,近似无损",
)
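# Command-line conversion example (two steps): convert_hf_to_gguf.py does not
# emit K-quants, so export to f16 first and then quantize with llama-quantize.
# python convert_hf_to_gguf.py model_dir --outfile model-f16.gguf --outtype f16
# llama-quantize model-f16.gguf model-q4_k_m.gguf Q4_K_M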
量化方法对比
def benchmark_quantization():
    """Print a comparison table of quantization methods.

    The memory and throughput figures are hard-coded in this function;
    nothing is measured at run time. Output is a header row, a 40-character
    dashed rule, and one row per method, written to stdout.

    Returns:
        None.
    """
    methods = {
        "FP16": {"memory_gb": 13.5, "tokens_per_sec": 45},
        "INT8": {"memory_gb": 6.8, "tokens_per_sec": 52},
        "GPTQ-INT4": {"memory_gb": 3.5, "tokens_per_sec": 58},
        "AWQ-INT4": {"memory_gb": 3.5, "tokens_per_sec": 60},
        "GGUF-Q4_K_M": {"memory_gb": 4.2, "tokens_per_sec": 35},  # CPU
        "GGUF-Q4_K_M+GPU": {"memory_gb": 4.2, "tokens_per_sec": 55},  # hybrid
    }

    print(f"{'方法':<15} {'显存(GB)':<10} {'速度(tokens/s)':<15}")
    print("-" * 40)
    for method, stats in methods.items():
        print(f"{method:<15} {stats['memory_gb']:<10.1f} {stats['tokens_per_sec']:<15}")
实际部署建议
# Decision tree for choosing a quantization scheme
def recommend_quantization(use_case):
    """Return the recommended quantization setup for a usage scenario.

    Args:
        use_case: Scenario key; the recognized keys are "server_gpu",
            "desktop_gpu", "cpu_only", "edge_device" and "quality_critical".

    Returns:
        The recommendation string for ``use_case``, or the general-purpose
        AWQ INT4 recommendation when the key is not recognized.
    """
    fallback = "AWQ INT4 (通用推荐)"
    by_scenario = {
        "server_gpu": "GPTQ或AWQ (INT4),使用vLLM部署",
        "desktop_gpu": "GGUF Q4_K_M + GPU卸载,使用llama.cpp",
        "cpu_only": "GGUF Q4_K_M或Q5_K_M,使用llama.cpp",
        "edge_device": "GGUF Q3_K_S或Q2_K,极致压缩",
        "quality_critical": "AWQ INT4或直接使用FP16",
    }
    try:
        return by_scenario[use_case]
    except KeyError:
        # Unknown scenario: fall back to the general-purpose suggestion.
        return fallback
INT4量化使大模型在消费级硬件上运行成为可能,是推动LLM普及的关键技术。根据硬件条件和精度要求选择合适的量化方案至关重要。