← 返回首页
🧠

INT4推理

📂 llm ⏱ 2 min 351 words

---
title: "INT4推理"
description: "INT4量化推理技术详解,涵盖GPTQ、AWQ和GGUF格式"
tags: ["INT4推理", "GPTQ", "AWQ", "GGUF", "量化"]
category: "llm"
icon: "🧠"
---

INT4推理

INT4量化是将模型权重从16位浮点数压缩到4位整数的技术,可在基本保持模型精度的同时大幅降低显存占用,并通常能降低推理延迟。本文介绍GPTQ、AWQ和GGUF三种主流4位量化方案。

GPTQ量化

GPTQ基于最优脑量化(OBQ)方法(其思想源自最优脑外科OBS),利用二阶信息逐层量化权重矩阵:

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# Load a GPTQ-quantized checkpoint directly through transformers.
model_id = "TheBloke/Llama-2-7B-Chat-GPTQ"

# bits: weight bit width; group_size: size of each quantization group;
# desc_act: order by activation magnitude; dataset: calibration dataset name.
gptq_config = GPTQConfig(bits=4, group_size=128, desc_act=True, dataset="c4")

# Tokenizer and model loads are independent of each other.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=gptq_config,
)

# Inference: tokenize the prompt, move it to the model's device, generate up
# to 256 new tokens, then decode the first returned sequence.
prompt = "什么是GPTQ量化?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)

手动GPTQ量化流程

import torch
import torch.nn as nn
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

def gptq_quantize(model_path, save_path, bits=4, group_size=128):
    """Quantize a model with GPTQ through AutoGPTQ and save the result.

    Args:
        model_path: Path or hub id of the original model; the tokenizer is
            loaded from the same location.
        save_path: Directory that receives the quantized weights and the
            tokenizer files.
        bits: Bit width forwarded to ``BaseQuantizeConfig``.
        group_size: Group size forwarded to ``BaseQuantizeConfig``.

    Returns:
        None. A completion message is printed after everything is saved.
    """
    # Quantization parameters.
    quantize_config = BaseQuantizeConfig(
        bits=bits,
        group_size=group_size,
        desc_act=True,
        damp_percent=0.01,
        true_sequential=True,
    )

    # Load the original model together with its tokenizer.
    model = AutoGPTQForCausalLM.from_pretrained(
        model_path, quantize_config
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Calibration data. AutoGPTQ's quantize() takes a list of examples, each
    # providing "input_ids" and "attention_mask"; passing only the input_ids
    # tensor (as before) does not match that contract, so the full tokenizer
    # output is wrapped in a list instead.
    # NOTE(review): one repeated sentence is a placeholder; real calibration
    # presumably wants many representative samples.
    calibration_data = [
        tokenizer(
            "这是一个校准文本示例..." * 100,
            return_tensors="pt",
        )
    ]

    # Run the quantization.
    model.quantize(calibration_data, batch_size=1)

    # Persist the quantized model and the tokenizer side by side.
    model.save_quantized(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"量化完成,保存至 {save_path}")

AWQ量化

AWQ(Activation-aware Weight Quantization)通过分析激活值分布来保护重要权重:

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Load an AWQ checkpoint that is already quantized. AutoAWQ loads such
# checkpoints with from_quantized(); from_pretrained() is the entry point for
# original weights that are about to be quantized.
model = AutoAWQForCausalLM.from_quantized(
    "TheBloke/Llama-2-7B-Chat-AWQ",
    safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-7B-Chat-AWQ")

# Inference. The device is read from the first parameter so this works whether
# or not the AutoAWQ wrapper exposes a `.device` attribute of its own.
device = next(model.parameters()).device
inputs = tokenizer("AWQ量化的优势是什么?", return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

AWQ量化流程

from awq import AutoAWQForCausalLM
from awq.quantize import quantize

def awq_quantize(model_path, save_path, bits=4, group_size=128):
    """Quantize a model with AWQ through AutoAWQ and save the result.

    Args:
        model_path: Path or hub id of the original model; the tokenizer is
            loaded from the same location.
        save_path: Directory that receives the quantized weights and the
            tokenizer files.
        bits: Weight bit width, stored as ``w_bit`` in the quantization config.
        group_size: Group size, stored as ``q_group_size`` in the config.

    Returns:
        None.
    """
    model = AutoAWQForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # AWQ quantization settings.
    quant_config = {
        "zero_point": True,
        "q_group_size": group_size,
        "w_bit": bits,
        "version": "GEMM",
    }

    # Run the quantization. In AutoAWQ this is a method on the model wrapper
    # that takes the tokenizer and the config, not a free function applied to
    # `model.model`.
    # NOTE(review): the module-level `from awq.quantize import quantize` is no
    # longer used by this function and presumably should be dropped - confirm.
    model.quantize(
        tokenizer,
        quant_config=quant_config,
        calib_data="pileval",  # calibrate on the pileval dataset
    )

    # Save the quantized weights and the tokenizer side by side.
    model.save_quantized(save_path)
    tokenizer.save_pretrained(save_path)

GGUF格式

GGUF是llama.cpp生态的标准模型文件格式,既支持纯CPU推理,也支持将部分层卸载到GPU的CPU+GPU混合推理:

from llama_cpp import Llama

# Conversation sent to the model: a system instruction followed by one user turn.
chat_messages = [
    {"role": "system", "content": "你是一个有帮助的助手。"},
    {"role": "user", "content": "解释GGUF格式的优势"},
]

# Load the GGUF-quantized model file.
llm = Llama(
    model_path="models/llama-2-7b-chat.Q4_K_M.gguf",
    n_ctx=4096,
    n_threads=8,  # number of CPU threads
    n_gpu_layers=35,  # layers offloaded to the GPU (hybrid inference)
)

# Inference: request a chat completion and print the first choice's text.
output = llm.create_chat_completion(
    messages=chat_messages,
    max_tokens=512,
    temperature=0.7,
)
first_choice = output["choices"][0]
print(first_choice["message"]["content"])

GGUF量化等级

# Short description of each GGUF quantization level, keyed by level name.
quantization_levels = {
    "Q2_K": "2-bit,最高压缩,质量损失较大",
    "Q3_K_S": "3-bit小,平衡压缩和质量",
    "Q3_K_M": "3-bit中,较常用",
    "Q4_0": "4-bit基础,速度最快",
    "Q4_K_S": "4-bit小,推荐日常使用",
    "Q4_K_M": "4-bit中,最佳平衡点",
    "Q5_0": "5-bit基础",
    "Q5_K_S": "5-bit小,高质量",
    "Q5_K_M": "5-bit中,接近原始质量",
    "Q6_K": "6-bit,几乎无损",
    "Q8_0": "8-bit,近似无损",
}

# Command-line conversion example.
# NOTE(review): convert_hf_to_gguf.py's --outtype covers float/8-bit outputs
# (e.g. f16, q8_0) rather than K-quants, so Q4_K_M is produced in a second step
# with llama-quantize - confirm against the llama.cpp version in use:
#   python convert_hf_to_gguf.py model_dir --outfile model-f16.gguf --outtype f16
#   ./llama-quantize model-f16.gguf model-Q4_K_M.gguf Q4_K_M

量化方法对比

def benchmark_quantization():
    """Print a table comparing quantization methods.

    The memory and throughput figures are hard-coded reference values; nothing
    is measured when the function runs. One header row, one separator row and
    one row per method are written to standard output.

    Returns:
        None.
    """
    methods = {
        "FP16": {"memory_gb": 13.5, "tokens_per_sec": 45},
        "INT8": {"memory_gb": 6.8, "tokens_per_sec": 52},
        "GPTQ-INT4": {"memory_gb": 3.5, "tokens_per_sec": 58},
        "AWQ-INT4": {"memory_gb": 3.5, "tokens_per_sec": 60},
        "GGUF-Q4_K_M": {"memory_gb": 4.2, "tokens_per_sec": 35},  # CPU
        "GGUF-Q4_K_M+GPU": {"memory_gb": 4.2, "tokens_per_sec": 55},  # hybrid
    }

    # Columns are left-aligned to fixed widths of 15 / 10 / 15 characters.
    print(f"{'方法':<15} {'显存(GB)':<10} {'速度(tokens/s)':<15}")
    print("-" * 40)
    for method, stats in methods.items():
        print(f"{method:<15} {stats['memory_gb']:<10.1f} {stats['tokens_per_sec']:<15}")

实际部署建议

# Lookup that maps a deployment scenario to a suggested quantization scheme.
def recommend_quantization(use_case):
    """Return the recommended quantization scheme for ``use_case``.

    Args:
        use_case: Scenario key, one of ``"server_gpu"``, ``"desktop_gpu"``,
            ``"cpu_only"``, ``"edge_device"`` or ``"quality_critical"``.

    Returns:
        The recommendation string for the scenario, or the general-purpose
        default when the key is not one of the known scenarios.
    """
    fallback = "AWQ INT4 (通用推荐)"
    by_scenario = {
        "server_gpu": "GPTQ或AWQ (INT4),使用vLLM部署",
        "desktop_gpu": "GGUF Q4_K_M + GPU卸载,使用llama.cpp",
        "cpu_only": "GGUF Q4_K_M或Q5_K_M,使用llama.cpp",
        "edge_device": "GGUF Q3_K_S或Q2_K,极致压缩",
        "quality_critical": "AWQ INT4或直接使用FP16",
    }
    try:
        return by_scenario[use_case]
    except KeyError:
        return fallback

INT4量化使大模型在消费级硬件上运行成为可能,是推动LLM普及的关键技术。根据硬件条件和精度要求选择合适的量化方案至关重要。