← 返回首页
🧠

GPTQ:训练后量化技术

📂 llm ⏱ 4 min 615 words

---
title: "GPTQ:训练后量化技术"
description: "掌握GPTQ的原理和实现,实现LLM的高效量化压缩"
tags: ["GPTQ", "训练后量化", "模型压缩", "INT4量化"]
category: "llm"
icon: "🧠"
---

GPTQ:训练后量化技术

GPTQ简介

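GPTQ(出自论文《GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers》,常被简称为GPT Quantization)是一种基于近似二阶(Hessian)信息的训练后量化方法,可以将大语言模型的权重压缩到INT4精度,同时保持较高的模型质量。GPTQ由Elias Frantar等人提出,是目前最流行的LLM量化方法之一。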

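GPTQ的核心优势:

- 无需重新训练:属于训练后量化,只需少量校准数据即可完成
- 压缩比高:权重从FP16压缩到INT4,模型体积约为原来的1/4
- 精度损失小:利用二阶(Hessian)信息逐列补偿量化误差
- 部署成本低:显存占用显著下降,可在更小的GPU上运行大模型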

工作原理

量化基础

import torch
import torch.nn as nn

def quantize_weight(weight, bits=4, group_size=128):
    """Fake-quantize a 2-D weight matrix with symmetric per-group scaling.

    Each row is split into consecutive groups of ``group_size`` columns.
    Every group gets its own scale (max absolute value divided by the
    largest positive integer level), is rounded to the signed ``bits``-bit
    grid, and is immediately de-quantized again.

    Args:
        weight: 2-D tensor of shape ``(rows, cols)``.
        bits: Bit width of the signed integer grid; must be at least 2.
        group_size: Number of columns sharing one scale. ``cols`` does not
            have to be a multiple of it; the last group is zero-padded
            internally and the padding is removed from the result.

    Returns:
        A tuple ``(dequantized, scale)``. ``dequantized`` has the same shape
        as ``weight``; ``scale`` has shape ``(rows, n_groups, 1)`` where
        ``n_groups = ceil(cols / group_size)``.

    Raises:
        ValueError: If ``bits < 2`` or ``group_size < 1``.
    """
    if bits < 2:
        # With one bit the largest positive level is 0 and the scale would
        # be a division by zero.
        raise ValueError("bits must be at least 2 for symmetric quantization")
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")

    rows, cols = weight.shape
    qmin, qmax = -(2 ** (bits - 1)), 2 ** (bits - 1) - 1

    # Zero-pad the column axis up to a multiple of group_size. Zeros never
    # raise a group's max absolute value, so the scales are unaffected.
    pad = -cols % group_size
    padded = torch.nn.functional.pad(weight, (0, pad)) if pad else weight
    n_groups = (cols + pad) // group_size
    weight_groups = padded.reshape(rows, n_groups, group_size)

    # Per-group quantization parameters.
    max_vals = weight_groups.abs().max(dim=-1, keepdim=True).values
    scale = max_vals / qmax

    # An all-zero group has scale 0; divide by 1 instead so the result is 0
    # rather than NaN. The returned scale keeps the true value (0).
    safe_scale = torch.where(scale > 0, scale, torch.ones_like(scale))

    # Quantize to the integer grid.
    quantized = torch.clamp(torch.round(weight_groups / safe_scale), qmin, qmax)

    # De-quantize and drop the padding columns again.
    dequantized = (quantized * scale).reshape(rows, cols + pad)
    if pad:
        dequantized = dequantized[:, :cols]

    return dequantized, scale

OBS(Optimal Brain Surgeon)

def obs_quantize(weight, hessian_inv, bits=4):
    """Quantize ``weight`` column by column with OBS error compensation.

    Columns are processed left to right. After column ``i`` is rounded to
    the signed ``bits``-bit integer grid, the rounding error is spread over
    the not-yet-quantized columns using row ``i`` of the inverse Hessian
    (the Optimal Brain Surgeon update), and column ``i`` is then eliminated
    from the inverse Hessian. Without this step the function would be a
    plain ``round()``.

    Args:
        weight: 2-D tensor of shape ``(rows, cols)``; not modified. Values
            are rounded to integers directly, so they are presumably already
            expressed in units of the quantization step — confirm with the
            caller.
        hessian_inv: Inverse Hessian of shape ``(cols, cols)``; assumed
            symmetric. Not modified.
        bits: Bit width of the signed integer grid; rounded values are
            clamped to ``[-2**(bits-1), 2**(bits-1) - 1]``.

    Returns:
        A tensor shaped like ``weight`` holding the quantized values.
    """
    _rows, cols = weight.shape
    qmin, qmax = -(2 ** (bits - 1)), 2 ** (bits - 1) - 1

    # Work on copies so neither argument is mutated.
    quantized_weight = weight.clone()
    h_inv = hessian_inv.to(device=weight.device, dtype=weight.dtype).clone()

    for i in range(cols):
        # Current (already compensated) value of this column.
        w = quantized_weight[:, i].clone()
        q = torch.clamp(w.round(), qmin, qmax)
        quantized_weight[:, i] = q

        if i + 1 == cols:
            break  # nothing left to compensate

        # Small constant guards against a zero diagonal entry.
        diag = h_inv[i, i] + 1e-6
        err = (w - q) / diag

        # OBS weight update: delta_j = -err * Hinv[i, j] for remaining j.
        quantized_weight[:, i + 1:] -= err.unsqueeze(1) * h_inv[i, i + 1:].unsqueeze(0)

        # Remove column i from the inverse Hessian (rank-one downdate of
        # the remaining sub-matrix).
        h_inv[i + 1:, i + 1:] -= torch.outer(h_inv[i + 1:, i], h_inv[i, i + 1:]) / diag

    return quantized_weight

使用GPTQ库

安装和配置

# Install the auto-gptq package and Hugging Face transformers.
pip install auto-gptq
pip install transformers

基本使用

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer

def quantize_model_gptq(model_name, calibration_data, output_dir,
                        num_samples=128, max_length=512):
    """Quantize a causal LM to 4-bit with GPTQ and save it with its tokenizer.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.
        calibration_data: Sliceable sequence of calibration texts.
        output_dir: Directory that receives the quantized model and the
            tokenizer files.
        num_samples: Maximum number of calibration texts to use (taken from
            the front of ``calibration_data``).
        max_length: Token limit per calibration text; longer texts are
            truncated.

    Returns:
        The quantized model object.

    Raises:
        ValueError: If no calibration text is available. Raised before the
            model is loaded so the mistake is reported early.
    """
    texts = list(calibration_data[:num_samples])
    if not texts:
        raise ValueError("calibration_data must contain at least one text sample")

    # Quantization settings.
    quantize_config = BaseQuantizeConfig(
        bits=4,  # quantization bit width
        group_size=128,  # columns sharing one set of quantization parameters
        damp_percent=0.01,
        desc_act=True,  # order by activation magnitude
        sym=False,  # asymmetric quantization
    )

    # Load the model to be quantized.
    model = AutoGPTQForCausalLM.from_pretrained(
        model_name,
        quantize_config,
        device_map="auto"
    )

    # Load the matching tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize the calibration texts.
    calibration_dataset = [
        tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
        for text in texts
    ]

    # Run GPTQ.
    model.quantize(calibration_dataset)

    # Persist the quantized model next to its tokenizer.
    model.save_quantized(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model

# Example usage.
# Placeholder samples: replace them with real text. (The original list ended
# with a literal `...`, which is an Ellipsis object and not a string.)
calibration_data = ["示例文本1", "示例文本2"]
quantized_model = quantize_model_gptq(
    "meta-llama/Llama-2-7b-hf",
    calibration_data,
    "./llama2-7b-gptq-4bit"
)

加载量化模型

from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer

def load_quantized_model(model_path):
    """Load a GPTQ-quantized checkpoint together with its tokenizer.

    Args:
        model_path: Directory (or identifier) holding both the quantized
            weights and the tokenizer files.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loader_options = {
        "device_map": "auto",
        "use_triton": False,
        "use_safetensors": True,
        "use_cuda_fp16": True,
    }
    quantized = AutoGPTQForCausalLM.from_quantized(model_path, **loader_options)
    return quantized, AutoTokenizer.from_pretrained(model_path)

# Example usage.
model, tokenizer = load_quantized_model("./llama2-7b-gptq-4bit")

# Inference. The tokenizer returns CPU tensors, while the model may have been
# placed on an accelerator by device_map="auto"; move the inputs to the
# model's device before generating.
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))

量化配置优化

不同配置对比

# Candidate GPTQ settings keyed by a short label; each entry carries the
# quantization options plus a human-readable description.
configs = {
    "4bit_128": dict(
        bits=4,
        group_size=128,
        desc_act=True,
        description="标准配置,平衡质量和大小",
    ),
    "4bit_64": dict(
        bits=4,
        group_size=64,
        desc_act=True,
        description="更细粒度量化,质量更好",
    ),
    "3bit_128": dict(
        bits=3,
        group_size=128,
        desc_act=True,
        description="更高压缩,质量有损",
    ),
    "4bit_symmetric": dict(
        bits=4,
        group_size=128,
        sym=True,
        description="对称量化,推理更快",
    ),
}

自动配置选择

def select_quantization_config(model_size, target_memory_gb):
    """Pick the mildest quantization setting that reaches a memory target.

    The FP16 footprint is estimated as ``model_size * 2`` GB (so
    ``model_size`` is presumably the parameter count in billions — confirm
    with the caller). Going from 16 bits to ``b`` bits compresses by
    ``16 / b``, ignoring the small overhead of per-group parameters, so a
    bit width is accepted only when that factor covers the required ratio.

    Args:
        model_size: Model size used for the FP16 estimate.
        target_memory_gb: Desired footprint in GB; must be positive.

    Returns:
        A dict with ``"bits"`` and ``"group_size"``. When even 2-bit cannot
        reach the target, the 2-bit setting is still returned as the most
        aggressive option (with a large quality loss).

    Raises:
        ValueError: If ``target_memory_gb`` is not positive.
    """
    if target_memory_gb <= 0:
        raise ValueError("target_memory_gb must be positive")

    # Estimated size of the unquantized FP16 model.
    original_size_gb = model_size * 2

    # Compression factor needed to fit the target.
    compression_ratio = original_size_gb / target_memory_gb

    # (bits, group_size), ordered from least to most aggressive.
    candidates = ((8, 128), (4, 128), (3, 64), (2, 128))
    for bits, group_size in candidates:
        # 16 / bits is what this width can actually deliver; e.g. 3-bit
        # gives about 5.33x, not 8x.
        if compression_ratio <= 16 / bits:
            return {"bits": bits, "group_size": group_size}

    # Nothing fits: fall back to 2-bit.
    return {"bits": 2, "group_size": 128}

性能评估

内存对比

def compare_model_sizes(model_name):
    """Estimate the weight footprint of a model at several precisions.

    The model is loaded once to count its parameters; each size is then
    ``param_count * bytes_per_parameter / 1024**3``.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.

    Returns:
        A dict mapping ``"FP32"``, ``"FP16"``, ``"INT8"`` and ``"INT4"`` to
        the estimated size as a float (bytes divided by 1024**3).
    """
    from transformers import AutoModelForCausalLM

    # Load the original model only to count parameters.
    model = AutoModelForCausalLM.from_pretrained(model_name)
    param_count = sum(p.numel() for p in model.parameters())
    # Drop the reference so the weights can be freed before returning.
    del model

    bytes_per_param = {"FP32": 4, "FP16": 2, "INT8": 1, "INT4": 0.5}
    return {
        precision: param_count * nbytes / 1024**3
        for precision, nbytes in bytes_per_param.items()
    }

# Example usage. The loop variable is named `precision` so it does not shadow
# the builtin `format`.
sizes = compare_model_sizes("meta-llama/Llama-2-7b-hf")
for precision, size in sizes.items():
    print(f"{precision}: {size:.2f} GB")

推理速度对比

import time

def benchmark_inference(model, tokenizer, prompts, num_runs=10, max_new_tokens=100):
    """Measure average generation latency and a nominal token throughput.

    For every prompt one short warm-up generation is run, then
    ``num_runs`` timed generations; the per-prompt latency is the mean
    wall-clock time of those runs.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        tokenizer: Callable turning a prompt into the keyword inputs for
            ``generate``. The inputs are passed on unchanged, so device
            placement is the caller's responsibility.
        prompts: Iterable of prompt strings; must not be empty.
        num_runs: Timed generations per prompt; must be at least 1.
        max_new_tokens: Generation length requested in the timed runs.

    Returns:
        A dict with ``"avg_latency"`` (seconds per generation, averaged over
        prompts) and ``"throughput"`` (``max_new_tokens / avg_latency``).
        The throughput assumes every run produced the full
        ``max_new_tokens`` tokens.

    Raises:
        ValueError: If ``prompts`` is empty or ``num_runs`` is below 1.
    """
    prompts = list(prompts)
    if not prompts:
        raise ValueError("prompts must contain at least one prompt")
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    latencies = []

    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt")

        # Warm-up run, excluded from the measurement.
        model.generate(**inputs, max_new_tokens=10)

        # perf_counter is monotonic and high resolution, unlike time.time.
        start = time.perf_counter()
        for _ in range(num_runs):
            model.generate(**inputs, max_new_tokens=max_new_tokens)
        latencies.append((time.perf_counter() - start) / num_runs)

    avg_latency = sum(latencies) / len(latencies)
    # Guard against a zero reading from the timer.
    throughput = max_new_tokens / avg_latency if avg_latency > 0 else float("inf")

    return {
        "avg_latency": avg_latency,
        "throughput": throughput,  # tokens/second
    }

质量评估

def evaluate_quantized_quality(original_model, quantized_model, tokenizer, eval_dataset):
    """Score a quantized model by logit similarity to the original model.

    Each sample is run through both models; the two logit tensors are
    flattened and compared with cosine similarity. The result is the mean
    over samples.

    Args:
        original_model: Reference model; calling it with the tokenizer
            output must return an object with a ``logits`` tensor.
        quantized_model: Quantized model with the same calling convention.
        tokenizer: Callable producing the keyword inputs for both models.
        eval_dataset: Iterable of mappings with ``"text"`` and ``"id"``
            keys. Samples sharing an ``"id"`` overwrite each other.

    Returns:
        Mean cosine similarity as a Python float (1.0 means identical
        direction).

    Raises:
        ValueError: If ``eval_dataset`` yields no samples.
    """
    # Local import: the module only imports `torch` and `torch.nn`.
    import torch.nn.functional as F

    results = {}

    for sample in eval_dataset:
        # truncation=True is needed for max_length to take effect.
        inputs = tokenizer(
            sample["text"], return_tensors="pt", max_length=512, truncation=True
        )

        # Forward both models without tracking gradients.
        with torch.no_grad():
            original_output = original_model(**inputs).logits
            quantized_output = quantized_model(**inputs).logits

        # Compare in float32 on one device; the two models may differ in
        # dtype or placement.
        reference = original_output.flatten().float()
        candidate = quantized_output.flatten().to(reference.device).float()

        results[sample["id"]] = F.cosine_similarity(reference, candidate, dim=0).item()

    if not results:
        raise ValueError("eval_dataset must contain at least one sample")

    return sum(results.values()) / len(results)

与vLLM集成

from vllm import LLM, SamplingParams

# Serve the GPTQ-quantized checkpoint through vLLM.
engine_kwargs = {
    "model": "./llama2-7b-gptq-4bit",
    "quantization": "gptq",  # select the quantization method
    "max_model_len": 4096,
    "gpu_memory_utilization": 0.9,
}
llm = LLM(**engine_kwargs)

# Generation settings and the batch of prompts.
sampling_params = SamplingParams(temperature=0.7, max_tokens=256)
prompts = ["什么是机器学习?", "如何学习Python?"]

# Run inference and print the first completion of every request.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    first_completion = output.outputs[0]
    print(first_completion.text)

常见问题

量化质量下降

# Remedies for degraded quality after quantization: remedy -> explanation.
solutions = dict(
    [
        ("增加校准数据", "使用更多、更多样化的校准数据"),
        ("减小group_size", "使用更小的分组大小(如64)"),
        ("使用desc_act", "启用激活值排序量化"),
        ("混合精度", "对敏感层使用更高精度"),
    ]
)

显存不足

# 解决方案
def quantize_with_cpu_offload(model_name, calibration_data, quantize_config=None):
    """Quantize a model with GPTQ while capping GPU memory use.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.
        calibration_data: Calibration examples handed to ``model.quantize``
            unchanged.
        quantize_config: Optional ``BaseQuantizeConfig``. ``from_pretrained``
            requires one, so a 4-bit, group-size-128 configuration is
            created when none is given.

    Returns:
        The quantized model object.
    """
    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

    if quantize_config is None:
        quantize_config = BaseQuantizeConfig(bits=4, group_size=128)

    model = AutoGPTQForCausalLM.from_pretrained(
        model_name,
        quantize_config,
        device_map="cpu",  # use the CPU
        # Budget per device: GPU 0 and CPU RAM.
        # NOTE(review): AutoGPTQ appears to derive its own device map from
        # max_memory, which may override device_map — confirm.
        max_memory={0: "20GB", "cpu": "100GB"}
    )

    model.quantize(calibration_data)
    return model

GPTQ作为高效的训练后量化方法,已成为LLM部署的标配技术,能够显著降低模型部署成本。