← 返回首页
🧠

ChatGLM:智谱AI对话模型

📂 llm ⏱ 3 min 474 words

--- title: "ChatGLM:智谱AI对话模型" description: "深入了解ChatGLM系列模型的特点、架构和中文对话能力" tags: ["ChatGLM", "智谱AI", "中文对话", "开源模型"] category: "llm" icon: "🧠"

ChatGLM:智谱AI对话模型

ChatGLM简介

ChatGLM是智谱AI开发的开源对话大语言模型系列。ChatGLM以其出色的中文对话能力、较低的部署门槛和活跃的社区生态,在国内开源LLM领域具有重要影响力。

ChatGLM的核心优势:

ChatGLM架构

GLM架构特点

# ChatGLM使用GLM(General Language Model)架构
# 结合了自回归和自编码的优点

glm_config = {
    "padded_vocab_size": 65024,
    "hidden_size": 4096,
    "num_layers": 28,
    "num_attention_heads": 32,
    "ffn_hidden_size": 13696,  # GLM使用7/4倍扩展
    "hidden_dropout": 0.1,
    "attention_dropout": 0.1,
    "layernorm_epsilon": 1e-5,
    "rms_norm": True,
    "bos_token_id": 150004,
    "eos_token_id": 150005,
    "pad_token_id": 150000,
    "mask_token_id": 150000,
    "apply_residual_connection_post_layernorm": False,
    "rope_theta": 10000.0,
    "max_sequence_length": 4096,
    "num_kv_heads": 2,
    "skip_bias_add": False
}

Multi-Query Attention

# ChatGLM使用Multi-Query Attention减少KV Cache
class MultiQueryAttention(nn.Module):
    def __init__(self, hidden_size, num_heads, num_kv_heads):
        super().__init__()
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = hidden_size // num_heads
        self.num_kv_groups = num_heads // num_kv_heads
        
        self.q_proj = nn.Linear(hidden_size, num_heads * self.head_dim)
        self.k_proj = nn.Linear(hidden_size, num_kv_heads * self.head_dim)
        self.v_proj = nn.Linear(hidden_size, num_kv_heads * self.head_dim)
        self.o_proj = nn.Linear(hidden_size, hidden_size)
    
    def forward(self, x, mask=None):
        batch_size, seq_len, _ = x.shape
        
        q = self.q_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(batch_size, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(batch_size, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2)
        
        # 扩展KV头
        k = k.repeat_interleave(self.num_kv_groups, dim=1)
        v = v.repeat_interleave(self.num_kv_groups, dim=1)
        
        attn = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        if mask is not None:
            attn = attn.masked_fill(mask, float('-inf'))
        attn = F.softmax(attn, dim=-1)
        
        output = torch.matmul(attn, v)
        output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
        return self.o_proj(output)

ChatGLM版本

# ChatGLM版本演进
versions = {
    "ChatGLM-6B": {
        "参数": "6.2B",
        "上下文": "2K",
        "特点": "首个开源版本"
    },
    "ChatGLM2-6B": {
        "参数": "6.2B",
        "上下文": "32K",
        "特点": "GQA,Flash Attention"
    },
    "ChatGLM3-6B": {
        "参数": "6.2B",
        "上下文": "8K",
        "特点": "工具调用,代码解释器"
    },
    "GLM-4": {
        "参数": "多规格",
        "上下文": "128K",
        "特点": "新一代架构"
    }
}

使用ChatGLM

基本使用

from transformers import AutoTokenizer, AutoModel

# 加载ChatGLM3
model_path = "THUDM/chatglm3-6b"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map="auto"
).eval()

# 对话推理
response, history = model.chat(
    tokenizer,
    "你好,请介绍一下自己",
    history=[]
)
print(response)

# 多轮对话
response, history = model.chat(
    tokenizer,
    "你能做什么?",
    history=history
)
print(response)

流式输出

# 流式生成
for response, history in model.stream_chat(
    tokenizer,
    "写一首关于春天的诗",
    history=[]
):
    print(response, end="", flush=True)
print()

vLLM部署

from vllm import LLM, SamplingParams

# 部署ChatGLM3
llm = LLM(
    model="THUDM/chatglm3-6b",
    max_model_len=8192,
    trust_remote_code=True,
    gpu_memory_utilization=0.9
)

sampling_params = SamplingParams(temperature=0.7, max_tokens=512)
outputs = llm.generate(["什么是机器学习?"], sampling_params)
print(outputs[0].outputs[0].text)

微调ChatGLM

LoRA微调

from peft import LoraConfig, get_peft_model

# LoRA配置
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# 加载模型
model = AutoModel.from_pretrained(
    "THUDM/chatglm3-6b",
    trust_remote_code=True
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

P-Tuning v2

from peft import get_peft_model, TaskType
from peft import PrefixTuningConfig

# P-Tuning v2配置
p-tuning_config = PrefixTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=128,
    prefix_hidden_size=4096
)

model = get_peft_model(model, p-tuning_config)

ChatGLM工具调用

# ChatGLM3支持工具调用
import json

# 定义工具
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "获取指定城市的天气信息",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "城市名称"}
                },
                "required": ["city"]
            }
        }
    }
]

# 使用工具
response, history = model.chat(
    tokenizer,
    "北京今天天气怎么样?",
    history=[],
    tools=tools
)
print(response)

性能评估

# ChatGLM性能
performance = {
    "ChatGLM3-6B": {
        "C-Eval": "66.4",
        "MMLU": "59.5",
        "CMMLU": "67.7",
        "HumanEval": "58.5",
        "优势": "中文对话流畅"
    },
    "GLM-4-9B": {
        "C-Eval": "75.6",
        "MMLU": "74.7",
        "优势": "综合能力强"
    }
}

量化部署

# INT4量化
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModel.from_pretrained(
    "THUDM/chatglm3-6b",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

最佳实践

  1. 选择版本:ChatGLM3适合对话,GLM-4适合复杂任务
  2. 使用工具调用:利用ChatGLM3的function calling能力
  3. 量化部署:INT4量化可在消费级GPU运行
  4. 社区资源:利用开源社区的微调数据和代码
  5. 持续更新:关注智谱AI的模型更新

ChatGLM凭借其优秀的中文能力和活跃的社区生态,成为国内开源LLM的重要选择。