ChatGLM:智谱AI对话模型
--- title: "ChatGLM:智谱AI对话模型" description: "深入了解ChatGLM系列模型的特点、架构和中文对话能力" tags: ["ChatGLM", "智谱AI", "中文对话", "开源模型"] category: "llm" icon: "🧠"
ChatGLM:智谱AI对话模型
ChatGLM简介
ChatGLM是智谱AI开发的开源对话大语言模型系列。ChatGLM以其出色的中文对话能力、较低的部署门槛和活跃的社区生态,在国内开源LLM领域具有重要影响力。
ChatGLM的核心优势:
- 中文优化:针对中文对话深度优化
- 部署友好:支持INT4量化,消费级GPU可运行
- 对话流畅:自然的多轮对话能力
- 开源生态:社区活跃,衍生模型众多
ChatGLM架构
GLM架构特点
# ChatGLM使用GLM(General Language Model)架构
# 结合了自回归和自编码的优点
glm_config = {
"padded_vocab_size": 65024,
"hidden_size": 4096,
"num_layers": 28,
"num_attention_heads": 32,
"ffn_hidden_size": 13696, # GLM使用7/4倍扩展
"hidden_dropout": 0.1,
"attention_dropout": 0.1,
"layernorm_epsilon": 1e-5,
"rms_norm": True,
"bos_token_id": 150004,
"eos_token_id": 150005,
"pad_token_id": 150000,
"mask_token_id": 150000,
"apply_residual_connection_post_layernorm": False,
"rope_theta": 10000.0,
"max_sequence_length": 4096,
"num_kv_heads": 2,
"skip_bias_add": False
}
Multi-Query Attention
# ChatGLM使用Multi-Query Attention减少KV Cache
class MultiQueryAttention(nn.Module):
def __init__(self, hidden_size, num_heads, num_kv_heads):
super().__init__()
self.num_heads = num_heads
self.num_kv_heads = num_kv_heads
self.head_dim = hidden_size // num_heads
self.num_kv_groups = num_heads // num_kv_heads
self.q_proj = nn.Linear(hidden_size, num_heads * self.head_dim)
self.k_proj = nn.Linear(hidden_size, num_kv_heads * self.head_dim)
self.v_proj = nn.Linear(hidden_size, num_kv_heads * self.head_dim)
self.o_proj = nn.Linear(hidden_size, hidden_size)
def forward(self, x, mask=None):
batch_size, seq_len, _ = x.shape
q = self.q_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
k = self.k_proj(x).view(batch_size, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2)
v = self.v_proj(x).view(batch_size, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2)
# 扩展KV头
k = k.repeat_interleave(self.num_kv_groups, dim=1)
v = v.repeat_interleave(self.num_kv_groups, dim=1)
attn = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
if mask is not None:
attn = attn.masked_fill(mask, float('-inf'))
attn = F.softmax(attn, dim=-1)
output = torch.matmul(attn, v)
output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
return self.o_proj(output)
ChatGLM版本
# ChatGLM版本演进
versions = {
"ChatGLM-6B": {
"参数": "6.2B",
"上下文": "2K",
"特点": "首个开源版本"
},
"ChatGLM2-6B": {
"参数": "6.2B",
"上下文": "32K",
"特点": "GQA,Flash Attention"
},
"ChatGLM3-6B": {
"参数": "6.2B",
"上下文": "8K",
"特点": "工具调用,代码解释器"
},
"GLM-4": {
"参数": "多规格",
"上下文": "128K",
"特点": "新一代架构"
}
}
使用ChatGLM
基本使用
from transformers import AutoTokenizer, AutoModel
# 加载ChatGLM3
model_path = "THUDM/chatglm3-6b"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
model_path,
trust_remote_code=True,
device_map="auto"
).eval()
# 对话推理
response, history = model.chat(
tokenizer,
"你好,请介绍一下自己",
history=[]
)
print(response)
# 多轮对话
response, history = model.chat(
tokenizer,
"你能做什么?",
history=history
)
print(response)
流式输出
# 流式生成
for response, history in model.stream_chat(
tokenizer,
"写一首关于春天的诗",
history=[]
):
print(response, end="", flush=True)
print()
vLLM部署
from vllm import LLM, SamplingParams
# 部署ChatGLM3
llm = LLM(
model="THUDM/chatglm3-6b",
max_model_len=8192,
trust_remote_code=True,
gpu_memory_utilization=0.9
)
sampling_params = SamplingParams(temperature=0.7, max_tokens=512)
outputs = llm.generate(["什么是机器学习?"], sampling_params)
print(outputs[0].outputs[0].text)
微调ChatGLM
LoRA微调
from peft import LoraConfig, get_peft_model
# LoRA配置
lora_config = LoraConfig(
r=8,
lora_alpha=32,
target_modules=["query_key_value"],
lora_dropout=0.1,
bias="none",
task_type="CAUSAL_LM"
)
# 加载模型
model = AutoModel.from_pretrained(
"THUDM/chatglm3-6b",
trust_remote_code=True
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
P-Tuning v2
from peft import get_peft_model, TaskType
from peft import PrefixTuningConfig
# P-Tuning v2配置
p-tuning_config = PrefixTuningConfig(
task_type=TaskType.CAUSAL_LM,
num_virtual_tokens=128,
prefix_hidden_size=4096
)
model = get_peft_model(model, p-tuning_config)
ChatGLM工具调用
# ChatGLM3支持工具调用
import json
# 定义工具
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "获取指定城市的天气信息",
"parameters": {
"type": "object",
"properties": {
"city": {"type": "string", "description": "城市名称"}
},
"required": ["city"]
}
}
}
]
# 使用工具
response, history = model.chat(
tokenizer,
"北京今天天气怎么样?",
history=[],
tools=tools
)
print(response)
性能评估
# ChatGLM性能
performance = {
"ChatGLM3-6B": {
"C-Eval": "66.4",
"MMLU": "59.5",
"CMMLU": "67.7",
"HumanEval": "58.5",
"优势": "中文对话流畅"
},
"GLM-4-9B": {
"C-Eval": "75.6",
"MMLU": "74.7",
"优势": "综合能力强"
}
}
量化部署
# INT4量化
from transformers import BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16
)
model = AutoModel.from_pretrained(
"THUDM/chatglm3-6b",
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True
)
最佳实践
- 选择版本:ChatGLM3适合对话,GLM-4适合复杂任务
- 使用工具调用:利用ChatGLM3的function calling能力
- 量化部署:INT4量化可在消费级GPU运行
- 社区资源:利用开源社区的微调数据和代码
- 持续更新:关注智谱AI的模型更新
ChatGLM凭借其优秀的中文能力和活跃的社区生态,成为国内开源LLM的重要选择。