DeepSeek:深度求索大模型
---
title: "DeepSeek:深度求索大模型"
description: "深入了解DeepSeek系列模型的创新架构和代码能力"
tags: ["DeepSeek", "深度求索", "代码模型", "MoE"]
category: "llm"
icon: "🧠"
---
DeepSeek:深度求索大模型
DeepSeek简介
DeepSeek(深度求索)是深度求索公司开发的大语言模型系列。DeepSeek以其创新的MoE(混合专家)架构、出色的代码能力和高效的训练方法著称。
DeepSeek的核心优势:
- MoE架构:高效的稀疏专家模型
- 代码能力:出色的代码生成和理解能力
- 开源开放:完全开源模型权重
- 创新架构:Multi-head Latent Attention等创新
DeepSeek架构
Multi-head Latent Attention (MLA)
# DeepSeek-V2引入的MLA机制
# 通过低秩压缩KV Cache减少内存使用
class MultiHeadLatentAttention(nn.Module):
    """Simplified Multi-head Latent Attention (MLA) layer.

    Keys and values are not projected from the hidden states directly.
    The input is first squeezed into a ``kv_lora_rank``-wide latent, and the
    per-head keys and values are expanded from that shared latent.

    Args:
        hidden_size: Width of the input and output features.
        num_heads: Number of attention heads.
        kv_lora_rank: Width of the shared key/value latent.
    """

    def __init__(self, hidden_size, num_heads, kv_lora_rank=512):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.kv_lora_rank = kv_lora_rank

        inner_dim = num_heads * self.head_dim

        # Query projection.
        self.q_proj = nn.Linear(hidden_size, inner_dim)
        # Low-rank key/value path: one shared down-projection followed by
        # separate up-projections for keys and values.
        self.kv_down_proj = nn.Linear(hidden_size, kv_lora_rank)
        self.k_up_proj = nn.Linear(kv_lora_rank, inner_dim)
        self.v_up_proj = nn.Linear(kv_lora_rank, inner_dim)
        self.o_proj = nn.Linear(inner_dim, hidden_size)

    def forward(self, x):
        """Apply scaled dot-product self-attention to ``x``.

        No attention mask is applied, so every position attends to every
        position of the same sequence.

        Args:
            x: Tensor unpacked as ``[batch, seq, hidden]``.

        Returns:
            The output of ``o_proj``, shaped ``[batch, seq, hidden_size]``.
        """
        batch_size, seq_len, _ = x.shape
        heads_shape = (batch_size, seq_len, self.num_heads, self.head_dim)

        # Shared key/value latent: [batch, seq, kv_lora_rank].
        latent = self.kv_down_proj(x)

        # Split into heads and move the head axis forward:
        # [batch, heads, seq, head_dim].
        queries = self.q_proj(x).view(heads_shape).transpose(1, 2)
        keys = self.k_up_proj(latent).view(heads_shape).transpose(1, 2)
        values = self.v_up_proj(latent).view(heads_shape).transpose(1, 2)

        # Scaled dot-product attention.
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / (self.head_dim ** 0.5)
        probs = F.softmax(scores, dim=-1)
        context = torch.matmul(probs, values)

        # Merge the heads back into a single feature axis.
        merged = context.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
        return self.o_proj(merged)
DeepSeekMoE
# DeepSeekMoE:更细粒度的专家划分
class DeepSeekMoE(nn.Module):
    """Sparse mixture-of-experts feed-forward layer with top-k routing.

    A linear gate scores every expert for every token. Each token is sent to
    its ``num_experts_per_tok`` highest-scoring experts, and their outputs
    are combined with softmax-normalised gate weights.

    ``SwiGLU`` is not defined in this snippet; it must be provided by the
    surrounding code and accept ``(hidden_size, intermediate_size)``.

    Args:
        hidden_size: Width of the token features.
        intermediate_size: Inner width passed to each expert.
        num_experts: Total number of experts.
        num_experts_per_tok: Number of experts each token is routed to.
    """

    def __init__(self, hidden_size, intermediate_size, num_experts=64, num_experts_per_tok=6):
        super().__init__()
        self.num_experts = num_experts
        self.num_experts_per_tok = num_experts_per_tok

        # Router: one logit per expert.
        self.gate = nn.Linear(hidden_size, num_experts, bias=False)

        # Expert networks.
        self.experts = nn.ModuleList([
            SwiGLU(hidden_size, intermediate_size)
            for _ in range(num_experts)
        ])

    def forward(self, x):
        """Route every token through its top-k experts and mix the results.

        Args:
            x: Tensor unpacked as ``[batch, seq, hidden]``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        batch_size, seq_len, hidden_size = x.shape

        # Routing is per token, so work on a flat [batch * seq, hidden] view.
        # The router outputs below are indexed by flat token position too.
        tokens = x.reshape(-1, hidden_size)

        # Pick the top-k experts per token; normalise only the selected logits.
        gate_logits = self.gate(tokens)
        weights, indices = torch.topk(gate_logits, self.num_experts_per_tok, dim=-1)
        weights = F.softmax(weights, dim=-1)

        output = torch.zeros_like(tokens)
        for expert_id, expert in enumerate(self.experts):
            # (token, slot) pairs routed to this expert.
            token_idx, slot_idx = torch.where(indices == expert_id)
            if token_idx.numel() == 0:
                continue

            expert_output = expert(tokens[token_idx])
            gate_weight = weights[token_idx, slot_idx].unsqueeze(-1)

            # Accumulate in place on `output` itself. Chained indexing such as
            # output[mask][sel] += ... would update a temporary copy instead.
            output.index_add_(0, token_idx, (expert_output * gate_weight).to(output.dtype))

        return output.view(batch_size, seq_len, hidden_size)
DeepSeek版本
# DeepSeek版本演进
# Overview of the DeepSeek model line-up, keyed by model name.
# Each row is (model name, parameter count, context window, highlight).
versions = {
    name: {"参数": params, "上下文": context, "特点": feature}
    for name, params, context, feature in (
        ("DeepSeek-7B", "7B", "4K", "基础版本"),
        ("DeepSeek-16B", "16B", "4K", "更大模型"),
        ("DeepSeek-Coder", "6.7B/33B", "16K", "代码专用"),
        ("DeepSeek-V2", "236B (21B激活)", "128K", "MLA,MoE"),
        ("DeepSeek-V3", "671B (37B激活)", "128K", "最新旗舰"),
        ("DeepSeek-R1", "671B", "128K", "推理模型"),
    )
}
使用DeepSeek
基本推理
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load DeepSeek-V2-Chat and its tokenizer. The checkpoint ships custom
# modelling code, hence trust_remote_code=True.
model_name = "deepseek-ai/DeepSeek-V2-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # bfloat16 keeps float32's exponent range. float16 has a much narrower
    # range and can overflow to inf/NaN in a model of this size.
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Build the chat prompt; add_generation_prompt=True appends the assistant
# turn marker so the model starts answering.
messages = [
    {"role": "system", "content": "你是一个有帮助的助手"},
    {"role": "user", "content": "什么是混合专家模型?"}
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

# Generate, passing an explicit pad token so generate() does not have to
# guess one.
outputs = model.generate(
    inputs,
    max_new_tokens=512,
    pad_token_id=tokenizer.eos_token_id,
)

# outputs[0] contains prompt + completion; decode only the new tokens.
response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
print(response)
使用vLLM部署
from vllm import LLM, SamplingParams
# Serve DeepSeek-V2-Chat through vLLM: build the engine, then run a single
# sampled generation and print the first completion.
# NOTE(review): whether 4 GPUs and a 128000-token window are enough depends on
# the target hardware — confirm against the deployment environment.
llm = LLM(
model="deepseek-ai/DeepSeek-V2-Chat",
tensor_parallel_size=4, # requires multiple GPUs
max_model_len=128000,
trust_remote_code=True
)
sampling_params = SamplingParams(temperature=0.7, max_tokens=512)
# NOTE(review): the prompt is passed as raw text; presumably a chat-tuned
# model expects its chat template to be applied first — confirm.
outputs = llm.generate(["什么是MoE架构?"], sampling_params)
# One result per prompt; each result holds its candidate completions.
print(outputs[0].outputs[0].text)
DeepSeek-Coder
# 代码生成
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the instruction-tuned DeepSeek-Coder checkpoint and its tokenizer.
model_name = "deepseek-ai/deepseek-coder-33b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Code generation request.
messages = [
    {"role": "user", "content": "写一个快速排序算法"}
]

# The loaded causal-LM class has no `chat()` helper, so format the
# conversation with the tokenizer's chat template and call generate().
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(
    inputs,
    max_new_tokens=512,
    do_sample=False,  # greedy decoding gives reproducible code output
    eos_token_id=tokenizer.eos_token_id,
)

# outputs[0] contains prompt + completion; decode only the new tokens.
response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
print(response)
微调DeepSeek
LoRA微调
from peft import LoraConfig, get_peft_model
# LoRA configuration: wrap `model` with low-rank adapters and report how many
# parameters remain trainable. `model` is not defined in this snippet; it must
# already be loaded by the surrounding code.
# NOTE(review): with r=64 and lora_alpha=16 the adapter scale is presumably
# lora_alpha / r = 0.25 — confirm this is intended.
# NOTE(review): the target_modules names must exist in the loaded model;
# verify against model.named_modules() for the architecture in use.
lora_config = LoraConfig(
r=64,
lora_alpha=16,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
性能评估
# DeepSeek性能
# Benchmark summary, keyed by model name. Scores are kept as strings.
performance = {}

# General-purpose chat model.
performance["DeepSeek-V2-Chat"] = {
    "MMLU": "78.5",
    "HumanEval": "75.6",
    "代码能力": "出色",
    "优势": "MoE高效,代码强",
}

# Code-specialised model.
performance["DeepSeek-Coder-33B"] = {
    "HumanEval": "56.1",
    "MBPP": "66.0",
    "优势": "专业代码模型",
}
最佳实践
- 选择版本:代码任务使用Coder,通用任务使用V2或更新的V3,复杂推理任务使用R1
- 多GPU部署:大模型需要张量并行
- 利用MoE:激活参数少,推理高效
- 长上下文:充分利用128K上下文
- 代码应用:使用Coder进行代码生成和补全
DeepSeek通过创新的MoE架构和出色的代码能力,在开源LLM领域占据重要地位。