← 返回首页
🧠

DeepSeek:深度求索大模型

📂 llm ⏱ 3 min 487 words

---
title: "DeepSeek:深度求索大模型"
description: "深入了解DeepSeek系列模型的创新架构和代码能力"
tags: ["DeepSeek", "深度求索", "代码模型", "MoE"]
category: "llm"
icon: "🧠"
---

DeepSeek:深度求索大模型

DeepSeek简介

DeepSeek(深度求索)是深度求索公司开发的大语言模型系列。DeepSeek以其创新的MoE(混合专家)架构、出色的代码能力和高效的训练方法著称。

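DeepSeek的核心优势:

  - 创新的MoE(混合专家)架构:每个token只激活部分专家,推理高效
  - MLA(Multi-head Latent Attention):通过低秩压缩KV Cache减少内存使用
  - 出色的代码能力:提供代码专用的DeepSeek-Coder系列
  - 长上下文:V2及之后的版本支持128K上下文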

DeepSeek架构

Multi-head Latent Attention (MLA)

# DeepSeek-V2引入的MLA机制
# 通过低秩压缩KV Cache减少内存使用

class MultiHeadLatentAttention(nn.Module):
    """Simplified multi-head attention with low-rank compressed keys and values.

    Queries are projected directly from the hidden states. Keys and values
    are instead rebuilt from a shared latent of width ``kv_lora_rank``: the
    input is first projected down to that latent, then projected back up
    separately for keys and for values.

    This teaching version applies no attention mask, no positional encoding
    and keeps no KV cache; it only shows the projection structure.

    Args:
        hidden_size: Width of the input and output hidden states.
        num_heads: Number of attention heads. The per-head width is
            ``hidden_size // num_heads``.
        kv_lora_rank: Width of the shared key/value latent.
    """

    def __init__(self, hidden_size, num_heads, kv_lora_rank=512):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.kv_lora_rank = kv_lora_rank

        # Total width across all heads; equals hidden_size only when
        # hidden_size is divisible by num_heads.
        inner_dim = num_heads * self.head_dim

        # Query path: a single full-rank projection.
        self.q_proj = nn.Linear(hidden_size, inner_dim)

        # Key/value path: one shared down-projection, two up-projections.
        self.kv_down_proj = nn.Linear(hidden_size, kv_lora_rank)
        self.k_up_proj = nn.Linear(kv_lora_rank, inner_dim)
        self.v_up_proj = nn.Linear(kv_lora_rank, inner_dim)

        # Output path: merge heads back to the model width.
        self.o_proj = nn.Linear(inner_dim, hidden_size)

    def forward(self, x):
        """Run unmasked self-attention over ``x``.

        Args:
            x: Tensor whose shape unpacks as (batch, seq, hidden).

        Returns:
            The result of ``o_proj`` applied to the merged heads, with the
            same leading (batch, seq) dimensions as ``x``.
        """
        batch_size, seq_len, _ = x.shape

        def split_heads(projected):
            # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
            per_head = projected.view(
                batch_size, seq_len, self.num_heads, self.head_dim
            )
            return per_head.transpose(1, 2)

        # Shared latent from which both keys and values are rebuilt.
        latent = self.kv_down_proj(x)  # [batch, seq, kv_lora_rank]

        queries = split_heads(self.q_proj(x))
        keys = split_heads(self.k_up_proj(latent))
        values = split_heads(self.v_up_proj(latent))

        # Scaled dot-product attention, normalised over the key positions.
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / (self.head_dim ** 0.5)
        probs = F.softmax(scores, dim=-1)
        context = torch.matmul(probs, values)

        # (batch, heads, seq, head_dim) -> (batch, seq, heads * head_dim)
        merged = context.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
        return self.o_proj(merged)

DeepSeekMoE

# DeepSeekMoE:更细粒度的专家划分

class DeepSeekMoE(nn.Module):
    """Token-level top-k mixture-of-experts feed-forward layer.

    A bias-free linear gate scores every expert for every token. The
    ``num_experts_per_tok`` highest-scoring experts are selected, their
    scores are softmax-normalised among themselves, and the token's output
    is the weighted sum of the selected experts' outputs.

    Args:
        hidden_size: Width of the token representations.
        intermediate_size: Inner width passed to each ``SwiGLU`` expert.
        num_experts: Total number of experts.
        num_experts_per_tok: Number of experts each token is routed to.
    """

    def __init__(self, hidden_size, intermediate_size, num_experts=64, num_experts_per_tok=6):
        super().__init__()
        self.num_experts = num_experts
        self.num_experts_per_tok = num_experts_per_tok

        # Gating network: one logit per expert.
        self.gate = nn.Linear(hidden_size, num_experts, bias=False)

        # Expert networks. SwiGLU is defined outside this block.
        self.experts = nn.ModuleList([
            SwiGLU(hidden_size, intermediate_size)
            for _ in range(num_experts)
        ])

    def forward(self, x):
        """Route each token through its top-k experts and mix the results.

        Args:
            x: Tensor whose shape unpacks as (batch, seq, hidden).

        Returns:
            Tensor with the same shape as ``x``.
        """
        batch_size, seq_len, hidden_size = x.shape

        # Routing is per token, so work on a flat (batch * seq, hidden) view.
        # The gate output, the top-k indices and every mask derived from them
        # live in this flattened token space, so the inputs must too.
        tokens = x.reshape(-1, hidden_size)

        # Select the top-k experts per token and normalise their scores
        # among the selected experts only.
        gate_logits = self.gate(tokens)
        weights, indices = torch.topk(gate_logits, self.num_experts_per_tok, dim=-1)
        weights = F.softmax(weights, dim=-1)

        output = torch.zeros_like(tokens)
        for i, expert in enumerate(self.experts):
            # Rows are the tokens routed to expert i; slot_idx is the top-k
            # slot that chose it. topk returns distinct experts per token,
            # so each token appears at most once here.
            token_idx, slot_idx = torch.where(indices == i)
            if token_idx.numel() == 0:
                continue

            expert_output = expert(tokens[token_idx])
            routing_weight = weights[token_idx, slot_idx].unsqueeze(-1)

            # Accumulate in place on `output` itself. Chained advanced
            # indexing such as output[mask][sub_mask] += ... would only
            # update a temporary copy and leave `output` untouched.
            output.index_add_(
                0, token_idx, (expert_output * routing_weight).to(output.dtype)
            )

        return output.view(batch_size, seq_len, hidden_size)

DeepSeek版本

# DeepSeek版本演进
# Model line-up, keyed by model name. Each row below is
# (name, parameter count, context window, highlight); the comprehension turns
# every row into a dict using the same three keys.
versions = {
    name: {"参数": params, "上下文": context, "特点": highlight}
    for name, params, context, highlight in (
        ("DeepSeek-7B", "7B", "4K", "基础版本"),
        ("DeepSeek-16B", "16B", "4K", "更大模型"),
        ("DeepSeek-Coder", "6.7B/33B", "16K", "代码专用"),
        ("DeepSeek-V2", "236B (21B激活)", "128K", "MLA,MoE"),
        ("DeepSeek-V3", "671B (37B激活)", "128K", "最新旗舰"),
        ("DeepSeek-R1", "671B", "128K", "推理模型"),
    )
}

使用DeepSeek

基本推理

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the DeepSeek-V2 chat model and its tokenizer. The repository ships
# custom modelling code, hence trust_remote_code=True.
model_name = "deepseek-ai/DeepSeek-V2-Chat"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # The upstream usage example loads the weights in bfloat16; float16 has a
    # much narrower exponent range and is not what the checkpoint is
    # published in.
    torch_dtype=torch.bfloat16,
    # NOTE(review): the upstream model card uses device_map="sequential" with
    # an explicit max_memory map and states "auto" cannot be used for this
    # checkpoint -- confirm on the target hardware.
    device_map="auto",
    trust_remote_code=True,
)

# Conversation to answer: a system instruction followed by one user turn.
messages = [
    {"role": "system", "content": "你是一个有帮助的助手"},
    {"role": "user", "content": "什么是混合专家模型?"}
]

# return_dict=True yields both input_ids and attention_mask, so generate()
# receives an explicit mask and this code does not depend on whether
# apply_chat_template returns a bare tensor or a mapping by default.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=512)

# generate() returns prompt + continuation; strip the prompt tokens so only
# the newly generated answer is decoded.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(response)

使用vLLM部署

from vllm import LLM, SamplingParams

# Sampling settings for the request: temperature 0.7, at most 512 new tokens.
sampling_params = SamplingParams(temperature=0.7, max_tokens=512)

# Serve DeepSeek-V2 with vLLM, sharding the model across 4 GPUs with tensor
# parallelism and allowing sequences of up to 128000 tokens.
# NOTE(review): whether 4 devices hold this checkpoint plus a 128000-token
# KV cache depends on per-GPU memory -- confirm on the target hardware.
llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Chat",
    trust_remote_code=True,
    tensor_parallel_size=4,
    max_model_len=128000,
)

# generate() takes a batch of prompts and returns one result per prompt;
# print the text of the first completion of the first (and only) prompt.
outputs = llm.generate(["什么是MoE架构?"], sampling_params)
print(outputs[0].outputs[0].text)

DeepSeek-Coder

# 代码生成
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "deepseek-ai/deepseek-coder-33b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Code-generation request: a single user turn.
messages = [
    {"role": "user", "content": "写一个快速排序算法"}
]

# This checkpoint loads as a standard causal LM, which has no `.chat()`
# helper. Format the conversation with the tokenizer's chat template and call
# generate() instead, as the model card documents.
# return_dict=True also provides the attention mask to generate().
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    # Stop at the tokenizer's end-of-sequence token, as in the model card.
    eos_token_id=tokenizer.eos_token_id,
)

# Drop the prompt tokens so only the generated code is decoded.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(response)

微调DeepSeek

LoRA微调

from peft import LoraConfig, get_peft_model

# LoRA adapter settings for causal-language-model fine-tuning.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Modules that receive adapters.
    # NOTE(review): assumes the loaded model names its attention projections
    # q_proj/k_proj/v_proj/o_proj -- confirm against the actual architecture.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    # Adapter rank and scaling numerator.
    r=64,
    lora_alpha=16,
    # Regularisation: 5% dropout on the adapter path, no bias terms trained.
    lora_dropout=0.05,
    bias="none",
)

# Wrap the already-loaded model with the adapters and report how many
# parameters remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

性能评估

# DeepSeek性能
# Benchmark summary, keyed by model name. Scores are kept as strings exactly
# as quoted in the article; each entry also carries a short qualitative note.
performance = {}

# General chat model.
performance["DeepSeek-V2-Chat"] = {
    "MMLU": "78.5",
    "HumanEval": "75.6",
    "代码能力": "出色",
    "优势": "MoE高效,代码强",
}

# Code-specialised model.
performance["DeepSeek-Coder-33B"] = {
    "HumanEval": "56.1",
    "MBPP": "66.0",
    "优势": "专业代码模型",
}

最佳实践

  1. 选择版本:代码任务使用Coder,通用任务使用V2
  2. 多GPU部署:大模型需要张量并行
  3. 利用MoE:激活参数少,推理高效
  4. 长上下文:充分利用128K上下文
  5. 代码应用:使用Coder进行代码生成和补全

DeepSeek通过创新的MoE架构和出色的代码能力,在开源LLM领域占据重要地位。