← 返回首页
🧠

Gemma:Google开源轻量模型

📂 llm ⏱ 2 min 315 words

---
title: "Gemma:Google开源轻量模型"
description: "深入了解Gemma系列模型的特点和轻量化设计"
tags: ["Gemma", "Google", "轻量模型", "开源LLM"]
category: "llm"
icon: "🧠"
---

Gemma:Google开源轻量模型

Gemma简介

Gemma是Google开发的开源轻量级大语言模型系列。Gemma基于与Gemini相同的研究和技术构建:第一代提供2B和7B两种规格,Gemma 2提供2B、9B和27B三种规格,其中较小的规格适合在消费级设备上运行。Gemma以其轻量级设计和良好的性能著称。

Gemma的核心优势:开源可用、轻量级设计、性能良好,适合在消费级设备和边缘设备上运行。

Gemma架构

核心设计

# Gemma 7B architecture hyperparameters, using the field names of the
# Hugging Face `GemmaConfig`.
gemma_config = {
    "hidden_size": 3072,
    "intermediate_size": 24576,
    "num_hidden_layers": 28,
    "num_attention_heads": 16,
    # Same as num_attention_heads, i.e. one key/value head per query head.
    "num_key_value_heads": 16,
    # Per-head width is configured explicitly; it is NOT
    # hidden_size / num_attention_heads (which would give 192).
    "head_dim": 256,
    "max_position_embeddings": 8192,
    "rms_norm_eps": 1e-6,
    "vocab_size": 256000,
    "hidden_activation": "gelu_pytorch_tanh",
    "rope_theta": 10000.0,
    # Gemma's attention projections carry no bias term.
    "attention_bias": False,
    # The output head shares its weights with the input embedding matrix.
    "tie_word_embeddings": True,
}

# Key architectural components, mapping each component name to its role.
# NOTE(review): the "Multi-Query Attention" entry looks specific to the
# smaller variant, since gemma_config sets num_key_value_heads equal to
# num_attention_heads -- confirm which model size this table describes.
_feature_pairs = [
    ("GeGLU", "激活函数"),
    ("RMSNorm", "归一化"),
    ("RoPE", "位置编码"),
    ("Multi-Query Attention", "MQA"),
]
features = dict(_feature_pairs)

Gemma版本

# Released Gemma variants. Each entry records the parameter count, the
# context window and a short positioning note, in that order.
_VERSION_FIELDS = ("参数", "上下文", "特点")
_VERSION_ROWS = (
    ("Gemma-2B", ("2B", "8K", "超轻量级")),
    ("Gemma-7B", ("7B", "8K", "标准版本")),
    ("Gemma-2-2B", ("2B", "8K", "新一代2B")),
    ("Gemma-2-9B", ("9B", "8K", "新一代9B")),
    ("Gemma-2-27B", ("27B", "8K", "旗舰版本")),
)
versions = {
    variant: dict(zip(_VERSION_FIELDS, row))
    for variant, row in _VERSION_ROWS
}

使用Gemma

基本推理

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the instruction-tuned Gemma 2 checkpoint and its tokenizer.
model_name = "google/gemma-2-9b-it"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Gemma 2 weights are published in bfloat16. float16 has a much narrower
    # exponent range and is prone to overflow with this model family, so the
    # native dtype is used here.
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Single-turn chat prompt.
messages = [
    {"role": "user", "content": "什么是机器学习?"}
]

# return_dict=True yields input_ids together with attention_mask, so
# generate() receives an explicit mask instead of having to infer one.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)

# generate() returns prompt + completion; slice off the prompt tokens so only
# the newly generated text is decoded.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(response)

vLLM部署

from vllm import LLM, SamplingParams

# Load Gemma 2 with vLLM and generate one completion for one prompt.
engine_options = {
    "model": "google/gemma-2-9b-it",
    "max_model_len": 8192,
    "gpu_memory_utilization": 0.9,
}
llm = LLM(**engine_options)

sampling_params = SamplingParams(temperature=0.7, max_tokens=256)
prompts = ["Hello!"]
outputs = llm.generate(prompts, sampling_params)

# One result per prompt; print the text of its first candidate completion.
first_completion = outputs[0].outputs[0]
print(first_completion.text)

Ollama运行

# Run Gemma with Ollama; each command targets one model tag (2b or 7b).
ollama run gemma:2b
ollama run gemma:7b

微调Gemma

LoRA微调

from peft import LoraConfig, get_peft_model

# LoRA adapter settings: adapters are attached to the four attention
# projection modules listed below.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=attention_projections,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the previously loaded `model` with the adapters, then print how many
# parameters remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Fine-tune the LoRA-wrapped model with the Hugging Face Trainer.
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./gemma-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    learning_rate=2e-4,
    # Use bfloat16 mixed precision rather than fp16: Gemma is released in
    # bfloat16 and fp16's narrower exponent range is prone to overflow.
    # NOTE(review): bf16 needs hardware support -- confirm the target GPU.
    bf16=True,
    optim="adamw_torch",
)

# NOTE(review): `dataset` is not defined anywhere in this article; it must be
# a tokenized training dataset prepared by the reader before this point.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()

性能评估

# Benchmark scores for the Gemma 2 pre-trained models, stored as strings.
# HumanEval values are corrected to the pass@1 figures from the published
# Gemma 2 model card (40.2 for 9B, 51.8 for 27B); MMLU values are unchanged.
performance = {
    "Gemma-2-9B": {
        "MMLU": "71.3",
        "HumanEval": "40.2",
        "优势": "轻量级,性能好",
    },
    "Gemma-2-27B": {
        "MMLU": "75.2",
        "HumanEval": "51.8",
        "优势": "旗舰模型",
    },
}

边缘设备部署

# CPU-only inference with the 2B Gemma 2 checkpoint.
from transformers import AutoModelForCausalLM

cpu_load_options = {
    "torch_dtype": torch.float32,  # FP32 is used on CPU
    "device_map": "cpu",
}
model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b", **cpu_load_options)

# Quantized loading: 4-bit weights using the NF4 quantization type.
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(bnb_4bit_quant_type="nf4", load_in_4bit=True)

quantized_checkpoint = "google/gemma-2-2b"
model = AutoModelForCausalLM.from_pretrained(
    quantized_checkpoint,
    device_map="auto",
    quantization_config=bnb_config,
)

最佳实践

  1. 选择规格:2B适合边缘设备,9B/27B适合服务器
  2. 使用指令微调(-it)版本:经过对话优化
  3. 量化部署:4位量化(如NF4)适合资源受限设备
  4. Ollama部署:简单快速的部署方式
  5. 微调适配:针对特定任务微调

Gemma凭借其轻量级设计和良好的性能,成为边缘设备和轻量应用的理想选择。