← 返回首页
🧠

Hugging Face入门指南

📂 llm ⏱ 3 min 422 words

---
title: "Hugging Face入门指南"
description: "全面介绍Hugging Face生态系统,包括Hub、Transformers和Spaces"
tags: ["HuggingFace", "Transformers", "NLP", "开源"]
category: "llm"
icon: "🧠"
---

Hugging Face入门指南

Hugging Face简介

Hugging Face是AI/ML领域最重要的开源社区和工具平台,提供模型、数据集、工具和应用的共享生态。

核心组件

# Core pieces of the Hugging Face ecosystem: component name -> short description.
components = {
    "HuggingFace Hub": "模型和数据集托管平台",
    "Transformers": "预训练模型库",
    "Datasets": "数据集加载和处理工具",
    "Tokenizers": "高效分词器",
    "Accelerate": "分布式训练工具",
    "PEFT": "参数高效微调",
    "Spaces": "应用部署平台",
}

快速开始

安装

# Install the Hugging Face libraries, then a deep-learning backend
pip install transformers datasets tokenizers
pip install torch  # or tensorflow

加载预训练模型

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and its matching tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Encode the prompt as PyTorch tensors
input_text = "The future of AI is"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate a continuation.
# GPT-2 defines no padding token, so generate() would otherwise emit a warning
# and silently fall back to the EOS token; pass that choice explicitly.
output = model.generate(
    **inputs,
    max_length=50,  # total length in tokens, prompt included
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the first (and only) returned sequence back into text
print(tokenizer.decode(output[0], skip_special_tokens=True))

使用Pipeline

from transformers import pipeline

# Text generation
generator = pipeline("text-generation", model="gpt2")
result = generator("The future of AI is", max_length=50)
print(result[0]["generated_text"])

# Sentiment analysis
# Name the checkpoint explicitly, like the other two pipelines here: without
# `model=` the library picks a task default (and warns), so results would
# depend on whatever default the installed version ships.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
result = classifier("I love this product!")
print(result)

# Question answering: extract the answer to `question` from `context`
qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
result = qa(question="What is Hugging Face?", context="Hugging Face is an AI company.")
print(result)

Hub使用

搜索模型

from huggingface_hub import HfApi

api = HfApi()

# Search the Hub for models matching "llama": at most 10 results are
# requested, sorted on the "downloads" field.
# NOTE(review): direction=-1 presumably requests descending order — confirm
# against the installed huggingface_hub version.
models = api.list_models(
    search="llama",
    limit=10,
    sort="downloads",
    direction=-1
)

# Print each result's repo id together with its download count
for model in models:
    print(f"{model.id}: {model.downloads} downloads")

下载模型

from huggingface_hub import snapshot_download

# Download the entire model repository into `local_dir`; the call's return
# value is kept in `path`.
# NOTE(review): this repo may be access-restricted and require a prior
# `huggingface-cli login` — confirm before running.
path = snapshot_download(
    repo_id="meta-llama/Llama-2-7b-hf",
    local_dir="./models/llama-2-7b"
)

上传模型

from huggingface_hub import HfApi

api = HfApi()

# Create the target repository.
# exist_ok=True keeps the script re-runnable: without it, a second run fails
# because the repository already exists.
api.create_repo(repo_id="my-username/my-model", exist_ok=True)

# Upload a single local file to the same path inside the repository
api.upload_file(
    path_or_fileobj="model.safetensors",
    path_in_repo="model.safetensors",
    repo_id="my-username/my-model",
)

Transformers库详解

模型类

from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoModelForQuestionAnswering
)

# Pick the Auto class that matches the task: causal language modelling,
# sequence classification, and question answering respectively.
model_causal = AutoModelForCausalLM.from_pretrained("gpt2")
model_cls = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
model_qa = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

分词器

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode: calling the tokenizer on a string yields a mapping whose
# "input_ids" entry holds the token ids
text = "Hello, how are you?"
encoded = tokenizer(text)
print(f"Token IDs: {encoded['input_ids']}")

# Decode: turn the token ids back into a string
decoded = tokenizer.decode(encoded["input_ids"])
print(f"Decoded: {decoded}")

# Batch encoding: several texts in one call, with padding and truncation enabled
batch = tokenizer(["Hello", "World"], padding=True, truncation=True)

微调示例

from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("imdb")

# Tokenize the raw reviews. Trainer passes dataset columns straight to the
# model, which needs input_ids / attention_mask rather than the raw "text"
# column, so the untokenized dataset cannot be trained on directly.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def _tokenize_batch(batch):
    """Tokenize the "text" column of a batch, padded/truncated to a fixed length."""
    return tokenizer(batch["text"], padding="max_length", truncation=True)


tokenized_imdb = dataset.map(_tokenize_batch, batched=True)

# Load the model with a 2-class classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Training hyperparameters
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    # Spelled `evaluation_strategy` in older transformers releases; that name
    # is deprecated in favour of `eval_strategy`.
    eval_strategy="epoch",
)

# Create the Trainer on the tokenized splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
)

# Start training (left commented out so the example does not launch a long
# training run when executed as-is)
# trainer.train()

Datasets库

加载数据集

from datasets import load_dataset

# Load a standard dataset by name
dataset = load_dataset("squad")

# Inspect the dataset: print the object itself, then the first record of
# the "train" split
print(dataset)
print(dataset["train"][0])

数据处理

from datasets import Dataset

# Build a custom dataset from an in-memory dict of columns: four texts with
# their matching integer labels
data = {
    "text": ["Hello", "World", "Foo", "Bar"],
    "label": [0, 1, 0, 1]
}
dataset = Dataset.from_dict(data)

# 数据转换
def tokenize_function(examples):
    """Tokenize the "text" entries of ``examples`` with max-length padding and truncation."""
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Run tokenize_function over the dataset through map(), with batched=True
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Spaces部署

创建应用

# app.py - Gradio应用示例
import gradio as gr
from transformers import pipeline

# Name the checkpoint explicitly: without `model=` the library falls back to
# a task default (and warns), so a deployed app would depend on whichever
# default the installed transformers version ships.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

def analyze(text):
    """Classify ``text`` and return the first prediction's label and score."""
    top_prediction = classifier(text)[0]
    return top_prediction["label"], top_prediction["score"]

# Build the Gradio interface: `analyze` is the handler, fed by one text input
# and feeding two outputs (a text field and a number field), matching the
# two values `analyze` returns.
demo = gr.Interface(
    fn=analyze,
    inputs="text",
    outputs=["text", "number"],
    title="情感分析"
)

# Start the app
demo.launch()

最佳实践

# 1. Use the Auto classes to select the model class automatically
from transformers import AutoModel
model = AutoModel.from_pretrained("model-name")

# 2. Use pipeline for rapid development
from transformers import pipeline
classifier = pipeline("sentiment-analysis", model="model-name")

# 3. Rely on the cache to avoid repeated downloads
# Hugging Face caches downloaded models automatically

# 4. Pin an exact version
# NOTE(review): "main" reads like a branch name rather than a fixed version —
# confirm, and pin a tag or commit hash if exact reproducibility is intended.
model = AutoModel.from_pretrained("meta-llama/Llama-2-7b-hf", revision="main")

常见问题

# Q: 如何访问受限模型?
# A: 使用huggingface-cli login登录后下载

# Q: 模型太大加载不了?
# A: 使用device_map="auto"或量化加载

# Q: 如何加速推理?
# A: 使用Flash Attention、批处理、量化等技术

总结

Hugging Face提供了完整的AI开发生态,从模型获取、训练到部署一站式解决。掌握Hugging Face工具链是现代AI工程师的必备技能。