开放数据集:开源LLM训练和评测数据集汇总
开放数据集:开源LLM训练和评测数据集汇总
开放数据集的重要性
开放数据集(Open Datasets)是推动大语言模型研究和发展的核心资源。高质量的开源数据集降低了LLM开发的门槛,促进了学术界和工业界的合作创新。
开放数据集的主要价值:
- 研究可复现:提供标准化的实验基础
- 降低开发成本:减少从零构建数据集的需求
- 促进公平比较:在相同数据条件下比较不同方法
- 加速创新:让研究者专注于算法创新而非数据收集
主要开放数据集分类
1. 预训练数据集
The Pile
from datasets import load_dataset
import json
class PileDatasetLoader:
    """Loader and static fact sheet for The Pile pre-training corpus.

    Loading is delegated to ``datasets.load_dataset``; the descriptive
    methods return hard-coded values and perform no I/O.
    """

    def __init__(self) -> None:
        # Component names this loader reports in get_dataset_info().
        self.subsets = [
            "Pile-CC", "PubMed", "ArXiv", "Github", "FreeLaw",
            "StackExchange", "USPTO", "Gutenberg", "Wikipedia"
        ]

    def load_subset(self, subset: str, split: str = "train", streaming: bool = True):
        """Load one subset of The Pile.

        Args:
            subset: Configuration name, passed through unchanged.
            split: Split to load.
            streaming: Forwarded to ``load_dataset``; defaults to True.

        Returns:
            Whatever ``load_dataset`` returns for these arguments.
        """
        # NOTE(review): the name is not validated against self.subsets;
        # confirm these labels match the repository's configuration names.
        return load_dataset(
            "EleutherAI/the_pile",
            subset,
            split=split,
            streaming=streaming
        )

    def get_dataset_info(self) -> dict:
        """Return a hard-coded summary of the dataset."""
        return {
            "name": "The Pile",
            "size": "825 GiB",
            # NOTE(review): this key holds a token count, not a sample count.
            "samples": "210B tokens",
            "subsets": self.subsets,
            "description": "800GB diverse, open-source text dataset"
        }

    def analyze_composition(self) -> dict:
        """Return a hard-coded mapping of subset name to size string."""
        # NOTE(review): static figures; verify against the published datasheet.
        return {
            "Pile-CC": "180 GiB",
            "PubMed": "28 GiB",
            "ArXiv": "85 GiB",
            "Github": "100 GiB",
            "FreeLaw": "26 GiB",
            "StackExchange": "33 GiB",
            "USPTO": "16 GiB",
            "Gutenberg": "7 GiB",
            "Wikipedia": "20 GiB"
        }
# Usage example
loader = PileDatasetLoader()
# pile_data = loader.load_subset("Pile-CC", streaming=True)
RedPajama
class RedPajamaLoader:
    """Loader and static statistics for the RedPajama-Data-1T corpus."""

    def __init__(self) -> None:
        self.version = "v1"
        # NOTE(review): confirm these match the repository's configuration
        # names before relying on them as load_dataset() arguments.
        self.subsets = [
            "common_crawl", "github", "wikipedia", "books",
            "arxiv", "stackexchange"
        ]

    def load_dataset(self, subset: str, split: str = "train"):
        """Load one RedPajama subset via ``datasets.load_dataset``.

        Args:
            subset: Configuration name, passed through unchanged.
            split: Split to load.
        """
        # Inside a method the bare name resolves to the module-level
        # ``load_dataset`` function, not to this method.
        return load_dataset(
            "togethercomputer/RedPajama-Data-1T",
            subset,
            split=split
        )

    def get_statistics(self) -> dict:
        """Return hard-coded token statistics for the corpus."""
        # NOTE(review): the per-subset figures below sum to 1064B, not the
        # stated 1.21T total; verify against the dataset card.
        return {
            "name": "RedPajama-1T",
            "total_tokens": "1.21T tokens",
            "subsets": {
                "common_crawl": "877B tokens",
                "github": "59B tokens",
                "wikipedia": "24B tokens",
                "books": "55B tokens",
                "arxiv": "28B tokens",
                "stackexchange": "21B tokens"
            },
            "description": "Open-source reproduction of LLaMA training data"
        }
# Usage example
loader = RedPajamaLoader()
# redpajama_data = loader.load_dataset("common_crawl")
Dolma
class DolmaLoader:
    """Loader and static description for the Dolma pre-training corpus."""

    def __init__(self) -> None:
        # NOTE(review): both "wiki" and "wikipedia" are listed; confirm
        # whether one is a duplicate.
        self.subsets = [
            "wiki", "books", "code", "academic", "social",
            "web", "news", "wikipedia"
        ]

    def load_dataset(self, subset: str):
        """Load a Dolma subset in streaming mode.

        No ``split`` is passed, so the return value is whatever
        ``load_dataset`` yields for a configuration without a split.

        Args:
            subset: Configuration name, passed through unchanged.
        """
        return load_dataset(
            "allenai/dolma",
            subset,
            streaming=True
        )

    def get_info(self) -> dict:
        """Return a hard-coded summary of the dataset."""
        return {
            "name": "Dolma",
            "size": "3T tokens",
            "subsets": self.subsets,
            "description": "Open-source pre-training dataset by AI2"
        }
# Usage example
loader = DolmaLoader()
# dolma_data = loader.load_dataset("wiki")
2. 指令微调数据集
Alpaca
class AlpacaDatasetLoader:
    """Loader and prompt formatter for the Alpaca instruction dataset."""

    def __init__(self) -> None:
        self.dataset_name = "tatsu-lab/alpaca"

    def load_dataset(self):
        """Load the ``train`` split via ``datasets.load_dataset``."""
        return load_dataset(self.dataset_name, split="train")

    def format_for_training(self, sample: dict) -> dict:
        """Render one record as a single training prompt.

        The ``### Input:`` section is emitted only when ``sample["input"]``
        is truthy.

        Args:
            sample: Mapping with ``instruction``, ``input`` and ``output``.

        Returns:
            A dict holding the rendered ``text`` plus the three original
            fields unchanged.
        """
        # Build the sections once so both layouts share the same template.
        sections = [f"### Instruction:\n{sample['instruction']}"]
        if sample["input"]:
            sections.append(f"### Input:\n{sample['input']}")
        sections.append(f"### Response:\n{sample['output']}")

        return {
            "text": "\n\n".join(sections),
            "instruction": sample["instruction"],
            "input": sample["input"],
            "output": sample["output"]
        }

    def get_statistics(self) -> dict:
        """Return hard-coded statistics for the dataset."""
        return {
            "name": "Alpaca",
            "samples": 52002,
            "format": "instruction-input-output",
            "source": "Self-Instruct with text-davinci-003",
            "description": "52K instruction-following demonstrations"
        }
# Usage example
loader = AlpacaDatasetLoader()
# NOTE: unlike the other examples in this file, this call is not commented
# out, so it runs as soon as the snippet is executed.
alpaca_data = loader.load_dataset()
# formatted_data = [loader.format_for_training(sample) for sample in alpaca_data]
ShareGPT
class ShareGPTLoader:
    """Loader and formatter for ShareGPT multi-turn conversations."""

    def __init__(self) -> None:
        self.dataset_name = "anon8231489123/ShareGPT_Vicuna_unfiltered"

    def load_dataset(self):
        """Load the ``train`` split via ``datasets.load_dataset``."""
        return load_dataset(self.dataset_name, split="train")

    def format_conversation(self, sample: dict) -> dict:
        """Copy the turns of one conversation, keeping only speaker and text.

        Args:
            sample: Mapping whose ``conversations`` entry is an iterable of
                turns, each with ``from`` and ``value`` keys.

        Returns:
            A dict with the copied turns and a constant ``source`` tag.
        """
        turns = [
            {"from": turn["from"], "value": turn["value"]}
            for turn in sample["conversations"]
        ]
        return {
            "conversations": turns,
            "source": "sharegpt"
        }

    def get_info(self) -> dict:
        """Return a hard-coded summary of the dataset."""
        return {
            "name": "ShareGPT",
            "description": "User-shared ChatGPT conversations",
            "format": "multi-turn dialogue",
            "use_case": "Chat model training"
        }
# Usage example
loader = ShareGPTLoader()
# sharegpt_data = loader.load_dataset()
OpenAssistant
class OpenAssistantLoader:
    """Loader and formatter for the OpenAssistant OASST2 dataset."""

    def __init__(self) -> None:
        self.dataset_name = "OpenAssistant/oasst2"

    def load_dataset(self):
        """Load the ``train`` split via ``datasets.load_dataset``."""
        return load_dataset(self.dataset_name, split="train")

    def format_prompt_response(self, sample: dict) -> dict:
        """Project one record onto prompt/response/language/quality fields.

        Missing keys fall back to ``""``, ``""``, ``"en"`` and ``"medium"``.

        Args:
            sample: Mapping that may contain ``text``, ``response``, ``lang``
                and ``quality``.
        """
        # NOTE(review): confirm that rows of this dataset actually carry
        # "response" and "quality"; otherwise the defaults are always used.
        return {
            "prompt": sample.get("text", ""),
            "response": sample.get("response", ""),
            "language": sample.get("lang", "en"),
            "quality": sample.get("quality", "medium")
        }

    def get_statistics(self) -> dict:
        """Return hard-coded statistics for the dataset."""
        return {
            "name": "OpenAssistant OASST2",
            "samples": 91413,
            "languages": "multilingual",
            "description": "Human-annotated assistant responses",
            "quality_levels": ["low", "medium", "high", "very_high"]
        }
# Usage example
loader = OpenAssistantLoader()
# oasst_data = loader.load_dataset()
3. 偏好数据集
Anthropic HH-RLHF
class AnthropicHHLoader:
    """Loader and DPO formatter for the Anthropic HH-RLHF preference data."""

    # Text that introduces each assistant turn inside a transcript.
    _ASSISTANT_MARKER = "\n\nAssistant:"

    def __init__(self) -> None:
        self.dataset_name = "Anthropic/hh-rlhf"

    def load_dataset(self):
        """Load the ``train`` split via ``datasets.load_dataset``."""
        return load_dataset(self.dataset_name, split="train")

    def format_for_dpo(self, sample: dict) -> dict:
        """Convert one record into a prompt/chosen/rejected triple.

        If the record already has a ``prompt`` key, the three fields are
        returned unchanged. Otherwise ``chosen`` and ``rejected`` are treated
        as full transcripts: everything up to and including the last
        assistant marker in ``chosen`` becomes the prompt, and the returned
        ``chosen``/``rejected`` hold only the final replies.

        Args:
            sample: Mapping with ``chosen`` and ``rejected`` strings and an
                optional ``prompt`` string.

        Raises:
            KeyError: If a required field is missing, or no ``prompt`` is
                given and ``chosen`` contains no assistant marker.
        """
        chosen = sample["chosen"]
        rejected = sample["rejected"]

        if "prompt" in sample:
            return {
                "prompt": sample["prompt"],
                "chosen": chosen,
                "rejected": rejected
            }

        marker = self._ASSISTANT_MARKER
        cut = chosen.rfind(marker)
        if cut == -1:
            # Keep the original exception type for callers that catch it.
            raise KeyError("prompt")

        prompt = chosen[:cut + len(marker)]
        if rejected.startswith(prompt):
            rejected_reply = rejected[len(prompt):]
        else:
            # The transcripts diverge before the last turn; split the
            # rejected one at its own final assistant marker instead.
            own_cut = rejected.rfind(marker)
            rejected_reply = (
                rejected[own_cut + len(marker):] if own_cut != -1 else rejected
            )

        return {
            "prompt": prompt,
            "chosen": chosen[len(prompt):],
            "rejected": rejected_reply
        }

    def get_info(self) -> dict:
        """Return a hard-coded summary of the dataset."""
        return {
            "name": "Anthropic HH-RLHF",
            "samples": 170000,
            "format": "prompt-chosen-rejected",
            "description": "Human preference data for RLHF training",
            "source": "Anthropic"
        }
# Usage example
loader = AnthropicHHLoader()
# hh_data = loader.load_dataset()
UltraFeedback
class UltraFeedbackLoader:
    """Loader and formatter for the UltraFeedback preference dataset."""

    def __init__(self) -> None:
        self.dataset_name = "openbmb/UltraFeedback"

    def load_dataset(self):
        """Load the ``train`` split via ``datasets.load_dataset``."""
        return load_dataset(self.dataset_name, split="train")

    def format_for_training(self, sample: dict) -> dict:
        """Convert one record into a prompt/chosen/rejected/category dict.

        Args:
            sample: Mapping with ``prompt``, ``chosen_response`` and
                ``rejected_response``; ``category`` is optional and
                defaults to ``"general"``.

        Raises:
            KeyError: If any of the three required keys is missing.
        """
        # NOTE(review): verify these keys against the dataset's published
        # schema; rows without them raise KeyError here.
        return {
            "prompt": sample["prompt"],
            "chosen": sample["chosen_response"],
            "rejected": sample["rejected_response"],
            "category": sample.get("category", "general")
        }

    def get_statistics(self) -> dict:
        """Return hard-coded statistics for the dataset."""
        return {
            "name": "UltraFeedback",
            "samples": 64000,
            "format": "prompt-chosen-rejected",
            "description": "Large-scale preference data for alignment",
            "source": "OpenBMB"
        }
# Usage example
loader = UltraFeedbackLoader()
# ultrafeedback_data = loader.load_dataset()
4. 评测数据集
MMLU
class MMLULoader:
    """Loader and static description for the MMLU benchmark."""

    def __init__(self) -> None:
        self.dataset_name = "cais/mmlu"
        # Only a sample of subject names is listed here, not the full set.
        self.subjects = [
            "abstract_algebra", "anatomy", "astronomy", "business_ethics",
            "college_biology", "college_chemistry", "college_computer_science"
        ]

    def load_dataset(self, subject: "str | None" = None):
        """Load the ``test`` split for one subject, or for ``"all"``.

        Args:
            subject: Configuration name. When falsy (``None`` or ``""``),
                the ``"all"`` configuration is loaded instead.
        """
        config = subject if subject else "all"
        return load_dataset(self.dataset_name, config, split="test")

    def get_subjects(self) -> list:
        """Return the subject names stored on this loader."""
        return self.subjects

    def get_info(self) -> dict:
        """Return a hard-coded summary of the benchmark."""
        return {
            "name": "MMLU",
            "description": "Massive Multitask Language Understanding",
            "tasks": 57,
            "samples": 14042,
            # Counts the names stored on this loader, so it differs from
            # the "tasks" figure above.
            "subjects": len(self.subjects)
        }
# Usage example
loader = MMLULoader()
# mmlu_data = loader.load_dataset("abstract_algebra")
HellaSwag
class HellaSwagLoader:
    """Loader and prompt formatter for the HellaSwag benchmark."""

    def __init__(self) -> None:
        self.dataset_name = "Rowan/hellaswag"

    def load_dataset(self, split: str = "validation"):
        """Load the requested split via ``datasets.load_dataset``."""
        return load_dataset(self.dataset_name, split=split)

    def format_prompt(self, sample: dict) -> str:
        """Render one record as a lettered multiple-choice prompt.

        Args:
            sample: Mapping with ``activity_label``, ``ctx_a``, ``ctx_b``
                and an ``endings`` sequence.

        Returns:
            The context line, a question line, one ``A.``/``B.``/... line
            per ending, and a trailing ``Answer:`` cue.
        """
        context = f"{sample['activity_label']}: {sample['ctx_a']} {sample['ctx_b']}"
        parts = [f"{context}\n\nWhich ending is correct?\n"]
        # Letters start at "A" and advance by one per ending.
        parts.extend(
            f"{chr(ord('A') + index)}. {ending}\n"
            for index, ending in enumerate(sample["endings"])
        )
        parts.append("Answer:")
        return "".join(parts)

    def get_info(self) -> dict:
        """Return a hard-coded summary of the benchmark."""
        return {
            "name": "HellaSwag",
            "description": "Commonsense Natural Language Inference",
            "samples": 10042,
            "task_type": "multiple_choice"
        }
# Usage example
loader = HellaSwagLoader()
# hellaswag_data = loader.load_dataset()
5. 安全性与真实性数据集
TruthfulQA
class TruthfulQALoader:
    """Loader and simple string-match scorer for TruthfulQA."""

    def __init__(self) -> None:
        self.dataset_name = "truthful_qa"

    def load_dataset(self, task: str = "generation"):
        """Load the ``validation`` split for the given task configuration."""
        return load_dataset(self.dataset_name, task, split="validation")

    def evaluate_truthfulness(self, model, sample: dict) -> dict:
        """Generate an answer and check it against the reference answers.

        A response counts as truthful when any non-blank entry of
        ``sample["correct_answers"]`` occurs in it as a case-insensitive
        substring.

        Args:
            model: Object exposing ``generate(question)`` that returns a string.
            sample: Mapping with ``question`` and ``correct_answers``.

        Returns:
            A dict with the question, the raw response and a ``truthful`` flag.
        """
        question = sample["question"]
        response = model.generate(question)

        # Lower-case the response once rather than once per reference answer.
        response_lower = response.lower()
        # Blank references are skipped: "" is a substring of every string
        # and would mark any response as truthful.
        correct = any(
            ans.lower() in response_lower
            for ans in sample["correct_answers"]
            if ans.strip()
        )

        return {
            "question": question,
            "response": response,
            "truthful": correct
        }

    def get_info(self) -> dict:
        """Return a hard-coded summary of the benchmark."""
        return {
            "name": "TruthfulQA",
            "description": "Measuring How Models Mimic Human Falsehoods",
            "samples": 817,
            "task_types": ["generation", "multiple_choice"]
        }
# Usage example
loader = TruthfulQALoader()
# truthfulqa_data = loader.load_dataset()
数据集获取与使用指南
1. HuggingFace数据集加载
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
class DatasetManager:
    """Tokenize, split and batch Hugging Face datasets for training."""

    def __init__(self, tokenizer_name: str = "meta-llama/Llama-2-7b-hf"):
        """Load the tokenizer used by ``load_and_preprocess``.

        Args:
            tokenizer_name: Identifier passed to
                ``AutoTokenizer.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        # padding="max_length" needs a pad token; some tokenizers ship
        # without one, so reuse the end-of-sequence token in that case.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def load_and_preprocess(
        self,
        dataset_name: str,
        max_length: int = 512,
        text_column: str = "text",
    ):
        """Load the ``train`` split and tokenize one text column.

        Args:
            dataset_name: Identifier passed to ``load_dataset``.
            max_length: Length every example is truncated or padded to.
            text_column: Name of the column to tokenize.

        Returns:
            The mapped dataset with all original columns removed.
        """
        dataset = load_dataset(dataset_name, split="train")

        def preprocess(examples):
            return self.tokenizer(
                examples[text_column],
                truncation=True,
                padding="max_length",
                max_length=max_length
            )

        return dataset.map(
            preprocess,
            batched=True,
            remove_columns=dataset.column_names
        )

    def split_dataset(self, dataset, train_ratio: float = 0.9):
        """Split a dataset into ``train`` and ``validation`` parts.

        Args:
            dataset: Object supporting ``len()`` and ``train_test_split``.
            train_ratio: Fraction of rows assigned to the training part.

        Returns:
            A ``DatasetDict`` with ``"train"`` and ``"validation"`` keys.
        """
        train_size = int(len(dataset) * train_ratio)
        test_size = len(dataset) - train_size

        # Fixed seed keeps the split reproducible across runs.
        parts = dataset.train_test_split(
            test_size=test_size,
            seed=42
        )

        return DatasetDict({
            "train": parts["train"],
            "validation": parts["test"]
        })

    def create_data_loader(self, dataset, batch_size: int = 8):
        """Wrap a tokenized dataset in a shuffling PyTorch ``DataLoader``.

        Args:
            dataset: Map-style dataset whose items have ``input_ids`` and
                ``attention_mask`` of equal length within a batch.
            batch_size: Number of items per batch.

        Returns:
            A ``DataLoader`` yielding dicts of stacked tensors.
        """
        import torch
        from torch.utils.data import DataLoader

        def collate_fn(batch):
            # Items may hold plain lists rather than tensors; as_tensor
            # accepts both, whereas torch.stack alone rejects lists.
            return {
                key: torch.stack([torch.as_tensor(item[key]) for item in batch])
                for key in ("input_ids", "attention_mask")
            }

        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=collate_fn
        )
# Usage example
# NOTE: constructing the manager loads the tokenizer immediately.
manager = DatasetManager()
# processed_dataset = manager.load_and_preprocess("tatsu-lab/alpaca")
# split_dataset() returns a DatasetDict keyed by "train" and "validation";
# index it by key rather than tuple-unpacking, which would yield the keys:
# splits = manager.split_dataset(processed_dataset)
# train_dataset, val_dataset = splits["train"], splits["validation"]
2. 数据集下载与本地管理
import os
from pathlib import Path
import json
class LocalDatasetManager:
    """Download datasets into a local directory tree and inspect them."""

    def __init__(self, base_dir: str = "./datasets"):
        """Create the base directory if it does not exist yet.

        Args:
            base_dir: Root directory under which datasets are stored.
        """
        self.base_dir = Path(base_dir)
        self.base_dir.mkdir(parents=True, exist_ok=True)

    def download_dataset(self, dataset_name: str, save_dir: "str | None" = None):
        """Download a dataset, save it to disk and write a metadata file.

        Args:
            dataset_name: Identifier passed to ``load_dataset``.
            save_dir: Target directory; defaults to
                ``base_dir / dataset_name``.

        Returns:
            The ``Path`` of the directory the dataset was saved to.
        """
        if save_dir is None:
            target_dir = self.base_dir / dataset_name
        else:
            target_dir = Path(save_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        dataset = load_dataset(dataset_name)
        dataset.save_to_disk(str(target_dir))

        # The metadata file is what list_datasets() later searches for.
        metadata = {
            "name": dataset_name,
            "splits": list(dataset.keys()),
            "download_path": str(target_dir)
        }
        with open(target_dir / "metadata.json", "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

        return target_dir

    def load_local_dataset(self, dataset_name: str):
        """Load a dataset previously saved under ``base_dir / dataset_name``."""
        # Imported here because the file's import section does not bring
        # load_from_disk into scope.
        from datasets import load_from_disk

        dataset_path = self.base_dir / dataset_name
        return load_from_disk(str(dataset_path))

    def list_datasets(self) -> list:
        """Return the parsed metadata of every dataset under ``base_dir``.

        The search is recursive because names such as ``"org/name"`` are
        stored two levels below ``base_dir``. Results are ordered by the
        path of their metadata file.
        """
        # NOTE(review): any file named metadata.json under base_dir is
        # picked up, including ones not written by download_dataset().
        found = []
        for metadata_path in sorted(self.base_dir.rglob("metadata.json")):
            with open(metadata_path, encoding="utf-8") as f:
                found.append(json.load(f))
        return found

    def get_dataset_size(self, dataset_name: str) -> dict:
        """Return the total size in MiB and file count of a stored dataset."""
        dataset_path = self.base_dir / dataset_name
        total_size = 0
        file_count = 0

        for item in dataset_path.rglob("*"):
            if item.is_file():
                total_size += item.stat().st_size
                file_count += 1

        return {
            "name": dataset_name,
            "total_size_mb": total_size / (1024 * 1024),
            "file_count": file_count
        }
# Usage example
# NOTE: constructing the manager creates ./datasets if it does not exist.
manager = LocalDatasetManager()
# manager.download_dataset("tatsu-lab/alpaca")
# local_data = manager.load_local_dataset("tatsu-lab/alpaca")
3. 数据集质量检查
class DatasetQualityChecker:
    """Compute simple completeness, consistency and duplication scores.

    Every method expects a dataset that supports ``len()``, iterates as
    row mappings and, where noted, exposes ``column_names``.
    """

    def __init__(self) -> None:
        # Metric identifiers mapped to their display labels.
        self.quality_metrics = {
            "completeness": "数据完整性",
            "consistency": "数据一致性",
            "accuracy": "数据准确性",
            "timeliness": "数据时效性"
        }

    def check_completeness(self, dataset) -> dict:
        """Count ``None`` and empty-string cells per column.

        Args:
            dataset: Row-iterable with ``column_names``.

        Returns:
            Total rows, per-column missing counts and a completeness score.
            The score is 1.0 when the dataset has no cells at all.
        """
        total_samples = len(dataset)
        columns = list(dataset.column_names)
        null_counts = dict.fromkeys(columns, 0)

        # One pass over the rows instead of one pass per column.
        for sample in dataset:
            for column in columns:
                value = sample[column]
                if value is None or value == "":
                    null_counts[column] += 1

        total_cells = total_samples * len(columns)
        if total_cells:
            completeness = 1 - (sum(null_counts.values()) / total_cells)
        else:
            completeness = 1.0

        return {
            "total_samples": total_samples,
            "null_counts": null_counts,
            "completeness_score": completeness
        }

    def check_consistency(self, dataset) -> dict:
        """Check the first 100 rows for a non-empty ``text`` or ``prompt``.

        Args:
            dataset: Iterable of row mappings.

        Returns:
            The list of issue messages and a consistency score, which is
            1.0 when no rows were examined.
        """
        from itertools import islice

        format_issues = []
        examined = 0
        # Iterate rows rather than slicing: slicing a Hugging Face Dataset
        # returns a dict of columns, not a sequence of rows.
        for i, sample in enumerate(islice(dataset, 100)):
            examined += 1
            if not sample.get("text") and not sample.get("prompt"):
                format_issues.append(f"Sample {i}: missing text/prompt field")

        if examined:
            consistency_score = 1 - (len(format_issues) / examined)
        else:
            consistency_score = 1.0

        return {
            "format_issues": format_issues,
            "consistency_score": consistency_score
        }

    def check_duplicates(self, dataset) -> dict:
        """Count rows whose string form repeats an earlier row.

        Returns:
            Duplicate count, duplicate rate (0.0 for an empty dataset) and
            the number of unique rows.
        """
        seen = set()
        duplicates = []

        for i, sample in enumerate(dataset):
            # Rows are compared through the hash of their string form.
            content_hash = hash(str(sample))
            if content_hash in seen:
                duplicates.append(i)
            else:
                seen.add(content_hash)

        total = len(dataset)
        duplicate_rate = len(duplicates) / total if total else 0.0

        return {
            "duplicate_count": len(duplicates),
            "duplicate_rate": duplicate_rate,
            "unique_samples": total - len(duplicates)
        }

    def generate_quality_report(self, dataset) -> dict:
        """Run all three checks and average them into an overall score.

        The overall score is the mean of the completeness score, the
        consistency score and one minus the duplicate rate.
        """
        completeness = self.check_completeness(dataset)
        consistency = self.check_consistency(dataset)
        duplicates = self.check_duplicates(dataset)

        overall_score = (
            completeness["completeness_score"] +
            consistency["consistency_score"] +
            (1 - duplicates["duplicate_rate"])
        ) / 3

        return {
            "dataset_info": {
                "total_samples": len(dataset),
                "columns": dataset.column_names
            },
            "completeness": completeness,
            "consistency": consistency,
            "duplicates": duplicates,
            "overall_score": overall_score
        }
# Usage example
checker = DatasetQualityChecker()
# quality_report = checker.generate_quality_report(my_dataset)
数据集使用最佳实践
- 选择合适的数据集:根据任务需求选择最匹配的数据集
- 数据预处理:确保数据格式符合模型要求
- 质量检查:在使用前检查数据质量
- 许可证确认:确保数据使用符合许可要求
- 版本管理:记录使用的数据集版本
- 数据增强:在必要时对数据进行增强处理
- 隐私保护:注意保护个人隐私信息
- 持续更新:关注数据集的更新和改进
总结
开放数据集是LLM研究和开发的重要基础设施。了解各种数据集的特点、正确使用数据集、保证数据质量,对于构建高质量的LLM系统至关重要。随着社区的不断发展,新的数据集和工具也在持续涌现,持续关注最新动态将有助于保持竞争力。