Dolly数据集
---
title: "Dolly数据集"
description: "Databricks Dolly数据集详解,包括开源指令微调数据集特点和应用场景"
tags: ["Dolly", "Databricks", "指令微调", "开源数据集"]
category: "llm"
icon: "🧠"
---
Dolly数据集
什么是Dolly数据集
Dolly数据集(databricks-dolly-15k)是Databricks发布的开源指令微调数据集,包含约15K条人工撰写的指令-响应记录,每条记录由instruction、context(可选)、response和category四个字段组成。它是首批完全开源的高质量指令微调数据集之一。
数据集特点
1. 数据结构
from typing import List, Dict
import json
class DollyDataFormat:
    """Describe, validate and render records that follow the Dolly schema.

    A record is a dict with the keys ``instruction``, ``context``,
    ``response`` and ``category``. ``context`` is optional when validating
    and when rendering a training prompt.
    """

    def __init__(self):
        # Reference record illustrating the four fields of one sample.
        sample = {}
        sample["instruction"] = "解释什么是数据湖"
        sample["context"] = "数据湖是一种集中式存储库,允许以任意规模存储所有结构化和非结构化数据。"
        sample["response"] = "数据湖是一种集中式存储架构,可以存储各种类型和规模的数据,包括结构化数据、半结构化数据和非结构化数据。"
        sample["category"] = "general"
        self.sample_structure = sample

    def validate_format(self, data: Dict) -> bool:
        """Return True when *data* contains every mandatory key.

        Only key presence is checked; values are not inspected.
        """
        for mandatory_key in ("instruction", "response", "category"):
            if mandatory_key not in data:
                return False
        return True

    def get_categories(self) -> List[str]:
        """Return the category names this helper recognises."""
        # NOTE(review): the published databricks-dolly-15k dataset card appears
        # to use a different category set (e.g. creative_writing,
        # information_extraction) - confirm before relying on this list.
        known_categories = (
            "general",
            "closed_qa",
            "classification",
            "summarization",
            "open_qa",
            "retrieval",
            "brainstorming",
            "few_shot",
        )
        return list(known_categories)

    def format_for_training(self, data: Dict) -> str:
        """Render one record as an Instruction / Context / Response prompt.

        The Context section is emitted only when ``data["context"]`` is
        present and non-empty. Sections are separated by one blank line.
        """
        sections = [f"### Instruction:\n{data['instruction']}"]
        if data.get("context"):
            sections.append(f"### Context:\n{data['context']}")
        sections.append(f"### Response:\n{data['response']}")
        return "\n\n".join(sections)
# Sample records: one without context, one closed-QA record with context,
# and one classification record.
dolly_examples = [
    dict(
        instruction="描述云计算的主要优势",
        context="",
        response="云计算的主要优势包括:1)弹性扩展,可根据需求快速调整资源;2)按需付费,降低前期投资;3)高可用性,确保服务持续运行;4)易于管理,减少维护负担。",
        category="general",
    ),
    dict(
        instruction="根据以下信息回答问题:Python是一种高级编程语言。",
        context="Python由Guido van Rossum于1991年创建,以其简洁易读的语法著称。",
        response="Python是一种高级编程语言,由Guido van Rossum于1991年创建,以其简洁易读的语法著称。",
        category="closed_qa",
    ),
    dict(
        instruction="将以下文本分类为技术、商业或教育类别",
        context="机器学习正在改变医疗诊断的方式",
        response="技术",
        category="classification",
    ),
]
2. 数据加载与处理
import pandas as pd
from datasets import Dataset, DatasetDict
from typing import List, Dict
class DollyDataLoader:
    """Load, clean, split and summarise Dolly-style instruction records."""

    def __init__(self):
        # Path of the most recently loaded JSON file (None until one is loaded).
        self.data_path = None
        # Object returned by the last load_from_huggingface() call.
        self.dataset = None

    def load_from_json(self, file_path: str) -> List[Dict]:
        """Parse a UTF-8 JSON file and remember where it came from.

        Args:
            file_path: Path to a JSON document holding the records.

        Returns:
            Whatever ``json.load`` produces for the file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Record the path only after a successful parse.
        self.data_path = file_path
        return data

    def load_from_huggingface(self) -> "Dataset":
        """Download the Dolly dataset from the Hugging Face Hub and cache it.

        Returns:
            The object returned by ``datasets.load_dataset``; it is also
            stored on ``self.dataset``.
        """
        # Imported lazily so the rest of the class works without the Hub client.
        from datasets import load_dataset
        # The dataset is published under the id "databricks/databricks-dolly-15k";
        # the shorter "databricks/dolly-15k" does not resolve.
        # NOTE(review): without a ``split`` argument load_dataset usually
        # returns a DatasetDict rather than a Dataset - confirm the annotation.
        self.dataset = load_dataset("databricks/databricks-dolly-15k")
        return self.dataset

    def preprocess_data(self, data: List[Dict]) -> List[Dict]:
        """Drop invalid records and normalise the text fields of the rest.

        Args:
            data: Raw records.

        Returns:
            New dicts with cleaned ``instruction``, ``context`` and
            ``response`` plus ``category`` (defaulting to ``"general"``).
        """
        processed_data = []
        for item in data:
            if not self.validate_item(item):
                continue
            processed_data.append({
                "instruction": self.clean_text(item.get("instruction", "")),
                "context": self.clean_text(item.get("context", "")),
                "response": self.clean_text(item.get("response", "")),
                "category": item.get("category", "general"),
            })
        return processed_data

    def validate_item(self, item: Dict) -> bool:
        """Return True when the record has a usable instruction and response.

        The instruction must be longer than 3 characters and the response
        longer than 5. Missing, ``None`` or empty fields yield ``False``.
        """
        instruction = item.get("instruction")
        response = item.get("response")
        # bool() keeps the declared return type: a bare and-chain would
        # return "" or None for empty fields.
        return bool(
            instruction
            and response
            and len(instruction) > 3
            and len(response) > 5
        )

    def clean_text(self, text: str) -> str:
        """Strip surrounding whitespace and normalise line endings to ``\\n``.

        ``None`` (e.g. a record whose ``context`` is null) and empty strings
        both yield ``""`` instead of raising.
        """
        if not text:
            return ""
        text = text.strip()
        # Replace CRLF first so a lone CR pass does not double the newline.
        return text.replace('\r\n', '\n').replace('\r', '\n')

    def create_splits(self, data: List[Dict],
                      train_ratio: float = 0.9,
                      val_ratio: float = 0.05) -> "DatasetDict":
        """Shuffle the records and cut them into train/validation/test splits.

        Args:
            data: Records to split; the input list itself is not reordered.
            train_ratio: Fraction of records assigned to ``train``.
            val_ratio: Fraction of records assigned to ``validation``.

        Returns:
            A ``DatasetDict`` with ``train``, ``validation`` and ``test``;
            ``test`` receives whatever remains after the first two splits.
        """
        import random
        # Shuffle a copy so the caller's list keeps its order.
        shuffled_data = data.copy()
        random.shuffle(shuffled_data)
        total = len(shuffled_data)
        train_end = int(total * train_ratio)
        val_end = int(total * (train_ratio + val_ratio))
        return DatasetDict({
            "train": Dataset.from_list(shuffled_data[:train_end]),
            "validation": Dataset.from_list(shuffled_data[train_end:val_end]),
            "test": Dataset.from_list(shuffled_data[val_end:]),
        })

    def filter_by_category(self, data: List[Dict],
                           category: str) -> List[Dict]:
        """Return the records whose ``category`` equals *category*."""
        return [item for item in data if item.get("category") == category]

    def get_statistics(self, data: List[Dict]) -> Dict:
        """Summarise record count, category distribution and text lengths.

        Args:
            data: Records to summarise. Missing or ``None`` text fields count
                as length 0; a missing category is counted as ``"unknown"``.

        Returns:
            A dict with ``total_samples``, ``categories`` and the average /
            minimum / maximum length figures. For empty input every numeric
            figure is 0 and ``categories`` is empty.
        """
        categories = {}
        instruction_lengths = []
        response_lengths = []
        for item in data:
            cat = item.get("category", "unknown")
            categories[cat] = categories.get(cat, 0) + 1
            # "or ''" also covers fields that are present but None.
            instruction_lengths.append(len(item.get("instruction") or ""))
            response_lengths.append(len(item.get("response") or ""))

        total = len(instruction_lengths)
        if total == 0:
            # Avoid ZeroDivisionError and min()/max() on an empty sequence.
            return {
                "total_samples": 0,
                "categories": categories,
                "avg_instruction_length": 0,
                "avg_response_length": 0,
                "min_instruction_length": 0,
                "max_instruction_length": 0,
            }
        return {
            "total_samples": total,
            "categories": categories,
            "avg_instruction_length": sum(instruction_lengths) / total,
            "avg_response_length": sum(response_lengths) / total,
            "min_instruction_length": min(instruction_lengths),
            "max_instruction_length": max(instruction_lengths),
        }
# Usage example: load a local JSON export and print summary statistics.
# Requires "dolly-15k.json" to be readable from the working directory.
loader = DollyDataLoader()
data = loader.load_from_json("dolly-15k.json")
stats = loader.get_statistics(data)
# Report the record count and the per-category distribution.
print(f"数据集大小: {stats['total_samples']}")
print(f"类别分布: {stats['categories']}")
与Alpaca数据集对比
class DatasetComparator:
    """Hold side-by-side facts about the Dolly and Alpaca datasets."""

    def __init__(self):
        # One profile per dataset; both share the same keys so they can be
        # tabulated together.
        dolly_profile = {
            "name": "Dolly-15k",
            "size": 15000,
            "source": "Databricks",
            "annotation": "人工标注",
            "categories": 8,
            "context_support": True,
            "license": "CC-BY-SA-3.0",
        }
        alpaca_profile = {
            "name": "Alpaca-52k",
            "size": 52000,
            "source": "Stanford",
            "annotation": "GPT-3.5生成",
            "categories": 0,
            "context_support": False,
            "license": "CC-BY-NC-4.0",
        }
        self.datasets = {"dolly": dolly_profile, "alpaca": alpaca_profile}

    def compare_datasets(self) -> pd.DataFrame:
        """Return the profiles as a table with one row per dataset."""
        # The dict-of-dicts constructor puts datasets in columns; transpose
        # so each dataset becomes a row.
        by_column = pd.DataFrame(self.datasets)
        return by_column.transpose()

    def analyze_differences(self) -> Dict:
        """Return a qualitative comparison keyed by aspect, then by dataset."""
        # (aspect, note for Dolly, note for Alpaca)
        aspect_notes = (
            ("quality", "人工标注,质量更高", "自动生成,可能存在噪声"),
            ("diversity", "类别更丰富,涵盖8种任务类型", "任务类型相对单一"),
            ("size", "15K样本,适合小规模微调", "52K样本,适合大规模训练"),
            ("context", "支持上下文输入", "主要支持指令-输出对"),
            ("licensing", "商业友好", "非商业用途"),
        )
        return {
            aspect: {"dolly": dolly_note, "alpaca": alpaca_note}
            for aspect, dolly_note, alpaca_note in aspect_notes
        }

    def recommend_usage(self, use_case: str) -> str:
        """Return the recommended dataset for *use_case*.

        Unrecognised use cases receive a generic fallback message.
        """
        recommendations = {
            "production": "Dolly(人工标注,质量更高)",
            "research": "Alpaca(更大规模,适合实验)",
            "commercial": "Dolly(商业友好许可证)",
            "small_model": "Dolly(数据质量更重要)",
            "large_model": "Alpaca(数据规模更重要)",
        }
        if use_case in recommendations:
            return recommendations[use_case]
        return "根据具体需求选择"
数据生成方法
class DollyDataGenerator:
    """Produce synthetic Dolly-style records from string templates."""

    def __init__(self):
        # Each template supplies an instruction plus context/response patterns;
        # the {placeholders} are filled through str.format.
        self.generation_templates = {
            "brainstorming": {
                "instruction": "头脑风暴关于{topic}的创意",
                "context_template": "我们正在讨论{topic}领域的创新",
                "response_template": "以下是关于{topic}的创意想法:\n1. {idea1}\n2. {idea2}\n3. {idea3}",
            },
            "summarization": {
                "instruction": "总结以下文本的主要观点",
                "context_template": "{text}",
                "response_template": "主要观点包括:{main_points}",
            },
            "classification": {
                "instruction": "将以下内容分类为{categories}",
                "context_template": "{content}",
                "response_template": "{category}",
            },
        }

    def generate_from_template(self, template_name: str,
                               variables: Dict) -> Dict:
        """Fill one template and return the resulting record.

        Args:
            template_name: Key into ``self.generation_templates``; it also
                becomes the record's ``category``.
            variables: Values for the template placeholders.

        Returns:
            The generated record, or ``{}`` when the template is unknown.

        Raises:
            KeyError: If *variables* lacks a placeholder the template uses.
        """
        spec = self.generation_templates.get(template_name)
        if not spec:
            return {}
        # (output field, template key) in the order the record is built.
        field_sources = (
            ("instruction", "instruction"),
            ("context", "context_template"),
            ("response", "response_template"),
        )
        record = {
            field: spec[source].format(**variables)
            for field, source in field_sources
        }
        record["category"] = template_name
        return record

    def generate_brainstorming_data(self, topics: List[str],
                                    count_per_topic: int = 3) -> List[Dict]:
        """Create *count_per_topic* brainstorming records for every topic.

        Records for one topic differ only in the number embedded in the
        first idea (1-based).
        """
        return [
            self.generate_from_template("brainstorming", {
                "topic": topic,
                "idea1": f"创新方法{idx+1}用于{topic}",
                "idea2": f"新的{topic}解决方案",
                "idea3": f"改进{topic}的策略",
            })
            for topic in topics
            for idx in range(count_per_topic)
        ]

    def generate_classification_data(self,
                                     categories: List[str],
                                     contents: List[str]) -> List[Dict]:
        """Create one classification record per entry in *contents*.

        This is a simplified placeholder: every item is labelled with the
        first entry of *categories* rather than actually being classified.
        """
        return [
            self.generate_from_template("classification", {
                "categories": "、".join(categories),
                "content": content,
                # Placeholder label: always the first category.
                "category": categories[0],
            })
            for content in contents
        ]

    def save_generated_data(self, data: List[Dict],
                            file_path: str):
        """Write *data* to *file_path* as pretty-printed UTF-8 JSON.

        Each record is reduced to the four Dolly fields; missing text fields
        become ``""`` and a missing category becomes ``"general"``.
        """
        field_defaults = (
            ("instruction", ""),
            ("context", ""),
            ("response", ""),
            ("category", "general"),
        )
        normalised = [
            {field: item.get(field, default) for field, default in field_defaults}
            for item in data
        ]
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(normalised, f, ensure_ascii=False, indent=2)
# Usage example: generate 5 brainstorming records for each of three topics
# and write them to "generated_dolly_data.json".
generator = DollyDataGenerator()
brainstorming_data = generator.generate_brainstorming_data(
["人工智能", "机器学习", "深度学习"],
count_per_topic=5
)
generator.save_generated_data(brainstorming_data, "generated_dolly_data.json")
应用场景
class DollyDataApplication:
    """Tie loading, quality checks, domain adaptation and evaluation together."""

    def __init__(self):
        self.loader = DollyDataLoader()
        self.generator = DollyDataGenerator()

    def prepare_training_pipeline(self, data_path: str) -> Dict:
        """Load a JSON file and return splits, statistics and a quality report.

        Args:
            data_path: Path passed to the loader's ``load_from_json``.

        Returns:
            A dict with the keys ``splits``, ``statistics`` and
            ``quality_check``, all computed from the preprocessed records.
        """
        raw_data = self.loader.load_from_json(data_path)
        processed_data = self.loader.preprocess_data(raw_data)
        splits = self.loader.create_splits(processed_data)
        stats = self.loader.get_statistics(processed_data)
        return {
            "splits": splits,
            "statistics": stats,
            "quality_check": self.quality_check(processed_data)
        }

    def quality_check(self, data: List[Dict]) -> Dict:
        """Run three checks: empty instructions, duplicates, over-long responses.

        Returns:
            A dict with ``passed`` (True when no check reported an issue),
            the list of ``issues``, ``total_checks`` (always 3) and
            ``passed_checks``.
        """
        issues = []

        # Check 1: records whose instruction is missing or empty.
        empty_instructions = sum(1 for d in data if not d.get("instruction"))
        if empty_instructions > 0:
            issues.append(f"发现 {empty_instructions} 个空指令")

        # Check 2: instructions that occur more than once.
        instructions = [d.get("instruction", "") for d in data]
        duplicates = len(instructions) - len(set(instructions))
        if duplicates > 0:
            issues.append(f"发现 {duplicates} 个重复指令")

        # Check 3: responses longer than 1000 characters.
        # "or ''" keeps a None response from crashing len().
        long_responses = sum(
            1 for d in data if len(d.get("response") or "") > 1000
        )
        if long_responses > 0:
            issues.append(f"发现 {long_responses} 个过长响应")

        return {
            "passed": len(issues) == 0,
            "issues": issues,
            "total_checks": 3,
            "passed_checks": 3 - len(issues)
        }

    def create_domain_dataset(self, domain: str,
                              source_data: List[Dict]) -> List[Dict]:
        """Return copies of the records with a domain prefix and ``domain`` key.

        Raises:
            KeyError: If a record lacks ``instruction`` or ``response``.
        """
        return [
            {
                "instruction": f"在{domain}领域,{item['instruction']}",
                "context": item.get("context", ""),
                "response": item["response"],
                "category": item.get("category", "general"),
                "domain": domain
            }
            for item in source_data
        ]

    def evaluate_model_with_dolly(self, model, test_data: List[Dict]) -> Dict:
        """Score *model* against reference responses, overall and per category.

        Args:
            model: Any object exposing ``generate(prompt) -> str``.
            test_data: Records with ``instruction`` and ``response``;
                ``context`` and ``category`` are optional.

        Returns:
            A dict with ``total``, ``correct``, ``accuracy`` and
            ``category_scores`` (per-category ``correct`` / ``total`` /
            ``accuracy``). An empty test set yields an accuracy of 0.0.
        """
        results = {
            "total": len(test_data),
            "correct": 0,
            "category_scores": {}
        }
        for item in test_data:
            prompt = f"Instruction: {item['instruction']}\n"
            if item.get("context"):
                prompt += f"Context: {item['context']}\n"
            prompt += "Response:"
            model_response = model.generate(prompt)

            # Evaluate once and reuse the verdict for both tallies.
            is_correct = self.evaluate_response(model_response, item["response"])

            category = item.get("category", "unknown")
            cat_scores = results["category_scores"].setdefault(
                category, {"correct": 0, "total": 0}
            )
            cat_scores["total"] += 1
            if is_correct:
                results["correct"] += 1
                cat_scores["correct"] += 1

        # Guard the division so an empty test set reports 0.0 instead of
        # raising ZeroDivisionError.
        results["accuracy"] = (
            results["correct"] / results["total"] if results["total"] else 0.0
        )
        # Every category entry was created by at least one item, so its
        # total is never zero.
        for cat_data in results["category_scores"].values():
            cat_data["accuracy"] = cat_data["correct"] / cat_data["total"]
        return results

    def evaluate_response(self, model_response: str,
                          expected_response: str) -> bool:
        """Return True when the word-level Jaccard similarity exceeds 0.3.

        Both texts are lower-cased and split on whitespace, so text without
        spaces is compared as a single token. Two empty texts score 0.
        """
        model_words = set(model_response.lower().split())
        expected_words = set(expected_response.lower().split())
        intersection = len(model_words.intersection(expected_words))
        union = len(model_words.union(expected_words))
        similarity = intersection / union if union > 0 else 0
        return similarity > 0.3  # acceptance threshold
# Usage example: build the training pipeline from a local JSON export and
# report whether every quality check passed.
app = DollyDataApplication()
training_pipeline = app.prepare_training_pipeline("dolly-15k.json")
print(f"训练数据质量: {training_pipeline['quality_check']['passed']}")
最佳实践
class DollyBestPractices:
    """Checklists, preset training configs and simple training-log helpers."""

    def __init__(self):
        # Checklist items grouped by project phase.
        preparation_steps = [
            "确保数据格式一致性",
            "进行数据清洗和去重",
            "验证数据质量",
            "创建合适的训练/验证分割",
        ]
        training_steps = [
            "使用适当的超参数",
            "监控训练过程",
            "进行早停防止过拟合",
            "保存检查点",
        ]
        evaluation_steps = [
            "使用多种评估指标",
            "进行人工评估",
            "测试不同场景",
            "比较基线模型",
        ]
        self.practices = {
            "data_preparation": preparation_steps,
            "training": training_steps,
            "evaluation": evaluation_steps,
        }

    def get_practice_guide(self, phase: str) -> List[str]:
        """Return the checklist for *phase*, or an empty list if unknown."""
        if phase in self.practices:
            return self.practices[phase]
        return []

    def create_training_config(self,
                               dataset_size: int,
                               model_size: str) -> Dict:
        """Return preset hyperparameters for *model_size*.

        Unknown sizes fall back to the ``"medium"`` preset. *dataset_size*
        is accepted but does not influence the result.
        """
        presets = {
            "small": {
                "epochs": 3,
                "batch_size": 8,
                "learning_rate": 2e-5,
                "warmup_steps": 100,
            },
            "medium": {
                "epochs": 2,
                "batch_size": 16,
                "learning_rate": 1e-5,
                "warmup_steps": 200,
            },
            "large": {
                "epochs": 1,
                "batch_size": 32,
                "learning_rate": 5e-6,
                "warmup_steps": 500,
            },
        }
        chosen = model_size if model_size in presets else "medium"
        return presets[chosen]

    def monitor_training(self,
                         training_logs: List[Dict]) -> Dict:
        """Summarise training logs into final/average figures.

        Each log entry may carry ``loss``, ``accuracy`` and
        ``learning_rate``; missing values are read as 0. With no logs every
        figure is 0 and ``convergence`` is False.
        """
        # One pass over the logs; learning_rate is read but not reported.
        rows = [
            (
                entry.get("loss", 0),
                entry.get("accuracy", 0),
                entry.get("learning_rate", 0),
            )
            for entry in training_logs
        ]
        loss_series = [row[0] for row in rows]
        accuracy_series = [row[1] for row in rows]

        summary = {"final_loss": 0, "final_accuracy": 0, "avg_loss": 0}
        if loss_series:
            summary["final_loss"] = loss_series[-1]
            summary["avg_loss"] = sum(loss_series) / len(loss_series)
        if accuracy_series:
            summary["final_accuracy"] = accuracy_series[-1]
        summary["convergence"] = self.check_convergence(loss_series)
        return summary

    def check_convergence(self, losses: List[float],
                          patience: int = 5) -> bool:
        """Return True when the last *patience* losses never increase.

        Fewer than *patience* recorded losses always yields False.
        """
        if len(losses) < patience:
            return False
        window = losses[-patience:]
        # Compare each loss with its successor inside the window.
        for earlier, later in zip(window, window[1:]):
            if earlier < later:
                return False
        return True
# Usage example: fetch the preset configuration for a medium-sized model
# and print it.
practices = DollyBestPractices()
training_config = practices.create_training_config(15000, "medium")
print(f"训练配置: {training_config}")
总结
Dolly数据集为LLM指令微调提供了高质量的开源资源。通过人工标注和丰富的类别支持,它成为训练可靠AI助手的重要数据集。结合最佳实践,可以有效提升模型性能。