← 返回首页
🧠

Dolly数据集

📂 llm ⏱ 6 min 1183 words

---
title: "Dolly数据集"
description: "Databricks Dolly数据集详解,包括开源指令微调数据集特点和应用场景"
tags: ["Dolly", "Databricks", "指令微调", "开源数据集"]
category: "llm"
icon: "🧠"
---

Dolly数据集

什么是Dolly数据集

Dolly数据集是Databricks开发的开源指令微调数据集,包含15K条人工标注的指令-输出对。它是首批完全开源的高质量指令微调数据集之一。

数据集特点

1. 数据结构

from typing import List, Dict
import json

class DollyDataFormat:
    """Describe the record layout of Dolly-style instruction data.

    A record is a dict with the keys ``instruction``, ``context``,
    ``response`` and ``category``; ``context`` is optional.
    """

    def __init__(self):
        # Reference record illustrating the four fields of one sample.
        self.sample_structure = {
            "instruction": "解释什么是数据湖",
            "context": "数据湖是一种集中式存储库,允许以任意规模存储所有结构化和非结构化数据。",
            "response": "数据湖是一种集中式存储架构,可以存储各种类型和规模的数据,包括结构化数据、半结构化数据和非结构化数据。",
            "category": "general",
        }

    def validate_format(self, data: Dict) -> bool:
        """Return True when *data* has every mandatory key.

        Only key presence is checked; values are not inspected.
        """
        for mandatory_key in ("instruction", "response", "category"):
            if mandatory_key not in data:
                return False
        return True

    def get_categories(self) -> List[str]:
        """Return the category labels this helper recognises, as a new list."""
        labels = (
            "general",
            "closed_qa",
            "classification",
            "summarization",
            "open_qa",
            "retrieval",
            "brainstorming",
            "few_shot",
        )
        return list(labels)

    def format_for_training(self, data: Dict) -> str:
        """Render a record as a sectioned training prompt.

        The result has an ``### Instruction:`` section, an optional
        ``### Context:`` section (emitted only when the record has a truthy
        ``context``) and a ``### Response:`` section, separated by blank
        lines.
        """
        sections = [f"### Instruction:\n{data['instruction']}"]

        if data.get("context"):
            sections.append(f"### Context:\n{data['context']}")

        sections.append(f"### Response:\n{data['response']}")

        return "\n\n".join(sections)

# Sample records in the Dolly layout: instruction / context / response / category.
dolly_examples = [
    # Open-ended question; the context field is left empty.
    {
        "instruction": "描述云计算的主要优势",
        "context": "",
        "response": "云计算的主要优势包括:1)弹性扩展,可根据需求快速调整资源;2)按需付费,降低前期投资;3)高可用性,确保服务持续运行;4)易于管理,减少维护负担。",
        "category": "general"
    },
    # Closed QA: the response is drawn from the supplied context.
    {
        "instruction": "根据以下信息回答问题:Python是一种高级编程语言。",
        "context": "Python由Guido van Rossum于1991年创建,以其简洁易读的语法著称。",
        "response": "Python是一种高级编程语言,由Guido van Rossum于1991年创建,以其简洁易读的语法著称。",
        "category": "closed_qa"
    },
    # Classification: the context holds the text to label, the response is the label.
    {
        "instruction": "将以下文本分类为技术、商业或教育类别",
        "context": "机器学习正在改变医疗诊断的方式",
        "response": "技术",
        "category": "classification"
    }
]

2. 数据加载与处理

import pandas as pd
from datasets import Dataset, DatasetDict
from typing import List, Dict

class DollyDataLoader:
    """Load, clean, split and summarise Dolly-style instruction records.

    Records are plain dicts with the keys ``instruction``, ``context``,
    ``response`` and ``category``.
    """

    def __init__(self):
        self.data_path = None  # path of the last file read by load_from_json
        self.dataset = None  # result of the last load_from_huggingface call

    def load_from_json(self, file_path: str) -> List[Dict]:
        """Load records from a JSON or JSON Lines file.

        The file is first parsed as one JSON document. If that fails, it is
        parsed as JSON Lines (one JSON value per non-blank line), which is
        the format the dataset is distributed in.

        Args:
            file_path: Path of a UTF-8 encoded file.

        Returns:
            The parsed content; for JSON Lines, a list with one entry per line.

        Raises:
            json.JSONDecodeError: If the file is neither valid JSON nor
                valid JSON Lines.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            raw = f.read()

        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            # Split on "\n" only: str.splitlines() would also break on
            # U+2028/U+2029, which may appear unescaped inside JSON strings.
            data = [json.loads(line) for line in raw.split("\n") if line.strip()]

        self.data_path = file_path
        return data

    def load_from_huggingface(self) -> "DatasetDict":
        """Download the Dolly dataset from the Hugging Face Hub.

        The result is also stored on ``self.dataset``. No ``split`` is
        requested, so ``load_dataset`` returns a ``DatasetDict``.
        """
        from datasets import load_dataset

        # The Hub repository id carries the "databricks-" prefix.
        self.dataset = load_dataset("databricks/databricks-dolly-15k")
        return self.dataset

    def preprocess_data(self, data: List[Dict]) -> List[Dict]:
        """Drop invalid records and normalise the text of the rest.

        Args:
            data: Raw records.

        Returns:
            A new list of cleaned records; records rejected by
            ``validate_item`` are skipped. A missing category becomes
            ``"general"``.
        """
        processed_data = []

        for item in data:
            if not self.validate_item(item):
                continue

            processed_data.append({
                # ``or ""`` also covers keys that are present but None.
                "instruction": self.clean_text(item.get("instruction") or ""),
                "context": self.clean_text(item.get("context") or ""),
                "response": self.clean_text(item.get("response") or ""),
                "category": item.get("category", "general"),
            })

        return processed_data

    def validate_item(self, item: Dict) -> bool:
        """Return True when a record has a usable instruction and response.

        Both fields must be strings; the instruction must be longer than
        3 characters and the response longer than 5.
        """
        instruction = item.get("instruction")
        response = item.get("response")
        return (
            isinstance(instruction, str)
            and isinstance(response, str)
            and len(instruction) > 3
            and len(response) > 5
        )

    def clean_text(self, text: str) -> str:
        """Strip surrounding whitespace and normalise line endings to ``\\n``."""
        text = text.strip()
        text = text.replace('\r\n', '\n').replace('\r', '\n')
        return text

    def create_splits(self, data: List[Dict],
                      train_ratio: float = 0.9,
                      val_ratio: float = 0.05) -> "DatasetDict":
        """Shuffle the records and cut them into train/validation/test sets.

        Args:
            data: Records to split; the input list itself is not reordered.
            train_ratio: Fraction of records assigned to the train split.
            val_ratio: Fraction assigned to the validation split; whatever
                remains becomes the test split.

        Returns:
            A ``DatasetDict`` with the keys ``train``, ``validation`` and
            ``test``.
        """
        import random

        # Shuffle a copy so the caller's list keeps its order.
        shuffled_data = list(data)
        random.shuffle(shuffled_data)

        total = len(shuffled_data)
        train_end = int(total * train_ratio)
        val_end = int(total * (train_ratio + val_ratio))

        return DatasetDict({
            "train": Dataset.from_list(shuffled_data[:train_end]),
            "validation": Dataset.from_list(shuffled_data[train_end:val_end]),
            "test": Dataset.from_list(shuffled_data[val_end:])
        })

    def filter_by_category(self, data: List[Dict],
                           category: str) -> List[Dict]:
        """Return the records whose ``category`` equals *category*."""
        return [item for item in data if item.get("category") == category]

    def get_statistics(self, data: List[Dict]) -> Dict:
        """Summarise category counts and text lengths.

        Args:
            data: Records to summarise; may be empty.

        Returns:
            A dict with ``total_samples``, ``categories`` (label -> count),
            the average instruction and response lengths, and the minimum
            and maximum instruction lengths. Lengths are in characters.
            For empty input every numeric value is zero.
        """
        categories = {}
        instruction_lengths = []
        response_lengths = []

        for item in data:
            cat = item.get("category", "unknown")
            categories[cat] = categories.get(cat, 0) + 1
            # ``or ""`` treats missing and None fields as empty text.
            instruction_lengths.append(len(item.get("instruction") or ""))
            response_lengths.append(len(item.get("response") or ""))

        total = len(instruction_lengths)
        return {
            "total_samples": len(data),
            "categories": categories,
            "avg_instruction_length": sum(instruction_lengths) / total if total else 0.0,
            "avg_response_length": sum(response_lengths) / total if total else 0.0,
            "min_instruction_length": min(instruction_lengths, default=0),
            "max_instruction_length": max(instruction_lengths, default=0)
        }

# Usage example: load a local JSON export and print its summary statistics.
loader = DollyDataLoader()
data = loader.load_from_json("dolly-15k.json")
stats = loader.get_statistics(data)
print(f"数据集大小: {stats['total_samples']}")
print(f"类别分布: {stats['categories']}")

与Alpaca数据集对比

class DatasetComparator:
    """Side-by-side comparison of the Dolly-15k and Alpaca-52k datasets."""

    def __init__(self):
        dolly_profile = {
            "name": "Dolly-15k",
            "size": 15000,
            "source": "Databricks",
            "annotation": "人工标注",
            "categories": 8,
            "context_support": True,
            "license": "CC-BY-SA-3.0",
        }
        alpaca_profile = {
            "name": "Alpaca-52k",
            "size": 52000,
            "source": "Stanford",
            "annotation": "GPT-3.5生成",
            "categories": 0,
            "context_support": False,
            "license": "CC-BY-NC-4.0",
        }
        # Keyed by short dataset id; each value is that dataset's profile.
        self.datasets = {"dolly": dolly_profile, "alpaca": alpaca_profile}

    def compare_datasets(self) -> pd.DataFrame:
        """Return the profiles as a table with one row per dataset."""
        # DataFrame(dict-of-dicts) puts datasets in columns; transpose so
        # each dataset becomes a row and each attribute a column.
        return pd.DataFrame(self.datasets).T

    def analyze_differences(self) -> Dict:
        """Return a qualitative comparison keyed by aspect, then by dataset."""
        aspect_notes = (
            ("quality", "人工标注,质量更高", "自动生成,可能存在噪声"),
            ("diversity", "类别更丰富,涵盖8种任务类型", "任务类型相对单一"),
            ("size", "15K样本,适合小规模微调", "52K样本,适合大规模训练"),
            ("context", "支持上下文输入", "主要支持指令-输出对"),
            ("licensing", "商业友好", "非商业用途"),
        )
        return {
            aspect: {"dolly": dolly_note, "alpaca": alpaca_note}
            for aspect, dolly_note, alpaca_note in aspect_notes
        }

    def recommend_usage(self, use_case: str) -> str:
        """Return the suggested dataset for *use_case*.

        Unknown use cases get a generic "choose by need" message.
        """
        by_use_case = {
            "production": "Dolly(人工标注,质量更高)",
            "research": "Alpaca(更大规模,适合实验)",
            "commercial": "Dolly(商业友好许可证)",
            "small_model": "Dolly(数据质量更重要)",
            "large_model": "Alpaca(数据规模更重要)",
        }
        if use_case in by_use_case:
            return by_use_case[use_case]
        return "根据具体需求选择"

数据生成方法

class DollyDataGenerator:
    """Produce synthetic Dolly-style records from string templates."""

    def __init__(self):
        # One entry per task type. Each holds ``str.format`` templates for
        # the instruction, context and response of a generated record.
        self.generation_templates = {
            "brainstorming": {
                "instruction": "头脑风暴关于{topic}的创意",
                "context_template": "我们正在讨论{topic}领域的创新",
                "response_template": "以下是关于{topic}的创意想法:\n1. {idea1}\n2. {idea2}\n3. {idea3}",
            },
            "summarization": {
                "instruction": "总结以下文本的主要观点",
                "context_template": "{text}",
                "response_template": "主要观点包括:{main_points}",
            },
            "classification": {
                "instruction": "将以下内容分类为{categories}",
                "context_template": "{content}",
                "response_template": "{category}",
            },
        }

    def generate_from_template(self, template_name: str,
                               variables: Dict) -> Dict:
        """Fill the named template with *variables*.

        Returns an empty dict when *template_name* is unknown; otherwise a
        record whose ``category`` is the template name.
        """
        spec = self.generation_templates.get(template_name)
        if not spec:
            return {}

        # (output field, template key) pairs, in output order.
        field_sources = (
            ("instruction", "instruction"),
            ("context", "context_template"),
            ("response", "response_template"),
        )
        record = {
            field: spec[source].format(**variables)
            for field, source in field_sources
        }
        record["category"] = template_name
        return record

    def generate_brainstorming_data(self, topics: List[str],
                                    count_per_topic: int = 3) -> List[Dict]:
        """Create *count_per_topic* brainstorming records for every topic."""
        records = []

        for topic in topics:
            for round_no in range(1, count_per_topic + 1):
                records.append(self.generate_from_template("brainstorming", {
                    "topic": topic,
                    "idea1": f"创新方法{round_no}用于{topic}",
                    "idea2": f"新的{topic}解决方案",
                    "idea3": f"改进{topic}的策略",
                }))

        return records

    def generate_classification_data(self,
                                     categories: List[str],
                                     contents: List[str]) -> List[Dict]:
        """Create one classification record per entry in *contents*.

        Simplified placeholder: every record is labelled with the first
        entry of *categories*.
        """
        return [
            self.generate_from_template("classification", {
                "categories": "、".join(categories),
                "content": content,
                "category": categories[0],
            })
            for content in contents
        ]

    def save_generated_data(self, data: List[Dict],
                            file_path: str):
        """Write *data* to *file_path* as an indented UTF-8 JSON array.

        Each item is reduced to the four Dolly fields; missing text fields
        become empty strings and a missing category becomes ``"general"``.
        """
        normalised = [
            {
                "instruction": entry.get("instruction", ""),
                "context": entry.get("context", ""),
                "response": entry.get("response", ""),
                "category": entry.get("category", "general"),
            }
            for entry in data
        ]

        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(normalised, f, ensure_ascii=False, indent=2)

# Usage example: generate 5 brainstorming records for each of three topics
# and write them to generated_dolly_data.json.
generator = DollyDataGenerator()
brainstorming_data = generator.generate_brainstorming_data(
    ["人工智能", "机器学习", "深度学习"],
    count_per_topic=5
)
generator.save_generated_data(brainstorming_data, "generated_dolly_data.json")

应用场景

class DollyDataApplication:
    """End-to-end helpers built on the Dolly loader and generator.

    Covers training-data preparation, quality checks, domain adaptation
    and a simple overlap-based model evaluation.
    """

    def __init__(self):
        self.loader = DollyDataLoader()
        self.generator = DollyDataGenerator()

    def prepare_training_pipeline(self, data_path: str) -> Dict:
        """Load, clean, split and summarise the data at *data_path*.

        Returns:
            A dict with ``splits`` (train/validation/test), ``statistics``
            and ``quality_check`` computed on the cleaned records.
        """
        raw_data = self.loader.load_from_json(data_path)
        processed_data = self.loader.preprocess_data(raw_data)
        splits = self.loader.create_splits(processed_data)
        stats = self.loader.get_statistics(processed_data)

        return {
            "splits": splits,
            "statistics": stats,
            "quality_check": self.quality_check(processed_data)
        }

    def quality_check(self, data: List[Dict]) -> Dict:
        """Run three sanity checks and report the problems found.

        The checks are: empty instructions, duplicated instructions and
        responses longer than 1000 characters.

        Returns:
            A dict with ``passed`` (True when no issue was found),
            ``issues`` (one message per failed check), ``total_checks``
            and ``passed_checks``.
        """
        issues = []

        # Check 1: missing or empty instructions.
        empty_instructions = sum(1 for d in data if not d.get("instruction"))
        if empty_instructions > 0:
            issues.append(f"发现 {empty_instructions} 个空指令")

        # Check 2: duplicates, counted as total minus distinct instructions.
        instructions = [d.get("instruction", "") for d in data]
        duplicates = len(instructions) - len(set(instructions))
        if duplicates > 0:
            issues.append(f"发现 {duplicates} 个重复指令")

        # Check 3: unusually long responses.
        long_responses = sum(1 for d in data if len(d.get("response", "")) > 1000)
        if long_responses > 0:
            issues.append(f"发现 {long_responses} 个过长响应")

        return {
            "passed": len(issues) == 0,
            "issues": issues,
            "total_checks": 3,
            "passed_checks": 3 - len(issues)
        }

    def create_domain_dataset(self, domain: str,
                              source_data: List[Dict]) -> List[Dict]:
        """Return copies of the records adapted to *domain*.

        Each instruction is prefixed with the domain and a ``domain`` key
        is added; the source records are not modified.
        """
        domain_data = []

        for item in source_data:
            domain_item = {
                "instruction": f"在{domain}领域,{item['instruction']}",
                "context": item.get("context", ""),
                "response": item["response"],
                "category": item.get("category", "general"),
                "domain": domain
            }
            domain_data.append(domain_item)

        return domain_data

    def evaluate_model_with_dolly(self, model, test_data: List[Dict]) -> Dict:
        """Score *model* on *test_data* overall and per category.

        Args:
            model: Object exposing ``generate(prompt) -> str``.
            test_data: Records with ``instruction`` and ``response`` and,
                optionally, ``context`` and ``category``.

        Returns:
            A dict with ``total``, ``correct``, ``category_scores``
            (category -> ``correct``/``total``/``accuracy``) and the overall
            ``accuracy``, which is ``0.0`` when *test_data* is empty.
        """
        results = {
            "total": len(test_data),
            "correct": 0,
            "category_scores": {}
        }

        for item in test_data:
            prompt = f"Instruction: {item['instruction']}\n"
            if item.get("context"):
                prompt += f"Context: {item['context']}\n"
            prompt += "Response:"

            model_response = model.generate(prompt)

            # Score once and reuse the verdict for both tallies.
            is_correct = self.evaluate_response(model_response, item["response"])

            category = item.get("category", "unknown")
            bucket = results["category_scores"].setdefault(
                category, {"correct": 0, "total": 0}
            )
            bucket["total"] += 1
            if is_correct:
                results["correct"] += 1
                bucket["correct"] += 1

        # Guard against empty input, which would otherwise divide by zero.
        total = results["total"]
        results["accuracy"] = results["correct"] / total if total else 0.0

        # Every bucket was created with at least one item, so total >= 1.
        for bucket in results["category_scores"].values():
            bucket["accuracy"] = bucket["correct"] / bucket["total"]

        return results

    def evaluate_response(self, model_response: str,
                          expected_response: str,
                          threshold: float = 0.3) -> bool:
        """Return True when the two responses overlap enough.

        Similarity is the Jaccard index of the lower-cased,
        whitespace-separated token sets. Text without whitespace (for
        example unsegmented Chinese) forms a single token, so it only
        matches when the two strings are identical after lower-casing.

        Args:
            model_response: Text produced by the model.
            expected_response: Reference text.
            threshold: Similarity that must be strictly exceeded.
        """
        model_words = set(model_response.lower().split())
        expected_words = set(expected_response.lower().split())

        union = len(model_words | expected_words)
        if union == 0:
            # Both texts are empty or whitespace-only.
            similarity = 0
        else:
            similarity = len(model_words & expected_words) / union
        return similarity > threshold

# Usage example: run the preparation pipeline on a local file and report
# whether every quality check passed.
app = DollyDataApplication()
training_pipeline = app.prepare_training_pipeline("dolly-15k.json")
print(f"训练数据质量: {training_pipeline['quality_check']['passed']}")

最佳实践

class DollyBestPractices:
    """Checklists and small helpers for fine-tuning on Dolly data."""

    def __init__(self):
        # Phase name -> ordered list of recommendations.
        self.practices = {
            "data_preparation": [
                "确保数据格式一致性",
                "进行数据清洗和去重",
                "验证数据质量",
                "创建合适的训练/验证分割",
            ],
            "training": [
                "使用适当的超参数",
                "监控训练过程",
                "进行早停防止过拟合",
                "保存检查点",
            ],
            "evaluation": [
                "使用多种评估指标",
                "进行人工评估",
                "测试不同场景",
                "比较基线模型",
            ],
        }

    def get_practice_guide(self, phase: str) -> List[str]:
        """Return the recommendations for *phase*, or ``[]`` if unknown."""
        try:
            return self.practices[phase]
        except KeyError:
            return []

    def create_training_config(self,
                               dataset_size: int,
                               model_size: str) -> Dict:
        """Return the hyper-parameter preset for *model_size*.

        Unknown sizes fall back to the ``"medium"`` preset. *dataset_size*
        is accepted but does not influence the result.
        """
        # Columns: size, epochs, batch_size, learning_rate, warmup_steps.
        preset_rows = (
            ("small", 3, 8, 2e-5, 100),
            ("medium", 2, 16, 1e-5, 200),
            ("large", 1, 32, 5e-6, 500),
        )
        presets = {
            size: {
                "epochs": epochs,
                "batch_size": batch_size,
                "learning_rate": learning_rate,
                "warmup_steps": warmup_steps,
            }
            for size, epochs, batch_size, learning_rate, warmup_steps in preset_rows
        }

        if model_size in presets:
            return presets[model_size]
        return presets["medium"]

    def monitor_training(self,
                         training_logs: List[Dict]) -> Dict:
        """Summarise a sequence of training log entries.

        Each entry may carry ``loss``, ``accuracy`` and ``learning_rate``;
        missing values count as 0.

        Returns:
            A dict with ``final_loss``, ``final_accuracy``, ``avg_loss``
            and ``convergence``. The numeric values are 0 for an empty log.
        """
        # Read all three metrics in a single pass over the log.
        rows = [
            (entry.get("loss", 0), entry.get("accuracy", 0), entry.get("learning_rate", 0))
            for entry in training_logs
        ]
        loss_series = [row[0] for row in rows]
        accuracy_series = [row[1] for row in rows]

        if loss_series:
            final_loss = loss_series[-1]
            mean_loss = sum(loss_series) / len(loss_series)
        else:
            final_loss = 0
            mean_loss = 0
        final_accuracy = accuracy_series[-1] if accuracy_series else 0

        return {
            "final_loss": final_loss,
            "final_accuracy": final_accuracy,
            "avg_loss": mean_loss,
            "convergence": self.check_convergence(loss_series),
        }

    def check_convergence(self, losses: List[float],
                          patience: int = 5) -> bool:
        """Return True when the last *patience* losses never increase.

        Returns False when fewer than *patience* losses are available.
        """
        if len(losses) < patience:
            return False

        window = losses[-patience:]
        for earlier, later in zip(window, window[1:]):
            if not earlier >= later:
                return False
        return True

# Usage example: look up the training preset for a medium-sized model and print it.
practices = DollyBestPractices()
training_config = practices.create_training_config(15000, "medium")
print(f"训练配置: {training_config}")

总结

Dolly数据集为LLM指令微调提供了高质量的开源资源。通过人工标注和丰富的类别支持,它成为训练可靠AI助手的重要数据集。结合最佳实践,可以有效提升模型性能。