← 返回首页
🧠

数据标注:构建高质量训练数据

📂 llm ⏱ 4 min 620 words

---
title: "数据标注:构建高质量训练数据"
description: "掌握数据标注的方法论、工具选择和质量控制,构建高质量的LLM训练数据"
tags: ["数据标注", "人工标注", "标注工具", "质量控制"]
category: "llm"
icon: "🧠"
---

数据标注:构建高质量训练数据

数据标注概述

数据标注是将原始数据转化为可用于模型训练的结构化数据的过程。在LLM时代,高质量的标注数据是构建优秀模型的关键。数据标注包括指令标注、偏好标注、分类标注等多种类型。

数据标注的核心目标:为模型训练提供准确、一致且覆盖多样任务的高质量数据。

标注类型

指令标注

# Instruction-annotation record format.
# One record pairs an instruction (plus an input text) with the expected
# output; "metadata" carries descriptive tags for the sample.
instruction_annotation = {
    "instruction": "将以下英文翻译成中文",
    "input": "Machine learning is a subset of artificial intelligence.",
    "output": "机器学习是人工智能的一个子集。",
    "metadata": {
        "category": "翻译",
        "difficulty": "easy",
        "language_pair": "en-zh"
    }
}

# Template for batch annotation.
# "instruction_template" contains the {source_lang} / {target_lang}
# placeholders to be filled in per task.
# NOTE(review): "input_field" / "output_field" presumably name the record
# fields holding the source and translated text — confirm against the consumer.
annotation_template = {
    "template_id": "translation_001",
    "instruction_template": "将以下{source_lang}文本翻译成{target_lang}",
    "input_field": "source_text",
    "output_field": "translated_text"
}

偏好标注

# Preference-annotation record format.
# For a single prompt, "chosen" holds the preferred response and "rejected"
# the dispreferred one; "annotation_notes" records the annotator's rationale.
preference_annotation = {
    "prompt": "解释什么是深度学习",
    "chosen": "深度学习是机器学习的一个分支,使用多层神经网络从数据中学习特征表示...",
    "rejected": "深度学习就是很多层的神经网络。",
    "annotation_notes": "chosen版本更详细、更准确"
}

分类标注

# Text-classification annotation record.
# "label" is the primary class, "sub_labels" lists additional tags.
# NOTE(review): "confidence" is presumably the annotator's confidence in
# [0, 1] — confirm with the annotation guidelines.
classification_annotation = {
    "text": "这款产品质量很好,值得购买",
    "label": "positive",
    "confidence": 0.95,
    "sub_labels": ["产品评价", "正面情感"]
}

标注工具

Label Studio

# Label Studio labeling configuration (XML) for the instruction-annotation
# task: free-text areas for instruction / input / output / notes and a
# single category choice.
# NOTE(review): Label Studio control tags (TextArea, Choices) normally need a
# toName attribute that points at an object tag such as <Text>; here the
# TextArea tags have no toName and Choices points at a TextArea. Confirm that
# this configuration validates in Label Studio before using it.
label_config = """
<View>
  <Header value="指令标注任务"/>
  <TextArea name="instruction" placeholder="输入指令"/>
  <TextArea name="input" placeholder="输入内容(可选)"/>
  <TextArea name="output" placeholder="输入期望的输出"/>
  <Choices name="category" toName="instruction">
    <Choice value="翻译"/>
    <Choice value="摘要"/>
    <Choice value="问答"/>
    <Choice value="创作"/>
  </Choices>
  <TextArea name="notes" placeholder="备注"/>
</View>
"""

# Start the annotation service (shell command):
# label-studio start --port 8080

配置标注任务

import json

def create_labeling_project(config):
    """Build a labeling-project configuration and save it as a JSON file.

    The file is written to ``<project_name>_config.json`` in the current
    working directory.

    Args:
        config: Mapping with the keys ``project_name``, ``description``,
            ``label_config`` and ``instructions``.

    Returns:
        The project configuration dict that was written to disk.

    Raises:
        KeyError: If one of the required keys is missing from ``config``.
    """
    project_config = {
        "title": config["project_name"],
        "description": config["description"],
        "label_config": config["label_config"],
        "expert_instruction": config["instructions"]
    }

    # Write UTF-8 explicitly and keep non-ASCII text unescaped so the saved
    # file is readable and does not depend on the platform default encoding
    # (same convention as export_annotations in this file).
    output_path = f"{config['project_name']}_config.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(project_config, f, ensure_ascii=False, indent=2)

    return project_config

标注流程设计

标注指南

# Annotation guidelines handed to annotators (Markdown text, kept verbatim).
annotation_guidelines = """
# 数据标注指南

## 1. 指令标注要求

### 指令质量标准
- 清晰明确:指令应该清晰表达任务要求
- 完整性:包含完成任务所需的所有信息
- 可行性:任务应该是可完成的
- 多样性:覆盖不同类型和难度的任务

### 输出质量标准
- 准确性:回答应该准确无误
- 完整性:回答应该完整覆盖问题
- 简洁性:避免冗余信息
- 格式规范:符合要求的输出格式

## 2. 偏好标注要求

### 选择标准
- 有用性:哪个回答更有帮助
- 准确性:哪个回答更准确
- 安全性:哪个回答更安全
- 详细程度:哪个回答更详细

## 3. 注意事项
- 保持客观公正
- 不要添加个人观点
- 遇到问题及时反馈
"""

# Save the guidelines. The text is non-ASCII, so the encoding is set
# explicitly instead of relying on the platform default, which may be unable
# to encode Chinese characters.
with open("annotation_guidelines.txt", "w", encoding="utf-8") as f:
    f.write(annotation_guidelines)

标注流程

class AnnotationPipeline:
    """Manage the flow of an annotation project.

    Items go through ``prepare_data`` (wrapped into task records),
    ``assign_to_annotators`` (handed out in batches) and
    ``collect_annotation`` (results attached to the matching record).
    """

    def __init__(self, project_name):
        """Create an empty pipeline for the project called ``project_name``."""
        self.project_name = project_name
        # Task records produced by the latest prepare_data() call.
        self.annotations = []
        self.quality_checks = []

    def prepare_data(self, raw_data):
        """Wrap raw items into pending task records.

        Each record has a sequential ``id`` (starting at 0), the original
        item under ``data``, ``status`` set to ``"pending"`` and an empty
        ``annotations`` list.

        The records are also stored on ``self.annotations`` so that
        ``collect_annotation`` can look them up by id; a later call replaces
        the previously stored records.

        Returns:
            The list of prepared task records.
        """
        prepared = [
            {
                "id": index,
                "data": item,
                "status": "pending",
                "annotations": []
            }
            for index, item in enumerate(raw_data)
        ]
        self.annotations = prepared
        return prepared

    def assign_to_annotators(self, data, annotators, samples_per_annotator):
        """Hand out pending records to annotators in batches.

        Every annotator receives up to ``samples_per_annotator`` records that
        are still pending; assigned records are mutated in place (``status``
        becomes ``"assigned"`` and ``annotator`` is set).

        Returns:
            Dict mapping each annotator to the list of records assigned to
            them (possibly empty when no pending records remain).
        """
        assignments = {}
        for annotator in annotators:
            # Re-filter on every round: records assigned in earlier rounds
            # are no longer pending and must not be handed out twice.
            pending = [d for d in data if d["status"] == "pending"]
            assigned = pending[:samples_per_annotator]
            for item in assigned:
                item["status"] = "assigned"
                item["annotator"] = annotator
            assignments[annotator] = assigned
        return assignments

    def collect_annotation(self, item_id, annotator, annotation):
        """Attach an annotation result to the record with id ``item_id``.

        The result is appended to the record's ``annotations`` list together
        with the annotator and the local time of collection. Nothing happens
        when no stored record has the given id.
        """
        # Imported locally: the snippet has no module-level datetime import.
        from datetime import datetime

        for item in self.annotations:
            if item["id"] == item_id:
                item["annotations"].append({
                    "annotator": annotator,
                    "annotation": annotation,
                    "timestamp": datetime.now()
                })
                break

    def calculate_agreement(self):
        """Compute annotation agreement (not implemented; returns ``None``).

        Intended to use Cohen's Kappa or Fleiss' Kappa.
        """
        pass

质量控制

一致性检查

from sklearn.metrics import cohen_kappa_score, f1_score

def calculate_inter_annotator_agreement(annotations):
    """Compute pairwise Cohen's Kappa between annotators.

    Args:
        annotations: Iterable of dicts, each with an ``annotator`` and a
            ``label`` key.

    Returns:
        Dict mapping ``"<annotator_a>_<annotator_b>"`` to the Kappa score of
        that pair. Annotators are paired in order of first appearance, so the
        keys are stable from run to run. Empty when fewer than two annotators
        are present.
    """
    from collections import defaultdict
    from itertools import combinations

    # Group labels per annotator in one pass; dict insertion order keeps the
    # annotators in first-seen order (a set would make key names vary
    # between runs).
    labels_by_annotator = defaultdict(list)
    for record in annotations:
        labels_by_annotator[record["annotator"]].append(record["label"])

    agreements = {}
    for (name_a, labels_a), (name_b, labels_b) in combinations(
        labels_by_annotator.items(), 2
    ):
        # Truncate to the shorter sequence so both have equal length.
        # NOTE(review): this assumes both annotators labelled the same items
        # in the same order — verify against how annotations are collected.
        shared = min(len(labels_a), len(labels_b))
        kappa = cohen_kappa_score(labels_a[:shared], labels_b[:shared])
        agreements[f"{name_a}_{name_b}"] = kappa

    return agreements

质量评估

def evaluate_annotation_quality(annotations, ground_truth=None):
    """Evaluate the quality of a list of annotations.

    Args:
        annotations: List of dicts; the ``label`` key holds the annotation.
        ground_truth: Optional list of dicts with a ``label`` key, aligned
            position by position with ``annotations``.

    Returns:
        Dict with:
        ``total`` – number of annotations;
        ``completion_rate`` – share of annotations with a label (``None`` and
        the empty string count as missing, ``0`` / ``False`` are valid);
        ``accuracy`` – matches with ``ground_truth`` divided by its length
        (0 when no ground truth is given);
        ``consistency`` – 1 minus the normalized entropy of the label
        distribution (1 when all labels agree, 0 for a single annotation).
        All metrics are 0 for empty input.
    """
    import math
    from collections import Counter

    total = len(annotations)
    metrics = {
        "total": total,
        "completion_rate": 0,
        "accuracy": 0,
        "consistency": 0
    }
    # Nothing to measure; also avoids dividing by zero below.
    if total == 0:
        return metrics

    labels = [a.get("label") for a in annotations]

    # Completion rate: a plain truthiness test would treat the legitimate
    # class label 0 as missing, so test for None / "" explicitly.
    completed = sum(1 for label in labels if label not in (None, ""))
    metrics["completion_rate"] = completed / total

    # Accuracy (only when ground truth is available).
    if ground_truth:
        correct = sum(
            1 for label, truth in zip(labels, ground_truth)
            if label == truth["label"]
        )
        metrics["accuracy"] = correct / len(ground_truth)

    # Consistency: normalized entropy of the label distribution.
    if total > 1:
        counts = Counter(labels)
        if len(counts) > 1:
            entropy = -sum(
                (c / total) * math.log2(c / total) for c in counts.values()
            )
            metrics["consistency"] = 1 - entropy / math.log2(len(counts))
        else:
            metrics["consistency"] = 1

    return metrics

质量问题处理

class QualityController:
    """Rule-based quality checks for individual annotations and batches."""

    def __init__(self, threshold=0.8, min_time_spent=5):
        """Create a controller.

        Args:
            threshold: Quality threshold stored on the instance (not used by
                the checks in this class).
            min_time_spent: Annotations whose ``time_spent`` is below this
                value are flagged as ``"too_fast"``. Defaults to 5.
        """
        self.threshold = threshold
        self.min_time_spent = min_time_spent
        self.issues = []

    def check_annotation(self, annotation, reference=None):
        """Check a single annotation and return the list of issue codes.

        Possible codes: ``"missing_label"``, ``"inconsistent_with_reference"``
        and ``"too_fast"``. An empty list means no issue was found.
        """
        issues = []
        label = annotation.get("label")

        # Completeness: only None / "" count as missing, so that a valid
        # class label such as 0 is not reported.
        if label is None or label == "":
            issues.append("missing_label")

        # Consistency with the reference annotation, when one is given.
        if reference and label != reference.get("label"):
            issues.append("inconsistent_with_reference")

        # Very short annotation time may indicate low quality. A missing
        # "time_spent" is treated as 0 and therefore flagged as well.
        if annotation.get("time_spent", 0) < self.min_time_spent:
            issues.append("too_fast")

        return issues

    def review_batch(self, annotations, references=None):
        """Check every annotation of a batch.

        ``references`` is matched to ``annotations`` by position; annotations
        without a corresponding reference are checked without one.

        Returns:
            List of dicts with ``annotation_id``, ``issues`` and
            ``needs_review`` (True when at least one issue was found).
        """
        results = []
        for index, ann in enumerate(annotations):
            # Guard the lookup so a shorter reference list does not raise.
            has_ref = bool(references) and index < len(references)
            ref = references[index] if has_ref else None
            issues = self.check_annotation(ann, ref)
            results.append({
                "annotation_id": ann.get("id"),
                "issues": issues,
                "needs_review": len(issues) > 0
            })

        return results

标注数据导出

def export_annotations(annotations, format="json"):
    """Export annotations to a file in the current working directory.

    Args:
        annotations: List of annotation dicts.
        format: ``"json"`` (``annotations.json``), ``"csv"``
            (``annotations.csv``) or ``"jsonl"`` (``annotations.jsonl``).

    Raises:
        ValueError: If ``format`` is not one of the supported values; no
            file is written in that case.
    """
    if format == "json":
        with open("annotations.json", "w", encoding="utf-8") as f:
            json.dump(annotations, f, ensure_ascii=False, indent=2)

    elif format == "csv":
        import csv

        # Collect the columns from every row (first-seen order) so that an
        # empty list or rows with differing keys do not crash the export;
        # missing values are written as empty cells.
        fieldnames = list(dict.fromkeys(key for ann in annotations for key in ann))
        with open("annotations.csv", "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            if fieldnames:
                writer.writeheader()
            writer.writerows(annotations)

    elif format == "jsonl":
        with open("annotations.jsonl", "w", encoding="utf-8") as f:
            for ann in annotations:
                f.write(json.dumps(ann, ensure_ascii=False) + "\n")

    else:
        # Previously an unknown format wrote nothing but still reported
        # success; fail loudly instead.
        raise ValueError(f"unsupported export format: {format!r}")

    print(f"导出 {len(annotations)} 条标注数据")

高质量的数据标注是构建优秀LLM的基础,需要投入足够的时间和资源来确保标注质量。