数据标注:构建高质量训练数据
---
title: "数据标注:构建高质量训练数据"
description: "掌握数据标注的方法论、工具选择和质量控制,构建高质量的LLM训练数据"
tags: ["数据标注", "人工标注", "标注工具", "质量控制"]
category: "llm"
icon: "🧠"
---
数据标注:构建高质量训练数据
数据标注概述
数据标注是将原始数据转化为可用于模型训练的结构化数据的过程。在LLM时代,高质量的标注数据是构建优秀模型的关键。数据标注包括指令标注、偏好标注、分类标注等多种类型。
数据标注的核心目标:
- 准确性:标注结果准确反映真实情况
- 一致性:不同标注者的结果保持一致
- 完整性:覆盖所有需要标注的内容
- 效率:在保证质量的前提下提高标注速度
标注类型
指令标注
# Instruction-annotation record: one instruction/input/output example
# together with descriptive metadata about the task.
instruction_annotation = {
    "instruction": "将以下英文翻译成中文",
    "input": "Machine learning is a subset of artificial intelligence.",
    "output": "机器学习是人工智能的一个子集。",
    "metadata": {
        "category": "翻译",
        "difficulty": "easy",
        "language_pair": "en-zh",
    },
}

# Batch-annotation template: the instruction text carries
# {source_lang}/{target_lang} placeholders, and the two *_field entries
# name the fields holding the source text and its translation.
annotation_template = {
    "template_id": "translation_001",
    "instruction_template": "将以下{source_lang}文本翻译成{target_lang}",
    "input_field": "source_text",
    "output_field": "translated_text",
}
偏好标注
# Preference-annotation record: a prompt with the preferred ("chosen") and
# dispreferred ("rejected") responses, plus the annotator's rationale.
preference_annotation = {
    "prompt": "解释什么是深度学习",
    "chosen": "深度学习是机器学习的一个分支,使用多层神经网络从数据中学习特征表示...",
    "rejected": "深度学习就是很多层的神经网络。",
    "annotation_notes": "chosen版本更详细、更准确",
}
分类标注
# Text-classification record: the text, its primary label, a confidence
# value for that label, and finer-grained sub-labels.
classification_annotation = {
    "text": "这款产品质量很好,值得购买",
    "label": "positive",
    "confidence": 0.95,
    "sub_labels": [
        "产品评价",
        "正面情感",
    ],
}
标注工具
Label Studio
# Label Studio labeling-interface configuration for the instruction-annotation
# task: free-text areas for instruction, input, output and notes, plus a
# single category choice group.
# NOTE(review): the TextArea tags carry no toName attribute and the Choices
# tag points at a TextArea rather than a data/object tag — confirm this
# config validates in the Label Studio version in use.
label_config = """
<View>
<Header value="指令标注任务"/>
<TextArea name="instruction" placeholder="输入指令"/>
<TextArea name="input" placeholder="输入内容(可选)"/>
<TextArea name="output" placeholder="输入期望的输出"/>
<Choices name="category" toName="instruction">
<Choice value="翻译"/>
<Choice value="摘要"/>
<Choice value="问答"/>
<Choice value="创作"/>
</Choices>
<TextArea name="notes" placeholder="备注"/>
</View>
"""
# Shell command to start the annotation service:
# label-studio start --port 8080
配置标注任务
import json
def create_labeling_project(config):
    """Build a labeling-project configuration and save it as a JSON file.

    The file is written to ``<project_name>_config.json`` in the current
    working directory.

    Args:
        config: Mapping that provides the keys ``project_name``,
            ``description``, ``label_config`` and ``instructions``.

    Returns:
        The project configuration dict that was written to disk.

    Raises:
        KeyError: If ``config`` lacks one of the required keys.
    """
    project_config = {
        "title": config["project_name"],
        "description": config["description"],
        "label_config": config["label_config"],
        "expert_instruction": config["instructions"],
    }

    # Write explicit UTF-8 and keep non-ASCII characters unescaped so the
    # saved file is human-readable and independent of the platform locale.
    output_path = f"{config['project_name']}_config.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(project_config, f, ensure_ascii=False, indent=2)

    return project_config
标注流程设计
标注指南
# Annotation guidelines handed to annotators, kept as one Markdown-formatted
# string so it can be written to disk verbatim.
annotation_guidelines = """
# 数据标注指南
## 1. 指令标注要求
### 指令质量标准
- 清晰明确:指令应该清晰表达任务要求
- 完整性:包含完成任务所需的所有信息
- 可行性:任务应该是可完成的
- 多样性:覆盖不同类型和难度的任务
### 输出质量标准
- 准确性:回答应该准确无误
- 完整性:回答应该完整覆盖问题
- 简洁性:避免冗余信息
- 格式规范:符合要求的输出格式
## 2. 偏好标注要求
### 选择标准
- 有用性:哪个回答更有帮助
- 准确性:哪个回答更准确
- 安全性:哪个回答更安全
- 详细程度:哪个回答更详细
## 3. 注意事项
- 保持客观公正
- 不要添加个人观点
- 遇到问题及时反馈
"""

# Save the guidelines. The text is Chinese, so the encoding is pinned to
# UTF-8 instead of relying on the platform default, which may be unable to
# encode it.
with open("annotation_guidelines.txt", "w", encoding="utf-8") as f:
    f.write(annotation_guidelines)
标注流程
class AnnotationPipeline:
    """Manage an annotation workflow: prepare items, assign them, collect results."""

    def __init__(self, project_name):
        """Create an empty pipeline for the project called ``project_name``."""
        self.project_name = project_name
        # Prepared items registered by prepare_data(); collect_annotation()
        # looks items up here by their "id".
        self.annotations = []
        # Not read or written by any method of this class.
        self.quality_checks = []

    def prepare_data(self, raw_data):
        """Wrap raw items into pending annotation tasks and register them.

        Each task is a dict with ``id``, ``data``, ``status`` ("pending")
        and an empty ``annotations`` list. Tasks are appended to
        ``self.annotations`` so that ``collect_annotation`` can find them;
        ids continue from the number of tasks already registered so they
        stay unique across calls.

        Args:
            raw_data: Iterable of raw items to annotate.

        Returns:
            The list of newly created task dicts.
        """
        first_id = len(self.annotations)
        prepared = [
            {
                "id": first_id + offset,
                "data": item,
                "status": "pending",
                "annotations": [],
            }
            for offset, item in enumerate(raw_data)
        ]
        self.annotations.extend(prepared)
        return prepared

    def assign_to_annotators(self, data, annotators, samples_per_annotator):
        """Hand out pending tasks to annotators.

        Annotators are served in order; each receives up to
        ``samples_per_annotator`` tasks that are still pending. Assigned
        tasks are mutated in place: ``status`` becomes "assigned" and an
        ``annotator`` key is set.

        Args:
            data: List of task dicts as produced by ``prepare_data``.
            annotators: Iterable of annotator identifiers (used as dict keys).
            samples_per_annotator: Maximum number of tasks per annotator.

        Returns:
            Dict mapping each annotator to the list of tasks assigned to them.
        """
        assignments = {}
        for annotator in annotators:
            # Re-filter on every pass: tasks given to earlier annotators are
            # no longer "pending" and must not be handed out twice.
            assigned = [d for d in data if d["status"] == "pending"][:samples_per_annotator]
            for item in assigned:
                item["status"] = "assigned"
                item["annotator"] = annotator
            assignments[annotator] = assigned
        return assignments

    def collect_annotation(self, item_id, annotator, annotation):
        """Record an annotation result on the registered task ``item_id``.

        Args:
            item_id: Id of the task, as assigned by ``prepare_data``.
            annotator: Identifier of the annotator who produced the result.
            annotation: The annotation payload to store.

        Returns:
            True if a matching task was found and updated, False otherwise.
        """
        # Imported locally so the method works without a file-level import.
        from datetime import datetime

        for item in self.annotations:
            if item["id"] == item_id:
                item["annotations"].append({
                    "annotator": annotator,
                    "annotation": annotation,
                    "timestamp": datetime.now(),
                })
                return True
        return False

    def calculate_agreement(self):
        """Compute annotation agreement (not implemented; returns None)."""
        # TODO: use Cohen's Kappa or Fleiss' Kappa.
        pass
质量控制
一致性检查
from sklearn.metrics import cohen_kappa_score, f1_score
def calculate_inter_annotator_agreement(annotations):
    """Compute pairwise Cohen's kappa between annotators.

    Args:
        annotations: Iterable of dicts, each with an ``annotator`` key and a
            ``label`` key.

    Returns:
        Dict mapping ``"<annotator_a>_<annotator_b>"`` to the kappa score of
        that pair. Pairs are generated in order of each annotator's first
        appearance in ``annotations``, so keys are stable across runs.
    """
    from itertools import combinations

    # Group labels per annotator in a single pass. A dict keeps first-seen
    # order, unlike a set whose iteration order can vary between runs.
    labels_by_annotator = {}
    for record in annotations:
        labels_by_annotator.setdefault(record["annotator"], []).append(record["label"])

    agreements = {}
    for (first, first_labels), (second, second_labels) in combinations(
        labels_by_annotator.items(), 2
    ):
        # Truncate to the shorter list so both sequences have equal length.
        # NOTE(review): labels are paired purely by position; this assumes
        # both annotators labelled the same items in the same order — confirm
        # against the caller.
        shared = min(len(first_labels), len(second_labels))
        kappa = cohen_kappa_score(first_labels[:shared], second_labels[:shared])
        agreements[f"{first}_{second}"] = kappa
    return agreements
质量评估
def evaluate_annotation_quality(annotations, ground_truth=None):
    """Summarize the quality of a list of annotations.

    Args:
        annotations: List of annotation dicts; the ``label`` key is read
            with ``.get`` and may be absent.
        ground_truth: Optional list of dicts with a ``label`` key, aligned
            by position with ``annotations``.

    Returns:
        Dict with:
            ``total``: number of annotations.
            ``completion_rate``: share of annotations with a truthy label.
            ``accuracy``: share of ground-truth entries whose label matches
                the annotation at the same position (0 without ground truth).
            ``consistency``: 1 minus the normalized entropy of the label
                distribution; 1 when all labels are identical, 0 when fewer
                than two annotations are given.
        For empty ``annotations`` every metric is 0.
    """
    from collections import Counter
    import math

    total = len(annotations)
    metrics = {
        "total": total,
        "completion_rate": 0,
        "accuracy": 0,
        "consistency": 0,
    }
    # Nothing to measure; also avoids dividing by zero below.
    if total == 0:
        return metrics

    # Completion rate.
    completed = sum(1 for a in annotations if a.get("label"))
    metrics["completion_rate"] = completed / total

    # Accuracy (only when ground truth is supplied).
    if ground_truth:
        correct = sum(
            1
            for a, g in zip(annotations, ground_truth)
            if a.get("label") == g["label"]
        )
        metrics["accuracy"] = correct / len(ground_truth)

    # Consistency: entropy of the label distribution, normalized by the
    # maximum entropy for the number of distinct labels observed.
    if total > 1:
        counts = Counter(a.get("label") for a in annotations)
        if len(counts) > 1:
            entropy = -sum(
                (c / total) * math.log2(c / total) for c in counts.values()
            )
            metrics["consistency"] = 1 - entropy / math.log2(len(counts))
        else:
            metrics["consistency"] = 1

    return metrics
质量问题处理
class QualityController:
    """Rule-based quality checks for individual annotations and batches."""

    def __init__(self, threshold=0.8, min_time_spent=5):
        """Create a controller.

        Args:
            threshold: Stored on the instance; not used by the checks in
                this class.
            min_time_spent: Annotations whose ``time_spent`` is below this
                value are flagged as "too_fast". Defaults to 5.
        """
        self.threshold = threshold
        self.min_time_spent = min_time_spent
        # Not read or written by any method of this class.
        self.issues = []

    def check_annotation(self, annotation, reference=None):
        """Check a single annotation and return the list of issue codes.

        Possible codes: "missing_label", "inconsistent_with_reference",
        "too_fast".

        Args:
            annotation: Dict read via ``.get`` for ``label`` and ``time_spent``.
            reference: Optional dict whose ``label`` is compared against the
                annotation's label.

        Returns:
            List of issue code strings; empty when no issue was found.
        """
        issues = []

        # Completeness: a label must be present and truthy.
        if not annotation.get("label"):
            issues.append("missing_label")

        # Consistency with the reference, when one is supplied.
        if reference and annotation.get("label") != reference.get("label"):
            issues.append("inconsistent_with_reference")

        # Very fast annotations may indicate low quality. A missing
        # time_spent is treated as 0 and therefore flagged as well.
        if annotation.get("time_spent", 0) < self.min_time_spent:
            issues.append("too_fast")

        return issues

    def review_batch(self, annotations, references=None):
        """Run ``check_annotation`` over a batch.

        Args:
            annotations: Sequence of annotation dicts.
            references: Optional sequence of reference dicts aligned by
                position; annotations beyond its end are checked without a
                reference.

        Returns:
            List of dicts with ``annotation_id``, ``issues`` and
            ``needs_review`` for each annotation, in input order.
        """
        results = []
        for index, ann in enumerate(annotations):
            # Guard against a references list shorter than the batch.
            has_reference = bool(references) and index < len(references)
            ref = references[index] if has_reference else None
            issues = self.check_annotation(ann, ref)
            results.append({
                "annotation_id": ann.get("id"),
                "issues": issues,
                "needs_review": len(issues) > 0,
            })
        return results
标注数据导出
def export_annotations(annotations, format="json"):
    """Export annotations to a file in the current working directory.

    Args:
        annotations: List of annotation dicts.
        format: One of "json" (``annotations.json``), "csv"
            (``annotations.csv``) or "jsonl" (``annotations.jsonl``).

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    if format == "json":
        with open("annotations.json", "w", encoding="utf-8") as f:
            json.dump(annotations, f, ensure_ascii=False, indent=2)
    elif format == "csv":
        import csv

        with open("annotations.csv", "w", newline="", encoding="utf-8") as f:
            # Column names come from the first record; with no records there
            # is no header to derive, so the file is left empty.
            if annotations:
                writer = csv.DictWriter(f, fieldnames=list(annotations[0].keys()))
                writer.writeheader()
                writer.writerows(annotations)
    elif format == "jsonl":
        with open("annotations.jsonl", "w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(ann, ensure_ascii=False) + "\n" for ann in annotations
            )
    else:
        # Fail loudly instead of reporting a successful export that never
        # wrote anything.
        raise ValueError(f"unsupported export format: {format!r}")

    print(f"导出 {len(annotations)} 条标注数据")
高质量的数据标注是构建优秀LLM的基础,需要投入足够的时间和资源来确保标注质量。