# 序列化深度解析:pickle、marshal、msgpack与protobuf
序列化是将Python对象转换为可存储或传输格式的过程,反序列化则是其逆过程。选择合适的序列化方案对系统性能、安全性和可维护性都有重要影响。
## pickle:Python原生序列化
pickle是Python内置的序列化模块,可以处理几乎所有的Python对象:
import pickle
import json
from dataclasses import dataclass
from typing import List, Optional
@dataclass
class User:
    """A user record used as sample data by the serialization examples."""

    name: str
    age: int
    email: str
    # Optional settings mapping (e.g. {"theme": "dark"}); defaults to None.
    preferences: Optional[dict] = None
@dataclass
class Database:
    """A container holding a list of ``User`` records."""

    users: List[User]

    def add_user(self, user: User) -> None:
        """Append ``user`` to ``self.users`` in place."""
        self.users.append(user)
def pickle_example():
    """Demonstrate pickle round-trips in memory, via a file, and per protocol.

    Prints the size of the pickled payload, the number of users recovered
    from it, and the payload size for every protocol from 0 through
    ``pickle.HIGHEST_PROTOCOL``.

    Side effect: writes ``data.pkl`` in the current working directory.
    """
    # Build a nested object: a Database holding two User records.
    db = Database(users=[
        User("Alice", 30, "alice@example.com", {"theme": "dark"}),
        User("Bob", 25, "bob@example.com", {"theme": "light"}),
    ])

    # Serialize to bytes.
    pickled = pickle.dumps(db)
    print(f"Pickle序列化后大小: {len(pickled)} 字节")

    # Deserialize from bytes.
    restored_db = pickle.loads(pickled)
    print(f"恢复的用户数量: {len(restored_db.users)}")

    # Round-trip through a file.
    with open("data.pkl", "wb") as f:
        pickle.dump(db, f)
    with open("data.pkl", "rb") as f:
        # Only exercises loading from a file; the result is not needed.
        pickle.load(f)

    # Compare payload sizes across all supported protocol versions.
    for protocol in range(pickle.HIGHEST_PROTOCOL + 1):
        size = len(pickle.dumps(db, protocol=protocol))
        print(f"协议版本 {protocol}: {size} 字节")
class CustomPickler:
    """Pickle helpers: protocol-defaulting dumps and allowlist-restricted loads."""

    @staticmethod
    def safe_dumps(obj, protocol=None):
        """Pickle ``obj`` and return the resulting bytes.

        Args:
            obj: Any picklable object.
            protocol: Pickle protocol number. When ``None``,
                ``pickle.HIGHEST_PROTOCOL`` is used.

        Returns:
            The pickled representation as ``bytes``.
        """
        if protocol is None:
            protocol = pickle.HIGHEST_PROTOCOL
        return pickle.dumps(obj, protocol=protocol)

    @staticmethod
    def safe_loads(data: bytes):
        """Unpickle ``data`` while refusing any global not on the allowlist.

        Only a few builtin container types and the ``User`` / ``Database``
        classes from ``__main__`` may be resolved by the unpickler.

        Args:
            data: Bytes produced by pickle.

        Returns:
            The reconstructed object.

        Raises:
            pickle.UnpicklingError: If the payload references a global that
                is not on the allowlist.
        """
        # Imported locally so this snippet does not depend on a module-level
        # ``import io``.
        import io

        # Module name -> names that may be resolved from that module.
        allowed_classes = {
            'builtins': {'dict', 'list', 'tuple', 'set', 'frozenset'},
            '__main__': {'User', 'Database'},
        }

        class RestrictedUnpickler(pickle.Unpickler):
            """Unpickler whose global lookup is limited to ``allowed_classes``."""

            def find_class(self, module, name):
                if name in allowed_classes.get(module, ()):
                    return super().find_class(module, name)
                raise pickle.UnpicklingError(
                    f"不允许反序列化 {module}.{name}"
                )

        return RestrictedUnpickler(io.BytesIO(data)).load()
## marshal:Python内部序列化
marshal是Python内部使用的序列化格式,主要用于.pyc文件:
import marshal
import dis
import types
def marshal_example():
    """Round-trip a compiled code object through marshal, run it, and disassemble it.

    Prints the marshalled size, the value the executed code stored in
    ``result``, and the disassembly of the restored code object.
    """
    # Compile a small two-statement program into a code object.
    code = compile(
        'result = x + y\nprint(result)',
        '<string>',
        'exec'
    )

    # Serialize the code object.
    marshaled = marshal.dumps(code)
    print(f"Marshal序列化后大小: {len(marshaled)} 字节")

    # Deserialize it back into a code object.
    restored_code = marshal.loads(marshaled)

    # Execute the restored code; ``namespace`` supplies x and y and
    # receives ``result``.
    namespace = {'x': 10, 'y': 20}
    exec(restored_code, namespace)
    print(f"执行结果: {namespace.get('result')}")

    # Show the bytecode of the restored code object.
    print("代码反汇编:")
    dis.dis(restored_code)
# Differences between marshal and pickle
def compare_formats():
    """Print payload sizes for marshal/pickle/JSON and dump timings for marshal/pickle.

    The same small nested dict is serialized with each format. Sizes are
    printed in bytes; timings cover 10000 ``dumps`` calls each.
    """
    import timeit

    # Sample data: a flat list, a flat dict and a nested dict.
    test_data = {
        "list": [1, 2, 3, 4, 5],
        "dict": {"a": 1, "b": 2},
        "nested": {"level1": {"level2": [1, 2, 3]}}
    }

    # Size comparison.
    marshal_size = len(marshal.dumps(test_data))
    pickle_size = len(pickle.dumps(test_data))
    json_size = len(json.dumps(test_data).encode())
    print(f"Marshal: {marshal_size} 字节")
    print(f"Pickle: {pickle_size} 字节")
    print(f"JSON: {json_size} 字节")

    # Speed comparison.
    marshal_time = timeit.timeit(
        lambda: marshal.dumps(test_data),
        number=10000
    )
    pickle_time = timeit.timeit(
        lambda: pickle.dumps(test_data),
        number=10000
    )
    print(f"\nMarshal 10000次: {marshal_time:.3f}秒")
    print(f"Pickle 10000次: {pickle_time:.3f}秒")
## msgpack:高性能跨语言序列化
MessagePack是一种高效的二进制序列化格式,比JSON更小更快:
import msgpack
import json
import time
class MsgPackSerializer:
    """Thin wrapper around msgpack with a reusable ``Packer``."""

    def __init__(self):
        """Create the shared packer used by ``dumps``."""
        self.packer = msgpack.Packer(
            use_bin_type=True,
            unicode_errors='surrogatepass'
        )

    def dumps(self, obj):
        """Serialize ``obj`` with the shared packer and return the packed bytes."""
        return self.packer.pack(obj)

    def loads(self, data: bytes):
        """Deserialize bytes produced by ``dumps``.

        Uses the same ``surrogatepass`` error handler as the packer so that
        strings containing lone surrogates round-trip instead of failing to
        decode.
        """
        return msgpack.unpackb(
            data,
            raw=False,
            unicode_errors='surrogatepass'
        )

    def dumps_json_compatible(self, obj):
        """Serialize ``obj`` with ``msgpack.packb`` (strict unicode handling)."""
        return msgpack.packb(obj, use_bin_type=True)

    def loads_json_compatible(self, data: bytes):
        """Deserialize bytes produced by ``dumps_json_compatible``."""
        return msgpack.unpackb(data, raw=False)
def msgpack_performance_test():
    """Compare JSON and MessagePack on payload size and serialization time.

    Serializes a dict of 1000 small user records 100 times with each format
    and prints the sizes, the size reduction, the timings and the speed-up.
    """
    # Sample data: 1000 small records.
    test_data = {
        "users": [
            {"id": i, "name": f"user_{i}", "score": i * 1.5}
            for i in range(1000)
        ]
    }
    serializer = MsgPackSerializer()

    # perf_counter is monotonic and high-resolution, so the measured
    # intervals are reliable and not affected by wall-clock adjustments.
    # JSON serialization.
    start = time.perf_counter()
    for _ in range(100):
        json_data = json.dumps(test_data).encode()
    json_time = time.perf_counter() - start

    # MessagePack serialization.
    start = time.perf_counter()
    for _ in range(100):
        msgpack_data = serializer.dumps(test_data)
    msgpack_time = time.perf_counter() - start

    print(f"JSON大小: {len(json_data)} 字节")
    print(f"MsgPack大小: {len(msgpack_data)} 字节")
    print(f"大小减少: {(1 - len(msgpack_data)/len(json_data))*100:.1f}%")
    print(f"\nJSON 100次序列化: {json_time:.3f}秒")
    print(f"MsgPack 100次序列化: {msgpack_time:.3f}秒")
    print(f"速度提升: {json_time/msgpack_time:.1f}倍")
# Custom type handling
class CustomMsgPackEncoder:
    """Encode/decode ``datetime`` and ``Decimal`` values as msgpack ext types.

    Ext code 1 carries a datetime as its ISO-8601 text; ext code 2 carries a
    Decimal as its string form. The class is a plain namespace for the two
    static hooks: msgpack provides ``ExtType`` but no ``ExtensionType`` base
    class to inherit from.
    """

    @staticmethod
    def encode(obj):
        """Return a ``msgpack.ExtType`` for ``obj``, or ``None`` if unsupported.

        NOTE(review): if this is passed as msgpack's ``default=`` hook, the
        ``None`` returned for unsupported objects is presumably packed as
        nil rather than reported as an error - confirm before relying on it.
        """
        # Imported locally so the snippet is self-contained.
        from datetime import datetime
        from decimal import Decimal

        if isinstance(obj, datetime):
            return msgpack.ExtType(1, obj.isoformat().encode())
        if isinstance(obj, Decimal):
            return msgpack.ExtType(2, str(obj).encode())
        return None

    @staticmethod
    def decode(code, data):
        """Rebuild the Python value for ext ``code`` from its payload ``data``.

        Unknown codes are returned unchanged as ``msgpack.ExtType``.
        """
        from datetime import datetime
        from decimal import Decimal

        if code == 1:
            return datetime.fromisoformat(data.decode())
        if code == 2:
            return Decimal(data.decode())
        return msgpack.ExtType(code, data)
## Protocol Buffers:Google的序列化方案
Protocol Buffers(protobuf)是Google开发的高效序列化协议:
# 首先需要定义.proto文件
# user.proto
# syntax = "proto3";
#
# message User {
# int32 id = 1;
# string name = 2;
# int32 age = 3;
# string email = 4;
# map<string, string> preferences = 5;
# }
#
# message UserList {
# repeated User users = 1;
# }
# 然后使用protoc编译生成Python代码
# protoc --python_out=. user.proto
# 生成的代码示例(简化版)
import struct
class SimpleProtobuf:
    """Simplified protobuf varint codec (for teaching purposes)."""

    @staticmethod
    def encode_varint(value: int) -> bytes:
        """Encode an integer as a base-128 varint.

        Each output byte carries 7 payload bits, least-significant group
        first; the high bit is set on every byte except the last.

        Negative values are encoded as their 64-bit two's complement, which
        yields the 10-byte form protobuf uses for negative int32/int64.

        Args:
            value: Integer to encode; must be >= -2**63.

        Returns:
            The varint bytes.

        Raises:
            ValueError: If ``value`` is below -2**63.
        """
        if value < 0:
            if value < -(1 << 63):
                raise ValueError(f"varint value out of range: {value}")
            value += 1 << 64  # reinterpret as unsigned 64-bit
        encoded = bytearray()
        while value > 0x7f:
            encoded.append((value & 0x7f) | 0x80)
            value >>= 7
        encoded.append(value)
        return bytes(encoded)

    @staticmethod
    def decode_varint(data: bytes) -> tuple:
        """Decode one varint from the start of ``data``.

        Args:
            data: Buffer that begins with a varint; trailing bytes are ignored.

        Returns:
            ``(value, consumed)``: the decoded unsigned integer and the
            number of bytes it occupied.

        Raises:
            ValueError: If ``data`` is empty or ends before a byte without
                the continuation bit is found.
        """
        result = 0
        shift = 0
        for i, byte in enumerate(data):
            result |= (byte & 0x7f) << shift
            if not (byte & 0x80):
                return result, i + 1
            shift += 7
        # Every byte had the continuation bit set (or there were no bytes):
        # the value is incomplete, so do not return a partial result.
        raise ValueError("truncated varint")
# 实际使用protobuf
from google.protobuf import descriptor_pb2
import google.protobuf.json_format as json_format
def protobuf_example():
    """Placeholder showing how a generated protobuf module would be used.

    The function body is intentionally empty: the walkthrough below is kept
    as comments because it needs a ``user_pb2`` module generated by protoc.
    """
    # Assuming a generated user_pb2 module is available:
    # import user_pb2
    #
    # # Create a user message
    # user = user_pb2.User()
    # user.id = 1
    # user.name = "Alice"
    # user.age = 30
    # user.email = "alice@example.com"
    # user.preferences["theme"] = "dark"
    #
    # # Serialize
    # data = user.SerializeToString()
    # print(f"Protobuf序列化后大小: {len(data)} 字节")
    #
    # # Deserialize
    # restored_user = user_pb2.User()
    # restored_user.ParseFromString(data)
    # print(f"恢复的用户: {restored_user.name}")
    #
    # # Convert to JSON
    # json_str = json_format.MessageToJson(user)
    # print(f"JSON格式: {json_str}")
    pass
## 序列化方案对比
import json
import pickle
import msgpack
import time
from dataclasses import dataclass
@dataclass
class ComparisonResult:
    """Benchmark outcome for one serialization format."""

    # Display name of the format, e.g. "JSON".
    format_name: str
    # Elapsed seconds for the serialization loop.
    serialize_time: float
    # Elapsed seconds for the deserialization loop.
    deserialize_time: float
    # Length of the serialized payload in bytes.
    size: int
    # Whether the serialized form is human-readable text.
    human_readable: bool
def compare_serialization_formats():
    """Benchmark JSON, pickle and MessagePack on one sample dict and print a table.

    Each format serializes the sample 1000 times and then deserializes the
    resulting payload 1000 times. The printed table lists, per format, both
    elapsed times, the payload size in bytes and whether it is readable.
    """
    iterations = 1000

    # Sample data.
    test_data = {
        "id": 12345,
        "name": "测试用户",
        "scores": [85.5, 92.3, 78.9, 95.1],
        "metadata": {
            "created": "2024-01-01",
            "tags": ["python", "serialization", "performance"]
        }
    }

    def _timed(func, arg):
        """Call ``func(arg)`` ``iterations`` times; return (seconds, last result)."""
        last = None
        # perf_counter is monotonic and high-resolution, unlike time.time.
        start = time.perf_counter()
        for _ in range(iterations):
            last = func(arg)
        return time.perf_counter() - start, last

    def _measure(format_name, dumps, loads, human_readable):
        """Benchmark one dumps/loads pair and wrap the numbers in a result."""
        serialize_time, payload = _timed(dumps, test_data)
        deserialize_time, _ = _timed(loads, payload)
        return ComparisonResult(
            format_name,
            serialize_time,
            deserialize_time,
            len(payload),
            human_readable
        )

    results = [
        _measure(
            "JSON",
            lambda obj: json.dumps(obj, ensure_ascii=False).encode(),
            json.loads,
            True
        ),
        _measure("Pickle", pickle.dumps, pickle.loads, False),
        _measure(
            "MsgPack",
            lambda obj: msgpack.packb(obj, use_bin_type=True),
            lambda raw: msgpack.unpackb(raw, raw=False),
            False
        ),
    ]

    # Print the results table.
    print(f"{'格式':<10} {'序列化时间':<15} {'反序列化时间':<15} {'大小':<10} {'可读性'}")
    print("-" * 70)
    for r in results:
        print(f"{r.format_name:<10} {r.serialize_time:<15.4f} {r.deserialize_time:<15.4f} {r.size:<10} {'是' if r.human_readable else '否'}")
# Security considerations
class SafeSerializer:
    """Pickle deserialization restricted to an allowlist of globals."""

    @staticmethod
    def safe_pickle_loads(data: bytes, allowed_classes: set = None):
        """Unpickle ``data``, resolving only allowlisted globals.

        Allowed by default:
          * a fixed set of plain data types from ``builtins`` (not the whole
            module, since names such as ``eval``, ``exec`` or ``getattr``
            would let a crafted payload execute arbitrary code);
          * public, non-dotted names from ``collections`` and ``datetime``
            (dotted or underscore-prefixed names could reach modules those
            packages import internally).

        Args:
            data: Pickle payload.
            allowed_classes: Extra allowlist entries. An entry may be a
                module name (every global of that module is allowed, as in
                the original behaviour) or a qualified ``"module.name"``
                (only that global is allowed). ``None`` means no extras.

        Returns:
            The reconstructed object.

        Raises:
            pickle.UnpicklingError: If the payload references a global that
                is not allowlisted.

        NOTE: an allowlist narrows what a payload can reach but does not
        make pickle a safe format for untrusted input in general.
        """
        import io

        extra_allowed = frozenset(allowed_classes or ())
        safe_modules = frozenset({'collections', 'datetime'})
        safe_builtins = frozenset({
            'bool', 'bytearray', 'bytes', 'complex', 'dict', 'float',
            'frozenset', 'int', 'list', 'range', 'set', 'slice', 'str',
            'tuple',
        })

        class RestrictedUnpickler(pickle.Unpickler):
            """Unpickler that only resolves allowlisted globals."""

            def find_class(self, module, name):
                if module == 'builtins':
                    permitted = name in safe_builtins
                else:
                    permitted = (
                        module in safe_modules
                        and not name.startswith('_')
                        and '.' not in name
                    )
                if (
                    permitted
                    or module in extra_allowed
                    or f"{module}.{name}" in extra_allowed
                ):
                    return super().find_class(module, name)
                raise pickle.UnpicklingError(
                    f"安全限制:不允许反序列化 {module}.{name}"
                )

        return RestrictedUnpickler(io.BytesIO(data)).load()
## 选择指南
| 格式 | 优点 | 缺点 | 适用场景 |
|---|---|---|---|
| JSON | 人类可读,跨语言,标准 | 速度慢,不支持所有Python类型 | Web API,配置文件 |
| Pickle | 支持几乎所有Python类型,速度快 | 不安全(不可反序列化不可信数据),Python专用 | Python内部数据交换 |
| Marshal | 速度极快 | 格式随Python版本变化、不保证跨版本兼容,仅支持部分内置类型,Python专用 | .pyc文件,内部使用 |
| MsgPack | 紧凑,快速,跨语言 | 不如JSON可读 | 微服务通信,缓存 |
| Protobuf | 高效,强类型,跨语言 | 需要编译,学习曲线 | gRPC,大型系统 |
根据具体需求选择合适的序列化格式,在性能、安全性和可维护性之间做出平衡。