📦

序列化深度解析：pickle、marshal、msgpack与protobuf

📂 python ⏱ 5 min 875 words

序列化深度解析：pickle、marshal、msgpack与protobuf

序列化是将Python对象转换为可存储或传输格式的过程，反序列化则是其逆过程。选择合适的序列化方案对系统性能、安全性和可维护性都有重要影响。

pickle：Python原生序列化

pickle是Python内置的序列化模块，可以处理几乎所有的Python对象：

import pickle
import json
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class User:
    """Sample user record used by the serialization examples in this file."""

    name: str
    age: int
    email: str
    # Optional settings mapping; the examples below store {"theme": ...} here.
    # Defaults to None when no preferences are supplied.
    preferences: Optional[dict] = None

@dataclass
class Database:
    """Container that holds a list of User records."""

    # The stored records; must be supplied by the caller (no default).
    users: List[User]
    
    def add_user(self, user: User) -> None:
        """Append *user* to ``self.users`` in place."""
        self.users.append(user)

def pickle_example():
    """Demonstrate pickle round trips in memory, via a file, and per protocol.

    Results are printed. As a side effect the function writes ``data.pkl``
    in the current working directory and reads it back.
    """
    # Build a small object graph: one Database holding two User records.
    alice = User("Alice", 30, "alice@example.com", {"theme": "dark"})
    bob = User("Bob", 25, "bob@example.com", {"theme": "light"})
    db = Database(users=[alice, bob])

    # In-memory round trip.
    blob = pickle.dumps(db)
    print(f"Pickle序列化后大小: {len(blob)} 字节")
    clone = pickle.loads(blob)
    print(f"恢复的用户数量: {len(clone.users)}")

    # File round trip: dump to disk, then load the same file again.
    path = "data.pkl"
    with open(path, "wb") as sink:
        pickle.dump(db, sink)
    with open(path, "rb") as source:
        pickle.load(source)

    # Report the encoded size for every protocol this interpreter supports.
    for protocol in range(0, pickle.HIGHEST_PROTOCOL + 1):
        encoded = pickle.dumps(db, protocol=protocol)
        print(f"协议版本 {protocol}: {len(encoded)} 字节")

class CustomPickler:
    """Pickle helpers: a dump wrapper and an allowlist-restricted loader."""

    @staticmethod
    def safe_dumps(obj, protocol=None):
        """Serialize *obj* with pickle.

        Args:
            obj: Any picklable object.
            protocol: Pickle protocol number; ``None`` selects
                ``pickle.HIGHEST_PROTOCOL``.

        Returns:
            The pickled bytes.
        """
        if protocol is None:
            protocol = pickle.HIGHEST_PROTOCOL
        return pickle.dumps(obj, protocol=protocol)

    @staticmethod
    def safe_loads(data: bytes):
        """Unpickle *data*, resolving only allowlisted globals.

        Args:
            data: Pickled bytes.

        Returns:
            The reconstructed object.

        Raises:
            pickle.UnpicklingError: If the stream references a global that
                is not in the allowlist.
        """
        # Local import: ``io`` is not imported at file level, and without it
        # the BytesIO call below raised NameError on every invocation.
        import io

        # module name -> names that may be resolved from that module.
        allowed_classes = {
            'builtins': {'dict', 'list', 'tuple', 'set', 'frozenset'},
            '__main__': {'User', 'Database'},
        }

        class RestrictedUnpickler(pickle.Unpickler):
            def find_class(self, module, name):
                # Called by the unpickler whenever the stream requests a
                # global; anything not explicitly listed is rejected.
                if name in allowed_classes.get(module, ()):
                    return super().find_class(module, name)
                raise pickle.UnpicklingError(
                    f"不允许反序列化 {module}.{name}"
                )

        # NOTE(security): restricting globals narrows the attack surface but
        # pickle should still not be the format of choice for untrusted data.
        return RestrictedUnpickler(io.BytesIO(data)).load()

marshal：Python内部序列化

marshal是Python内部使用的序列化格式，主要用于.pyc文件：

import marshal
import dis
import types

def marshal_example():
    """Round-trip a compiled code object through marshal and execute it.

    Prints the marshalled size, the value computed by the restored code,
    and a disassembly of the restored code object.
    """
    # Compile a two-line snippet into a module-level code object.
    source = 'result = x + y\nprint(result)'
    code_obj = compile(source, '<string>', 'exec')

    # Serialize the code object and report how many bytes it takes.
    payload = marshal.dumps(code_obj)
    print(f"Marshal序列化后大小: {len(payload)} 字节")

    # Rebuild the code object and run it in a namespace providing x and y.
    revived = marshal.loads(payload)
    scope = dict(x=10, y=20)
    exec(revived, scope)
    print(f"执行结果: {scope.get('result')}")

    # Show the bytecode of the restored object.
    print("代码反汇编:")
    dis.dis(revived)

# marshal与pickle的区别
def compare_formats():
    """Print encoded sizes and dump timings for marshal, pickle and JSON.

    The same small nested dict is encoded with each format. Sizes are
    printed for all three; ``timeit`` timings (10000 calls each) are printed
    for marshal and pickle only.
    """
    import timeit

    # Sample payload: a flat list, a flat dict and a nested structure.
    test_data = {
        "list": [1, 2, 3, 4, 5],
        "dict": {"a": 1, "b": 2},
        "nested": {"level1": {"level2": [1, 2, 3]}}
    }

    # Size comparison (JSON text is measured after UTF-8 encoding).
    sizes = {
        "Marshal": len(marshal.dumps(test_data)),
        "Pickle": len(pickle.dumps(test_data)),
        "JSON": len(json.dumps(test_data).encode()),
    }
    for label, size in sizes.items():
        print(f"{label}: {size} 字节")

    # Speed comparison.
    runs = 10000
    marshal_time = timeit.timeit(lambda: marshal.dumps(test_data), number=runs)
    pickle_time = timeit.timeit(lambda: pickle.dumps(test_data), number=runs)

    print(f"\nMarshal 10000次: {marshal_time:.3f}秒")
    print(f"Pickle 10000次: {pickle_time:.3f}秒")

msgpack：高性能跨语言序列化

MessagePack是一种高效的二进制序列化格式，比JSON更小更快：

import msgpack
import json
import time

class MsgPackSerializer:
    """Thin wrapper around msgpack with a reusable Packer instance."""

    def __init__(self):
        """Create the shared Packer used by :meth:`dumps`."""
        self.packer = msgpack.Packer(
            use_bin_type=True,
            unicode_errors='surrogatepass'
        )

    def dumps(self, obj):
        """Pack *obj* with the shared Packer and return the result."""
        return self.packer.pack(obj)

    def loads(self, data: bytes):
        """Unpack *data*, decoding msgpack strings to ``str``.

        Decoding uses the same ``surrogatepass`` error handler the Packer
        encodes with, so text containing lone surrogates that :meth:`dumps`
        accepted can also be read back instead of raising on decode.
        """
        return msgpack.unpackb(
            data,
            raw=False,
            unicode_errors='surrogatepass'
        )

    def dumps_json_compatible(self, obj):
        """Pack *obj* with ``msgpack.packb`` (strict unicode handling)."""
        return msgpack.packb(obj, use_bin_type=True)

    def loads_json_compatible(self, data: bytes):
        """Unpack *data* with ``msgpack.unpackb`` (strict unicode handling)."""
        return msgpack.unpackb(data, raw=False)

def msgpack_performance_test():
    """Compare JSON and MsgPack on size and serialization time; print results.

    A payload of 1000 small user dicts is serialized 100 times with each
    format. Timing uses ``time.perf_counter()`` because ``time.time()`` can
    have coarse resolution, which may yield a zero elapsed time and a
    division by zero in the speed-up ratio.
    """
    # Sample payload.
    test_data = {
        "users": [
            {"id": i, "name": f"user_{i}", "score": i * 1.5}
            for i in range(1000)
        ]
    }

    serializer = MsgPackSerializer()
    rounds = 100

    # JSON serialization.
    start = time.perf_counter()
    for _ in range(rounds):
        json_data = json.dumps(test_data).encode()
    json_time = time.perf_counter() - start

    # MsgPack serialization.
    start = time.perf_counter()
    for _ in range(rounds):
        msgpack_data = serializer.dumps(test_data)
    msgpack_time = time.perf_counter() - start

    reduction = (1 - len(msgpack_data) / len(json_data)) * 100
    # Guard the ratio so a zero elapsed time cannot crash the report.
    speedup = json_time / msgpack_time if msgpack_time else float("inf")

    print(f"JSON大小: {len(json_data)} 字节")
    print(f"MsgPack大小: {len(msgpack_data)} 字节")
    print(f"大小减少: {reduction:.1f}%")
    print(f"\nJSON 100次序列化: {json_time:.3f}秒")
    print(f"MsgPack 100次序列化: {msgpack_time:.3f}秒")
    print(f"速度提升: {speedup:.1f}倍")

# 自定义类型处理
class CustomMsgPackEncoder:
    """Map ``datetime`` and ``Decimal`` values to and from msgpack ExtType.

    Extension code 1 carries a datetime as ISO-8601 text; code 2 carries a
    Decimal as its string form. The class is a plain namespace for the two
    static hooks: msgpack provides ``ExtType`` but no ``ExtensionType`` base
    class, so inheriting from that name failed when the class was defined.
    """

    @staticmethod
    def encode(obj):
        """Return an ``ExtType`` for supported objects, else ``None``.

        Args:
            obj: The object msgpack could not serialize natively.

        Returns:
            ``msgpack.ExtType`` for datetime (code 1) or Decimal (code 2);
            ``None`` for every other type.
        """
        # Local imports: neither name is imported at file level.
        from datetime import datetime
        from decimal import Decimal

        if isinstance(obj, datetime):
            return msgpack.ExtType(1, obj.isoformat().encode())
        if isinstance(obj, Decimal):
            return msgpack.ExtType(2, str(obj).encode())
        return None

    @staticmethod
    def decode(code, data):
        """Rebuild the Python object for extension *code* from *data*.

        Args:
            code: Extension type code read from the stream.
            data: Raw extension payload bytes.

        Returns:
            A datetime for code 1, a Decimal for code 2; unknown codes are
            passed through as ``msgpack.ExtType(code, data)``.
        """
        from datetime import datetime
        from decimal import Decimal

        if code == 1:
            return datetime.fromisoformat(data.decode())
        if code == 2:
            return Decimal(data.decode())
        return msgpack.ExtType(code, data)

Protocol Buffers：Google的序列化方案

Protocol Buffers（protobuf）是Google开发的高效序列化协议：

# 首先需要定义.proto文件
# user.proto
# syntax = "proto3";
#
# message User {
#     int32 id = 1;
#     string name = 2;
#     int32 age = 3;
#     string email = 4;
#     map<string, string> preferences = 5;
# }
#
# message UserList {
#     repeated User users = 1;
# }

# 然后使用protoc编译生成Python代码
# protoc --python_out=. user.proto

# 下面是protobuf底层varint编码的简化实现（仅用于教学，并非protoc生成的代码）
import struct

class SimpleProtobuf:
    """Minimal base-128 varint codec, for teaching purposes only."""

    @staticmethod
    def encode_varint(value: int) -> bytes:
        """Encode a non-negative integer as a varint.

        Each output byte carries 7 payload bits, least-significant group
        first; the high bit is set on every byte except the last.

        Args:
            value: Integer to encode; must be >= 0.

        Returns:
            The encoded bytes (always at least one byte).

        Raises:
            ValueError: If *value* is negative.
        """
        if value < 0:
            # The loop below never runs for negatives, so the old code only
            # failed later with an opaque bytes() range error.
            raise ValueError("encode_varint requires a non-negative integer")

        out = bytearray()
        while value > 0x7f:
            out.append((value & 0x7f) | 0x80)  # low 7 bits + continuation flag
            value >>= 7
        out.append(value)  # final group, continuation flag clear
        return bytes(out)

    @staticmethod
    def decode_varint(data: bytes) -> tuple:
        """Decode one varint from the start of *data*.

        Args:
            data: Buffer beginning with a varint; trailing bytes are ignored.

        Returns:
            ``(value, consumed)`` where *consumed* is the number of bytes
            that made up the varint.

        Raises:
            ValueError: If *data* is empty or ends before a byte with the
                continuation bit clear is found.
        """
        result = 0
        for index, byte in enumerate(data):
            result |= (byte & 0x7f) << (7 * index)
            if not (byte & 0x80):
                return result, index + 1
        # Every byte had the continuation bit set (or there were no bytes):
        # report it rather than returning a silently truncated value.
        raise ValueError("truncated varint: no terminating byte found")

# 实际使用protobuf
from google.protobuf import descriptor_pb2
import google.protobuf.json_format as json_format

def protobuf_example():
    """Outline typical protobuf usage; this placeholder performs no work.

    The commented walkthrough below assumes a ``user_pb2`` module generated
    by ``protoc`` from ``user.proto``. Nothing in it is executed.
    """
    # import user_pb2
    #
    # # Build a user message
    # user = user_pb2.User()
    # user.id = 1
    # user.name = "Alice"
    # user.age = 30
    # user.email = "alice@example.com"
    # user.preferences["theme"] = "dark"
    #
    # # Serialize to bytes
    # data = user.SerializeToString()
    # print(f"Protobuf序列化后大小: {len(data)} 字节")
    #
    # # Parse the bytes back into a fresh message
    # restored_user = user_pb2.User()
    # restored_user.ParseFromString(data)
    # print(f"恢复的用户: {restored_user.name}")
    #
    # # Convert the message to JSON text
    # json_str = json_format.MessageToJson(user)
    # print(f"JSON格式: {json_str}")
    return None

序列化方案对比

import json
import pickle
import msgpack
import time
from dataclasses import dataclass

@dataclass
class ComparisonResult:
    """One row of the serialization-format comparison table."""

    # Label printed in the first column (e.g. "JSON").
    format_name: str
    # Elapsed seconds for the serialize loop, as measured by the caller.
    serialize_time: float
    # Elapsed seconds for the deserialize loop, as measured by the caller.
    deserialize_time: float
    # Length in bytes of the serialized payload.
    size: int
    # Whether the encoded form is text a person can read.
    human_readable: bool

def _time_repeated(func, repeat):
    """Call *func* ``repeat`` times; return ``(elapsed_seconds, last_result)``."""
    result = None
    start = time.perf_counter()
    for _ in range(repeat):
        result = func()
    return time.perf_counter() - start, result


def compare_serialization_formats():
    """Benchmark JSON, Pickle and MsgPack on one payload and print a table.

    Each format serializes the payload 1000 times and then deserializes the
    resulting bytes 1000 times. Timing uses ``time.perf_counter()``, which
    offers better resolution than ``time.time()``. The table lists both
    timings, the encoded size in bytes, and whether the format is
    human-readable.
    """
    # Sample payload mixing ints, non-ASCII text, floats and nesting.
    test_data = {
        "id": 12345,
        "name": "测试用户",
        "scores": [85.5, 92.3, 78.9, 95.1],
        "metadata": {
            "created": "2024-01-01",
            "tags": ["python", "serialization", "performance"]
        }
    }
    iterations = 1000

    # (label, zero-arg encoder, one-arg decoder, human readable?)
    codecs = [
        (
            "JSON",
            lambda: json.dumps(test_data, ensure_ascii=False).encode(),
            json.loads,
            True,
        ),
        (
            "Pickle",
            lambda: pickle.dumps(test_data),
            pickle.loads,
            False,
        ),
        (
            "MsgPack",
            lambda: msgpack.packb(test_data, use_bin_type=True),
            lambda blob: msgpack.unpackb(blob, raw=False),
            False,
        ),
    ]

    results = []
    for label, encode, decode, readable in codecs:
        encode_time, payload = _time_repeated(encode, iterations)
        # The lambda is invoked within this iteration, so it always sees the
        # payload and decoder that belong to the current format.
        decode_time, _ = _time_repeated(lambda: decode(payload), iterations)
        results.append(ComparisonResult(
            label,
            encode_time,
            decode_time,
            len(payload),
            readable
        ))

    # Print the table.
    print(f"{'格式':<10} {'序列化时间':<15} {'反序列化时间':<15} {'大小':<10} {'可读性'}")
    print("-" * 70)
    for r in results:
        print(f"{r.format_name:<10} {r.serialize_time:<15.4f} {r.deserialize_time:<15.4f} {r.size:<10} {'是' if r.human_readable else '否'}")

# 安全考虑
class SafeSerializer:
    """Pickle loader that resolves only an explicit allowlist of globals."""

    # Exact module -> names allowlist. Whole modules are deliberately NOT
    # trusted: allowing all of ``builtins`` would let a pickle resolve
    # ``eval``, ``exec`` or ``__import__`` and run arbitrary code.
    _SAFE_GLOBALS = {
        'builtins': frozenset({
            'bool', 'int', 'float', 'complex', 'str', 'bytes', 'bytearray',
            'dict', 'list', 'tuple', 'set', 'frozenset', 'range', 'slice',
        }),
        'collections': frozenset({
            'OrderedDict', 'defaultdict', 'deque', 'Counter',
        }),
        'datetime': frozenset({
            'date', 'time', 'datetime', 'timedelta', 'timezone',
        }),
    }

    @staticmethod
    def safe_pickle_loads(data: bytes, allowed_classes: set = None):
        """Unpickle *data*, refusing any global outside the allowlist.

        Args:
            data: Pickled bytes.
            allowed_classes: Extra globals the caller trusts. An entry is
                either a module name (every global of that module is then
                accepted, as before) or a ``"module.Name"`` string that
                admits a single global. ``None`` means no extras.

        Returns:
            The reconstructed object.

        Raises:
            pickle.UnpicklingError: If the stream references a global that
                is neither built-in-safe nor listed in *allowed_classes*.
        """
        import io

        extra = frozenset(allowed_classes or ())
        safe_globals = SafeSerializer._SAFE_GLOBALS

        class RestrictedUnpickler(pickle.Unpickler):
            def find_class(self, module, name):
                # Built-in allowlist: exact (module, name) matches only.
                if name in safe_globals.get(module, ()):
                    return super().find_class(module, name)

                # Caller-supplied trust: whole module or one qualified name.
                if module in extra or f"{module}.{name}" in extra:
                    return super().find_class(module, name)

                raise pickle.UnpicklingError(
                    f"安全限制：不允许反序列化 {module}.{name}"
                )

        # NOTE(security): restricting globals reduces risk, but pickle is
        # still not a format to accept from untrusted sources.
        return RestrictedUnpickler(io.BytesIO(data)).load()

选择指南

格式	优点	缺点	适用场景
JSON	人类可读，跨语言，标准	速度慢，不支持所有Python类型	Web API，配置文件
Pickle	支持绝大多数Python类型，速度快	不安全（不可反序列化不可信数据），Python专用	Python内部数据交换
Marshal	速度极快	格式随Python版本变化、不保证兼容，仅支持内置类型，Python专用	.pyc文件，内部使用
MsgPack	紧凑，快速，跨语言	不如JSON可读	微服务通信，缓存
Protobuf	高效，强类型，跨语言	需要编译，学习曲线	gRPC，大型系统

根据具体需求选择合适的序列化格式，在性能、安全性和可维护性之间做出平衡。