🐍

编码与解码

📂 python ⏱ 2 min 392 words

编码基础

编码是将字符转换为字节的过程，解码是将字节转换为字符的过程。Python 3中的字符串（str）是Unicode字符序列，encode()和decode()默认使用UTF-8编码。

# 字符串和字节的区别
text = "Hello, 你好"
print(type(text))  # 输出: <class 'str'>

# 编码为字节
bytes_data = text.encode('utf-8')
print(type(bytes_data))  # 输出: <class 'bytes'>
print(bytes_data)  # 输出: b'Hello, \xe4\xbd\xa0\xe5\xa5\xbd'

# 解码为字符串
decoded_text = bytes_data.decode('utf-8')
print(decoded_text)  # 输出: Hello, 你好

常见编码格式

UTF-8

UTF-8是最常用的编码格式，支持所有Unicode字符。

text = "Python编程"

# UTF-8编码
utf8_bytes = text.encode('utf-8')
print(f"UTF-8: {utf8_bytes}")
# 输出: UTF-8: b'Python\xe7\xbc\x96\xe7\xa8\x8b'

# UTF-8编码的字节数
print(f"字节数: {len(utf8_bytes)}")  # 输出: 字节数: 12

ASCII

ASCII是最早的字符编码之一，只包含128个字符（英文字母、数字、标点符号和控制字符）。

text = "Hello"

# ASCII编码
ascii_bytes = text.encode('ascii')
print(f"ASCII: {ascii_bytes}")
# 输出: ASCII: b'Hello'

# 中文无法用ASCII编码
# "你好".encode('ascii')  # 会抛出UnicodeEncodeError

GBK/GB2312

中文常用的编码格式。

text = "你好，世界"

# GBK编码
gbk_bytes = text.encode('gbk')
print(f"GBK: {gbk_bytes}")
# 输出: GBK: b'\xc4\xe3\xba\xc3\xa3\xac\xca\xc0\xbd\xe7'

# GBK解码
decoded = gbk_bytes.decode('gbk')
print(decoded)  # 输出: 你好，世界

bytes和str的转换

# str到bytes
text = "Python"
bytes_data = bytes(text, encoding='utf-8')
# 或者
bytes_data = text.encode('utf-8')

# bytes到str
text = str(bytes_data, encoding='utf-8')
# 或者
text = bytes_data.decode('utf-8')

# 处理不同编码
text = "你好"
gbk_bytes = text.encode('gbk')
utf8_bytes = text.encode('utf-8')

print(f"GBK字节数: {len(gbk_bytes)}")  # 输出: GBK字节数: 4
print(f"UTF-8字节数: {len(utf8_bytes)}")  # 输出: UTF-8字节数: 6

codecs模块

codecs模块提供了更灵活的编码处理功能。注意：读写文本文件时推荐使用内置的open()并指定encoding参数，codecs.open()自Python 3.14起已被弃用。

import codecs

# 编码和解码
text = "Hello, 世界"

# 使用codecs编码
encoded = codecs.encode(text, 'utf-8')
print(encoded)  # 输出: b'Hello, \xe4\xb8\x96\xe7\x95\x8c'

# 使用codecs解码
decoded = codecs.decode(encoded, 'utf-8')
print(decoded)  # 输出: Hello, 世界

# 文件编码处理
# 写入文件
with codecs.open('test.txt', 'w', encoding='utf-8') as f:
    f.write("你好，世界！")

# 读取文件
with codecs.open('test.txt', 'r', encoding='utf-8') as f:
    content = f.read()
    print(content)  # 输出: 你好，世界！

错误处理

处理编码错误的不同策略。

text = "Hello, 你好"

# strict（默认）- 遇到错误抛出异常
# try:
#     text.encode('ascii', errors='strict')
# except UnicodeEncodeError as e:
#     print(f"编码错误: {e}")

# ignore - 忽略错误字符
ascii_ignore = text.encode('ascii', errors='ignore')
print(ascii_ignore)  # 输出: b'Hello, '

# replace - 用?替换错误字符
ascii_replace = text.encode('ascii', errors='replace')
print(ascii_replace)  # 输出: b'Hello, ??'

# xmlcharrefreplace - 用XML字符引用替换
ascii_xml = text.encode('ascii', errors='xmlcharrefreplace')
print(ascii_xml)  # 输出: b'Hello, &#20320;&#22909;'

# backslashreplace - 用反斜杠转义替换
ascii_backslash = text.encode('ascii', errors='backslashreplace')
print(ascii_backslash)  # 输出: b'Hello, \\u4f60\\u597d'

实际应用案例

1. 文件编码检测

import chardet

# 检测文件编码
with open('test.txt', 'rb') as f:
    raw_data = f.read()
    result = chardet.detect(raw_data)
    print(f"检测到的编码: {result['encoding']}")
    print(f"置信度: {result['confidence']}")

2. 编码转换

def convert_encoding(text, from_encoding, to_encoding):
    """Re-encode *text* into ``to_encoding`` and return the result as bytes.

    Args:
        text: The data to convert. ``bytes`` or ``bytearray`` input is first
            decoded with ``from_encoding``; ``str`` input is already Unicode
            and is encoded directly.
        from_encoding: Codec name used to decode binary input. Ignored when
            ``text`` is a ``str``.
        to_encoding: Codec name of the target encoding.

    Returns:
        bytes: ``text`` encoded with ``to_encoding``.

    Raises:
        UnicodeDecodeError: If binary input is not valid in ``from_encoding``.
        UnicodeEncodeError: If a character cannot be represented in
            ``to_encoding``.
        LookupError: If a codec name that is actually used is unknown.
    """
    # bytearray is accepted alongside bytes: it offers .decode() but not
    # .encode(), so treating it as text would fail with AttributeError.
    if isinstance(text, (bytes, bytearray)):
        text = text.decode(from_encoding)

    # At this point the value is a str; encode it to the target encoding.
    return text.encode(to_encoding)

# 使用
text = "你好"
gbk_text = convert_encoding(text, 'utf-8', 'gbk')
print(gbk_text)  # 输出: b'\xc4\xe3\xba\xc3'

utf8_text = convert_encoding(gbk_text, 'gbk', 'utf-8')
print(utf8_text)  # 输出: b'\xe4\xbd\xa0\xe5\xa5\xbd'

3. 处理混合编码

def safe_decode(byte_string, encodings=None):
    """Decode *byte_string* with the first candidate encoding that works.

    Args:
        byte_string: The bytes to decode.
        encodings: Codec names to try, in order. When ``None``, the list
            ``['utf-8', 'gbk', 'gb2312', 'latin-1']`` is used.

    Returns:
        str: The text produced by the first codec that decodes without error.

    Raises:
        ValueError: If every candidate fails (or no candidates are given).
    """
    candidates = (
        ['utf-8', 'gbk', 'gb2312', 'latin-1'] if encodings is None else encodings
    )

    for codec_name in candidates:
        try:
            decoded = byte_string.decode(codec_name)
        except (UnicodeDecodeError, LookupError):
            # Invalid bytes for this codec, or an unknown codec name:
            # move on to the next candidate.
            pass
        else:
            return decoded

    raise ValueError("无法识别的编码")

# 使用
mixed_bytes = "Hello".encode('utf-8')
result = safe_decode(mixed_bytes)
print(result)  # 输出: Hello

常见问题解决

1. UnicodeDecodeError

# 问题：读取文件时编码错误
# 解决：指定正确的编码
with open('file.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# 或者使用errors参数
with open('file.txt', 'r', encoding='utf-8', errors='ignore') as f:
    content = f.read()

2. UnicodeEncodeError

import sys

# Problem: printing non-ASCII text raises UnicodeEncodeError when the output
# stream's encoding (e.g. ASCII) cannot represent the characters.
# Fix: encode with the stream's own encoding and substitute whatever it cannot
# represent. Round-tripping through UTF-8 would hand back the identical str,
# so it could not prevent the error.
text = "你好"
# Some streams report no encoding (None); fall back to UTF-8 in that case.
stream_encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
print(text.encode(stream_encoding, errors='replace').decode(stream_encoding))

最佳实践

始终明确指定编码格式
在处理文本时使用Unicode（str类型）
在处理二进制数据时使用bytes类型
使用errors参数处理编码错误
考虑使用chardet检测未知编码

正确处理编码是编写健壮Python程序的关键。