🐍

collections模块

📂 python ⏱ 4 min 675 words

collections模块概述

collections模块提供了高性能的容器数据类型，是Python标准库中非常实用的工具集。

Counter

Counter是一个字典子类，用于计数可哈希对象。

from collections import Counter

# Basic usage: count how often each word occurs
text = "hello world hello python"
words = text.split()
word_count = Counter(words)
print(word_count)
# Output: Counter({'hello': 2, 'world': 1, 'python': 1})

# Build a Counter from a list
colors = ['red', 'blue', 'red', 'green', 'blue', 'red']
color_count = Counter(colors)
print(color_count)
# Output: Counter({'red': 3, 'blue': 2, 'green': 1})

# The most common elements
print(color_count.most_common(2))  # Output: [('red', 3), ('blue', 2)]

# Subtracting counts (only positive results are kept, so 'b' is dropped)
c1 = Counter(a=3, b=1)
c2 = Counter(a=1, b=2)
print(c1 - c2)  # Output: Counter({'a': 2})

# Adding counts
print(c1 + c2)  # Output: Counter({'a': 4, 'b': 3})

defaultdict

defaultdict在访问不存在的键时，会调用创建时传入的工厂函数（如int、list）自动生成默认值，并将其存入字典。

from collections import defaultdict

# Basic usage: a missing key starts from int(), i.e. 0
dd = defaultdict(int)
dd['a'] += 1
dd['b'] += 2
print(dd)  # Output: defaultdict(<class 'int'>, {'a': 1, 'b': 2})

# Using list as the default factory
dd_list = defaultdict(list)
dd_list['fruits'].append('apple')
dd_list['fruits'].append('banana')
dd_list['vegetables'].append('carrot')
print(dd_list)
# Output: defaultdict(<class 'list'>, {'fruits': ['apple', 'banana'], 'vegetables': ['carrot']})

# Grouping data: each entry is a (name, grade) pair
students = [
    ('Alice', 'A'),
    ('Bob', 'B'),
    ('Charlie', 'A'),
    ('David', 'B'),
    ('Eve', 'C')
]

# Collect the names under their grade
grade_groups = defaultdict(list)
for name, grade in students:
    grade_groups[grade].append(name)

print(dict(grade_groups))
# Output: {'A': ['Alice', 'Charlie'], 'B': ['Bob', 'David'], 'C': ['Eve']}

OrderedDict

OrderedDict是保持插入顺序的字典。Python 3.7+中普通字典也保持插入顺序，但OrderedDict额外提供了move_to_end()、popitem(last=False)等调整顺序的方法，并且在比较两个OrderedDict是否相等时会考虑元素顺序。

from collections import OrderedDict

# Basic usage
od = OrderedDict()
od['first'] = 1
od['second'] = 2
od['third'] = 3

# NOTE: the outputs below use the pre-3.12 repr; Python 3.12+ prints the
# dict form instead, e.g. OrderedDict({'first': 1, 'second': 2, 'third': 3}).
print(od)  # Output: OrderedDict([('first', 1), ('second', 2), ('third', 3)])

# Move a key to the end
od.move_to_end('first')
print(od)  # Output: OrderedDict([('second', 2), ('third', 3), ('first', 1)])

# Move a key to the front
od.move_to_end('third', last=False)
print(od)  # Output: OrderedDict([('third', 3), ('second', 2), ('first', 1)])

# LRU cache built on top of OrderedDict
class LRUCache:
    """Fixed-capacity cache that discards the least recently used entry.

    Entries live in an ``OrderedDict`` ordered from least recently used
    (front) to most recently used (back).
    """

    def __init__(self, capacity):
        """Create an empty cache that keeps at most *capacity* entries."""
        self.capacity = capacity
        self.cache = OrderedDict()

    def get(self, key):
        """Return the value stored under *key*, or -1 when it is absent.

        A hit marks *key* as the most recently used entry.
        """
        if key not in self.cache:
            return -1
        self.cache.move_to_end(key)
        return self.cache[key]

    def put(self, key, value):
        """Store *value* under *key* and mark it as most recently used.

        When the cache grows past its capacity, the entry at the front
        (the least recently used one) is removed.
        """
        was_present = key in self.cache
        self.cache[key] = value
        if was_present:
            # Overwriting keeps the old position, so refresh it explicitly.
            self.cache.move_to_end(key)
        overflow = len(self.cache) - self.capacity
        if overflow > 0:
            self.cache.popitem(last=False)

# Usage
cache = LRUCache(2)
cache.put('a', 1)
cache.put('b', 2)
print(cache.get('a'))  # Output: 1
cache.put('c', 3)  # evicts 'b': 'a' was just read, so 'b' is the least recently used
print(cache.get('b'))  # Output: -1

deque

deque是双端队列，支持从两端高效添加和删除元素。

from collections import deque

# Basic usage
dq = deque([1, 2, 3, 4, 5])
print(dq)  # Output: deque([1, 2, 3, 4, 5])

# Add elements at either end
dq.append(6)      # add on the right
dq.appendleft(0)  # add on the left
print(dq)  # Output: deque([0, 1, 2, 3, 4, 5, 6])

# Remove elements from either end
dq.pop()      # remove from the right
dq.popleft()  # remove from the left
print(dq)  # Output: deque([1, 2, 3, 4, 5])

# Rotation
dq.rotate(2)   # rotate to the right
print(dq)  # Output: deque([4, 5, 1, 2, 3])

dq.rotate(-2)  # rotate to the left
print(dq)  # Output: deque([1, 2, 3, 4, 5])

# Fixed-length deque: once full, appending on the right drops the leftmost item.
# The repr of a bounded deque includes its maxlen.
dq_fixed = deque(maxlen=3)
dq_fixed.extend([1, 2, 3])
print(dq_fixed)  # Output: deque([1, 2, 3], maxlen=3)
dq_fixed.append(4)
print(dq_fixed)  # Output: deque([2, 3, 4], maxlen=3)

namedtuple

namedtuple是一个工厂函数，用于创建带字段名的元组子类，其实例既可以通过名称也可以通过索引访问元素。

from collections import namedtuple

# Create a namedtuple type
Point = namedtuple('Point', ['x', 'y'])

# Usage: fields are reachable by name or by index
p = Point(1, 2)
print(p.x, p.y)  # Output: 1 2
print(p[0], p[1])  # Output: 1 2

# Convert to a dict (a regular dict on Python 3.8+)
d = p._asdict()
print(d)  # Output: {'x': 1, 'y': 2}

# Replace a field value (returns a new tuple; p itself is unchanged)
p2 = p._replace(x=10)
print(p2)  # Output: Point(x=10, y=2)

# Create from a dict
d = {'x': 5, 'y': 10}
p3 = Point(**d)
print(p3)  # Output: Point(x=5, y=10)

# Practical example
Student = namedtuple('Student', ['name', 'age', 'grade'])
students = [
    Student('Alice', 20, 'A'),
    Student('Bob', 22, 'B'),
    Student('Charlie', 21, 'A')
]

# Sort by age
sorted_students = sorted(students, key=lambda s: s.age)
for s in sorted_students:
    print(f"{s.name}: {s.age}岁")

实际应用案例

1. 文本分析

from collections import Counter

def analyze_text(text):
    """Print word statistics for *text* and return the word frequencies.

    The text is lower-cased and split on whitespace. ASCII punctuation
    attached to the start or end of a token is stripped, so "Python," and
    "python" count as the same word; tokens that consist of punctuation
    only are ignored. Punctuation inside a token (e.g. "don't") is kept.

    Args:
        text: The string to analyze.

    Returns:
        A ``Counter`` mapping each word to its number of occurrences.
    """
    import string  # local import so the snippet stays self-contained

    # Strip surrounding punctuation first, then drop tokens left empty.
    stripped = (token.strip(string.punctuation) for token in text.lower().split())
    words = [word for word in stripped if word]
    word_count = Counter(words)

    print(f"总词数: {len(words)}")
    print(f"不同词数: {len(word_count)}")
    print(f"最常见的5个词: {word_count.most_common(5)}")

    return word_count

# Usage: analyze a short multi-line sample
text = """
Python is a great programming language.
Python is used for web development, data science, and automation.
Python is easy to learn and powerful.
"""

analyze_text(text)

2. 数据分组

from collections import defaultdict

def group_by_category(items):
    """Bucket *items* by the value stored under their ``'category'`` key.

    Returns a plain dict that maps each category to the list of items
    carrying it, in the order the items were encountered.
    """
    buckets = defaultdict(list)
    for entry in items:
        category = entry['category']
        buckets[category].append(entry)
    # Return a regular dict rather than the defaultdict itself.
    return dict(buckets)

# Usage
products = [
    {'name': 'iPhone', 'category': '手机', 'price': 6999},
    {'name': 'iPad', 'category': '平板', 'price': 3999},
    {'name': 'MacBook', 'category': '电脑', 'price': 9999},
    {'name': 'AirPods', 'category': '配件', 'price': 1299},
    {'name': 'Apple Watch', 'category': '手表', 'price': 2999}
]

# Print each category followed by the name and price of its products
groups = group_by_category(products)
for category, items in groups.items():
    print(f"\n{category}:")
    for item in items:
        print(f"  {item['name']}: ¥{item['price']}")

3. 滑动窗口

from collections import deque

def moving_average(data, window_size):
    """Return the running mean of *data* over the last *window_size* values.

    One average is produced per input value. Until the window has filled
    up, the mean covers only the values seen so far.
    """
    def _running_means():
        # A bounded deque discards the oldest value once it is full.
        recent = deque(maxlen=window_size)
        for value in data:
            recent.append(value)
            yield sum(recent) / len(recent)

    return list(_running_means())

# Usage: window of 3; the first two averages cover fewer than 3 values
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
averages = moving_average(data, 3)
print("移动平均:", averages)
# Output: 移动平均: [1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]

性能对比

import time
from collections import deque, defaultdict

# deque vs list
def test_performance(n=100000):
    """Time *n* front insertions into a deque and into a list, then print both.

    ``deque.appendleft`` is O(1), whereas ``list.insert(0, ...)`` has to
    shift every existing element, so the list timing grows much faster
    with *n*.

    Args:
        n: Number of elements to insert at the front of each container.
    """
    # time.perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can jump when the system clock
    # is adjusted.

    # deque timing
    dq = deque()
    start = time.perf_counter()
    for i in range(n):
        dq.appendleft(i)
    deque_time = time.perf_counter() - start

    # list timing
    lst = []
    start = time.perf_counter()
    for i in range(n):
        lst.insert(0, i)
    list_time = time.perf_counter() - start

    print(f"deque: {deque_time:.4f}秒")
    print(f"list: {list_time:.4f}秒")

# Run the benchmark; it prints the deque and list timings.
test_performance()

最佳实践

使用Counter进行计数和频率分析
使用defaultdict简化分组逻辑
使用deque实现队列或栈
使用namedtuple创建轻量级数据类
根据需求选择合适的容器类型

collections模块是Python标准库中非常强大的工具，掌握其中的各种容器类型能让你的代码更加高效和优雅。