collections模块
collections模块概述
collections模块提供了高性能的容器数据类型,是Python标准库中非常实用的工具集。
Counter
Counter是一个字典子类,用于计数可哈希对象。
from collections import Counter
# Basic usage: count how often each whitespace-separated word occurs
text = "hello world hello python"
words = text.split()
word_count = Counter(words)
print(word_count)
# Output: Counter({'hello': 2, 'world': 1, 'python': 1})
# Build a Counter from a list
colors = ['red', 'blue', 'red', 'green', 'blue', 'red']
color_count = Counter(colors)
print(color_count)
# Output: Counter({'red': 3, 'blue': 2, 'green': 1})
# Most common elements: most_common(2) returns the two highest counts
print(color_count.most_common(2)) # Output: [('red', 3), ('blue', 2)]
# Subtracting counters keeps only positive results,
# so 'b' (1 - 2 = -1) does not appear in the output
c1 = Counter(a=3, b=1)
c2 = Counter(a=1, b=2)
print(c1 - c2) # Output: Counter({'a': 2})
# Adding counters sums the counts key by key
print(c1 + c2) # Output: Counter({'a': 4, 'b': 3})
defaultdict
defaultdict在访问不存在的键时自动创建默认值。
from collections import defaultdict
# Basic usage: int() yields 0 for a missing key, so += works on first access
dd = defaultdict(int)
dd['a'] += 1
dd['b'] += 2
print(dd) # Output: defaultdict(<class 'int'>, {'a': 1, 'b': 2})
# Use list as the default factory: a missing key starts as an empty list
dd_list = defaultdict(list)
dd_list['fruits'].append('apple')
dd_list['fruits'].append('banana')
dd_list['vegetables'].append('carrot')
print(dd_list)
# Output: defaultdict(<class 'list'>, {'fruits': ['apple', 'banana'], 'vegetables': ['carrot']})
# Group data: collect student names under their grade
students = [
    ('Alice', 'A'),
    ('Bob', 'B'),
    ('Charlie', 'A'),
    ('David', 'B'),
    ('Eve', 'C')
]
grade_groups = defaultdict(list)
for name, grade in students:
    # A grade seen for the first time starts with an empty list
    grade_groups[grade].append(name)
# Convert to a plain dict so the printed form omits the defaultdict wrapper
print(dict(grade_groups))
# Output: {'A': ['Alice', 'Charlie'], 'B': ['Bob', 'David'], 'C': ['Eve']}
OrderedDict
OrderedDict是保持插入顺序的字典。Python 3.7+中普通字典也保持插入顺序,但OrderedDict额外提供move_to_end()、popitem(last=False)等调整顺序的方法,并且两个OrderedDict比较相等时会考虑顺序。
from collections import OrderedDict
# Basic usage
od = OrderedDict()
od['first'] = 1
od['second'] = 2
od['third'] = 3
# NOTE: the outputs shown below use the repr printed before Python 3.12;
# 3.12+ prints the dict form, e.g. OrderedDict({'first': 1, 'second': 2, 'third': 3})
print(od) # Output: OrderedDict([('first', 1), ('second', 2), ('third', 3)])
# Move a key to the end
od.move_to_end('first')
print(od) # Output: OrderedDict([('second', 2), ('third', 3), ('first', 1)])
# Move a key to the front
od.move_to_end('third', last=False)
print(od) # Output: OrderedDict([('third', 3), ('second', 2), ('first', 1)])
# LRU cache implementation
class LRUCache:
    """Least-recently-used cache backed by an ``OrderedDict``.

    Entries are kept in recency order: the least recently used key is at
    the front of ``self.cache`` and the most recently used key at the end.
    """

    def __init__(self, capacity):
        """Create an empty cache that keeps at most *capacity* entries."""
        self.cache = OrderedDict()
        self.capacity = capacity

    def get(self, key, default=-1):
        """Return the value stored for *key* and mark it most recently used.

        Args:
            key: The key to look up.
            default: Value returned on a miss. Defaults to ``-1``; pass a
                different sentinel when ``-1`` can be a stored value.

        Returns:
            The cached value, or *default* when *key* is not cached.
        """
        if key not in self.cache:
            return default
        self.cache.move_to_end(key)
        return self.cache[key]

    def put(self, key, value):
        """Insert or update *key*, evicting the oldest entry on overflow."""
        if key in self.cache:
            # Updating an existing key also counts as using it
            self.cache.move_to_end(key)
        self.cache[key] = value
        if len(self.cache) > self.capacity:
            # last=False pops from the front, i.e. the least recently used
            self.cache.popitem(last=False)
# Usage
cache = LRUCache(2)
cache.put('a', 1)
cache.put('b', 2)
print(cache.get('a')) # Output: 1
cache.put('c', 3) # evicts 'b': the get above made 'a' the most recently used
print(cache.get('b')) # Output: -1
deque
deque是双端队列,支持从两端高效添加和删除元素。
from collections import deque
# Basic usage
dq = deque([1, 2, 3, 4, 5])
print(dq) # Output: deque([1, 2, 3, 4, 5])
# Add elements at either end
dq.append(6) # append on the right
dq.appendleft(0) # append on the left
print(dq) # Output: deque([0, 1, 2, 3, 4, 5, 6])
# Remove elements from either end
dq.pop() # remove from the right
dq.popleft() # remove from the left
print(dq) # Output: deque([1, 2, 3, 4, 5])
# Rotation
dq.rotate(2) # rotate right: the last two elements move to the front
print(dq) # Output: deque([4, 5, 1, 2, 3])
dq.rotate(-2) # rotate left, undoing the previous rotation
print(dq) # Output: deque([1, 2, 3, 4, 5])
# Fixed-length deque (the repr of a bounded deque includes its maxlen)
dq_fixed = deque(maxlen=3)
dq_fixed.extend([1, 2, 3])
print(dq_fixed) # Output: deque([1, 2, 3], maxlen=3)
dq_fixed.append(4) # the deque is full, so 1 is dropped from the left
print(dq_fixed) # Output: deque([2, 3, 4], maxlen=3)
namedtuple
namedtuple是一个工厂函数,用于创建带字段名的元组子类,其实例既可以通过名称也可以通过下标访问元素。
from collections import namedtuple
# Create a namedtuple class with fields x and y
Point = namedtuple('Point', ['x', 'y'])
# Usage: fields are reachable by name or by index
p = Point(1, 2)
print(p.x, p.y) # Output: 1 2
print(p[0], p[1]) # Output: 1 2
# Convert to a dict (a regular dict on Python 3.8+)
d = p._asdict()
print(d) # Output: {'x': 1, 'y': 2}
# Replace a field value: _replace returns a new tuple, p itself is unchanged
p2 = p._replace(x=10)
print(p2) # Output: Point(x=10, y=2)
# Create from a dict by unpacking it into keyword arguments
d = {'x': 5, 'y': 10}
p3 = Point(**d)
print(p3) # Output: Point(x=5, y=10)
# Practical example: records with named fields
Student = namedtuple('Student', ['name', 'age', 'grade'])
students = [
    Student('Alice', 20, 'A'),
    Student('Bob', 22, 'B'),
    Student('Charlie', 21, 'A')
]
# Sort by age (ascending); the field is read by name instead of by index
sorted_students = sorted(students, key=lambda s: s.age)
for s in sorted_students:
    print(f"{s.name}: {s.age}岁")
实际应用案例
1. 文本分析
from collections import Counter
def analyze_text(text):
    """Print word-frequency statistics for *text* and return the counts.

    The text is lower-cased and split on whitespace. Leading and trailing
    ASCII punctuation is stripped from every token so that "language." and
    "language" are counted as the same word; tokens that consist only of
    punctuation are dropped. Non-ASCII punctuation is left untouched.

    Args:
        text: The text to analyse.

    Returns:
        A ``Counter`` mapping each word to its number of occurrences.
    """
    import string  # local import: only Counter is imported alongside this function

    stripped = (token.strip(string.punctuation) for token in text.lower().split())
    # A token such as "--" becomes empty after stripping; skip it
    words = [word for word in stripped if word]
    word_count = Counter(words)
    print(f"总词数: {len(words)}")
    print(f"不同词数: {len(word_count)}")
    print(f"最常见的5个词: {word_count.most_common(5)}")
    return word_count
# Usage
text = """
Python is a great programming language.
Python is used for web development, data science, and automation.
Python is easy to learn and powerful.
"""
analyze_text(text)
2. 数据分组
from collections import defaultdict
def group_by_category(items, key='category'):
    """Group mapping items by the value stored under *key*.

    Args:
        items: Iterable of mappings; every item must contain *key*.
        key: Name of the field to group by. Defaults to ``'category'``.

    Returns:
        A plain ``dict`` that maps each distinct field value to the list of
        items carrying it. Groups appear in first-seen order and items keep
        their input order within a group.

    Raises:
        KeyError: If an item has no *key* entry.
    """
    groups = defaultdict(list)
    for item in items:
        groups[item[key]].append(item)
    # Return a plain dict so that a later lookup of an unknown group raises
    # KeyError instead of silently creating an empty list
    return dict(groups)
# Usage
products = [
    {'name': 'iPhone', 'category': '手机', 'price': 6999},
    {'name': 'iPad', 'category': '平板', 'price': 3999},
    {'name': 'MacBook', 'category': '电脑', 'price': 9999},
    {'name': 'AirPods', 'category': '配件', 'price': 1299},
    {'name': 'Apple Watch', 'category': '手表', 'price': 2999}
]
groups = group_by_category(products)
# Print one heading per category followed by the products in it
for category, items in groups.items():
    print(f"\n{category}:")
    for item in items:
        print(f" {item['name']}: ¥{item['price']}")
3. 滑动窗口
from collections import deque
def moving_average(data, window_size):
    """Return the trailing moving average of *data*.

    For every element the result holds the mean of that element and the
    values before it, limited to the last *window_size* values. Until the
    window has filled up, the mean covers only the values seen so far.

    Args:
        data: Iterable of numbers.
        window_size: Number of most recent values to average; must be at
            least 1. ``None`` leaves the window unbounded, which yields the
            running mean of everything seen so far.

    Returns:
        A list with one average per input element.

    Raises:
        ValueError: If *window_size* is less than 1.
    """
    if window_size is not None and window_size < 1:
        # A zero-length window would stay empty and divide by zero below
        raise ValueError("window_size must be at least 1")
    # A bounded deque discards the oldest value once it is full
    window = deque(maxlen=window_size)
    averages = []
    for num in data:
        window.append(num)
        averages.append(sum(window) / len(window))
    return averages
# Usage
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
averages = moving_average(data, 3)
print("移动平均:", averages)
# Output: 移动平均: [1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
性能对比
import time
from collections import deque, defaultdict
# deque vs list
def test_performance(n=100000):
    """Time *n* front insertions into a deque and into a list, then print both.

    Args:
        n: Number of elements inserted into each container (default 100000).

    Returns:
        None. The two elapsed times are printed in seconds.
    """
    # perf_counter() is the clock intended for measuring short durations;
    # time.time() follows the system clock and can jump when it is adjusted.

    # deque: appendleft adds at the left end
    dq = deque()
    start = time.perf_counter()
    for i in range(n):
        dq.appendleft(i)
    deque_time = time.perf_counter() - start

    # list: insert(0, x) has to shift every existing element
    lst = []
    start = time.perf_counter()
    for i in range(n):
        lst.insert(0, i)
    list_time = time.perf_counter() - start

    print(f"deque: {deque_time:.4f}秒")
    print(f"list: {list_time:.4f}秒")
# Run the benchmark; the absolute timings depend on the machine
test_performance()
最佳实践
- 使用 Counter 进行计数和频率分析
- 使用 defaultdict 简化分组逻辑
- 使用 deque 实现队列或栈
- 使用 namedtuple 创建轻量级数据类
- 根据需求选择合适的容器类型
collections模块是Python标准库中非常强大的工具,掌握它们能让你的代码更加高效和优雅。