🎯

集合类型

📂 python ⏱ 3 min 589 words

集合的创建

集合（set）是无序、不重复的元素集合，非空集合用花括号 {} 把元素括起来表示：

# Build sets directly with a brace literal.
fruits = {"苹果", "香蕉", "橘子"}
numbers = {1, 2, 3, 4, 5}

# A bare {} is an empty dict, so an empty set has to come from set().
empty_dict = {}
empty = set()

# Convert other iterables; duplicate items collapse automatically.
duplicated = [1, 2, 2, 3, 3, 3]
list_to_set = set(duplicated)
print(list_to_set)  # {1, 2, 3}

greeting = "Hello"
str_to_set = set(greeting)
print(str_to_set)   # 'H', 'e', 'l', 'o' in some order (the repeated 'l' is dropped)

# Set comprehension: squares of 1..5.
squares = {n * n for n in range(1, 6)}
print(squares)  # contains 1, 4, 9, 16, 25

注意：集合中的元素必须是可哈希（hashable）的对象，例如数字、字符串，以及只包含可哈希元素的元组；不能包含列表、字典或集合等可变类型。

集合的基本操作

fruits = {"苹果", "香蕉", "橘子"}

# add() inserts a single element
fruits.add("西瓜")
print(fruits)  # e.g. {'苹果', '香蕉', '橘子', '西瓜'} (display order may vary)

# update() inserts every element of an iterable
fruits.update(["葡萄", "芒果"])
print(fruits)

# Removing elements
fruits.remove("苹果")  # raises KeyError if the element is missing
fruits.discard("不存在")  # does nothing if the element is missing

# pop() removes and returns an arbitrary element
popped = fruits.pop()
print(popped)

# clear() empties the set
fruits.clear()
print(fruits)  # set()

集合运算

集合支持数学中的集合运算：

A = {1, 2, 3, 4, 5}
B = {4, 5, 6, 7, 8}

# Union: every element that appears in A or in B.
union_by_operator = A | B
union_by_method = A.union(B)
print(union_by_operator)  # {1, 2, 3, 4, 5, 6, 7, 8}
print(union_by_method)    # same result

# Intersection: elements present in both A and B.
shared_by_operator = A & B
shared_by_method = A.intersection(B)
print(shared_by_operator)  # {4, 5}
print(shared_by_method)    # same result

# Difference: elements of A that are not in B.
only_a_by_operator = A - B
only_a_by_method = A.difference(B)
print(only_a_by_operator)  # {1, 2, 3}
print(only_a_by_method)    # same result

# Symmetric difference: elements in exactly one of A and B.
either_by_operator = A ^ B
either_by_method = A.symmetric_difference(B)
print(either_by_operator)  # {1, 2, 3, 6, 7, 8}
print(either_by_method)    # same result

集合关系判断

A = {1, 2, 3}
B = {1, 2, 3, 4, 5}
C = {6, 7, 8}

# Subset: every element of A is also in B.
a_inside_b = A.issubset(B)
a_inside_b_operator = A <= B
print(a_inside_b)           # True (A is a subset of B)
print(a_inside_b_operator)  # True

# Superset: B contains every element of A.
b_covers_a = B.issuperset(A)
b_covers_a_operator = B >= A
print(b_covers_a)           # True (B is a superset of A)
print(b_covers_a_operator)  # True

# Disjoint: A and C share no elements.
a_apart_from_c = A.isdisjoint(C)
print(a_apart_from_c)  # True

不可变集合

frozenset 是不可变版本的集合，可以作为字典的键或集合的元素：

fs = frozenset(range(1, 6))

# A frozenset has no mutating methods:
# fs.add(6)  # AttributeError

# It can be used as an element of another set ...
first_pair = frozenset([1, 2])
second_pair = frozenset([3, 4])
nested_sets = {first_pair, second_pair}
print(nested_sets)

# ... and as a dictionary key.
locations = {}
locations[frozenset(["北京", "天津"])] = "华北"
locations[frozenset(["上海", "杭州"])] = "华东"

集合推导式

# Basic comprehension: squares of 0..9.
squares = {x**2 for x in range(10)}
# {0, 1, 4, 9, 16, 25, 36, 49, 64, 81}

# With a filter: squares of the even numbers only.
even_squares = {x**2 for x in range(10) if x % 2 == 0}
# {0, 4, 16, 36, 64}

# Derived values: the distinct word lengths.
words = ["hello", "world", "python", "hello"]
unique_lengths = {len(word) for word in words}
# {5, 6}

# Elements common to two lists.
list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
# Convert list2 to a set once so each membership test is a hash lookup
# rather than a linear scan of the list.
list2_members = set(list2)
common = {x for x in list1 if x in list2_members}
# {4, 5}

常用模式

列表去重

numbers = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]

# Approach 1: round-trip through a set (the original order is not kept).
distinct = set(numbers)
unique = list(distinct)
print(unique)  # e.g. [1, 2, 3, 4]

# Approach 2: order-preserving; the set records what was already emitted.
seen = set()
unique_ordered = []
for value in numbers:
    if value in seen:
        continue
    seen.add(value)
    unique_ordered.append(value)
print(unique_ordered)  # [1, 2, 3, 4]

# Approach 3: dict keys are unique and keep insertion order (Python 3.7+).
first_seen = dict.fromkeys(numbers)
unique_ordered = list(first_seen)

快速成员检测

# Membership tests on a set are much faster than on a list
large_list = list(range(1000000))
large_set = set(range(1000000))

import timeit

# List lookup: run the membership test 100 times against the list.
# globals=globals() lets the timed statement see large_list / large_set.
list_time = timeit.timeit("999999 in large_list", globals=globals(), number=100)
# Set lookup: the same 100 membership tests against the set
set_time = timeit.timeit("999999 in large_set", globals=globals(), number=100)

print(f"列表: {list_time:.4f}秒")
print(f"集合: {set_time:.6f}秒")
# The set lookup is typically more than 100x faster than the list lookup

数据分析

from collections import Counter

# Analyse user activity: the users seen on each of three days.
users_day1 = {"Alice", "Bob", "Charlie", "David"}
users_day2 = {"Bob", "David", "Eve", "Frank"}
users_day3 = {"Alice", "Charlie", "Eve", "George"}

# Users active on all three days (intersection)
always_active = users_day1 & users_day2 & users_day3
print(f"始终活跃: {always_active}")

# Users active on at least one day (union)
all_users = users_day1 | users_day2 | users_day3
print(f"所有用户: {all_users}")

# Users active on the first day only (difference)
only_day1 = users_day1 - users_day2 - users_day3
print(f"仅第一天活跃: {only_day1}")

# Number of active days per user. Each day is a set, so a user is counted at
# most once per day; Counter.update() adds one for every element it receives.
activity = Counter()
for daily_users in (users_day1, users_day2, users_day3):
    activity.update(daily_users)
print(f"活跃天数: {dict(activity)}")

标签系统

# Tags attached to each article
article1_tags = {"Python", "编程", "入门"}
article2_tags = {"Python", "进阶", "面向对象"}
article3_tags = {"JavaScript", "前端", "入门"}

# Every tag used across the three articles
all_tags = {"Python", "JavaScript", "编程", "前端", "入门", "进阶", "面向对象"}

# Find the Python-related articles by testing each tag set for membership,
# rather than listing the matching articles by hand.
python_articles = [
    tags
    for tags in (article1_tags, article2_tags, article3_tags)
    if "Python" in tags
]

# Tags shared by articles 1 and 2
common_tags = article1_tags & article2_tags
print(f"共同标签: {common_tags}")  # {'Python'}

# Union of the tags of all three articles
all_article_tags = article1_tags | article2_tags | article3_tags
print(f"所有标签: {all_article_tags}")

frozenset的应用

# Map each permission combination (a frozenset key) to a role label.
read_only = frozenset(["read"])
editor = frozenset(["read", "write"])
administrator = frozenset(["read", "write", "admin"])
permissions = {
    read_only: "只读用户",
    editor: "编辑用户",
    administrator: "管理员",
}

# Look up the role for a user's permission set.
user_perms = frozenset(["read", "write"])
role = permissions[user_perms]
print(role)  # 编辑用户

# Storing sets inside a set: the inner sets are frozensets.
nested = {frozenset([1, 2]), frozenset([3, 4])}
member_count = len(nested)
print(member_count)  # 2

集合的局限性

# Mutable (unhashable) objects cannot be stored in a set
# {[1, 2]}  # TypeError: unhashable type: 'list'

# Sets do not support indexing
s = {1, 2, 3}
# s[0]  # TypeError: 'set' object is not subscriptable

# Sets are unordered, so never rely on element order
s = {3, 1, 2}
print(s)  # may display as {1, 2, 3} or {3, 1, 2}

总结

集合是Python中用于去重和集合运算的高效数据结构。掌握了集合的创建、运算和应用场景后，你就能更优雅地处理数据去重、成员检测和关系分析等问题。至此，Python基础数据类型的学习已完成，你已经具备了编写Python程序的核心能力。