🔍

正则表达式

📂 python ⏱ 2 min 240 words

正则表达式

正则表达式是用于匹配字符串的强大工具。Python的 re 模块提供了完整的正则表达式支持，是文本处理的利器。

基本匹配

import re

text = "Hello, my email is user@example.com"

# Search: scan the string and stop at the first place the pattern matches.
email_like = re.compile(r'[\w.]+@[\w.]+')
match = email_like.search(text)
if match is not None:
    print(match[0])  # user@example.com

# Full match: succeeds only when the entire string fits the pattern.
# A match object is always truthy, so bool() reports whether one was found.
result = re.compile(r'\d+').fullmatch("12345")
print(bool(result))  # True

常用元字符

import re

# Each row pairs a pattern with a sample string. The comment above a row names
# the metacharacter being shown and the list that findall() prints for it.
metachar_demos = [
    # .  any single character except a newline -> ['hat', 'hot', 'hit']
    (r'h.t', 'hat hot hit'),
    # ^  anchor at the start of the string -> ['Hello']
    (r'^Hello', 'Hello World'),
    # $  anchor at the end of the string -> ['World']
    (r'World$', 'Hello World'),
    # *  zero or more repetitions -> ['ac', 'abc', 'abbc']
    (r'ab*c', 'ac abc abbc'),
    # +  one or more repetitions -> ['abc', 'abbc']
    (r'ab+c', 'ac abc abbc'),
    # ?  zero or one repetition -> ['color', 'colour']
    (r'colou?r', 'color colour'),
    # {n}  exactly n repetitions -> ['123', '123']
    # (matches do not overlap: '1234' yields '123' and the leftover '4' is too short)
    (r'\d{3}', '12 123 1234'),
    # {n,m}  between n and m repetitions -> ['12', '123', '1234', '1234']
    # (greedy: '12345' yields '1234' and the leftover '5' is too short)
    (r'\d{2,4}', '1 12 123 1234 12345'),
]

for pattern, sample in metachar_demos:
    print(re.findall(pattern, sample))

字符类

import re

# [abc] - any one of the listed characters
vowel = re.compile(r'[aeiou]')
print(vowel.findall('hello world'))  # ['e', 'o', 'o']

# [^abc] - any one character that is NOT listed (here: not a vowel, not whitespace)
not_vowel_or_space = re.compile(r'[^aeiou\s]')
print(not_vowel_or_space.findall('hello'))  # ['h', 'l', 'l']

# \d - a digit
digit_run = re.compile(r'\d+')
print(digit_run.findall('abc123def456'))  # ['123', '456']

# \w - a letter, digit, or underscore
word_run = re.compile(r'\w+')
print(word_run.findall('hello_world 123'))  # ['hello_world', '123']

# \s - a whitespace character
blank_run = re.compile(r'\s+')
print(blank_run.findall('a b  c'))  # [' ', '  ']

分组

import re

text = "2024-01-15"

# Numbered groups: each (...) captures one field; group 0 is the whole match.
date_re = re.compile(r'(\d{4})-(\d{2})-(\d{2})')
match = date_re.search(text)
if match is not None:
    # group() with several indices returns a tuple; printed one item per line:
    # 2024-01-15 (whole match), 2024, 01, 15
    print(*match.group(0, 1, 2, 3), sep='\n')
    print(match.groups())  # ('2024', '01', '15')

# Named groups: (?P<name>...) lets a captured field be looked up by name.
named_date_re = re.compile(r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})')
match = named_date_re.search(text)
if match is not None:
    print(match['year'])      # 2024
    print(match.groupdict())  # {'year': '2024', 'month': '01', 'day': '15'}

匹配模式（标志位 flags）

import re

# re.IGNORECASE: letters match regardless of case.
any_case_hello = re.compile(r'hello', flags=re.IGNORECASE)
print(any_case_hello.findall('Hello HELLO hello'))
# ['Hello', 'HELLO', 'hello']

# re.MULTILINE: ^ also matches right after every newline, not only at the start.
text = "Line1\nLine2\nLine3"
line_start = re.compile(r'^Line\d', flags=re.MULTILINE)
print(line_start.findall(text))  # ['Line1', 'Line2', 'Line3']

# re.DOTALL: the dot matches newlines too, so one match spans both lines.
anything = re.compile(r'.+', flags=re.DOTALL)
print(anything.findall('Line1\nLine2'))  # ['Line1\nLine2']

常用模式

import re

# Email validation (a simplified shape check, not a full address grammar).
# The pattern ends with \Z, which matches only at the very end of the string.
# `$` would also match just before a trailing newline, so an input such as
# 'user@example.com\n' would wrongly pass validation.
email_pattern = r'^[\w.+-]+@[\w-]+\.[\w.]+\Z'
print(bool(re.match(email_pattern, 'user@example.com')))  # True

# Mobile number: 11 digits, a leading 1, then 3-9, then nine more digits.
# [0-9] is used instead of \d because, for str patterns, \d also matches
# non-ASCII Unicode decimal digits, which a phone-number check should reject.
# \Z (rather than $) rejects input with a trailing newline.
phone_pattern = r'^1[3-9][0-9]{9}\Z'
print(bool(re.match(phone_pattern, '13812345678')))  # True

# URL: scheme, a host containing at least one dot, then optional path segments.
# All groups are non-capturing, so findall() returns the whole matched URL.
url_pattern = r'https?://[\w.-]+(?:\.[\w.-]+)+(?:/[\w.-]*)*'
print(re.findall(url_pattern, 'Visit https://example.com/path'))
# ['https://example.com/path']

正则表达式功能强大，熟练掌握能大幅提升文本处理效率。