正则表达式
正则表达式
正则表达式是用于匹配字符串的强大工具。Python 的 re 模块提供了完整的正则表达式支持,是文本处理的利器。
基本匹配
import re

text = "Hello, my email is user@example.com"

# re.search scans the string and returns the first match, or None when
# nothing matches, so the result is guarded before calling .group().
match = re.search(r'[\w.]+@[\w.]+', text)
if match:
    print(match.group())  # user@example.com

# re.fullmatch succeeds only when the entire string matches the pattern.
result = re.fullmatch(r'\d+', "12345")
print(result is not None)  # True
常用元字符
import re

# One (pattern, sample text) pair per metacharacter. Every pair is passed to
# re.findall in the loop below; the trailing comment shows the printed list.
metachar_demos = [
    # .      any character (except a newline)
    (r'h.t', 'hat hot hit'),                  # ['hat', 'hot', 'hit']
    # ^      start-of-line anchor
    (r'^Hello', 'Hello World'),               # ['Hello']
    # $      end-of-line anchor
    (r'World$', 'Hello World'),               # ['World']
    # *      zero or more repetitions
    (r'ab*c', 'ac abc abbc'),                 # ['ac', 'abc', 'abbc']
    # +      one or more repetitions
    (r'ab+c', 'ac abc abbc'),                 # ['abc', 'abbc']
    # ?      zero or one repetition
    (r'colou?r', 'color colour'),             # ['color', 'colour']
    # {n}    exactly n repetitions
    (r'\d{3}', '12 123 1234'),                # ['123', '123']
    # {n,m}  between n and m repetitions
    (r'\d{2,4}', '1 12 123 1234 12345'),      # ['12', '123', '1234', '1234']
]

for pattern, sample in metachar_demos:
    print(re.findall(pattern, sample))
字符类
import re

# [abc] - matches any one of the listed characters
vowel = re.compile(r'[aeiou]')
print(vowel.findall('hello world'))  # ['e', 'o', 'o']

# [^abc] - matches any character that is NOT listed
non_vowel = re.compile(r'[^aeiou\s]')
print(non_vowel.findall('hello'))  # ['h', 'l', 'l']

# \d - a digit
digit_run = re.compile(r'\d+')
print(digit_run.findall('abc123def456'))  # ['123', '456']

# \w - a letter, digit, or underscore
word_run = re.compile(r'\w+')
print(word_run.findall('hello_world 123'))  # ['hello_world', '123']

# \s - a whitespace character
space_run = re.compile(r'\s+')
print(space_run.findall('a b c'))  # [' ', ' ']
分组
import re

# Numbered groups: each pair of parentheses captures one date component.
text = "2024-01-15"
match = re.search(r'(\d{4})-(\d{2})-(\d{2})', text)
if match:
    # Everything that reads from the match stays under the guard, because
    # re.search returns None when the pattern is not found.
    print(match.group(0))  # 2024-01-15 (the whole match)
    print(match.group(1))  # 2024
    print(match.group(2))  # 01
    print(match.group(3))  # 15
    print(match.groups())  # ('2024', '01', '15')

# Named groups: (?P<name>...) lets a capture be fetched by name.
match = re.search(r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})', text)
if match:
    print(match.group('year'))  # 2024
    print(match.groupdict())  # {'year': '2024', 'month': '01', 'day': '15'}
匹配模式
import re

# IGNORECASE: letters match regardless of case
any_case_hello = re.compile(r'hello', re.IGNORECASE)
print(any_case_hello.findall('Hello HELLO hello'))
# ['Hello', 'HELLO', 'hello']

# MULTILINE: ^ matches at the start of every line, not only the string start
text = "Line1\nLine2\nLine3"
line_start = re.compile(r'^Line\d', re.MULTILINE)
print(line_start.findall(text))  # ['Line1', 'Line2', 'Line3']

# DOTALL: the dot also matches newline characters
across_lines = re.compile(r'.+', re.DOTALL)
print(across_lines.findall('Line1\nLine2'))  # ['Line1\nLine2']
常用模式
import re

# The validation patterns end with \Z rather than $: in Python, $ also
# matches just before a trailing newline, so a $-anchored pattern would
# accept input such as 'user@example.com\n'. \Z matches only at the very
# end of the string. (Calling re.fullmatch is an equivalent way to require
# that the whole string matches.)

# Email address validation
email_pattern = r'^[\w.+-]+@[\w-]+\.[\w.]+\Z'
print(bool(re.match(email_pattern, 'user@example.com')))  # True

# Mobile phone number: a leading 1, a second digit in 3-9, then nine digits
phone_pattern = r'^1[3-9]\d{9}\Z'
print(bool(re.match(phone_pattern, '13812345678')))  # True

# URL extraction; the pattern is unanchored because it is used to find URLs
# embedded inside longer text.
url_pattern = r'https?://[\w.-]+(?:\.[\w.-]+)+(?:/[\w.-]*)*'
print(re.findall(url_pattern, 'Visit https://example.com/path'))
# ['https://example.com/path']
正则表达式功能强大,熟练掌握能大幅提升文本处理效率。