8.13 正则表达式-平芜编程栈

文章目录

前言
一、基本概念
二、re模块常用方法
- 1. 基本匹配方法
- 2. 编译正则表达式（提高性能）
三、正则表达式语法
- 1. 基本元字符
- 2. 量词（重复匹配）
- 3. 特殊字符类
四、分组和捕获
五、标志（Flags）
六、实际应用示例
- 1. 验证电子邮件
- 2. 提取URL
- 3. 提取电话号码
- 4. 密码强度验证
七、性能优化和最佳实践
- 1. 预编译正则表达式
- 2. 使用原始字符串（r前缀）
- 3. 避免贪婪匹配问题
- 4. 处理Unicode字符
八、常见错误和调试

前言

正则表达式是用于处理字符串的强大工具，Python通过re模块提供对正则表达式的支持。下面我将详细介绍Python中正则表达式的各个方面。

一、基本概念

正则表达式（Regular Expression）是一种用于匹配字符串中字符组合的模式，可以用于查找、替换和验证字符串。

二、re模块常用方法

1. 基本匹配方法

import re

# Tour of the six most commonly used module-level functions in ``re``.

# 1. re.match() - match only at the beginning of the string
text = "hello world"
result = re.match(r'hello', text)  # succeeds: the string starts with "hello"
print(result.group())  # hello

# 2. re.search() - scan the whole string and return the first match
text = "world hello world"
result = re.search(r'hello', text)  # succeeds: "hello" appears mid-string
print(result.group())  # hello

# 3. re.findall() - return every non-overlapping match as a list
text = "cat bat sat hat"
result = re.findall(r'.at', text)
print(result)  # ['cat', 'bat', 'sat', 'hat']

# 4. re.finditer() - like findall(), but yields match objects lazily
text = "cat bat sat hat"
for match in re.finditer(r'.at', text):
    # span() is the (start, end) index pair of the match within text
    print(match.group(), match.span())

# 5. re.sub() - replace every match
text = "Python is great. I love Python."
result = re.sub(r'Python', 'JavaScript', text)
print(result)  # JavaScript is great. I love JavaScript.

# 6. re.split() - split the string wherever the pattern matches
text = "apple,banana;orange grape"
result = re.split(r'[,;\s]', text)
print(result)  # ['apple', 'banana', 'orange', 'grape']

2. 编译正则表达式（提高性能）

import re

# Compile the pattern once so the resulting object can be reused for many
# searches without looking the pattern up again on every call.
pattern = re.compile(r'\d{3}-\d{3}-\d{4}')

text = "My phone is 123-456-7890"
result = pattern.search(text)
if result:  # search() returns None when nothing matches
    print("Found:", result.group())  # Found: 123-456-7890

三、正则表达式语法

1. 基本元字符

import re

# .  matches any single character except a newline
print(re.findall(r'h.t', 'hat hot hit hut'))  # ['hat', 'hot', 'hit', 'hut']

# [] character set: matches exactly one character from the set
# All four words have a vowel between 'h' and 't', so all four match.
print(re.findall(r'h[aeiou]t', 'hat hit hot het'))  # ['hat', 'hit', 'hot', 'het']
print(re.findall(r'[a-z]', 'Hello123'))  # ['e', 'l', 'l', 'o']
print(re.findall(r'[^0-9]', 'Hello123'))  # ['H', 'e', 'l', 'l', 'o'] - non-digits

# |  alternation (either side may match)
print(re.findall(r'cat|dog', 'I have a cat and a dog'))  # ['cat', 'dog']

# ^  start of string
print(re.findall(r'^Hello', 'Hello World'))  # ['Hello']
print(re.findall(r'^Hello', 'Say Hello'))  # [] - not at the start

# $  end of string
print(re.findall(r'World$', 'Hello World'))  # ['World']

2. 量词（重复匹配）

import re

# Quantifiers apply to the element immediately before them (here: 'b').

# *  zero or more times
print(re.findall(r'ab*c', 'ac abc abbc abbbc'))  # ['ac', 'abc', 'abbc', 'abbbc']

# +  one or more times
print(re.findall(r'ab+c', 'ac abc abbc abbbc'))  # ['abc', 'abbc', 'abbbc']

# ?  zero or one time
print(re.findall(r'ab?c', 'ac abc abbc'))  # ['ac', 'abc']

# {n}  exactly n times
print(re.findall(r'ab{2}c', 'abc abbc abbbc'))  # ['abbc']

# {n,}  at least n times
print(re.findall(r'ab{2,}c', 'abc abbc abbbc abbbbc'))  # ['abbc', 'abbbc', 'abbbbc']

# {n,m}  between n and m times (inclusive)
print(re.findall(r'ab{2,3}c', 'abc abbc abbbc abbbbc'))  # ['abbc', 'abbbc']

3. 特殊字符类

import re

# \d  digit ([0-9] under re.ASCII; any Unicode decimal digit by default)
print(re.findall(r'\d+', 'Phone: 123-456-7890'))  # ['123', '456', '7890']

# \D  non-digit
print(re.findall(r'\D+', 'Phone: 123-456-7890'))  # ['Phone: ', '-', '-']

# \w  word character ([a-zA-Z0-9_] under re.ASCII; for str patterns it also
#     matches Unicode letters and digits by default)
print(re.findall(r'\w+', 'Hello_World 123!'))  # ['Hello_World', '123']

# \W  non-word character
print(re.findall(r'\W+', 'Hello_World 123!'))  # [' ', '!']

# \s  whitespace character (includes [ \t\n\r\f\v])
print(re.findall(r'\s+', 'Hello World\nPython'))  # [' ', '\n']

# \S  non-whitespace character
print(re.findall(r'\S+', 'Hello World\nPython'))  # ['Hello', 'World', 'Python']

# \b  word boundary
print(re.findall(r'\bcat\b', 'cat catfish concat'))  # ['cat']

# \B  NOT a word boundary: "cat" must have a word character on BOTH sides.
# Every "cat" below touches a boundary on at least one side, so none match.
print(re.findall(r'\Bcat\B', 'cat catfish concat'))  # []
# An occurrence fully inside a word does match.
print(re.findall(r'\Bcat\B', 'concatenate'))  # ['cat']

四、分组和捕获

import re

# 1. Simple groups: findall() returns one tuple of groups per match
text = "John: 30, Jane: 25"
pattern = r'(\w+): (\d+)'
matches = re.findall(pattern, text)
print(matches)  # [('John', '30'), ('Jane', '25')]

# 2. Named groups: (?P<name>...) lets you fetch a group by name
pattern = r'(?P<name>\w+): (?P<age>\d+)'
match = re.search(pattern, text)
if match:
    print(match.group('name'))  # John
    print(match.group('age'))  # 30

# 3. Non-capturing group (?:...): groups for matching but is not captured,
#    so findall() returns only the single capturing group
text = "hello world hello python"
matches = re.findall(r'(?:hello) (\w+)', text)
print(matches)  # ['world', 'python']

# 4. Lookaround assertions (zero-width: they consume no characters)

# Positive lookahead (?=...)
text = "apple banana applepie"
matches = re.findall(r'apple(?=pie)', text)
print(matches)  # ['apple'] - only the "apple" that is followed by "pie"

# Negative lookahead (?!...)
matches = re.findall(r'apple(?!pie)', text)
print(matches)  # ['apple'] - only the "apple" that is NOT followed by "pie"

# Positive lookbehind (?<=...)
text = "100美元 200人民币"
matches = re.findall(r'(?<=\d{3})人民币', text)
print(matches)  # ['人民币']

# Negative lookbehind (?<!...)
text = "100美元 200人民币"
matches = re.findall(r'(?<!\d{3})人民币', text)
print(matches)  # [] - the only occurrence is preceded by three digits

五、标志（Flags）

import re

text = "Hello\nWorld\nPython"

# re.IGNORECASE (re.I) - case-insensitive matching
print(re.findall(r'hello', text, re.I))  # ['Hello']

# re.MULTILINE (re.M) - ^ and $ also match at the start/end of each line
print(re.findall(r'^.+$', text, re.M))  # ['Hello', 'World', 'Python']

# re.DOTALL (re.S) - make . match every character, including newline
print(re.findall(r'Hello.*Python', text, re.S))  # ['Hello\nWorld\nPython']

# re.VERBOSE (re.X) - whitespace in the pattern is ignored and '#' starts a
# comment that runs to the end of the line. Each piece must therefore sit on
# its own line: on a single line everything after the first '#' would be
# treated as a comment and the pattern would shrink to just \d{3}.
pattern = re.compile(r'''
    \d{3}  # 区号
    -      # 分隔符
    \d{3}  # 前缀
    -      # 分隔符
    \d{4}  # 线路号
''', re.VERBOSE)
print(pattern.search("Call 123-456-7890").group())  # 123-456-7890

# Combine several flags with the bitwise OR operator
pattern = re.compile(r'^hello', re.I | re.M)

六、实际应用示例

1. 验证电子邮件

import re


def validate_email(email):
    """Return True if *email* looks like a syntactically valid address.

    The check is deliberately simple: a local part, an ``@``, a domain and
    a top-level domain of at least two letters. It does not implement the
    full RFC 5322 grammar.

    Args:
        email: The string to validate.

    Returns:
        bool: True when the whole string matches the pattern, else False.
    """
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    # fullmatch() rather than match(): with match(), '$' also succeeds just
    # before a trailing newline, so "user@example.com\n" would be accepted.
    return bool(re.fullmatch(pattern, email))


emails = ["test@example.com", "invalid.email", "name@domain.co.uk"]
for email in emails:
    print(f"{email}:{validate_email(email)}")

2. 提取URL

import re


def extract_urls(text):
    """Return all http/https URLs found in *text*.

    Args:
        text: The string to scan.

    Returns:
        list[str]: The matched URLs in order of appearance (possibly empty).
    """
    # scheme, then host (letters, digits, '-', '.', or %XX escapes), then an
    # optional path. The path character class must NOT contain a space:
    # with one, a match runs on into the following words and swallows the
    # scheme of the next URL.
    pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[/\w.-]*/?'
    return re.findall(pattern, text)


text = "Visit https://www.example.com and http://sub.domain.co.uk/path"
print(extract_urls(text))  # ['https://www.example.com', 'http://sub.domain.co.uk/path']

3. 提取电话号码

import re


def extract_phone_numbers(text):
    """Return all 10-digit phone numbers found in *text*.

    Accepts an optional ``+<country code>`` prefix, optional parentheses
    around the area code, and '-', '.' or a single whitespace character
    between the digit groups.

    Args:
        text: The string to scan.

    Returns:
        list[str]: The full matched phone numbers in order of appearance.
    """
    # The country-code group is non-capturing: with a capturing group,
    # findall() returns only that group's text (usually '') instead of the
    # whole number. \s in the separators allows forms like "(987) 654-3210".
    pattern = r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
    return re.findall(pattern, text)


text = "Call me at 123-456-7890 or (987) 654-3210"
print(extract_phone_numbers(text))  # ['123-456-7890', '(987) 654-3210']

4. 密码强度验证

import re


def validate_password(password):
    """Return True if *password* meets the strength requirements.

    A valid password has at least 8 characters and contains at least one
    uppercase letter, one lowercase letter, one digit and one special
    character.

    Args:
        password: The candidate password string.

    Returns:
        bool: True when every requirement is satisfied, else False.
    """
    if len(password) < 8:
        return False

    checks = [
        r'[A-Z]',  # uppercase letter
        r'[a-z]',  # lowercase letter
        r'[0-9]',  # digit
        r'[!@#$%^&*()_+\-=\[\]{};:\'",.<>/?\\|`~]',  # special character
    ]
    # Every character class must be found somewhere in the password.
    return all(re.search(pattern, password) for pattern in checks)


passwords = ["Weak", "StrongPass1!", "NoSpecialChar1", "GoodPass#2024"]
for pwd in passwords:
    print(f"{pwd}:{'Valid' if validate_password(pwd) else 'Invalid'}")

七、性能优化和最佳实践

1. 预编译正则表达式

import re

# Patterns that are used repeatedly are compiled once, up front, so the
# loop below reuses the compiled objects.
phone_pattern = re.compile(r'\d{3}-\d{3}-\d{4}')
email_pattern = re.compile(r'^\w+@\w+\.\w+$')

# Reuse the compiled pattern for every text
texts = ["Call 123-456-7890", "Email: test@example.com"]
for text in texts:
    # Search once and keep the match object instead of searching twice.
    found = phone_pattern.search(text)
    if found:
        print("Found phone:", found.group())

2. 使用原始字符串（r前缀）

import re

# Always write patterns as raw strings so that backslashes reach the regex
# engine unchanged instead of being interpreted as string escapes first.
pattern = r'\d+'  # correct
pattern = '\\d+'  # also correct, but error-prone: every backslash is doubled

3. 避免贪婪匹配问题

import re

text = "<div>content1</div><div>content2</div>"

# Greedy matching (the default): .* grabs as much as possible, so the match
# runs from the first <div> to the LAST </div>
print(re.findall(r'<div>.*</div>', text))  # ['<div>content1</div><div>content2</div>']

# Non-greedy (lazy) matching: .*? stops at the first </div> it can
print(re.findall(r'<div>.*?</div>', text))  # ['<div>content1</div>', '<div>content2</div>']

4. 处理Unicode字符

import re

# Matching Unicode characters. For str patterns in Python 3, \w already
# matches Unicode letters by default, so re.UNICODE is redundant here and is
# kept only to make the intent explicit. Use re.ASCII to restrict \w to
# [a-zA-Z0-9_].
text = "Résumé Café naïve"
print(re.findall(r'\w+', text, re.UNICODE))  # ['Résumé', 'Café', 'naïve']

八、常见错误和调试

import re

# 1. Forgetting to escape special characters
text = "a.b"
# Wrong: an unescaped . matches ANY character. It happens to give the same
# result on this input, but it would also match strings such as "aXb".
print(re.findall(r'a.b', text))  # ['a.b']
# Right: escape the dot so that it matches only a literal '.'
print(re.findall(r'a\.b', text))  # ['a.b']

# 2. Using re.match() while expecting it to search the whole string
text = "start middle end"
# match() only checks the beginning of the string
result = re.match(r'middle', text)  # None
# search() scans the entire string
result = re.search(r'middle', text)  # succeeds

# 3. Debugging a complex regular expression
pattern = r'(\d{3})-(\d{3})-(\d{4})'
text = "Phone: 123-456-7890"
# re.DEBUG prints debug information about the compiled expression
compiled = re.compile(pattern, re.DEBUG)
match = compiled.search(text)
if match:
    print("Groups:", match.groups())  # Groups: ('123', '456', '7890')