正则表达式

match search findall finditer

re.match(pattern, string[, flags])：扫描字符串开头
re.search(): 扫描整个字符串，返回第一个匹配成功的结果
re.findall()：返回所有结果的列表，失败返回空列表
re.finditer()：返回由匹配对象组成的迭代器，没有匹配时迭代器为空
flags 比如是否区分大小写。
re.I：忽略大小写 ignorecase
re.M:多行匹配会影响$和^ multiline
re.S:会匹配包括换行符在内的所有字符 # DOTALL dot matches all


import re
# Plain literal pattern shared by every call in this section.
tmp = 'http'

# re.match only tries the pattern at the very beginning of the string.
print(re.match(pattern=tmp, string='http://www.baidu.com'))
# <_sre.SRE_Match object; span=(0, 4), match='http'>
# (Python 3.7+ shows the type as re.Match instead of _sre.SRE_Match)
print(re.match(pattern=tmp, string='://wwwhttp.baidu.com'))
# None
print(re.match(pattern=tmp, string='http://www.baidu.com').span())
# (0, 4)

# re.search scans the whole string and returns the first hit;
# re.I makes the comparison case-insensitive.
print(re.search(pattern=tmp, string='www.Http://baidu.com', flags=re.I))
# <_sre.SRE_Match object; span=(4, 8), match='Http'>

# re.findall returns every non-overlapping match as a list of strings.
print(re.findall(pattern=tmp, string='http://www.http.com'))
# ['http', 'http']

# re.finditer yields one match object per hit instead of building a list.
find = re.finditer(pattern=tmp, string='http://HTTp.com0', flags=re.I)
for match_obj in find:
    print(match_obj)
# <_sre.SRE_Match object; span=(0, 4), match='http'>
# <_sre.SRE_Match object; span=(7, 11), match='HTTp'>

各种匹配符号

. 匹配除换行符以外的所有字符
[] 是字符的集合,匹配括号中任意字符
[a-zA-Z] 大小写字符
[0-9]
[^0-9]除数字外的所有字符 ^托字符
\d 匹配数字，效果等同于[0-9]
\D 匹配除数字以外的字符，效果等同于[^0-9]
\w 匹配数字、字母及下划线，效果等同于[0-9a-zA-Z_]
\W 对\w取反，效果等同于[^0-9a-zA-Z_]
\s 匹配任意空白字符（空格、换行、制表、换页、回车、垂直制表），效果等同于[ \n\t\f\r\v]
\S 对\s取反，匹配任意非空白字符

# '.' matches any character except a newline, so the '\n' is skipped.
print(re.findall('.', 'http:\n//'))
# ['h', 't', 't', 'p', ':', '/', '/']
# A character set matches any single character listed inside the brackets.
print(re.findall('[12345]', 'qhchg461905nkj'))
# ['4', '1', '5']


# The r/R prefix only stops Python from processing backslash escapes in the
# string literal; the regex engine still receives \D and reads it as
# "any non-digit character".
print(re.findall(r'\D', '123qwr'))
# ['q', 'w', 'r']

# Written as a raw string: '\S' without the r prefix is an invalid string
# escape that newer Python versions warn about.
print(re.findall(r'\S', '123 gh \n 78\r'))
# ['1', '2', '3', 'g', 'h', '7', '8']

^ 行首匹配(不在方括号内)
$ 行尾匹配（不在方括号内）当需要进行多行匹配时候，需要设置flags=re.M，如果不写默认匹配字符串的开头和结尾只匹配一次
\A 行首匹配与^的区别：\A只匹配整个字符串的开头，即使有flags = re.M参数也不会匹配其他的行首
\Z 行尾匹配与$的区别：\Z只匹配整个字符串的结尾，即使有flags = re.M参数也不会匹配其他的行尾
\b 匹配单词边界，即单词字符（\w）与非单词字符（或字符串首尾）之间的位置，比如r'ce\b'可以匹配nice，不能匹配nicer
\B 匹配非单词边界

# '^' anchors at the start of the string, so this behaves like re.match:
# "nice" is not at position 0, hence no result.
print(re.findall(pattern='^nice', string='to nice meet you'))
# []

# With re.M, '^' also matches at the beginning of every line.
print(re.findall(pattern='^nice', string='nice to meet you\nnice to meet you', flags=re.M))
# ['nice', 'nice']

# '$' anchors at the end: does the string end with "com"?
print(re.findall(pattern='com$', string='http:baidu.com'))
# ['com']

# '\b' is an escape sequence in an ordinary Python string but also has a
# special meaning (word boundary) in a regex. The r prefix switches off the
# string escape so the regex meaning is kept.
# Only the "ce" of the standalone word "nice" is followed by a boundary.
print(re.findall(pattern=r'ce\b', string='nice to meet niceto meet'))
# ['ce']

(abc) 匹配小括号中的字符作为整体
x? ? 匹配0或1个x字符 贪婪匹配（优先匹配1个；写成x??才是非贪婪匹配）
x+ + 匹配至少一个x 贪婪匹配
x* * 匹配任意多个x 贪婪匹配
x{n} 匹配确定的n个x 非负整数
x{n,} 匹配大于等于n个x
x{n,m} 匹配至少n个x，最多m个
x|y 匹配x或y

# A character set matches each listed character individually.
print(re.findall(pattern='[abc]', string='qwracb'))
# ['a', 'c', 'b']

# Parentheses treat "abc" as one unit (and capture it as a group).
print(re.findall(pattern='(abc)', string='qwrabc'))
# ['abc']

# '?' allows zero or one 'a'; the trailing '' is the empty match at the end.
print(re.findall(pattern='a?', string='a'))
# ['a', '']

# With a group, findall reports the group's text; '' marks empty matches.
print(re.findall(pattern='(aa)?', string='aaab'))
# ['aa', '', '', '']

print(re.findall(pattern='(aa)', string='aaaab'))
# ['aa', 'aa']
print(re.findall(pattern='(aa)+', string='aaaab'))
# Greedy: '(aa)+' consumes the whole run 'aaaa' as a single match, and
# findall reports the group's capture for that one match: ['aa']

print(re.findall(pattern='aa+', string='aabaaaabaaaba'))
# The final lone 'a' is too short to match: ['aa', 'aaaa', 'aaa']

print(re.findall(pattern='a*', string='abaabbaaabaaaabb'))
# Any number of 'a', including zero (hence the empty strings)
# ['a', '', 'aa', '', '', 'aaa', '', 'aaaa', '', '', '']

print(re.findall(pattern='a{3}', string='aabaaacsdsaaa'))
# ['aaa', 'aaa']
print(re.findall(pattern='a{3,}', string='aabbaaabbaaaa'))
# ['aaa', 'aaaa']

print(re.findall(pattern='(good)', string='good--Good-good'))
# ['good', 'good']
print(re.findall(pattern='good', string='good--Good-good'))
# ['good', 'good']

# With several groups, findall returns one tuple per match; groups that did
# not take part in a match show up as ''.
print(re.findall(pattern='(good)|(Good)', string='good--Good'))
# [('good', ''), ('', 'Good')]
print(re.findall(pattern='good|Good', string='good--Good'))
# ['good', 'Good']

特殊元字符

. 匹配除换行符以外的所有字符
* 匹配任意多个贪婪匹配
? 匹配0个或一个
.*? 将贪婪匹配变为非贪婪


# '.' needs exactly one character, so an empty string gives no match.
print(re.findall(pattern='.', string=''))
# []
# '.*' may match zero characters, so it produces one empty match.
print(re.findall(pattern='.*', string=''))
# ['']

# Greedy: '.*' grabs as much as possible, so the match runs to the last " he".
r = 'who .* he'
print(re.findall(pattern=r, string='who is a girl he who is boy he '))
# ['who is a girl he who is boy he'] greedy

# Non-greedy: '.*?' stops at the first " he" each time.
r = 'who .*? he'
print(re.findall(pattern=r, string='who is a girl he who is boy he who  he'))
# ['who is a girl he', 'who is boy he', 'who  he'] non-greedy

分组

正则中有组的概念
分组：group
正则有判断是否匹配的功能，还提供了提取子串的功能，用()表示提取的分组，从外向里，从左向右标注第几组

# Patterns are written as raw strings so that '\d' reaches the regex engine
# untouched; in a normal literal '\d' is an invalid string escape that newer
# Python versions warn about.
tmp = r'\d{3}-\d{8}'
str1 = '010-12345678'
result = re.findall(tmp, str1)
print(result)
# ['010-12345678']

tmp2 = r'(\d{3})-(\d{8})'  # each parenthesised part is one group
result2 = re.findall(tmp2, str1)
print(result2)
# [('010', '12345678')]

tmp3 = r'(\d{3})-(\d{8})'
result3 = re.match(tmp3, str1)
print(result3)
# <_sre.SRE_Match object; span=(0, 12), match='010-12345678'>

# groups() returns all captured groups as a tuple
print(result3.groups())  # ('010', '12345678')

# group() fetches a single group
# group(0) is always the whole matched text
print(result3.group())   # 010-12345678
print(result3.group(0))  # 010-12345678
print(result3.group(1))  # 010
print(result3.group(2))  # 12345678

# Named groups: (?P<name>...)
tmp4 = r'(?P<love>\d{3})-(?P<like>\d{8})'
result4 = re.match(tmp4, str1)
print(result4.group(0))       # 010-12345678
print(result4.group(1))       # 010
print(result4.group(2))       # 12345678
print(result4.group('love'))  # 010

正则与字符串

str1 = 'ac b  c   d     e'
# str.split() with no argument splits on runs of whitespace.
print(str1.split())
# ['ac', 'b', 'c', 'd', 'e']

# re.split with ' +' splits on every run of one or more spaces.
print(re.split(pattern=' +', string=str1))
# ['ac', 'b', 'c', 'd', 'e']

# Replacing text
# re.sub(pattern, repl, string, count=0)
#   pattern: regex describing the text to be replaced
#   repl:    the replacement text
#   count:   maximum number of replacements (0 replaces every occurrence)
# re.subn does the same but returns a (new_string, replacement_count) tuple.
res2 = re.sub(pattern=' +', repl='*', string=str1)
print(res2)
# ac*b*c*d*e

res2 = re.sub(pattern=' +', repl='*', string=str1, count=2)
print(res2)
# ac*b*c   d     e

res2 = re.subn(pattern=' +', repl='*', string=str1)
print(res2)
# ('ac*b*c*d*e', 4)

compile

# Module-level function: the pattern string is passed on every call.
# Raw strings keep '\d' intact for the regex engine; in a normal literal it
# is an invalid string escape that newer Python versions warn about.
tmp = r'\d{3}-\d{8}'
str1 = '090-99999999'
res = re.match(tmp, str1)
print(res)
# <_sre.SRE_Match object; span=(0, 12), match='090-99999999'>

# Compiled form: build the pattern object once, then call its methods.
tmp2 = r'\d{3}-\d{8}'
str2 = '090-99999999'
re_tmp2 = re.compile(tmp2)
res2 = re_tmp2.match(str2)
print(res2)
# <_sre.SRE_Match object; span=(0, 12), match='090-99999999'>

例子

# Check that a string is exactly 11 characters long, starts with the digit 1
# and consists of digits only.

r = r'^1\d{10}$'
# '^1'      -> starts with 1
# '\d{10}$' -> followed by exactly ten digits up to the end

print(re.findall(r, '12534568545'))
# ['12534568545']


# Three digits, a '-', then eight digits.

r = r'^\d{3}-\d{8}$'


# 123456@qq.com
# '^', '.' and '$' are regex metacharacters; to match one of them as an
# ordinary character it has to be escaped with a backslash (hence '\.').
# The trailing '$' rejects input with extra text after ".com".

r = r'^\d{6}@qq\.com$'
print(re.findall(r, '125565@qq.com'))
# ['125565@qq.com']

正则表达式

match search findall finditer

各种匹配符号

特殊元字符

分组

正则与字符串

compile

例子

rottengeek

引用和评论

机器学习汇总

如何减少跨团队交付摩擦？——基于 DevOps 与敏捷的最佳实践

科学计算编程涉及到的技术栈简介

使用 chardet 判断文件编码需要注意的坑——过大的文件会导致高耗时

Python3 格式化时间（qbit）

本地使用PaddleOCR进行图片识别获得文字（返回JSON）

manus 的替代品有哪些？使用LLM大模型技术做手机/网页/浏览器自动化操作技术汇总