python正则基础入门篇
正则表达式(Regular Expression,简称 RegEx)通常用于网页爬虫、文稿处理、数据筛选等。
--- 首先引入一个例子
#import module 导入模块 import re #matching string 预定义两个变量跟一个字符串 pattern1 = "cat" pattern2 = "bird" string = "dog runs o cat" # export result 用字符串去匹配这两个字符,打印结果 print(pattern1 in string) print(pattern2 in string) # output True False
---引入正则后
import re #regEx search 正则查找匹配 parttern1 = "Cat" parttern2 = "bird" string = "dog runs cat" #这里I代表不区分大小写 print(re.search(parttern1,string,re.I)) print(re.search(parttern2,string,re.I)) #输出结果显示在索引9-12查到了一个对象:“cat” <re.Match object; span=(9, 12), match='cat'> #bird不在其中,返回None None
---匹配多种可能的情况-基础
#multiple patterns ( run or ran ) 多种情况匹配 #先写规则 pr = r'r[au]n' string = "dog runs cat" #匹配run的过去分词(run)或过去式(ran)两种情况 print(re.search(pr,string)) #可以看出‘[]’是一个占位符,对当前位置进行多内容匹配 <re.Match object; span=(4, 7), match='run'>
---匹配多种可能情况-中阶
#上面我们引入了“[]”,其作用很明显,下面我们考虑更多种情况,如纯英文大小写、数字 #全小写英文字母匹配 print(re.search(r'r[a-z]n',string)) #全数字匹配 print(re.search(r'r[0-9]n',string)) #全大写英文字母匹配 print(re.search(r'r[A-Z]n',string)) #字母数字混合匹配 print(re.search(r'r[0-9A-Za-z]n',string)) #export that <re.Match object; span=(4, 7), match='run'> None None <re.Match object; span=(4, 7), match='run'>
---特殊匹配方式
#为了简化输入,引入了特殊匹配,如0-9用\d,字母、数字和下划线用\w #全数字匹配 print(re.search(r'r\dn',string)) #字母、数字、下划线匹配 print(re.search(r'r\wn',string)) #output None <re.Match object; span=(4, 7), match='run'>
---除了字母数字外,平时我们还会用到匹配“&”、“#”、“%”、“@”
#匹配除字母、数字、下划线、汉字外的字符 string = "dog r#ns cat" print(re.search(r'r\Wn',string)) <re.Match object; span=(4, 7), match='r#n'> #类似还有\D匹配除数字外字符,所以根据需求使用恰当命令
---匹配空白字符,比如回车换行(\r\n)、空格(" ")、制表符(\t)等
#这里简单示范下 string = "dog r ns cat" print(re.search(r'r\sn',string)) <re.Match object; span=(4, 7), match='r n'>
---匹配带有“\” 及 除 “\r\n”外的字符
string = "dog run\s cat" #匹配"\"必须用"\\"去匹配,否则报错 print(re.search(r'run\\',string))
string = "dog runs cat"
#用“.”来匹配除换行以外字符
print(re.search(r'r.n',string)) <re.Match object; span=(4, 7), match='run'>
---匹配句首、尾
#首尾匹配 string = "dog runs cat" print(re.search(r'^d',string)) print(re.search(r'cat$',string)) <re.Match object; span=(0, 1), match='d'> <re.Match object; span=(9, 12), match='cat'>
---匹配1次或0次
#匹配0次或1次 print(re.search(r'Mon(day)?','Monday')) print(re.search(r'Mon(day)?','Mon')) <re.Match object; span=(0, 6), match='Monday'> <re.Match object; span=(0, 3), match='Mon'>
---匹配多行,一般情况下直接匹配"^I"会返回None,需指定一个参数,确立为句首
#multi-line string =''' dog runs cat. I run dog. ''' print(re.search(r'^I',string)) print(re.search(r'^I',string,flags=re.MULTILINE)) None <re.Match object; span=(16, 17), match='I'>
---匹配0次或1次
#occur 0 or 1 print(re.search(r'ru?','ru')) print(re.search(r'run?','run')) print(re.search(r'run?','runnnnnn')) <re.Match object; span=(0, 2), match='ru'> <re.Match object; span=(0, 3), match='run'> <re.Match object; span=(0, 3), match='run'>
---匹配1次或多次
#occur 1 more times print(re.search(r'ru+','ru')) print(re.search(r'run+','run')) print(re.search(r'run+','runnnnnn')) <re.Match object; span=(0, 2), match='ru'> <re.Match object; span=(0, 3), match='run'> <re.Match object; span=(0, 8), match='runnnnnn'>
---匹配0次或多次
# occur 0 more times print(re.search(r'run*','ru')) print(re.search(r'run*','run')) print(re.search(r'run*','runnnsdbbbbnnnnnnnnn')) <re.Match object; span=(0, 2), match='ru'> <re.Match object; span=(0, 3), match='run'> <re.Match object; span=(0, 5), match='runnn'>
---以上可以看出,次数这里,“?” 范围最小,其次“+”,最后“*”
---匹配前置字符单次或可选次数范围
# occur n times print(re.search(r'run{1}','ru')) print(re.search(r'run{1}','run')) print(re.search(r'run{2}','runnn'))
#output None <re.Match object; span=(0, 3), match='run'> <re.Match object; span=(0, 4), match='runn'> # {n,m} occur n to m times print(re.search(r'run{0,1}','ru')) print(re.search(r'run{1,10}','run'))
#output <re.Match object; span=(0, 2), match='ru'> <re.Match object; span=(0, 3), match='run'>
---分组匹配
# group 一个“()”代表一个组 match = re.search(r'(\d+), Date: (.+)' ,'ID: 123456, Date: Feb/08/2020') #输出全部分组匹配结果 print(match.group()) #输出第一组 print(match.group(1)) #输出第二组 print(match.group(2))
#output 123456, Date: Feb/08/2020 123456 Feb/08/2020
match = re.search(r'(\d+), Date: (.+)' ,'ID: 123456, Date: Feb/08/2020') #如果实际应用场景中分组太多,用group(1,2,..)无法精确匹配 #引入选择器 :(?P<自定义名称>) match = re.search(r'(?P<id>\d+), Date: (?P<date>.+)' ,'ID: 123456, Date: Feb/08/2020') print(match.group('id')) print(match.group('date'))
---一次性匹配所有规则
#findall print(re.findall(r'r[ua]n','run ran ren')) # | or print(re.findall(r'run|ran','run ran ren')) ['run', 'ran'] ['run', 'ran']
---替换,这里用到sub函数,配合正则简直万能
#replace print(re.sub(r'r[ua]n','e','dog runs cat')) #将匹配到的结果替换为'e' #output dog es cat
---分裂/分割
#split print(re.split(r'[;,.,:]','a;b.c:e')) ['a', 'b', 'c', 'e']
---其他一些用法
#将匹配规则设为变量,从而实现无限次利用 compile comp = re.compile(r'r[ua]n') print(comp.search('dog runs cat')) print(comp.search('tiger runs wolf')) print(comp.search('wolf runs sheep')) print(comp.search('cat runs mouse'))
---这里添加一些自己学习总结的东西
#正则中的优先量词如:"{m}"、"{m,n}"、"*"、"?"、"+"这些都是属于优先量词 #贪婪与非贪婪的本质区别其实是加不加"?":在量词后加"?",原本的优先匹配(贪婪)量词即改为非贪婪量词 #假如匹配test1 htst = '<p>段一</p><div>test1</div></br><p>段二</p><div>test2</div>' print(re.search(r'<div>.*</div>',htst)) #得到结果,发现把test2也输出了,也就是贪婪模式 <re.Match object; span=(9, 55), match='<div>test1</div></br><p>段二</p><div>test2</div>'> #使用非贪婪模式,获得精确结果 print(re.search(r'<div>.*?</div>',htst)) <re.Match object; span=(9, 25), match='<div>test1</div>'>

浙公网安备 33010602011771号