正则表达式
re功能函数
-
findall,获取匹配到的所有数据
# findall: return every non-overlapping match of the pattern as a list of strings.
import re

text = "dsf130429191912015219k13042919591219521Xkk"
# Raw string so the backslashes reach the regex engine untouched instead of
# being parsed as (invalid) Python string escapes.
# The pattern is 17 digits followed by one final digit or 'X' (18 characters).
res = re.findall(r'\d{6}\d{4}\d{2}\d{2}\d{3}[\dX]', text)
print(res)
# ['130429191912015219', '13042919591219521X']
match,从起始位置开始匹配,匹配成功返回一个对象,未匹配成功返回None
# match: only tries to match at the very start of the string; returns a
# Match object on success and None otherwise.
import re

# The pattern occurs in the text, but not at position 0, so the result is None.
text = "大小逗2B最逗3B欢乐"
res = re.match(r'逗\dB', text)
print(res)
# None

# Here the text starts with the pattern, so a Match object is returned.
text = "逗2B最逗3B欢乐"
res = re.match(r'逗\dB', text)
print(res)          # the Match object itself
print(res.group())  # the matched substring
# <re.Match object; span=(0, 3), match='逗2B'>
# 逗2B
search,浏览整个字符串去匹配第一个,未匹配成功返回None
# search: scan the whole string and return a Match object for the first
# match, or None when nothing matches.
import re

text = "大小逗2B最逗3B欢乐"
# Raw string keeps \d intact for the regex engine.
res = re.search(r"逗\dB", text)
print(res)
print(res.group())
# <re.Match object; span=(2, 5), match='逗2B'>
# 逗2B
sub,替换匹配成功的位置
# sub: replace the parts of the string that match the pattern.
import re

text = "逗2B最逗3B欢乐逗5B"
res = re.sub(r"逗\dB", "alex", text)
print(res)
# alex最alex欢乐alex

# count limits the replacement to the first N matches (here the first two).
# It is passed by keyword: positional use is deprecated since Python 3.13.
text = "逗2B最逗3B欢乐逗5B"
res = re.sub(r"逗\dB", "john", text, count=2)
print(res)
# john最john欢乐逗5B
split,根据匹配成功的位置分割
# split: cut the string at every position where the pattern matches.
import re

text = "逗2B最逗3B欢乐"
res = re.split(r"\dB", text)
print(res)
# ['逗', '最逗', '欢乐']

# maxsplit limits the number of splits (here only the first match is used).
# It is passed by keyword: positional use is deprecated since Python 3.13.
text = "逗2B最逗3B欢乐"
res = re.split(r"\dB", text, maxsplit=1)
print(res)
# ['逗', '最逗3B欢乐']
finditer
# finditer: iterate over Match objects, one per non-overlapping match.
import re

text = "dsf130429191912015219k13042919591219521Xkk"
# Named groups (?P<name>...) label the year, month and day parts of each match.
res = re.finditer(r'\d{6}(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})[\dX]', text)
for item in res:
    print(item.group())  # the whole matched substring
# 130429191912015
# 130429195912195

# The iterator above is consumed by the loop, so the search is run again.
text = "dsf130429191912015219k13042919591219521Xkk"
res = re.finditer(r'\d{6}(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})[\dX]', text)
for item in res:
    print(item.groupdict())  # named groups as a dict
# {'year': '1919', 'month': '12', 'day': '01'}
# {'year': '1959', 'month': '12', 'day': '19'}
3.2 正则表达式相关
1. 字符相关
-
alex匹配文本中的aleximport re text = '你好alex,阿斯顿发alexhaha 阿士大夫能接受的alexff' res = re.findall("alex", text) print(res) # ['alex', 'alex', 'alex'] -
[abc]匹配a或b或c 字符。import re text = '你好alex,阿斯顿发alexhaha 阿士大夫能接受的alexff' res = re.findall('[aex]', text) print(res) # ['a', 'e', 'x', 'a', 'e', 'x', 'a', 'a', 'a', 'e', 'x']import re text = '你好alex,阿斯顿发alexhaha 阿士大夫能接受的alexab' res = re.findall('a[lb]', text) print(res) # ['al', 'al', 'al', 'ab'] -
[^abc]匹配除了abc以外的其他字符。import re res = re.findall('[^alexhb]', text) print(res) # ['你', '好', ',', '阿', '斯', '顿', '发', ' ', '阿', '士', '大', '夫', '能', '接', '受', '的'] -
[a-z]匹配a~z的任意字符( [0-9]也可以 )。import re text = "alexrootrootadmin" res = re.findall("[a-z]",text) print(res) # ['a', 'l', 'e', 'x', 'r', 'o', 'o', 't', 'r', 'o', 'o', 't', 'a', 'd', 'm', 'i', 'n'] -
.代指除换行符以外的任意字符。import re text = "alexraotrootadmin" res = re.findall('r.o', text) print(res) # ['rao', 'roo']import re text = "alexraotrootadmin" # . 表示匹配任何字符(除了换行符) + 表示至少匹配一次这个字符 res = re.findall('r.+o', text) print(res) # ['raotroo']import re text = "alexraotrootadmin" # . 表示匹配任何字符(除了换行符) + 表示至少匹配一次这个字符 ?是使量词 + 变为非贪婪 ,即匹配尽可能少的字符,而不是尽可能多。 res = re.findall('r.+?o', text) print(res) # # ['rao', 'roo'] -
\w代指字母或数字或下划线(汉字)。import re text = "北京johnalex齐北 京johnalex6" res = re.findall('(john\w+(x|6))',text) print(res) # [('johnalex', 'x'), ('johnalex6', '6')] -
\d代指数字import re text = "root-ad32min-add3-admd1in" res = re.findall("d\d", text) print(res) # ['d3', 'd3', 'd1']import re text = "root-ad32min-add3-admd1in" res = re.findall("d\d+", text) print(res) # ['d32', 'd3', 'd1'] -
\s代指任意的空白符,包括空格、制表符等。import re text = "root admin add admin" res = re.findall("a\w+\s\w+", text) print(res) # ['admin add']
2. 数量相关
-
*重复0次或更多次import re text = "他是大B个,确实是个大2B。" res = re.findall("大\d*B",text) print(res) # ['大B', '大2B'] -
+重复1次或更多次import re text = "他是大B个,确实是个大2B,大3B,大66666B。" res = re.findall("大\d+B",text) print(res) # ['大2B', '大3B', '大66666B'] -
?重复0次或1次import re text = "他是大B个,确实是个大2B,大3B,大66666B。" res = re.findall("大\d?B",text) print(res) # ['大B', '大2B', '大3B'] -
{n}重复n次import re text = "他是大B个,确实是个大2B,大3B,大66666B。" res = re.findall('大\d{5}B',text) print(res) # ['大66666B'] -
{n,}重复n次或更多次import re text = "他是大B个,确实是个大2B,大325B,大66666B。" res = re.findall('大\d{0,}B',text) print(res) # ['大B', '大2B', '大325B', '大66666B'] -
{n,m}重复n到m次import re text = "他是大B个,确实是个大2B,大325B,大66666B。" res = re.findall('大\d{3,5}B',text) print(res) # ['大325B', '大66666B']
3. 括号(分组)
-
提取数据区域
# Groups: parentheses select which parts of a match findall returns.
import re

# No group: findall returns the full matched text.
text = "楼主太牛逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来呀"
res = re.findall(r"151312\d{5}", text)
print(res)
# ['15131255789']

# Two groups: findall returns one tuple per match holding only the groups.
text = "楼主太牛逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来15131266666呀"
res = re.findall(r"1513(12)(\d{5})", text)
print(res)
# [('12', '55789'), ('12', '66666')]

# Nested groups: the outer group captures the whole number, the inner one
# captures its last five digits.
text = "楼主太牛逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来呀"
res = re.findall(r"(151312(\d{5}))", text)
print(res)
# [('15131255789', '55789')]
获取指定区域 + 或条件
# Groups combined with alternation (|) inside the group.
import re

text = "楼主15131root太牛15131alex逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来呀"
# Only "15131" followed by six digits matches here.
res = re.findall(r"(15131(\d{6}))", text)
print(res)
# [('15131255789', '255789')]

text = "楼主15131root太牛15131alex逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来呀"
# The inner group accepts either six digits or "r", word characters, then "太".
res = re.findall(r"(15131(\d{6}|r\w+太))", text)
print(res)
# [('15131root太', 'root太'), ('15131255789', '255789')]
4. 起始和结束
上述示例中都是去一段文本中提取数据,只要文本中存在即可。
但,如果要求用户输入的内容必须以指定的内容开头和结尾,就需要用到如下两个字符。
-
^开始 -
$结束import re text = "啊442662578@qq.com我靠" email_list = re.findall("^\w+@\w+.\w+$", text, re.ASCII) print(email_list) # []import re text = "442662578@qq.com" email_list = re.findall("^\w+@\w+\.\w+$", text, re.ASCII) print(email_list) # ['442662578@qq.com']这种一般用于对用户输入数据格式的校验比较多,例如:
# Validate user input: ^ and $ force the whole input to match the pattern.
import re

text = input("请输入邮箱:")
# The dot is escaped so it matches only a literal "."; unescaped it would
# match any character and let inputs such as "a@bXc" through.
# re.ASCII restricts \w to [a-zA-Z0-9_].
email = re.findall(r"^\w+@\w+\.\w+$", text, re.ASCII)
if not email:
    print("邮箱格式错误")
else:
    print(email)
5. 特殊字符
由于正则表达式中 * . \ { } ( ) 等都具有特殊的含义,所以如果想要在正则中匹配这种指定的字符,需要转义,例如:
# Wrong: unescaped, {5} is a quantifier ("你" repeated five times), so the
# literal text "你{5}哈哈" is not found.
import re

text = "我是你{5}哈哈"
res = re.findall("你{5}哈哈", text)
print(res)  # []

text = "我是你{5}哈哈"
# Use \{ and \} to match literal braces. The pattern is a raw string so the
# backslashes are passed to the regex engine unchanged.
res = re.findall(r"你\{5\}哈", text)
print(res)  # ['你{5}哈']

浙公网安备 33010602011771号