python模块--re
就其本质而言,正则表达式是一种小型的、高度专业化的编程语言,它内嵌在Python中,并通过 re 模块实现。正则表达式模式会被编译成一系列的字节码,然后由用 C 编写的匹配引擎执行。
字符匹配(普通字符,元字符):
普通字符:大多数字符和字母都会和自身匹配
元字符:. ^ $ * + ? { } [ ] | ( ) \
一 元字符
1.1 . ^ $
import re

# One shared subject string, probed with three variations of the pattern "a.".
sample = 'abcdefgaab'
res1 = re.findall('a.', sample)   # ['ab', 'aa']  "a" followed by any one character
res2 = re.findall('^a.', sample)  # ['ab']        same, anchored at the start of the string
res3 = re.findall('a.$', sample)  # ['ab']        same, anchored at the end of the string
1.2 * + { } ?
import re

text = 'abcdefgaab'
res4 = re.findall('a.*', text)      # ['abcdefgaab']   "*": zero or more repetitions
res5 = re.findall('a.+', text)      # ['abcdefgaab']   "+": one or more repetitions
res6 = re.findall('a.{1,2}', text)  # ['abc', 'aab']   "{1,2}": one or two repetitions

# Match a number, either an integer or a float.
# The pattern is a raw string so that "\d" and "\." reach the regex engine
# unchanged instead of being treated as (invalid) string escape sequences.
res7 = re.findall(r'\d+\.?\d*', '12.34,12,0.06,100')  # ['12.34', '12', '0.06', '100']
注意:前面的*,+,{},?等都是贪婪匹配,也就是尽可能多地匹配;在它们后面加上?号可使其变成惰性(非贪婪)匹配,后面会详细介绍贪婪匹配
import re

# A trailing "?" makes each quantifier lazy: it stops at the shortest match.
subject = 'abcdefgaab'
lazy_patterns = ('a.*?', 'a.+?', 'a.{1,2}?')
# Expected results, in order:
#   'a.*?'     -> ['a', 'a', 'a']
#   'a.+?'     -> ['ab', 'aa']
#   'a.{1,2}?' -> ['ab', 'aa']
res8, res9, res6 = (re.findall(pattern, subject) for pattern in lazy_patterns)
1.3 转义符 \
1. 反斜杠后边跟元字符去除特殊功能,比如\.
2. 反斜杠后边跟普通字符实现特殊功能,比如\d
# \b matches at a word boundary without consuming any character.
res10 = re.findall(r'I\b', 'I am joe')
print(res10)  # ['I']

# How to match the literal text c\d inside abc\de.
# The subject is written as a raw string (r = raw: backslashes are kept
# literally), because "\d" in a normal literal is an invalid escape sequence.
subject = r'abc\de'

# Raw pattern: the regex engine receives c\\d, i.e. "c", an escaped backslash, "d".
res11 = re.findall(r'c\\d', subject)  # ['c\\d']

# Non-raw pattern: each pair "\\" is first reduced to one backslash by the
# string literal, leaving c\\d for the regex engine, which then reads the
# remaining pair as one literal backslash.
res12 = re.findall('c\\\\d', subject)  # ['c\\d']
1.4 分组
res13 = re.findall(r'(ad)+', 'add')  # ['ad']

# When the pattern has a capturing group, findall returns the group's text,
# and a repeated group keeps only its last repetition.
res14 = re.findall(r'(ad)+joe', 'adadjoe')   # ['ad']
res15 = re.findall(r'(\d)+joe', 'ad123joe')  # ['3']

# "(?:...)" groups without capturing, so findall returns the whole match.
res16 = re.findall(r'(?:ad)+joe', 'adadjoe')   # ['adadjoe']
res17 = re.findall(r'(?:\d)+joe', 'ad123joe')  # ['123joe']

# Named groups: "(?P<name>...)" makes a group retrievable by name.
# Raw string so "\d" and "\w" are not treated as string escape sequences.
ret = re.search(r'(?P<id>\d{3})/(?P<name>\w{3})', '123/com')
print(ret.group())        # 123/com
print(ret.group('id'))    # 123
print(ret.group('name'))  # com
1.5 |
# "|" is alternation: the match is whichever branch succeeds first.
# Raw strings keep "\d" from being read as a string escape sequence.
ret = re.search(r'(ad)|\d', 'ad123joe')
print(ret.group())  # ad

# "ade" does not occur in the subject, so the digit branch produces the match.
ret = re.search(r'(ade)|\d', 'ad123joe')
print(ret.group())  # 1
1.6 字符集[]
res18 = re.findall('a[bc]d', 'abdacd')  # ['abd', 'acd']
res19 = re.findall('[a-z]', 'abdacd')   # ['a', 'b', 'd', 'a', 'c', 'd']
# Inside a set, ". * +" lose their special meaning and match themselves.
res20 = re.findall('[.*+]', 'a.bc+')    # ['.', '+']

# Characters that keep a special meaning inside a set: - ^ \
res21 = re.findall('[1-9]', '12abc3')   # ['1', '2', '3']        "-" builds a range
res22 = re.findall('[^ab]', '12abc3')   # ['1', '2', 'c', '3']   leading "^" negates the set
# Raw string so "\d" is not treated as a string escape sequence.
res23 = re.findall(r'[\d]', '12abc3')   # ['1', '2', '3']
二 贪婪匹配
贪婪匹配:在满足匹配时,匹配尽可能长的字符串,默认情况下,采用贪婪匹配
subject = 'abcabc'

# Greedy: ".*" takes as much as possible, so the match runs to the last "c".
ret = re.findall('a.*c', subject)  # ['abcabc']
print(ret)

# Non-greedy: ".*?" takes as little as possible, so each match stops at the nearest "c".
ret = re.findall('a.*?c', subject)  # ['abc', 'abc']
print(ret)
几组常用的非贪婪匹配组合
.*?的用法
# Reading ".*?" piece by piece:
#   .  any character
#   *  zero up to unlimited repetitions
#   ?  switches the preceding quantifier to non-greedy mode
# Together: as few arbitrary characters as possible. It is rarely written on
# its own; the usual form is ".*?x", meaning "everything up to the next x".
lazy_until_c = '.*?c'
ret = re.findall(lazy_until_c, 'abbbcabc')  # ['abbbc', 'abc']
三 常用方法
3.1 findall(pattern, string, flags=0)
参数说明:
- 正则表达式
- 要匹配的字符串
- 标志位,用于控制正则表达式的匹配方式
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments
返回所有满足匹配条件的结果,放在列表里,该方法上面已使用
注意:findall的分组优先机制(模式中含有捕获分组时,findall只返回分组匹配到的内容)
import re

# The dots are escaped so they match a literal "." rather than any character.
# With a capturing group in the pattern, findall returns the group's content
# instead of the whole match.
ret = re.findall(r'www\.(baidu|qq)\.com', 'www.baidu.com')
print(ret)  # ['baidu']

# Making the group non-capturing with "(?:...)" yields the whole match.
ret = re.findall(r'www\.(?:baidu|qq)\.com', 'www.baidu.com')
print(ret)  # ['www.baidu.com']
3.2 search(pattern, string, flags=0)
在字符串内查找模式匹配,找到第一个匹配然后返回一个包含匹配信息的对象,该对象可以通过调用group()方法得到匹配的字符串,如果字符串没有匹配,则返回None
import re

# search scans the string and returns a match object for the first match,
# or None when nothing matches, hence the truthiness check before group().
obj = re.search(r'\d+', 'abc123')
if obj:
    print(obj.group())  # 123
3.3 match(pattern, string, flags=0)
同search,不过是从字符串的起始位置开始按模式进行匹配,只返回一个匹配对象;若起始位置不匹配,则返回None
import re

# match only tries the pattern at the start of the string; it returns None
# when the start does not match, hence the truthiness check before group().
obj = re.match(r'\d+', '123abc')
if obj:
    print(obj.group())  # 123
3.4 sub(pattern, repl, string, count=0, flags=0)
用于替换匹配的字符串
old_data = "123abc456"

# Replace every run of digits.
new_data = re.sub(r'\d+', 'joe', old_data)
print(new_data)  # joeabcjoe

# count limits the number of replacements. It is passed by keyword because
# passing it positionally is deprecated in recent Python versions.
new_data = re.sub(r'\d+', 'joe', old_data, count=1)
print(new_data)  # joeabc456
3.5 subn(pattern, repl, string, count=0, flags=0):
与sub类似,返回一个包含(新字符串, 替换次数)的2元组
old_data = "123abc456"
# subn returns a 2-tuple: (new string, number of substitutions made).
ret = re.subn(r'\d+', 'joe', old_data)
print(ret)  # ('joeabcjoe', 2)
3.6 split(pattern, string, maxsplit=0, flags=0)
根据指定的模式对字符串进行分割
old_data = "1*2*(3+4)*(9-3)"

# "\*" matches a literal asterisk; the raw string keeps the backslash intact.
new_data = re.split(r'\*', old_data)
print(new_data)  # ['1', '2', '(3+4)', '(9-3)']

# maxsplit caps the number of splits; the remainder stays in the last item.
# It is passed by keyword because passing it positionally is deprecated in
# recent Python versions.
new_data = re.split(r'\*', old_data, maxsplit=2)
print(new_data)  # ['1', '2', '(3+4)*(9-3)']

# A character set splits at every "a" or "c". The subject starts with a
# delimiter, so the first item of the result is an empty string.
ret = re.split('[ac]', 'abcd')
print(ret)  # ['', 'b', 'd']
注:split的分组优先机制(模式中含有捕获分组时,分隔符本身也会保留在结果列表中)
# Without a capturing group the delimiters are dropped.
ret = re.split(r"\d+", "abc12def34g")
print(ret)  # ['abc', 'def', 'g']

# With a capturing group the text matched by the group is kept in the result.
ret = re.split(r"(\d+)", "abc12def34g")
print(ret)  # ['abc', '12', 'def', '34', 'g']
3.7 finditer(pattern, string, flags=0)
返回字符串中所有非重叠匹配的迭代器。对于每个匹配,迭代器返回一个匹配对象。
# finditer yields one match object per non-overlapping match, lazily.
ret = re.finditer(r'\d+', 'abc123def456g')
print(ret)  # <callable_iterator object at 0x...>

# Each next() call advances to the following match.
first_match = next(ret)
print(first_match.group())  # 123
second_match = next(ret)
print(second_match.group())  # 456
3.8 compile(pattern, flags=0)
编译一个正则表达式模式,返回一个模式对象。
# Compile the pattern once; the resulting pattern object offers the same
# search/match/findall operations as the module-level functions.
obj = re.compile(r'\d{3}')
ret = obj.search('abc123defg')
print(ret.group())  # 123
3.9 group&groups
a = "123abc456"

# Run the search once and reuse the match object for every lookup.
matched = re.search("([0-9]*)([a-z]*)([0-9]*)", a)

print(matched.group())  # 123abc456 (no argument means group 0, the whole match)
# group(0) is the whole match; group(1)..group(3) are the three
# parenthesised groups: 123abc456, 123, abc, 456.
for index in range(4):
    print(matched.group(index))
print(matched.groups())  # ('123', 'abc', '456')
四 练习
4.1 计算式中的整数匹配
import json
import re

expression = "3-2*(12.2+(-30.15/5)-(-5*6))"

# Approach 1: use a capturing group.
# Floats are consumed by the first alternative, which has no group, so findall
# reports '' for them; integers are captured by the group in the second
# alternative. Dropping the empty strings leaves only the integers.
ret = [number for number in re.findall(r"-?\d+\.\d*|(-?\d+)", expression) if number]
print(ret)  # ['3', '-2', '5', '-5', '6']
integers_by_group = ret  # keep this result; `ret` is rebound below

# Approach 2: use sub.
# Collect every number, serialise the list to JSON text, blank out the floats
# inside that text, parse it back, and drop the resulting empty strings.
ret = re.findall(r"-?\d+\.?\d*", expression)
ret = re.sub(r"-?\d+\.\d*", "", json.dumps(ret))
ret = [number for number in json.loads(ret) if number]
print(ret)  # ['3', '-2', '5', '-5', '6']
integers_by_sub = ret
4.2 标签匹配
import re

html = "<h1>hello</h1>"

# "(?P<tag_name>...)" names the opening tag and "(?P=tag_name)" requires the
# closing tag to repeat the same text. Raw strings keep "\w" and "\1" from
# being interpreted as string escape sequences.
named_pattern = r"<(?P<tag_name>\w+)>\w+</(?P=tag_name)>"

# findall returns the content of the capturing group, i.e. the tag name.
tag_names = re.findall(named_pattern, html)
print(tag_names)  # ['h1']

# search(...).group() returns the whole match.
named_match = re.search(named_pattern, html).group()
print(named_match)  # <h1>hello</h1>

# The same back-reference by group number: "\1" refers to the first group.
numbered_match = re.search(r"<(\w+)>\w+</\1>", html).group()
print(numbered_match)  # <h1>hello</h1>
4.3 爬虫
import json
import re

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script forever.
_REQUEST_TIMEOUT = 10

# One movie entry per match. re.S lets "." span newlines, because the fields
# of an entry are spread over several lines of HTML. Compiled once at import
# time and reused for every page.
_MOVIE_PATTERN = re.compile(
    r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
    r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>',
    re.S,
)


def getPage(url):
    """Download *url* and return the response body as text."""
    # Imported here rather than at module level so that the parsing helpers
    # stay importable when the third-party `requests` package is missing.
    import requests

    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    return response.text


def parsePage(response_html):
    """Yield one dict per movie entry found in *response_html*.

    Each dict has the string-valued keys "id", "title", "rating_num" and
    "comment_num", taken from the named groups of the movie pattern.
    Nothing is yielded when the text contains no entry.
    """
    for match in _MOVIE_PATTERN.finditer(response_html):
        yield {
            "id": match.group("id"),
            "title": match.group("title"),
            "rating_num": match.group("rating_num"),
            "comment_num": match.group("comment_num"),
        }


def main(num):
    """Fetch the list page starting at offset *num* and append its entries.

    Every entry is printed and appended to the file "move_info7" as one JSON
    document per line.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)
    # The with-block guarantees the file is flushed and closed, even if
    # parsing or writing raises part-way through.
    with open("move_info7", "a", encoding="utf8") as f:
        for obj in parsePage(response_html):
            print(obj)
            # ensure_ascii=False keeps non-ASCII titles readable in the file.
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")


if __name__ == '__main__':
    # Ten pages in steps of 25: start offsets 0, 25, ..., 225.
    for start_page in range(0, 250, 25):
        main(start_page)

![image_thumb[4] image_thumb[4]](https://images2018.cnblogs.com/blog/1275320/201805/1275320-20180521185640969-1985435681.png)
![image_thumb[1] image_thumb[1]](https://images2018.cnblogs.com/blog/1275320/201805/1275320-20180521185641714-1633391357.png)
![image_thumb[9] image_thumb[9]](https://images2018.cnblogs.com/blog/1275320/201805/1275320-20180521185642356-985491240.png)
浙公网安备 33010602011771号