1.匹配标签

 1 import re
 2 ret = re.search('<(?P<tag_name>\w+)>\w+</(?P=tag_name)>','<h1>hello</h1>')
 3 #还可以在分组中利用?P<name>的形式给分组起名字
 4 #获取的匹配结果可以直接用group('名字')拿到对应的值
 5 print(ret.group())
 6 print(ret.group('tag_name'))
 7 
 8 
 9 # -------------------------
10 ret = re.search(r'<(\w+)>\w+</\1>','<h1>hello</h1>')
11 #如果不给分组起名字,也可以用\序号来找到对应的组,表说要找的内容和前面的组的内容一致
12 #获取的匹配结果可以直接用group(序号)拿到对应的值
13 print(ret.group())
14 print(ret.group(1))
匹配标签

2.匹配整数

 1 import re
 2 ret = re.findall(r'\d+',"1-2*(60+(-40.35/5)-(-4*3))")
 3 print(ret)
 4 
 5 import re
 6 ret = re.findall(r'-?\d+\.\d*|(-?\d+)',"1-2*(60+(-40.35/5)-(-4*3))")
 7 print(ret)
 8 
 9 ret.remove('')
10 print(ret)
匹配整数

3.数字匹配

 1 # 3.数字匹配
 2 # 1.匹配一段文本中的每行的时间字符串,比如’1990-07-12‘
 3 
 4 # 1.分别取出一年的十二个月:
 5 while True:
 6     cmd = input('请输入月份:>>')
 7     if re.match(r'^(0?[1-9]|1[0-2])$',cmd):
 8         print('格式正确')
 9     else:
10         print('格式错误')
11 
12 # 2.
13 ret =re.match(r'^(0?[1-9]|1[0-2])$','11')
14 print(ret.group())
15 
16 # 一个月的31天
17 ret = re.match(r'^((0?[1-9])|((1|2)[0-9])|30|31)$','31')#从头匹配
18 print(ret.group())
19 
20 # 匹配QQ号
21 while True:
22     cmd = input('请输入你扣扣号:>>')
23     if re.match(r'^[1-9][0-9]{4,10}$',cmd):
24         print('输入正确')
25     else:
26         print('输入错误')
27 
28 # 浮点数
29 ret = re.match(r'-?\d+\.?\d*','21.5')
30 print(ret.group())
数字匹配

4.爬虫练习

 1 import requests
 2 
 3 import re
 4 import json
 5 
 6 def getPage(url):
 7 
 8     response=requests.get(url)
 9     return response.text
10 
# Raw string literals keep \d a regex escape rather than an invalid string
# escape. re.S lets ".*?" run across line breaks between the captured fields.
# The four named groups become the keys of each yielded record.
_ITEM_PATTERN = re.compile(
    r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
    r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>',
    re.S,
)


def parsePage(s):
    """Yield one record per item block found in the HTML text *s*.

    Args:
        s: HTML source to scan.

    Yields:
        A dict with the string values captured for the keys ``id``,
        ``title``, ``rating_num`` and ``comment_num``. Nothing is yielded
        when *s* contains no matching block.
    """
    for match in _ITEM_PATTERN.finditer(s):
        # groupdict() maps each named group to its captured text, in the
        # order the groups appear in the pattern.
        yield match.groupdict()
24 
def main(num):
    """Download one listing page and append its records to ``move_info7``.

    Each record produced by ``parsePage`` is printed and written to the file
    as one JSON document per line.

    Args:
        num: Value substituted into the ``start`` query parameter of the URL.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)

    # The context manager closes the file even when parsing or writing fails;
    # previously the handle was opened on every call and never closed.
    with open("move_info7", "a", encoding="utf8") as f:
        for obj in parsePage(response_html):
            print(obj)
            # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
            data = json.dumps(obj, ensure_ascii=False)
            f.write(data + "\n")
37 
if __name__ == '__main__':
    # Ten requests with the start offset stepping by 25: 0, 25, ..., 225.
    for start in range(0, 250, 25):
        main(start)
View Code

 

posted on 2017-08-09 16:48  海燕。  阅读(706)  评论(0)  编辑  收藏  举报