1 #!/usr/bin/env python
2 # -*- coding:utf-8 -*-
3 # Author:woshinidaye
4
5 import re
6 import csv
7 # #findall 匹配字符串中所有符合正则表达式的内容;
8 # list = re.findall(r'\d+','我的电话号码是133123123123,女朋友的电话是45678') #返回的是列表,列表效率很低
9 # print(list)
10
11
12 # #re.finditer() #匹配字符串中所有符合正则表达式的内容,返回的是迭代器,这个效率很OK;从迭代器中拿到内容需要.group()
13 # list2 = re.finditer(r'\d+','我的电话号码是133123123123,女朋友的电话是45678')
14 # #print(list2,type(list2)) #<callable_iterator object at 0x00000187F8921940> <class 'callable_iterator'>
15 # for i in list2:
16 # print(i.group())
17
18 # #re.search 返回match对象,需要.group一下,search全文检索,检索到一个就结束程序
19 # sear = re.search(r'\d+','我的电话号码是133123123123,女朋友的电话是45678')
20 # print(sear.group())
21
# #re.match是从头开始匹配,可以理解为r'^\d+'
23 # mat = re.match(r'\d+','我的电话号码是133123123123,女朋友的电话是45678')
24 # print(mat)
25
26 #预加载正则表达式,正则很长
27 # obj = re.compile(r'\d+')
28 # # obj.findall()
29 # # obj.search()
30 # test = obj.finditer('我的电话号码是133123123123,女朋友的电话是45678')
31 # for i in test:
32 # print(i.group())
33 # # obj.match()
34
35
36 # (?P<id>正则表达式) 可以从正则表达式中匹配到的内容中,进一步提取内容,
37
# Sample HTML-like text for the named-group demo below: each line holds a
# category div with a numeric span id and a name, so (?P<id>...) / (?P<name>...)
# can pull both fields out of every <div> entry.
ss = (
    "\n"
    "<div class='歌手'><span id='1'>林俊杰</div>\n"
    "<div class='演员'><span id='2'>不知道</div>\n"
    "<div class='话剧'><span id='3'>乱说</div>\n"
    "<div class='小品'><span id='4'>啥也不说</div>\n"
)
44 # ss本身就是一串有规律的字符串,想把id和name拿出来比较困难,所以,需要用正则表达式匹配成一个一个的元素,
45 # aa = re.compile(r"<div class='.*?'><span id='(?P<id>\d+)'>(?P<name>.*?)</div>",re.S) #flags=re.S 让.匹配换行符
46 # # print(type(aa))
47 # bb = aa.finditer(ss)
48 # for it in bb:
49 # print(it.group('id'),it.group("name"))
50
51
52
53 # 实战示例1
54 # 请求一下豆瓣电影top250
55 #查看页面页数的加载方式:发现是 服务器渲染;
56 #拿到页面源代码;
57 #通过re提取有效信息
58 # import re,requests
59 # for n in range(0,250,25):
60 # # url = 'https://movie.douban.com/top250?start=%s&filter='%n
61 # # url = 'https://movie.douban.com/top250?start={0}&filter='.format(n)
62 # url = f'https://movie.douban.com/top250?start={n}&filter='
63 # # print(url)
64 # header = {
65 # "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
66 # }
67 #
68 # result = requests.get(url,headers=header)
69 # page = result.text
70 # # with open('1114','w+',encoding='utf-8') as f : #获取页面源代码
71 # # f.write(result.text)
72 # # print(result.text)
73 #
74 # # 解析数据
75 #
76 # obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<title>.*?)</span>'
77 # r'.*?<p class="">(?P<actor>.*?)<br>.*?</span>.*?property="v:averag'
78 # r'e">(?P<score>.*?)</span>',re.S)
79 # enenn = obj.finditer(page)
80 # f = open('top250.csv','a+',encoding='utf-8',newline='')
81 # csvwriter = csv.writer(f)
82 # for i in enenn:
83 # # print(i.group('title'))
84 # # print(i.group('actor').strip())
85 # dic = i.groupdict()
86 # dic['title'] = dic['title'].strip()
87 # dic['actor'] = dic['actor'].strip()
88 # # print(dic)
89 # csvwriter.writerow(dic.values())
90 # f.close()
91 # print('=====done===')