import requests
import re
import csv
# Scraper for the Douban Top 250 movie list.
#
# The list is served as 10 pages of 25 movies each; the ``start`` query
# parameter is the zero-based offset of the first movie on the page
# (0, 25, ..., 225), not a page number.
PAGE_URL = 'https://movie.douban.com/top250?start={start}&filter='
PAGE_SIZE = 25
TOTAL_MOVIES = 250

# Without a timeout, requests waits forever on a stalled connection.
REQUEST_TIMEOUT = 10  # seconds

# Browser-like User-Agent sent with every request.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'}

OUTPUT_PATH = 'date.csv'

# Lazy (non-greedy) matching pulls the title, release year and rating out of
# each <li> entry. Compiled once here instead of once per page.
# NOTE(review): the ``year`` group ends at the first space after <br>; if the
# live markup has whitespace or an entity right after <br>, the year will come
# out empty -- confirm against a real page before relying on that column.
MOVIE_PATTERN = re.compile(
    r'<li>.*?<div class="item">.*? <span class="title">(?P<name>.*?)</span>'
    r'.*?<br>(?P<year>.*?) '
    r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>',
    re.S,
)


def fetch_page(start):
    """Return the HTML source of the list page that begins at offset *start*.

    Raises whatever ``requests.get`` raises (connection errors, timeouts).
    The HTTP status is not checked; a page that does not contain the expected
    markup simply produces no rows when parsed.
    """
    resp = requests.get(
        url=PAGE_URL.format(start=start),
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    try:
        return resp.text
    finally:
        # Release the connection even if decoding the body fails.
        resp.close()


def parse_movies(page_content):
    """Extract ``[name, year, score]`` rows from one page of HTML.

    Returns a list with one row per match of ``MOVIE_PATTERN``, in document
    order; the year has surrounding whitespace stripped. Returns an empty
    list when nothing matches.
    """
    return [
        [m.group('name'), m.group('year').strip(), m.group('score')]
        for m in MOVIE_PATTERN.finditer(page_content)
    ]


def main():
    """Scrape every page and append the rows to ``OUTPUT_PATH``."""
    # Open the file once for the whole run; ``with`` guarantees it is closed
    # (and flushed) even if a request fails midway. The explicit encoding
    # keeps Chinese titles from depending on the platform default.
    with open(OUTPUT_PATH, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
            writer.writerows(parse_movies(fetch_page(start)))
            print('over!!')  # printed after each page has been scraped
    print('爬取完毕!~~')


if __name__ == '__main__':
    main()