# -*- coding: utf-8 -*-
"""
@Time : 2022/3/18 15:53
@Author : Andrew
@File : 豆瓣top250.py
"""
import requests # 拿到页面源代码
import re # 提取有效信息
import csv
# Scrape the Douban Top 250 movie list and store, for every movie, its name,
# release year, score and number of ratings as one row of a CSV file.

# Output file. It is written as utf-8, and opened with newline="" so that the
# csv module does not produce extra blank lines when the file is opened in Excel.
OUTPUT_PATH = "data.csv"
CSV_HEADER = ("电影名字", "年份", "评分", "评价人数")

# The listing is paginated through the "start" query parameter.
BASE_URL = "https://movie.douban.com/top250"
PAGE_SIZE = 25
TOTAL_MOVIES = 250

# Seconds to wait for the server; without a timeout requests.get may block forever.
REQUEST_TIMEOUT = 10

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/77.0.3865.120 "
                  "Safari/537.36 Core/1.77.97.400 QQBrowser/10.9.4621.400 "
}

# How the pattern is built:
# 1. Anchor on enclosing parent tags (<li>, then <div class="item">) so that
#    each match is confined to a single movie entry.
# 2. Everything between two anchors is irrelevant and skipped with a lazy
#    ".*?"; re.S lets "." cross the line breaks between tags.
# 3. The values of interest are captured in named groups: movieName, year,
#    score and number.
# 4. Do not put extra spaces inside the pattern, they are matched literally.
#    The year group therefore has to end at the non-breaking space that
#    follows the year (the "&nbsp;" entity, or a raw U+00A0 character), not at
#    a plain space: a plain space would end the lazy match inside the
#    whitespace that directly follows <br> and leave the year empty.
#    NOTE(review): the page markup is not part of this file - confirm against
#    the live HTML.
# The pattern is compiled once here instead of once per downloaded page.
MOVIE_PATTERN = re.compile(
    r'<li>.*?<div class="item">.*?<span class="title">(?P<movieName>.*?)</span>'
    r'.*?<p class="">.*?<br>(?P<year>.*?)(?:&nbsp;|\xa0)'
    r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
    r'.*?<span>(?P<number>.*?)人评价',
    re.S,
)


def parse_movies(page_content):
    """Yield one ``[name, year, score, number]`` row per movie in *page_content*.

    *page_content* is the HTML text of one listing page. All values are
    returned as strings; the year is stripped of surrounding whitespace because
    its capture starts right after ``<br>`` and includes the line break and
    indentation. Nothing is yielded when the text contains no movie entry.
    """
    for match in MOVIE_PATTERN.finditer(page_content):
        yield [
            match.group("movieName"),
            match.group("year").strip(),
            match.group("score"),
            match.group("number"),
        ]


def main():
    """Download every listing page and write all movies to ``OUTPUT_PATH``.

    Prints the offset and URL of each page as progress output.

    Raises:
        requests.RequestException: on connection problems, timeouts, or
            (via ``raise_for_status``) an HTTP error status.
    """
    # "with" closes the file even if a request or a write fails midway.
    with open(OUTPUT_PATH, mode="w", encoding="utf-8", newline="") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(CSV_HEADER)  # header row
        for page in range(0, TOTAL_MOVIES, PAGE_SIZE):
            print(page)
            url = "{}?start={}&filter=".format(BASE_URL, page)
            print(url)
            resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            try:
                # Fail loudly on an error page instead of silently writing no rows.
                resp.raise_for_status()
                page_content = resp.text
            finally:
                resp.close()
            csv_writer.writerows(parse_movies(page_content))


if __name__ == "__main__":
    main()