#使用requests、正则表达式,爬取豆瓣电影top250排行榜
#要求抓取名次、影片名称、年份、导演等字段。
import requests
import re
import csv
import time
class doubanTop250():
    """Scrape the Douban Top 250 movie chart with regexes and export it to CSV.

    Each scraped film is stored in ``self.film_list`` as a dict keyed by
    ``FIELDS`` (rank, name, country, director, score); all values are the raw
    strings captured from the page.
    """

    # Keys of every film record; also the CSV header and column order.
    FIELDS = ('rank', 'name', 'country', 'director', 'score')
    # The chart is paginated through the ``start`` query parameter.
    PAGE_SIZE = 25
    TOTAL_FILMS = 250

    def __init__(self):
        # Per-instance storage: a class-level list would be shared (and keep
        # growing) across every instance of the scraper.
        self.film_list = []

    # 1. Send the request
    def send_request(self, url, timeout=10):
        """Fetch ``url`` with a browser-like User-Agent and return the response.

        Args:
            url: Page address to download.
            timeout: Seconds to wait for the server before ``requests`` gives
                up; without it a stalled connection blocks forever.

        Returns:
            The ``requests`` response object (its status code is printed).
        """
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
        response = requests.get(url=url, headers=headers, timeout=timeout)
        print(response.status_code)
        return response

    # 2. Parse the data
    def parse(self, response):
        """Extract film records from one page and append them to ``film_list``.

        Args:
            response: Object whose ``content`` attribute holds the page bytes.

        The five fields are matched independently and then combined by
        position. If the patterns yield different numbers of matches, only as
        many records as the shortest match list are produced (instead of
        raising IndexError).
        """
        data = response.content.decode()
        # Raw strings keep ``\d`` a regex escape rather than an invalid string
        # escape; lazy quantifiers stop at the first closing delimiter.
        rank = re.findall(r'<em class="">(\d+)</em>', data)
        # The closing quote is matched outside the group so it is not captured
        # as part of the title.
        name = re.findall(r'<img width="100" alt="(.*?)" src=', data)
        # NOTE(review): the next two patterns capture raw line text between
        # their delimiters; confirm against the live page markup.
        country = re.findall(r' / (.*) / ', data)
        director = re.findall(r'导演:(.*)', data)
        score = re.findall(r'<span class="rating_num" property="v:average">(.*?)</span>', data)
        for values in zip(rank, name, country, director, score):
            self.film_list.append(dict(zip(self.FIELDS, values)))

    # 3. Store the data
    def save_data(self, filename='top250.csv'):
        """Write ``film_list`` to a UTF-8 CSV file with a header row.

        Args:
            filename: Output path; defaults to ``top250.csv``.

        An empty ``film_list`` produces a file containing only the header.
        """
        # newline='' is required by the csv module; without it Windows output
        # gets a blank line after every row. ``with`` closes the file even if
        # writing fails.
        with open(filename, 'w', encoding='utf-8', newline='') as csv_file:
            csv_writer = csv.DictWriter(csv_file, fieldnames=self.FIELDS)
            csv_writer.writeheader()
            csv_writer.writerows(self.film_list)

    # 4. Run
    def run(self, delay=5):
        """Download every page of the chart, parse it, and save the CSV.

        Args:
            delay: Seconds to sleep after each page request.
        """
        base_url = "https://movie.douban.com/top250?start="
        # Offsets 0, 25, ..., 225: the stop value must be the total so the
        # final page (start=225) is included.
        for start in range(0, self.TOTAL_FILMS, self.PAGE_SIZE):
            final_url = base_url + str(start)
            response = self.send_request(final_url)
            self.parse(response)
            time.sleep(delay)
        self.save_data()
# Script entry: build the scraper and run the full fetch/parse/save pipeline.
crawler = doubanTop250()
crawler.run()