#需求:抓取猫眼电影TOP100的电影名称、时间、评分、图片等信息,提取的结果会以文件的形式保存下来
import requests
import time
from lxml import etree
import json
import csv
import codecs
class MaoYanTop100Spider:
#存储电影详情页的url
film_page_url_list = []
#存储每个的电影信息
#film_info = {}
film_info_list = []
# 1.获取电影列表页数据
def Top100_list(self, session, headers):
#1.1向列表页发送请求
#https://maoyan.com/board/4?offset=20
#(1)固定url
base_url = "https://maoyan.com/board/4"
#(2)url变化部分:
for i in range(0, 91, 10):
#(3)拼接URL:
final_url = base_url + "?offset=" + str(i)
#(4)发送请求:
time.sleep(5)
response = session.get(url=final_url, headers=headers)
#1.2解析列表页
film_list_page_data = response.content.decode("utf-8")
#1.2.1使用xpath解析数据
#(1)转类型
xpath_data = etree.HTML(film_list_page_data)
#(2)
#/dl/dd[1]/a
#dl/dd[10]/a/img[2]
# dl / dd[2] / a / img[2]
for xpath_num in range(1, 11):
# 电影名称
#dl/dd[1]/a
film_name = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/a/@title')[0]
# 时间
#//*[@id="app"]/div/div/div[1]/dl/dd[1]/div/div/div[1]/p[3]
#dl/dd[1]/div/div/div[1]/p[3]
#dl/dd[2]/div/div/div[1]/p[3]
film_time = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/div/div/div[1]/p[3]/text()')[0][5:].strip()
# 主演
#dl/dd[1]/div/div/div[1]/p[2]
film_actors = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/div/div/div[1]/p[2]/text()')[0].strip()[3:]
# 评分
#dl/dd[1]/div/div/div[2]/p/i[1]
score_int = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/div/div/div[2]/p/i[1]/text()')[0]
#dl/dd[1]/div/div/div[2]/p/i[2]
score_fraction = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/div/div/div[2]/p/i[2]/text()')[0]
film_score = str(score_int) + str(score_fraction)
# 图片
#dl/dd[1]/a/img[2]
#dl/dd[1]/a/img[2]
film_img = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/a/img[2]/@data-src')[0]
# 详情页url
#dl/dd[1]/div/div/div[1]/p[1]/a
#dl/dd[1]/div/div/div[1]/p[1]/a
film_url = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/div/div/div[1]/p[1]/a/@href')[0]
#电影信息
film_info = {}
film_info["name"] = film_name
film_info["time"] = film_time
film_info["actors"] = film_actors
film_info["score"] = film_score
film_info["img"] = film_img
film_info["url"] = film_url
self.film_info_list.append(film_info)
#print(film_info)
#详情页url
self.film_page_url_list.append(film_url)
# 2.获取电影详情页数据
def film_page(self, url, session, headers, num):
#2.1向详情页发送请求
base_url = "https://maoyan.com"
final_url = base_url + str(url)
print(final_url)
time.sleep(3)
response = session.get(url=final_url, headers=headers)
data = response.content.decode("utf-8")
#print(response)
#2.2解析详情页
xpath_data = etree.HTML(data)
#//*[@id="app"]/div/div[1]/div/div[2]/div[1]/div[1]/div[2]/span
film_summary = xpath_data.xpath('//span[@class="dra"]/text()')[0].strip()
#print(film_summary)
self.film_info_list[num]["summary"] = film_summary
#将数据保存至CSV文件
def save_data(self):
#1.读取json文件,创建csv文件
#json_fp = open("new.json", "r")
csv_fp = codecs.open("maoyan.csv", "w", "utf-8")
#2.提出csv文件表头,表内容
#2.1 表头
#data_list = json.load(json_fp)
title_list = self.film_info_list[0].keys()
#2.2 表内容
excel_data = []
for data in self.film_info_list:
excel_data.append(data.values())
#3.使用csv写入器,写入文件
#3.1创建csv写入器
csv_writer = csv.writer(csv_fp)
#3.2写入表头和表内容
#(1)写入表头
csv_writer.writerow(title_list)
#(2)写入表内容
csv_writer.writerows(excel_data)
#4.关闭csv文件和json文件
#json_fp.close()
csv_fp.close()
#运行:
def run(self):
#0.创建session,维持会话
session = requests.Session()
#0.1请求头:headers
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
}
#1.获取电影列表页数据
self.Top100_list(session=session, headers=headers)
#print(self.film_info_list)
#2.获取电影详情页数据
for i, film_page_url in enumerate(self.film_page_url_list):
self.film_page(url=film_page_url, session=session, headers=headers, num=i)
print(self.film_info_list[i])
#3.保存数据
self.save_data()
MaoYanTop100Spider().run()