# coding=utf-8
import json
import os
from urllib.parse import urljoin

import requests
from lxml import etree

'''
Crawl the TOP100 movie board on Maoyan and write the data to a JSON file.
'''
# Build the list of URLs to crawl from the start URL
def get_url(url):
    url_list = [url]
    # Pages 2-10 of the board are addressed via the offset parameter
    for num in range(10, 91, 10):
        data = {'offset': num}
        url_list.append(requests.get(url, params=data).url)
    return url_list
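# Note: get_url above sends a real GET request per page just to read back the
# resolved URL. A network-free sketch of the same idea, using requests'
# documented PreparedRequest API (hypothetical helper, not part of the
# original script):
def build_url_list(url, pages=10, step=10):
    # Prepare (but never send) a request per offset to obtain the encoded URL
    url_list = [url]
    for num in range(step, pages * step, step):
        prepared = requests.Request('GET', url, params={'offset': num}).prepare()
        url_list.append(prepared.url)
    return url_list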
# Fetch the HTML of a page from its URL
def get_html(url):
    header = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    html = requests.get(url, headers=header)
    # requests decodes the body itself; html.text is already unicode
    return html.text
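# A slightly hardened variant (a sketch, not the original behaviour): adds a
# timeout so a stalled connection cannot hang the crawl, and raises on HTTP
# errors so a blocked request is not silently parsed as an empty page.
def get_html_checked(url, timeout=10):
    header = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    response = requests.get(url, headers=header, timeout=timeout)
    response.raise_for_status()  # HTTPError on 4xx/5xx responses
    return response.text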
# Locate elements with XPath and extract the raw data
def get_element(html):
    html = etree.HTML(html)
    img_href = html.xpath("//dl[@class='board-wrapper']/dd/a/@href")
    title = html.xpath("//dl[@class='board-wrapper']/dd/a/@title")
    actress = html.xpath("//div[@class='movie-item-info']/p[2]/text()")
    releasetime = html.xpath("//div[@class='movie-item-info']/p[3]/text()")
    all_data = [img_href, title, actress, releasetime]
    return all_data
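# The raw values still need cleaning: the hrefs are site-relative paths such
# as '/films/1203', and the actress strings carry the surrounding whitespace
# of the page source, roughly '\n    主演:...\n' (the exact padding is an
# assumption about the page markup).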
# Clean the scraped data
def clear_data(data):
    '''
    Prepend the site root to the relative hrefs in img_href (e.g. /films/1203)
    and strip the newlines and extra whitespace from the actress strings.
    '''
    url = 'http://maoyan.com/'
    img_url = []
    actor = []
    img_href = data[0]
    actress = data[2]
    for tail in img_href:
        img_url.append(urljoin(url, tail))
    data[0] = img_url
    for act in actress:
        # split() with no argument drops all surrounding whitespace, leaving
        # a one-element list such as ['主演:...']
        actor.append(act.split())
    data[2] = actor
    return data
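# urljoin anchors a relative path on the site root, e.g.:
# urljoin('http://maoyan.com/', '/films/1203') -> 'http://maoyan.com/films/1203'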
# Arrange the scraped data into a dictionary
def json_dict(data):
    '''
    Arrange the data as:
    {title: {主演: actors, 上映时间: release date, img_url: link}}
    '''
    json_dic = {}
    img = data[0]
    title = data[1]
    actress = data[2]
    releasetime = data[3]
    # Pair each title with its own actor line, release time and link via zip;
    # iterating the full actress/releasetime/img lists inside the title loop
    # would overwrite each entry until every movie carried the data of the
    # last one on the page.
    for each_title, each_actor, each_time, each_img in zip(title, actress, releasetime, img):
        # each_actor is a one-element list like ['主演:...'] after clear_data
        key, value = each_actor[0].split(':', 1)
        rt, t = each_time.split(':', 1)
        json_dic[each_title] = {key: value, rt: t, 'img_url': each_img}
    return json_dic
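# Shape of the result (values shown are placeholders, not real scraped data):
# {
#     "<movie title>": {
#         "主演": "<actor1,actor2,...>",
#         "上映时间": "<yyyy-mm-dd>",
#         "img_url": "http://maoyan.com/films/<id>"
#     },
#     ...
# }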
# Write the dictionary to a JSON file
def dump_json(dic, filename):
    abspath = os.path.join(os.path.abspath('.'), filename)
    # Without ensure_ascii=False the Chinese text would be escaped to ASCII
    # \uXXXX sequences; indent=4 pretty-prints the dictionary
    with open(abspath, 'w', encoding='utf-8') as f:
        json.dump(dic, f, indent=4, ensure_ascii=False)
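# A quick way to verify the dump round-trips (a sketch; maoyan.json must
# already exist in the working directory):
def load_json(filename='maoyan.json'):
    with open(filename, encoding='utf-8') as f:
        return json.load(f)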
# Main function: crawl every page and merge the results into one dictionary
def main(url):
    url_list = get_url(url)
    json_dic = {}
    for url in url_list:
        html = get_html(url)
        data = get_element(html)
        json_dic.update(json_dict(clear_data(data)))
    dump_json(json_dic, 'maoyan.json')


if __name__ == '__main__':
    url = 'http://maoyan.com/board/4'
    main(url)