# -*- coding: utf-8 -*-
"""
Created on Sat Oct 8 13:09:04 2022
@author: 小徐同学
"""
# Scrape movie details from Douban using XPath queries.
import requests
from lxml.html import fromstring
# Landing page; the links found in its tables are the pages scraped below.
base_url = "https://movie.douban.com/"
# Browser-like User-Agent header sent with every request made by this script.
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.34"}
# NOTE: this download runs as soon as the module is loaded. A timeout is set
# so that a stalled connection raises instead of blocking forever.
request = requests.get(url=base_url, headers=headers, timeout=10)
# Force UTF-8 decoding of the body rather than relying on the guessed charset.
request.encoding = 'utf-8'
# Collect the URL of every movie and return them as a list.
def get_url(html_text):
    """Return the movie links found in a listing page.

    Args:
        html_text: HTML source of the listing page.

    Returns:
        A list holding the ``href`` value of every ``<a>`` matched by
        ``//table/tr/td/a``, in document order. The list is empty when the
        input is empty or nothing matches.
    """
    # An empty document makes the HTML parser raise, so short-circuit here.
    if not html_text or not html_text.strip():
        return []
    # Parse the argument that was actually passed in, not the module-level
    # response, so the function works for any page source.
    doc = fromstring(html_text)
    # Selecting the attribute directly skips anchors that carry no href
    # instead of failing on an empty per-element lookup.
    return doc.xpath("//table/tr/td/a/@href")
# Download the page source of a single movie.
def get_every_text(movie_title_url):
    """Fetch one movie page and return its HTML text.

    Args:
        movie_title_url: URL of the movie page to download.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.exceptions.Timeout: if the server does not respond within
            the timeout. Other ``requests`` errors propagate unchanged.
    """
    # Use a distinct local name so the module-level ``request`` is not
    # shadowed, and set a timeout so a stalled server cannot hang the crawl.
    response = requests.get(url=movie_title_url, headers=headers, timeout=10)
    response.encoding = 'utf-8'
    return response.text
# Extract the title, director, cast, release date, runtime and rating of a movie.
def get_every_detail_content(every_detail):
    """Parse a movie detail page, print the extracted fields and return them.

    Args:
        every_detail: HTML source of one movie detail page.

    Returns:
        A one-element list wrapping the record
        ``[title, director, cast, release_date, runtime, score]``. Each field
        is the list of text nodes matched by its XPath query and may be empty.
        The same structure is printed to stdout.
    """
    doc = fromstring(every_detail)

    title = doc.xpath("//*[@id='content']/h1/span[1]/text()")
    # Director
    director = doc.xpath(".//*[@id='info']/span/span/a[@rel='v:directedBy']/text()")
    # Cast
    actor_and_actress = doc.xpath(".//div[@id='info']/span[@class='actor']/span//a/text()")
    # Release date
    date = doc.xpath(".//div[@id='info']/span[@property='v:initialReleaseDate']/text()")
    # Runtime. The earlier query repeated the release-date selector; the page
    # is expected to tag the runtime with property="v:runtime" -- TODO confirm
    # against the live markup.
    time_long = doc.xpath(".//div[@id='info']/span[@property='v:runtime']/text()")
    # Rating. The predicate must test the class attribute; the previous form
    # compared @class with a boolean and so accepted any div that had a class.
    score = doc.xpath(".//div[@id='interest_sectl']/div/div[contains(@class,'rating_self')]/strong/text()")

    record = [title, director, actor_and_actress, date, time_long, score]
    # Keep the nested [[...]] shape so the printed output format is unchanged.
    result = [record]
    print(result)
    return result
if __name__ == "__main__":
    # get_url may yield nothing when the landing page has no matching links;
    # fall back to an empty list so the loop is skipped instead of failing
    # while iterating over None.
    movie_title_urls = get_url(request.text) or []
    for url in movie_title_urls:
        # Download each movie page, then parse and print its details.
        every_detail = get_every_text(url)
        get_every_detail_content(every_detail)