# -*-coding:utf-8-*-
import requests
import re
from bs4 import BeautifulSoup
import json
# Request headers copied from a desktop Chrome 101 session on movie.douban.com.
# They are sent with every request this script makes.
headers = {
    "Accept": "*/*",
    # "br" is deliberately not advertised: requests/urllib3 only decode Brotli
    # when an optional brotli package is installed. Without it the body would
    # arrive still compressed and the later UTF-8 decode would fail.
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6",
    "Connection": "keep-alive",
    # NOTE(review): hard-coded browser session cookie; it will go stale and
    # probably should be loaded from configuration instead -- confirm.
    "Cookie": 'll="118237"; bid=I01Ods0OrJA; __gads=ID=8c2ee8adc452b1dd-226296c262d30071:T=1653474012:RT=1653474012:S=ALNI_MbuAdJQ8W92lI2c2ppXsJ_P2_Ydfg; __utma=30149280.1346640552.1654946721.1654946721.1654946721.1; __utmc=30149280; __utmz=30149280.1654946721.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; __utmb=30149280.1.10.1654946721; __utma=223695111.1075054031.1654946726.1654946726.1654946726.1; __utmb=223695111.0.10.1654946726; __utmc=223695111; __utmz=223695111.1654946726.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1654946726%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DryXr-GuTWHjxdd4DH12MpPbsVyyvP7ODlL-Y4K8jE9dnheeNOtr-Dp5otIX841po%26wd%3D%26eqid%3Dece2d0e0001ac7a90000000262a47b9a%22%5D; _pk_ses.100001.4cf6=*; __gpi=UID=000005b82ead9fc6:T=1653474012:RT=1654946726:S=ALNI_MZTl9rO1QOcR3yeHARedOY_xtnAtA; _vwo_uuid_v2=D4B899194D0A310952B78BD259E69F4BB|7e0650f469f7bccd2db729323dbb7556; _pk_id.100001.4cf6=fdfde254c89a301d.1653473993.2.1654946807.1653474011.',
    "Host": "movie.douban.com",
    "Referer": "https://movie.douban.com/explore",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
    # Each brand in the list is a quoted string; the opening quote of the
    # first brand was missing, leaving the value with unbalanced quotes.
    "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"',
    # This header carries the bare token ?0 / ?1, not a quoted string.
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
}
def get_detail(url, title, timeout=10):
    """Fetch a movie's cast page and append every character name to ``a.txt``.

    The page at ``url`` is downloaded and parsed with BeautifulSoup. The second
    ``div.list-wrapper`` on the page is scanned for ``li.celebrity`` entries
    (presumably the first wrapper lists the directors and the second the
    cast -- confirm against the live page). For every entry whose role text
    contains ``"(饰"``, the text between ``"(饰"`` and the next ``")"`` is
    treated as the character name.

    Each match is appended to ``a.txt`` as ``<role>\\t<title>\\t\\r\\n`` and
    echoed to stdout.

    Args:
        url: Address of the movie's "celebrities" page.
        title: Movie title written next to each character name.
        timeout: Seconds to wait for the HTTP response before giving up.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Without a timeout a single stalled connection blocks the crawl forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    text = response.content.decode("utf-8")
    soup = BeautifulSoup(text, "lxml")

    wrappers = soup.find_all("div", attrs={"class": "list-wrapper"})
    if len(wrappers) < 2:
        # Pages without a second list (or anti-bot/error pages) used to crash
        # the whole run with an IndexError; report and skip instead.
        print("-未找到演员列表,跳过:" + url)
        print("-------------------------")
        return

    li_list = wrappers[1].find_all("li", attrs={"class": "celebrity"})

    # encoding: the names are Chinese, so the platform default codec is unsafe.
    # newline="": keeps the explicit "\r\n" from becoming "\r\r\n" on Windows.
    with open("a.txt", "a", encoding="utf-8", newline="") as f:
        for item in li_list:
            role_span = item.find("span", attrs={"class": "role"})
            if role_span is None:
                # Entry has no role description at all.
                continue
            role = role_span.text
            if "(饰" not in role:
                continue
            role_name = role.split("(饰")[1].split(")")[0].strip()
            # The trailing "\r\n" is joined as a third field, so each line
            # ends with a tab before the line break; existing output format
            # is kept unchanged.
            f.write("\t".join([role_name, title, "\r\n"]))
            print("-解析结果:" + " ".join([role_name, title]))
    print("-------------------------")
def main():
    """Walk the Douban movie search listing and extract cast roles per movie.

    Requests up to 100 result pages of 100 movies each for the tag "华语",
    sorted by recommendation, and calls ``get_detail`` for every movie
    returned. Stops early once a page comes back without any subjects.
    """
    for i in range(100):
        url = "http://movie.douban.com/j/search_subjects?type=movie&tag=华语&sort=recommend&page_limit=100&page_start=" + str(i * 100)
        # Without a timeout a single stalled connection blocks the crawl forever.
        r1 = requests.get(url, headers=headers, timeout=10)
        data = json.loads(r1.text)

        # An error payload may lack "subjects"; an exhausted listing returns
        # an empty list. Either way there is nothing more to fetch.
        subjects = data.get("subjects") if isinstance(data, dict) else None
        if not subjects:
            break

        for movie in subjects:
            title = movie.get("title", "")
            movie_url = movie.get("url", "")
            if not movie_url:
                # No detail page to visit; "" + "celebrities" is not a URL.
                continue
            # Normalise the slash so the path is right whether or not the
            # listing URL already ends with one.
            detail_url = movie_url.rstrip("/") + "/celebrities"
            print("当前解析电影:%s" % title)
            get_detail(detail_url, title)


if __name__ == "__main__":
    main()