# Python 使用 BeautifulSoup 爬取人人影视的详情页面

import requests,json
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Scrape one movie detail page from yyets.com and dump the parsed
    # fields (title, cover image, release date, genres, description,
    # still images, cast, download link) into result.json.
    url_template = "https://yyets.com/movies/#d#/"
    movie_id = "202718"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    }
    result = {}
    url = url_template.replace("#d#", movie_id)
    # timeout so a dead server cannot hang the script forever
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Header section: title, cover image, release date, genres.
        head = soup.select("div#single div.sheader")
        for headItem in head:
            result['title'] = headItem.select("div.data h1")[0].getText()
            result['cover_url'] = headItem.select("div.poster img")[0]['src']
            result['date'] = headItem.select("div.extra span.date")[0].getText()
            result['classify'] = [item.getText()
                                  for item in headItem.select("div.sgeneros a")]

        # Info section: synopsis text plus gallery of still images.
        info = soup.select("div#single div#info")
        for infoItem in info:
            desc = (infoItem.select("div.wp-content")[0]
                    .getText().strip())
            # Drop only the trailing line (site boilerplate). The old
            # str.replace() approach removed the suffix everywhere it
            # occurred, and stripped the last *character* globally when
            # the text contained no newline (rfind == -1).
            newline_pos = desc.rfind('\n')
            if newline_pos != -1:
                desc = desc[:newline_pos].rstrip()
            result['desc'] = desc
            result['pic_list'] = [imgItem['src'].strip()
                                  for imgItem in infoItem.select("div#dt_galery img")]

        # Cast section: first entry is the director, so skip it ([1:]).
        cast = soup.select("div#single div#cast")
        for castItem in cast:
            result['actor'] = [
                actor.getText()
                for actor in castItem.select(
                    "div.persons div.person div.data div.name a")[1:]
            ]

        # Download link: default to '' so the key always exists, even
        # when the box is present but contains no anchor. Catch only
        # the lookup errors a missing element can raise — the old bare
        # `except:` swallowed everything, including KeyboardInterrupt.
        result['link'] = ''
        try:
            box_links = soup.select("div#single div.box_links")
            for linkItem in box_links:
                result['link'] = linkItem.select("div#videos table tr td a")[0]['href']
        except (IndexError, KeyError):
            result['link'] = ''

    with open('result.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=4)
        print("爬取成功!")


# posted @ 2024-06-02 21:53  Excel2016  阅读(17)  评论(0)    收藏  举报