案例-豆瓣最受欢迎的影评:BS4、xpath

地址: https://movie.douban.com/review/best/?start=0

 

xpath代码:

import requests
import json
from lxml import etree

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

def list_to_str(lst):
    """Concatenate a list of strings into a single string.

    Args:
        lst: A list (or list subclass) whose items are all strings.

    Returns:
        The items joined with no separator ('' for an empty list), or
        None when ``lst`` is not a list.

    Raises:
        TypeError: If any item of the list is not a string.
    """
    # Non-list input has always produced None; keep that, but say so explicitly.
    if not isinstance(lst, list):
        return None
    # str.join builds the result in one pass instead of repeated "+" copies.
    return ''.join(lst)

def xpath_html_review(url, timeout=10):
    """Download one review page and return the text matched by the XPath below.

    Args:
        url: Absolute URL of the review page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, requests waits indefinitely on a stalled connection.

    Returns:
        The matched ``<p>`` text nodes concatenated by ``list_to_str``;
        an empty string when the XPath matches nothing.
    """
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 decoding instead of relying on requests' charset detection.
    resp.encoding = "UTF-8"
    html_tree = etree.HTML(resp.text)
    paragraphs = html_tree.xpath('//div[@class="main-bd"]/div[1]/div[1]/p/text()')
    return list_to_str(paragraphs)

def xpath_html(page=0, timeout=10):
    """Print the author and review text for every entry on one listing page.

    Args:
        page: Value substituted into the ``start`` query parameter of the
            listing URL (e.g. 0, 20, 40, ...).
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, requests waits indefinitely on a stalled connection.

    Entries that have no author link or no ``data-cid`` attribute are
    skipped rather than raising ``IndexError``.
    """
    url = 'https://movie.douban.com/review/best/?start={}'.format(page)
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 decoding instead of relying on requests' charset detection.
    resp.encoding = "UTF-8"
    html_tree = etree.HTML(resp.text)
    entries = html_tree.xpath('//div[@class="article"]/div[1]/div')

    for item in entries:
        authors = item.xpath('./div//header[@class="main-hd"]//a[@class="name"]/text()')
        data_cid = item.xpath('./@data-cid')
        # XPath returns an empty list when nothing matches; indexing [0]
        # on it would abort the whole page, so skip such entries.
        if not authors or not data_cid:
            continue
        print("影评作者==>", authors[0])
        # The review id from data-cid forms the URL of the full review page.
        review_url = 'https://movie.douban.com/review/' + data_cid[0]
        print("影评内容==>", xpath_html_review(review_url))

# Pagination: walk the listing with start = 0, 20, 40, 60, 80.
for start in range(0, 100, 20):
    print("==>", start)
    xpath_html(page=start)

posted @ 2023-01-03 17:58  屠魔的少年  阅读(10)  评论(0)    收藏  举报