# 案例-豆瓣最受欢迎的影评:BS4、xpath
# 地址: https://movie.douban.com/review/best/?start=0
# xpath 代码:
import requests
import json
from lxml import etree
# HTTP headers passed to every requests.get() call in this file. The
# User-Agent mimics a desktop Chrome browser.
# NOTE(review): presumably the site rejects the default python-requests
# User-Agent -- confirm before changing or removing this.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
def list_to_str(lst):
    """Concatenate the string items of a list into one string.

    Args:
        lst: A list (or list subclass) whose items are all strings, such
            as the result of an lxml ``.../text()`` XPath query.

    Returns:
        The items joined with no separator (``''`` for an empty list), or
        ``None`` when ``lst`` is not a list.

    Raises:
        TypeError: If any item of the list is not a string.
    """
    # The argument must be a list; anything else yields None, as before.
    if not isinstance(lst, list):
        return None
    # str.join builds the result in one pass instead of repeated `+`.
    return ''.join(lst)
def xpath_html_review(url, timeout=10):
    """Download a single review page and return its body text.

    Args:
        url: Full URL of the review page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            whole crawl indefinitely.

    Returns:
        The result of ``list_to_str`` applied to the text nodes of the
        ``<p>`` elements matched inside the ``div.main-bd`` container
        (an empty string when nothing matches).

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the timeout expires.
    """
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 decoding of the response body instead of relying on the
    # encoding guessed by requests.
    resp.encoding = "UTF-8"
    html_tree = etree.HTML(resp.text)
    data = html_tree.xpath('//div[@class="main-bd"]/div[1]/div[1]/p/text()')
    return list_to_str(data)
def xpath_html(page=0, timeout=10):
    """Print the author and full text of every review on one listing page.

    Args:
        page: Value substituted into the ``start`` query parameter of the
            listing URL, e.g. ``20`` requests
            ``https://movie.douban.com/review/best/?start=20``.
        timeout: Seconds to wait for the listing page before ``requests``
            gives up, so a stalled connection cannot hang the crawl.

    Returns:
        None. Results are written to standard output.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the timeout expires.
    """
    url = 'https://movie.douban.com/review/best/?start={}'.format(page)
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.encoding = "UTF-8"
    html_tree = etree.HTML(resp.text)
    data = html_tree.xpath('//div[@class="article"]/div[1]/div')
    for item in data:
        authors = item.xpath('./div//header[@class="main-hd"]//a[@class="name"]/text()')
        data_cid = item.xpath('./@data-cid')
        # Not every child <div> is guaranteed to be a review entry; skip
        # those without an author link or a data-cid attribute instead of
        # failing with IndexError on the [0] lookups below.
        if not authors or not data_cid:
            continue
        print("影评作者==>", authors[0])
        # The data-cid attribute value is appended to the review base URL
        # to reach the page holding the full review text.
        review_url = 'https://movie.douban.com/review/' + data_cid[0]
        print("影评内容==>", xpath_html_review(review_url))
# Pagination: walk the listing offsets 0, 20, 40, 60 and 80, printing
# each offset before scraping the corresponding page.
for i in range(0, 100, 20):
    print("==>", i)
    xpath_html(page=i)
# 浙公网安备 33010602011771号