案例-三国演义:xpath
网址: http://guoxue.lishichunqiu.com/gdxs/sanguoyanyi
xpath代码:
import json
from urllib.parse import urljoin

import requests
from lxml import etree
# Browser-like User-Agent header passed to the HTTP requests made below.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/54.0.2840.99 Safari/537.36"
    ),
}
def list_to_str(lst):
    """Concatenate a list of strings into a single string.

    Args:
        lst: A list of strings. Must be a ``list`` (or a subclass of it).

    Returns:
        All items joined in order with no separator; ``''`` for an empty
        list.

    Raises:
        TypeError: If ``lst`` is not a list.
    """
    # The argument must be a list; fail loudly instead of falling through
    # without a defined result.
    if not isinstance(lst, list):
        raise TypeError(
            "lst must be a list, got {}".format(type(lst).__name__)
        )
    # str.join builds the result in one pass instead of repeated `+`.
    return "".join(lst)
def html_content(url, timeout=10):
    """Download a page and return the text found inside its content block.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text nodes selected under the element with ``id="content"``,
        concatenated into one string by ``list_to_str``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests can block indefinitely on a stalled server.
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Do not parse an HTTP error page as if it were chapter text.
    resp.raise_for_status()
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    # Direct text of <span>/<div> descendants of #content, skipping elements
    # styled "display:none" and <div class="prenext"> blocks.
    data = html_tree.xpath('//div[@id="content"]//span[not(@style="display:none")]/text() | //div[@id="content"]//div[not(@class="prenext") and not(@style="display:none")]/text()')
    return list_to_str(data)
# Index page whose links are followed below.
url = 'http://guoxue.lishichunqiu.com/gdxs/sanguoyanyi/'
# Without a timeout, requests can block indefinitely on a stalled server.
resp = requests.get(url, headers=headers, timeout=10)
# Stop early on an HTTP error instead of parsing an error page.
resp.raise_for_status()
resp.encoding = 'UTF-8'
html_tree = etree.HTML(resp.text)
# NOTE(review): lxml does not insert <tbody> the way browsers do; confirm the
# served HTML really contains these <tbody> elements.
data = html_tree.xpath('//tbody/tr/td/table[2]/tbody/tr/td/ul')
for item in data:
    titles = item.xpath('.//a/text()')
    links = item.xpath('.//a/@href')
    # Skip <ul> blocks without a usable link instead of failing on [0].
    if not titles or not links:
        continue
    print("章节:", titles[0])
    # Resolve relative hrefs against the index URL; absolute links are
    # returned unchanged by urljoin.
    item_link = urljoin(url, links[0])
    print("内容:", html_content(item_link) )
浙公网安备 33010602011771号