案例-爬取笔尖中文小说:xpath
地址: https://www.biquzw.la/wanjiexiaoshuo/
xpath代码:
import json
from urllib.parse import urljoin

import requests
from lxml import etree
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/54.0.2840.99 Safari/537.36"
    ),
}
# Listing page the crawl starts from.
url = "https://www.biquzw.la/wanjiexiaoshuo/"
def list_to_str(lst):
    """Concatenate a list of strings into a single string.

    Args:
        lst: A list whose items are all strings.

    Returns:
        The items joined with no separator, or ``None`` when ``lst`` is
        not a list (the original contract, now made explicit).
    """
    # Only lists are accepted; anything else yields None as before.
    if not isinstance(lst, list):
        return None
    # str.join is linear, unlike repeated `s = s + item` in a loop.
    return "".join(lst)
def get_contents(url):
    """Fetch a chapter page and return its body text.

    Args:
        url: Absolute URL of the chapter page.

    Returns:
        The text nodes directly under ``<div id="content">`` joined
        together, with leading and trailing whitespace stripped.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout, requests waits indefinitely on a stalled
    # connection and the whole crawl would hang.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    data = html_tree.xpath('//div[@id="content"]/text()')
    res = list_to_str(data).strip()
    print("内容######>", res)
    return res
def get_name_url(url):
    """Walk a book's chapter list, printing each chapter and its text.

    For every ``<dd>`` under ``<div id="list">/<dl>`` the chapter name and
    link are printed, then the chapter page is fetched via ``get_contents``
    and its text printed.

    Args:
        url: Absolute URL of the book's chapter-index page.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # Without a timeout, requests waits indefinitely on a stalled
    # connection.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    for item in html_tree.xpath('//div[@id="list"]/dl/dd'):
        names = item.xpath('./a/text()')
        hrefs = item.xpath('./a/@href')
        # Skip <dd> entries with no link instead of raising IndexError.
        if not names or not hrefs:
            continue
        item_name = names[0]
        # urljoin resolves relative, root-relative and absolute hrefs
        # correctly; plain concatenation only works for the first case.
        full_url = urljoin(url, hrefs[0])
        print("章节名==>", item_name)
        print("地址===>", full_url)
        contents = get_contents(full_url)
        print("内容:===>", contents)
def get_books(url):
    """Print every book on a listing page and crawl each one's chapters.

    Each ``<li>`` under ``<div class="novelslistss">/<ul>`` supplies the
    category (span 1), title and link (span 2) and author (span 4). The
    four fields are printed and the book link is handed to
    ``get_name_url``.

    Args:
        url: Absolute URL of the listing page.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # Without a timeout, requests waits indefinitely on a stalled
    # connection.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    for item in html_tree.xpath('//div[@class="novelslistss"]/ul/li'):
        types = item.xpath('./span[1]/text()')
        names = item.xpath('./span[2]/a/text()')
        hrefs = item.xpath('./span[2]/a/@href')
        authors = item.xpath('./span[4]/text()')
        # Skip rows missing any field instead of raising IndexError.
        if not (types and names and hrefs and authors):
            continue
        item_type = types[0]
        item_name = names[0]
        item_url = hrefs[0]
        item_author = authors[0]
        print("作品类别:------>", item_type)
        print("作者:------>", item_author)
        print("作品名:------>", item_name)
        print("作品链接:------>", item_url)
        # Resolve against the listing page so a relative href still gives
        # a fetchable URL; an absolute href passes through unchanged.
        get_name_url(urljoin(url, item_url))
# Script entry point: crawl the books listed on the page at `url`.
get_books(url)
浙公网安备 33010602011771号