案例-爬取笔尖中文小说:xpath

地址: https://www.biquzw.la/wanjiexiaoshuo/

 

xpath代码:

import requests
import json
from lxml import etree
# HTTP headers passed to every requests.get() call in this script; only a
# desktop-browser User-Agent is set (presumably so the site serves the
# regular HTML page to the script -- confirm against the site's behavior).
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

# Listing page that the crawl starts from (handed to get_books below).
url = 'https://www.biquzw.la/wanjiexiaoshuo/'

def list_to_str(lst):
    """Concatenate a list of strings into a single string.

    Args:
        lst: A ``list`` (or list subclass) whose items are all strings.

    Returns:
        The items joined together with no separator, or ``None`` when
        ``lst`` is not a list (kept for backward compatibility with callers
        that rely on the original fall-through behavior).

    Raises:
        TypeError: If any item in the list is not a string.
    """
    # isinstance() also accepts list subclasses, unlike an exact type() match.
    if not isinstance(lst, list):
        return None
    # str.join builds the result in one pass instead of repeated "+" copies.
    return ''.join(lst)

def get_contents(url):
    """Download a chapter page and return its body text.

    Fetches ``url`` with the module-level ``headers``, decodes the response
    as UTF-8 and concatenates every text node that is a direct child of
    ``<div id="content">``. The text is also printed to stdout.

    Args:
        url: Address of the chapter page.

    Returns:
        The concatenated text with leading/trailing whitespace stripped;
        an empty string when the XPath matches no text nodes.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # Without a timeout a single stalled connection blocks the crawl forever.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    data = html_tree.xpath('//div[@id="content"]/text()')
    res = list_to_str(data).strip()
    print("内容######>",res)
    return res

def get_name_url(url):
    """Walk a book's chapter index and fetch every chapter it links to.

    Downloads ``url``, selects each ``<dd>`` under ``<div id="list">/<dl>``,
    prints the chapter title and resolved chapter address, then calls
    ``get_contents`` on that address and prints the returned text.

    Args:
        url: Address of the book's chapter-index page.

    Returns:
        None. Results are only printed.

    Raises:
        requests.exceptions.RequestException: If a request fails or does
            not complete within the timeout.
    """
    from urllib.parse import urljoin

    # Without a timeout a single stalled connection blocks the crawl forever.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    data = html_tree.xpath('//div[@id="list"]/dl/dd')
    for item in data:
        names = item.xpath('./a/text()')
        hrefs = item.xpath('./a/@href')
        if not names or not hrefs:
            # A <dd> without a titled link has nothing to fetch; indexing
            # [0] on the empty result would raise IndexError.
            continue
        item_name = names[0]
        # urljoin resolves relative, root-relative and absolute hrefs
        # against the index page, where plain concatenation only works for
        # a bare file name appended to a URL ending in "/".
        full_url = urljoin(url, hrefs[0])
        print("章节名==>", item_name)
        print("地址===>",full_url)
        contents = get_contents(full_url)
        print("内容:===>", contents)

def get_books(url):
    """Crawl a book-listing page and descend into every book on it.

    Downloads ``url``, selects each ``<li>`` under
    ``<div class="novelslistss">/<ul>``, prints the category (span 1),
    author (span 4), title and link (anchor in span 2), then calls
    ``get_name_url`` on the book's address.

    Args:
        url: Address of the listing page.

    Returns:
        None. Results are only printed.

    Raises:
        requests.exceptions.RequestException: If a request fails or does
            not complete within the timeout.
    """
    from urllib.parse import urljoin

    # Without a timeout a single stalled connection blocks the crawl forever.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    data = html_tree.xpath('//div[@class="novelslistss"]/ul/li')
    for item in data:
        types = item.xpath('./span[1]/text()')
        names = item.xpath('./span[2]/a/text()')
        hrefs = item.xpath('./span[2]/a/@href')
        authors = item.xpath('./span[4]/text()')
        if not (types and names and hrefs and authors):
            # Rows missing any expected field would raise IndexError on
            # [0]; skip them instead of aborting the whole crawl.
            continue
        item_type = types[0]
        item_name = names[0]
        item_url = hrefs[0]
        item_author = authors[0]
        print("作品类别:------>",item_type)
        print("作者:------>", item_author)
        print("作品名:------>", item_name)
        print("作品链接:------>", item_url)
        # Absolute hrefs pass through urljoin unchanged; relative ones are
        # resolved against the listing page so the next request is valid.
        get_name_url(urljoin(url, item_url))

# Script entry point: runs the crawl, starting from the listing page in ``url``.
get_books(url)

 

 

posted @ 2023-01-06 17:23  屠魔的少年  阅读(4)  评论(0)    收藏  举报