案例分享:爬取17k小说网小说优化版

对爬取17k小说的代码进行优化

import requests
import os
from time import sleep
from lxml import etree
import random
from fake_useragent import UserAgent

# Global request headers with a randomized User-Agent.
# fake_useragent supplies a random real-world UA string so requests
# look less like an automated client.
fake_ua = UserAgent()
# BUG FIX: the standard HTTP header name is 'User-Agent' (with a hyphen).
# The original key 'UserAgent' is not a real header, so servers still saw
# the default python-requests User-Agent and the disguise had no effect.
headers = {
    'User-Agent': fake_ua.random,
}

# Fetch a batch of proxy IPs from the zhima proxy API.
# NOTE(review): proxies_list is built here but never passed to any request
# below — presumably intended for requests' `proxies=` argument; confirm
# before relying on these proxies actually being used.
proxy_url = 'http://webapi.http.zhimacangku.com/getip?num=5&type=2&pro=&city=0&yys=0&port=11&pack=292706&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions='
response = requests.get(url=proxy_url)
data = response.json()["data"]
proxies_list = []
for item in data:
    ip = str(item["ip"])
    port = str(item["port"])
    # FIX: requests expects the proxy value to include a scheme,
    # e.g. {'https': 'http://1.2.3.4:8080'}; a bare 'ip:port' string is
    # rejected (or misinterpreted) by modern urllib3.
    proxies = {'https': 'http://' + ip + ':' + port}
    print(proxies)
    proxies_list.append(proxies)
# A Session object carries the login cookies into every later request,
# so authenticated pages (the bookshelf API) work without manual cookie
# handling.
session = requests.session()
# Account credentials for the 17k passport endpoint (left blank here —
# fill in before running).
credentials = {
    "loginName": "",
    "password": ""
}
# POST the credentials; the session stores the auth cookies it gets back.
session.post(
    url='https://passport.17k.com/ck/user/login',
    headers=headers,
    data=credentials,
)

# Query page 1 of the author's bookshelf through the logged-in session.
query = {
    "page": "1",
    "appKey": "2406394919"
}
bookshelf_url = 'https://user.17k.com/ck/author/shelf?'
response = session.get(url=bookshelf_url, headers=headers, params=query)
response.encoding = 'utf8'
# "data" holds the list of books on the shelf (one dict per book).
data_all = response.json()["data"]
# Output directory for all downloaded novels.
# FIX: create it once up front instead of re-checking inside the inner
# chapter loop on every iteration.
filename = '17k小说'
if not os.path.exists(filename):
    os.mkdir(filename)

# For each book on the shelf: fetch its chapter index, then download and
# append every chapter to one text file per book.
for data_every in data_all:
    bookId = data_every["bookId"]
    bookName = data_every["bookName"]
    # Chapter index page of this book.
    book_url = 'https://www.17k.com/list/' + str(bookId) + '.html'
    response = requests.get(url=book_url, headers=headers)
    response.encoding = 'utf8'
    page_text = response.text
    sleep(1)  # be polite to the server between requests
    tree = etree.HTML(page_text)
    sleep(1)
    # NOTE: the original also scraped the first <dl> ("related works")
    # into a variable that was never used; that dead scrape is removed.
    # Body chapters: every <a> under the second <dl> of the index page.
    dd_body = tree.xpath('/html/body/div[5]/dl[2]/dd/a')
    for d in dd_body:
        novel_body_href = 'https://www.17k.com' + d.xpath('./@href')[0]
        sleep(1)
        response = requests.get(url=novel_body_href, headers=headers)
        response.encoding = 'utf8'
        page_text = response.text
        tree = etree.HTML(page_text)
        chapter_title = tree.xpath('//*[@id="readArea"]/div[1]/h1/text()')[0]
        # All content paragraphs except the trailing one
        # (position()<last() drops the site's closing notice paragraph).
        chapter_content_all = tree.xpath(
            '//div[contains(@class,"content")]/div[@class="p"]/p[position()<last()]/text()')
        filepath = os.path.join(filename, '%s.txt' % bookName)
        try:
            # 'a+' appends, so successive chapters accumulate in one
            # file per book. The with-statement closes the file; the
            # original's explicit f.close() inside it was redundant.
            with open(filepath, 'a+', encoding='utf8') as f:
                f.write('\n' + "--------%s--------" % chapter_title + '\n')
                for line in chapter_content_all:
                    f.write('\n' + line + '\n')
            # Report success only after the file is safely closed.
            print(chapter_title, '已经写入完成')
        except OSError as e:
            # Narrowed from bare Exception: only file-system errors are
            # expected here; anything else should surface loudly.
            print(e)
posted @ 2023-03-13 16:53  Chimengmeng  阅读(338)  评论(0)    收藏  举报