05_How long does it take to rip an entire novel - scraping a novel with coroutines - aiofiles - request pitfalls

# _*_ coding: utf-8 _*_
import asyncio
import os
import time

import aiofiles
import aiohttp
import requests
from lxml import etree


async def download_one(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36 Edg/115.0.1901.183",
        "Referer": "https://www.zanghaihua.org/mingchaonaxieshier/",
        "Cookie": "fontFamily=null; fontColor=null; fontSize=null; bg=null; __51cke__=; font=middle; __tins__21535041=%7B%22sid%22%3A%201690213073948%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201690215140987%7D; __51laig__=8"
    }
    while 1:
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=headers) as resp:
                    # session.get() has no encoding parameter; the encoding belongs
                    # on resp.text() (switch to "gbk" if the chapter pages come out
                    # garbled, like the table of contents below)
                    page_source = await resp.text(encoding="utf-8")

                    # start parsing the chapter page
                    tree = etree.HTML(page_source)
                    # .strip() cuts off the annoying newlines and spaces around the title
                    title = tree.xpath("//div[@class = 'reader-main']/h1/text()")[0].strip()  # [0] takes the first match
                    # "\n".join() stitches the paragraphs together; .replace() strips the \u3000 full-width spaces
                    content = "\n".join(tree.xpath("//div[@class = 'content']/text()")).replace("\u3000", "")

                    async with aiofiles.open(f"./明朝那些事儿/{title}.txt", mode="w", encoding="utf-8") as f:
                        await f.write(content)
                    break  # exit the loop on success; on failure, loop and retry
        except Exception:
            print("request failed, retrying", url)

    print("download finished", url)


async def download(href_list):
    tasks = []
    for href in href_list:
        t = asyncio.create_task(download_one(href))
        tasks.append(t)
        # break  # uncomment to test with a single chapter first
    await asyncio.wait(tasks)


def get_every_chapter_url(url):  # fetch the table of contents
    for i in range(2):  # at most two attempts; returns None if both fail
        try:  # if the request errors out...
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.82"
            }
            resp = requests.get(url, headers=headers)  # add verify=False only if SSL verification gets in the way
            resp.encoding = "gbk"  # the page is GBK-encoded
            tree = etree.HTML(resp.text)
            # the chapter links sit under ul#section-list (found via F12 / inspect element)
            href_list = tree.xpath('//ul[@id="section-list"]/li/a/@href')
            print(href_list)  # show what we got
            return href_list
        except Exception:  # ...try once more
            print("retrying", url)
            time.sleep(3)


def main():  # entry point
    # grab the url of every chapter from the table of contents
    url = "https://www.zanghaihua.org/mingchaonaxieshier/"
    href_list = get_every_chapter_url(url)
    # make sure the output directory exists before the coroutines write into it
    os.makedirs("./明朝那些事儿", exist_ok=True)
    # start the coroutines and download chapter by chapter
    asyncio.run(download(href_list))


if __name__ == '__main__':
    main()
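
One thing worth checking before running this end to end: the hrefs pulled from //ul[@id="section-list"]/li/a/@href may be site-relative paths rather than full URLs (it depends on the page's markup, so treat this as an assumption), in which case download_one would be handed something aiohttp cannot fetch. A minimal sketch of the fix using urllib.parse.urljoin, with a hypothetical example value standing in for the real list:

from urllib.parse import urljoin

BASE_URL = "https://www.zanghaihua.org/mingchaonaxieshier/"

# hypothetical example; the real list comes from get_every_chapter_url()
relative_hrefs = ["/mingchaonaxieshier/1234.html"]
full_urls = [urljoin(BASE_URL, href) for href in relative_hrefs]
print(full_urls)  # ['https://www.zanghaihua.org/mingchaonaxieshier/1234.html']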
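
Another request-side pitfall: download() fires every chapter request at once, and several hundred simultaneous hits is exactly the kind of traffic that gets a crawler throttled or banned, which then trips the while 1 retry loop over and over. A minimal sketch of a capped-concurrency variant, assuming a limit of 10 in-flight requests is acceptable to the site (the number is an assumption; tune it as needed):

async def download(href_list, limit=10):
    sem = asyncio.Semaphore(limit)  # at most `limit` requests in flight at once

    async def limited(href):
        async with sem:  # wait for a free slot before firing the request
            await download_one(href)

    tasks = [asyncio.create_task(limited(href)) for href in href_list]
    await asyncio.wait(tasks)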