# -*- coding: utf-8 -*-
import asyncio
import os
import time

import aiofiles
import aiohttp
import requests
from lxml import etree
async def download_one(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36 Edg/115.0.1901.183",
        "Referer": "https://www.zanghaihua.org/mingchaonaxieshier/",
        "Cookie": "fontFamily=null; fontColor=null; fontSize=null; bg=null; __51cke__=; font=middle; __tins__21535041=%7B%22sid%22%3A%201690213073948%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201690215140987%7D; __51laig__=8"
    }
    while True:
        try:
            async with aiohttp.ClientSession() as session:
                # aiohttp's get() has no encoding parameter; pass it to resp.text()
                async with session.get(url, headers=headers) as resp:
                    page_source = await resp.text(encoding="utf-8")
                    # Parse the chapter page
                    tree = etree.HTML(page_source)
                    # [0] takes the first match; .strip() drops the stray
                    # newlines and whitespace around the title
                    title = tree.xpath("//div[@class='reader-main']/h1/text()")[0].strip()
                    # Join the text nodes with "\n" and strip full-width spaces (\u3000)
                    content = "\n".join(tree.xpath("//div[@class='content']/text()")).replace("\u3000", "")
                    async with aiofiles.open(f"./明朝那些事儿/{title}.txt", mode="w", encoding="utf-8") as f:
                        await f.write(content)
                    break  # exit the retry loop on success; otherwise try again
        except Exception:
            print("Request failed, retrying", url)
    print("Download finished", url)
async def download(href_list):
    tasks = []
    for href in href_list:
        t = asyncio.create_task(download_one(href))
        tasks.append(t)
    await asyncio.wait(tasks)
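
# A hedged alternative to asyncio.wait(): asyncio.gather() propagates the first
# exception and returns results in order. Sketch only (download_gather is an
# assumed name, not in the original); the script above keeps wait().
async def download_gather(href_list):
    await asyncio.gather(*(download_one(href) for href in href_list))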
def get_every_chapter_url(url):
    # Fetch the table of contents, retrying up to two times on failure
    for i in range(2):
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.82"
            }
            # Add verify=False only if the site's SSL certificate causes errors
            resp = requests.get(url, headers=headers)
            resp.encoding = "gbk"  # the TOC page is GBK-encoded
            tree = etree.HTML(resp.text)
            # XPath located by inspecting the element in the browser dev tools (F12)
            href_list = tree.xpath('//ul[@id="section-list"]/li/a/@href')
            print(href_list)
            return href_list
        except Exception:
            print("Request failed, retrying", url)
            time.sleep(3)
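
# The hrefs scraped from the TOC may be site-relative. A hedged helper sketch
# (to_absolute is an assumed name, not in the original) that resolves them
# against the base URL before handing them to download():
def to_absolute(base_url, hrefs):
    from urllib.parse import urljoin
    return [urljoin(base_url, h) for h in hrefs]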
def main():
    # Grab the URL of every chapter from the index page
    url = "https://www.zanghaihua.org/mingchaonaxieshier/"
    href_list = get_every_chapter_url(url)
    if not href_list:
        print("Could not fetch the chapter list, aborting")
        return
    # Make sure the output directory exists before the workers write into it
    os.makedirs("./明朝那些事儿", exist_ok=True)
    # Start the coroutines and download the chapters
    asyncio.run(download(href_list))


if __name__ == '__main__':
    main()