# -*- coding: utf-8 -*-
"""
@Time : 2022/3/23 11:26
@Author : Andrew
@File : 协程爬小说.py
"""
import asyncio
import os

import aiofiles
import aiohttp
import requests
from lxml import etree
"""
1. Synchronous step: request getCatalog to get the cid and title of every chapter.
2. Asynchronous step: request getChapterContent to download each chapter's text.
"""
# # All chapters as {title, cid}
# url1 = 'http://dushu.baidu.com/api/pc/getDetail?data={"book_id":"4306063500"}'
# # Full content of a single chapter
# url2 = "http://dushu.baidu.com/api/pc/getChapterContent?data={'book_id':'4306063500','cid':'4306063500|1569782244'," \
# "'need_bookinfo':1} "
import json
import time
async def aiodownload(cid, b_id, title):
    """Download one chapter and write its text to ``<title>.txt``.

    Args:
        cid: Chapter id taken from the catalog response.
        b_id: Book id; the API expects the chapter id as ``"<b_id>|<cid>"``.
        title: Chapter title, used as the output file name.

    Raises:
        KeyError: If the response JSON has no ``data.novel.content`` entry.
    """
    out_dir = "./西游记小说异步爬取/"
    # Opening a file for writing fails with FileNotFoundError when the
    # directory is missing, so make sure it exists before any download.
    os.makedirs(out_dir, exist_ok=True)

    data = json.dumps({
        'book_id': b_id,
        'cid': f'{b_id}|{cid}',
        'need_bookinfo': 1,
    })
    url = f"http://dushu.baidu.com/api/pc/getChapterContent?data={data}"

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:  # async counterpart of requests.get()
            dic = await resp.json()  # suspends until the body has been read

    # The connection is released above; only the parsed JSON is needed here.
    async with aiofiles.open(out_dir + title + ".txt", mode="w", encoding="utf-8") as f:
        await f.write(dic["data"]["novel"]["content"])  # suspends while writing
    print(title, ":下载结束!")
async def getCatelog(url, b_id):
    """Fetch the chapter catalog, then download all chapters concurrently.

    Args:
        url: Catalog API URL for the book.
        b_id: Book id, forwarded to every chapter download.

    A failed chapter does not abort the others; each failure is printed once
    all downloads have finished.
    """
    # The catalog request is synchronous; it is a single call made before any
    # download coroutine is scheduled. `with` closes the response even if
    # JSON decoding raises.
    with requests.get(url) as resp:
        dic = resp.json()

    items = dic["data"]["novel"]["items"]
    titles = [item["title"] for item in items]
    tasks = [aiodownload(item["cid"], b_id, item["title"]) for item in items]

    # Author's timing note: awaiting the downloads one by one took ~23.7 s,
    # running them concurrently ~1.4 s.
    # gather() accepts bare coroutines and an empty list, whereas
    # asyncio.wait() rejects coroutines on Python 3.11+ and raises ValueError
    # on an empty collection. return_exceptions=True keeps the best-effort
    # behaviour: one failing chapter does not cancel the rest.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    for title, result in zip(titles, results):
        # BaseException so that a cancelled download is reported as well.
        if isinstance(result, BaseException):
            print(title, ":下载失败!", repr(result))
if __name__ == "__main__":
    b_id = "4306063500"
    # The book id must appear inside double quotes in the query JSON.
    # str(4306063500) only yields the bare digits 4306063500, not
    # "4306063500", so the quotes are part of the URL template itself:
    #   'http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"'+str(4306063500)+'"}'
    url = f'http://dushu.baidu.com/api/pc/getCatalog?data={{"book_id":"{b_id}"}}'

    started = time.time()
    asyncio.run(getCatelog(url, b_id))  # run the async pipeline to completion
    # Report the total wall-clock time in seconds.
    print(time.time() - started)