Concurrent Crawling with Python Async IO
Python's async IO library: asyncio
import asyncio

# Get the event loop
loop = asyncio.get_event_loop()

# Define a coroutine
async def myfunc(url):
    await get_url(url)  # get_url stands in for some async fetch function

# Build the task list
tasks = [loop.create_task(myfunc(url)) for url in urls]

# Run the crawl tasks to completion
loop.run_until_complete(asyncio.wait(tasks))
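Since Python 3.7 the same pattern is usually written with asyncio.run and asyncio.gather, which create and close the event loop for you. A minimal sketch, where get_url and urls are the same placeholders as above:

import asyncio

async def myfunc(url):
    await get_url(url)  # placeholder async fetch function

async def main():
    # gather schedules all coroutines concurrently and waits for all of them
    await asyncio.gather(*(myfunc(url) for url in urls))

asyncio.run(main())  # creates, runs, and closes the event loop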
Note:
- Any library used inside async IO code must itself support async IO.
- For crawling, requests does not support async, so use aiohttp instead (or wrap the blocking call in a thread, as sketched below).
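If a dependency only offers a blocking API, one workaround is to push each call onto a worker thread with asyncio.to_thread (Python 3.9+), so the event loop is not stalled. A minimal sketch, assuming requests is installed and urls is defined as elsewhere in this post:

import asyncio
import requests

async def fetch(url):
    # requests.get blocks, so run it in a thread instead of the event loop
    response = await asyncio.to_thread(requests.get, url)
    return response.text

async def main():
    pages = await asyncio.gather(*(fetch(url) for url in urls))
    print([len(page) for page in pages])

asyncio.run(main())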
A simple example:
import asyncio
import aiohttp
import time

urls = [f"https://www.cnblogs.com/#p{page}" for page in range(1, 51)]

async def async_craw(url):
    # One session per task keeps the example simple
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            result = await response.text()
            print(url, len(result))

loop = asyncio.get_event_loop()
tasks = [loop.create_task(async_craw(url)) for url in urls]

start = time.time()
loop.run_until_complete(asyncio.wait(tasks))
end = time.time()
print("it cost:", end - start)
