aiohttp+多任务异步协程实现异步爬虫
首先构建被爬取的网页,这里使用 django 渲染几个简单的页面,其中 one/two/three 三个视图函数各 sleep 2 秒,用来模拟数据爬取的等待时间(index 视图立即返回)
django视图页面
from django.shortcuts import render
from django.http import HttpResponse
import time
def _slow_page(message):
    """Simulate a slow endpoint: block for 2 seconds, then respond.

    The blocking time.sleep is intentional -- it gives the async crawler
    demo something slow to download.
    """
    time.sleep(2)
    return HttpResponse(message)


def index(request):
    """Root view: responds immediately with a greeting."""
    return HttpResponse('hello world')


def one(request):
    """Slow demo view: waits 2 s, then returns 'hello one'."""
    return _slow_page('hello one')


def two(request):
    """Slow demo view: waits 2 s, then returns 'hello two'."""
    return _slow_page('hello two')


def three(request):
    """Slow demo view: waits 2 s, then returns 'hello three'."""
    return _slow_page('hello three')
异步爬虫
import aiohttp  # async HTTP client; provides ClientSession
import asyncio
import requests  # only referenced by the commented-out synchronous example below
import time

start = time.time()
urls = [
    "http://127.0.0.1:8000/one",
    "http://127.0.0.1:8000/two",
    "http://127.0.0.1:8000/three",
]


async def get_page(url):
    """Download *url* asynchronously and print its text body.

    requests.get() is synchronous and would serialize the three downloads,
    so an async-capable client (aiohttp) must be used instead:
    # response = requests.get(url)
    """
    print('开始下载......')
    async with aiohttp.ClientSession() as session:
        # session.get() already returns an async context manager; no extra
        # `await` is needed before `async with` (the original had one).
        async with session.get(url) as response:
            # text() -> str, read() -> bytes, json() -> parsed JSON;
            # all of them are coroutines and must be awaited.
            page_text = await response.text()
            print(page_text)
    print('下载结束')


async def main():
    """Fetch every URL concurrently and wait for all of them to finish."""
    await asyncio.gather(*(get_page(url) for url in urls))


# asyncio.run() creates, runs, and closes the event loop itself; the older
# get_event_loop()/ensure_future()/run_until_complete(asyncio.wait(...))
# pattern is deprecated (get_event_loop warns since 3.10, and asyncio.wait
# rejects bare coroutines since 3.11).
asyncio.run(main())

end = time.time()
print(end - start)

浙公网安备 33010602011771号