aiohttp+多任务异步协程实现异步爬虫

首先构建被爬取的网页:这里使用 Django 渲染三个简单的页面,每个视图函数都 sleep 2 秒,用来模拟数据爬取的等待时间。

django视图页面

from django.shortcuts import render
from django.http import HttpResponse
import time


def index(request):
    """Root view: responds immediately with a plain-text greeting."""
    greeting = 'hello world'
    return HttpResponse(greeting)


def one(request):
    """Slow endpoint: block for 2 seconds, then reply."""
    time.sleep(2)  # simulate the latency of a real data fetch
    body = 'hello one'
    return HttpResponse(body)


def two(request):
    """Slow endpoint: block for 2 seconds, then reply."""
    time.sleep(2)  # simulate the latency of a real data fetch
    body = 'hello two'
    return HttpResponse(body)


def three(request):
    """Slow endpoint: block for 2 seconds, then reply."""
    time.sleep(2)  # simulate the latency of a real data fetch
    body = 'hello three'
    return HttpResponse(body)

异步爬虫

import aiohttp  # 使用该模块中的ClientSession
import asyncio
import requests
import time

# Wall-clock start time; elapsed total is printed at the end of the script.
start = time.time()

# The three deliberately slow endpoints served by the Django app above.
urls = ["http://127.0.0.1:8000/one", "http://127.0.0.1:8000/two", "http://127.0.0.1:8000/three"]


async def get_page(url):
    """Download *url* asynchronously and print its body.

    A synchronous client such as requests.get() would block the event
    loop and serialize the downloads, so the async aiohttp client is
    used instead.
    """
    print('开始下载......')
    # session.get() returns an async context manager directly, so the
    # extra `await` the original had in `async with await session.get(...)`
    # is unnecessary.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # response.text() -> str, response.read() -> bytes,
            # response.json() -> parsed JSON; each must be awaited.
            page_text = await response.text()
            print(page_text)
    print('下载结束')


# Run all downloads concurrently.
#
# The original pattern — asyncio.ensure_future() outside a running loop,
# then get_event_loop().run_until_complete(asyncio.wait(tasks)) — is
# deprecated and raises on Python 3.10+ (ensure_future requires a running
# event loop). asyncio.run() + asyncio.gather() is the modern equivalent.
async def main():
    # gather() schedules every coroutine concurrently and waits for all.
    await asyncio.gather(*(get_page(url) for url in urls))


asyncio.run(main())

end = time.time()
print(end - start)



posted @ 2020-06-25 23:37  bibicode  阅读(334)  评论(0)    收藏  举报