爬虫之单线程多任务异步抓取

协程

import asyncio
import time
#定义了一个特殊的函数
#特殊:调用后会返回一个协程对象,且函数内部的实现语句不会被立即执行
#创建一个协程对象
# async def test(num):
#     print(num)
#
# c = test(10)
# print(c)


#封装一个任务对象
# async def test(num):
#     print(num)
#
# c = test(10)
# #根据协程对象封装了一个任务对象
# task = asyncio.ensure_future(c)
# print(task)

#事件循环对象
async def request(url, delay=2):
    """Simulate requesting *url*, taking *delay* seconds.

    Calling this function only creates a coroutine object; the body runs once
    that coroutine is scheduled on an event loop.

    Args:
        url: Address shown in the progress messages.
        delay: Seconds to wait in place of a real network round trip.
    """
    print('正在请求:',url)
    # asyncio.sleep() suspends only this coroutine; time.sleep() would block
    # the thread that runs the event loop.
    await asyncio.sleep(delay)
    print('请求完毕!',url)


# Build the coroutine object; nothing inside request() has run yet.
c1 = request('www.1.com')

# Create the event loop explicitly. Calling ensure_future()/get_event_loop()
# with no loop set relies on implicit loop creation, which is deprecated
# since Python 3.10 and raises RuntimeError on the newest releases.
loop = asyncio.new_event_loop()
try:
    # Wrap the coroutine in a task object bound to that loop.
    task_A = asyncio.ensure_future(c1, loop=loop)
    # Register the task with the loop and run the loop until the task is done.
    loop.run_until_complete(task_A)
finally:
    loop.close()

任务对象绑定回调

import asyncio
import time

async def request(url, delay=2):
    """Simulate requesting *url*, taking *delay* seconds, and return *url*.

    Args:
        url: Address shown in the progress messages.
        delay: Seconds to wait in place of a real network round trip.

    Returns:
        The *url* argument unchanged; it becomes the owning task's result().
    """
    print('正在请求:',url)
    # asyncio.sleep() suspends only this coroutine; time.sleep() would block
    # the thread that runs the event loop.
    await asyncio.sleep(delay)
    print('请求完毕!',url)

    return url


# Callback to attach to a task object.
def task_callback(task):
    """Announce the callback, then print the result of the finished task.

    Args:
        task: The task object this callback was bound to. Its result() is
            the value returned by the coroutine function the task wraps.
    """
    print('i am task_callback()')
    outcome = task.result()
    print(outcome)

# Build the coroutine object; nothing inside request() has run yet.
c = request('www.xxx.com')

# Create the event loop explicitly instead of relying on implicit loop
# creation (deprecated since Python 3.10, RuntimeError on the newest releases).
loop = asyncio.new_event_loop()
try:
    task = asyncio.ensure_future(c, loop=loop)
    # task_callback is invoked with the task once it has finished.
    task.add_done_callback(task_callback)
    loop.run_until_complete(task)
finally:
    loop.close()

多任务异步协程

import asyncio
import time
start = time.time()
# Code that does not support async must not appear inside a coroutine function.
async def request(url, delay=2):
    """Simulate requesting *url*, taking *delay* seconds, and return *url*.

    Args:
        url: Address shown in the progress messages.
        delay: Seconds to wait in place of a real network round trip.

    Returns:
        The *url* argument unchanged; it becomes the owning task's result().
    """
    print('正在请求:',url)
    # time.sleep() does not support async, so it is not used here; a blocking
    # step has to be suspended with the await keyword.
    await asyncio.sleep(delay)
    print('请求完毕!',url)

    return url

# Addresses handed to request(), one task per entry: www.1.com .. www.3.com.
urls = ['www.%d.com' % index for index in range(1, 4)]
def task_callback(task):
    """Print the result of the finished *task* this callback was bound to."""
    finished_value = task.result()
    print(finished_value)

tasks = []  # task list: holds one task object per url

# Create the event loop explicitly instead of relying on implicit loop
# creation (deprecated since Python 3.10, RuntimeError on the newest releases).
loop = asyncio.new_event_loop()
try:
    for url in urls:
        c = request(url)
        task = asyncio.ensure_future(c, loop=loop)
        task.add_done_callback(task_callback)
        tasks.append(task)  # collect every task object in the task list

    # Register all tasks at once: asyncio.wait() gives a single awaitable
    # that, by default, completes once every task in the list is done.
    loop.run_until_complete(asyncio.wait(tasks))
finally:
    loop.close()

# Elapsed seconds since `start` was taken.
print(time.time()-start)

多任务异步爬虫

import asyncio
import time
import requests
start = time.time()
# Code that does not support async must not run directly inside a coroutine
# function.
async def request(url):
    """Fetch *url* with requests without blocking the event loop.

    requests.get() is a blocking call; executing it directly in the coroutine
    would stall the loop so the tasks would run one after another. It is
    handed to the loop's default executor and awaited instead.

    Args:
        url: Address to fetch.

    Returns:
        The response body as text (response.text).
    """
    print('正在请求:',url)
    # Inside a running coroutine this returns the loop that is executing it.
    loop = asyncio.get_event_loop()
    response = await loop.run_in_executor(None, requests.get, url)
    return response.text

# Pages served by the local test server on port 5000, one task per entry.
urls = ['http://127.0.0.1:5000/' + name for name in ('bobo', 'tom', 'jay')]

def parse(task):
    """Done-callback: print the page text held by *task* plus a fixed suffix.

    Args:
        task: Finished task whose result() is the fetched page text.
    """
    suffix = ',请求到的数据!!!'
    print(task.result() + suffix)

tasks = []

# Create the event loop explicitly instead of relying on implicit loop
# creation (deprecated since Python 3.10, RuntimeError on the newest releases).
loop = asyncio.new_event_loop()
try:
    for url in urls:
        c = request(url)
        task = asyncio.ensure_future(c, loop=loop)
        # parse is invoked with the task once its page has been fetched.
        task.add_done_callback(parse)
        tasks.append(task)

    # Run the loop until every task in the list is done.
    loop.run_until_complete(asyncio.wait(tasks))
finally:
    loop.close()


# Elapsed seconds since `start` was taken.
print(time.time()-start)

aiohttp使用

# import asyncio
# import time
# import aiohttp
# start = time.time()
在特殊函数内部不可以出现不支持异步模块相关的代码
简单的基本架构:
async def request(url):
    """Fetch *url* with aiohttp and return the response body as text.

    Both context managers are entered with ``async with`` and the body read
    is awaited; with a plain ``with`` and an un-awaited ``response.text()``
    this skeleton cannot run.

    Args:
        url: Address to fetch.

    Returns:
        The response body as a string.
    """
    async with aiohttp.ClientSession() as s:
        # s.get/s.post are used almost like requests.get/post:
        # url, headers, data/params.
        # To go through a proxy, pass proxy="http://ip:port" to s.get.
        async with s.get(url) as response:
            # response.text() gives the body as a string, response.read()
            # gives it as bytes; either one has to be awaited.
            page_text = await response.text()
            return page_text
在当前架构的基础上补充细节即可
    细节1:在每一个with前加上async关键字
    细节2:在get方法前和response.text()前加上await关键字进行手动挂起操作
# async def request(url):
   # async with aiohttp.ClientSession() as s:
       # s.get/post和requests中的get/post用法几乎一样:url,headers,data/params
       # 在s.get中如果使用代理操作:proxy="http://ip:port"
       # async with await s.get(url) as response:
           # 获取字符串形式的响应数据:response.text()
           # 获取byte类型的:response.read()
           # page_text = await response.text()
           # return page_text

# The three local test pages, repeated three times (nine requests in total).
urls = ['http://127.0.0.1:5000/' + name for name in ('bobo', 'tom', 'jay')] * 3
# urls = []
# for i in range(500):
    # urls.append('http://127.0.0.1:5000/bobo')
# def parse(task):
    # page_text = task.result()
    # print(page_text+',请求到的数据!!!')

# tasks = []
# for url in urls:
    # c = request(url)
    # task = asyncio.ensure_future(c)
    # task.add_done_callback(parse)
    # tasks.append(task)

# loop = asyncio.get_event_loop()
# loop.run_until_complete(asyncio.wait(tasks))
# print(time.time()-start)

案例

import aiohttp
import asyncio
from lxml import etree

# Every title extracted by parse() is appended here.
all_titles = []

# Request headers passed to each s.get() call.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.77 Safari/537.36'
    ),
}
async def request(url):
    """Fetch *url* with the module-level headers and return the body text.

    A new ClientSession is opened for the call and closed when it finishes.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a string.
    """
    session = aiohttp.ClientSession()
    async with session:
        response = await session.get(url, headers=headers)
        async with response:
            return await response.text()

# URL template; %d takes the value of the `page` query parameter.
url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page=%d'
# 100 listing pages; the `page` parameter advances in steps of 30
# (0, 30, ..., 2970).
urls = [url % (page * 30) for page in range(100)]

tasks = []
def parse(task):
    """Done-callback: extract the titles from one fetched listing page.

    Every title found is printed and appended to the module-level
    ``all_titles`` list.

    Args:
        task: Finished task whose result() is the page HTML as text.
    """
    page_text = task.result()
    try:
        # GB2312 -> GBK round trip, applied whenever the text allows it.
        page_text = page_text.encode('gb2312').decode('gbk')
    except UnicodeError:
        # Characters outside GB2312 cannot be round-tripped; parse the text
        # unchanged instead of losing the whole page to an exception.
        pass
    tree = etree.HTML(page_text)
    tr_list = tree.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
    for tr in tr_list:
        matches = tr.xpath('./td[2]/a[2]/text()')
        if not matches:
            # Row without a title link: indexing [0] would raise IndexError
            # and abort the rest of the page, so skip the row.
            continue
        title = matches[0]
        print(title)
        all_titles.append(title)

# Create the event loop explicitly instead of relying on implicit loop
# creation (deprecated since Python 3.10, RuntimeError on the newest releases).
loop = asyncio.new_event_loop()
try:
    for url in urls:
        c = request(url)
        task = asyncio.ensure_future(c, loop=loop)
        # parse is invoked with the task once its page has been fetched.
        task.add_done_callback(parse)
        tasks.append(task)

    # Run the loop until every task in the list is done.
    loop.run_until_complete(asyncio.wait(tasks))
finally:
    loop.close()

 

posted @ 2019-09-30 14:53  叫我大表哥  阅读(324)  评论(0)  编辑  收藏  举报