异步IO
greenlet
greenlet是一个用C实现的协程模块,相比于Python自带的yield,它可以使你在任意函数之间随意切换,而不需把这个函数先声明为generator
# Manual switching with greenlet
# -*- coding:utf-8 -*-
from greenlet import greenlet


def test1():
    print(12)
    gr2.switch()   # hand control to gr2
    print(34)
    gr2.switch()


def test2():
    print(56)
    gr1.switch()   # hand control back to gr1
    print(78)


gr1 = greenlet(test1)
gr2 = greenlet(test2)
gr1.switch()
# Output: 12 56 34 78

# Automatic switching with gevent
import gevent


def func1():
    print('\033[31;1mt1-t1...\033[0m')
    gevent.sleep(2)
    print('\033[31;1mt2-t3...\033[0m')


def func2():
    print('\033[32;1mt3-t4...\033[0m')
    gevent.sleep(1)
    print('\033[32;1mt4-t5...\033[0m')


gevent.joinall([
    gevent.spawn(func1),
    gevent.spawn(func2),
    # gevent.spawn(func3),
])
# NOTE: on every (simulated) I/O wait, gevent automatically switches to the
# next runnable greenlet.
gevent 协程, 用户态的轻量级线程
- 无需线程上下文切换的开销
- 无需原子操作锁定及同步的开销
"原子操作(atomic operation)是不需要synchronized",所谓原子操作是指不会被线程调度机制打断的操作;这种操作一旦开始,就一直运行到结束,中间不会有任何 context switch (切换到另一个线程)。原子操作可以是一个步骤,也可以是多个操作步骤,但是其顺序是不可以被打乱,或者切割掉只执行部分。视作整体是原子性的核心。
方便切换控制流,简化编程模型
- 高并发+高扩展性+低成本:一个CPU支持上万的协程都不是问题。所以很适合用于高并发处理。
from gevent import monkey

# Mark every blocking stdlib I/O call (sockets, urllib, ...) so gevent can
# switch greenlets on it; without this patch gevent cannot detect urllib's
# blocking and the three fetches below would run serially.
monkey.patch_all()

import gevent
from urllib.request import urlopen


def f(url):
    """Fetch one URL and report how many bytes came back."""
    print('GET: %s' % url)
    resp = urlopen(url)
    data = resp.read()
    print('%d bytes received from %s.' % (len(data), url))


gevent.joinall([
    gevent.spawn(f, 'https://www.python.org/'),  # f is the callable, the URL its argument
    gevent.spawn(f, 'https://www.yahoo.com/'),
    gevent.spawn(f, 'https://github.com/'),
])
示例(通过gevent实现单线程下的多socket并发)
# --- server side ----------------------------------------------------------
import sys
import socket
import time
import gevent
from gevent import socket, monkey

monkey.patch_all()  # make socket calls cooperative so each connection yields on I/O


def server(port):
    """Accept connections forever, handling each client in its own greenlet."""
    s = socket.socket()
    s.bind(('0.0.0.0', port))
    s.listen(500)
    while True:
        cli, addr = s.accept()
        gevent.spawn(handle_request, cli)


def handle_request(conn):
    """Echo everything the client sends until it disconnects."""
    try:
        while True:
            data = conn.recv(1024)
            if not data:
                # BUG FIX: recv() returning b'' means the peer closed the
                # connection.  The original shut down the write side but kept
                # looping, busy-spinning on b'' forever; break out instead.
                conn.shutdown(socket.SHUT_WR)
                break
            print("recv:", data)
            conn.send(data)
    except Exception as ex:
        print(ex)
    finally:
        conn.close()


if __name__ == '__main__':
    server(8001)

# --- client side (separate program) ---------------------------------------
# import socket
# HOST = 'localhost'   # The remote host
# PORT = 8001          # The same port as used by the server
# s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# s.connect((HOST, PORT))
# while True:
#     msg = bytes(input(">>:"), encoding="utf8")
#     s.sendall(msg)
#     data = s.recv(1024)
#     print('Received', repr(data))
# s.close()

# --- 100 concurrent connections (separate program) ------------------------
# import socket
# import threading
# def sock_conn():
#     client = socket.socket()
#     client.connect(("localhost", 8001))
#     count = 0
#     while True:
#         client.send(("hello %s" % count).encode("utf-8"))
#         data = client.recv(1024)
#         print("[%s]recv from server:" % threading.get_ident(), data.decode())
#         count += 1
#     client.close()
# for i in range(100):
#     t = threading.Thread(target=sock_conn)
#     t.start()
线程&进程
- 线程:计算机中调度执行的最小单元(适合IO密集型请求)
- 进程:默认有主线程,可以多线程共存,并且共享内部资源(计算密集型)
- 协程(微线程):使用进程中的一个线程去做多个任务
- GIL 全局解释器锁,CPython解释器特有,用于在一个进程内对所有线程加锁,保证同一时刻只有一个线程被CPU调度
在执行爬虫的时候,性能消耗主要在io请求中,单线程请求url会引起等待,如下代码
import requests


def get_url(url):
    """Fetch *url* synchronously and print the raw response body."""
    response = requests.get(url)
    print(response.content)


url_list = ['http://www.github.com', 'http://www.bing.com']
for url in url_list:
    get_url(url)  # serial: each request blocks until the previous one finishes
使用多线程可提高效率,使用python3自带的ThreadPoolExecutor模块可指定执行的线程数量(默认情况下threading 模块不能指定线程数量,需要配合queue模块,看下面)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from concurrent.futures import ThreadPoolExecutor

import requests


def get_url(url):
    """Fetch *url* and print the outcome; never propagates an exception."""
    try:
        response = requests.get(url)
        print("获取结果", url, response.content)
    except Exception as e:
        # BUG FIX: the original printed the Exception *class* object instead
        # of the caught instance `e`, so the actual error was never shown.
        print("异常结果", url, e)


url_list = [
    'http://www.github.com',
    'http://www.bing.com',
    'http://www.baidu.com',
    'http://www.google.com',  # if unreachable, the except branch reports it
]

pool = ThreadPoolExecutor(5)  # at most 5 worker threads
for url in url_list:
    print("开始请求", url)
    pool.submit(get_url, url)  # get_url is the callable, url its argument
pool.shutdown(wait=True)  # block until every submitted request has finished
使用多进程方式发起请求,只需将ThreadPoolExecutor模块改为ProcessPoolExecutor模块
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from concurrent.futures import ProcessPoolExecutor

import requests


def get_url(url):
    """Fetch *url* and print the outcome; never propagates an exception."""
    try:
        response = requests.get(url)
        print("获取结果", url, response.content)
    except Exception as e:
        # BUG FIX: the original printed the Exception *class*, not the
        # caught instance `e`.
        print("异常结果", url, e)


if __name__ == '__main__':
    # The __main__ guard is required for ProcessPoolExecutor on platforms
    # that spawn workers by re-importing this module (Windows, macOS).
    url_list = [
        'http://www.github.com',
        'http://www.bing.com',
        'http://www.baidu.com',
        'http://www.google.com',
    ]
    # process pool — the original comment mistakenly called it a thread pool
    pool = ProcessPoolExecutor(5)  # up to 5 worker processes
    for url in url_list:
        print("开始请求", url)
        pool.submit(get_url, url)  # get_url is the callable, url its argument
    pool.shutdown(wait=True)  # block until every worker has finished
使用此种多线程方式可以执行回调函数
from concurrent.futures import ThreadPoolExecutor

import requests


def get_url(url):
    """Fetch *url*; the returned Response becomes the future's result."""
    response = requests.get(url)
    return response


def callback(future):
    """Invoked automatically once the associated future completes."""
    print(future.result())


url_list = ['http://www.github.com', 'http://www.bing.com']
pool = ThreadPoolExecutor(5)
for url in url_list:
    v = pool.submit(get_url, url)
    v.add_done_callback(callback)  # run the callback when the fetch is done
pool.shutdown(wait=True)
# The multiprocessing variant is identical with ProcessPoolExecutor.
总结:
在io请求中,使用多线程更好,python自带的GIL锁只是负责cpu的调度,与io请求无关
threading与queue结合实现线程池
示例
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from queue import Queue
import threading


class ThreadPool(object):
    """Naive thread "pool": a bounded queue of thread slots.

    The queue stores the ``threading.Thread`` class itself; ``get_thread``
    blocks when every slot is taken, which caps how many worker threads can
    be alive at once.
    """

    def __init__(self, max_num=20):
        # Queue(max_num) bounds the pool; pre-fill every slot.
        self.queue = Queue(max_num)
        for i in range(max_num):
            self.queue.put(threading.Thread)

    def get_thread(self):
        # Blocks when the pool is exhausted, throttling the caller.
        return self.queue.get()

    def add_thread(self):
        # Return a slot to the pool (called by a worker when it finishes).
        self.queue.put(threading.Thread)


def func(arg, p):
    """Worker body: print its argument, simulate work, then free its slot."""
    print(arg)
    import time
    time.sleep(2)
    p.add_thread()


if __name__ == '__main__':
    # FIX: the demo originally ran at module level, spawning 30 threads as a
    # side effect of merely importing this file; guard it behind __main__.
    pool = ThreadPool(10)  # cap at 10 concurrent threads
    for i in range(30):
        thread = pool.get_thread()  # blocks until a slot is free
        t = thread(target=func, args=(i, pool))
        t.start()
异步IO之asyncio
通过上述代码均可以提高请求的性能,但多线程和多进程的缺点是在IO阻塞时会造成线程和进程的浪费,所以异步IO首选asyncio
默认情况下原生asyncio不支持http请求
import asyncio  # async I/O: gives coroutines the ability to await I/O


async def func1(delay=5):
    """First coroutine: prints, then yields to the event loop for *delay* s."""
    print('before...func1......')
    print('before---func1111')
    # await asyncio.sleep() suspends only this coroutine and lets the loop
    # run others; a plain time.sleep() here would block the whole loop.
    await asyncio.sleep(delay)
    print('end...func1......')


async def func2(delay=3):
    """Second coroutine: sleeps 3 seconds by default, so it finishes first.

    (The original inline comment wrongly said 5 seconds.)
    """
    print('before...func2......')
    print('before---func22222')
    await asyncio.sleep(delay)
    print('end...func2......')


async def _main():
    # Run both coroutines concurrently on one event loop.
    await asyncio.gather(func1(), func2())


if __name__ == '__main__':
    # MODERNIZED: the original used @asyncio.coroutine / yield-from, which
    # was removed in Python 3.11; async/await is the equivalent form.
    asyncio.run(_main())
#说明:首先打印 before...func1...... 和 before---func1111,随后 func1 在 sleep(5) 处让出控制权(达到模拟IO占用的目的),切换执行 func2;func2 只 sleep 3 秒,所以先打印 end...func2......,最后 func1 在 5 秒结束后打印 end...func1......
原生asyncio实现http支持的方法(只看逻辑)
import asyncio


async def get_url(host, url='/'):
    """Issue a bare HTTP/1.0 GET over a raw asyncio connection and dump the reply.

    MODERNIZED: the original @asyncio.coroutine / yield-from form was
    removed in Python 3.11; async/await is the equivalent replacement.
    """
    print(host, url)
    reader, writer = await asyncio.open_connection(host, 80)
    request_header_content = """GET %s HTTP/1.0\r\nHost: %s\r\n\r\n""" % (url, host,)
    request_header_content = bytes(request_header_content, encoding='utf-8')
    writer.write(request_header_content)
    await writer.drain()
    # HTTP/1.0 with no keep-alive: the server closes the socket when done,
    # so read() returns the entire response.
    text = await reader.read()
    print(host, url, text)
    writer.close()


if __name__ == '__main__':
    tasks = [
        get_url('www.cnblogs.com', '/xxxx/'),
        get_url('dig.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091'),
    ]
    loop = asyncio.get_event_loop()
    results = loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
为了解决不支持http的问题,可使用aiohttp模块
# pip3 install aiohttp
import aiohttp
import asyncio


@asyncio.coroutine
def fetch_async(url):
    """Fetch *url* through aiohttp without blocking the event loop."""
    print(url)
    response = yield from aiohttp.request('GET', url)
    # data = yield from response.read()
    # print(url, data)
    print(url, response)
    response.close()


tasks = [
    fetch_async('http://www.google.com/'),
    fetch_async('http://www.chouti.com/'),
]
event_loop = asyncio.get_event_loop()
results = event_loop.run_until_complete(asyncio.gather(*tasks))
event_loop.close()
使用requests模块实现异步io
import asyncio


async def get_url(func, *args):
    """Run the blocking callable *func(*args)* in the default executor and await it.

    MODERNIZED: the original @asyncio.coroutine / yield-from form was
    removed in Python 3.11.
    """
    loop = asyncio.get_event_loop()
    # run_in_executor pushes the blocking call (e.g. requests.get) onto a
    # worker thread, so the event loop itself never blocks on the HTTP I/O.
    future = loop.run_in_executor(None, func, *args)
    response = await future
    print(response.url, response.content)


if __name__ == '__main__':
    import requests  # imported here so the module stays importable without requests

    tasks = [
        get_url(requests.get, 'http://www.cnblogs.com/xxx/'),
        get_url(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091'),
    ]
    loop = asyncio.get_event_loop()
    results = loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
异步IO之gevent
gevent+request示例(io异步)
import gevent
import requests
from gevent import monkey

monkey.patch_all()  # make requests' underlying socket calls cooperative


def get_url(method, url, req_kwargs):
    """Issue one HTTP request and print the final URL plus the body."""
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)


# ##### send the requests #####
gevent.joinall([
    gevent.spawn(get_url, method='get', url='https://www.python.org/', req_kwargs={}),
    gevent.spawn(get_url, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    gevent.spawn(get_url, method='get', url='https://github.com/', req_kwargs={}),
])

# send the requests through a pool that caps concurrent greenlets
# from gevent.pool import Pool
# pool = Pool(None)  # None = unlimited; e.g. 3 would run at most three at a time
# gevent.joinall([
#     pool.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
#     pool.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
#     pool.spawn(fetch_async, method='get', url='https://www.github.com/', req_kwargs={}),
# ])
grequest 异步io(将上面代码封装成一个模块)
#pip3 install grequests
import grequests

request_list = [
    grequests.get('http://httpbin.org/delay/1', timeout=0.001),  # GET request
    grequests.get('http://fakedomain/'),
    grequests.get('http://httpbin.org/status/500'),
]

# send them all and collect the list of responses
# response_list = grequests.map(request_list)  # iterates request_list for us
# print(response_list)

# same, but with an exception handler for failed requests
# def exception_handler(request, exception):
#     print(request, exception)
#     print("Request failed")
# response_list = grequests.map(request_list, exception_handler=exception_handler)
# print(response_list)

浙公网安备 33010602011771号