异步IO

greenlet

greenlet是一个用C实现的协程模块,相比于Python自带的yield,它可以使你在任意函数之间随意切换,而不需要把这个函数先声明为generator

手动切换

# -*- coding:utf-8 -*-
from greenlet import greenlet


def worker_a():
    """First coroutine: print 12, hand control over, resume to print 34."""
    print(12)
    coro_b.switch()
    print(34)
    coro_b.switch()


def worker_b():
    """Second coroutine: print 56, hand control back, resume to print 78."""
    print(56)
    coro_a.switch()
    print(78)


coro_a = greenlet(worker_a)
coro_b = greenlet(worker_b)
coro_a.switch()  # kick off the ping-pong; output order: 12, 56, 34, 78

打印结果:
12
56
34
78


自动切换:

import gevent


def task_one():
    print('\033[31;1mt1-t1...\033[0m')
    gevent.sleep(2)  # simulated blocking IO: gevent switches to another greenlet here
    print('\033[31;1mt2-t3...\033[0m')


def task_two():
    print('\033[32;1mt3-t4...\033[0m')
    gevent.sleep(1)
    print('\033[32;1mt4-t5...\033[0m')


# Spawn both greenlets and block until every one of them has finished.
gevent.joinall([
    gevent.spawn(task_one),
    gevent.spawn(task_two),
])

####注解:遇到io延迟会自动切换到下一个io操作

 

gevent 协程, 用户态的轻量级线程

  • 无需线程上下文切换的开销
  • 无需原子操作锁定及同步的开销

  "原子操作(atomic operation)是不需要synchronized",所谓原子操作是指不会被线程调度机制打断的操作;这种操作一旦开始,就一直运行到结束,中间不会有任何 context switch (切换到另一个线程)。原子操作可以是一个步骤,也可以是多个操作步骤,但是其顺序是不可以被打乱,或者切割掉只执行部分。视作整体是原子性的核心。
方便切换控制流,简化编程模型

  • 高并发+高扩展性+低成本:一个CPU支持上万的协程都不是问题。所以很适合用于高并发处理。
from gevent import monkey

monkey.patch_all()  # patch blocking stdlib IO *before* importing socket users
import gevent
from urllib.request import urlopen


def f(url):
    """Download url and report how many bytes came back."""
    print('GET: %s' % url)
    resp = urlopen(url)
    data = resp.read()
    print('%d bytes received from %s.' % (len(data), url))


# The three downloads run concurrently: each greenlet yields while waiting on IO.
gevent.joinall([
    gevent.spawn(f, 'https://www.python.org/'),
    gevent.spawn(f, 'https://www.yahoo.com/'),
    gevent.spawn(f, 'https://github.com/'),
])

 

示例(通过gevent实现单线程下的多socket并发)

server端:

import sys
import socket
import time
import gevent
 
from gevent import socket,monkey
monkey.patch_all()
 
 
def server(port):
    """Listen on 0.0.0.0:port forever; each client gets its own greenlet."""
    listener = socket.socket()
    listener.bind(('0.0.0.0', port))
    listener.listen(500)
    while True:
        client_sock, _addr = listener.accept()
        gevent.spawn(handle_request, client_sock)
 
 
 
def handle_request(conn):
    """Echo every chunk received on conn back to the sender.

    Runs until the peer closes its end (recv returns b''), then shuts
    down the write side and closes the socket.  Socket errors are
    printed rather than propagated so one bad client cannot kill the
    accept loop that spawned us.
    """
    try:
        while True:
            data = conn.recv(1024)
            if not data:
                # Peer closed the connection: stop echoing.  The original
                # kept looping, re-sending on a half-closed socket until
                # an exception finally broke it out.
                conn.shutdown(socket.SHUT_WR)
                break
            print("recv:", data)
            conn.send(data)
    except Exception as ex:
        print(ex)
    finally:
        conn.close()
if __name__ == '__main__':
    # Start the echo server on port 8001 when executed as a script.
    server(8001)


client端:

import socket

HOST = 'localhost'    # The remote host
PORT = 8001           # The same port as used by the server

# 'with' guarantees the socket is closed even if input()/recv raises;
# the original called s.close() *after* 'while True', so it never ran.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.connect((HOST, PORT))
    while True:
        msg = bytes(input(">>:"), encoding="utf8")
        s.sendall(msg)
        data = s.recv(1024)
        if not data:
            # Empty read means the server closed the connection.
            break
        print('Received', repr(data))


并发100个链接

import socket
import threading


def sock_conn():
    """One stress client: connect to the echo server and ping it forever."""
    conn = socket.socket()
    conn.connect(("localhost", 8001))
    count = 0
    while True:
        conn.send(("hello %s" % count).encode("utf-8"))
        reply = conn.recv(1024)
        print("[%s]recv from server:" % threading.get_ident(), reply.decode())
        count += 1
    conn.close()  # NOTE: unreachable — the loop above never exits


# Launch 100 concurrent client connections, one thread each.
for i in range(100):
    threading.Thread(target=sock_conn).start()

 

线程&进程

  • 线程:计算机中工作的最小单元(适合io请求)
  • 进程:默认有主线程,可以多线程共存,并且共享内部资源(计算密集型)
  • 协程(微线程):使用进程中的一个线程去做多个任务
  • GIL 解释器锁,python特有,用于在进程中对所有的线程加锁,保证同一时刻只有一个线程被cpu调度

 

在执行爬虫的时候,性能消耗主要在io请求中,单线程请求url会引起等待,如下代码

import requests


def get_url(url):
    """Fetch url synchronously and dump the response body."""
    response = requests.get(url)
    print (response.content)


# Each request blocks until it finishes: purely serial execution.
for url in ['http://www.github.com', 'http://www.bing.com']:
    get_url(url)
单线程同步执行

 

使用多线程可提高效率,使用python3自带的ThreadPoolExecutor模块可指定执行的线程数量(默认情况下threading 模块不能指定线程数量,需要配合queue模块,见下文示例)。

#!/usr/bin/env python
#-*-coding:utf-8-*-

from concurrent.futures import ThreadPoolExecutor
import requests


def get_url(url):
    """Fetch url and print either the response body or the error."""
    try:
        response = requests.get(url)
        print ("获取结果",url,response.content)
    except Exception as e:
        # BUG FIX: the original printed the Exception *class* instead of
        # the caught instance 'e', so the actual error message was lost.
        print ("异常结果",url,e)


url_list = [
    'http://www.github.com',
    'http://www.bing.com',
    'http://www.baidu.com',
    'http://www.google.com',   # unreachable hosts are reported by the except branch
]

# Thread pool with 5 worker threads; submit() returns immediately.
pool = ThreadPoolExecutor(5)

for url in url_list:
    print ("开始请求",url)
    pool.submit(get_url, url)

pool.shutdown(wait=True)   # block until every submitted task has finished
多线程异步执行

 

使用多进程方式发起请求,只需将ThreadPoolExecutor模块改为ProcessPoolExecutor模块

#!/usr/bin/env python
#-*-coding:utf-8-*-

from concurrent.futures import ProcessPoolExecutor
import requests


def get_url(url):
    """Fetch url in a worker process and print the body or the error."""
    try:
        response = requests.get(url)
        print ("获取结果",url,response.content)
    except Exception as e:
        # BUG FIX: the original printed the Exception *class* instead of
        # the caught instance 'e', so the actual error message was lost.
        print ("异常结果",url,e)


url_list = [
    'http://www.github.com',
    'http://www.bing.com',
    'http://www.baidu.com',
    'http://www.google.com',
]

if __name__ == '__main__':
    # The __main__ guard is required for ProcessPoolExecutor on platforms
    # that spawn worker processes (Windows, macOS), which re-import this
    # module in each child.
    pool = ProcessPoolExecutor(5)   # pool of 5 worker *processes*

    for url in url_list:
        print ("开始请求",url)
        pool.submit(get_url, url)

    pool.shutdown(wait=True)   # block until every submitted task has finished
多进程异步执行

 

使用此种多线程方式可以执行回调函数

from concurrent.futures import ThreadPoolExecutor
import requests


def get_url(url):
    """Worker: fetch url and hand the Response object back via the future."""
    response = requests.get(url)
    return response


def callback(future):
    """Invoked when a future completes; prints its result."""
    print(future.result())


pool = ThreadPoolExecutor(5)
for url in ['http://www.github.com', 'http://www.bing.com']:
    future = pool.submit(get_url, url)
    future.add_done_callback(callback)   # fires once get_url returns
pool.shutdown(wait=True)


# The multiprocess version is identical with ProcessPoolExecutor.
多线程&回调函数执行

 

总结:

在io请求中,使用多线程更好,python自带的GIL锁只是负责cpu的调度,与io请求无关

 

threading与queue结合实现线程池

示例

#!/usr/bin/env python
# -*- coding:utf-8 -*-
from queue import Queue
import threading


class ThreadPool(object):
    """A naive "thread pool": a bounded queue of threading.Thread tokens.

    Holding at most max_num tokens caps how many threads may run at once;
    a worker puts a token back via add_thread() when it finishes.
    """

    def __init__(self, max_num=20):
        self.queue = Queue(max_num)
        for _ in range(max_num):
            self.queue.put(threading.Thread)

    def get_thread(self):
        """Take a token; blocks while all max_num slots are in use."""
        return self.queue.get()

    def add_thread(self):
        """Return a token, allowing one more thread to be started."""
        self.queue.put(threading.Thread)

pool = ThreadPool(10)        # at most 10 worker threads alive at a time


def func(arg, p):
    """Worker body: print the task id, simulate work, then free the slot."""
    print (arg)
    import time
    time.sleep(2)
    p.add_thread()   # hand the slot back so a queued task may start


for i in range(30):
    thread_cls = pool.get_thread()   # blocks while all 10 slots are busy
    thread_cls(target=func, args=(i, pool)).start()

 

异步IO之asyncio

通过上述代码均可以完成对请求性能的提高,多线程和多进程的缺点是在IO阻塞时会造成线程和进程的浪费,所以异步IO首选asyncio

 

默认情况下原生asyncio不支持http请求

import asyncio      # async IO: coroutines with non-blocking waits


async def func1():
    """Print two lines, yield to the event loop for 5 s, then finish."""
    print('before...func1......')
    print('before---func1111')
    await asyncio.sleep(5)   # time.sleep() here would block the whole loop
    print('end...func1......')


async def func2():
    print('before...func2......')
    print ('before---func22222')
    await asyncio.sleep(3)   # shorter wait, so func2 finishes first
    print('end...func2......')


async def main():
    # Run both coroutines concurrently and wait for both to finish.
    await asyncio.gather(func1(), func2())


# async/await + asyncio.run() replace the generator-based
# @asyncio.coroutine / yield from style, which was removed in Python 3.11.
asyncio.run(main())

#说明:首先打印
before...func1......和before---func1111,然后通过sleep(5)模拟io占用,于是切换去执行func2函数;由于func2只sleep 3秒,所以会先打印 end...func2......,最后才执行func1在sleep 5秒之后的代码

原生asyncio实现http支持的方法(只看逻辑)

import asyncio


async def get_url(host, url='/'):
    """Fetch http://host/url with a raw HTTP/1.0 request and print the reply."""
    print(host, url)
    reader, writer = await asyncio.open_connection(host, 80)

    request_header_content = """GET %s HTTP/1.0\r\nHost: %s\r\n\r\n""" % (url, host,)
    request_header_content = bytes(request_header_content, encoding='utf-8')

    writer.write(request_header_content)
    await writer.drain()
    text = await reader.read()   # HTTP/1.0: server closes when done, so read to EOF
    print(host, url, text)
    writer.close()


async def main():
    await asyncio.gather(
        get_url('www.cnblogs.com', '/xxxx/'),
        get_url('dig.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091'),
    )


# async/await replaces the @asyncio.coroutine decorator (removed in 3.11).
asyncio.run(main())
View Code

 

为了解决不支持http的问题,可使用aiohttp模块

pip3 install aiohttp

import aiohttp
import asyncio


async def fetch_async(url):
    """GET url with aiohttp and print the response object."""
    print(url)
    # The old coroutine-style aiohttp.request() was removed; the supported
    # API is a ClientSession used as an async context manager, which also
    # guarantees the connection is released.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # body = await response.read()
            print(url, response)


async def main():
    await asyncio.gather(
        fetch_async('http://www.google.com/'),
        fetch_async('http://www.chouti.com/'),
    )


# asyncio.run() replaces get_event_loop()/run_until_complete()/close().
asyncio.run(main())

 

使用requests模块实现异步io

import asyncio
import requests


async def get_url(func, *args):
    """Run the blocking call func(*args) in the default thread pool and await it."""
    loop = asyncio.get_running_loop()
    # run_in_executor offloads the synchronous requests call to a worker
    # thread so the event loop itself never blocks.
    response = await loop.run_in_executor(None, func, *args)
    print(response.url, response.content)


async def main():
    await asyncio.gather(
        get_url(requests.get, 'http://www.cnblogs.com/xxx/'),
        get_url(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091'),
    )


# async/await + asyncio.run replace the removed @asyncio.coroutine style.
asyncio.run(main())

 

异步IO之gevent

gevent+request示例(io异步)

from gevent import monkey

# monkey.patch_all() must run BEFORE importing modules that use sockets
# (requests); the original patched after 'import requests', which can
# leave that module bound to the unpatched, blocking IO primitives.
monkey.patch_all()

import gevent
import requests


def get_url(method, url, req_kwargs):
    """Issue one HTTP request and print the final URL and body."""
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)


# ##### send the requests concurrently #####
gevent.joinall([
    gevent.spawn(get_url, method='get', url='https://www.python.org/', req_kwargs={}),
    gevent.spawn(get_url, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    gevent.spawn(get_url, method='get', url='https://github.com/', req_kwargs={}),
])

# Bounded-concurrency version (Pool(None) means no limit; Pool(3) runs
# at most three greenlets at a time):
# from gevent.pool import Pool
# pool = Pool(None)
# gevent.joinall([
#     pool.spawn(get_url, method='get', url='https://www.python.org/', req_kwargs={}),
#     pool.spawn(get_url, method='get', url='https://www.yahoo.com/', req_kwargs={}),
#     pool.spawn(get_url, method='get', url='https://www.github.com/', req_kwargs={}),
# ])

 

grequest 异步io(将上面代码封装成一个模块)

# pip3 install grequests

import grequests

request_list = [
    grequests.get('http://httpbin.org/delay/1', timeout=0.001),   # GET that will time out
    grequests.get('http://fakedomain/'),                          # DNS failure
    grequests.get('http://httpbin.org/status/500'),               # server error
]

# Send everything concurrently and collect the responses (None on failure).
# response_list = grequests.map(request_list)
# print(response_list)

# Same, but with an explicit handler invoked for each failed request.
# def exception_handler(request, exception):
#     print(request, exception)
#     print("Request failed")
# response_list = grequests.map(request_list, exception_handler=exception_handler)
# print(response_list)

 

posted @ 2018-05-14 17:52  FRESHMANS  阅读(570)  评论(0)    收藏  举报