Fork me on GitHub

aiohttp你不知道的异步操作网络请求

aiohttp是一个支持异步网络请求的模块

1.一个简单异步协程爬取

  • read()
  • text(encoding=编码) 比如:await r.text(encoding="utf-8")
import asyncio
import aiohttp

async def request(url):
    """Fetch *url* with a one-off GET request and print the raw body.

    The body is obtained with ``read()``, so it is printed undecoded.
    """
    print("当前url:", url)
    # aiohttp.request() is used here as an async context manager.
    async with aiohttp.request("GET", url) as resp:
        # read() performs no decoding; the payload comes back as bytes.
        payload = await resp.read()
    print("返回reponse:", payload)

urls = [
    'https://www.baidu.com',
    'https://www.sogou.com',
    'https://www.qq.com',
]

# Create and install the event loop explicitly. Relying on
# asyncio.ensure_future()/get_event_loop() to create a loop implicitly is
# deprecated since Python 3.10 and no longer works on the newest releases.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

# Task list: one Task per URL, all bound to the loop created above.
stasks = [loop.create_task(request(url)) for url in urls]

try:
    # run_until_complete() takes a single awaitable, so the task list is
    # wrapped with asyncio.wait().
    loop.run_until_complete(asyncio.wait(stasks))
finally:
    # Always close the loop so it is not leaked, even if the run is interrupted.
    loop.close()

2.发起session请求

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Xu Junkai
"""
import requests
import asyncio
import time
import aiohttp
start_time = time.time()
urls = [
    'https://blog.csdn.net/',
    'https://www.sogou.com',
    'http://www.renren.com/',
]

async def get_page(url):
    """Fetch *url* through a ClientSession and print status, charset and body."""
    print(url)
    async with aiohttp.ClientSession() as client, client.get(url) as resp:
        # HTTP status code of the response.
        print(resp.status)
        # Character encoding of the page.
        print(resp.charset)
        # Response body as text.
        body = await resp.text()
        print(body)
            
# Create and install the event loop explicitly. Relying on
# asyncio.ensure_future()/get_event_loop() to create a loop implicitly is
# deprecated since Python 3.10 and no longer works on the newest releases.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

# One Task per URL, all scheduled on the same loop.
tasks = [loop.create_task(get_page(url)) for url in urls]

try:
    loop.run_until_complete(asyncio.wait(tasks))
finally:
    # Always close the loop so it is not leaked.
    loop.close()
end_time = time.time()
print('总耗时:',end_time-start_time)
  • session.put
async with session.put(url,data=b"data")

注意:

不要为每次的连接都创建一次session,一般情况下只需要创建一个session,然后使用这个session执行所有的请求。

每个session对象,内部包含了一个连接池,并且将会保持连接和连接复用(默认开启)可以加快整体的性能

3.url中传递参数

import asyncio
import time
import aiohttp
start_time = time.time()
urls = [
    'https://blog.csdn.net/',
    'https://www.sogou.com',
    'http://www.renren.com/',
]
data = {"name":"foo"}
async def get_page(url,data):
    """Fetch *url* with *data* passed as URL parameters and print the result.

    A coroutine function can declare several parameters like any other
    function; they are supplied when the coroutine object is created.
    """
    print(url)
    async with aiohttp.ClientSession() as client, client.get(url, params=data) as resp:
        print(resp.status)
        # Reading the body is a slow, waiting operation, so it is awaited to
        # let other coroutines run in the meantime.
        body = await resp.text()
        print(body)
        print(resp.charset)
# Create and install the event loop explicitly. Relying on
# asyncio.ensure_future()/get_event_loop() to create a loop implicitly is
# deprecated since Python 3.10 and no longer works on the newest releases.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

# Calling get_page(url, data) only builds the coroutine object; nothing runs
# until the loop drives the Task that wraps it.
tasks = [loop.create_task(get_page(url, data)) for url in urls]

try:
    loop.run_until_complete(asyncio.wait(tasks))
finally:
    # Always close the loop so it is not leaked.
    loop.close()
end_time = time.time()
print('总耗时:',end_time-start_time)

注意

当使用res.text(),res.read()获取响应内容(由于获取响应内容是一个阻塞耗时过程,所以我们使用await实现协程切换)
正确写法
	await res.text()
	await res.read()  #获取是字节
	await res.json()  可以设置编码,设置处理函数
注意:
	res.json()为aiohttp响应对象内置的JSON解码方法(用法与Requests的res.json()类似,但需要使用await)
	其中只有response返回为json格式时,用res.json()打印出响应的内容.
	如果response返回不为json格式,使用res.json()会报错
	
	

4.StreamResponse

  • 因为text(),read()方法是把整个响应体读入内存,如果你是获取大量的数据,请考虑使用”字节流“(StreamResponse)
#字节流形式获取数据
import asyncio
import aiohttp

urls ='https://blog.csdn.net/'
async def get_page(url):
    """Print the first bytes of the response for *url* via the content stream."""
    async with aiohttp.ClientSession() as client, client.get(url) as resp:
        # Ask the body stream for 100 bytes rather than loading the whole body.
        head = await resp.content.read(100)
        print(head)

# Create and install the event loop explicitly. Relying on
# asyncio.ensure_future()/get_event_loop() to create a loop implicitly is
# deprecated since Python 3.10 and no longer works on the newest releases.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
c = get_page(urls,)  # coroutine object; nothing has run yet
task = loop.create_task(c)  # wrap the coroutine in a Task on this loop
try:
    loop.run_until_complete(task)
finally:
    # Always close the loop so it is not leaked.
    loop.close()
#获取100个字节数据
  • 字节流形式读取数据,保存文件
import asyncio
import aiohttp

urls ='https://blog.csdn.net/'
async def get_page(url, path="cnds.text", chunk_size=100):
    """Stream the response body of *url* into a file.

    Args:
        url: Address to fetch with a GET request.
        path: Destination file, opened in binary write mode. Defaults to
            the previously hard-coded file name.
        chunk_size: Maximum number of bytes handled per chunk. Defaults to
            the previously hard-coded 100.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as res:
            # An ordinary (synchronous) context manager nested in async ones.
            with open(path, "wb") as fp:
                # iter_chunked() yields the body piece by piece, so the whole
                # response is never read into memory at once.
                async for chunk in res.content.iter_chunked(chunk_size):
                    fp.write(chunk)

# Create and install the event loop explicitly. Relying on
# asyncio.ensure_future()/get_event_loop() to create a loop implicitly is
# deprecated since Python 3.10 and no longer works on the newest releases.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
c = get_page(urls,)  # coroutine object; nothing has run yet
task = loop.create_task(c)  # wrap the coroutine in a Task on this loop
try:
    loop.run_until_complete(task)
finally:
    # Always close the loop so it is not leaked.
    loop.close()

注意

async with session.get(url) as res:#异步上下文管理器
with open("cnds.text","wb") as fp:#普通上下文管理器

#因为异步上下文管理器在enter和exit方法处能够暂停执行上下文管理器
#为了实现此功能,加入了2个新方法:__aenter__ 和__aexit__这两个方法都要返回一个 awaitable类型的值。
详见:
https://www.jb51.net/article/163540.htm
异步迭代器

5.自定义请求头

#与requests方法一样,headers放User-agent比较多。
async def get_page(url):
    """Fetch *url* with custom request headers and stream the body to a file."""
    request_headers = {'Content-Type':'text/html; charset=utf-8'}
    async with aiohttp.ClientSession() as client:
        async with client.get(url, headers=request_headers) as resp:
            with open("cnds.text","wb") as out:
                # Copy the body into the file 100 bytes at a time; an empty
                # read ends the loop.
                piece = await resp.content.read(100)
                while piece:
                    out.write(piece)
                    piece = await resp.content.read(100)

6.自定义cookie

  • 注意:对于自定义cookie,我们需要设置在ClientSession(cookies=自定义cookie字典),而不是session.get()中

#源码显示
class ClientSession:
    """First-class interface for making HTTP requests."""

    ATTRS = frozenset([
        '_source_traceback', '_connector',
        'requote_redirect_url', '_loop', '_cookie_jar',
        '_connector_owner', '_default_auth',
        '_version', '_json_serialize',
        '_requote_redirect_url',
        '_timeout', '_raise_for_status', '_auto_decompress',
        '_trust_env', '_default_headers', '_skip_auto_headers',
        '_request_class', '_response_class',
        '_ws_response_class', '_trace_configs'])

    _source_traceback = None
    _connector = None

    def __init__(self, *, connector: Optional[BaseConnector]=None,
                 loop: Optional[asyncio.AbstractEventLoop]=None,
                 cookies: Optional[LooseCookies]=None,
                 headers: Optional[LooseHeaders]=None,
                 skip_auto_headers: Optional[Iterable[str]]=None,
                 auth: Optional[BasicAuth]=None,
                 json_serialize: JSONEncoder=json.dumps,
                 request_class: Type[ClientRequest]=ClientRequest,
                 response_class: Type[ClientResponse]=ClientResponse,
                 ws_response_class: Type[ClientWebSocketResponse]=ClientWebSocketResponse,  # noqa
                 version: HttpVersion=http.HttpVersion11,
                 cookie_jar: Optional[AbstractCookieJar]=None,
                 connector_owner: bool=True,
                 raise_for_status: bool=False,
                 read_timeout: Union[float, object]=sentinel,
                 conn_timeout: Optional[float]=None,
                 timeout: Union[object, ClientTimeout]=sentinel,
                 auto_decompress: bool=True,
                 trust_env: bool=False,
                 requote_redirect_url: bool=True,
                 trace_configs: Optional[List[TraceConfig]]=None) -> None:
  • 使用
cookies = {"cookies":"xxxxxxxxxx"}
async with ClientSession(cookies=cookies) as session:
	...

7.获取网站响应状态码

  • res.status

    async with session.get(url) as res:
    	print(res.status)
    

8.查看响应头

  • res.headers 查看响应头,得到的值是一个类似dict(字典)的对象
  • res.raw_headers 查看原生响应头,字节类型
import asyncio
import aiohttp
async def get_page(url):
    """Fetch *url* and print each response header as ``name ******* value``."""
    request_headers = {'Content-Type':'text/html; charset=utf-8'}
    async with aiohttp.ClientSession() as client:
        async with client.get(url, headers=request_headers) as resp:
            # Walk the response headers as (name, value) pairs.
            for name, value in resp.headers.items():
                print(name, "*******", value)
# Create and install the event loop explicitly. Relying on
# asyncio.ensure_future()/get_event_loop() to create a loop implicitly is
# deprecated since Python 3.10 and no longer works on the newest releases.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# NOTE(review): `urls` is not defined in this snippet; presumably it is the
# URL string from an earlier example - confirm before running standalone.
c = get_page(urls,)  # coroutine object; nothing has run yet
task = loop.create_task(c)  # wrap the coroutine in a Task on this loop
try:
    loop.run_until_complete(task)
finally:
    # Always close the loop so it is not leaked.
    loop.close()

9.查看重定向的响应头

  • res.history

10.超时处理

  • 默认IO操作都有5分钟响应时间,但是时间太长,我们可以自己设置timeout

  • 如果timeout=None或timeout=0将不进行超时检查。也就不限时长。

    async with session.get("https://baidu.com",timeout=60) as res:
    	pass
    

11.ClientSession用于多个连接之间(同一个网站)共享cookie.

import aiohttp
import asyncio


async def request():
    """Make two requests on one session and print the jar's cookies each time.

    The session is created with a custom cookie; after each request the
    cookie jar is queried for a different path of the same site.
    """
    # Cookie handed to the session at construction time.
    session_cookies = {"my_cookie":"my_set_cookies"}
    async with aiohttp.ClientSession(cookies=session_cookies) as client:
        async with client.get("https://www.csdn.net/"):
            print(client.cookie_jar.filter_cookies("https://www.csdn.net/nav/python"))
            print("*******************************************")
        async with client.get("https://www.csdn.net/"):
            print(client.cookie_jar.filter_cookies("https://www.csdn.net/nav/java"))


# Create and install the event loop explicitly. Relying on
# asyncio.ensure_future()/get_event_loop() to create a loop implicitly is
# deprecated since Python 3.10 and no longer works on the newest releases.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
c = request()  # coroutine object; nothing has run yet
task = loop.create_task(c)  # wrap the coroutine in a Task on this loop
try:
    loop.run_until_complete(task)
finally:
    # Always close the loop so it is not leaked.
    loop.close()



#Set-Cookie: dc_session_id=10_1562499942692.566280
#Set-Cookie: my_cookie=my_set_cookies
#Set-Cookie: uuid_tt_dd=10_20709428800-1562499942692-906566
#*******************************************
#Set-Cookie: dc_session_id=10_1562499942692.566280
#Set-Cookie: my_cookie=my_set_cookies
#Set-Cookie: uuid_tt_dd=10_20709428800-1562499942692-906566

  • 最好使用session.cookie_jar.filter_cookies()获取网站cookie,不同于requests模块,虽然我们可以使用res.cookies有可能获取到cookie,但似乎并未获取到所有的cookies。

  • 总结

    1.当我们使用res.cookies时,只会获取到当前url下设置的cookie,不会维护整站的cookie
    2.而session.cookie_jar.filter_cookies(url)会一直保留这个网站的所有设置cookies,含有我们在会话时设置的cookie,并且会根据响应修改更新cookie。这个才是我们需要的
    3.而我们设置cookie,也是需要在aiohttp.ClientSession(cookies=cookies)中设置
    4.ClientSession 还支持 请求头,keep-alive连接和连接池(connection pooling)
    

12.cookie的安全性

  • 默认情况下ClientSession使用严格模式的aiohttp.CookieJar。按照RFC 2109,严格模式会明确拒绝以IP地址形式访问的URL(例如 http://127.0.0.1:80/cookie)所设置的cookie,只接受通过DNS域名访问的站点所设置的cookie。可以通过给aiohttp.CookieJar传入unsafe=True来放宽这一限制:

    jar = aiohttp.CookieJar(unsafe=True)
    session = aiohttp.ClientSession(cookie_jar=jar)
    

13.控制连接数量

  • TCPConnector维持链接池,限制并行连接的总量,当池满了,有请求退出再加入新请求

    async def request():
        """Open a session whose connector is created with ``limit=5``."""
        # limit=5 restricts the number of parallel connections.
        pool = aiohttp.TCPConnector(limit=5)
        initial_cookies = {"my_cookies":"my_cookies"}
        async with aiohttp.ClientSession(cookies=initial_cookies, connector=pool):
            pass
    
    # Create and install the event loop explicitly. Relying on
    # asyncio.ensure_future()/get_event_loop() to create a loop implicitly is
    # deprecated since Python 3.10 and no longer works on the newest releases.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    c = request()  # coroutine object; nothing has run yet

    task = loop.create_task(c)  # wrap the coroutine in a Task on this loop
    try:
        loop.run_until_complete(task)
    finally:
        # Always close the loop so it is not leaked.
        loop.close()
    
  • 限制同时打开连接到同一端点的数量,可以通过设置 limit_per_host 参数:

    limit_per_host: 同一端点的最大连接数量。同一端点即(host, port, is_ssl)完全相同情况。
    
    conn = aiohttp.TCPConnector(limit_per_host=30)#默认是0
    

14.一个小例子

import asyncio
import aiohttp


headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
}
def callback(task):
    """Done-callback: print the length of the finished task's result.

    A real callback could parse the page here; this one only reports
    the size of what the task returned.
    """
    page = task.result()
    print(len(page))


async def res(url):
    """Fetch *url* and return its body decoded to text.

    Args:
        url: Address to request; the module-level ``headers`` are sent along.

    Returns:
        The response body as ``str``. This is the value a done-callback
        obtains through ``task.result()``.
    """
    async with aiohttp.request('GET', url, headers=headers) as fp:
        # Let aiohttp pick the charset of each response instead of forcing
        # ISO-8859-1: Latin-1 never fails to decode, but it turns UTF-8/GBK
        # pages into mojibake. errors="replace" keeps a stray undecodable
        # byte from raising UnicodeDecodeError.
        response = await fp.text(errors="replace")
    # Handed to the done-callback via the task result.
    return response



urls = [
    'https://www.baidu.com',
    'https://www.sogou.com',
    'https://www.qq.com',
]

#proxy="http://some.proxy.com"

if __name__ == '__main__':
    # Create and install the event loop explicitly. Relying on
    # asyncio.ensure_future()/get_event_loop() to create a loop implicitly is
    # deprecated since Python 3.10 and no longer works on the newest releases.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    stasks = []
    for url in urls:
        # Build the coroutine object and wrap it in a Task on this loop.
        task = loop.create_task(res(url))
        # Attach the callback that runs once the task has finished.
        task.add_done_callback(callback)
        stasks.append(task)

    try:
        # Hand the task list to the loop and run until every task is done.
        loop.run_until_complete(asyncio.wait(stasks))
    finally:
        # Always close the loop so it is not leaked.
        loop.close()
  • 原文来自于https://www.jb51.net/article/163537.htm
posted @ 2019-08-29 10:43  是阿凯啊  阅读(1873)  评论(0编辑  收藏  举报