# Python async + pyppeteer concurrency
# coding=utf-8
import asyncio, time
import pyppeteer
from collections import namedtuple
# Record describing one fetched page. The generated class keeps the
# original typename "rs", so its repr reads ``rs(title=..., ...)``.
Response = namedtuple(
    "rs",
    ["title", "url", "html", "cookies", "headers", "history", "status"],
)
async def get_html(url, timeout=10):
    """Open *url* in a non-headless browser and print the page title.

    Args:
        url: Address to navigate to.
        timeout: Navigation timeout in seconds. It is converted to the
            millisecond value handed to ``page.goto``. (The original note
            here said the library's own default is 30 s.)

    Returns:
        None. The result is only printed.

    The browser is closed even when navigation or page access raises, so a
    failed fetch does not leave a browser process behind.
    """
    print('---0')
    browser = await pyppeteer.launch(headless=False, args=['--no-sandbox'])
    try:
        page = await browser.newPage()
        await page.goto(url, options={'timeout': int(timeout * 1000)})
        # The HTML is retrieved but not used yet; the call is kept so the
        # amount of work done per page stays the same as before.
        await page.content()
        title = await page.title()
        print('title = 0', title)
    finally:
        await browser.close()
async def get_html_1(url, timeout=10):
    """Open *url* in a headless browser and print the page title.

    Args:
        url: Address to navigate to.
        timeout: Navigation timeout in seconds. It is converted to the
            millisecond value handed to ``page.goto``. (The original note
            here said the library's own default is 30 s.)

    Returns:
        None. The result is only printed.

    The browser is always closed before returning, including when
    navigation or page access raises; previously it was never closed.
    """
    print('---1')
    browser = await pyppeteer.launch(headless=True, args=['--no-sandbox'])
    try:
        page = await browser.newPage()
        await page.goto(url, options={'timeout': int(timeout * 1000)})
        # The HTML is retrieved but not used yet; the call is kept so the
        # amount of work done per page stays the same as before.
        await page.content()
        title = await page.title()
        print('title 1= ', title)
    finally:
        await browser.close()
async def get_html_2(url, timeout=10):
    """Open *url* in a headless browser and print the page title.

    Args:
        url: Address to navigate to.
        timeout: Navigation timeout in seconds. It is converted to the
            millisecond value handed to ``page.goto``. (The original note
            here said the library's own default is 30 s.)

    Returns:
        None. The result is only printed.

    The browser is always closed before returning, including when
    navigation or page access raises; previously it was never closed.
    """
    print('---2')
    browser = await pyppeteer.launch(headless=True, args=['--no-sandbox'])
    try:
        page = await browser.newPage()
        await page.goto(url, options={'timeout': int(timeout * 1000)})
        # The HTML is retrieved but not used yet; the call is kept so the
        # amount of work done per page stays the same as before.
        await page.content()
        title = await page.title()
        print('title 2= ', title)
    finally:
        await browser.close()
if __name__ == '__main__':
    # Fetch the same page several times concurrently and report the total
    # wall-clock time.
    s_time = time.time()
    url_list = [
        "http://www.10086.cn/index/tj/index_220_220.html",
        "http://www.10086.cn/index/tj/index_220_220.html",
        "http://www.10086.cn/index/tj/index_220_220.html",
        "http://www.10086.cn/index/tj/index_220_220.html",
        "http://www.10086.cn/index/tj/index_220_220.html",
    ]

    # Create the loop explicitly instead of relying on
    # asyncio.get_event_loop() to create one implicitly, which newer Python
    # versions deprecate and eventually reject.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    # asyncio.wait() no longer accepts bare coroutine objects (deprecated in
    # Python 3.8, an error from 3.11), so each coroutine is wrapped in a
    # Task first. run_until_complete() accepts a coroutine or a future.
    tasks = [loop.create_task(get_html(url)) for url in url_list]
    loop.run_until_complete(asyncio.wait(tasks))

    # asyncio.wait() does not re-raise task errors; report them here so a
    # failed fetch is visible instead of silently ignored.
    for url, task in zip(url_list, tasks):
        if not task.cancelled() and task.exception() is not None:
            print('failed:', url, repr(task.exception()))

    print('耗时:', time.time() - s_time)

# 浙公网安备 33010602011771号  (web page footer captured with the code; not part of the script)