import aiohttp
import asyncio
import async_timeout
from urllib.parse import urljoin, urldefrag

root_url = 'http://python.org/'  # starting URL
crawled_urls, url_hub = [], [root_url]
headers = {'user-agent': 'Opera/9.80 (X11; Linux x86_64; U; en) Presto/2.2.15 Version/10.10'}


async def get_body(url):
    async with aiohttp.ClientSession() as session:
        try:
            async with async_timeout.timeout(10):  # request timeout in seconds
                async with session.get(url, headers=headers) as response:
                    if response.status == 200:
                        html = await response.text()
                        return {'error': '', 'html': html, 'url': url}
                    else:
                        return {'error': response.status, 'html': '', 'url': url}
        except Exception as err:
            # response may not exist here (e.g. timeout or connection error),
            # so report the exception itself instead of response.status
            return {'error': str(err), 'html': '', 'url': url}


async def handle_task(task_id, work_queue):
    while not work_queue.empty():
        queue_url = await work_queue.get()
        if queue_url not in crawled_urls:
            body = await get_body(queue_url)
            if not body['error']:
                crawled_urls.append(queue_url)
                parse(body)
                # enqueue newly discovered links on the same site so the
                # crawl continues beyond root_url
                for new_url in get_urls(body['html']):
                    if root_url in new_url and new_url not in crawled_urls:
                        work_queue.put_nowait(new_url)
            else:
                print('Failed to crawl {}'.format(queue_url))


# parse the fetched page data (left as a placeholder)
def parse(body):
    pass


def remove_fragment(url):
    pure_url, frag = urldefrag(url)
    return pure_url


# extract href values from the html and join them into absolute URLs
def get_urls(html):
    new_urls = [url.split('"')[0] for url in str(html).replace("'", '"').split('href="')[1:]]
    return [urljoin(root_url, remove_fragment(new_url)) for new_url in new_urls]


if __name__ == '__main__':
    q = asyncio.Queue()  # initialize an asyncio queue
    for url in url_hub:  # seed the queue with the starting URL(s)
        q.put_nowait(url)
    loop = asyncio.get_event_loop()
    tasks = [handle_task(task_id, q) for task_id in range(3)]  # 3 concurrent workers
    loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
    for u in crawled_urls:
        print(u)
    print('-' * 30)
    print(len(crawled_urls))
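# A minimal sketch of an alternative entry point, assuming Python 3.7+:
# asyncio.run() creates and closes the event loop itself, replacing the
# get_event_loop()/run_until_complete()/close() sequence above. The main()
# name is illustrative, not part of the original script.
#
#     async def main():
#         q = asyncio.Queue()
#         for url in url_hub:
#             q.put_nowait(url)
#         await asyncio.gather(*(handle_task(task_id, q) for task_id in range(3)))
#
#     asyncio.run(main())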