Crawling the web with Python coroutines
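The crawler below wraps aiohttp in a small SpiderAsynic class: each instance fetches one cnblogs page, pulls the post links out with a regular expression, and appends them to a CSV file, while asyncio.gather drives all the instances concurrently on a single event loop.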
import asyncio
import csv
import os
import re
import time

from aiohttp import ClientSession


class SpiderAsynic(object):
    """Fetch one page, parse post links, and append them to a CSV file."""

    def __init__(self, url, file_name):
        self.url = url
        self.file_name = file_name
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        }

    async def async_response(self):
        # Fetch the page body asynchronously so other spiders can run while we wait on I/O.
        print("get response from", self.url)
        async with ClientSession() as session:
            async with session.get(url=self.url, headers=self.header) as resp:
                html = await resp.text()
                print("Crawl from {0}; content: {1}".format(self.url, html[:500]))
                return html

    async def transfer_html(self, html):
        # Extract (href, title) pairs for every post link on the page.
        pattern = re.compile(
            '<a class="titlelnk" href="(.*?)" target="_blank">(.*?)</a>', re.S)
        items = pattern.findall(html)
        return [[item[0].strip(), item[1].strip()] for item in items]

    async def save_results(self, results):
        # Append the parsed rows to ./data/<file_name>; write the header row only once.
        os.makedirs("./data", exist_ok=True)
        file_full_path = "./data/{0}".format(self.file_name)
        write_header = not os.path.exists(file_full_path) or os.path.getsize(file_full_path) == 0
        headers = ['URL', 'Name']
        with open(file_full_path, 'a+', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            if write_header:
                writer.writerow(headers)
            for result in results:
                writer.writerow(result)

    async def run(self):
        html = await self.async_response()
        results = await self.transfer_html(html)
        await self.save_results(results)


if __name__ == '__main__':
    start = time.time()
    loop = asyncio.get_event_loop()
    # Note: "#p{}" is a URL fragment and is never sent to the server, so every request
    # actually fetches the same front page; it only serves to fan out concurrent requests.
    base_url = "https://www.cnblogs.com/#p{}"
    urls = [base_url.format(index) for index in range(1, 100)]
    spiders = [SpiderAsynic(url, 'asynic_blog.csv') for url in urls]
    # Schedule all spiders on the same event loop and wait for all of them to finish.
    groups = asyncio.gather(*[spider.run() for spider in spiders])
    loop.run_until_complete(groups)
    loop.close()
    print("timing at", time.time() - start)
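On Python 3.7 and later, the explicit event-loop management at the bottom can be replaced with asyncio.run(), which creates and closes the loop for you (calling get_event_loop() outside a running loop is deprecated in newer releases). A minimal sketch of the same fan-out, assuming the SpiderAsynic class above is already defined:

async def main():
    base_url = "https://www.cnblogs.com/#p{}"
    urls = [base_url.format(index) for index in range(1, 100)]
    spiders = [SpiderAsynic(url, 'asynic_blog.csv') for url in urls]
    # Run all spiders concurrently and wait for every one of them.
    await asyncio.gather(*(spider.run() for spider in spiders))

if __name__ == '__main__':
    start = time.time()
    asyncio.run(main())
    print("timing at", time.time() - start)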