# import requests
import asyncio
import csv
import os
import re
import time

from aiohttp import ClientSession
class SpiderAsynic(object):
    """Download one page, extract its title links, and append them to a CSV file.

    Each instance is bound to a single URL and an output file name. Several
    instances may share the same file name; rows are appended to
    ``./data/<file_name>`` and the header row is written only once per file.
    """

    # Anchor tags carrying the "titlelnk" class: group 1 is the href, group 2
    # the link text. re.S lets "." match newlines inside a tag's content.
    _TITLE_LINK = re.compile(
        '<a class="titlelnk" href="(.*?)" target="_blank">(.*?)</a>', re.S)
    _CSV_HEADER = ['URL', 'Name']
    _OUTPUT_DIR = "./data"

    def __init__(self, url: str, file_name: str):
        """Remember the target URL, the CSV file name, and the request headers.

        Args:
            url: Address of the page to download.
            file_name: Name of the CSV file (created under ``./data``).
        """
        self.url = url
        self.file_name = file_name
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        }

    async def async_response(self) -> str:
        """Fetch ``self.url`` and return the response body as text.

        A new ``ClientSession`` is opened for the request and closed again
        before returning. The first 500 characters of the body are printed
        as a progress trace.
        """
        print("get response from ", self.url)
        async with ClientSession() as session:
            async with session.get(url=self.url, headers=self.header) as resp:
                body = await resp.text()
                print("Crawl from {0}; content: {1}".format(
                    self.url, body[:500]))
                return body

    async def transfer_html(self, html: str):
        """Extract ``[url, name]`` pairs from the title links found in *html*.

        Args:
            html: Page markup to scan.

        Returns:
            A list of two-element lists ``[href, link_text]`` with surrounding
            whitespace stripped; empty when nothing matches.
        """
        return [
            [href.strip(), title.strip()]
            for href, title in self._TITLE_LINK.findall(html)
        ]

    async def save_results(self, results):
        """Append *results* to the CSV file, writing the header only once.

        Args:
            results: Iterable of ``[url, name]`` rows.

        The output directory is created when missing. The header row is
        emitted only when the file does not exist yet or is empty, so that
        repeated calls (and multiple spiders sharing one file) do not
        scatter header rows through the data.
        """
        file_full_path = "{0}/{1}".format(self._OUTPUT_DIR, self.file_name)
        os.makedirs(os.path.dirname(file_full_path), exist_ok=True)
        # No await occurs between this check and the write below, so other
        # coroutines on the same event loop cannot interleave here.
        needs_header = (not os.path.exists(file_full_path)
                        or os.path.getsize(file_full_path) == 0)
        with open(file_full_path, 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(self._CSV_HEADER)
            writer.writerows(results)

    async def run(self):
        """Run the full pipeline: download, parse, then save."""
        html = await self.async_response()
        results = await self.transfer_html(html)
        await self.save_results(results)
async def main():
    """Crawl pages 1-99 concurrently and append all results to one CSV file."""
    base_url = "https://www.cnblogs.com/#p{}"
    # NOTE(review): "#p{}" is a URL fragment, and fragments are normally not
    # sent to the server, so these URLs likely all return the same document.
    # Confirm the site's real pagination path before relying on the output.
    urls = [base_url.format(index) for index in range(1, 100)]
    spiders = [SpiderAsynic(url, 'asynic_blog.csv') for url in urls]
    # gather() schedules every spider at once and waits for all of them.
    await asyncio.gather(*(spider.run() for spider in spiders))


if __name__ == '__main__':
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during the crawl.
    start = time.perf_counter()
    # asyncio.run() creates the event loop, runs main() to completion and
    # closes the loop, replacing the manual get_event_loop()/close() pair.
    asyncio.run(main())
    print("timing at ", time.perf_counter() - start)