Crawling the web with Python coroutines

A small spider built on asyncio and aiohttp: it fetches cnblogs list pages concurrently, extracts post links and titles with a regex, and appends the results to a CSV file.

import os
import asyncio
from aiohttp import ClientSession
import time
import re
import csv

class AsyncSpider(object):
    def __init__(self, url, file_name):
        self.url = url
        self.file_name = file_name
        # Send a desktop browser User-Agent so the site serves the normal page.
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        }

    async def async_response(self):
        # Fetch the page for self.url and return its body as text.
        print("get response from", self.url)
        async with ClientSession() as session:
            async with session.get(url=self.url, headers=self.header) as resp:
                html = await resp.text()
                print("Crawl from {0}; content: {1}".format(
                    self.url, html[:500]))
                return html

    async def transfer_html(self, html):
        # Each post on a cnblogs list page is rendered as
        # <a class="titlelnk" href="..." target="_blank">title</a>.
        pattern = re.compile(
            '<a class="titlelnk" href="(.*?)" target="_blank">(.*?)</a>', re.S)
        items = re.findall(pattern, html)
        return [[url.strip(), name.strip()] for url, name in items]

    async def save_results(self, results):
        file_full_path = r"./data/{0}".format(self.file_name)
        os.makedirs(os.path.dirname(file_full_path), exist_ok=True)
        headers = ['URL', 'Name']
        # There is no await inside the with-block, so concurrent coroutines
        # never interleave their writes to this file.
        with open(file_full_path, 'a+', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            # Write the header only once: f.tell() is 0 while the file is empty.
            if f.tell() == 0:
                writer.writerow(headers)
            for result in results:
                writer.writerow(result)

    async def run(self):
        html = await self.async_response()
        results = await self.transfer_html(html)
        await self.save_results(results)
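
For a quick sanity check of the regex, here is what transfer_html extracts from a made-up snippet shaped like a cnblogs list item (the URL and title below are illustrative, not real data):

import re

sample = '<a class="titlelnk" href="https://example.com/post/1" target="_blank"> Hello </a>'
pattern = re.compile('<a class="titlelnk" href="(.*?)" target="_blank">(.*?)</a>', re.S)
print(re.findall(pattern, sample))
# -> [('https://example.com/post/1', ' Hello ')]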


if __name__ == '__main__':
    start = time.time()
    loop = asyncio.get_event_loop()
    # Caveat: "#p{}" is a URL fragment, and fragments are never sent to the
    # server, so all 99 requests below actually fetch the same front page;
    # cnblogs loads later pages via AJAX, which needs a different endpoint.
    base_url = "https://www.cnblogs.com/#p{}"
    urls = [base_url.format(index) for index in range(1, 100)]

    spiders = [AsyncSpider(url, 'async_blog.csv') for url in urls]
    groups = asyncio.gather(*[spider.run() for spider in spiders])
    loop.run_until_complete(groups)
    loop.close()

    print("timing at ", time.time() - start)

 
