Scraping the first ten pages of Qiushibaike (糗事百科) jokes with multiple threads

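The snippets below assume the following imports; the code uses requests for HTTP, lxml's etree for XPath, and the standard library's Queue and Thread:

import requests               # fetch listing and article pages
from lxml import etree        # parse HTML and run XPath queries
from queue import Queue       # thread-safe queues between pipeline stages
from threading import Thread  # worker threads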
Define the decorator function

def run_forever(func):
    # Decorator: run the wrapped worker in an endless loop, so each
    # thread keeps pulling tasks from its queue until the process exits
    def wrapper(obj):
        while True:
            func(obj)
    return wrapper
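A minimal sketch of what the decorator does, using a hypothetical worker that is not part of the spider: the decorated function loops forever, which is why each worker below is started in a daemon thread while the main thread waits on the queues instead.

@run_forever
def tick(obj):
    # hypothetical worker: calling tick('job') would loop forever,
    # printing 'working on job' until the process exits
    print('working on', obj)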

Class initialization

class get_qiushibaike:
    def __init__(self, page):
        self.max_page = page  # range(1, max_page) crawls pages 1 .. max_page - 1
        self.url_head = 'https://www.qiushibaike.com'
        self.url_mid = 'text/page/'
        self.url_detail = '/'
        self.count = 0  # number of jokes written to disk

        self.url_queue = Queue()              # listing-page URLs
        self.get_url_content_queue = Queue()  # article URLs ready to download
        self.url_queue_all = Queue()          # article URLs scraped from listings
        self.page_url_list = []               # unused in this version
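Taken together, the queues form a three-stage pipeline: add_url_to_queue fills url_queue with listing-page URLs, get_page_url_to_list extracts the article links from each listing into url_queue_all, get_url_to_content_queue relays them into get_url_content_queue, and get_content downloads each article and appends it to qiushi.txt.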

Define the class methods

    def add_url_to_queue(self):
        # Enqueue one URL per listing page of the text section
        for i in range(1, self.max_page):
            self.url_queue.put(self.url_head + self.url_detail + self.url_mid + str(i) + self.url_detail)
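For example, with the attributes set in __init__, the first URL placed on the queue is built like this:

>>> 'https://www.qiushibaike.com' + '/' + 'text/page/' + str(1) + '/'
'https://www.qiushibaike.com/text/page/1/'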

  

    @run_forever
    def get_page_url_to_list(self):
        url = self.url_queue.get()
        response = requests.get(url)
        if response.status_code != 200:
            self.url_queue.put(url)
            print('url {} failed, re-queued'.format(url))
        else:
            html = etree.HTML(response.text)
            # Each joke on a listing page links to its own detail page
            url_list = html.xpath('//a[@class="contentHerf"]/@href')
            for url in url_list:
                self.url_queue_all.put(self.url_head + url)
        # Mark the fetched task done either way: a failed URL was re-queued
        # as a fresh task, so the queue's unfinished count stays balanced
        # and url_queue.join() can eventually return.
        self.url_queue.task_done()
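The XPath targets the listing markup as it looked at the time; here is a self-contained check against a hypothetical fragment of that markup (the article id is made up):

from lxml import etree

sample = '<div><a class="contentHerf" href="/article/121206588">...</a></div>'
print(etree.HTML(sample).xpath('//a[@class="contentHerf"]/@href'))
# prints: ['/article/121206588']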

    @run_forever
    def get_url_to_content_queue(self):
        # Relay stage: move article URLs into the download queue
        url = self.url_queue_all.get()
        print(url)
        self.get_url_content_queue.put(url)
        self.url_queue_all.task_done()

    @run_forever
    def get_content(self):
        url = self.get_url_content_queue.get()
        try:
            response = requests.get(url, timeout=1)
            if response.status_code != 200:
                self.get_url_content_queue.put(url)  # retry later
            else:
                html = etree.HTML(response.text)
                title = html.xpath('//h1[@class="article-title"]/text()')
                contents = html.xpath('//div[@class="content"]/text()')
                # Append one entry per joke: TITLE line, body lines, blank separator
                with open('qiushi.txt', 'a', encoding='utf8') as p:
                    for x in title:
                        p.write("TITLE:" + x)
                        p.write('\n')
                    for i in contents:
                        p.write(i + '\n')
                    p.write('\n')
                response.close()
                self.count += 1
                print("downloads completed: {}".format(self.count))
        except Exception:
            print("url trouble: {}".format(url))
            self.get_url_content_queue.put(url)  # retry later
        # Balance the get() above; re-queued URLs count as fresh tasks
        self.get_url_content_queue.task_done()

    def run_more_tasks(self, func, count=1):
        # Start `count` daemon threads all running `func`
        for i in range(0, count):
            t = Thread(target=func)
            t.daemon = True
            t.start()
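Because every run_forever worker loops endlessly, these threads can never be joined directly; marking them as daemons means they are killed automatically once the main thread returns from the queue joins in run below.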

    def run(self):
        self.add_url_to_queue()
        self.run_more_tasks(self.get_page_url_to_list, 3)
        self.run_more_tasks(self.get_url_to_content_queue, 3)
        self.run_more_tasks(self.get_content, 5)
        # Join in pipeline order: upstream queues must drain before the
        # downstream joins can be trusted to mean "all work finished"
        self.url_queue.join()
        self.url_queue_all.join()
        self.get_url_content_queue.join()

Create an instance and run it

if __name__ == '__main__':
    qbs = get_qiushibaike(11)  # range(1, 11) covers pages 1 through 10
    qbs.run()
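On a successful run the console interleaves article URLs from the relay stage with the download counter, and qiushi.txt accumulates entries of the form TITLE:<title> followed by the joke's paragraphs and a blank separator line; exact titles and counts depend on the live site.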

PS: scraping carries risk, and IP bans call for caution. Threads are a moment's thrill; a banned account is a trip to the crematorium.



 

posted @ 2020-06-29 21:28  traurig