import threading
import time
from queue import Queue
import requests
from lxml import etree
class QiuBaiSpider(object):
    """Multi-threaded scraper for qiushibaike.com 'hot' listing pages.

    Three queues form a pipeline: url_queue -> response_queue -> data_queue.
    Daemon worker threads service each stage; Queue.join() on all three
    queues provides shutdown once every item has been processed.
    """

    def __init__(self):
        # Page-number template for the 'hot' listing pages.
        self.base_url = 'https://www.qiushibaike.com/hot/page/{}/'
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
        self.data = 0                   # number of headline texts extracted
        self.url_queue = Queue()        # URLs waiting to be fetched
        self.response_queue = Queue()   # HTTP responses waiting to be parsed
        self.data_queue = Queue()       # extracted texts waiting to be stored
        self.count = 0                  # number of responses parsed

    def get_url_list(self):
        """Fill url_queue with the 12 listing-page URLs (pages 1..12)."""
        for page in range(1, 13):
            self.url_queue.put(self.base_url.format(page))

    def send_request(self):
        """Worker: fetch each queued URL and push the response downstream."""
        while True:
            url = self.url_queue.get()
            try:
                # A timeout keeps a hung connection from stalling this
                # worker (and the final url_queue.join()) forever.
                response = requests.get(url, headers=self.headers, timeout=10)
                self.response_queue.put(response)
            finally:
                # task_done() must run even if the request raises, or
                # url_queue.join() would block forever on the lost item.
                self.url_queue.task_done()

    def analysis_data(self):
        """Worker: parse each response and extract the post headlines."""
        while True:
            response = self.response_queue.get()
            try:
                self.count += 1
                html = etree.HTML(response.content)
                for div in html.xpath('//*[@id="content-left"]/div'):
                    text = div.xpath('.//h2/text()')[0]
                    self.data += 1
                    self.data_queue.put(text)
            finally:
                self.response_queue.task_done()

    def write_file(self):
        """Worker: drain data_queue.

        NOTE(review): despite the name, nothing is persisted anywhere —
        items are consumed and discarded, exactly as in the original.
        Add real storage here if output is actually wanted.
        """
        while True:
            self.data_queue.get()
            self.data_queue.task_done()

    def _start(self):
        # Fill url_queue synchronously BEFORE joining the queues.  In the
        # original this ran in a daemon thread, so the main thread could
        # reach url_queue.join() while the queue was still empty; join()
        # would return immediately and the program exited without ever
        # scraping a page.
        self.get_url_list()

        workers = []
        # Two fetcher threads overlap network waits.
        for _ in range(2):
            workers.append(threading.Thread(target=self.send_request))
        workers.append(threading.Thread(target=self.analysis_data))
        workers.append(threading.Thread(target=self.write_file))
        print(workers)

        for th in workers:
            # Daemon threads let the process exit once the queues drain.
            # (th.daemon replaces setDaemon(), deprecated since 3.10.)
            th.daemon = True
            th.start()

        # Block until every queued item has been marked task_done().
        for q in (self.url_queue, self.response_queue, self.data_queue):
            q.join()

    def run(self):
        """Run the pipeline, then report elapsed time and item count."""
        start = time.time()
        self._start()
        end = time.time()
        print(end - start, "结束时间")
        print(self.data)
if __name__ == '__main__':
    # Entry point: build the spider and run the scraping pipeline.
    spider = QiuBaiSpider()
    spider.run()