爬虫7 使用多线程构建爬虫

简述:

  1.  使用python库threading来创建线程

  2. 使用Queue库来创建队列,存储数据

  3. 多线程获取网页数据,保存

 

实例:

  

"""使用多线程 + 队列,来构建爬虫"""
from threading import Thread
from queue import Queue
import time, threading
import requests
from fake_useragent import UserAgent
from lxml import etree


# 创建一个类,调用Thread线程库, 每次运行就产生一个线程
class MyRquest(Thread):
    """Worker thread: drains the URL queue, fetches each page, then parses
    and appends the extracted text to duanzi.txt."""

    def __init__(self, i, url_queue, r_queue):
        """
        :param i: worker index, used only to label log output and saved lines
        :param url_queue: Queue of page URLs still to be fetched
        :param r_queue: Queue receiving the fetched HTML text of each page
        """
        Thread.__init__(self)
        self.i = i
        self.url_queue = url_queue  # fixed attribute typo (was url_queueu)
        self.r_queue = r_queue

    def run(self):
        # Each worker first fetches pages, then saves whatever responses
        # are queued at that point.
        self.getHtml()
        self.saveData()

    def getHtml(self):
        """Fetch pages from the URL queue and push their HTML onto r_queue."""
        from queue import Empty  # local import: keeps the fix self-contained
        print('线程{}开始获取...'.format(self.i))
        myhead = {
            'User-Agent': UserAgent().random
        }
        while True:
            # get_nowait() avoids the empty()-then-get() race: with several
            # workers, another thread can take the last URL between the two
            # calls, leaving a blocking get() hung forever.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            r = requests.get(url, headers=myhead)
            # Check the status BEFORE touching the body; the original
            # inspected/changed the encoding even for failed requests.
            if r.status_code != 200:
                print('请求错误: ', r.status_code)
                continue
            if r.encoding not in ['utf-8', 'UTF-8']:
                print('此时编码为: ', r.encoding)
                r.encoding = 'utf-8'
            # Hand the page text over to the save stage.
            self.r_queue.put(r.text)

    def saveData(self):
        """Parse queued pages with XPath and append the text to duanzi.txt."""
        from queue import Empty
        print('线程{}开始保存...'.format(self.i))
        while True:
            # Same race-free drain pattern as getHtml().
            try:
                html = self.r_queue.get_nowait()
            except Empty:
                break
            e = etree.HTML(html)
            content = e.xpath("//div[@class='content']/span/text()")
            with open('duanzi.txt', 'a', encoding='utf-8') as f:
                for item in content:
                    f.write(str(self.i) + '.    ' + item + '\n')


def main():
    """Read a page count, enqueue the qiushibaike text-page URLs and crawl
    them with 3 worker threads, waiting for all workers to finish."""
    # Work queues shared by all workers.
    url_queue = Queue()  # URLs waiting to be fetched
    r_queue = Queue()    # fetched page HTML waiting to be saved

    # Page-number template for the target site.
    url = "https://www.qiushibaike.com/text/page/{}/"

    # Enqueue one URL per requested page (pages are 1-based).
    count = int(input("请输入要获取的页数: "))
    for i in range(count):
        url_queue.put(url.format(i + 1))

    # Start 3 worker threads, keeping references so they can be joined.
    workers = []
    for i in range(0, 3):
        print('线程{}开始...'.format(i))
        mythread = MyRquest(i, url_queue, r_queue)
        mythread.start()
        workers.append(mythread)

    # join() replaces the original 1-second polling loop over
    # threading.enumerate(): it blocks exactly until each worker exits,
    # with no busy-waiting and no dependence on unrelated live threads.
    for t in workers:
        t.join()
    print('结束')


# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

 

posted @ 2020-05-11 18:27  黑无常  阅读(167)  评论(0)    收藏  举报