爬虫7 使用多线程构建爬虫
简述:
1. 使用python库threading来创建线程类
2. 使用标准库queue中的Queue类来创建队列,存储数据
3. 多线程获取网页数据,保存
实例:
"""使用多线程 + 队列,来构建爬虫 (multithreaded crawler: workers pull URLs
from one queue, push fetched HTML into a second queue, then save it)."""
from threading import Thread
from queue import Queue, Empty
import time
import threading

import requests
from fake_useragent import UserAgent
from lxml import etree


class MyRequest(Thread):
    """Worker thread: fetch pages from url_queue, then save parsed content.

    Each worker first drains the URL queue (fetching pages), then drains
    the response queue (extracting and appending text to duanzi.txt).
    """

    def __init__(self, i, url_queue, r_queue):
        Thread.__init__(self)
        self.i = i                  # worker id, used in logs and output lines
        self.url_queue = url_queue  # queue of page URLs still to fetch
        self.r_queue = r_queue      # queue of fetched HTML bodies

    def run(self):
        # Fetch everything first, then save whatever was collected.
        self.get_html()
        self.save_data()

    def get_html(self):
        """Drain url_queue, fetching each page and pushing its HTML to r_queue."""
        print('线程{}开始获取...'.format(self.i))
        myhead = {
            'User-Agent': UserAgent().random
        }
        while True:
            # get_nowait() closes the empty()-then-get() race: with several
            # workers, another thread could consume the last URL between the
            # two calls and leave this worker blocked forever on get().
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            try:
                # timeout so a stalled server cannot hang the worker forever
                r = requests.get(url, headers=myhead, timeout=10)
            except requests.RequestException as err:
                print('请求错误: ', err)
                continue
            if r.encoding not in ('utf-8', 'UTF-8'):
                print('此时编码为: ', r.encoding)
                r.encoding = 'utf-8'
            if r.status_code != 200:
                print('请求错误: ', r.status_code)
                continue
            # hand the page body to the save phase
            self.r_queue.put(r.text)

    def save_data(self):
        """Drain r_queue, extracting joke text and appending it to duanzi.txt."""
        print('线程{}开始保存...'.format(self.i))
        while True:
            # same get_nowait() pattern as get_html(): avoid blocking on a
            # queue another worker may have just emptied
            try:
                html = self.r_queue.get_nowait()
            except Empty:
                break
            e = etree.HTML(html)
            content = e.xpath("//div[@class='content']/span/text()")
            with open('duanzi.txt', 'a', encoding='utf-8') as f:
                for item in content:
                    f.write(str(self.i) + '. ' + item + '\n')


def main():
    # 创建队列 — one queue of URLs to fetch, one of fetched page bodies
    url_queue = Queue()
    r_queue = Queue()
    url = "https://www.qiushibaike.com/text/page/{}/"
    # 将要访问的url添加到队列中
    count = int(input("请输入要获取的页数: "))
    for i in range(count):
        url_queue.put(url.format(i + 1))
    # 默认创建3个子线程
    for i in range(0, 3):
        print('线程{}开始...'.format(i))
        mythread = MyRequest(i, url_queue, r_queue)
        mythread.start()
    # Poll until only the main thread remains, then exit.
    while True:
        time.sleep(1)
        threads = threading.enumerate()
        print('共有线程: ', threads)
        if len(threads) <= 1:
            print('结束')
            break


if __name__ == '__main__':
    main()

浙公网安备 33010602011771号