# 爬虫学习(十七)——多线程爬取数据案例
# (Crawler study, part 17: a multi-threaded data-scraping example)

import json
from queue import Empty, Queue
from threading import Thread
from typing import Any, Callable, Iterable, Mapping, Optional

import requests
from lxml import etree

# URL template for the qiushibaike "text" section; %d is the page number.
url = 'https://www.qiushibaike.com/text/page/%d/'

# Queue of page numbers (1..13) waiting to be downloaded by crawl threads.
queue_url = Queue(13)

# Queue of (html, page) tuples produced by crawl threads, consumed by parsers.
queue_html = Queue(13)

# Browser-like request headers so the site serves the normal HTML page.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9'}

# Flipped to True by the main thread once all pages are fetched and parsed;
# parser threads poll it to know when to exit.
exitFlag = False


class CrawlThread(Thread):
    """Downloader thread.

    Pulls page numbers from ``self.queue`` (the shared ``queue_url``),
    fetches each page's HTML with ``requests`` and pushes an
    ``(html, page)`` tuple onto the module-level ``queue_html`` for the
    parser threads. Exits when the page-number queue is drained.
    """

    def __init__(self, queue, thread_id) -> None:
        """
        :param queue: Queue of page numbers still to be downloaded.
        :param thread_id: numeric id used only in log output.
        """
        super().__init__()
        self.queue = queue
        self.thread_id = thread_id

    def run(self) -> None:
        print('-------------爬虫线程--%d--启动--------------' % (self.thread_id))
        self.get_html()
        print('-------------爬虫线程--%d--终止--------------' % (self.thread_id))

    def get_html(self):
        """Fetch pages until the url queue is empty."""
        while True:
            try:
                # Non-blocking get: Empty means every page has been claimed
                # by some crawl thread, so this thread is done.
                # (EAFP here avoids the empty()/get() race of the original.)
                page = self.queue.get(block=False)
            except Empty:
                break
            try:
                response = requests.get(url=url % (page,), headers=headers)
                response.encoding = 'utf-8'
                queue_html.put((response.text, page))
                print('----------------爬虫线程--%d--获取--%d--页的数据-----------------' % (self.thread_id, page))
            except Exception:
                # BUG fix: the original silently swallowed download errors;
                # at least report which page failed.
                print('----------爬虫线程--%d--下载--%d--页失败----------' % (self.thread_id, page))
            finally:
                # BUG fix: task_done() must run even when the download fails,
                # otherwise queue_url.join() in __main__ hangs forever.
                self.queue.task_done()


class ParseThread(Thread):
    """Parser thread.

    Pulls ``(html, page)`` tuples from ``self.queue`` (the shared
    ``queue_html``), extracts each joke's author/votes/comments/content
    with XPath and appends them as JSON objects to the shared output
    file ``self.fp``. Exits when the module-level ``exitFlag`` is set.
    """

    def __init__(self, queue, thread_id, fp):
        """
        :param queue: Queue of (html, page) tuples to parse.
        :param thread_id: numeric id used only in log output.
        :param fp: writable text file handle shared by all parser threads.
        """
        super().__init__()
        self.queue = queue
        self.thread_id = thread_id
        self.fp = fp

    def run(self):
        print('-------------解析线程--%d--启动--------------' % (self.thread_id))
        self.parse_html()
        print('-------------解析线程--%d--终止--------------' % (self.thread_id))

    def parse_html(self):
        """Parse queued pages until the main thread signals shutdown."""
        while True:
            if exitFlag:
                break
            try:
                html, page = self.queue.get(block=False)
            except Empty:
                # Queue momentarily empty; crawl threads may still be
                # producing, so keep polling until exitFlag is set.
                continue
            try:
                tree = etree.HTML(html)
                # Each joke sits in a div whose id starts with "qiushi_tag_".
                divs = tree.xpath('//div[contains(@id,"qiushi_tag_")]')
                for div in divs:
                    try:
                        content = div.xpath('.//div[@class="content"]/span/text()')[0].strip()
                        # 点赞 (votes)
                        zan = div.xpath('.//span[@class="stats-vote"]/i/text()')[0].strip()
                        # 评论 (comment count)
                        comment = div.xpath('.//span[@class="stats-comments"]//i/text()')[0].strip()
                        # 作者 (author)
                        author = div.xpath('.//div[@class="author clearfix"]//h2/text()')[0].strip()
                        item = {
                            'author': author,
                            'zan': zan,
                            'comment': comment,
                            'content': content,
                        }
                        # BUG fix: write through self.fp (the handle passed to
                        # __init__) instead of the module-level global `fp`,
                        # which the original accidentally relied on.
                        json.dump(item, self.fp, ensure_ascii=False)
                    except Exception:
                        # A malformed div should not abort the whole page.
                        print('----------解析异常,页码是:%d---------' % (page))
                print('------------解析线程--%d--解析页码--%d--任务-------------' % (self.thread_id, page))
            finally:
                # BUG fix: mark the task done even if parsing raised,
                # otherwise queue_html.join() in __main__ hangs forever.
                self.queue.task_done()


if __name__ == '__main__':
    # Seed the url queue with page numbers 1..13.
    for page in range(1, 14):
        queue_url.put(page)

    # Start 5 downloader threads competing for page numbers.
    for i in range(5):
        CrawlThread(queue_url, i).start()

    # BUG fix: manage the output file with `with` and keep it open until the
    # parser threads have actually exited (the original closed it while they
    # could still be running).
    with open('./糗事百科.txt', mode='a', encoding='utf-8') as fp:
        # Start 3 parser threads sharing the single output file handle.
        parsers = [ParseThread(queue_html, i, fp) for i in range(3)]
        for t in parsers:
            t.start()

        # Block until every queued task has been marked done:
        # first all downloads, then all parses.
        queue_url.join()
        queue_html.join()

        # Signal the parser threads (which poll exitFlag) to stop,
        # then wait for them before the file is closed.
        exitFlag = True
        for t in parsers:
            t.join()
# posted @ 2019-02-21 22:01 石桥浪子 阅读(...) 评论(...) 编辑 收藏