import threading
import time
from queue import Queue
import requests
from lxml import etree
class QiuBaiSpider(object):
    """Multi-threaded scraper for qiushibaike.com 'hot' listing pages.

    Three queues form a pipeline: url_queue -> response_queue -> data_queue.
    Daemon worker threads service each stage; Queue.join() on all three
    queues provides shutdown once every item has been processed.
    """

    def __init__(self):
        # Page-number template for the 'hot' listing pages.
        self.base_url = 'https://www.qiushibaike.com/hot/page/{}/'
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
        self.data = 0                   # number of headline texts extracted
        self.url_queue = Queue()        # URLs waiting to be fetched
        self.response_queue = Queue()   # HTTP responses waiting to be parsed
        self.data_queue = Queue()       # extracted texts waiting to be stored
        self.count = 0                  # number of responses parsed

    def get_url_list(self):
        """Fill url_queue with the 12 listing-page URLs (pages 1..12)."""
        for page in range(1, 13):
            self.url_queue.put(self.base_url.format(page))

    def send_request(self):
        """Worker: fetch each queued URL and push the response downstream."""
        while True:
            url = self.url_queue.get()
            try:
                # A timeout keeps a hung connection from stalling this
                # worker (and the final url_queue.join()) forever.
                response = requests.get(url, headers=self.headers, timeout=10)
                self.response_queue.put(response)
            finally:
                # task_done() must run even if the request raises, or
                # url_queue.join() would block forever on the lost item.
                self.url_queue.task_done()

    def analysis_data(self):
        """Worker: parse each response and extract the post headlines."""
        while True:
            response = self.response_queue.get()
            try:
                self.count += 1
                html = etree.HTML(response.content)
                for div in html.xpath('//*[@id="content-left"]/div'):
                    text = div.xpath('.//h2/text()')[0]
                    self.data += 1
                    self.data_queue.put(text)
            finally:
                self.response_queue.task_done()

    def write_file(self):
        """Worker: drain data_queue.

        NOTE(review): despite the name, nothing is persisted anywhere —
        items are consumed and discarded, exactly as in the original.
        Add real storage here if output is actually wanted.
        """
        while True:
            self.data_queue.get()
            self.data_queue.task_done()

    def _start(self):
        # Fill url_queue synchronously BEFORE joining the queues.  In the
        # original this ran in a daemon thread, so the main thread could
        # reach url_queue.join() while the queue was still empty; join()
        # would return immediately and the program exited without ever
        # scraping a page.
        self.get_url_list()

        workers = []
        # Two fetcher threads overlap network waits.
        for _ in range(2):
            workers.append(threading.Thread(target=self.send_request))
        workers.append(threading.Thread(target=self.analysis_data))
        workers.append(threading.Thread(target=self.write_file))
        print(workers)

        for th in workers:
            # Daemon threads let the process exit once the queues drain.
            # (th.daemon replaces setDaemon(), deprecated since 3.10.)
            th.daemon = True
            th.start()

        # Block until every queued item has been marked task_done().
        for q in (self.url_queue, self.response_queue, self.data_queue):
            q.join()

    def run(self):
        """Run the pipeline, then report elapsed time and item count."""
        start = time.time()
        self._start()
        end = time.time()
        print(end - start, "结束时间")
        print(self.data)
if __name__ == '__main__':
    # Entry point: build the spider and run the scraping pipeline.
    spider = QiuBaiSpider()
    spider.run()