import threading
import time
from multiprocessing.dummy import Pool
from queue import Empty, Queue

import requests
from lxml import etree
class QiuBaiSpider(object):
    """Crawl the qiushibaike "hot" pages with a small thread pool.

    Twelve page URLs are queued up front; four pool workers pull URLs
    from the queue, fetch and parse each page, and re-schedule
    themselves through an ``apply_async`` callback until the queue is
    drained. ``run()`` blocks until every enqueued URL is processed.
    """

    # 1. Crawl target and request headers.
    def __init__(self):
        self.base_url = 'https://www.qiushibaike.com/hot/page/{}/'
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
        self.data = 0                   # items extracted so far
        self.pool = Pool(processes=4)   # thread pool (multiprocessing.dummy)
        self.url_queue = Queue()        # URLs waiting to be fetched
        self.count = 0                  # pages successfully parsed
        self.request = 0                # URLs enqueued
        self.response = 0               # URLs fully processed
        self.is_finish = False
        # Guards the counters above: "+=" is read-modify-write and not
        # atomic when several pool threads update the same attribute.
        self._lock = threading.Lock()

    # 2. Build the list of page URLs to crawl.
    def get_url_list(self):
        for page in range(1, 13):
            self.url_queue.put(self.base_url.format(page))
            with self._lock:
                self.request += 1

    # 3. Fetch one page and return the response object.
    def send_request(self, url):
        print(url)
        # A timeout keeps a dead/slow server from hanging a worker (and
        # therefore the async_start poll loop) forever.
        response = requests.get(url, headers=self.headers, timeout=10)
        return response

    # 4. Parse the response body and extract item titles.
    def analysis_data(self, data):
        html_data = etree.HTML(data.content)
        with self._lock:
            self.count += 1
        div_list = html_data.xpath("""//*[@id="content-left"]/div""")
        for div in div_list:
            # Some entries may lack an <h2>; skip them instead of
            # crashing the worker with an IndexError.
            titles = div.xpath('.//h2/text()')
            if not titles:
                continue
            with self._lock:
                self.data += 1
            self.write_file(titles[0])

    # 5. Store an item (currently just prints it).
    def write_file(self, data):
        print(data)

    def _start(self):
        # Non-blocking get: a worker scheduled after the queue has
        # drained must return instead of blocking forever on get().
        try:
            url = self.url_queue.get_nowait()
        except Empty:
            return
        data = self.send_request(url)
        self.analysis_data(data)
        with self._lock:
            self.response += 1

    def _callback(self, temp):
        # Re-schedule only while work remains; unconditional
        # re-scheduling would pile up workers on an empty queue.
        if not self.is_finish and not self.url_queue.empty():
            self.pool.apply_async(self._start, callback=self._callback)

    def async_start(self):
        self.get_url_list()
        for _ in range(4):
            self.pool.apply_async(self._start, callback=self._callback)
        # Poll until every enqueued URL has been processed.
        while True:
            time.sleep(0.0001)
            if self.response >= self.request:
                self.is_finish = True
                break
        # Release the pool threads once crawling is done.
        self.pool.close()
        self.pool.join()

    def run(self):
        start = time.time()
        self.async_start()
        end = time.time()
        print(end - start, "结束时间")
        print(self.data)
def main():
    """Entry point: build the spider and crawl all hot pages."""
    spider = QiuBaiSpider()
    spider.run()


if __name__ == '__main__':
    main()