# coding: utf-8
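"""Multithreaded Qiushibaike (糗事百科) list-page spider.

get_url_list enqueues 13 list-page URLs; four fetcher threads download
them into html_queue, four parser threads extract items into
content_queue, and a single writer thread appends the results to disk.
queue.Queue provides the locking, so the workers need no explicit
synchronization.
"""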
import requests
import json
from lxml import etree
import threading
from queue import Queue
class QiushiSpider(object):
    def __init__(self):
        self.url_tmp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
        self.pre_url = "https://www.qiushibaike.com"
        # Three queues chain the pipeline stages: URLs to fetch,
        # raw HTML to parse, and parsed items to save.
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()
    def get_url_list(self):
        """Producer: enqueue the 13 list-page URLs."""
        for i in range(1, 14):
            self.url_queue.put(self.url_tmp.format(i))
        print(self.url_queue.qsize())
    def parse_url(self):
        """Worker: take a URL from url_queue, fetch it, push the HTML to html_queue."""
        while True:
            url = self.url_queue.get()
            print(url)
            # headers must be passed as a keyword argument; the second
            # positional argument of requests.get is params, not headers.
            response = requests.get(url, headers=self.header)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()
            print("url_queue: one URL done")
    def get_content_list(self):
        """Worker: parse one page of HTML into a list of item dicts."""
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            li_list = html.xpath("//li[contains(@class,'item typs_')]")
            content_list = []
            for li in li_list:
                item = {}
                # The thumbnail src is protocol-relative, so prepend "https:".
                img_src = li.xpath(".//a[contains(@class,'recmd-left')]/img/@src")
                item["img_url"] = "https:" + img_src[0] if img_src else None
                for div in li.xpath(".//div[@class='recmd-right']"):
                    text = div.xpath("./a/text()")
                    item["text"] = text[0] if text else None
                    href = div.xpath("./a/@href")
                    item["a_href"] = self.pre_url + href[0] if href else None
                    # Test the text() result itself: a span can exist yet be
                    # empty, which would otherwise raise an IndexError.
                    smile = div.xpath(".//div[@class='recmd-num']/span[1]/text()")
                    item["smile_num"] = smile[0] if smile else None
                    comment = div.xpath(".//div[@class='recmd-num']/span[4]/text()")
                    item["comment_num"] = comment[0] if comment else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()
            print("html_queue: one page done")
    def save_content(self):
        """Writer: append each batch of items to disk as pretty-printed JSON."""
        while True:
            content = self.content_queue.get()
            with open("糗百多线程.txt", 'a', encoding='utf-8') as f:
                f.write(json.dumps(content, ensure_ascii=False, indent=2))
                f.write("\n")
            self.content_queue.task_done()
    def run(self):
        """Start the worker threads, then block until all three queues drain."""
        t_list = []
        self.get_url_list()
        for i in range(4):
            p = threading.Thread(target=self.parse_url)
            t_list.append(p)
        print("parse_url threads created")
        for i in range(4):
            g = threading.Thread(target=self.get_content_list)
            t_list.append(g)
        print("get_content_list threads created")
        # A single writer thread keeps file access serialized.
        s = threading.Thread(target=self.save_content)
        t_list.append(s)
        for t in t_list:
            # Daemon threads are killed when the main thread exits, so the
            # infinite while-True workers do not keep the process alive.
            t.daemon = True  # setDaemon() is deprecated since Python 3.10
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()  # block until every task put on the queue is marked done
        print("main thread done")
if __name__ == "__main__":
    q = QiushiSpider()
    q.run()
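# Requires requests and lxml (e.g. pip install requests lxml); output is
# appended to 糗百多线程.txt in the working directory.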