import requests
from lxml import etree
import os
import threading
from queue import Queue
import time


class Meizitu:
    def __init__(self):
        self.url_temp = "https://www.meizitu.com/a/xinggan.html"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36"}
        self.html_zhu_queue = Queue()   # index-page HTML
        self.url_zhu_queue = Queue()    # (defined but never used)
        self.url_zhi_queue = Queue()    # album-page urls
        self.html_zhi_queue = Queue()   # album-page HTML
        self.url_tu_queue = Queue()     # image urls
    def parse(self):  # fetch the index page
        try:
            res = requests.get(self.url_temp, headers=self.headers)
            zhu_html = res.content
            self.html_zhu_queue.put(zhu_html)
            print("got the index page")
            return zhu_html
        except requests.RequestException:
            print("index request failed")
    def get_page_url(self):  # extract the album-page urls from the index page
        zhu_html = self.html_zhu_queue.get()
        html = etree.HTML(zhu_html)
        tent_div_list = html.xpath("//li[@class='wp-item']//div[@class='pic']/a[@target='_blank']/@href")
        print("got album-page urls:", tent_div_list)
        self.url_zhi_queue.put(tent_div_list)  # the whole list goes in as one queue item
        self.html_zhu_queue.task_done()
    def parse_zhi(self):  # fetch each album page's HTML
        zhi_url = self.url_zhi_queue.get()
        for url in zhi_url:
            print(url)
            res = requests.get(url, headers=self.headers)
            self.html_zhi_queue.put(res.content)
        self.url_zhi_queue.task_done()  # one .task_done for the one .get above
    def get_png_url(self):  # extract the image urls from an album page
        while True:
            zhi_html = self.html_zhi_queue.get()
            html = etree.HTML(zhi_html)
            # the file name is taken later from the tail of each url with split()
            div_list = html.xpath("//div[@id='picture']/p/img/@src")
            print("got image urls")
            self.url_tu_queue.put(div_list)
            self.html_zhi_queue.task_done()
    def save(self):  # download and save the images
        root_path = r'D:\picture\zhl'  # directory to save into
        time.sleep(0.5)
        while True:
            url_list = self.url_tu_queue.get()
            for url in url_list:
                print(url)
                img_name = url.split('/')[-1]  # file name = tail of the url
                img_path = os.path.join(root_path, img_name)
                try:
                    if not os.path.exists(root_path):
                        os.makedirs(root_path)
                    if not os.path.exists(img_path):
                        r = requests.get(url, headers=self.headers)
                        with open(img_path, 'wb') as f:
                            f.write(r.content)  # the with block closes the file itself
                        print("file saved")
                    else:
                        print("file already exists")
                except (requests.RequestException, OSError):
                    print("saving failed")
            self.url_tu_queue.task_done()
    def run(self):
        thread_list = []
        zhu_html = threading.Thread(target=self.parse)
        thread_list.append(zhu_html)
        zhi_url = threading.Thread(target=self.get_page_url)
        thread_list.append(zhi_url)
        for i in range(10):  # more threads for the slow request stage
            zhi_html = threading.Thread(target=self.parse_zhi)
            thread_list.append(zhi_html)
            tu_url = threading.Thread(target=self.get_png_url)
            thread_list.append(tu_url)
        for i in range(20):  # and even more for downloading
            t_save = threading.Thread(target=self.save)
            thread_list.append(t_save)
        for t in thread_list:
            # t.setDaemon(True)  # daemon: the child thread is unimportant, so it ends when the main thread ends
            t.start()
        # for tt in [self.html_zhu_queue, self.url_tu_queue, self.html_zhi_queue, self.url_zhi_queue]:
        #     tt.join()
if __name__ == '__main__':
    mm = Meizitu()
    mm.run()
Key points

1. The overall flow should be laid out clearly in the run() function.
2. This site responds to requests very slowly.
3. The Queue module (see the minimal sketch after this list):
    1. Define the queues first.
    2. Use the three methods .put, .get and .task_done.
    3. while True makes the worker function run over and over.
    4. Every .get must be matched by a .task_done at the end of the function; it tells the queue that the fetched item has been fully processed.
    5. Use .join to make the main thread block until the queue's work is finished (the queue counts the work as finished once every item that was put has been matched by a .task_done). One thing to watch out for: if the pages respond slowly and nothing has been put into the queue yet, .join can misjudge and conclude the work is already done.
4. The threading module (see the daemon-thread sketch below):
    1. Add threads in a loop, giving the request and download stages more threads.
    2. while True makes the loop run endlessly, so the program never stops on its own; .setDaemon(True) marks a child thread as a daemon, meaning it is unimportant: when the main thread ends, the child thread ends with it.
5. Data types (see the bytes-vs-str example below):
    requests.get(...).content is of type bytes.
    When exactly is .content.decode() needed?
    I still don't really understand the data types;
    I'll make up for that soon.
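A minimal sketch of the Queue pattern from point 3, assuming a single producer that fills the queue up front (work_q and worker are illustrative names, not part of the spider above):

from queue import Queue
import threading

work_q = Queue()

def worker():
    while True:                # 3.3: loop so the thread handles item after item
        item = work_q.get()    # blocks until an item is available
        print("processing", item)
        work_q.task_done()     # 3.4: every .get is matched by a .task_done

for i in range(5):
    work_q.put(i)              # 3.1/3.2: fill the queue before joining

t = threading.Thread(target=worker, daemon=True)
t.start()
work_q.join()                  # 3.5: blocks until every put item was task_done
print("all queue tasks finished")

If the .put calls instead happened in another, slow thread, .join could run while the queue is still empty and return immediately, which is exactly the misjudgment point 3.5 warns about.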
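A minimal sketch for point 4.2, showing why a while True worker has to be a daemon thread (endless_worker is an illustrative name):

import threading
import time

def endless_worker():
    while True:        # would never finish on its own
        print("working...")
        time.sleep(0.2)

t = threading.Thread(target=endless_worker)
t.daemon = True        # the modern spelling of t.setDaemon(True)
t.start()

time.sleep(1)          # the main thread does its own work...
print("main thread exits here, and the daemon worker dies with it")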
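And a small example for point 5 (example.com is just a stand-in url): .content is raw bytes, which is exactly what writing images with open(path, 'wb') needs and what etree.HTML also accepts, so the spider above never has to decode; .content.decode() (or the shortcut .text) is only needed when you want to handle the response as a str yourself.

import requests

res = requests.get("https://example.com")
print(type(res.content))                   # <class 'bytes'>: raw bytes off the wire
print(type(res.text))                      # <class 'str'>: requests' decoded guess
print(type(res.content.decode("utf-8")))   # <class 'str'>: explicit decode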