# 多进程 + 线程池 + 爬取图片 + 队列 (multiprocessing + thread pool + image scraping + queue)

"""
分析:
进程1. 从主页面中解析出详情页的url,从详情页中提取到图片的下载地址

进程2. 把拿到的地址,进行下载

队列:可以进行进程之间的通信
"""
import os
import re
from concurrent.futures import ThreadPoolExecutor  # thread pool for downloads
from multiprocessing import Process, Queue  # Queue: inter-process channel
from urllib import parse

import requests
from lxml import etree
def get_img_src(q):
    """Producer: crawl the gallery index page, follow every detail page,
    and put each image URL onto the queue *q*.

    When all pages have been processed, puts the sentinel string "完事了"
    on the queue exactly once so the consumer process knows to stop.
    """
    url = "https://www.tutu555.net/a/cn/index.html"
    resp = requests.get(url)
    resp.encoding = "gb2312"  # site serves GB2312; set encoding before reading .text
    tree = etree.HTML(resp.text)
    # Links to the detail pages (view page source with word-wrap to find the structure).
    href_list = tree.xpath("//ul[@class='clearfix']/li/a/@href")
    for href in href_list:
        # Scraped hrefs may be relative — resolve against the index URL.
        # (urllib.parse was imported for exactly this purpose.)
        child_url = parse.urljoin(url, href)
        child_resp = requests.get(child_url)
        child_resp.encoding = "gb2312"  # same encoding as the index page
        child_tree = etree.HTML(child_resp.text)
        src_list = child_tree.xpath('//div[@class="content"]/img/@src')
        for src in src_list:
            # Image srcs may be relative too; queue an absolute, downloadable URL.
            q.put(parse.urljoin(child_url, src))
            print(src)
            print(f"{src}, 被塞进队列")
    # Sentinel AFTER the whole crawl — one per run, or the consumer quits early.
    q.put("完事了")

def download(url):
    """Download one image from *url* into ./img/, named after the last
    path segment of the URL.

    Runs inside a worker thread of the consumer's thread pool.
    """
    print("开始下载!!!", url)
    # File name: last component of the URL path (split on "/").
    name = url.split("/")[-1]
    # Fix: create the target directory up front — open() would otherwise
    # raise FileNotFoundError on the first run when ./img does not exist.
    os.makedirs("./img", exist_ok=True)
    with open("./img/" + name, mode="wb") as f:
        resp = requests.get(url)  # fetch the raw image bytes
        f.write(resp.content)
    print("下载完毕")

def download_img(q):
    """Consumer: pull image URLs off the queue *q* and hand each one to a
    pool of 10 worker threads, until the sentinel "完事了" arrives."""
    with ThreadPoolExecutor(10) as pool:
        while True:  # total count is unknown, so loop until the sentinel
            item = q.get()  # blocks until the producer supplies something
            if item == "完事了":
                break
            pool.submit(download, item)

if __name__ == '__main__':
    q = Queue()  # inter-process queue: producer fills it, consumer drains it
    p1 = Process(target=get_img_src, args=(q,))   # producer: pushes URLs
    p2 = Process(target=download_img, args=(q,))  # consumer: downloads them
    p1.start()
    p2.start()
    # Fix: wait for both children so the parent doesn't exit while the
    # crawl/download is still in progress.
    p1.join()
    p2.join()
# posted @ 2023-07-09 18:40  严永富  阅读(145)  评论(0)  (blog footer, kept as a comment)