# 多进程 + 线程池 + 爬取图片 + 队列 (multiprocessing + thread pool + image scraping + queue)

"""
分析:
进程1. 从主页面中解析出详情页的url,从详情页中提取到图片的下载地址

进程2. 把拿到的地址,进行下载

队列:可以进行进程之间的通信
"""
import os
import re
from concurrent.futures import ThreadPoolExecutor  # thread pool for downloads
from multiprocessing import Process, Queue  # Queue: inter-process channel
from urllib import parse

import requests
from lxml import etree
def get_img_src(q):
    """Producer: crawl the gallery index page, follow every detail page,
    and put each image URL onto the queue *q*.

    When all pages have been processed, puts the sentinel string "完事了"
    on the queue exactly once so the consumer process knows to stop.
    """
    url = "https://www.tutu555.net/a/cn/index.html"
    resp = requests.get(url)
    resp.encoding = "gb2312"  # site serves GB2312; set encoding before reading .text
    tree = etree.HTML(resp.text)
    # Links to the detail pages (view page source with word-wrap to find the structure).
    href_list = tree.xpath("//ul[@class='clearfix']/li/a/@href")
    for href in href_list:
        # Scraped hrefs may be relative — resolve against the index URL.
        # (urllib.parse was imported for exactly this purpose.)
        child_url = parse.urljoin(url, href)
        child_resp = requests.get(child_url)
        child_resp.encoding = "gb2312"  # same encoding as the index page
        child_tree = etree.HTML(child_resp.text)
        src_list = child_tree.xpath('//div[@class="content"]/img/@src')
        for src in src_list:
            # Image srcs may be relative too; queue an absolute, downloadable URL.
            q.put(parse.urljoin(child_url, src))
            print(src)
            print(f"{src}, 被塞进队列")
    # Sentinel AFTER the whole crawl — one per run, or the consumer quits early.
    q.put("完事了")

def download(url):
    """Download one image from *url* into ./img/, named after the last
    path segment of the URL.

    Runs inside a worker thread of the consumer's thread pool.
    """
    print("开始下载!!!", url)
    # File name: last component of the URL path (split on "/").
    name = url.split("/")[-1]
    # Fix: create the target directory up front — open() would otherwise
    # raise FileNotFoundError on the first run when ./img does not exist.
    os.makedirs("./img", exist_ok=True)
    with open("./img/" + name, mode="wb") as f:
        resp = requests.get(url)  # fetch the raw image bytes
        f.write(resp.content)
    print("下载完毕")

def download_img(q):
    """Consumer: pull image URLs off the queue *q* and hand each one to a
    pool of 10 worker threads, until the sentinel "完事了" arrives."""
    with ThreadPoolExecutor(10) as pool:
        while True:  # total count is unknown, so loop until the sentinel
            item = q.get()  # blocks until the producer supplies something
            if item == "完事了":
                break
            pool.submit(download, item)

if __name__ == '__main__':
    q = Queue()  # inter-process queue: producer fills it, consumer drains it
    p1 = Process(target=get_img_src, args=(q,))   # producer: pushes URLs
    p2 = Process(target=download_img, args=(q,))  # consumer: downloads them
    p1.start()
    p2.start()
    # Fix: wait for both children so the parent doesn't exit while the
    # crawl/download is still in progress.
    p1.join()
    p2.join()
# posted @ 2023-07-09 18:40  严永富  阅读(145)  评论(0)  (blog footer, kept as a comment)