Python 通过 asyncio 协程来下载图片

Python 通过 asyncio 协程的方式来下载图片

 

"""
以协程的方式下载图片
"""
import os
import sys
import asyncio
import aiohttp
import traceback

sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..' + '/..'))
from db_models import OverseasProductDetail
from urllib.parse import urlparse

# Root directory under which one folder per keyword/country/product is created.
dst_base_path = "/backup/bak38/t_dataset/overseas_products_data/accessories_categories"

# Caps the number of downloads that may run at the same time.
# NOTE(review): this object is created at import time, before asyncio.run()
# starts its event loop. On Python versions older than 3.10 a Semaphore may
# bind to the loop that is current at construction -- confirm the target
# interpreter version, or create it inside main().
semaphore = asyncio.Semaphore(20)


def get_filename_from_url(url):
    """Return the final segment of the path component of ``url``.

    Only the path is inspected, so a query string or fragment never ends up
    in the result. A path that is empty or ends in ``/`` yields ``""``.
    """
    url_path = urlparse(url).path
    _, tail = os.path.split(url_path)
    return tail


async def download_img(session, url, save_path, file_name):
    """Download ``url`` into ``save_path/file_name`` (images or videos).

    The module-level ``semaphore`` is held for the whole operation, which
    bounds how many downloads run at once. Failures are reported with
    ``print`` and never raised, so one bad URL does not abort the caller's
    ``asyncio.gather``.

    Args:
        session: Object whose ``get(url, timeout=...)`` returns an async
            context manager yielding a response with ``status`` and
            ``read()`` (an ``aiohttp.ClientSession`` in this script).
        url: Address to fetch.
        save_path: Target directory; created when missing.
        file_name: Name of the file inside ``save_path``.

    Returns:
        ``True`` when the target path already existed or was written,
        ``False`` when the download failed. Existing callers that ignore
        the result keep working unchanged.
    """
    try:
        async with semaphore:
            os.makedirs(save_path, exist_ok=True)

            save_img_path = os.path.join(save_path, file_name)
            if os.path.exists(save_img_path):
                # Already fetched by an earlier run; nothing to do.
                return True

            async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                if resp.status != 200:
                    # Report the rejection instead of silently skipping it.
                    print(f"下载失败: {url} 状态码: {resp.status}")
                    return False
                content = await resp.read()

            # Write to a sibling temp file and rename it into place, so an
            # interrupted write never leaves a truncated file that the
            # existence check above would treat as complete on the next run.
            tmp_path = save_img_path + ".part"
            try:
                with open(tmp_path, "wb") as f:
                    f.write(content)
                os.replace(tmp_path, save_img_path)
            except OSError:
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
                raise
            return True
    except Exception as e:
        # repr() keeps the exception type visible; some exceptions (e.g. a
        # timeout) have an empty str() and would otherwise print nothing.
        print(f"下载失败: {url} 错误: {e!r}")
        return False


async def process_product(session, overseas_product_detail):
    """Save one product's text fields and download all of its media files.

    Creates ``dst_base_path/<keyword>|<chinese_name>/<country>/<product_id>``,
    writes ``product_title.txt`` and ``about_product.txt`` there if they are
    not present yet, downloads every photo and video concurrently through
    ``download_img``, and finally flags the record through
    ``OverseasProductDetail.set_download_img``.

    Records without photos or without a detail dict are skipped. Any
    exception is printed together with its traceback and swallowed, so a
    single bad record does not cancel the rest of the batch.

    Args:
        session: HTTP session handed through to ``download_img``.
        overseas_product_detail: Record exposing ``id``, ``product_id``,
            ``product_photos``, ``product_detail``, ``keyword`` and
            ``chinese_name``.
    """
    try:
        record_id = overseas_product_detail.id
        product_id = overseas_product_detail.product_id
        product_photos = overseas_product_detail.product_photos
        product_detail = overseas_product_detail.product_detail
        if not product_photos or not product_detail:
            return

        keyword = overseas_product_detail.keyword
        chinese_name = overseas_product_detail.chinese_name

        product_title = product_detail.get("product_title")
        about_product = product_detail.get("about_product")
        # A key that is present but None would break os.path.join below.
        country = product_detail.get("country") or ""
        # A product without videos must still get its photos; iterating over
        # None would raise before any download task was started.
        product_videos = product_detail.get("product_videos") or []

        dst_path = os.path.join(dst_base_path, f"{keyword}|{chinese_name}", country, product_id)
        os.makedirs(dst_path, exist_ok=True)

        text_files = (
            ("product_title.txt", product_title),
            ("about_product.txt", about_product),
        )
        for text_name, text in text_files:
            text_path = os.path.join(dst_path, text_name)
            if not os.path.exists(text_path):
                # Explicit encoding: the text may be non-ASCII and must not
                # depend on the platform's default locale.
                with open(text_path, "w", encoding="utf-8") as f:
                    f.write(f"{text}")

        media_urls = list(product_photos)
        media_urls.extend(video.get("video_url") for video in product_videos)

        # Entries without a URL are skipped instead of producing a task
        # that can only fail.
        tasks = [
            download_img(session, media_url, dst_path, get_filename_from_url(media_url))
            for media_url in media_urls
            if media_url
        ]

        await asyncio.gather(*tasks)

        # NOTE(review): download_img reports its own errors and does not
        # raise, so the record is flagged even when some files failed to
        # download -- confirm that this is the intended behaviour.
        OverseasProductDetail.set_download_img(overseas_product_detail_id=record_id, download_img=1)

        print(f"{dst_path} 下载完成")

    except Exception as e:
        print(e)
        traceback.print_exc()


async def main():
    """Page through the product records and process each page concurrently.

    Fetches pages of 30 records via
    ``OverseasProductDetail.get_products_by_page`` starting at page 1, runs
    ``process_product`` for every record of a page at the same time, and
    stops at the first empty page. A single ``aiohttp.ClientSession`` is
    shared by all downloads.
    """
    page = 1

    async with aiohttp.ClientSession() as session:
        while True:
            # NOTE(review): this looks like a synchronous call made from
            # inside the event loop -- confirm it is fast enough not to
            # stall downloads, which are all awaited before the next page.
            overseas_product_details = OverseasProductDetail.get_products_by_page(page=page, page_size=30)
            if not overseas_product_details:
                break

            await asyncio.gather(
                *(process_product(session, detail) for detail in overseas_product_details)
            )

            # NOTE(review): if get_products_by_page filters out records that
            # process_product has already flagged, advancing the page number
            # would skip unprocessed rows -- verify against the query.
            page += 1


# Script entry point: drive main() to completion when the file is run directly.
if __name__ == '__main__':
    asyncio.run(main())

 

posted on 2025-05-19 15:31  星河赵  阅读(28)  评论(0)    收藏  举报

导航