爬虫7-多线程爬取壁纸族

# -*- coding: utf-8 -*-
"""
@Time    :  2022/3/22 16:57
@Author  : Andrew
@File    : 多线程应用.py
"""
# 1.如何提取单个页面的数据
# 2.设置线程池，多个页面同时抓取
from concurrent.futures.thread import ThreadPoolExecutor

import requests
from lxml import etree


def downloadOnePage(url):
    """Download every wallpaper linked from one listing page.

    Fetches the listing page at ``url``, follows the link of each ``<li>``
    entry to its detail page, reads the image address from that page and
    writes the image bytes to ``./多线程爬的壁纸族/`` as
    ``<title>.<last dot-separated part of the image address>``.

    Args:
        url: Address of a listing page.

    Returns:
        None. Images are written to disk and progress is printed.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched. Failures on individual entries are printed and skipped
            so one bad entry does not abort the rest of the page.
    """
    import os
    import re

    out_dir = "./多线程爬的壁纸族/"
    timeout = 30  # seconds; without it a stalled connection blocks the thread forever

    # The directory may not exist yet; exist_ok keeps this safe when several
    # worker threads reach this line at the same time.
    os.makedirs(out_dir, exist_ok=True)

    # Fetch the listing page; the context manager releases the connection
    # even if parsing fails.
    with requests.get(url, timeout=timeout) as resp:
        html = etree.HTML(resp.text)

    # Defensive guard: the parser may not yield a tree for an empty body.
    ul_nodes = html.xpath("/html/body/div[4]/div[5]/ul") if html is not None else []
    if not ul_nodes:
        print(f"{url}: wallpaper list not found, skipping page")
        return

    for li in ul_nodes[0].xpath("./li"):
        hrefs = li.xpath("./a/@href")
        titles = li.xpath("./a/@title")
        if not hrefs or not titles:
            # Entry without a link or a title: nothing to download.
            continue
        href = hrefs[0]
        imgName = titles[0]

        try:
            # Detail page holds the address of the actual image.
            with requests.get(href, timeout=timeout) as resp2:
                html2 = etree.HTML(resp2.text)
            srcs = (
                html2.xpath("/html/body/div[4]/div[2]/div[2]/a[1]/img/@src")
                if html2 is not None
                else []
            )
            if not srcs:
                print(f"{href}: image address not found, skipping")
                continue
            src = srcs[0]

            # Download the image bytes; reject HTTP error responses so an
            # error page is not saved as if it were an image.
            with requests.get(src, timeout=timeout) as img:
                img.raise_for_status()
                data = img.content
        except requests.RequestException as exc:
            print(f"{href}: download failed ({exc!r}), skipping")
            continue

        # Titles come from the page; replace characters that are not valid
        # in file names on common file systems.
        safe_name = re.sub(r'[\\/:*?"<>|]', "_", imgName)
        with open(out_dir + safe_name + "." + src.split(".")[-1], mode="wb") as f:
            f.write(data)
        print(imgName + "：下载完毕！！")


if __name__ == "__main__":
    # Listing pages 1..40. Downloading them one after another is slow, so
    # each page is handed to its own worker thread.
    page_urls = [
        f"https://www.bizhizu.com/sj/fengguang/list-{i}.html" for i in range(1, 41)
    ]

    with ThreadPoolExecutor(len(page_urls)) as pool:
        # Keep the futures: an exception raised inside a worker is stored on
        # its future and is lost if the future is thrown away.
        futures = {pool.submit(downloadOnePage, page): page for page in page_urls}

    # Leaving the `with` block waits for all workers, so every future is
    # finished here and exception() returns immediately.
    for future, page in futures.items():
        error = future.exception()
        if error is not None:
            print(f"{page}: failed with {error!r}")

posted @ 2022-03-22 19:21 乔十六阅读(51) 评论(0) 收藏举报

刷新页面返回顶部

qiao-16

爬虫7-多线程爬取壁纸族

公告