爬虫7-多线程爬取壁纸族
# -*- coding: utf-8 -*-
"""
@Time : 2022/3/22 16:57
@Author : Andrew
@File : 多线程应用.py

Download wallpapers from bizhizu.com list pages with a thread pool.
"""
# 1. How to extract the data of a single list page.
# 2. Set up a thread pool so that several list pages are scraped at once.
import os
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
from urllib.parse import urljoin

import requests
from lxml import etree

# Directory the downloaded images are written to (relative to the CWD).
SAVE_DIR = "./多线程爬的壁纸族/"

# Seconds to wait on the server before a request is abandoned; without a
# timeout a stalled connection would block its worker thread indefinitely.
REQUEST_TIMEOUT = 10

# Characters that are path separators or are rejected in file names on
# common file systems; each one is replaced by "_" in saved file names.
_INVALID_FILENAME_CHARS = str.maketrans({c: "_" for c in '\\/:*?"<>|'})


def _safe_filename(name):
    """Return *name* with file-system-hostile characters replaced by "_"."""
    return name.translate(_INVALID_FILENAME_CHARS)


def _fetch_html(url):
    """Fetch *url* and return its body parsed by ``etree.HTML``.

    Raises ``requests.RequestException`` on network errors, timeouts and
    HTTP error statuses.
    """
    with requests.get(url, timeout=REQUEST_TIMEOUT) as resp:
        resp.raise_for_status()
        return etree.HTML(resp.text)


def _download_wallpaper(li, page_url):
    """Download the wallpaper referenced by one ``<li>`` of a list page.

    :param li: the ``<li>`` element taken from the list page.
    :param page_url: URL of the list page, used to resolve relative links.
    :return: the wallpaper title (the ``title`` attribute of the link).
    :raises IndexError: if an expected node is missing from the markup.
    :raises requests.RequestException: if a page or the image cannot be fetched.
    :raises OSError: if the image cannot be written to disk.
    """
    href = urljoin(page_url, li.xpath("./a/@href")[0])
    img_name = li.xpath("./a/@title")[0]

    # The detail page holds the link to the full-size image.
    detail = _fetch_html(href)
    src = urljoin(
        href,
        detail.xpath("/html/body/div[4]/div[2]/div[2]/a[1]/img/@src")[0],
    )

    # Download the image; check the status so that an HTTP error page is
    # never saved as if it were an image.
    with requests.get(src, timeout=REQUEST_TIMEOUT) as img:
        img.raise_for_status()
        content = img.content

    # The file extension is whatever follows the last "." of the image URL.
    target = SAVE_DIR + _safe_filename(img_name) + "." + src.split(".")[-1]
    with open(target, mode="wb") as f:
        f.write(content)
    return img_name


def downloadOnePage(url):
    """Download every wallpaper linked from the list page at *url*.

    Images are saved into ``SAVE_DIR`` (created if missing) and named after
    the wallpaper title. A wallpaper that fails to download is reported on
    stdout and skipped, so the remaining items of the page are still
    processed.

    :param url: URL of one list page.
    :raises requests.RequestException: if the list page cannot be fetched.
    :raises IndexError: if the list container is missing from the page.
    """
    # exist_ok=True keeps this safe when several workers get here together.
    os.makedirs(SAVE_DIR, exist_ok=True)

    # Fetch and parse the list page.
    html = _fetch_html(url)
    ul = html.xpath("/html/body/div[4]/div[5]/ul")[0]

    for li in ul.xpath("./li"):
        try:
            img_name = _download_wallpaper(li, url)
        except (requests.RequestException, IndexError, OSError) as exc:
            # "下载失败" = "download failed"; matches the language of the
            # success message below.
            print(f"{url}:下载失败!! ({exc!r})")
            continue
        print(img_name + ":下载完毕!!")


def main():
    """Scrape list pages 1-40 concurrently and report the pages that failed."""
    # Sequential version (inefficient):
    # for i in range(1, 41):
    #     url = f"https://www.bizhizu.com/sj/fengguang/list-{i}.html"
    #     downloadOnePage(url)
    page_urls = [
        f"https://www.bizhizu.com/sj/fengguang/list-{i}.html"
        for i in range(1, 41)
    ]
    with ThreadPoolExecutor(41) as pool:  # multithreaded
        futures = {
            pool.submit(downloadOnePage, page_url): page_url
            for page_url in page_urls
        }
        # An exception raised inside a worker is stored on its future and
        # is lost unless it is read back, so inspect every future.
        for future in as_completed(futures):
            exc = future.exception()
            if exc is not None:
                # "抓取失败" = "scraping failed".
                print(f"{futures[future]}:抓取失败!! ({exc!r})")


if __name__ == "__main__":
    main()