# -*- coding: utf-8 -*-
"""
@Time : 2022/3/22 16:57
@Author : Andrew
@File : 多线程应用.py
"""
# 1.如何提取单个页面的数据
# 2.设置线程池,多个页面同时抓取
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from lxml import etree
def downloadOnePage(url, timeout=10):
    """Download every wallpaper linked from one listing page.

    The listing page at ``url`` is fetched, each ``<li>/<a>`` entry of its
    wallpaper list is followed to the detail page, the image address is read
    from that page, and the image is saved under ``./多线程爬的壁纸族/`` as
    ``<title>.<extension>``.

    Args:
        url: Address of the listing page to scrape.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block the worker
            thread indefinitely.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched (including a non-2xx status).
        IndexError: If the listing page does not contain the expected
            ``<ul>`` element.

    Failures affecting a single wallpaper (missing node, failed request,
    unwritable file) are reported on stdout and do not stop the remaining
    downloads of the page.
    """
    import os
    import re

    out_dir = "./多线程爬的壁纸族/"
    # The target directory is not guaranteed to exist; exist_ok keeps this
    # safe when several worker threads reach this line at the same time.
    os.makedirs(out_dir, exist_ok=True)

    # Fetch the listing page; the context manager releases the connection
    # even when parsing or the status check raises.
    with requests.get(url, timeout=timeout) as resp:
        resp.raise_for_status()
        html = etree.HTML(resp.text)

    ul = html.xpath("/html/body/div[4]/div[5]/ul")[0]
    for li in ul.xpath("./li"):
        try:
            href = li.xpath("./a/@href")[0]
            imgName = li.xpath("./a/@title")[0]

            # Detail page holds the address of the actual image.
            with requests.get(href, timeout=timeout) as resp2:
                resp2.raise_for_status()
                html2 = etree.HTML(resp2.text)
            src = html2.xpath("/html/body/div[4]/div[2]/div[2]/a[1]/img/@src")[0]

            # Download the image; reject error responses so that an HTML
            # error page is never stored as if it were image data.
            with requests.get(src, timeout=timeout) as img:
                img.raise_for_status()
                data = img.content

            # Titles come from the remote page and may contain characters
            # that are illegal in file names (or act as path separators).
            safe_name = re.sub(r'[\\/:*?"<>|]', "_", imgName)
            with open(out_dir + safe_name + "." + src.split(".")[-1], mode="wb") as f:
                f.write(data)
        except (requests.RequestException, IndexError, OSError) as err:
            # Best effort: one broken entry must not abort the whole page.
            print(f"{url} 中的一项下载失败:{err!r}")
            continue
        print(imgName + ":下载完毕!!")
if __name__ == "__main__":
    # Scrape listing pages 1..40 concurrently, one task per page. The work is
    # I/O-bound, so threads overlap the network waits.
    page_urls = [
        f"https://www.bizhizu.com/sj/fengguang/list-{i}.html"
        for i in range(1, 41)
    ]
    with ThreadPoolExecutor(len(page_urls)) as pool:
        # Keep the futures: an exception raised inside a worker is stored on
        # its future and would otherwise vanish without any trace.
        futures = {pool.submit(downloadOnePage, page_url): page_url
                   for page_url in page_urls}

    # Leaving the ``with`` block waits for every task, so all futures are
    # finished here and ``exception()`` returns immediately.
    for future, page_url in futures.items():
        err = future.exception()
        if err is not None:
            print(f"{page_url}:抓取失败:{err!r}")