#-*- coding:utf-8 -*-
from multiprocessing import Pool
from bs4 import BeautifulSoup
import requests
from lxml import etree
import re
import os
import time
# Entry point of the Bing wallpaper archive site to crawl.
url = 'https://bing.ioliu.cn/'

# Desktop-Chrome User-Agent so the site serves the regular HTML page.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

# One shared requests session so HTTP connections are reused across requests.
requestsSession = requests.Session()

# Lazily-produced page numbers 1..145, consumed by Pool.map in __main__.
imgPage = (pageNo for pageNo in range(1, 146))
def getImg(imgPage):
    """Download every wallpaper on one listing page into ./Img/<page>/.

    Parameters
    ----------
    imgPage : int
        1-based page number of the listing page to scrape.

    Side effects: creates ./Img/<page>/ if missing, writes one .jpg file per
    image found, prints a message per saved file, and sleeps 3s at the end
    to throttle requests. Network/HTTP errors from requests propagate.
    """
    page = imgPage
    # Listing pages are addressed via the "p" query parameter.
    params = {
        "p": page
    }
    # Fetch the listing page HTML.
    responseHtml = requestsSession.get(url=url, headers=headers, params=params, timeout=3).text
    # Parse with lxml and pull the image src attributes via XPath.
    xpathObj = etree.HTML(responseHtml)
    imgSrc = xpathObj.xpath('//div[@class="container"]//div[@class="card progressive"]/img/@src')
    page = str(page)
    # makedirs(exist_ok=True) also creates the ./Img parent if it is missing,
    # and is safe against the check-then-create race between pool workers
    # (os.mkdir would raise FileNotFoundError without ./Img present).
    pageDir = os.path.join('.', 'Img', page)
    os.makedirs(pageDir, exist_ok=True)
    # Download and store every image found on this page.
    for i in imgSrc:
        # Filename is the last path segment with any query string stripped,
        # e.g. ".../Foo_1920x1080.jpg?imageslim" -> "Foo_1920x1080.jpg".
        imgName = i.split('/')[-1].split('?')[0]
        # Only append ".jpg" when the name lacks it — the source names
        # usually already carry the extension, and blindly appending
        # produced "xxx.jpg.jpg" files.
        if not imgName.lower().endswith('.jpg'):
            imgName += '.jpg'
        # timeout prevents one stalled connection from hanging a worker forever.
        imgData = requestsSession.get(url=i, headers=headers, timeout=10).content
        imgPath = os.path.join(pageDir, imgName)
        with open(imgPath, 'wb') as dp:
            dp.write(imgData)
        print(imgName, '下载成功!')
    # Pause 3s between pages to avoid being rate-limited/banned.
    time.sleep(3)
if __name__ == "__main__":
    # Fan the page numbers out over a pool of 10 worker PROCESSES;
    # Pool.map pairs getImg with each page number from the imgPage iterable.
    with Pool(10) as workerPool:
        workerPool.map(getImg, imgPage)