# nvshens batch image downloader by gallery, v1.00 (multithreaded)
from bs4 import BeautifulSoup
import requests
import urllib.request
import os
import threading
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
# Download every image in the list to the local disk
def downloadPics(pictures):
    while pictures:
        pic = pictures.pop()
        name = pic.split('/')[-1]
        folder = pic.split('/')[-2]
        # Create the target directory if it does not exist yet
        if not os.path.exists('./' + folder):
            os.makedirs('./' + folder)
        try:
            rsp = urllib.request.urlopen(pic)
            img = rsp.read()
            with open('./' + folder + '/' + name, 'wb') as f:
                f.write(img)
            print('Image ' + pic + ' downloaded')
        except Exception:
            print('Image ' + pic + ' failed to download; pushing it back to retry')
            pictures.append(pic)
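
# Note: the urlopen() call above sends no User-Agent, while requests.get()
# below does. If the image host ever rejects bare requests, a minimal sketch
# of passing the same headers through urllib (an assumption; the original
# script works without it) would be:
#     req = urllib.request.Request(pic, headers=headers)
#     rsp = urllib.request.urlopen(req)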
# Worker thread: crawls one gallery page by page, then downloads its images
class dldThread(threading.Thread):
    def __init__(self, name, url):
        threading.Thread.__init__(self, name=name)
        self.name = name
        self.url = url
        self.pictures = []

    def run(self):
        while self.url != "none":
            print("Thread " + self.name + " starts crawling page " + self.url)
            current_url = self.url  # remembered so a failed page can be retried
            try:
                rsp = requests.get(current_url, headers=headers)
                self.url = "none"  # cleared; re-set below if a next-page link is found
                soup = BeautifulSoup(rsp.text, 'html.parser')
                for divs in soup.find_all(class_="gallery_wrapper"):
                    # Collect the image URLs found on this page
                    for img in divs.find_all('img'):
                        print(img.get("src"))
                        self.pictures.append(img.get("src"))
                    # Look for the next page ('下一页' is the site's own link text)
                    for link in divs.find_all('a', class_='a1'):
                        if link.string == '下一页' and link.get("href").find('.html') != -1:
                            self.url = 'https://www.nvshens.com' + link.get("href")
                if self.url != "none":
                    print("Thread " + self.name + " moves on to the next page")
                    continue
                else:
                    print("Thread " + self.name + " finished crawling; starting download...")
                    downloadPics(self.pictures)
                    print("Thread " + self.name + " finished downloading.")
            except Exception:
                # Whatever the exception, restore the page and keep crawling to the end
                print("Thread " + self.name + " hit an exception; retrying the page")
                self.url = current_url
                continue
# Spawn one download thread per gallery id
def main():
    for i in range(10000, 20000):  # adjust the id range as needed
        url = 'https://www.nvshens.com/g/' + str(i) + '/'
        th = dldThread(name=str(i), url=url)
        th.start()
# Kick off
main()
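
# main() above starts up to 10,000 threads at once, which can exhaust file
# handles and memory. A minimal bounded alternative is sketched below using
# the standard library's ThreadPoolExecutor; the pool size of 16 and the
# helper name crawlGallery are assumptions, not part of the original script.
from concurrent.futures import ThreadPoolExecutor

def crawlGallery(i):
    # Run the existing worker inline inside a pool thread
    dldThread(name=str(i), url='https://www.nvshens.com/g/' + str(i) + '/').run()

def mainPooled(max_workers=16):
    # Exiting the with-block waits for all queued galleries to finish
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pool.map(crawlGallery, range(10000, 20000))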