Scraping Images
import os
import random
import time
import sys
import glob
import logging
from typing import Union
from urllib.parse import urlparse

import requests
from lxml import etree
from faker import Faker

uas = Faker()
def downloadHtml(url):
    # Send the request and return the parsed page; reusable across pages
    time.sleep(1)
    headers = {
        'User-Agent': uas.user_agent()
    }
    response = requests.get(url=url, headers=headers, timeout=50)
    response.encoding = 'utf-8'
    if response.status_code != 200:
        logging.error('Failed to fetch the page; not published yet? Status code: {}'.format(response.status_code))
        # sys.exit(0)
    else:
        html = etree.HTML(response.text)
        return html
def get_pic(html):
    # Extract the single image URL on the page
    img_url = html.xpath('//*[@id="main-wrapper"]/div[2]/p/a/img/@src')[0]
    print(img_url)
    return img_url
# Logging setup
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(funcName)s -> %(message)s')
def Download_Image(
    downloadUrl: Union[str, list], saveImagePath: str, headers: dict = None, proxies: dict = None
) -> Union[bool, str]:
    """
    Download one image, or each image in a list.
    :param downloadUrl: image URL, or a list of image URLs
    :param saveImagePath: directory (or file path) to save to
    :param headers: custom request headers
    :param proxies: custom proxies
    :return: True on success, otherwise the failing URL or an error message
    """
    if isinstance(downloadUrl, str):
        downloadUrlParse = urlparse(downloadUrl)
        if headers is None:
            headers = {
                'User-Agent': uas.user_agent(),
                'Referer': f'{downloadUrlParse.scheme}://{downloadUrlParse.netloc}',
                'Host': downloadUrlParse.netloc,
            }
        # Download
        try:
            response = requests.get(
                downloadUrl, headers=headers, timeout=20, proxies=proxies).content
        except requests.exceptions.Timeout:
            logging.info(f'Image download timed out: {downloadUrl}')
            return downloadUrl
        except Exception as e:
            logging.info(f'Image download failed: {downloadUrl} -> reason: {e}')
            return downloadUrl
        # Build the save path
        if os.path.isdir(saveImagePath):
            newSaveImagePath = saveImagePath + r'\0.jpg'
        else:
            newSaveImagePath = os.path.splitext(saveImagePath)[0] + '.jpg'
        with open(newSaveImagePath, 'wb') as f:
            f.write(response)
    elif isinstance(downloadUrl, list):
        # Download each URL in turn
        for i in range(len(downloadUrl)):
            time.sleep(2)
            downloadUrlParse = urlparse(downloadUrl[i])
            if headers is None:
                headers = {
                    'User-Agent': uas.user_agent(),
                    'Referer': f'{downloadUrlParse.scheme}://{downloadUrlParse.netloc}',
                    'Host': downloadUrlParse.netloc,
                }
            if downloadUrl[i] == '':
                continue
            # Download
            try:
                time.sleep(2)
                response = requests.get(
                    downloadUrl[i], headers=headers, timeout=20, proxies=proxies).content
            except requests.exceptions.Timeout:
                logging.info(f'Image download timed out: {downloadUrl[i]}')
                return downloadUrl[i]
            except Exception as e:
                logging.info(f'Image download failed: {downloadUrl[i]} -> reason: {e}')
                continue
            # Build the save path
            if os.path.isdir(saveImagePath):
                newSaveImagePath = saveImagePath + fr'\{i}.jpg'
            else:
                newSaveImagePath = os.path.splitext(saveImagePath)[0] + '.jpg'
            with open(newSaveImagePath, 'wb') as f:
                f.write(response)
    else:
        logging.info('Nothing to download: unsupported argument type')
        return 'Nothing to download'
    return True
Fetching and parsing pages still relies on the downloadHtml() function from before. For downloading, I borrowed a Download_Image() function found online; it comes with logging and feels quite polished. A few parts of it I do not fully understand, for example Faker(), which generates a random User-Agent for each request.
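To make the Faker part concrete, here is a minimal sketch of what it does (the loop count and the commented-out call are illustrative only, not part of the original script):

from faker import Faker

fake = Faker()
# Each call to user_agent() returns a different, realistic-looking
# User-Agent string, so consecutive requests do not all look identical.
for _ in range(3):
    print(fake.user_agent())

# Hypothetical call of the downloader above; the URL and folder are made up.
# Download_Image(downloadUrl='https://example.com/a.jpg', saveImagePath=r'E:\demo')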
if __name__ == '__main__':
    l = []  # the trailing path segments of the pages to crawl (contents omitted)
    for j in l:
        k = j.lstrip('/').replace('.html', '')
        time.sleep(random.randint(1, 3))
        url = "https://www.xxx.com" + j
        # The pager text looks like "1/NN"; the part after the slash is the total page count
        s = downloadHtml(url).xpath(
            '//*[@id="page"]/div/span/text()')[0].split('/')[-1]
        num = int(s)
        print(k, s + ' pages in total')
        target_path = 'E:\\' + k
        flag = os.path.exists(target_path)
        if not flag:
            os.makedirs(target_path)
        # Count the images already on disk so the crawl can resume where it stopped
        file_count = len(glob.glob('E:\\' + k + '\\*.jpg'))
        print(file_count)
        for i in range(file_count + 1, num):
            url = "https://www.xxx.com" + j.replace('.html', '') + '/page/' + str(i) + '.html'
            time.sleep(random.randint(1, 4))
            img_html = downloadHtml(url)
            img_url = get_pic(img_html)
            Download_Image(
                downloadUrl=img_url,
                saveImagePath=target_path + "\\" + str(i)
            )
    print('finish')
The main program. A list of the last path segments of the pages to crawl is prepared in advance. For each entry, the script first reads the total page count, then loops over every page. One nice touch: after crawling for a while the connection gets dropped, and some folders are already almost complete, so re-downloading them from scratch wastes time. Since each page carries exactly one image, the script counts how many images are already in the folder and continues from where it broke off, using glob(), which is roughly equivalent to resumable downloading (see the small sketch below).
Beyond that, it is mostly XPath usage and string construction.
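The resume trick in isolation looks roughly like this (a minimal sketch; the folder name and page count are made-up placeholders):

import glob
import os

target_path = r'E:\demo_album'      # hypothetical album folder
total_pages = 30                    # hypothetical total from the pager text

os.makedirs(target_path, exist_ok=True)

# One image per page, saved as "<page>.jpg", so the number of .jpg files
# already on disk tells us which page to resume from.
done = len(glob.glob(os.path.join(target_path, '*.jpg')))
for page in range(done + 1, total_pages):
    print('would fetch page', page)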
def save_image(img_link, filename):
    # `headers` and `proxy` are module-level globals defined for the new site
    time.sleep(random.randint(1, 4))
    res = requests.get(img_link, headers=headers, proxies=proxy)
    if res.status_code == 404:
        print(f"Error downloading image {img_link} ------->")
        return
    with open(filename, "wb") as f:
        f.write(res.content)
    # print("Saved to: " + filename)
After switching to a different site, Download_Image() could not be used there, so I wrote a small, focused save_image() instead.
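Calling it looks roughly like this (a sketch: the header value, proxy, URL and filename are all made up for illustration; `headers` and `proxy` only need to exist at module level):

from faker import Faker

headers = {'User-Agent': Faker().user_agent()}   # assumed module-level headers
proxy = None                                     # or e.g. {'https': 'http://127.0.0.1:7890'}

save_image('https://example.com/img/0001.jpg', r'E:\demo_album\1.jpg')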
The main remaining problem is that the crawl still breaks after running for a while; I do not have a good fix for that yet, so random sleeps are used to slow down the request rate.
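One common way to soften the disconnects, not used in the script above and only a possible direction, is to wrap the request in a small retry loop with an increasing, jittered delay:

import random
import time

import requests


def get_with_retry(url, headers=None, retries=3):
    # Retry a GET a few times, sleeping longer (with some jitter) after each failure.
    for attempt in range(retries):
        try:
            return requests.get(url, headers=headers, timeout=20)
        except requests.exceptions.RequestException as e:
            wait = (attempt + 1) * 5 + random.random() * 3
            print(f'attempt {attempt + 1} failed ({e}), sleeping {wait:.1f}s')
            time.sleep(wait)
    return None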
