import urllib.request
import urllib.parse
import urllib.error
import re
import os
import ssl
# NOTE(review): this swaps the ssl module's default HTTPS context factory for
# the unverified one, which presumably makes every urllib HTTPS request in this
# process skip certificate validation. That is a security risk; confirm it is
# really required for the target site before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context
# Root directory under which all per-page download folders are created.
path = "./images"
# HTTP headers attached to every request built in this file.
# NOTE(review): the browser-like User-Agent and the fixed referer are presumably
# there to get past the site's bot / hot-link checks; verify against the site.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
"referer": "https://www.mzitu.com/xinggan/"
}
def handler_request(url, pageIndex):
    """Build (but do not send) the request for one listing page.

    Args:
        url: Base listing URL; the page number is appended directly to it.
        pageIndex: Page number, converted with ``str`` before being appended.

    Returns:
        A ``urllib.request.Request`` for the resulting URL that carries the
        module-level ``headers``.
    """
    page_url = url + str(pageIndex)
    return urllib.request.Request(url=page_url, headers=headers)
def get_images_url(content, basePath):
    """Find every gallery on a listing page and download each of them.

    Two regular expressions are run over ``content``: one captures the
    ``alt`` value of each thumbnail, the other captures the ``href`` of each
    list-item link. The results are paired positionally into a mapping of
    title -> link, and ``image_category_response`` is called once per entry.

    Args:
        content: Decoded HTML of a listing page.
        basePath: Directory handed on to ``image_category_response`` as the
            parent folder for each gallery.

    Returns:
        None.
    """
    patternNames = re.compile(r'<li>.*?<img .* alt=(.*?) .*? />.*?')
    patternHrefs = re.compile(r'<li><a href=(.*?) .*?>.*?')

    # A compiled pattern's findall() takes (string, pos, endpos), not flags.
    # Passing re.S there used to mean "start at index 16" and silently skipped
    # the first 16 characters. DOTALL was therefore never in effect; it is
    # deliberately left off, because with it the greedy ``.*`` in the alt
    # pattern could span the whole document.
    alts = patternNames.findall(content)
    hrefs = patternHrefs.findall(content)

    if len(alts) != len(hrefs):
        # Positional pairing is only reliable when both lists line up.
        # Previously a shortfall of alts raised IndexError mid-loop.
        print("warning: found {0} titles but {1} links; pairing the first {2}".format(
            len(alts), len(hrefs), min(len(alts), len(hrefs))))

    # Each captured alt is cut by one character at both ends (presumably the
    # surrounding quotes). The href is stored as captured and unwrapped by the
    # callee.
    image_map = {alt[1:-1]: href for alt, href in zip(alts, hrefs)}

    for item in image_map.items():
        image_category_response(item, basePath)
def image_category_response(item, basePath):
    """Download the main image of every page in one gallery.

    Pages are requested as ``<gallery url>/<index>`` for indices 0..999. The
    walk stops at the first page that cannot be fetched, contains no main
    image, or fails in any other way; the reason is printed and the function
    returns normally.

    Args:
        item: ``(title, link)`` pair. The first and last characters of the
            link are dropped before use (presumably the quotes captured by
            the listing-page regex).
        basePath: Parent directory; images are saved in ``basePath/title``.

    Returns:
        None.
    """
    alt = item[0]
    save_folder = os.path.join(basePath, alt)
    if not os.path.exists(save_folder):
        # makedirs rather than mkdir: a title containing a path separator
        # yields a nested path whose parents do not exist yet.
        os.makedirs(save_folder)
    baseurl = item[1][1:-1]

    # Compiled once instead of once per page.
    imgPattern = re.compile(r'<div class="main-image"><p>.*?<img src=(.*?) .*? />.*?')
    pageCount = 1000

    for pageIndex in range(pageCount):
        page_url = baseurl + "/" + str(pageIndex)
        try:
            request = urllib.request.Request(url=page_url, headers=headers)
            # The context manager closes the connection once the body is read.
            with urllib.request.urlopen(request) as response:
                content = response.read().decode()
            # Flags cannot be passed to a compiled pattern's findall(); the
            # former ``re.S`` argument was being interpreted as pos=16.
            imgUrl = imgPattern.findall(content)
            if not imgUrl:
                print("no main image found on {0}".format(page_url))
                break
            download_images(imgUrl[0], save_folder)
        except urllib.error.URLError:
            # A failed fetch is treated as "ran past the last page". The
            # message is the one previously carried by a TypeError that was
            # raised only to leave the loop.
            print("最大页面数{0}".format(pageIndex - 1))
            break
        except Exception as e:
            # Best effort: report the problem and give up on this gallery
            # without aborting the whole run.
            print(e)
            break
def download_images(url, save_path):
    """Fetch one image and store it under ``save_path``.

    Args:
        url: Image URL still wrapped in one extra character on each side
            (presumably the quotes captured by the caller's regex); both are
            dropped before use.
        save_path: Existing directory to write into. The file name is the
            last ``/``-separated segment of the URL.

    Returns:
        None.

    Raises:
        urllib.error.URLError: If the image cannot be fetched.
        OSError: If the file cannot be written.
    """
    url = url[1:-1]
    print(url)
    request = urllib.request.Request(url=url, headers=headers)
    filename = url.split('/')[-1]

    # Read the body first and close the connection deterministically. A failed
    # download then no longer leaves an empty file behind, and the socket is
    # not left for the garbage collector.
    with urllib.request.urlopen(request) as response:
        data = response.read()

    with open(os.path.join(save_path, filename), 'wb') as fb:
        fb.write(data)
def parse_pages(content):
    """Print ``content`` to standard output (debugging aid).

    Args:
        content: Any printable object.

    Returns:
        None.
    """
    print(content)
    return None
def main():
    """Prompt for a page range and download every gallery on those pages.

    Reads the first and last listing-page numbers from standard input
    (inclusive range), makes sure the root download directory exists, then
    for each page: creates its folder, fetches the listing HTML and hands it
    to ``get_images_url``.

    Raises:
        ValueError: If either page number entered is not an integer.
        urllib.error.URLError: If a listing page cannot be fetched.
    """
    url = 'https://www.mzitu.com/xinggan/page/'
    start_page = int(input("请输入起始页码:"))
    end_page = int(input("请输入结束页码:"))

    # Create the root folder.
    if not os.path.exists(path):
        os.mkdir(path)

    for pageIndex in range(start_page, end_page + 1):
        print("...........开始下载第{0}页".format(pageIndex))
        # Create the folder for this listing page.
        save_path = create_folder(pageIndex)
        # Build the request.
        request = handler_request(url, pageIndex)
        # Send it and read the body; the context manager closes the
        # connection instead of leaving it open for the rest of the run.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode()
        # Parse the listing, extract the images and download them.
        get_images_url(content, save_path)
        print("...........结束下载第{0}页".format(pageIndex))
def create_folder(pageIndex):
    """Ensure the download folder for one listing page exists.

    Args:
        pageIndex: Page number; its string form is the folder name under the
            module-level ``path``.

    Returns:
        The folder path with backslashes replaced by forward slashes and a
        trailing ``/`` appended.
    """
    page_dir = os.path.join(path, str(pageIndex))
    already_present = os.path.exists(page_dir)
    if not already_present:
        os.mkdir(page_dir)
    normalized = page_dir.replace("\\", "/")
    return normalized + "/"
# Run the interactive downloader only when executed as a script, not on import.
if __name__ == "__main__":
    main()