import re
import requests
import random
import time
import os
class GetPhoto(object):
    """Crawl a paginated picture listing and download the images it links to.

    The crawl is a four-step pipeline, one method per step:

    1. ``get_page_urllist`` -- pagination links found on the start page.
    2. ``get_sub_url``      -- (gallery URL, title) pairs on each listing page.
    3. ``get_pic_url``      -- per-picture page URLs of one gallery.
    4. ``get_pic``          -- fetch one picture page and save its image.

    ``download`` drives the whole pipeline.  Failed requests are appended to
    ``error.log`` in the current working directory.
    """

    def __init__(self, url1, class_):
        """Store the crawl target and load the pool of User-Agent headers.

        :param url1: URL of the listing page the crawl starts from.
        :param class_: name of the top-level directory images are saved under.
        :raises OSError: if ``user_agents.txt`` cannot be opened.
        """
        self.agent = "user_agents.txt"
        self.head_url_list = {}
        self.pic_item_url = []   # one list of (gallery_url, title) per listing page
        self.pic_url_list = []
        self.page_url_list = []  # start page followed by its pagination links
        self.url = url1
        self.class_ = class_
        with open(self.agent, 'r', encoding='utf-8') as f:
            # One header dict per non-blank line; blank lines would otherwise
            # turn into requests sent with an empty User-Agent.
            self.agents = [
                {"user-agent": line.strip()} for line in f if line.strip()
            ]

    def chartset(self, rsp):
        """Fix garbled (e.g. Chinese) text by correcting the response encoding.

        When the encoding derived from the headers is ``ISO-8859-1``, the
        encoding declared inside the document is used instead; if the document
        declares none, the encoding detected from the body is used.

        :param rsp: the ``requests`` response to adjust in place.
        :return: None
        """
        header_charset = requests.utils.get_encoding_from_headers(rsp.headers)
        if header_charset == 'ISO-8859-1':
            declared = requests.utils.get_encodings_from_content(rsp.text)
            # A page without a charset declaration yields an empty list.
            rsp.encoding = declared[0] if declared else rsp.apparent_encoding

    # NOTE: disabled navigation-bar scraper, kept for reference only.
    # def get_first_url(self):
    #     rsp = requests.get(self.url, headers=random.choice(self.agents))
    #     self.chartset(rsp)
    #     content = rsp.text
    #     pattern = r'<div class="nav both">.*?<!-- top end -->'
    #     com = re.compile(pattern, re.S)
    #     content_info = com.findall(content)
    #     pattern2 = 'href="(http://(?:[a-zA-Z0-9]+\.){1,2}[a-zA-Z]{2,6}.*?)".*?<span>(.+?)</span>'
    #     com2 = re.compile(pattern2, re.S)
    #     self.head_url_list.update(map(lambda x:x[::-1],com2.findall(content_info[0])))

    @staticmethod
    def _first_block_matches(content, block_pattern, item_pattern):
        """Return ``item_pattern`` matches inside the first ``block_pattern`` match.

        Returns an empty list when ``content`` holds no such block, so callers
        never index into an empty result.
        """
        blocks = re.findall(block_pattern, content, re.S)
        if not blocks:
            return []
        return re.findall(item_pattern, blocks[0], re.S)

    @staticmethod
    def _absolutize(base_url, names):
        """Resolve ``names`` against the directory part of ``base_url``."""
        prefix = base_url.rsplit("/", 1)[0] + "/"
        return [prefix + name for name in names]

    def _parse_page_urls(self, content):
        """Return the absolute pagination URLs found in the start page HTML."""
        names = self._first_block_matches(
            content, r'<div class="page both">.*?</div>', r"href='(.*?)'")
        return self._absolutize(self.url, names)

    @staticmethod
    def _parse_gallery_items(content):
        """Return the ``(gallery_url, title)`` pairs of one listing page."""
        return GetPhoto._first_block_matches(
            content,
            r'<div class="imgList2">.*?</div>',
            r'href="(?P<url>http://(?:[\w]+\.){1,2}[a-zA-Z]{2,6}.*?)".*?target="_blank" title="(.*?)">',
        )

    @staticmethod
    def _parse_pic_page_urls(content, gallery_url):
        """Return the absolute per-picture page URLs of one gallery page."""
        names = GetPhoto._first_block_matches(
            content, r'<div class="page".*?</div>', r"<a href='([^#]+?)'>")
        return GetPhoto._absolutize(gallery_url, names)

    @staticmethod
    def _format_error(url, exc):
        """Build one newline-terminated ``error.log`` record."""
        stamp = time.strftime('%Y-%m-%d %H:%M:%S')
        # Format the exception itself: ``exc.args`` may be empty.
        return '{} {} {}\n'.format(stamp, url, exc)

    def _report_failure(self, url, exc, pause):
        """Log a failed request, tell the user, and back off for ``pause`` seconds."""
        with open('error.log', 'a', encoding='utf-8') as f:
            f.write(self._format_error(url, exc))
        print('访问失败,暂停{}s!'.format(pause))
        time.sleep(pause)

    def get_page_urllist(self):
        """Fill ``page_url_list`` with the start URL and its pagination links.

        :raises requests.RequestException: if the start page cannot be fetched.
        """
        print('开始获取首页下方链接列表url!')
        rsp = requests.get(self.url, headers=random.choice(self.agents))
        self.chartset(rsp)
        self.page_url_list.append(self.url)
        # Links are resolved against self.url, not a module-level global.
        self.page_url_list.extend(self._parse_page_urls(rsp.text))
        print("首页下方链接列表url获取完成!")
        print(self.page_url_list)

    def get_sub_url(self):
        """Collect the gallery links of the first ten listing pages.

        Appends one list of ``(gallery_url, title)`` tuples per successfully
        fetched page to ``pic_item_url``.  A page that cannot be fetched is
        logged and skipped after a 10 second pause.
        """
        self.get_page_urllist()
        print("开始获取整页图片项目链接:")
        for page_url in self.page_url_list[:10]:  # only the first 10 pages
            try:
                rsp = requests.get(page_url, headers=random.choice(self.agents))
            except requests.RequestException as e:
                self._report_failure(page_url, e, 10)
                # Skip this page only; the remaining pages are still crawled.
                continue
            self.chartset(rsp)
            sub_url_list = self._parse_gallery_items(rsp.text)
            self.pic_item_url.append(sub_url_list)
            print(sub_url_list)
            time.sleep(random.randint(1, 3))  # polite delay of 1-3 seconds
        print("获取所有页面图片项目完成!")

    def get_pic_url(self, url2):
        """Return the URLs of the per-picture pages of one gallery.

        :param url2: URL of the gallery's first page.
        :return: list of absolute page URLs; empty when the request fails or
            the page has no pagination block.
        """
        try:
            rsp = requests.get(url2, headers=random.choice(self.agents))
        except requests.RequestException as e:
            self._report_failure(url2, e, 30)
            return []
        self.chartset(rsp)
        return self._parse_pic_page_urls(rsp.text, url2)

    def get_pic(self, pic_page_url, dirname, count):
        """Download the image shown on one picture page.

        The image is written to ``<class_>/<dirname>/<dirname><count>.jpg``.

        :param pic_page_url: URL of the page embedding the image.
        :param dirname: gallery title; used as directory and file-name prefix.
        :param count: 1-based index of the picture within its gallery.
        :return: None
        """
        try:
            rsp = requests.get(pic_page_url, headers=random.choice(self.agents))
        except requests.RequestException as e:
            self._report_failure(pic_page_url, e, 30)
            print('下载继续')
            return
        self.chartset(rsp)
        pic_url = re.findall('<p align="center">.*?src="(.*?)"', rsp.text, re.S)
        image_name = dirname + str(count) + '.jpg'
        if pic_url:
            try:
                pic_rsp = requests.get(pic_url[0], headers=random.choice(self.agents))
            except requests.RequestException as e:
                self._report_failure(pic_url[0], e, 30)
                print('下载继续')
                return
            img_dirpath = os.path.join(self.class_, dirname)
            os.makedirs(img_dirpath, exist_ok=True)
            img_path = os.path.join(img_dirpath, image_name)
            with open(img_path, 'wb') as f:
                f.write(pic_rsp.content)
            print('{}下载完成!'.format(image_name))
        else:
            print('{}下载失败!'.format(image_name))
        time.sleep(random.randint(1, 3))  # polite delay of 1-3 seconds

    def download(self):
        """Run the full crawl and download every picture that is found.

        Only the galleries of the first three collected listing pages are
        downloaded.
        """
        self.get_sub_url()
        for page_items in self.pic_item_url[:3]:  # first 3 listing pages only
            for pic_item, dir_name in page_items:
                pic_pages = self.get_pic_url(pic_item)
                print("开始下载{}系列!".format(dir_name))
                for index, pic_page in enumerate(pic_pages, 1):
                    self.get_pic(pic_page, dir_name, index)
                print("{}系列下载完成!".format(dir_name))
if __name__ == '__main__':
    # Script entry point: crawl the listing at `url` and save every image
    # found beneath the '游戏壁纸' directory.
    url = 'http://www.5442.com/youxi/'
    GetPhoto(url, '游戏壁纸').download()