# Scrape photo galleries from the 5566 image site (www.55156.com)
import requests
import random
import re
import time
import os
from bs4 import BeautifulSoup


class GetGirlsPhoto(object):
    """Scraper that downloads photo galleries from www.55156.com.

    Workflow:
      1. ``get_url_list``  - collect the pagination links on the landing page.
      2. ``get_pic_link``  - collect each gallery's entry link from every page.
      3. ``get_pic``       - walk each gallery's sub-pages and save every image
                             under ``<repository_name>/<gallery title>/``.
    """

    def __init__(self, head_url, repository_name):
        # Landing page of one category, e.g. http://www.55156.com/a/Mygirl
        self.url = head_url
        # Pagination links collected from the landing page.
        self.list_url = []
        # Maps gallery title -> gallery entry URL.
        self.list_pic_url = dict()
        # Text file with one User-Agent string per line; sampled per request.
        self.header_file = 'user_agents.txt'
        # Directory the downloaded images are stored under.
        self.path = repository_name

    # Fix response encoding (mojibake workaround).
    def chartset(self, rsp):
        """When the server reports ISO-8859-1 (requests' fallback for a
        missing charset header), sniff the real encoding from the HTML body.

        BUG FIX: ``get_encodings_from_content`` returns a *list* of encoding
        names; the original code assigned that list itself to ``rsp.encoding``,
        which breaks any subsequent access to ``rsp.text``.
        """
        declared = requests.utils.get_encoding_from_headers(rsp.headers)
        if declared == 'ISO-8859-1':
            sniffed = requests.utils.get_encodings_from_content(rsp.text)
            if sniffed:
                rsp.encoding = sniffed[0]

    # Random User-Agent per request.
    def get_header(self):
        """Return a requests header dict with a randomly chosen User-Agent."""
        with open(self.header_file, 'r') as f:
            agents = f.readlines()
        return {'User-Agent': random.choice(agents).strip()}

    # Collect the page-number links at the bottom of the landing page.
    def get_url_list(self):
        """Populate ``self.list_url`` with the category's pagination URLs."""
        rsp = requests.get(self.url, headers=self.get_header())
        self.chartset(rsp)
        soup = BeautifulSoup(rsp.text, 'lxml')
        tags = soup.find_all('a', target='_self')
        href_re = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
        links = re.findall(href_re, str(tags), re.I | re.S | re.M)
        # The first entry and the last three are navigation items
        # ("home"/"next"/"last"), not page links — skip them.
        for rel in links[1:-3]:
            self.list_url.append(self.url + rel)
        # NOTE: removed the stray backslashes (invalid escape sequences
        # ``\“``/``\”``) from the original message.
        print('获取“%s”子链接成功' % self.url)

    # Collect each gallery's entry link from every pagination page.
    def get_pic_link(self):
        """Populate ``self.list_pic_url`` (title -> gallery entry URL)."""
        self.get_url_list()
        for url in self.list_url:
            rsp = requests.get(url, headers=self.get_header())
            self.chartset(rsp)
            soup = BeautifulSoup(rsp.text, 'lxml')
            for a_tag in soup.find_all('a', class_='picLink'):
                self.list_pic_url[a_tag.get('title')] = a_tag.get('href')
            time.sleep(1)  # be polite: throttle listing-page requests
            print('获取“%s”子链接成功!' % url)

    # Walk each gallery's sub-pages, extract the image URL, download it.
    def get_pic(self):
        """Download every gallery in ``self.list_pic_url`` to disk."""
        self.get_pic_link()
        href_re = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
        src_re = r"(?<=src=\").+?(?=\")|(?<=src=\').+?(?=\')"
        for title, url in self.list_pic_url.items():
            print('开始下载%s系列' % title)
            rsp = requests.get(url, headers=self.get_header())
            # Consistency fix: the original skipped the encoding fix here.
            self.chartset(rsp)
            soup = BeautifulSoup(rsp.text, 'lxml')
            pages = soup.find('div', class_='pages')
            links = re.findall(href_re, str(pages), re.I | re.S | re.M)
            dir_path = os.path.join(self.path, title)
            # exist_ok avoids the check-then-create race of the original.
            os.makedirs(dir_path, exist_ok=True)
            # First and last entries are prev/next controls — skip them.
            for index, rel in enumerate(links[1:-1]):
                if rel == "#":
                    # "#" denotes the current page itself.
                    page_rsp = requests.get(url + rel, headers=self.get_header())
                else:
                    page_url = url.rsplit('/', 1)[0] + '/' + rel
                    page_rsp = requests.get(page_url, headers=self.get_header())
                self.chartset(page_rsp)
                body = BeautifulSoup(page_rsp.text, 'lxml').find(
                    'div', class_='articleBody')
                img_urls = re.findall(src_re, str(body), re.I | re.S | re.M)
                if not img_urls:
                    # Robustness: skip pages with no parsable image instead
                    # of crashing on img_urls[0].
                    continue
                pic_rsp = requests.get(img_urls[0], headers=self.get_header())
                img_name = title + str(index + 1) + '.jpg'
                img_path = os.path.join(dir_path, img_name)
                # ``with`` already flushes and closes; the original's explicit
                # f.flush()/f.close() were redundant.
                with open(img_path, 'wb') as f:
                    f.write(pic_rsp.content)
                print('%s下载完成!' % img_name)
                time.sleep(3)  # throttle image downloads
            print("*" * 30)
92
93
if __name__ == '__main__':
    # One crawler run per gallery category; the directory name is taken
    # from the last path segment of the category URL.
    gallery_urls = ['http://www.55156.com/a/Mygirl',
                    'http://www.55156.com/a/Beautyleg']
    for gallery in gallery_urls:
        repo_name = gallery.rsplit('/', 1)[1]
        print(gallery, repo_name)
        crawler = GetGirlsPhoto(head_url=gallery, repository_name=repo_name)
        crawler.get_pic()
        # Long pause between categories to avoid hammering the server.
        time.sleep(120)
103 time.sleep(120)