Mzitu (妹子图) image scraping: regex and BS4

Page link

Thanks to 崔大佬: original article link

Regex implementation:

import os
import random
import re

import requests


class mzitu():

    def __init__(self):
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            'Referer': 'http://i.meizitu.net'}
        self.folder_path = os.getcwd()

    def all_url(self, url):
        # Fetch the archive page and pull every album URL out with a regex.
        html = self.request(url).text
        all_a_re = r'<a\shref="(http://www.mzitu.com/\d+)"\starget="_blank"'
        pattern = re.compile(all_a_re)
        all_a_list = re.findall(pattern, html)
        # To crawl every album, loop over the full list instead:
        # for href in all_a_list:
        #     self.html(href)
        self.html(random.choice(all_a_list))

    def html(self, href):
        # Handle one album: create a folder from its title, then walk its picture pages.
        html = self.request(href).text
        title_pattern = re.compile(r'<h2\sclass="main-title">(.*?)</h2>', re.S)
        title = re.findall(title_pattern, html)[0]
        self.mkdir(title)
        max_pattern = re.compile(r'<span>(\d*?)</span>', re.S)
        max_num = int(re.findall(max_pattern, html)[-1])
        # Picture pages are numbered from 1 to max_num.
        for page in range(1, max_num + 1):
            page_url = href + "/" + str(page)
            self.img(page_url)

    def mkdir(self, folder):
        # Create the album folder (if it does not exist yet) and switch into it.
        path = folder.strip()
        os.makedirs(os.path.join(self.folder_path, path), exist_ok=True)
        print(path + " ------ created")
        os.chdir(os.path.join(self.folder_path, path))

    def img(self, page_url):
        # Extract the real image URL from a single picture page.
        img_html = self.request(page_url).text
        img_url_re = r'<img\ssrc="(.*?)"\salt=".*?"'
        pattern = re.compile(img_url_re, re.S)
        img_all = re.findall(pattern, img_html)
        img_url = img_all[0]
        self.save(img_url)

    def save(self, img_url):
        # Name the file after the last 6 characters of the URL and write it to disk.
        name = img_url[-6:]
        print(name + " downloading")
        img = self.request(img_url)
        with open(name, 'wb') as f:
            f.write(img.content)

    def request(self, url):  # Fetch a URL and return the response object.
        response = requests.get(url, headers=self.headers)
        return response


Mzitu = mzitu()  # instantiate
Mzitu.all_url('http://www.mzitu.com/all')
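To make the extraction step concrete, here is a minimal standalone sketch of what the archive-page regex captures. The sample HTML string is a made-up snippet shaped like an album link, not copied from the live site:

import re

# Hypothetical snippet shaped like one album link on the archive page.
sample = '<a href="http://www.mzitu.com/12345" target="_blank">album title</a>'

pattern = re.compile(r'<a\shref="(http://www.mzitu.com/\d+)"\starget="_blank"')
print(re.findall(pattern, sample))  # ['http://www.mzitu.com/12345']

Each captured URL is one album; html() then repeats the same findall idea for the title and the page count.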

 

BS4 implementation:

import os
import random

import requests
from bs4 import BeautifulSoup


class mzitu():

    def __init__(self):
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            'Referer': 'http://i.meizitu.net'}
        self.folder_path = os.getcwd()

    def all_url(self, url):
        # Fetch the archive page; request() returns a response object.
        html = self.request(url)
        all_a = BeautifulSoup(html.text, 'lxml').find(
            'div', class_='all').find_all('a')

        # To crawl every album, iterate over all_a instead of picking one at random:
        # for a in all_a:
        #     ...
        a = random.choice(all_a)
        title = a.get_text()
        print(u'Saving:', title)  # a little progress output so it is not too quiet
        # Some titles contain "?", which Windows does not allow in folder names,
        # so replace it before creating the folder.
        path = str(title).replace("?", '_')
        self.mkdir(path)  # create the album folder named after the title
        href = a['href']
        # href is the album URL; hand it to html() for per-page processing.
        self.html(href)

    def html(self, href):  # Turn an album URL into the URLs of its picture pages.
        html = self.request(href)
        self.headers['Referer'] = href
        max_span = BeautifulSoup(html.text, 'lxml').find(
            'div', class_='pagenavi').find_all('span')[-2].get_text()
        for page in range(1, int(max_span) + 1):
            page_url = href + '/' + str(page)
            self.img(page_url)  # fetch and save that page's image

    def img(self, page_url):  # Extract the actual image URL from a picture page.
        img_html = self.request(page_url)
        img_url = BeautifulSoup(img_html.text, 'lxml').find(
            'div', class_='main-image').find('img')['src']
        self.save(img_url)

    def save(self, img_url):  # Save one image, named after the last 6 characters of its URL.
        name = img_url[-6:]
        print(name + " downloading")
        img = self.request(img_url)
        with open(name, 'wb') as f:
            f.write(img.content)

    def mkdir(self, path):  # Create the album folder and switch into it.
        path = path.strip()
        if not os.path.exists(os.path.join(self.folder_path, path)):
            print(path)
            os.makedirs(os.path.join(self.folder_path, path))
        else:
            print(u'A folder named', path, u'already exists.')
        os.chdir(os.path.join(self.folder_path, path))  # switch into it either way

    def request(self, url):  # Fetch a URL and return the response object.
        content = requests.get(url, headers=self.headers)
        return content


Mzitu = mzitu()  # instantiate
Mzitu.all_url('http://www.mzitu.com/all')  # pass in the entry URL; this starts the crawl
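The key BS4 trick above is the selector chain that reads the page count: the second-to-last <span> inside div.pagenavi holds the highest page number, while the last one is the "next page" link. A minimal sketch with a hypothetical markup snippet (shaped like what the code expects, not taken from the live site):

from bs4 import BeautifulSoup

# Hypothetical pagination markup in the shape the crawler expects.
sample = '''
<div class="pagenavi">
  <span>1</span><span>2</span><span>3</span>
  <span>45</span><span>下一页</span>
</div>
'''

soup = BeautifulSoup(sample, 'lxml')
max_span = soup.find('div', class_='pagenavi').find_all('span')[-2].get_text()
print(max_span)  # '45' -- the last numbered page

From there the loop simply requests href + '/1' through href + '/45' and pulls the single <img> inside div.main-image on each page.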

 

posted @ 2018-06-20 22:53  阿谋