Meizitu Scraping: RE vs. BS4
Page link
Thanks to 崔大佬 (Cui): original article link
Regex (re) implementation:
import requests
import re
import os
import random


class mzitu():

    def __init__(self):
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            'Referer': 'http://i.meizitu.net'}
        self.folder_path = str(os.getcwd())

    def all_url(self, url):
        # Fetch the index page and pull every album URL out of it with a regex.
        html = self.request(url).text
        all_a_re = r'<a\shref="(http://www.mzitu.com/\d+)"\starget="_blank"'
        pattern = re.compile(all_a_re)
        all_a_list = re.findall(pattern, html)
        # To crawl every album, loop over the whole list instead:
        # for href in all_a_list:
        #     self.html(href)
        self.html(random.choice(all_a_list))  # pick one album at random

    def html(self, href):
        # Parse an album page: grab the title and the page count.
        html = self.request(href).text
        title_pattern = re.compile(r'<h2\sclass="main-title">(.*?)</h2>', re.S)
        title = re.findall(title_pattern, html)[0]  # findall returns a list; take the first match
        self.mkdir(title)
        max_pattern = re.compile(r'<span>(\d*?)</span>', re.S)
        max_num = re.findall(max_pattern, html)
        max_num = int(max_num[-1])  # the last numeric <span> holds the page count
        for i in range(1, max_num + 1):  # pages are numbered from 1
            page_url = href + "/" + str(i)
            self.img(page_url)

    def mkdir(self, folder):
        path = str(folder.strip())
        os.makedirs(os.path.join(self.folder_path, path))
        print(str(folder) + "------created")
        os.chdir(self.folder_path + str("/") + path)

    def img(self, page_url):
        # Extract the actual image URL from a single image page.
        img_html = self.request(page_url).text
        img_url_re = r'<img\ssrc="(.*?)"\salt=".*?"'
        pattern = re.compile(img_url_re, re.S)
        img_all = re.findall(pattern, img_html)
        img_url = img_all[0]
        self.save(img_url)

    def save(self, img_url):
        name = img_url[-6:]  # use the last six characters of the URL as the file name
        print(name + " downloading")
        img = self.request(img_url)
        f = open(name, 'wb')
        f.write(img.content)
        f.close()

    def request(self, url):  # fetch a URL and return the response
        response = requests.get(url, headers=self.headers)
        return response


Mzitu = mzitu()  # instantiate
Mzitu.all_url('http://www.mzitu.com/all')  # entry point: pass in the index page
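To see what the two core regexes capture, here is a minimal, self-contained sketch run against a made-up HTML fragment. The fragment only imitates the markup the patterns expect; the real mzitu pages may differ, so treat it purely as an illustration of re.findall():

import re

# Made-up fragments imitating the index page and an album page (not real site markup).
index_html = '''
<a href="http://www.mzitu.com/12345" target="_blank">Album one</a>
<a href="http://www.mzitu.com/67890" target="_blank">Album two</a>
'''
album_html = '''
<h2 class="main-title">Some album title</h2>
<span>1</span><span>2</span><span>47</span>
'''

# Same pattern as all_url(): capture every album URL on the index page.
album_links = re.findall(r'<a\shref="(http://www.mzitu.com/\d+)"\starget="_blank"', index_html)
print(album_links)    # ['http://www.mzitu.com/12345', 'http://www.mzitu.com/67890']

# Same patterns as html(): the album title, and the last numeric <span> as the page count.
title = re.findall(r'<h2\sclass="main-title">(.*?)</h2>', album_html, re.S)[0]
max_num = int(re.findall(r'<span>(\d*?)</span>', album_html, re.S)[-1])
print(title, max_num)  # Some album title 47

The point to notice is that findall() always returns a plain list of captured strings, which is why the crawler above indexes the results with [0] and [-1] before using them.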
BS4 (BeautifulSoup) implementation:
import requests
from bs4 import BeautifulSoup
import os
import random


class mzitu():

    def __init__(self):
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            'Referer': 'http://i.meizitu.net'}
        self.folder_path = str(os.getcwd())

    def all_url(self, url):
        html = self.request(url)  # request() returns the response for the index page
        all_a = BeautifulSoup(html.text, 'lxml').find(
            'div', class_='all').find_all('a')

        # To crawl every album, iterate over all_a instead of picking one:
        # for a in all_a:
        #     ...
        a = random.choice(all_a)
        title = a.get_text()
        print('Saving:', title)  # a little progress output
        # Some titles contain "?", which Windows does not allow in folder names, so replace it.
        path = str(title).replace("?", '_')
        self.mkdir(path)  # create the folder; note that path here is just the album title
        href = a['href']
        # href is the album URL; hand it to html() for per-page processing
        self.html(href)

    def html(self, href):  # parse the album page and build each image-page URL
        html = self.request(href)
        self.headers['Referer'] = href  # make later image requests look like they come from this album page
        max_span = BeautifulSoup(html.text, 'lxml').find(
            'div', class_='pagenavi').find_all('span')[-2].get_text()
        for page in range(1, int(max_span) + 1):
            page_url = href + '/' + str(page)
            self.img(page_url)  # process each image page

    def img(self, page_url):  # parse an image page and extract the real image URL
        img_html = self.request(page_url)
        img_url = BeautifulSoup(img_html.text, 'lxml').find(
            'div', class_='main-image').find('img')['src']
        self.save(img_url)

    def save(self, img_url):  # download and save one image
        name = img_url[-6:]  # last six characters of the URL, e.g. "01.jpg"
        print(name + " downloading")
        img = self.request(img_url)
        f = open(name, 'wb')
        f.write(img.content)
        f.close()

    def mkdir(self, path):  # create the album folder and switch into it
        path = path.strip()
        isExists = os.path.exists(os.path.join(self.folder_path, path))
        if not isExists:
            print(path)
            os.makedirs(os.path.join(self.folder_path, path))
            os.chdir(self.folder_path + str("/") + path)  # change into the new folder
            return True
        else:
            print('A folder named', path, 'already exists!')
            return False

    def request(self, url):  # fetch a URL and return the response
        content = requests.get(url, headers=self.headers)
        return content


Mzitu = mzitu()  # instantiate
Mzitu.all_url('http://www.mzitu.com/all')  # entry point: pass the index page URL to all_url
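For comparison, here is a small sketch of the same extractions done with BeautifulSoup, run against made-up fragments rather than the live site. It uses the built-in 'html.parser' so it runs even without lxml installed; the class names ('all', 'pagenavi', 'main-image') are simply the ones the crawler above assumes the pages use:

from bs4 import BeautifulSoup

# Made-up fragments imitating the index page, an album page, and an image page.
index_html = '''<div class="all">
<a href="http://www.mzitu.com/12345">Album one</a>
<a href="http://www.mzitu.com/67890">Album two</a>
</div>'''
album_html = '''<div class="pagenavi">
<span>1</span><span>2</span><span>47</span><span>next</span>
</div>'''
image_html = '<div class="main-image"><img src="http://i.meizitu.net/a/01.jpg" alt="x"></div>'

# all_url(): every <a> inside <div class="all">, then .get_text() and ['href'].
for a in BeautifulSoup(index_html, 'html.parser').find('div', class_='all').find_all('a'):
    print(a.get_text(), a['href'])

# html(): the second-to-last <span> in the pagenavi holds the page count.
max_span = BeautifulSoup(album_html, 'html.parser').find(
    'div', class_='pagenavi').find_all('span')[-2].get_text()
print(int(max_span))  # 47

# img(): the real image URL is the src of the <img> inside <div class="main-image">.
print(BeautifulSoup(image_html, 'html.parser').find(
    'div', class_='main-image').find('img')['src'])

Compared with the regex version, these selectors read much closer to the page structure and tolerate small markup changes (extra attributes, whitespace) that would silently break a hand-written pattern.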