Python 爬取美女图片
本程序利用requests,bs4实现对http://meizitu.com大分类图片集中所有链接进行遍历,然后抓取图片:
# -*- coding:utf-8 -*-
# @Time : 2017-02-10 22:51
# @Author : Vincen_Shen
# @Site :
# @File : mm.py
# @Software : PyCharm
'''
Automatically crawl images from www.meizitu.com.

Note that the image files themselves are stored on mm.howkuai.com.
Both sites require headers to be sent with GET requests; requests
without them are filtered out.
'''
from bs4 import BeautifulSoup
import requests
import time


def _build_headers(host):
    """Return the browser-like request headers used for every GET to *host*."""
    return {'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
            'Connection': 'keep-alive',
            'Host': host}


def images_down(urls):
    """
    Download each image and save it in the current directory, named after
    the current Unix timestamp (``<seconds>.jpg``).

    A failed download or file write is reported on stdout and skipped, so
    one bad URL does not stop the remaining downloads.

    :param urls: list of direct image URLs
    """
    heard = _build_headers('mm.howkuai.com')
    for url in urls:
        # Pause between requests; this also keeps the second-resolution
        # timestamp file names distinct from one download to the next.
        time.sleep(1)
        print(url)
        try:
            image = requests.get(url, headers=heard, timeout=5)
            # Reject 4xx/5xx answers so an error page is never saved as a .jpg.
            image.raise_for_status()
        except requests.RequestException as exc:
            print('Error!!!', url, exc)
            continue
        image_name = str(int(time.time())) + '.jpg'
        try:
            with open(image_name, 'wb') as f:
                f.write(image.content)
        except OSError as exc:
            print('Error!!!', image_name, exc)


def images_urls(url):
    """
    Collect the image addresses found on one gallery page.

    Only ``<img>`` tags located inside elements with ``id="picture"`` are
    considered; tags without a ``src`` attribute are ignored.

    :param url: URL of a gallery page
    :return: list of the ``src`` values of the images on that page
    """
    response = requests.get(url, headers=_build_headers('www.meizitu.com'), timeout=5)
    soup = BeautifulSoup(response.text, 'html.parser')
    mm_links = []
    for container in soup.find_all(id='picture'):
        for img in container.find_all('img'):
            src = img.get('src')
            if src:
                mm_links.append(src)
    return mm_links


def index_urls():
    """
    Walk index pages 1-11 of the category, follow the gallery link of every
    ``class="tit"`` entry and download the images of each gallery.

    A page or gallery that cannot be fetched is reported on stdout and
    skipped; the crawl then continues with the next one.
    """
    heard = _build_headers('www.meizitu.com')
    page_urls = ['http://www.meizitu.com/a/xinggan_2_%s.html' % i for i in range(1, 12)]
    for page_url in page_urls:
        try:
            response = requests.get(page_url, headers=heard, timeout=5)
        except requests.RequestException as exc:
            print('Error!!!', page_url, exc)
            continue
        # The encoding is set explicitly to GBK before the body is decoded.
        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text, 'html.parser')
        for title in soup.find_all(class_="tit"):
            link = title.find('a')
            href = link.get('href') if link is not None else None
            if not href:
                # Entry without a usable gallery link: nothing to follow.
                continue
            try:
                # Fetch the gallery page and extract its image URLs.
                mm_links = images_urls(href)
            except requests.RequestException as exc:
                print('Error!!!', href, exc)
                continue
            # Download every image of this gallery.
            images_down(mm_links)


if __name__ == '__main__':
    index_urls()