Scraping meizitu: the final code, everything from 2015 through 2019-01-14

Total runtime was about 4 hours 20 minutes, yielding 4.36 GB of files. The run also produces a text file recording the codes of images that do not exist, so those codes can be trimmed out of later runs. The actual code is very short, because the approach is fairly brute-force.

 

# -*- coding: utf-8 -*-
"""
Created on Mon Jan 14 18:23:10 2019

@author: Administrator
"""

import requests
from bs4 import BeautifulSoup
import re
import time
#import xlwt





'''
Scratch: writing results to an Excel sheet with xlwt (not used in the end).
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('test', cell_overwrite_ok=True)
sheet.write(0, 0, link[-7:-4])
book.save(r'e:\test1.xls')
'''

#url="https://www.mzitu.com/169451"

my_referer = r'https://www.mzitu.com/169451'



# For pulling the image link out of a gallery page; no longer needed, since the links can be generated directly
'''
r=requests.get(url,headers={'referer':my_referer})
r.encoding=r.apparent_encoding
html=r.content
soup=BeautifulSoup(html,"html.parser")
s=soup.select("div p a")[0].img["src"]
'''
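
For reference, a self-contained, runnable version of that scratch snippet might look like the sketch below. It reuses the gallery URL and the "div p a" selector from the notes above; whether that selector still matches the live page layout is an assumption, not something verified here.

import requests
from bs4 import BeautifulSoup

url = "https://www.mzitu.com/169451"
r = requests.get(url, headers={"Referer": url})
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.content, "html.parser")
# take the first <a> under a <div><p>, then the src of the <img> inside it
print(soup.select("div p a")[0].img["src"])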

# Fetching the images from the generated links




#s=soup.select(".article-content")

#type(s[0])
#Out[18]: bs4.element.Tag

#t=s[0].get_text()


#f=open("d:/测试解析文档学习.html","w",encoding="utf-8")
#f.write(str(s))
'''
a='https://i.meizitu.net/2019/01/13d'

b="https://i.meizitu.net/2018/12/29d"

c="https://i.meizitu.net/2017/01/01b"
d="https://i.meizitu.net/2017/01/02b"
ls=[]
ls=[a,b,c,d]
'''





"""
p1=["0"+str(i) for i in range(1,10)]    #快速列表生成器
p1.append("10","11","12")  
"""


 
site="https://i.meizitu.net/"     #2018

years=[site+str(i)+"/" for i in range(2015,2020)]    #产生这几年


                              #1-31天
    
    
#p2=[chr(i) for i in range(97,123)]          #百度python生成a-z

#        for j in p2:

#经过分析大多数图片都是 https://i.meizitu.net/2018/12/28a01.jpg 末尾的字母主要是 a b c 所以,为了效率,节省点时间吧
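
As a concrete check of that URL scheme, the pieces assemble like this (using the example date from the comment above; the zero-padding is the same as the if/else in downloud() below):

prefix = "https://i.meizitu.net/2018/12/28"   # produced by nyr() below
letter = "a"                                  # album letter for that day
number = 1                                    # image number within the album
link = prefix + letter + "{:02d}".format(number) + ".jpg"
print(link)    # https://i.meizitu.net/2018/12/28a01.jpg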




def nyr(year):
    p0=["0"+str(i) for i in range(1,10)]+[str(i) for i in range(10,13)]   # months 01-12
    p1=["0"+str(i) for i in range(1,10)]+[str(i) for i in range(10,32)]   # days 01-31 (impossible dates just 404 later)

    url_day=[]
    for k in p0:
        for i in p1:
            url_day.append(year+k+'/'+i)     # one candidate day prefix, e.g. .../2018/12/28
    return url_day
# Sweeping a whole year like this is clumsy; honestly crawling one month at a time works better.
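
For clarity, this is what nyr() yields for one of the year prefixes generated above:

days = nyr("https://i.meizitu.net/2018/")
print(len(days))     # 372 entries: 12 months x 31 "days"
print(days[0])       # https://i.meizitu.net/2018/01/01
print(days[-1])      # https://i.meizitu.net/2018/12/31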
"""
p12="https://i.meizitu.net/2018/10/"
url_Nov=[]
for k in p1:    
    for i in p2: 
         url_Nov.append(p12+k+i)


"""

# The Referer header mimics a request coming from a gallery page; image
# hosts like this commonly check it as hotlink protection.
header = {
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
    "Referer": r'https://www.mzitu.com/169451'
}



#headers={'referer':my_referer}


def downloud(url):
    #p2015=["m","p","s","t","u","w","x","y"]
    p2015=[chr(i) for i in range(97,123)]            # letters a-z
    for j in p2015:
        for i in range(1,60):
            if i<10:                                 # zero-pad the number: 3 -> "03"
                link=url+j+"0"+str(i)+".jpg"
            else:
                link=url+j+str(i)+".jpg"

            try:
                r1=requests.get(link,timeout=0.1,headers=header)
                r1.raise_for_status()                # a 404 raises and skips to the next letter
                with open("f:/爬虫生成文件/2015-/"+link[-17:-4].replace("/","")+".jpg","wb") as f:
                    f.write(r1.content)
            except requests.RequestException:
                # log the letter+number code of the missing image, then try the next letter
                with open("f:/爬虫生成文件/爬虫字母.txt","a",encoding="utf-8") as k:
                    k.write(link[-7:-4]+",")
                print("missing:","{:^10}".format(link))
                break
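
The filename slicing deserves a quick note: for a hit such as the example URL above,

link = "https://i.meizitu.net/2018/12/28a01.jpg"
link[-17:-4]                      # "2018/12/28a01"
link[-17:-4].replace("/","")      # "20181228a01", so the file is saved as 20181228a01.jpg
link[-7:-4]                       # "a01", the code written to the miss log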
            

def main():
    start_time=time.time()
    
    for j in range(len(years)):
        n=nyr(years[j])
        for i in range(len(n)):
            downloud(n[i])
    '''
    n=nyr(years[0])
    for i in range(len(n)):
        downloud(n[i])
    '''
    end_time=time.time()
    print("{:10}".format(end_time-start_time))
      
    
main()







"""



def main():
    for i in url_Nov[:200]:
        downloud(i)    
        
"""


"""

对空文件测试
x
Out[107]: 'https://i.meizitu.net/2018/12/12o1'

r1=requests.get(x,headers={'referer':my_referer})

r1
Out[109]: <Response [404]>


明白了这里必须要加上try except ,,raisefor status的原因了



url[-11:].replace("/","")
"""         
       

        

 

 

 

# -*- coding: utf-8 -*-
"""
Created on Mon Jan 14 18:23:10 2019
@author: Administrator
"""

import requests
import time

# Defined at module level so downloud() can see it (in the draft above it also
# lived at top level); the Referer mimics a visit coming from a gallery page.
header = {
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
    "Referer": r'https://www.mzitu.com/169451'
}

def nyr(year):          # given a year prefix, generate a URL prefix for every day of that year
    p0=["0"+str(i) for i in range(1,10)]+[str(i) for i in range(10,13)]   # months 01-12
    p1=["0"+str(i) for i in range(1,10)]+[str(i) for i in range(10,32)]   # days 01-31

    url_day=[]
    for k in p0:
        for i in p1:
            url_day.append(year+k+'/'+i)
    return url_day



def downloud(url):       # url is a year/month/day prefix; try a01-a79, b01-b79, ..., z01-z79
    p2=[chr(i) for i in range(97,123)]     # the 26 letters a-z
    for j in p2:
        for i in range(1,80):              # up to 79 images per album (rarely more than 50);
                                           # the high cap is cheap, since a miss breaks out at once
            if i<10:                       # numbers are two digits, e.g. 3 becomes "03"
                link=url+j+"0"+str(i)+".jpg"
            else:
                link=url+j+str(i)+".jpg"

            try:                           # download the image
                r=requests.get(link,timeout=0.1,headers=header)
                r.raise_for_status()
                with open("f:/爬虫生成文件/2015-/"+link[-17:-4].replace("/","")+".jpg","wb") as f:
                    f.write(r.content)
            except requests.RequestException:
                print("missing:","{:^10}".format(link))
                break
            

def main():
    site="https://i.meizitu.net/"
    years=[site+str(i)+"/" for i in range(2015,2020)]
    start_time=time.time()
    for year in years:             # loop over each year
        for day in nyr(year):      # loop over every day of that year
            downloud(day)
    end_time=time.time()
    print("{:10}".format(end_time-start_time))    # total running time

main()
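
Sweeping all five years in one go is slow, as the notes above admit; to smoke-test on a single month first, one can slice the day list (a small sketch reusing the functions above; January 2018 is an arbitrary pick):

for day in nyr("https://i.meizitu.net/2018/")[:31]:    # the first 31 entries are January
    downloud(day)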








        

 

posted @ 2019-01-16 12:44  V5八旗