# -*- coding: utf-8 -*-
"""
Created on Mon Jan 14 18:23:10 2019
@author: Administrator
"""
import requests
from bs4 import BeautifulSoup
# import re  # not actually used in this script
import time
# import xlwt
'''
Earlier experiment: logging results to an Excel sheet with xlwt.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('test', cell_overwrite_ok=True)
sheet.write(0, 0, link[-7:-4])
book.save(r'e:\test1.xls')
'''
# url = "https://www.mzitu.com/169451"
my_referer = r'https://www.mzitu.com/169451'  # used to fetch image links from a page; no longer needed, since the links can be generated
'''
Earlier experiment: scrape the image URL from a gallery page (needs `url` above):
r = requests.get(url, headers={'referer': my_referer})
r.encoding = r.apparent_encoding
html = r.content
soup = BeautifulSoup(html, "html.parser")
s = soup.select("div p a")[0].img["src"]  # pull the image URL out of the page
# s = soup.select(".article-content")
# type(s[0])
# Out[18]: bs4.element.Tag
# t = s[0].get_text()
# f = open("d:/测试解析文档学习.html", "w", encoding="utf-8")
# f.write(str(s))
'''
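# A minimal runnable sketch of the page-scrape route above. The selector and the
# div > p > a > img layout are assumptions carried over from that experiment,
# not verified against the live site.
def fetch_image_src(page_url, referer=my_referer):
    r = requests.get(page_url, headers={'referer': referer}, timeout=5)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")
    img = soup.select_one("div p a img")  # first image nested in div > p > a
    return img["src"] if img is not None else None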
# Sample image-URL prefixes observed on the site: date plus a gallery letter
a = 'https://i.meizitu.net/2019/01/13d'
b = "https://i.meizitu.net/2018/12/29d"
c = "https://i.meizitu.net/2017/01/01b"
d = "https://i.meizitu.net/2017/01/02b"
ls = [a, b, c, d]
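# How a prefix becomes a concrete image link (pattern inferred from the sample
# URLs above; the site is not contacted here):
example_link = a + "01.jpg"  # 'https://i.meizitu.net/2019/01/13d01.jpg'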
'''"""
p1=["0"+str(i) for i in range(1,10)] #快速列表生成器
p1.append("10","11","12")
"""
site="https://i.meizitu.net/"#2018
year=[site+str(i)+"/"for i in range(2015,2020)] #产生这几年#1-31天#p2=[chr(i) for i in range(97,123)] #百度python生成a-z# for j in p2:#经过分析大多数图片都是 https://i.meizitu.net/2018/12/28a01.jpg 末尾的字母主要是 a b c 所以,为了效率,节省点时间吧def nyr(y):
p0=["0"+str(i) for i in range(1,10)]+[str(i) for i in range(10,13)] #月
p1=["0"+str(i) for i in range(1,10)]+[str(i) for i in range(10,32)] #产生31天
url_day=[]
for k in p0:
for i in p1:
url_day.append(y+k+'/'+i) #产生某一天return url_day
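# Quick sanity check of nyr()'s output shape (no network access):
_demo = nyr("https://i.meizitu.net/2018/")
assert _demo[0] == "https://i.meizitu.net/2018/01/01"
assert len(_demo) == 12 * 31  # 372 prefixes; impossible dates like 02/30 just 404 later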
# Walking a whole year this way turned out to be clumsy; crawling one specific
# month at a time is more practical:
"""
p12 = "https://i.meizitu.net/2018/10/"
url_Nov = []
for k in p1:      # day strings
    for i in p2:  # gallery letters
        url_Nov.append(p12 + k + i)
"""
header = {
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
    "Referer": r'https://www.mzitu.com/169451'
}
# headers = {'referer': my_referer}

def downloud(url):
    """Try every letter+number suffix for one day prefix and save the hits."""
    # p2 = ["a", "b", "c"]
    p2 = [chr(i) for i in range(97, 123)]  # a-z
    for j in p2:
        for i in range(1, 60):
            if i < 10:
                link = url + j + "0" + str(i) + ".jpg"
            else:
                link = url + j + str(i) + ".jpg"
            try:
                # A very short timeout keeps the scan fast; slow hits count as misses.
                r1 = requests.get(link, timeout=0.1, headers=header)
                r1.raise_for_status()
                html1 = r1.content
                # ss = str(i)
                with open("f:/爬虫生成文件/2015-/" + link[-17:-4].replace("/", "") + ".jpg", "wb") as f:
                    f.write(html1)
            except Exception:
                # First miss ends this letter: log the failing suffix and move on.
                with open("f:/爬虫生成文件/爬虫字母统计.txt", "a", encoding="utf-8") as k:
                    k.write(link[-7:-4] + ",")
                print("not found:", "{:^10}".format(link))
                break
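# Example single-prefix call (commented out: it would hit the network):
# downloud("https://i.meizitu.net/2018/12/29")  # tries 29a01.jpg, 29a02.jpg, ...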
def main():
    start_time = time.time()
    for y in year:
        for prefix in nyr(y):
            downloud(prefix)
    end_time = time.time()
    print("{:10}".format(end_time - start_time))  # total crawl time in seconds

main()
"""
def main():
for i in url_Nov[:200]:
downloud(i)
""""""
Notes from testing a missing file:
x
Out[107]: 'https://i.meizitu.net/2018/12/12o1'
r1 = requests.get(x, headers={'referer': my_referer})
r1
Out[109]: <Response [404]>
This is why the try/except and raise_for_status() are needed here: requests.get()
returns the 404 response without raising, so without raise_for_status() the error
page would be saved to disk as a .jpg.
url[-11:].replace("/", "")
"""