声明:本文以某某图为例,代码仅供学习参考!
1、利用 Fiddler 访问某某图首页,抓取请求头(headers),结果如下:
# Request headers captured from a real browser session; they are replayed on
# every request so the crawler looks like that browser.
headers = {
    "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
    # "Accept-Encoding": "gzip, deflate" is deliberately left out: with it the
    # response body shows up garbled when inspected locally.
    "Accept-Language": "zh-CN,zh;q=0.8",
    # Adjacent literals are concatenated, so this is one single-line value.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"
    ),
    "Connection": "keep-alive",
    "Referer": "http://www.mzitu.com",
}
2、将 headers 转换为 (key, value) 元组列表备用(opener.addheaders 需要这种格式)
# opener.addheaders takes a list of (name, value) tuples rather than a dict,
# so convert the header dict once and reuse the list for every opener.
headall = list(headers.items())
3、获取html内容
def openhtml():
    """Fetch the page at the module-level ``url`` and return its raw body.

    The request is sent with the browser-like headers in ``headall``, keeps
    cookies in an in-memory jar, and is routed through a local HTTP proxy.
    The opener is also installed globally, so later ``urllib2.urlopen`` calls
    reuse the same proxy, cookie and header settings.

    Returns:
        The response body exactly as returned by ``read()``.
    """
    # NOTE(review): http.cookiejar is the Python 3 module name, while urllib2
    # is Python 2 only (where the stdlib name is cookielib) -- confirm the
    # target interpreter / whether a backport package is expected.
    cookie_jar = http.cookiejar.CookieJar()
    # 127.0.0.1:8888 is Fiddler's proxy address; sending traffic through it
    # makes requests easy to inspect when debugging. A proxy must be
    # listening there for the request to succeed.
    proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8888'})
    opener = urllib2.build_opener(
        proxy,
        urllib2.HTTPHandler,
        urllib2.HTTPCookieProcessor(cookie_jar),
    )
    opener.addheaders = headall
    urllib2.install_opener(opener)

    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
4、利用正则表达式获取所有图片链接并保存到本地
def download(data):
    """Download every ``.jpg`` referenced by a ``data-original`` attribute.

    Each image is saved as ``<index>.jpg`` (numbered from 0 in the order the
    URLs appear in *data*) under ``C:\\Users\\zzz\\Desktop\\images``; that
    directory must already exist.

    Args:
        data: HTML text of the listing page.

    Returns:
        None. Nothing is requested or written when *data* contains no
        matching image URL.
    """
    # Capture only the URL, and stop at the closing quote so a match can
    # never run on into a later attribute.
    image_urls = re.findall(r"data-original='([^']*?\.jpg)", data)
    if not image_urls:
        return

    # One opener serves every download. Replaying the browser-like headers
    # (Referer included) is what gets past the site's hotlink protection.
    opener = urllib2.build_opener()
    opener.addheaders = headall

    for index, image_url in enumerate(image_urls):
        print(image_url)
        response = opener.open(image_url)
        try:
            image_bytes = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
        target_path = "C:\\Users\\zzz\\Desktop\\images\\" + str(index) + ".jpg"
        with open(target_path, "wb") as image_file:
            image_file.write(image_bytes)
5、完整代码
#coding=utf8
import urllib2
import http.cookiejar
import re
# Listing page whose images are downloaded.
url = "http://www.mzitu.com/xinggan"

# Request headers captured from a real browser session; they are replayed on
# every request so the crawler looks like that browser.
headers = {
    "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
    # "Accept-Encoding": "gzip, deflate" is deliberately left out.
    "Accept-Language": "zh-CN,zh;q=0.8",
    # Adjacent literals are concatenated, so this is one single-line value.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"
    ),
    "Connection": "keep-alive",
    "Referer": "http://www.mzitu.com",
}

# opener.addheaders takes a list of (name, value) tuples rather than a dict,
# so convert the header dict once and reuse the list for every opener.
headall = list(headers.items())
#获取html
def openhtml():
    """Fetch the page at the module-level ``url`` and return its raw body.

    The request is sent with the browser-like headers in ``headall``, keeps
    cookies in an in-memory jar, and is routed through a local HTTP proxy.
    The opener is also installed globally, so later ``urllib2.urlopen`` calls
    reuse the same proxy, cookie and header settings.

    Returns:
        The response body exactly as returned by ``read()``.
    """
    # NOTE(review): http.cookiejar is the Python 3 module name, while urllib2
    # is Python 2 only (where the stdlib name is cookielib) -- confirm the
    # target interpreter / whether a backport package is expected.
    cookie_jar = http.cookiejar.CookieJar()
    # 127.0.0.1:8888 is Fiddler's proxy address; sending traffic through it
    # makes requests easy to inspect when debugging. A proxy must be
    # listening there for the request to succeed.
    proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8888'})
    opener = urllib2.build_opener(
        proxy,
        urllib2.HTTPHandler,
        urllib2.HTTPCookieProcessor(cookie_jar),
    )
    opener.addheaders = headall
    urllib2.install_opener(opener)

    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
#下载
def download(data):
    """Download every ``.jpg`` referenced by a ``data-original`` attribute.

    Each image is saved as ``<index>.jpg`` (numbered from 0 in the order the
    URLs appear in *data*) under ``C:\\Users\\zzz\\Desktop\\images``; that
    directory must already exist.

    Args:
        data: HTML text of the listing page.

    Returns:
        None. Nothing is requested or written when *data* contains no
        matching image URL.
    """
    # Capture only the URL, and stop at the closing quote so a match can
    # never run on into a later attribute.
    image_urls = re.findall(r"data-original='([^']*?\.jpg)", data)
    if not image_urls:
        return

    # One opener serves every download. Replaying the browser-like headers
    # (Referer included) is what gets past the site's hotlink protection.
    opener = urllib2.build_opener()
    opener.addheaders = headall

    for index, image_url in enumerate(image_urls):
        print(image_url)
        response = opener.open(image_url)
        try:
            image_bytes = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
        target_path = "C:\\Users\\zzz\\Desktop\\images\\" + str(index) + ".jpg"
        with open(target_path, "wb") as image_file:
            image_file.write(image_bytes)
if __name__ == '__main__':
    # Script entry point: fetch the listing page, then save every image it
    # references.
    data = openhtml()
    download(data)