import re
import urllib.request
import random
import os
import http.server
import http.client
from urllib.error import URLError, HTTPError
import urllib.parse
proxy = [] # Global pool of proxy addresses ("ip:port" strings); filled by get_proxy(), consumed by change_proxy()
def change_proxy():
    """Pick a random entry from the global ``proxy`` pool and install it
    as the process-wide urllib opener (with a browser User-Agent), so all
    subsequent ``urllib.request.urlopen`` calls go through that proxy."""
    chosen = random.choice(proxy)
    handler = urllib.request.ProxyHandler({"http": chosen})
    new_opener = urllib.request.build_opener(handler)
    new_opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')]
    urllib.request.install_opener(new_opener)
    print("代理IP: %s" % chosen)
def url_open(url):
    """Fetch *url* with the currently installed opener and return the raw
    response body as bytes.

    On any network/HTTP failure the proxy is rotated via change_proxy()
    and the request retried; after 5 consecutive failures the last
    exception is re-raised.

    Bug fixed: the original compared the int counter against the string
    "5" (never true), so the retry loop could spin forever; it also never
    actually stopped even when the message was printed.
    """
    failures = 0
    while True:
        try:
            response = urllib.request.urlopen(url)
            return response.read()
        # urllib.error.URLError/HTTPError are OSError subclasses, and
        # BadStatusLine/IncompleteRead are http.client.HTTPException
        # subclasses — two clauses cover everything the original caught.
        except (OSError, http.client.HTTPException) as e:
            failures += 1
            print("链接出问题了,智能切换新的代理IP\n出错的问题是:" + str(e))
            if failures >= 5:
                print("已经失败了5次,程序退出,重新执行")
                raise
            change_proxy()
def get_pagenum(url):
    """Return the current comment-page number of *url* as a 4-digit string
    (e.g. "2305"), scraped from the
    <span class="current-comment-page">[NNNN]</span> element.

    Raises ValueError if the marker is not found (the original crashed
    with an opaque AttributeError on ``None.group()`` instead).
    """
    html = url_open(url).decode("utf-8")
    # One regex with a capture group replaces the original two-pass search.
    match = re.search(r'<span\sclass="current-comment-page">\[(\d{4})\]</span>', html)
    if match is None:
        raise ValueError("current-comment-page marker not found in page")
    return match.group(1)
def get_imgurl(url):
    """Return the list of protocol-relative image URLs ("//ww....jpg")
    found in <img src="..."> tags on *url*.

    A single non-greedy capture-group regex replaces the original
    greedy two-pass matching, which could swallow several tags on one
    line into a single bogus URL.
    """
    html = url_open(url).decode("utf-8")
    # Non-greedy ``.*?`` stops at the first ".jpg" after each "//ww".
    return re.findall(r'<img src="(//ww.*?\.jpg)', html)
def save_img(img):
    """Download every protocol-relative URL in *img* into the current
    working directory, naming each file after the last path segment.

    Prints a progress line per image. Returns None.
    """
    for i, each in enumerate(img, start=1):
        filename = each.split('/')[-1]
        # Fetch BEFORE opening the file, so a failed download does not
        # leave a truncated/empty file behind (original opened first).
        imgpage = url_open("http:%s" % each)
        with open(filename, 'wb') as f:
            f.write(imgpage)
        print("下载本页的第%s张图片,名称为%s" % (i, filename))
def get_proxy():
    """Scrape www.xicidaili.com for proxy servers, append each HTTP one
    to the global ``proxy`` list as an "ip:port" string, and return the
    list.

    Cleanups vs. the original: the compiled pattern is no longer stored
    in a variable (``IP``) that the loop then rebound to a list, each
    table row is split only once instead of three times, and the
    ``range(len(...))`` index loop iterates the rows directly.
    """
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
    req = urllib.request.Request(url="http://www.xicidaili.com", headers=head)
    response = urllib.request.urlopen(req)
    html = response.read().decode("utf-8")
    row_re = re.compile(r'''<tr\sclass=.+>\s+
    <td\s.+</td>\s+
    <td>.+</td>\s+
    <td>.+</td>\s+
    <td>.+</td>\s+
    <td\s.+?</td>\s+
    <td>.+</td>\s+
    <td>.+</td>\s+
    <td>.+</td>\s+
    </tr>
    ''', re.VERBOSE)
    for row in row_re.findall(html):
        # Split the whole <tr> block on whitespace once; the original
        # recomputed this split three times per row.
        cells = row.split()
        # Each cell looks like "<td>value</td>"; strip the surrounding
        # tag by splitting on ">" then "<".
        # NOTE(review): the fixed indexes 7 / 8 / -4 assume the site's
        # table layout (ip, port, ..., protocol) — verify against the
        # live page if parsing breaks.
        ip_addr = cells[7].split(">")[1].split("<")[0]
        port = cells[8].split(">")[1].split("<")[0]
        protocol = cells[-4].split(">")[1].split("<")[0]
        if protocol == "HTTP":
            proxy.append(ip_addr + ":" + port)
    return proxy
def download(dir, url):
    """Download images from the 10 pages preceding the current page of
    *url* into directory *dir* (created if missing).

    Side effect: chdirs into *dir* for the remainder of the process.

    Cleanups vs. the original: the no-op ``url = url``, the unused
    ``saveimg`` binding, and the ``else`` branch that duplicated
    ``os.chdir`` are gone.
    """
    if not os.path.isdir(dir):
        os.mkdir(dir)
    os.chdir(dir)  # both original branches chdir'd — hoisted out
    page_num = int(get_pagenum(url))
    for _ in range(10):
        page_num -= 1
        pageurl = url + "page-" + str(page_num) + "#comments"
        imgurl = get_imgurl(pageurl)
        print("下载第%s页图片" % page_num)
        save_img(imgurl)
if __name__ == "__main__":
    # Build the proxy pool, install one proxy, then start downloading.
    get_proxy()
    change_proxy()
    save_dir = "ooxx"  # renamed from `dir` to stop shadowing the builtin
    start_url = "http://jandan.net/ooxx/"
    download(save_dir, start_url)