1 # -*- coding:utf-8 -*-
2 #author : willowj
3 import urllib
4 import urllib2
5 from bs4 import BeautifulSoup
6 import re
7 import bs4
8
9 import sys
10
11
# Python 2 only: force the process-wide default str<->unicode codec to UTF-8.
# sys.setdefaultencoding() is deleted from the sys module at startup, so the
# module has to be reloaded to get the function back.
#
# reload(sys) also rebinds sys.stdin/stdout/stderr to the interpreter's
# original streams. Hosts that replace those streams (IDLE, IPython, output
# capture) would lose all later output, so the current streams are saved
# before the reload and put back afterwards.
_std_streams = (sys.stdin, sys.stdout, sys.stderr)
reload(sys)
sys.setdefaultencoding('utf8')
sys.stdin, sys.stdout, sys.stderr = _std_streams
14
15
def ip_test(ip, url="https://www.baidu.com", timeout=10):
    """Check whether the HTTP proxy *ip* can fetch *url*.

    Args:
        ip: Proxy address without a scheme; "http://" is prepended to it.
        url: Page requested through the proxy.
        timeout: Seconds to wait for the proxy before giving up, so that a
            dead proxy cannot block the caller indefinitely.

    Returns:
        True if the page was read through the proxy, False on any failure
        (connection error, timeout, HTTP error status, ...).

    Prints "ok <proxy url>" on success and "failed" otherwise.
    """
    ip1 = "http://" + ip
    # Proxy maps are keyed by the bare URL scheme ("http", not "http:").
    # Both schemes are routed through the proxy so that the https default
    # test url is actually fetched via the proxy instead of directly.
    proxy_handler = urllib2.ProxyHandler({'http': ip1, 'https': ip1})
    # A private opener keeps the proxy settings local to this check.
    opener = urllib2.build_opener(proxy_handler)
    try:
        response = opener.open(url, timeout=timeout)
        try:
            response.read()
        finally:
            response.close()
    except Exception:
        # Deliberately broad: this is a best-effort probe and any error
        # simply means the proxy is not usable.
        print("failed")
        return False
    print('ok ' + ip1)
    return True
27
28
def get_iphtml_inyoudaili():
    """Return the URL of the newest HTTP proxy list page on youdaili.net.

    Downloads the site's index page and returns the href of the first link
    that matches the "latest HTTP proxy server IP addresses" title pattern.
    The URL is also printed.

    Returns:
        The matched href value.

    Raises:
        IndexError: if the index page contains no matching link.
    """
    url = 'http://www.youdaili.net'
    response = urllib2.urlopen(url)
    try:
        code = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    # Sample anchor on the index page:
    #   href="http://www.youdaili.net/Daili/http/26672.html" title="<date> <title text matched below>"
    regexp = 'href="(.*?)" .*?最新代理http服务器ip地址'
    met = re.findall(regexp, code)
    if not met:
        # Same exception type as indexing an empty list, but with a message
        # that says what was actually missing.
        raise IndexError('no latest HTTP proxy list link found on ' + url)

    latest = met[0]
    print(latest)
    return latest
40
41
def getIps(url):
    """Scrape proxy addresses from *url*, test them, and save the working ones.

    Every candidate found on the page is printed and passed to ip_test().
    Candidates that pass are written to 'ips.txt' (one per line; the file is
    overwritten) and returned.

    Args:
        url: Address of the page that lists the proxies.

    Returns:
        List of the proxy addresses that passed ip_test().
    """
    response = urllib2.urlopen(url)
    try:
        codeip = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    # Captures a run of non-space text that starts with 1-3 digits and a dot
    # and is immediately followed by the "@HTTP#" marker.
    pat_ip = re.compile(r'([1-9][0-9]{0,2}\.\S*?)@HTTP#')

    ips = []
    # "with" guarantees the file is closed even if a test or write raises.
    with open('ips.txt', 'w') as out:
        for candidate in pat_ip.findall(codeip):
            print(candidate)
            if ip_test(candidate):
                ips.append(candidate)
                out.write(candidate + '\n')
    return ips
62
63
def saveIps(list, file='ips.txt'):
    """Write the addresses in *list* to *file*, one per line.

    Any existing content of the file is replaced.

    Args:
        list: Iterable of address strings. The name shadows the builtin but
            is kept so existing keyword callers keep working.
        file: Path of the output file; defaults to 'ips.txt', the same
            default that read_ips() reads from.
    """
    # "with" closes the file even if one of the writes fails.
    with open(file, 'w') as out:
        out.writelines(ip + '\n' for ip in list)
69
70
def read_ips(file='ips.txt'):
    """Read proxy addresses from *file* and return them as a list.

    Each non-blank line yields one entry. Surrounding whitespace (including
    a trailing carriage return) is stripped and blank lines are skipped, so
    the result never contains empty strings. The list is also printed.

    Args:
        file: Path of the file to read; defaults to 'ips.txt'.

    Returns:
        List of address strings in file order.
    """
    ips = []
    # "with" makes sure the file handle is released after reading.
    with open(file) as src:
        for line in src:
            ip = line.strip()
            if ip:
                ips.append(ip)
    print(ips)
    return ips
81
82
if __name__ == "__main__":
    # Script entry point: locate the newest proxy list page, keep the
    # proxies that pass the connectivity test, then write them to disk.
    latest_page = get_iphtml_inyoudaili()
    saveIps(getIps(latest_page))