# -*- coding: utf-8 -*-
2
3 import urllib2
4 import re
5 import threading
6 import time
7
"""
Scrape proxy IPs and ports from the proxy listing page:
http://www.xici.net.co/nn/%d
"""
12
# Module-level accumulator filled by get_proxy_from_cnproxy(); each entry is
# a list of the form [ip, port, protocol, addr].
proxylist = []
14
15
def get_proxy_from_cnproxy():
    """Scrape the proxy listing page and append the rows to ``proxylist``.

    Every table row matched on a listing page is appended to the
    module-level ``proxylist`` as ``[ip, port, protocol, addr]``. Each
    requested URL is printed before it is fetched, and the accumulated
    list is printed once scraping has finished.

    Errors raised by ``urllib2.urlopen`` or by reading the response are
    not handled here and propagate to the caller.
    """
    # Capture groups, in order: the <img alt="..."> text (stored as addr),
    # the next two <td> cells (ip and port), and -- after a cell holding a
    # link and one skipped cell -- the protocol cell.
    row_pattern = re.compile(r'<td><img alt="(.+?)" src=".+?" /></td>[\s\S]*?<td>(.+?)</td>[\s\S]*?<td>(.+?)</td>[\s\S]*?<td>[\s\S]*?<a href=".+?">.+?</a>[\s\S]*?</td>[\s\S]*?<td>.+?</td>[\s\S]*?<td>(.+?)</td>')

    # range(1, 2) fetches page 1 only; widen the range to scrape more pages.
    for page in range(1, 2):
        target = r"http://www.xici.net.co/nn/%d" % page
        print(target)
        response = urllib2.urlopen(target)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
        for addr, ip, port, protocol in row_pattern.findall(html):
            proxylist.append([ip, port, protocol, addr])
    print(proxylist)
36
37
class ProxyCheck(threading.Thread):
    """Worker thread that tests proxies and saves the working ones to a file.

    Each input record is indexed as ``[ip, port, protocol, addr]``. When the
    thread runs it fetches ``test_url`` through every proxy, keeps those whose
    response contains ``test_str``, sorts them by elapsed time and writes
    them to ``fname``.
    """

    def __init__(self, proxylist, fname):
        """Store the proxies to test and the output file name.

        Args:
            proxylist: iterable of records indexed as
                ``[ip, port, protocol, addr]``.
            fname: path of the file written by ``save``.
        """
        threading.Thread.__init__(self)
        self.proxylist = proxylist
        self.timeout = 5  # request timeout handed to the opener, in seconds
        self.test_url = "http://www.baidu.com/"
        # Marker that must appear in the response body for a proxy to pass.
        self.test_str = "030173"
        # Proxies that passed, as [ip, port, protocol, addr, seconds_used].
        self.checkedPProxyList = []
        self.fname = fname

    def checkProxy(self):
        """Fetch ``test_url`` through each proxy and record the ones that work.

        A proxy passes when the request succeeds within ``timeout`` and the
        body contains ``test_str``. Any exception raised while fetching is
        printed and that proxy is skipped, so one dead proxy never aborts
        the whole run.
        """
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxylist:
            proxy_handler = urllib2.ProxyHandler(
                {"http": r'http://%s:%s' % (proxy[0], proxy[1])})
            opener = urllib2.build_opener(cookies, proxy_handler)
            opener.addheaders = [('user-agent', 'mozilla/5.0(iphone; u; cpu like mac os x; en) applewebkit/420+ (khtml, like gecko) version/3.0 mobile/1A537a safari/419.3')]
            started = time.time()
            try:
                # Use this opener directly rather than install_opener(): the
                # installed opener is process-wide state, and replacing it
                # from a thread would race with any other urllib2 user.
                response = opener.open(self.test_url, timeout=self.timeout)
                try:
                    body = response.read()
                finally:
                    response.close()
            except Exception as e:
                # Best effort: report the failure and move on. The exception
                # itself is printed because e.message is often empty.
                print(e)
                continue
            timeused = time.time() - started
            # Membership test accepts the marker at any offset, including 0.
            if self.test_str in body:
                self.checkedPProxyList.append(
                    [proxy[0], proxy[1], proxy[2], proxy[3], timeused])

    def sort(self):
        """Sort the passing proxies in place by elapsed time, fastest first."""
        self.checkedPProxyList.sort(key=lambda entry: entry[4])

    def save(self):
        """Write the passing proxies to ``fname``, one per line.

        Line format: ``ip:port<TAB>protocol<TAB>addr<TAB>seconds_used``.
        Any existing file content is replaced.
        """
        with open(self.fname, 'w') as out:
            for proxy in self.checkedPProxyList:
                out.write("%s:%s\t%s\t%s\t%s\n" % (
                    proxy[0], proxy[1], proxy[2], proxy[3], str(proxy[4])))

    def run(self):
        """Thread entry point: check every proxy, sort the results, save them."""
        self.checkProxy()
        self.sort()
        self.save()
82
83
if __name__ == "__main__":
    # Fill the module-level proxylist from the listing page, then validate
    # the collected proxies on a worker thread that writes to test.txt.
    get_proxy_from_cnproxy()
    checker = ProxyCheck(proxylist, "test.txt")
    checker.start()