Python实例 -- 爬虫

 1 # -*- coding: utf-8 -*-
 2 
 3 import urllib2
 4 import re
 5 import threading
 6 import time
 7 
 8 """
 9 抓取代理发布页的ip和port
10 http://www.xici.net.co/nn/%d
11 """
12 
13 proxylist = []
14 
15 
def get_proxy_from_cnproxy(timeout=10):
    """Scrape proxy records from the xici.net.co listing into ``proxylist``.

    Each listing page is downloaded and matched against a regular expression
    with four capture groups.  The groups are treated, in order, as
    ``addr``, ``ip``, ``port`` and ``protocol``; every match is appended to
    the module-level ``proxylist`` as ``[ip, port, protocol, addr]``.

    Args:
        timeout: Seconds to wait for each page before giving up.  Without a
            timeout an unresponsive server would block the caller forever.

    Raises:
        Whatever ``urllib2.urlopen`` / ``read`` raise on network failure;
        errors are deliberately not swallowed here.
    """
    # NOTE: no ``global`` statement is needed -- the shared list is only
    # mutated in place, never rebound.
    row_pattern = re.compile(
        r'<td><img alt="(.+?)" src=".+?" /></td>[\s\S]*?'
        r'<td>(.+?)</td>[\s\S]*?'
        r'<td>(.+?)</td>[\s\S]*?'
        r'<td>[\s\S]*?<a href=".+?">.+?</a>[\s\S]*?</td>[\s\S]*?'
        r'<td>.+?</td>[\s\S]*?'
        r'<td>(.+?)</td>'
    )

    # range(1, 2) visits page 1 only; widen the range to crawl more pages.
    for page in range(1, 2):
        target = r"http://www.xici.net.co/nn/%d" % page
        print(target)
        response = urllib2.urlopen(target, timeout=timeout)
        try:
            result = response.read()
        finally:
            # Release the socket even if read() fails.
            response.close()
        proxylist.extend(
            [ip, port, protocol, addr]
            for addr, ip, port, protocol in row_pattern.findall(result)
        )
        print(proxylist)
36 
37 
class ProxyCheck(threading.Thread):
    """Worker thread that probes HTTP proxies and saves the working ones.

    ``run`` checks every candidate, orders the survivors by response time
    (fastest first) and writes them to ``fname``.

    Args:
        proxylist: Sequence of records indexed as ``[ip, port, protocol,
            addr]`` (only indices 0-3 are read).
        fname: Path of the output file; it is truncated on every save.
    """

    def __init__(self, proxylist, fname):
        threading.Thread.__init__(self)
        self.proxylist = proxylist
        # Per-request timeout, in seconds.
        self.timeout = 5
        # Page fetched through each proxy, and the marker string that must
        # appear in the response for the proxy to count as working.
        self.test_url = "http://www.baidu.com/"
        self.test_str = "030173"
        # Accepted proxies as [ip, port, protocol, addr, seconds_used].
        self.checkedPProxyList = []
        self.fname = fname

    def checkProxy(self):
        """Fetch ``test_url`` through every proxy and keep those that work.

        A proxy is accepted when the request completes within ``timeout``
        and the body contains ``test_str``.  Failures are printed and the
        proxy is skipped.
        """
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxylist:
            proxy_handler = urllib2.ProxyHandler(
                {"http": r'http://%s:%s' % (proxy[0], proxy[1])})
            opener = urllib2.build_opener(cookies, proxy_handler)
            opener.addheaders = [('user-agent', 'mozilla/5.0(iphone; u; cpu like mac os x; en) applewebkit/420+ (khtml, like gecko) version/3.0 mobile/1A537a safari/419.3')]
            started = time.time()
            try:
                # Use this opener directly instead of install_opener():
                # installing it would change process-global state, so
                # concurrent checkers could end up using each other's proxy.
                response = opener.open(self.test_url, timeout=self.timeout)
                try:
                    result = response.read()
                finally:
                    response.close()
                timeused = time.time() - started
            except Exception as err:
                # Best effort: any failure just disqualifies this proxy.
                print(err)
                continue
            # Membership test accepts a marker at any offset, including 0.
            if self.test_str in result:
                self.checkedPProxyList.append(
                    [proxy[0], proxy[1], proxy[2], proxy[3], timeused])

    def sort(self):
        """Order the accepted proxies in place by response time, ascending."""
        # list.sort() mutates the list; sorted() would return a copy that
        # is thrown away, leaving the stored list unordered.
        self.checkedPProxyList.sort(key=lambda entry: entry[4])

    def save(self):
        """Write the accepted proxies to ``fname``, one per line.

        Line format: ``ip:port<TAB>protocol<TAB>addr<TAB>seconds``.
        """
        # The context manager closes the file even if a write fails.
        with open(self.fname, 'w') as out:
            for ip, port, protocol, addr, timeused in self.checkedPProxyList:
                out.write("%s:%s\t%s\t%s\t%s\n"
                          % (ip, port, protocol, addr, str(timeused)))

    def run(self):
        """Thread entry point: check, sort, then save."""
        self.checkProxy()
        self.sort()
        self.save()
82         
83 
if __name__ == "__main__":
    # Step 1: fill the shared proxy list from the listing site.
    get_proxy_from_cnproxy()
    # Step 2: verify the collected proxies on a background thread, which
    # writes the working ones to test.txt.
    ProxyCheck(proxylist, "test.txt").start()

 

posted @ 2014-08-04 01:15  luzhiyuan  阅读(443)  评论(0)  编辑  收藏  举报