代码使用 Python 2.7,抓取西刺(xici)免费代理,检测其可用性后存入 MongoDB 数据库,为以后的爬虫做准备。下面直接上代码。

github:https://github.com/zpz351348924/python3_learn

 1 #-*-encoding=utf-8-*-
 2 
 3 import requests
 4 from lxml import etree
 5 import time
 6 import pymongo
 7 from multiprocessing import Pool
 8 
 9 
class Getproxy(object):
    """Scrape free HTTP proxies from xicidaili.com, store them in MongoDB and prune dead ones.

    Proxies are kept in the ``xiciipinfo`` collection of the ``xici`` database,
    one document per IP with the keys ``ip``, ``port``, ``protocol`` and ``area``.
    """

    def __init__(self):
        """Set up the request headers, the listing URL and the MongoDB handles."""
        # Browser-like User-Agent sent with every listing-page request.
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
        # Base URL of the paginated listing; the page number is appended to it.
        self.url = 'http://www.xicidaili.com/wt/'
        self.client = pymongo.MongoClient('localhost', 27017)
        self.xici = self.client['xici']
        self.xiciipinfo = self.xici['xiciipinfo']

    @staticmethod
    def _first_text(nodes, default=None):
        """Return the first XPath text result with whitespace stripped, or *default* if empty."""
        return nodes[0].strip() if nodes else default

    def getip(self, num, timeout=10):
        """Scrape listing page *num* and upsert every proxy found into MongoDB.

        Args:
            num: Page number appended to the listing URL.
            timeout: Seconds to wait for the listing page before requests
                raises a timeout error (previously the call could block forever).
        """
        url = self.url + str(num)
        wb_data = requests.get(url, headers=self.headers, timeout=timeout)
        html = etree.HTML(wb_data.text)
        # Parse row by row: extracting each column as a separate document-wide
        # list and zipping them pairs values from different rows as soon as a
        # single cell (e.g. a missing area link) yields no text.
        # Only rows carrying class="odd" are selected, as in the original query.
        for row in html.xpath('//tr[@class="odd"]'):
            ip = self._first_text(row.xpath('./td[2]/text()'))
            port = self._first_text(row.xpath('./td[3]/text()'))
            if not ip or not port:
                # Without both address parts the entry cannot be used as a proxy.
                continue
            data = {
                'ip': ip,
                'port': port,
                'protocol': self._first_text(row.xpath('./td[6]/text()'), ''),
                'area': self._first_text(row.xpath('./td[4]/a/text()'), ''),
            }
            print(data)
            # Upsert keyed on the IP so repeated runs refresh a document
            # instead of inserting duplicates.
            self.xiciipinfo.update_one({'ip': ip}, {'$set': data}, upsert=True)

    def count(self, num):
        """Scrape listing pages 1 .. num-1, pausing two seconds after each page.

        NOTE(review): *num* is an exclusive upper bound, so ``count(2)`` fetches
        only page 1 -- confirm this is the intended meaning of the argument.
        """
        for page in range(1, num):
            self.getip(page)
            time.sleep(2)

    def dbclose(self):
        """Close the MongoDB client connection."""
        self.client.close()

    def getiplist(self):
        """Return every stored proxy as a ``{'http': 'http://ip:port'}`` dict.

        The dicts have the shape accepted by the ``proxies`` argument of
        ``requests.get``.
        """
        return [
            {'http': 'http://' + doc['ip'] + ':' + doc['port']}
            for doc in self.xiciipinfo.find()
        ]

    def iptest(self, proxy):
        """Probe *proxy* with one HTTP request and delete it from MongoDB if the request fails.

        Args:
            proxy: A ``{'http': 'http://ip:port'}`` dict as produced by getiplist().
        """
        # Drop the leading 'http://' (7 characters) and the ':port' suffix.
        ip = proxy['http'][7:].split(':')[0]
        try:
            requests.get('http://wenshu.court.gov.cn/', proxies=proxy, timeout=6)
        except requests.RequestException:
            # Only request failures count as a dead proxy; a bare ``except``
            # would also swallow KeyboardInterrupt and delete a proxy merely
            # because the run was interrupted.
            print('field...............>>>>>>>>>>>>>>>>>>>>>>>>')
            self.xiciipinfo.delete_many({'ip': ip})
            print('remove it now.....{}'.format(ip))
        else:
            print('<<<<<<<<<<<<<<<<<.............success')
            print(proxy)
77 
78 
if __name__ == '__main__':
    # Refresh the stored proxy list, then probe every stored proxy once.
    # No multiprocessing pool is created here: the original one was never
    # used and only spawned idle worker processes.
    proxy = Getproxy()
    try:
        # count() iterates range(1, num), so this fetches listing page 1 only.
        proxy.count(2)
        # Explicit loop instead of map(): the calls are made for their side
        # effects, and map() is lazy on Python 3 and would never run them.
        for candidate in proxy.getiplist():
            proxy.iptest(candidate)
    finally:
        # Release the MongoDB connection even if scraping or checking fails.
        proxy.dbclose()