一直对爬虫这块蛮感兴趣的，所以花了点时间看了看，写了个小脚本
代码可能有点乱，毕竟Python小白，勿喷……
嗯，话不多说，放码出来
# -*- coding: UTF-8 -*-
import re
import sys

import requests

 5 headers = {"User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
 6 
 7 url = "http://www.xicidaili.com/nn/"
 8 
 9 context = requests.get(url,headers = headers)
10 
11 #ip和端口
12 # pattern = re.compile("<td>\d+\.\d+\.\d+\.\d+</td>\s+<td>\d+</td>")
13 pattern = re.compile("<td>\d+\.\d+\.\d+\.\d+</td>\s+<td>\d+</td>\s+<td>\s+<.*?</a>\s+</td>\s+<.*?</td>\s+<td>[A-Z]{2,6}</td>")
14 
15 # re.sub字串替换
16 pat = re.compile('::<.*?::<.*?:')
17 
18 
19 #例：123.135.62.217:8118::<ahref="/2018-01-24/shandong">山东泰安</a>::<tdclass="country">高匿:HTTPS
20 #匹配规则：?::<.*?:
21 
22 content = pattern.findall(context.text)
23 for item in content:
24     item = item.replace("<td>","").replace("</td>","").replace("\n",":").replace(" ","")
25     item = pat.sub("__",item)
26     with open("ip.txt","a") as f:
27         f.write(item+"\n")
28 
29 #ip数
30 i = 0
31 #页面数
32 j = 1
33 
34 #pass ip使用次数
35 #防止过多使用同一个ip被封，虽用的代理ip，还是感觉不太好，勿喷
36 #当然，ip和页面一对一又显得浪费
37 #所以加了这个机制
38 x = 0
39 f = open("ip.txt")
40 lines = f.readlines()
41 
42 
43 #数组的长度，Python应该是字典
44 # print len(lines)
45 
46 while i<len(lines):
47     url = "http://www.xicidaili.com/nn/"+str(j)
48     
49     #ip类型判断
50     if re.findall("HTTPS",lines[i].replace("\n","")):
51         ip = "https://"+lines[i].replace("\n","").replace("__HTTPS","")
52         proxies = {
53             "https":ip
54         }
55     elif re.findall("HTTP",lines[i].replace("\n","")):
56         ip = "http://"+lines[i].replace("\n","").replace("__HTTP","")
57         proxies = {
58             "http":ip
59         }
60     else:
61         print "代理ip获取错误..."
62         exit()
63         
64     #判断ip是否可用
65     try:
66         response = requests.get(url,headers = headers,proxies = proxies)
67     except:
68         print "第"+str(i)+"次失败"
69         i = i+1
70     else:
71         context = pattern.findall(response.text)
72         #可用ip保存，存到ip_pass.txt
73         if x>8:
74             with open("ip_pass.txt","a") as f:
75                 f.write(lines[i])
76             i = i+1
77             x = 1
78             print "第"+str(i)+"次成功"
79             print "."
80             print "."
81             print "."
82         else:
83             x = x+1
84         #保存页面信息
85         for item in context:
86             item = item.replace("<td>","").replace("</td>","").replace("\n",":").replace(" ","")
87             item = pat.sub("__",item)
88             with open("ips.txt","a") as f:
89                 f.write(item+"\n")        
90         print "第"+str(j)+"页爬取成功"
91         j = j+1
92 print "success"
posted on 2018-01-24 15:04 Aike_yuan 阅读(223) 评论(0) 收藏举报
刷新页面返回顶部
公告