一直对爬虫这块蛮感兴趣的,所以花了点时间看了看,写了个小脚本
代码可能有点乱,毕竟Python小白,勿喷……
嗯,话不多说,放码出来
# -*- coding: UTF-8 -*-
"""Collect proxy entries from xicidaili.com and crawl its pages through them.

Step 1 downloads the first listing page directly and appends every proxy row
it finds to ``ip.txt`` as ``ip:port__TYPE`` (TYPE is the upper-case tag in the
row's last cell, e.g. HTTP or HTTPS).

Step 2 reads ``ip.txt`` back and requests listing pages 1, 2, 3, ... through
those proxies, appending the rows found on each page to ``ips.txt``.  A proxy
whose request raises is skipped; a proxy that keeps working is appended to
``ip_pass.txt`` and rotated out once its use counter exceeds the limit.
"""
import re
import sys

import requests

headers = {"User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}

url = "http://www.xicidaili.com/nn/"

# Seconds to wait on a request before treating the proxy as dead.  Without a
# timeout a single unresponsive proxy blocks the script forever.
REQUEST_TIMEOUT = 10

# A proxy is recorded in ip_pass.txt and replaced by the next one as soon as
# the use counter is greater than this value.  Rotating avoids hammering the
# site through a single address without wasting one proxy per page.
ROTATE_AFTER = 8

# One table row: ip cell, port cell, a cell holding a link, one more cell,
# and finally a cell with a 2-6 letter upper-case tag.
# (An earlier variant matched only the ip and port cells:
#  r"<td>\d+\.\d+\.\d+\.\d+</td>\s+<td>\d+</td>")
pattern = re.compile(
    r"<td>\d+\.\d+\.\d+\.\d+</td>\s+<td>\d+</td>\s+<td>\s+<.*?</a>\s+</td>"
    r"\s+<.*?</td>\s+<td>[A-Z]{2,6}</td>"
)

# After the tag/whitespace clean-up a row looks like
#   1.2.3.4:8118::<ahref="...">...</a>::<tdclass="country">...:HTTPS
# This pattern matches the middle part so it can be replaced by "__".
pat = re.compile(r'::<.*?::<.*?:')


def _extract_entries(html):
    """Return every proxy row in *html* as an ``ip:port__TYPE`` string."""
    entries = []
    for row in pattern.findall(html):
        # Drop the plain <td> tags, turn line breaks into ':' separators and
        # remove all spaces, then collapse the middle cells into "__".
        row = row.replace("<td>", "").replace("</td>", "")
        row = row.replace("\n", ":").replace(" ", "")
        entries.append(pat.sub("__", row))
    return entries


def _append_lines(path, entries):
    """Append each entry to the file at *path*, one entry per line."""
    with open(path, "a") as out:
        for entry in entries:
            out.write(entry + "\n")


def _proxies_for(entry):
    """Build the ``proxies`` mapping for one ``ip:port__TYPE`` entry.

    Returns None when the entry carries neither an HTTPS nor an HTTP tag.
    """
    # HTTPS must be tested first because "HTTP" is a substring of it.
    if "HTTPS" in entry:
        address = entry.replace("__HTTPS", "")
    elif "HTTP" in entry:
        address = entry.replace("__HTTP", "")
    else:
        return None
    # requests selects a proxy by the scheme of the *target* URL.  The pages
    # crawled here are http://, so a mapping with only an "https" key would
    # never be used and the request would go out directly; register the proxy
    # for both schemes instead.
    # NOTE(review): assumes the proxy endpoint itself is reached over plain
    # HTTP even when the list tags it HTTPS - confirm against the proxy list.
    proxy_url = "http://" + address
    return {"http": proxy_url, "https": proxy_url}


def main():
    """Run both steps: seed ip.txt, then crawl pages through the proxies."""
    first_page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    _append_lines("ip.txt", _extract_entries(first_page.text))

    # readlines() yields a list; blank lines are dropped so a stray empty
    # line cannot abort the run.
    with open("ip.txt") as source:
        entries = [line.strip() for line in source if line.strip()]
    if not entries:
        print("代理ip获取错误...")
        sys.exit(1)

    proxy_index = 0   # index of the proxy currently in use
    page_number = 1   # next listing page to fetch
    use_count = 0     # use counter for the current proxy (see ROTATE_AFTER)

    while proxy_index < len(entries):
        entry = entries[proxy_index]
        proxies = _proxies_for(entry)
        if proxies is None:
            print("代理ip获取错误...")
            sys.exit(1)

        page_url = url + str(page_number)
        try:
            response = requests.get(
                page_url,
                headers=headers,
                proxies=proxies,
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException:
            # Only request failures mark the proxy as unusable; Ctrl-C and
            # SystemExit must still be able to stop the script.
            print("第"+str(proxy_index)+"次失败")
            proxy_index = proxy_index + 1
            continue

        if use_count > ROTATE_AFTER:
            # The proxy has served enough requests: record it as usable and
            # move on to the next one.
            _append_lines("ip_pass.txt", [entry])
            proxy_index = proxy_index + 1
            use_count = 1
            print("第"+str(proxy_index)+"次成功")
            print(".")
            print(".")
            print(".")
        else:
            use_count = use_count + 1

        # Save the proxy rows found on this page.
        _append_lines("ips.txt", _extract_entries(response.text))
        print("第"+str(page_number)+"页爬取成功")
        page_number = page_number + 1

    print("success")


if __name__ == "__main__":
    main()
浙公网安备 33010602011771号