Crawling Proxy IPs
Anti-Crawling Based on User Behavior
Some sites detect crawlers by watching user behavior, for example the same IP requesting the same page many times within a short window, or the same account repeating the same operation over and over in a short time. A proxy IP pool solves this: check candidate proxies, save the usable ones to a file, re-crawl proxy lists periodically to refresh the pool, and re-validate the stored proxies on a schedule so that dead ones are dropped.
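To sketch the idea before building anything, rotating requests through a small proxy pool looks roughly like the snippet below. The addresses are placeholders, not working proxies; a real pool comes from the crawler in the next section.

import random
import requests

# Placeholder pool for illustration only; a real pool is built by the crawler below.
PROXY_POOL = [
    'http://117.88.176.38:3000',
    'http://121.237.148.133:3000',
]

def fetch(url):
    proxy = random.choice(PROXY_POOL)             # a different exit IP on each call
    proxies = {'http': proxy, 'https': proxy}
    return requests.get(url, proxies=proxies, timeout=3)

print(fetch('https://www.baidu.com').status_code)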
Crawling Proxy IPs
As an example, let's crawl a small batch of proxy IPs and save them to a JSON file (you can just as well store them in a database or another data file, and re-crawl on a schedule to keep the data fresh):
import requests
from bs4 import BeautifulSoup
from mymodule.Util import getHeader
import os
import threading


class ProxyIp:
    def __init__(self):
        self.pagenum = 5
        self.url = 'https://www.xicidaili.com/nn/'
        self.header = getHeader()
        self.ip_urls = []

    def reqIPUrl(self):
        try:
            lock = threading.Lock()
            # one thread per listing page
            threads = [threading.Thread(target=self.spiderIp, args=(self.url + str(i), lock,))
                       for i in range(1, self.pagenum)]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()
            print(len(self.ip_urls))
            print(self.ip_urls)
            with open('myip.json', 'w+') as file:
                file.write(str(self.ip_urls))
            # self.filterIp()
            print('success')
        except Exception as err:
            print(err)
            os._exit(0)

    def spiderIp(self, url, lock):
        try:
            res = requests.get(url, headers=self.header)
            res.raise_for_status()
            # decode explicitly so the Chinese text is not garbled
            page = BeautifulSoup(res.content.decode(encoding='utf-8'), 'lxml')
            res.close()
            # skip the header row of the listing table
            lines = page.select('#ip_list tr')[1:]
            ips = [line.select('td')[1].getText() for line in lines]
            ports = [line.select('td')[2].getText() for line in lines]
            addrs = [line.select('td')[3].getText() for line in lines]
            protocals = [line.select('td')[5].getText() for line in lines]
            for i in range(0, len(ips)):
                lock.acquire()
                # print({'ip': ips[i], 'port': ports[i], 'addr': addrs[i], 'prot': protocals[i]})
                self.ip_urls.append({'ip': ips[i], 'port': ports[i], 'addr': addrs[i], 'prot': protocals[i]})
                lock.release()
        except Exception as err:
            print(err)
            os._exit(1)


if __name__ == '__main__':
    sw = ProxyIp()
    sw.reqIPUrl()
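One thing to note about the script above: file.write(str(self.ip_urls)) stores the list as a Python literal with single quotes, which is why the validation script below has to replace quotes before calling json.loads. If you would rather keep the file as strictly valid JSON (an alternative, not what the code above does), json.dump handles it directly; the sample entry here is a made-up stand-in for the crawled list:

import json

# Hypothetical stand-in for the list collected by ProxyIp above.
ip_urls = [{'ip': '117.88.176.38', 'port': '3000', 'addr': 'example', 'prot': 'HTTPS'}]

with open('myip.json', 'w') as f:
    json.dump(ip_urls, f, ensure_ascii=False)

with open('myip.json', 'r') as f:
    print(json.load(f))   # reads back without any quote replacement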
Validating Proxy IPs
The script below reads myip.json back in, tests each proxy against a live URL in its own thread, and saves the proxies that respond to goodip.json:
import json
import requests
import threading


# load the crawled proxy list from disk
def getIpData():
    try:
        with open('myip.json', 'r') as file:
            t = file.read()
        # the file was written with str(), so turn it into valid JSON first
        t = t.replace('\'', '\"').replace('\\n', '')
        return json.loads(t)
    except Exception as err:
        print(err)
        return []


def valVer(proxys):
    lock = threading.Lock()
    threads = [threading.Thread(target=reqIp, args=(proxy, lock)) for proxy in proxys]
    for th in threads:
        th.start()
    for th in threads:
        th.join()


def reqIp(proxy, lock):
    try:
        proxy_host = proxy
        protocol = 'https' if 'https' in proxy_host else 'http'
        proxies = {protocol: proxy_host}
        # a proxy counts as usable if the test request succeeds within 3 seconds
        response = requests.get('https://www.baidu.com', proxies=proxies, timeout=3)
        if response.status_code == 200:
            print(proxy)
            lock.acquire()
            good_proxys.append(proxy)
            lock.release()
    except Exception as e:
        print(e)


if __name__ == '__main__':
    ipData = getIpData()
    good_proxys = []
    proxys = []
    for ipobj in ipData:
        ip = ipobj['ip']
        port = ipobj['port']
        prot = ipobj['prot']
        proxys.append(str(prot).lower() + '://' + ip + ':' + port)
    print(proxys)
    valVer(proxys)
    print(good_proxys)
    with open('goodip.json', 'w+') as file:
        file.write(str(good_proxys))
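The intro also mentions re-crawling and re-validating on a schedule. A minimal way to wire that up is a loop that re-runs both scripts at a fixed interval; the file names and the interval below are assumptions, not something defined earlier:

import subprocess
import time

CRAWL_SCRIPT = 'crawl_ip.py'   # hypothetical file name for the crawler script
CHECK_SCRIPT = 'check_ip.py'   # hypothetical file name for the validation script
REFRESH_INTERVAL = 30 * 60     # re-run every 30 minutes; pick whatever suits your target site

while True:
    subprocess.run(['python', CRAWL_SCRIPT])   # refresh myip.json
    subprocess.run(['python', CHECK_SCRIPT])   # rebuild goodip.json from the fresh list
    time.sleep(REFRESH_INTERVAL)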
