python抓取免费ip存入redis
前言
网络爬虫在运行过程中并不顺利,总是会遇到各种各样的问题,如反爬虫策略,它会试图阻止网络爬虫的运行,限制我们的ip,所以说我们先需要在请求的时候加上代理ip,避免真实ip被封禁。在某代理批量抓取ip为我们搭建ip代理池做基础。
代理ip种类
1.透明代理:如果你使用了该形式的代理,服务器端知道你使用了代理机制也知道你的真实ip。
2.匿名代理:知道你使用了代理,但是不知道你的真实ip。
3.高匿代理:不知道你使用了代理,也不知道你的真实ip。
代理的类型
1.http:代理只能转发http协议的请求
2.https:代理只能转发https协议的请求
定义爬虫类
class ProxyPool(object):
    """Scrapes free proxy IPs from listing pages and stores them in Redis."""

    # Maps protocol name ('http'/'https') to its Redis list key;
    # db_map is defined at module level.
    databases_map = db_map

    def __init__(self, start, end):
        """Build the list of listing-page URLs [start, end) and connect to Redis."""
        # One listing page per index in [start, end).
        self.url_list = ['https://free.kuaidaili.com/free/inha/' + str(i)
                         for i in range(start, end)]
        # Desktop Chrome UA so the site serves the normal page.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
        # decode_responses=True makes redis return str instead of bytes.
        # FIX: 'charset' is a deprecated alias of 'encoding' in redis-py;
        # passing both was redundant, so only 'encoding' is kept.
        self.__r = redis.Redis(host='127.0.0.1', port=6379,
                               decode_responses=True, encoding='utf-8')
解析数据
def get_html(self, url):
    """Fetch *url* and return the response body as text.

    FIX: a timeout is set so a dead or throttling site cannot hang the
    crawler thread forever (the original request had no timeout).
    """
    response = requests.get(url=url, headers=self.headers, timeout=10).text
    return response

def parse(self, html):
    """Extract ip:port pairs from a listing page and save each one to Redis."""
    tree = etree.HTML(html)
    # IP addresses (first table column).
    ip = tree.xpath('//*[@id="list"]/table/tbody/tr/td[1]/text()')
    # Port numbers (second table column).
    port = tree.xpath('//*[@id="list"]/table/tbody/tr/td[2]/text()')
    # zip stops at the shorter list, so a ragged scrape cannot raise here.
    for ips, ports in zip(ip, port):
        # Full proxy URL, e.g. 'http://1.2.3.4:8080'.
        http_proxy_url = 'http://' + ips + ':' + ports
        self.save(http_proxy_url, 'http')
保存至redis
def save(self, proxy, protocol):
    """LPUSH *proxy* onto the Redis list keyed by *protocol*.

    FIX: the success message is printed only when the push actually
    succeeded — the original printed '保存成功' unconditionally, even
    right after the except branch had reported an error.
    """
    try:
        # Unknown protocols map to None, which lpush rejects -> except branch.
        self.__r.lpush(self.databases_map.get(protocol, None), proxy)
    except Exception as e:
        print('出错了:', e)
    else:
        print('保存成功', proxy)
main方法设置多线程爬取
def main(self):
    """Crawl every page in url_list and store the proxies it yields."""
    for url in self.url_list:
        html = self.get_html(url)
        self.parse(html)


# --- thread driver -------------------------------------------------------
# FIX: the original created 10 workers that all crawled the *same* pages
# 1-9, storing every proxy ten times and hammering the site with identical
# requests.  Each worker now gets its own disjoint 9-page range.
t_list = []
for i in range(10):
    run = ProxyPool(i * 9 + 1, (i + 1) * 9 + 1)
    t_list.append(run)
thread_list = []
for i in t_list:
    t = Thread(target=i.main, args=())
    thread_list.append(t)
for t in thread_list:
    t.start()
for t in thread_list:
    t.join()
完整代码
import redis
import requests
from lxml import etree
from threading import Thread

# Redis list key for each proxy protocol.
db_map = {
    "http": "proxies:http:v1",
    "https": "proxies:https:v1"
}


class ProxyPool(object):
    """Scrapes free proxy IPs from listing pages and stores them in Redis."""

    # Maps protocol name ('http'/'https') to its Redis list key.
    databases_map = db_map

    def __init__(self, start, end):
        """Build the listing-page URLs for [start, end) and connect to Redis."""
        self.url_list = ['https://xxxxxxxx' + str(i) for i in range(start, end)]
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
        # FIX: 'charset' is a deprecated alias of 'encoding' in redis-py;
        # only 'encoding' is passed now.
        self.__r = redis.Redis(host='127.0.0.1', port=6379,
                               decode_responses=True, encoding='utf-8')

    def get_html(self, url):
        """Fetch *url* and return the body text.

        FIX: a timeout prevents a dead site from hanging a worker forever.
        """
        response = requests.get(url=url, headers=self.headers, timeout=10).text
        return response

    def parse(self, html):
        """Extract ip:port pairs from a listing page and save each one."""
        tree = etree.HTML(html)
        # IP addresses (first table column).
        ip = tree.xpath('//*[@id="list"]/table/tbody/tr/td[1]/text()')
        # Port numbers (second table column).
        port = tree.xpath('//*[@id="list"]/table/tbody/tr/td[2]/text()')
        for ips, ports in zip(ip, port):
            # Full proxy URL, e.g. 'http://1.2.3.4:8080'.
            http_proxy_url = 'http://' + ips + ':' + ports
            self.save(http_proxy_url, 'http')

    def save(self, proxy, protocol):
        """LPUSH *proxy* onto the protocol's Redis list.

        FIX: success is reported only when the push actually succeeded —
        the original printed '保存成功' even after the except branch ran.
        """
        try:
            self.__r.lpush(self.databases_map.get(protocol, None), proxy)
        except Exception as e:
            print('出错了:', e)
        else:
            print('保存成功', proxy)

    def main(self):
        """Crawl every page in url_list and store the proxies found."""
        for url in self.url_list:
            html = self.get_html(url)
            self.parse(html)


def _run_threads():
    """Run ten workers, each crawling its own disjoint 9-page range.

    FIX: the original spawned its threads at module level — merely
    importing the file started the crawl — and gave every worker the
    identical pages 1-9, storing each proxy ten times; it then crawled
    pages 1-9 yet again in the __main__ guard.
    """
    workers = [ProxyPool(i * 9 + 1, (i + 1) * 9 + 1) for i in range(10)]
    threads = [Thread(target=w.main) for w in workers]
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    _run_threads()
免费的ip似乎不是很稳定,果然好的东西都是花钱买的

浙公网安备 33010602011771号