Flask开发系列之Flask+redis实现IP代理池

Flask开发系列之Flask+redis实现IP代理池

 

代理池的要求

  • 多站抓取,异步检测:多站抓取指的是我们需要从各大免费的ip代理网站,把他们公开的一些免费代理抓取下来;异步检测指的是:把这些代理通过异步请求的方式,利用这些代理请求网站:如果能正常请求就证明代理可用,如果不能正常请求就证明代理不行,这时就可以把这个代理剔除掉。异步指的是:我们不需要一直等待代理请求网站,等得到response之后再执行相应的操作就可以了,异步可以提高检测效率。

  • 定时筛选,持续更新:我们维护一个代理池,需要定时从里面拿出一部分代理来检测,剔除掉不可用的代理,这样可以保证池中的代理是可用的。

  • 提供接口,易于提取:代理实际上是维护在一个队列中,队列可以使用数据库存储,也可以使用一些数据结构来存储,但是如果要获取代理的话,要提供一个简单的接口,最简单的是web形式的接口。本文主要演示利用python flask包来提供接口,之后使用python请求该网址,就可以从返回的页面中拿到代理的信息了。

 

代理池的架构

  • 获取器:从各大网站平台抓取代理:ip和端口

  • 过滤器:剔除掉不可用的代理

  • 将可用代理放到代理队列

  • 定时检测器:剔除不可用的代理

  • API:通过接口形式拿到代理对象,方便使用

 

测试实现版

import requests
import re
import time
import redis
from bloom_filter import BloomFilter
import ast

# Redis connection shared by the whole script, backed by one connection pool.
pool = redis.ConnectionPool(
    host='localhost',
    port=6379,
    password='xxx',
    decode_responses=True,
)
r = redis.Redis(connection_pool=pool)

# In-process Bloom filter consulted by stone() before pushing to Redis.
# NOTE(review): error_rate=0.1 presumably means ~10% of new proxies can be
# wrongly reported as duplicates -- confirm against the bloom_filter docs.
bloombloom = BloomFilter(
    max_elements=10000,
    error_rate=0.1,
)
# Pre-seed the filter with one proxy mapping so it counts as already seen.
bloombloom.add(str({'http': '117.91.232.53:9999'}))


def get_ip(i):
    """Scrape one page of the kuaidaili free-proxy listing.

    Args:
        i: Zero-based page index; page ``i + 1`` of the listing is requested.

    Returns:
        A list of ``"ip:port"`` strings, one per IP/port pair matched in the
        page HTML. The list is empty when nothing matches.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the server does not answer within the timeout.
    """
    url = 'https://www.kuaidaili.com/free/inha/' + str(i + 1)
    # Without a timeout a stalled server would block the whole run forever.
    html = requests.get(url=url, timeout=10).text
    # Raw string: "\d" and "\." must reach the regex engine untouched rather
    # than being treated as (invalid) string escape sequences.
    regip = r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>'
    matcher = re.compile(regip, re.S)
    ipstr = matcher.findall(html)
    # Fixed one-second pause after each page fetch (presumably rate limiting).
    time.sleep(1)
    ip_list = [host + ':' + port for host, port in ipstr]
    print('共收集到%d个代理ip' % len(ip_list))
    print(ip_list)
    return ip_list



def valVer(proxys):
    """Probe each candidate proxy and return the ones that answer with 200.

    Every candidate is used to request http://www.baidu.com with a 2 second
    timeout. The module-level counters ``goodNum`` / ``badNum`` and the list
    ``good_list`` are updated as a side effect, so they must exist before
    this function is called.

    Args:
        proxys: Iterable of ``"ip:port"`` strings.

    Returns:
        A list of requests-style proxy dicts (``{scheme: "ip:port"}``) for
        the candidates that passed during this call.
    """
    global badNum, goodNum, good_list
    usable = []
    for candidate in proxys:
        try:
            # Default to plain http unless the candidate mentions https.
            scheme = 'http'
            if 'https' in candidate:
                scheme = 'https'
            mapping = {scheme: candidate}
            print('现在正在测试的IP:', mapping)
            reply = requests.get('http://www.baidu.com', proxies=mapping, timeout=2)
            if reply.status_code == 200:
                goodNum += 1
                usable.append(mapping)
                good_list.append(mapping)
                print(candidate, 'success proxy')
            else:
                badNum += 1
                print(candidate, 'bad proxy')
        except Exception as err:
            # Any failure while probing counts the candidate as bad.
            print(err)
            badNum += 1
    print('success proxy num : ', goodNum)
    print('bad proxy num : ', badNum)
    print("这次:", usable)
    print("此时全部:", good_list)
    return usable


def time_valVer(proxys):
    """Re-check proxies already stored in Redis and drop the ones that fail.

    Each entry is probed against http://www.baidu.com with a 2 second
    timeout. Working proxies are appended to the module-level ``good_list``.
    An entry is removed from the Redis list ``ip_list`` when the probe request
    fails or answers with a non-200 status. Entries that cannot be parsed are
    reported and left untouched.

    Args:
        proxys: Iterable of stored entries, each the ``str()`` form of a
            requests-style proxies dict, e.g. ``"{'http': '1.2.3.4:80'}"``.
    """
    for raw in proxys:
        print('现在正在定时测试的IP:', raw)
        try:
            proxy = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError) as e:
            print(e)
            continue
        try:
            response = requests.get('http://www.baidu.com', proxies=proxy, timeout=2)
            alive = response.status_code == 200
        except requests.RequestException as e:
            # A proxy that cannot complete the request is unusable.
            print(e)
            alive = False
        if alive:
            good_list.append(proxy)
            print(proxy, 'success proxy')
        else:
            # Remove by the exact string that was stored, not the parsed dict.
            # Argument order is (name, count, value) as in redis-py >= 3.0.
            r.lrem("ip_list", 1, raw)
            print(proxy, 'bad proxy')

def stone(good):
    """Store newly validated proxies in Redis, skipping ones seen before.

    Each proxy is keyed by its ``str()`` form. If the Bloom filter
    ``bloombloom`` reports the key as present, the proxy is skipped;
    otherwise the key is added to the filter and pushed onto the right end
    of the Redis list ``ip_list``.

    Args:
        good: Iterable of proxy mappings, as returned by ``valVer``.
    """
    for IP in good:
        key = str(IP)
        if key in bloombloom:
            # Apply the %s placeholder; the original printed it literally.
            print("%s不能存储,有相同的IP" % key)
            continue
        print("存储的IP:", IP)
        bloombloom.add(key)
        r.rpush("ip_list", key)

if __name__ == '__main__':
    # Module-level state mutated by valVer() / time_valVer().
    badNum = 0
    goodNum = 0
    good_list = []
    for i in range(0, 10):
        # NOTE(review): range(0, 10) yields 0..9, so this condition is never
        # true and the periodic re-check below never runs. Confirm the
        # intended period before changing it.
        if i % 10 == 0 and i != 0:
            # One LRANGE returns a consistent snapshot of the whole list and
            # avoids reusing the outer loop variable as an index.
            proxy_list = r.lrange("ip_list", 0, -1)
            time_valVer(proxy_list)
        else:
            ip_list = get_ip(i)
            good = valVer(ip_list)
            stone(good)

 

 

from flask import Flask
import redis   # 导入redis模块,通过python操作redis 也可以直接在redis主机的服务端操作缓存数据库


r = redis.Redis(host='localhost', port=6379, password='xxx', decode_responses=True)
app = Flask(__name__)


@app.route('/ip/<int:index>')
def reponse(index):
    """Return the proxy stored at position ``index`` of the Redis list ``ip_list``.

    Args:
        index: Non-negative list position taken from the URL.

    Returns:
        The stored proxy string, or a 404 response when the list has no
        element at that position.
    """
    print(index)
    proxy = r.lindex("ip_list", index)
    print(proxy)
    if proxy is None:
        # A view must not return None (Flask raises and answers 500), so an
        # out-of-range index is reported explicitly as "not found".
        return 'no proxy at index %d' % index, 404
    return proxy


if __name__ == '__main__':
    # NOTE(review): debug=True is meant for local development only.
    app.run(debug=True)

 

 获取ip:

 改进版

import requests
import re
import time
import redis
from bloom_filter import BloomFilter
import ast


# Redis connection shared by the whole script, backed by one connection pool.
pool = redis.ConnectionPool(
    host='localhost',
    port=6379,
    password='XXX',
    decode_responses=True,
)
r = redis.Redis(connection_pool=pool)

# In-process Bloom filter consulted by stone_redis() before pushing to Redis;
# init() refills it from the stored list at start-up.
bloombloom = BloomFilter(
    max_elements=10000,
    error_rate=0.1,
)


def get_ip(i):
    """Scrape one page of the kuaidaili free-proxy listing.

    Args:
        i: Zero-based page index; page ``i + 1`` of the listing is requested.

    Returns:
        A list of ``"ip:port"`` strings, one per IP/port pair matched in the
        page HTML. The list is empty when nothing matches.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the server does not answer within the timeout.
    """
    url = 'https://www.kuaidaili.com/free/inha/' + str(i + 1)
    # Without a timeout a stalled server would block the whole run forever.
    html = requests.get(url=url, timeout=10).text
    # Raw string: "\d" and "\." must reach the regex engine untouched rather
    # than being treated as (invalid) string escape sequences.
    regip = r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>'
    matcher = re.compile(regip, re.S)
    ipstr = matcher.findall(html)
    # Fixed one-second pause after each page fetch (presumably rate limiting).
    time.sleep(1)
    ip_list = [host + ':' + port for host, port in ipstr]
    print('共收集到%d个代理ip' % len(ip_list))
    print(ip_list)
    return ip_list


def valVer(proxys):
    """Probe each candidate proxy and return the ones that answer with 200.

    Every candidate is used to request http://www.baidu.com with a 2 second
    timeout. The module-level counters ``goodNum`` / ``badNum`` and the list
    ``good_list`` are updated as a side effect, so they must exist before
    this function is called.

    Args:
        proxys: Iterable of ``"ip:port"`` strings.

    Returns:
        A list of requests-style proxy dicts (``{scheme: "ip:port"}``) for
        the candidates that passed during this call.
    """
    global badNum, goodNum, good_list
    usable = []
    for candidate in proxys:
        try:
            # Default to plain http unless the candidate mentions https.
            scheme = 'http'
            if 'https' in candidate:
                scheme = 'https'
            mapping = {scheme: candidate}
            reply = requests.get('http://www.baidu.com', proxies=mapping, timeout=2)
            if reply.status_code == 200:
                goodNum += 1
                usable.append(mapping)
                good_list.append(mapping)
            else:
                badNum += 1
        except Exception as err:
            # Any failure while probing counts the candidate as bad.
            print(err)
            badNum += 1
    print('success proxy num : ', goodNum)
    print('bad proxy num : ', badNum)
    print("这次:", usable)
    print("此时全部:", good_list)
    return usable


def time_valVer(proxys):
    """Re-check proxies already stored in Redis and drop the ones that fail.

    Each entry is probed against http://www.baidu.com with a 2 second
    timeout. An entry is removed from the Redis list ``ip_list`` when the
    probe request fails or answers with a non-200 status. Entries that cannot
    be parsed are reported and left untouched.

    Args:
        proxys: Iterable of stored entries, each the ``str()`` form of a
            requests-style proxies dict, e.g. ``"{'http': '1.2.3.4:80'}"``.
    """
    for raw in proxys:
        print('现在正在定时测试的IP:', raw)
        try:
            proxy = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError) as e:
            print(e)
            continue
        try:
            response = requests.get('http://www.baidu.com', proxies=proxy, timeout=2)
            alive = response.status_code == 200
        except requests.RequestException as e:
            # A proxy that cannot complete the request is unusable.
            print(e)
            alive = False
        if not alive:
            # Remove by the exact string that was stored, not the parsed dict.
            # Argument order is (name, count, value) as in redis-py >= 3.0.
            r.lrem("ip_list", 1, raw)

def stone_redis(good):
    """Store newly validated proxies in Redis, skipping ones seen before.

    Each proxy is keyed by its ``str()`` form. If the Bloom filter
    ``bloombloom`` reports the key as present, the proxy is skipped;
    otherwise the key is added to the filter and pushed onto the right end
    of the Redis list ``ip_list``.

    Args:
        good: Iterable of proxy mappings, as returned by ``valVer``.
    """
    for IP in good:
        key = str(IP)
        if key in bloombloom:
            # Apply the %s placeholder; the original printed it literally.
            print("%s不能存储,有相同的IP" % key)
            continue
        print("存储的IP:", IP)
        bloombloom.add(key)
        r.rpush("ip_list", key)

def init():
    """Load every entry of the Redis list ``ip_list`` into the Bloom filter.

    This lets duplicates of proxies stored by an earlier run be detected.
    Each stored entry is printed as it is loaded.
    """
    # One LRANGE fetches a consistent snapshot in a single round trip. The
    # per-index LLEN/LINDEX loop queried each element twice and could yield
    # None if the list shrank in between.
    for entry in r.lrange("ip_list", 0, -1):
        print(entry)
        bloombloom.add(entry)


if __name__ == '__main__':
    # Module-level state mutated by valVer().
    badNum = 0
    goodNum = 0
    good_list = []
    # Refill the Bloom filter from what is already stored in Redis.
    init()
    for i in range(0, 10):
        if i % 2 == 0 and i != 0:
            # Even rounds (2, 4, 6, 8): re-check everything already stored.
            # One LRANGE returns a consistent snapshot of the whole list and
            # avoids reusing the outer loop variable as an index.
            proxy_list = r.lrange("ip_list", 0, -1)
            time_valVer(proxy_list)
        else:
            # Remaining rounds: scrape listing page i + 1, validate, store.
            ip_list = get_ip(i)
            good = valVer(ip_list)
            stone_redis(good)

 

 

from flask import Flask, abort, request, jsonify
import redis   # 导入redis模块,通过python操作redis 也可以直接在redis主机的服务端操作缓存数据库

r = redis.Redis(host='localhost', port=6379, password='XXX', decode_responses=True)
app = Flask(__name__)


@app.route('/ip/<int:index>', methods=['GET'])
def reponse(index):
    """Return the proxy at position ``index`` of the Redis list ``ip_list`` as JSON.

    Args:
        index: Non-negative list position taken from the URL.

    Returns:
        A JSON response of the form ``{"ip": <stored value>}``.
    """
    print(index)
    # Read the element once so the logged value and the returned value
    # cannot differ if the list changes between two reads.
    proxy = r.lindex("ip_list", index)
    print(proxy)
    return jsonify({"ip": proxy})


if __name__ == '__main__':
    # NOTE(review): debug=True is meant for local development only.
    app.run(debug=True)

 

获取ip:

 

posted @ 2019-06-11 12:19  -零  阅读(1354)  评论(0编辑  收藏  举报