网站更新内容:请访问: https://bigdata.ministep.cn/

rest_agent

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/4/18 16:30
# @Author  : justin.郑 3907721@qq.com
# @File    : rest_agent.py
# @Desc    :

import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np


class RestAgent():
    def __init__(self):
        # request header
        self.user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) " \
                     "AppleWebKit/537.36 (KHTML, like Gecko) " \
                     "Chrome/57.0.2987.133 Safari/537.36 "

        # simulate http request
        self.session = requests.Session()
        self.session.headers['User-Agent'] = self.user_agent
        self.session.headers['X-Forwarded-For'] = ':'.join('{0:x}'.format(np.random.randint(0, 2**16 - 1)) for i in range(4)) + ':1'
        self.proxies = None

    def add_headers(self, dict):
        self.session.headers.update(dict)

    def get_cookies(self):
        return self.session.cookies

    def set_proxies(self, proxies):
        self.proxies = proxies

    def do_request(self, url, param = None, method="GET", type="text", encoding = None, json = None, **kwargs):
        if self.proxies is None:
            if method == "GET":
                res = self.session.get(url, params=param, **kwargs)
            else:
                if json is not None:
                    res = self.session.post(url, json=json, **kwargs)
                else:
                    res = self.session.post(url, data=param, **kwargs)
        else:
            if method == "GET":
                res = self.session.get(url, params=param, proxies=self.proxies, **kwargs)
            else:
                if json is not None:
                    res = self.session.post(url, json=json, proxies=self.proxies, **kwargs)
                else:
                    res = self.session.post(url, data=param, proxies=self.proxies, **kwargs)

        if res.status_code != 200:
            return None
        else:

            if encoding is not None:
                res.encoding = encoding

            if type == 'text':
                return res.text
            else:
                return res.content

    def get_aspx_param(self, url):
        html = self.do_request(url)
        bsObj = BeautifulSoup(html, 'html5lib')
        __VIEWSTATE          = bsObj.find('input', {'id': '__VIEWSTATE'}).attrs['value']
        __EVENTVALIDATION    = bsObj.find('input', {'id': '__EVENTVALIDATION'}).attrs['value']
        __VIEWSTATEGENERATOR = bsObj.find('input', {'id' : '__VIEWSTATEGENERATOR'}).attrs['value']

        data = {
            "__VIEWSTATE"            : __VIEWSTATE,
            "__EVENTVALIDATION"     : __EVENTVALIDATION,
            "__VIEWSTATEGENERATOR"  : __VIEWSTATEGENERATOR,
        }
        return data

    def get_proxy_list(self):
        url = "http://www.mimiip.com/gngao/"
        pageno = 0

        proxy_list = []
        while pageno < 10:
            pageno = pageno + 1
            rsp = self.do_request(url + str(pageno), None, None)

            if rsp is None:
                return None

            soup = BeautifulSoup(rsp)
            tables = soup.find_all('table')

            data = []
            for table in tables:
                if table.has_key('class') and "list" in table['class']:
                    rows = table.findAll('tr')
                    for row in rows:
                        cols = row.findAll('td')
                        if (len(cols) > 5) :
                            ip   = cols[0].text
                            port = cols[1].text
                            type = cols[4].text
                            data.append((ip, port, type))

            if len(data) == 0:
                break
            else:
                proxy_list.extend(data)

        df = pd.DataFrame(proxy_list)
        df.columns = ['IP', 'Port', 'Type']

        return df


if __name__ == '__main__':
    # Manual smoke test: scrape the proxy listing and print whatever comes back.
    agent = RestAgent()
    print(agent.get_proxy_list())

posted @ 2021-02-19 19:09  ministep88  阅读(35)  评论(0)    收藏  举报
网站更新内容:请访问:https://bigdata.ministep.cn/