使用 Selenium 打开页面，并通过 mitmproxy 中间人代理抓取数据

使用时需要先运行 mitmproxy 脚本，然后再运行 Selenium 脚本自动点击页面。

Selenium 负责点击链接；浏览器必须配置为通过 mitmproxy 代理（127.0.0.1:8080）访问，mitmproxy 才能看到请求与响应。


class ContrlFlow():
    """Drive a Chrome browser whose traffic is routed through a local proxy.

    The driver is created once in ``__init__`` and configured to use the
    HTTP proxy at 127.0.0.1:8080, so that a mitmproxy instance listening there
    can inspect the requests and responses triggered by the visited pages.
    """

    def __init__(self):
        self.driver: Optional[webdriver.Chrome] = self.gen_driver()

    def gen_driver(self) -> webdriver.Chrome:
        """Create and return a Chrome driver configured for proxied crawling.

        Returns:
            A ``webdriver.Chrome`` instance using a dedicated user-data
            directory, the local proxy, and a custom user agent.
        """
        options = webdriver.ChromeOptions()
        # Keep the browser's user profile in a dedicated directory, creating
        # it (and any missing parents) on first use.
        user_path = gen_source_path() / 'temp' / 'webdriver' / 'chromedriver' / 'user_data'
        user_path.mkdir(exist_ok=True, parents=True)
        print(user_path.absolute())
        options.add_argument(rf"user-data-dir={user_path.absolute()}")
        # Important: the proxy must be set, otherwise mitmproxy cannot see
        # the requests and responses.
        options.add_argument('--proxy-server=http://{}:{}'.format('127.0.0.1', 8080))

        # Override the user agent sent by the browser.
        user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15"
        options.add_argument('--user-agent=%s' % user_agent)
        options.add_argument("disable-blink-features=AutomationControlled")
        # Start the browser with the assembled options.
        return webdriver.Chrome(options=options)

    def crawl(self):
        """Run the crawl; currently this only visits the market page."""
        self.driver_flow_market()

    def driver_flow_market(self):
        """Open the target page in the proxied browser and wait briefly."""
        # Fixed: the original called the non-existent ``self.driver_get``;
        # navigation goes through the driver's own ``get`` method.
        self.driver.get("https://xxxxx")
        # Fixed pause after navigation; presumably gives the page time to
        # issue its requests through the proxy -- TODO confirm.
        sleep(5)

通过 mitmproxy 代理查看请求与响应数据

from mitmproxy.http import HTTPFlow


class Addon:
    """Base mitmproxy listener that prints the traffic passing through."""

    def request(self, flow: HTTPFlow):
        """Print the raw data of the intercepted request."""
        print("request data = ", flow.request.data)

    def response(self, flow: HTTPFlow):
        """Print the response body, but only for responses with status 200."""
        response = flow.response
        if response.status_code == 200:
            print(f"response content = {response.content}")

from mitmproxy.http import HTTPFlow
import json
import pandas as pd
from diskcache import Cache
from mitmproxy.tools import main
from pathlib import Path
from datetime import datetime


class AddonMuji(Addon):
    """mitmproxy addon that extracts the offer list from one watched URL."""

    def __init__(self):
        ...

    def request(self, flow: HTTPFlow):
        """Request hook; intentionally does nothing.

        The method name ``request`` is fixed and must not be renamed.
        """

    def response(self, flow: HTTPFlow):
        """Response hook: process successful responses from the watched URL.

        :param flow: the intercepted HTTP flow
        :return: None
        """
        if flow.response.status_code != 200:
            return
        if flow.request.url == 'https://xxxxx':
            self.proc_searchOffer_v2(flow=flow)

    def proc_searchOffer_v2(self, flow: HTTPFlow = None):
        """Extract the offer price list from the response body.

        :param flow: the intercepted HTTP flow whose response holds the data
        :return: a DataFrame built from ``payload['result']['list']``, or
            ``None`` when the response body could not be parsed as JSON
        :raises KeyError: if the parsed payload lacks ``result`` / ``list``
        """
        payload = self.flow_to_json(flow=flow, type='response')
        if payload is None:
            # flow_to_json already reported the parse failure; indexing None
            # here would raise a TypeError inside the hook.
            return None
        return pd.json_normalize(payload['result']['list'])

    def flow_to_json(self, flow: HTTPFlow = None, type='request'):
        """Decode the JSON body of the flow's request or response.

        :param flow: the intercepted HTTP flow
        :param type: ``'request'`` to decode the request body; any other
            value decodes the response body
        :return: the decoded JSON value, or ``None`` if decoding failed
        """
        is_request = type == 'request'
        message = flow.request if is_request else flow.response
        try:
            return message.json()
        except (json.decoder.JSONDecodeError, TypeError):
            # JSONDecodeError: body is not valid JSON. TypeError: raised by
            # mitmproxy's json() when the message content is unavailable.
            label = 'request' if is_request else 'response'
            print(f'解析json报错, flow.{label}为 ->{message.text}')
            return None


# Module-level addon list; this file is handed to mitmdump via ``-s`` below.
addons = [AddonMuji()]

if __name__ == '__main__':
    # Run this script first to start the proxy, then run the selenium
    # automation script.
    mitmdump_args = [
        '-s', __file__,
        '--listen-host', '127.0.0.1',
        '-p', '8080',
        '--set', 'block_global=false',
    ]
    main.mitmdump(mitmdump_args)

posted @ 2025-12-26 11:17  meizhengchao  阅读(1)  评论(0)    收藏  举报