selenium获取页面与mitm中间人代理 爬取数据
使用时需要先运行 mitm 脚本, 然后再运行 selenium 脚本自动点击页面
selenium 负责点击链接, 需要使用mitm代理
class ContrlFlow():
    """Open pages in a Chrome browser whose traffic goes through a local proxy.

    The browser is configured to use the HTTP proxy at 127.0.0.1:8080 so that
    a proxy listening there can observe the requests and responses triggered
    by the pages this class opens.
    """

    def __init__(self):
        # The browser is started immediately when the object is created.
        self.driver: Optional[webdriver.Chrome] = self.gen_driver()

    def gen_driver(self) -> webdriver.Chrome:
        """Build and start the Chrome driver.

        :return: a running ``webdriver.Chrome`` configured with a persistent
            user-data directory, the local proxy and a custom user agent.
        """
        options = webdriver.ChromeOptions()
        # Keep the browser's user configuration in a fixed directory.
        # NOTE(review): gen_source_path() is defined elsewhere; presumably it
        # returns a pathlib.Path-like project directory -- confirm.
        user_path = gen_source_path() / 'temp' / 'webdriver' / 'chromedriver' / 'user_data'
        user_path.mkdir(exist_ok=True, parents=True)
        print(user_path.absolute())
        options.add_argument(rf"user-data-dir={user_path.absolute()}")
        # Important: the proxy must be set, otherwise the man-in-the-middle
        # proxy cannot see this browser's requests and responses.
        options.add_argument('--proxy-server=http://{}:{}'.format('127.0.0.1', 8080))
        # Disguise the user agent sent by the browser.
        user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15"
        options.add_argument('--user-agent=%s' % user_agent)
        # Presumably meant to make the automated browser harder to detect.
        options.add_argument("disable-blink-features=AutomationControlled")
        # Launch the browser with the assembled options.
        driver = webdriver.Chrome(options=options)
        return driver

    def driver_get(self, url):
        """Navigate the managed browser to ``url``.

        driver_flow_market() calls this method, but the class did not define
        it, so the call failed with AttributeError.

        :param url: address of the page to open.
        """
        self.driver.get(url)

    def crawl(self):
        """Run the crawl; currently this only opens the market page."""
        self.driver_flow_market()

    def driver_flow_market(self):
        """Open the target page, then pause for five seconds."""
        self.driver_get("https://xxxxx")
        # Presumably gives the page time to issue its requests -- confirm.
        sleep(5)
mitm代理查看请求与响应数据
from mitmproxy.http import HTTPFlow
class Addon:
    """mitmproxy listener that prints request data and HTTP 200 response bodies."""

    def request(self, flow: HTTPFlow):
        """Print the data of the request carried by ``flow``."""
        print("request data = ", flow.request.data)

    def response(self, flow: HTTPFlow):
        """Print the response content of ``flow`` when its status code is 200."""
        resp = flow.response
        if resp.status_code == 200:
            print(f"response content = {resp.content}")
from mitmproxy.http import HTTPFlow
import json
import pandas as pd
from diskcache import Cache
from mitmproxy.tools import main
from pathlib import Path
from datetime import datetime
class AddonMuji(Addon):
    """Addon that parses the JSON response of one specific URL into a table."""

    def __init__(self, ):
        ...

    def request(self, flow: HTTPFlow):
        """Do nothing for requests.

        The hook name ``request`` is fixed; this override replaces the
        request printing done by the base class with a no-op.
        """
        ...

    def response(self, flow: HTTPFlow):
        """Handle a response; only HTTP 200 responses of the target URL are parsed.

        :param flow: the flow whose response is being handled.
        :return: None
        """
        if flow.response.status_code != 200:
            return
        url = flow.request.url
        if url == 'https://xxxxx':
            self.proc_searchOffer_v2(flow=flow)

    def proc_searchOffer_v2(self, flow: HTTPFlow = None):
        """Extract the offer price list from the response and flatten it.

        :param flow: the flow whose response body holds the JSON payload.
        :return: the ``data['result']['list']`` entries normalized into a
            pandas DataFrame, or None when the body is not valid JSON or
            does not have that structure.
        """
        data = self.flow_to_json(flow=flow, type='response')
        if data is None:
            # flow_to_json already reported the failure; subscripting None
            # here would raise TypeError.
            return None
        try:
            rows = data['result']['list']
        except (KeyError, TypeError):
            print(f'解析报盘数据失败, 响应结构不符合预期 ->{data}')
            return None
        # Return the table so callers can use it instead of discarding it.
        return pd.json_normalize(rows)

    def flow_to_json(self, flow: HTTPFlow = None, type='request'):
        """Decode the JSON body of the request or the response of ``flow``.

        :param flow: the flow to read from.
        :param type: ``'request'`` reads ``flow.request``; any other value
            reads ``flow.response``.
        :return: the decoded JSON value, or None when decoding fails.
        """
        is_request = type == 'request'
        message = flow.request if is_request else flow.response
        try:
            return message.json()
        except (ValueError, TypeError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # TypeError is what mitmproxy's Message.json() is documented to
            # raise when the message content is unavailable.
            label = 'request' if is_request else 'response'
            print(f'解析json报错, flow.{label}为 ->{message.text}')
            return None
# Module-level list holding the addon instance defined in this script.
addons = [AddonMuji()]
if __name__ == '__main__':
    # Run this script first; once the proxy is up, run the selenium
    # automation script.
    mitmdump_args = [
        '-s', __file__,
        '--listen-host', '127.0.0.1',
        '-p', '8080',
        '--set', 'block_global=false',
    ]
    main.mitmdump(mitmdump_args)
出处: https://www.cnblogs.com/meizhengchao/p/19403623
本文版权归作者和博客园共有, 欢迎转载, 但未经作者同意必须保留此段声明, 且在文章页面明显位置给出原文链接. 如有问题, 可邮件(meizhengchao@qq.com)咨询.

浙公网安备 33010602011771号