Chrome 浏览器调试与数据采集:Chrome DevTools Protocol(CDP)
"C:\Program Files\Google\Chrome\Application\chrome.exe" --user-data-dir=D:\demo_app\dist\demo_app\data\user_data_dir --no-first-run --no-default-browser-check --remote-debugging-port=9222 --remote-allow-origins=* https://www.baidu.com/index.php
Windows CDP 远程调试:从踩坑到局域网打通 - 进化概率论
[
{
"description": "",
"devtoolsFrontendUrl": "https://chrome-devtools-frontend.appspot.com/serve_rev/@39b29242bd61cdd8cf7d5c293800d97c4ac403d6/inspector.html?ws=127.0.0.1:9222/devtools/page/E0E0B6DB6E243056D67085DFD2F85D8E",
"id": "E0E0B6DB6E243056D67085DFD2F85D8E",
"title": "Omnibox Popup",
"type": "browser_ui",
"url": "chrome://omnibox-popup.top-chrome/",
"webSocketDebuggerUrl": "ws://127.0.0.1:9222/devtools/page/E0E0B6DB6E243056D67085DFD2F85D8E"
},
{
"description": "",
"devtoolsFrontendUrl": "https://chrome-devtools-frontend.appspot.com/serve_rev/@39b29242bd61cdd8cf7d5c293800d97c4ac403d6/inspector.html?ws=127.0.0.1:9222/devtools/page/7721C4E783A34659160D88951B811447",
"id": "7721C4E783A34659160D88951B811447",
"title": "127.0.0.1:9222",
"type": "page",
"url": "http://127.0.0.1:9222/json",
"webSocketDebuggerUrl": "ws://127.0.0.1:9222/devtools/page/7721C4E783A34659160D88951B811447"
},
{
"description": "",
"devtoolsFrontendUrl": "https://chrome-devtools-frontend.appspot.com/serve_rev/@39b29242bd61cdd8cf7d5c293800d97c4ac403d6/inspector.html?ws=127.0.0.1:9222/devtools/page/8BC6BD3D98A9099983EA7D7DDAD709BB",
"faviconUrl": "https://www.baidu.com/favicon.ico",
"id": "8BC6BD3D98A9099983EA7D7DDAD709BB",
"title": "百度一下,你就知道",
"type": "page",
"url": "https://www.baidu.com/index.php",
"webSocketDebuggerUrl": "ws://127.0.0.1:9222/devtools/page/8BC6BD3D98A9099983EA7D7DDAD709BB"
},
{
"description": "",
"devtoolsFrontendUrl": "https://chrome-devtools-frontend.appspot.com/serve_rev/@39b29242bd61cdd8cf7d5c293800d97c4ac403d6/inspector.html?ws=127.0.0.1:9222/devtools/page/213A74CEA917FE3AFB95BB68FA36040F",
"id": "213A74CEA917FE3AFB95BB68FA36040F",
"title": "Omnibox Popup",
"type": "browser_ui",
"url": "chrome://omnibox-popup.top-chrome/omnibox_popup_aim.html",
"webSocketDebuggerUrl": "ws://127.0.0.1:9222/devtools/page/213A74CEA917FE3AFB95BB68FA36040F"
}
]
通过 WebSocket 消息执行 JS:
{
"id": 1234,
"method": "Runtime.evaluate",
"params": {
"expression": "document.documentElement.outerHTML",
"returnByValue": true
}
}
页面数据
document.documentElement.outerHTML
"""CDP WebSocket 客户端。"""
from __future__ import annotations
import json
import time
from typing import Any, Dict, Optional
import websocket
from loguru import logger
def create_ws_connection(ws_url: str, timeout: float):
    """Open a WebSocket to ``ws_url``, trying several option sets in turn.

    Three option sets are attempted in order: origin header suppressed, an
    explicit ``http://127.0.0.1`` origin, and finally only the timeout. The
    first attempt that connects wins; any exception raised by an attempt
    (including ``TypeError``) moves on to the next option set.

    Args:
        ws_url: WebSocket URL handed to ``websocket.create_connection``.
        timeout: Timeout value forwarded with every attempt.

    Returns:
        The object returned by ``websocket.create_connection`` for the first
        successful attempt.

    Raises:
        Exception: The error raised by the last attempt when every attempt
            fails.
    """
    option_sets = (
        {"timeout": timeout, "suppress_origin": True},
        {"timeout": timeout, "origin": "http://127.0.0.1"},
        {"timeout": timeout},
    )
    failure = None
    for kwargs in option_sets:
        try:
            connection = websocket.create_connection(ws_url, **kwargs)
        except Exception as exc:
            # Remember the most recent failure and fall through to the next set.
            failure = exc
        else:
            return connection
    if failure is None:
        raise RuntimeError("WebSocket 连接失败")
    raise failure
class CdpWsClient:
    """Small synchronous Chrome DevTools Protocol client over one WebSocket.

    The wrapped ``ws`` object must provide ``send``, ``recv`` and
    ``settimeout``. Requests are numbered with an incrementing ``id`` and the
    client blocks until the reply carrying the same ``id`` arrives.
    """

    def __init__(self, ws, timeout: float):
        """Store the connection and the per-call timeout in seconds."""
        self._ws = ws
        self._timeout = timeout
        self._msg_id = 0

    def call(self, method: str, params: Optional[dict] = None) -> dict:
        """Send one CDP command and wait for its matching reply.

        Args:
            method: CDP method name, e.g. ``"Runtime.evaluate"``.
            params: Optional parameters; omitted from the payload when falsy.

        Returns:
            The ``result`` member of the reply (``{}`` when absent).

        Raises:
            RuntimeError: The reply contains an ``error`` member.
            TimeoutError: No matching reply arrived within the timeout.
        """
        self._msg_id += 1
        payload: Dict[str, Any] = {"id": self._msg_id, "method": method}
        if params:
            payload["params"] = params
        logger.info(
            f"→ WS {method}"
            + (f" params={json.dumps(params, ensure_ascii=False)}" if params else "")
        )
        self._ws.send(json.dumps(payload, ensure_ascii=False))
        # A monotonic clock keeps the deadline immune to wall-clock adjustments.
        deadline = time.monotonic() + self._timeout
        while time.monotonic() < deadline:
            # Never hand the socket a zero/negative timeout near the deadline.
            remain = max(0.05, deadline - time.monotonic())
            self._ws.settimeout(remain)
            try:
                message = json.loads(self._ws.recv())
            except websocket.WebSocketTimeoutException:
                continue
            # Messages with a different (or no) id are skipped.
            if message.get("id") == self._msg_id:
                if "error" in message:
                    raise RuntimeError(f"{method} error: {message['error']}")
                result = message.get("result", {})
                logger.success(f"← WS {method} OK")
                return result
        raise TimeoutError(f"{method} timeout")

    def evaluate(self, expression: str) -> Any:
        """Evaluate a JavaScript expression and return its value.

        The expression is sent with ``returnByValue`` and ``awaitPromise``
        enabled.

        Args:
            expression: JavaScript source to evaluate.

        Returns:
            The evaluated value, or ``None`` when the result type is
            ``"undefined"`` or no value is present.

        Raises:
            RuntimeError: The reply carries ``exceptionDetails`` (the script
                threw), or the underlying ``call`` reported an error.
            TimeoutError: Propagated from ``call``.
        """
        result = self.call(
            "Runtime.evaluate",
            {"expression": expression, "returnByValue": True, "awaitPromise": True},
        )
        # A thrown JS exception is reported beside the result, not as a
        # protocol error; surface it instead of returning an empty value.
        details = result.get("exceptionDetails")
        if details:
            thrown = details.get("exception") or {}
            reason = (
                thrown.get("description")
                or details.get("text")
                or "unknown JavaScript exception"
            )
            raise RuntimeError(f"Runtime.evaluate exception: {reason}")
        remote = result.get("result") or {}
        if remote.get("type") == "undefined":
            return None
        return remote.get("value")

    def wait_for_ready(self, timeout: float = 30.0) -> None:
        """Poll ``document.readyState`` until it is interactive or complete.

        Args:
            timeout: Maximum number of seconds to keep polling.

        Raises:
            TimeoutError: The page did not reach a ready state in time.
        """
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            state = self.evaluate("document.readyState")
            if state in ("interactive", "complete"):
                return
            time.sleep(0.3)
        raise TimeoutError("页面加载超时")
"""Load CDP Runtime.evaluate scripts from the scenarios/js/ directory."""
from __future__ import annotations

from functools import lru_cache
from pathlib import Path

# Scripts live in a "js" directory next to this module.
_JS_ROOT = Path(__file__).resolve().parent / "js"


@lru_cache(maxsize=None)
def load_scenario_js(scenario: str, filename: str) -> str:
    """Return the stripped text of one scenario script.

    Successful results are cached per ``(scenario, filename)`` pair, so each
    file is read from disk at most once per process.

    Args:
        scenario: Sub-directory name under the ``js`` root.
        filename: Script file name inside that sub-directory.

    Returns:
        The file content decoded as UTF-8 with surrounding whitespace removed.

    Raises:
        FileNotFoundError: The resolved path is not an existing file.
    """
    path = _JS_ROOT / scenario / filename
    if not path.is_file():
        raise FileNotFoundError(f"场景 JS 不存在: {path}")
    return path.read_text(encoding="utf-8").strip()
"""影刀新闻页:点击「查看更多」。"""
from __future__ import annotations
import time
from typing import List
from loguru import logger
from .js_loader import load_scenario_js
from ..ws_client import CdpWsClient
# Default news-list URL. Not referenced by the functions visible in this
# module — presumably passed in by callers as page_url (TODO confirm).
DEFAULT_PAGE_URL = "https://www.yingdao.com/news/"
# Scenario name handed to load_scenario_js for both scripts below.
_SCENARIO = "yingdao_news"
# Both scripts are loaded when this module is imported.
# Evaluated by _click_view_more, which reads ok / x / y / tag / className
# from the value the script returns.
CLICK_VIEW_MORE_JS = load_scenario_js(_SCENARIO, "click_view_more.js")
# Evaluated for list snapshots; this module reads newsTitles, newsLinkCount
# and bodyHeight from the value the script returns.
COLLECT_NEWS_ITEMS_JS = load_scenario_js(_SCENARIO, "collect_news_items.js")
def _log_news_titles(label: str, snapshot: dict) -> None:
    """Log the titles stored under ``snapshot["newsTitles"]``.

    A header line with the title count is always logged. When there are no
    titles a warning follows; otherwise each title is logged with a 1-based
    position.

    Args:
        label: Prefix placed in front of the header and warning lines.
        snapshot: Mapping that may carry a ``newsTitles`` list.
    """
    headlines = snapshot.get("newsTitles") or []
    logger.info(f"{label} 新闻标题 ({len(headlines)} 条):")
    if headlines:
        for position, headline in enumerate(headlines, start=1):
            logger.info(f" [{position}] {headline}")
    else:
        logger.warning(f"{label} 未采集到新闻标题")
def _click_view_more(client: CdpWsClient) -> dict:
    """Locate the "view more" control via JS and click it with CDP Input events.

    The locator script's value is returned unchanged when it lacks a truthy
    ``ok`` flag. Otherwise a mouse move, press and release are dispatched at
    the reported coordinates and ``clickMethod`` is set to ``"cdp_input"``.

    Args:
        client: CDP client used to evaluate the locator and send mouse events.

    Returns:
        The locator result dict (``{}`` when the script yields nothing).
    """
    target = client.evaluate(CLICK_VIEW_MORE_JS) or {}
    if target.get("ok"):
        x = float(target["x"])
        y = float(target["y"])
        logger.info(f"CDP 点击坐标: ({x}, {y}) tag={target.get('tag')} class={target.get('className', '')}")
        point = {"x": x, "y": y}
        # Move first, then press and release the left button at the same spot.
        client.call("Input.dispatchMouseEvent", {"type": "mouseMoved", **point})
        for phase in ("mousePressed", "mouseReleased"):
            client.call(
                "Input.dispatchMouseEvent",
                {"type": phase, **point, "button": "left", "clickCount": 1},
            )
        target["clickMethod"] = "cdp_input"
    return target
def run_yingdao_news_scenario(
client: CdpWsClient,
*,
page_url: str,
click_times: int,
click_interval: float,
) -> dict:
"""Click "view more" on the news page several times and snapshot the list.

Args:
    client: CDP client used for every protocol call and JS evaluation.
    page_url: Target page. Navigation is issued only when this URL (trailing
        slashes stripped) is not a substring of the current location.href.
    click_times: Number of click attempts (attempts are numbered from 1).
    click_interval: Seconds used for the post-click deadline; values below
        2.0 are raised to 2.0.

Returns:
    Dict with the keys page_url, click_times, before, after and clicks.

Raises:
    RuntimeError: A click attempt's result has no truthy "ok" flag.
"""
# NOTE(review): block indentation was lost in this copy of the function, so
# the nesting of some statements below cannot be read off the text — confirm
# against the original file before restructuring.
# Enable the Page, Runtime and DOM domains before any other call.
client.call("Page.enable")
client.call("Runtime.enable")
client.call("DOM.enable")
current_url = client.evaluate("location.href") or ""
# Substring comparison with trailing slashes ignored on both sides.
if page_url.rstrip("/") not in current_url.rstrip("/"):
logger.info(f"导航到: {page_url}")
client.call("Page.navigate", {"url": page_url})
client.wait_for_ready(timeout=30.0)
time.sleep(1.0)
else:
logger.info(f"当前已在目标页: {current_url}")
# NOTE(review): presumably part of the else branch — confirm.
client.wait_for_ready(timeout=15.0)
# Snapshot of the list before any click.
before = client.evaluate(COLLECT_NEWS_ITEMS_JS) or {}
logger.info(f"点击前新闻链接数: {before.get('newsLinkCount', '?')}")
_log_news_titles("点击前", before)
click_results: List[dict] = []
for index in range(1, click_times + 1):
logger.info(f"第 {index}/{click_times} 次点击「查看更多」")
# Per-attempt baseline used for the growth comparison further down.
snapshot = client.evaluate(COLLECT_NEWS_ITEMS_JS) or {}
click_result = _click_view_more(client)
# The attempt is recorded before the ok check, so a failed attempt is still
# appended to click_results prior to raising.
click_results.append({"attempt": index, **click_result})
if not click_result.get("ok"):
raise RuntimeError(f"第 {index} 次未找到「查看更多」按钮")
logger.success(f"已点击: {click_result.get('text', '')}")
# Deadline is at least 2.0 seconds after the click.
deadline = time.time() + max(click_interval, 2.0)
# NOTE(review): no break is visible after the "list expanded" log below, and
# with indentation lost it is unclear how many of the following statements
# form the loop body — confirm whether polling should stop once the list grows.
while time.time() < deadline:
time.sleep(0.4)
after_click = client.evaluate(COLLECT_NEWS_ITEMS_JS) or {}
click_results[-1]["after_click"] = after_click
_log_news_titles(f"第 {index} 次点击后", after_click)
# The list counts as expanded when the link count or the body height grew
# relative to the pre-click snapshot.
if (
after_click.get("newsLinkCount", 0) > snapshot.get("newsLinkCount", 0)
or after_click.get("bodyHeight", 0) > snapshot.get("bodyHeight", 0)
):
logger.info(
f"列表已展开: 链接 {snapshot.get('newsLinkCount')} → "
f"{after_click.get('newsLinkCount')}, 高度 "
f"{snapshot.get('bodyHeight')} → {after_click.get('bodyHeight')}"
)
# Final snapshot (presumably taken after all click attempts — confirm nesting).
after = client.evaluate(COLLECT_NEWS_ITEMS_JS) or {}
logger.info(f"点击后新闻链接数: {after.get('newsLinkCount', '?')}")
_log_news_titles("全部点击完成后", after)
return {
"page_url": page_url,
"click_times": click_times,
"before": before,
"after": after,
"clicks": click_results,
}

浙公网安备 33010602011771号