Chrome 浏览器调试与数据采集:Chrome DevTools Protocol(CDP)

"C:\Program Files\Google\Chrome\Application\chrome.exe" --user-data-dir=D:\demo_app\dist\demo_app\data\user_data_dir --no-first-run --no-default-browser-check --remote-debugging-port=9222 --remote-allow-origins=* https://www.baidu.com/index.php

 

Windows CDP 远程调试:从踩坑到局域网打通 - 进化概率论

 

Chrome DevTools Protocol

 

[
  {
    "description": "",
    "devtoolsFrontendUrl": "https://chrome-devtools-frontend.appspot.com/serve_rev/@39b29242bd61cdd8cf7d5c293800d97c4ac403d6/inspector.html?ws=127.0.0.1:9222/devtools/page/E0E0B6DB6E243056D67085DFD2F85D8E",
    "id": "E0E0B6DB6E243056D67085DFD2F85D8E",
    "title": "Omnibox Popup",
    "type": "browser_ui",
    "url": "chrome://omnibox-popup.top-chrome/",
    "webSocketDebuggerUrl": "ws://127.0.0.1:9222/devtools/page/E0E0B6DB6E243056D67085DFD2F85D8E"
  },
  {
    "description": "",
    "devtoolsFrontendUrl": "https://chrome-devtools-frontend.appspot.com/serve_rev/@39b29242bd61cdd8cf7d5c293800d97c4ac403d6/inspector.html?ws=127.0.0.1:9222/devtools/page/7721C4E783A34659160D88951B811447",
    "id": "7721C4E783A34659160D88951B811447",
    "title": "127.0.0.1:9222",
    "type": "page",
    "url": "http://127.0.0.1:9222/json",
    "webSocketDebuggerUrl": "ws://127.0.0.1:9222/devtools/page/7721C4E783A34659160D88951B811447"
  },
  {
    "description": "",
    "devtoolsFrontendUrl": "https://chrome-devtools-frontend.appspot.com/serve_rev/@39b29242bd61cdd8cf7d5c293800d97c4ac403d6/inspector.html?ws=127.0.0.1:9222/devtools/page/8BC6BD3D98A9099983EA7D7DDAD709BB",
    "faviconUrl": "https://www.baidu.com/favicon.ico",
    "id": "8BC6BD3D98A9099983EA7D7DDAD709BB",
    "title": "百度一下,你就知道",
    "type": "page",
    "url": "https://www.baidu.com/index.php",
    "webSocketDebuggerUrl": "ws://127.0.0.1:9222/devtools/page/8BC6BD3D98A9099983EA7D7DDAD709BB"
  },
  {
    "description": "",
    "devtoolsFrontendUrl": "https://chrome-devtools-frontend.appspot.com/serve_rev/@39b29242bd61cdd8cf7d5c293800d97c4ac403d6/inspector.html?ws=127.0.0.1:9222/devtools/page/213A74CEA917FE3AFB95BB68FA36040F",
    "id": "213A74CEA917FE3AFB95BB68FA36040F",
    "title": "Omnibox Popup",
    "type": "browser_ui",
    "url": "chrome://omnibox-popup.top-chrome/omnibox_popup_aim.html",
    "webSocketDebuggerUrl": "ws://127.0.0.1:9222/devtools/page/213A74CEA917FE3AFB95BB68FA36040F"
  }
]

通过 WebSocket 发送消息来执行 JS:

{
"id": 1234,
"method": "Runtime.evaluate",
"params": {
"expression": "document.documentElement.outerHTML",
"returnByValue": true
}
}




页面数据
document.documentElement.outerHTML
 
"""CDP WebSocket 客户端。"""

from __future__ import annotations

import json
import time
from typing import Any, Dict, Optional

import websocket
from loguru import logger


def create_ws_connection(ws_url: str, timeout: float):
    """Open a WebSocket to ``ws_url``, trying several option sets in order.

    The attempts are, in order: origin header suppressed, an explicit
    ``http://127.0.0.1`` origin, and finally no origin-related option at all.
    The first attempt that succeeds wins.

    Args:
        ws_url: WebSocket debugger URL to connect to.
        timeout: Timeout in seconds forwarded to every connection attempt.

    Returns:
        Whatever ``websocket.create_connection`` returns for the first
        successful attempt.

    Raises:
        Exception: The error from the last failed attempt when every attempt
            fails.
    """
    option_sets = (
        {"timeout": timeout, "suppress_origin": True},
        {"timeout": timeout, "origin": "http://127.0.0.1"},
        {"timeout": timeout},
    )
    failure = None
    for kwargs in option_sets:
        try:
            connection = websocket.create_connection(ws_url, **kwargs)
        except Exception as exc:
            # Covers TypeError as well (presumably an unsupported keyword in
            # the installed client library); remember it and try the next set.
            failure = exc
        else:
            return connection
    if failure is None:
        raise RuntimeError("WebSocket 连接失败")
    raise failure


class CdpWsClient:
    """Minimal synchronous Chrome DevTools Protocol client over one WebSocket.

    Commands are issued one at a time: each ``call`` sends a message and blocks
    until the response carrying the same ``id`` arrives or the timeout elapses.
    Messages with a different ``id`` (or none) are read and discarded.
    """

    def __init__(self, ws, timeout: float):
        """Wrap an already-connected socket.

        Args:
            ws: Connected socket object; ``send``, ``recv`` and ``settimeout``
                are the methods used by this class.
            timeout: Seconds to wait for the response to each command.
        """
        self._ws = ws
        self._timeout = timeout
        self._msg_id = 0

    def call(self, method: str, params: Optional[dict] = None) -> dict:
        """Send one CDP command and return its ``result`` object.

        Args:
            method: CDP method name, e.g. ``"Runtime.evaluate"``.
            params: Optional parameters; omitted from the payload when falsy.

        Returns:
            The ``result`` member of the matching response (``{}`` if absent).

        Raises:
            RuntimeError: The response carries an ``error`` member.
            TimeoutError: No matching response arrived within the timeout.
        """
        self._msg_id += 1
        payload: Dict[str, Any] = {"id": self._msg_id, "method": method}
        if params:
            payload["params"] = params

        logger.info(
            f"→ WS {method}"
            + (f" params={json.dumps(params, ensure_ascii=False)}" if params else "")
        )
        self._ws.send(json.dumps(payload, ensure_ascii=False))

        # Use the monotonic clock so a wall-clock adjustment (NTP sync, manual
        # change) can neither cut the wait short nor extend it indefinitely.
        deadline = time.monotonic() + self._timeout
        while time.monotonic() < deadline:
            # Never hand the socket a zero/negative timeout.
            remain = max(0.05, deadline - time.monotonic())
            self._ws.settimeout(remain)
            try:
                message = json.loads(self._ws.recv())
            except websocket.WebSocketTimeoutException:
                continue

            if message.get("id") != self._msg_id:
                # Not the response to this command; keep reading.
                continue
            if "error" in message:
                raise RuntimeError(f"{method} error: {message['error']}")
            result = message.get("result", {})
            logger.success(f"← WS {method} OK")
            return result

        raise TimeoutError(f"{method} timeout")

    def evaluate(self, expression: str) -> Any:
        """Evaluate a JavaScript expression and return its value.

        The expression is run through ``Runtime.evaluate`` with
        ``returnByValue`` and ``awaitPromise`` enabled.

        Args:
            expression: JavaScript source to evaluate.

        Returns:
            The ``value`` of the returned remote object, or ``None`` when the
            result type is ``undefined`` or no value is present.

        Raises:
            RuntimeError: The response contains ``exceptionDetails`` (the
                script threw or the awaited promise rejected), or ``call``
                reported a protocol error.
            TimeoutError: Propagated from ``call``.
        """
        result = self.call(
            "Runtime.evaluate",
            {"expression": expression, "returnByValue": True, "awaitPromise": True},
        )

        # Surface script failures instead of returning None, which callers
        # could not tell apart from a legitimately empty result.
        details = result.get("exceptionDetails")
        if details:
            thrown = details.get("exception") or {}
            reason = thrown.get("description") or details.get("text") or details
            raise RuntimeError(f"Runtime.evaluate exception: {reason}")

        remote_object = result.get("result") or {}
        if remote_object.get("type") == "undefined":
            return None
        return remote_object.get("value")

    def wait_for_ready(self, timeout: float = 30.0) -> None:
        """Poll ``document.readyState`` until the document is usable.

        Args:
            timeout: Maximum number of seconds to keep polling.

        Raises:
            TimeoutError: The state never became ``interactive`` or
                ``complete`` before the deadline.
        """
        # Monotonic clock for the same reason as in ``call``.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            state = self.evaluate("document.readyState")
            if state in ("interactive", "complete"):
                return
            time.sleep(0.3)
        raise TimeoutError("页面加载超时")

  







"""从 scenarios/js/ 目录加载 CDP Runtime.evaluate 脚本。"""

from __future__ import annotations

from functools import lru_cache
from pathlib import Path

# Directory named "js" located next to this module.
_JS_ROOT = Path(__file__).resolve().parent.joinpath("js")


@lru_cache(maxsize=None)
def load_scenario_js(scenario: str, filename: str) -> str:
    """Return the text of ``<js root>/<scenario>/<filename>``.

    The file is read as UTF-8 and surrounding whitespace is stripped. Results
    are memoised per ``(scenario, filename)`` pair, so each file is read from
    disk at most once per process.

    Args:
        scenario: Sub-directory of the JS root that holds the script.
        filename: Script file name inside that sub-directory.

    Returns:
        The stripped script source.

    Raises:
        FileNotFoundError: The resolved path is not an existing regular file.
    """
    script_path = _JS_ROOT.joinpath(scenario, filename)
    if script_path.is_file():
        return script_path.read_text(encoding="utf-8").strip()
    raise FileNotFoundError(f"场景 JS 不存在: {script_path}")

 

 

"""影刀新闻页:点击「查看更多」。"""

from __future__ import annotations

import time
from typing import List

from loguru import logger

from .js_loader import load_scenario_js
from ..ws_client import CdpWsClient

# News listing URL for this scenario. Not referenced elsewhere in the visible
# code; presumably callers pass it as ``page_url`` (verify against callers).
DEFAULT_PAGE_URL = "https://www.yingdao.com/news/"
# Scenario name handed to load_scenario_js as the script sub-directory.
_SCENARIO = "yingdao_news"

# Script sources are loaded at import time, so a missing script file surfaces
# as an error when this module is imported rather than when the scenario runs.
CLICK_VIEW_MORE_JS = load_scenario_js(_SCENARIO, "click_view_more.js")
COLLECT_NEWS_ITEMS_JS = load_scenario_js(_SCENARIO, "collect_news_items.js")


def _log_news_titles(label: str, snapshot: dict) -> None:
    """Log the news titles contained in a snapshot, one line per title.

    Args:
        label: Prefix identifying the moment the snapshot was taken.
        snapshot: Mapping whose ``newsTitles`` entry holds the titles; a
            missing or falsy entry is treated as an empty list.
    """
    headlines = snapshot.get("newsTitles") or []
    logger.info(f"{label} 新闻标题 ({len(headlines)} 条):")
    if headlines:
        for position, headline in enumerate(headlines, start=1):
            logger.info(f"  [{position}] {headline}")
    else:
        logger.warning(f"{label} 未采集到新闻标题")


def _click_view_more(client: CdpWsClient) -> dict:
    """Locate the "view more" control and click it through CDP Input events.

    The locator script is evaluated first. When its result is not flagged
    ``ok`` it is returned untouched; otherwise a mouse move, press and release
    are dispatched at the reported coordinates.

    Args:
        client: CDP client used to evaluate the script and dispatch events.

    Returns:
        The locator result (``{}`` when the script yields nothing). On a
        successful click it additionally carries
        ``clickMethod == "cdp_input"``.
    """
    target = client.evaluate(CLICK_VIEW_MORE_JS) or {}
    if target.get("ok"):
        pos_x = float(target["x"])
        pos_y = float(target["y"])
        logger.info(f"CDP 点击坐标: ({pos_x}, {pos_y}) tag={target.get('tag')} class={target.get('className', '')}")

        point = {"x": pos_x, "y": pos_y}
        # Move first, then press and release the left button at the same spot.
        events = [{"type": "mouseMoved", **point}]
        events.extend(
            {"type": phase, **point, "button": "left", "clickCount": 1}
            for phase in ("mousePressed", "mouseReleased")
        )
        for event_params in events:
            client.call("Input.dispatchMouseEvent", event_params)

        target["clickMethod"] = "cdp_input"
    return target


def run_yingdao_news_scenario(
        client: CdpWsClient,
        *,
        page_url: str,
        click_times: int,
        click_interval: float,
) -> dict:
    """Open the news page, click "view more" repeatedly and snapshot the list.

    Steps: enable the Page/Runtime/DOM domains, navigate to ``page_url``
    unless the current URL already contains it, collect a snapshot, then for
    each click collect a snapshot, click, wait, and collect again.

    Args:
        client: CDP client attached to the target page.
        page_url: URL of the news page to operate on.
        click_times: Number of clicks to perform.
        click_interval: Seconds to wait after each click; at least 2 seconds
            are always waited.

    Returns:
        A dict with ``page_url``, ``click_times``, the ``before`` and
        ``after`` snapshots, and ``clicks`` (one entry per attempt holding the
        locator result plus its ``after_click`` snapshot).

    Raises:
        RuntimeError: The locator result for a click is not flagged ``ok``.
        TimeoutError: Propagated from the client while waiting for the page.
    """
    for domain in ("Page", "Runtime", "DOM"):
        client.call(f"{domain}.enable")

    current_url = client.evaluate("location.href") or ""
    if page_url.rstrip("/") not in current_url.rstrip("/"):
        logger.info(f"导航到: {page_url}")
        client.call("Page.navigate", {"url": page_url})
        client.wait_for_ready(timeout=30.0)
        # Extra settle time after the document reports ready.
        time.sleep(1.0)
    else:
        logger.info(f"当前已在目标页: {current_url}")
        client.wait_for_ready(timeout=15.0)

    before = client.evaluate(COLLECT_NEWS_ITEMS_JS) or {}
    logger.info(f"点击前新闻链接数: {before.get('newsLinkCount', '?')}")
    _log_news_titles("点击前", before)

    click_results: List[dict] = []
    for index in range(1, click_times + 1):
        logger.info(f"第 {index}/{click_times} 次点击「查看更多」")
        snapshot = client.evaluate(COLLECT_NEWS_ITEMS_JS) or {}
        click_result = _click_view_more(client)
        click_results.append({"attempt": index, **click_result})
        if not click_result.get("ok"):
            raise RuntimeError(f"第 {index} 次未找到「查看更多」按钮")
        logger.success(f"已点击: {click_result.get('text', '')}")

        # Wait in 0.4 s steps; the monotonic clock keeps the wait immune to
        # wall-clock adjustments.
        deadline = time.monotonic() + max(click_interval, 2.0)
        while time.monotonic() < deadline:
            time.sleep(0.4)

        after_click = client.evaluate(COLLECT_NEWS_ITEMS_JS) or {}
        click_results[-1]["after_click"] = after_click
        _log_news_titles(f"第 {index} 次点击后", after_click)

        # ``or 0`` guards against a null counter from the script, which would
        # otherwise make the comparison raise TypeError.
        links_grew = (after_click.get("newsLinkCount") or 0) > (
            snapshot.get("newsLinkCount") or 0
        )
        height_grew = (after_click.get("bodyHeight") or 0) > (
            snapshot.get("bodyHeight") or 0
        )
        if links_grew or height_grew:
            logger.info(
                f"列表已展开: 链接 {snapshot.get('newsLinkCount')} → "
                f"{after_click.get('newsLinkCount')}, 高度 "
                f"{snapshot.get('bodyHeight')} → {after_click.get('bodyHeight')}"
            )

    after = client.evaluate(COLLECT_NEWS_ITEMS_JS) or {}
    logger.info(f"点击后新闻链接数: {after.get('newsLinkCount', '?')}")
    _log_news_titles("全部点击完成后", after)

    return {
        "page_url": page_url,
        "click_times": click_times,
        "before": before,
        "after": after,
        "clicks": click_results,
    }

  

posted @ 2026-06-13 11:45  papering  阅读(3)  评论(0)    收藏  举报