批量重跑 DolphinScheduler 作业

Posted on 2026-04-03 10:55  飞行的蟒蛇  阅读(3)  评论(0)    收藏  举报

1️⃣ 依赖配置文件(YAML)

先准备一个 wf_dependency.yaml,按业务域(domain)和数仓分层(ODS、DWD、DIM、ADS)列出需要重跑的工作流名称:

domains:
  TRA:
    ODS: ["WF_ODS_TRA_DA"]
    DWD: ["WF_DWD_TRA_DA"]
    DIM: ["WF_DIM_PRD_DA", "WF_DIM_USR_DA"]
    ADS: ["WF_ADS_SALES_DA", "WF_ADS_PROMO_DA"]

  USR:
    ODS: ["WF_ODS_USR_DA"]
    DWD: ["WF_DWD_USR_DA"]
    DIM: []
    ADS: ["WF_ADS_USR_ANALYSIS_DA"]

2️⃣ Python 补数脚本

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
import yaml
import time
import sys

# === Configuration ===
# Base URL that every API endpoint in this script is appended to.
DS_HOST = "http://your_ds_host:port"
# Access token; sent with every request via the "token" header below.
TOKEN = "your_ds_token"
PROJECT_CODE = "1234567890123"  # Numeric project ID; look it up on the DS project list page
# YAML file describing which workflows belong to each domain and stage.
CONFIG_FILE = "./wf_dependency.yaml"

# HTTP headers attached to every request made by this script.
HEADERS = {"token": TOKEN}


def load_config():
    """Parse the YAML file at ``CONFIG_FILE`` and return the resulting object.

    The file is opened as UTF-8 text and parsed with ``yaml.safe_load``.
    """
    with open(CONFIG_FILE, mode="r", encoding="utf-8") as config_stream:
        parsed = yaml.safe_load(config_stream)
    return parsed


def get_wf_code(wf_name):
    """Look up a workflow definition's code by its exact name.

    Queries the process-definition listing with ``wf_name`` as the search
    value and walks the result pages until an entry whose ``name`` equals
    ``wf_name`` exactly is found.

    Args:
        wf_name: Exact workflow name to look for.

    Returns:
        The ``code`` field of the matching workflow definition.

    Raises:
        requests.HTTPError: The server answered with an HTTP error status.
        RuntimeError: The response body carries a non-zero ``code``.
        ValueError: No workflow with exactly this name was found.
    """
    url = f"{DS_HOST}/dolphinscheduler/api/v1/projects/{PROJECT_CODE}/process-definition"
    page_size = 100
    page_no = 1

    while True:
        params = {"pageSize": page_size, "pageNo": page_no, "searchVal": wf_name}
        # A timeout keeps the script from hanging forever on a dead connection.
        res = requests.get(url, headers=HEADERS, params=params, timeout=30)
        res.raise_for_status()

        result = res.json()
        # Same success convention start_workflow() relies on: code == 0.
        if result.get("code", 0) != 0:
            raise RuntimeError(f"查询工作流失败: {result.get('msg')}")

        # "data" may be present but null, so a .get() default alone is not enough.
        data = result.get("data") or {}
        page_items = data.get("totalList") or []

        for wf in page_items:
            if wf.get("name") == wf_name:
                return wf.get("code")

        # A short (or empty) page means there is nothing further to fetch.
        if len(page_items) < page_size:
            break
        page_no += 1

    raise ValueError(f"❌ 找不到工作流: {wf_name}, 请检查名称或 ProjectCode")


def start_workflow(wf_code, biz_date):
    """Submit a start request for one workflow.

    The call only submits the start command; it does not return the id of
    the instance that gets created. Callers have to look the instance up
    separately.

    Args:
        wf_code: Code of the workflow definition to start.
        biz_date: Business date passed to the workflow as the ``biz_date``
            start parameter.

    Returns:
        ``True`` once the server has accepted the request.

    Raises:
        requests.HTTPError: The server answered with an HTTP error status.
        RuntimeError: The response body carries a non-zero ``code``.
    """
    import json

    url = f"{DS_HOST}/dolphinscheduler/api/v1/projects/{PROJECT_CODE}/executors/start-process-instance"

    # startParams is sent as a JSON string. Serialising with json.dumps keeps
    # it valid even if biz_date contains quotes or backslashes; the compact
    # separators reproduce the '{"biz_date":"..."}' form exactly.
    # NOTE(review): the accepted parameter set differs between DS versions —
    # confirm these field names against the deployed server.
    start_params = json.dumps(
        {"biz_date": str(biz_date)},
        separators=(",", ":"),
        ensure_ascii=False,
    )

    payload = {
        "processDefinitionCode": wf_code,
        "commandType": "START_PROCESS",
        "failureStrategy": "CONTINUE",
        "warningType": "NONE",
        "warningGroupId": 0,
        "runMode": "RUN_MODE_SERIAL",
        "processInstancePriority": "MEDIUM",
        "startParams": start_params,
        "dryRun": 0,
    }

    # Sent as form data (data=), not as a JSON body.
    res = requests.post(url, data=payload, headers=HEADERS, timeout=30)
    res.raise_for_status()

    result = res.json()
    if result.get("code") != 0:
        raise RuntimeError(f"启动失败: {result.get('msg')}")

    print(f"🚀 已下发启动指令: {wf_code}")
    # Start is asynchronous; the newest instance id must be queried afterwards.
    return True


def get_latest_instance_id(wf_code):
    """Return the id of the first instance listed for a workflow.

    Requests a single-item page of the workflow's instances and returns the
    ``id`` of that item.
    NOTE(review): this relies on the server listing the newest instance
    first — confirm against the deployed DS version.

    Args:
        wf_code: Code of the workflow definition.

    Returns:
        The ``id`` of the first instance in the listing.

    Raises:
        requests.HTTPError: The server answered with an HTTP error status.
        RuntimeError: The response body carries a non-zero ``code``, or the
            workflow has no instances.
    """
    url = f"{DS_HOST}/dolphinscheduler/api/v1/projects/{PROJECT_CODE}/process-instances"
    params = {"pageSize": 1, "pageNo": 1, "processDefineCode": wf_code}

    res = requests.get(url, headers=HEADERS, params=params, timeout=30)
    res.raise_for_status()

    result = res.json()
    if result.get("code", 0) != 0:
        raise RuntimeError(f"查询工作流实例失败: {result.get('msg')}")

    # Guard against a null "data" and an empty list instead of failing with
    # an opaque TypeError / IndexError.
    instances = (result.get("data") or {}).get("totalList") or []
    if not instances:
        raise RuntimeError(f"未找到工作流 {wf_code} 的任何实例")

    return instances[0]["id"]


def check_instances_status(instance_map):
    """Query the state of every instance and sort them into done / failed.

    Args:
        instance_map: Mapping of ``{instance_id: wf_name}`` to check.

    Returns:
        A ``(success_ids, failed_ids)`` tuple of lists. Instances that are in
        neither a success nor a failure state appear in neither list.

    Raises:
        requests.HTTPError: The server answered with an HTTP error status.
        RuntimeError: The response body carries a non-zero ``code`` or has no
            ``data`` for an instance.
    """
    url_template = f"{DS_HOST}/dolphinscheduler/api/v1/projects/{PROJECT_CODE}/process-instances/{{}}"

    # States treated as "did not finish successfully".
    # NOTE(review): "KILL" is included defensively so a killed instance cannot
    # leave the caller polling forever — confirm which terminal states the
    # deployed DS version actually reports for workflow instances.
    failed_states = ("FAILURE", "STOP", "PAUSE", "KILL")

    success_ids = []
    failed_ids = []

    for ins_id, name in instance_map.items():
        res = requests.get(url_template.format(ins_id), headers=HEADERS, timeout=30)
        res.raise_for_status()

        result = res.json()
        data = result.get("data")
        # Surface API-level errors explicitly rather than failing on None["state"].
        if result.get("code", 0) != 0 or not data:
            raise RuntimeError(
                f"查询实例状态失败: {name} (ID:{ins_id}), {result.get('msg')}"
            )

        status = data["state"]

        if status == "SUCCESS":
            print(f"✅ {name} (ID:{ins_id}) 成功")
            success_ids.append(ins_id)
        elif status in failed_states:
            print(f"❌ {name} (ID:{ins_id}) 失败或停止,状态: {status}")
            failed_ids.append(ins_id)

    return success_ids, failed_ids


def _latest_instance_id_or_none(wf_code):
    """Return the latest instance id for ``wf_code``, or None if none can be read."""
    try:
        return get_latest_instance_id(wf_code)
    except (IndexError, KeyError, TypeError, RuntimeError):
        # The lookup fails this way when the workflow has no instance yet.
        return None


def _wait_for_new_instance(wf_code, previous_id, attempts=15, interval=2):
    """Poll until the latest instance id differs from ``previous_id`` and return it."""
    for _ in range(attempts):
        # Sleeping first also spaces out consecutive start requests.
        time.sleep(interval)
        latest_id = _latest_instance_id_or_none(wf_code)
        if latest_id is not None and latest_id != previous_id:
            return latest_id
    raise RuntimeError(
        f"工作流 {wf_code} 启动后 {attempts * interval} 秒内未生成新实例"
    )


def rerun_chain(domain, biz_date):
    """Re-run every workflow of one domain, stage by stage.

    Stages are processed in the fixed order ODS, DWD, DIM, ADS. Within a
    stage all configured workflows are started one after another, then the
    function polls until every started instance has succeeded before moving
    on to the next stage. The process exits with status 1 as soon as any
    instance is reported as failed.

    Args:
        domain: Key under ``domains`` in the YAML config.
        biz_date: Business date forwarded to each workflow.

    Raises:
        RuntimeError: A started workflow did not produce a new instance in
            time (other errors propagate from the helper functions).
    """
    cfg = load_config()
    # Stages run in warehouse-layer order.
    stages = ["ODS", "DWD", "DIM", "ADS"]

    # Tolerate an empty file or a missing/empty "domains" section.
    domains = (cfg or {}).get("domains") or {}
    if domain not in domains:
        print(f"不存在的 Domain: {domain}")
        return

    stage_map = domains[domain] or {}

    for stage in stages:
        wf_names = stage_map.get(stage) or []
        if not wf_names:
            continue

        print(f"\n--- 正在处理阶段: {stage} ---")
        current_batch = {}  # {instance_id: wf_name}

        # 1. Start every workflow of the current stage.
        for name in wf_names:
            code = get_wf_code(name)
            # Remember the newest instance before starting, so an older run
            # is never mistaken for the one just triggered.
            previous_id = _latest_instance_id_or_none(code)
            start_workflow(code, biz_date)
            ins_id = _wait_for_new_instance(code, previous_id)
            current_batch[ins_id] = name

        # 2. Poll until the whole stage has finished.
        while current_batch:
            success_ids, failed_ids = check_instances_status(current_batch)

            if failed_ids:
                print("🛑 检测到任务失败,脚本异常退出,请检查 DS 后台!")
                sys.exit(1)

            # Drop finished instances from the watch list.
            for sid in success_ids:
                current_batch.pop(sid, None)

            if current_batch:
                print(f"⏳ 阶段 {stage} 尚有 {len(current_batch)} 个任务运行中...")
                time.sleep(20)

    print("\n✨ 全链路补跑任务已圆满完成!")


if __name__ == "__main__":
    # Expect exactly two positional arguments: the domain and the business date.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python rerun_chain.py <domain> <biz_date>")
        sys.exit(1)

    target_domain, target_date = cli_args
    rerun_chain(target_domain, target_date)