1️⃣ 依赖配置文件(YAML)
先准备一个 wf_dependency.yaml:
domains:
  TRA:
    ODS: ["WF_ODS_TRA_DA"]
    DWD: ["WF_DWD_TRA_DA"]
    DIM: ["WF_DIM_PRD_DA", "WF_DIM_USR_DA"]
    ADS: ["WF_ADS_SALES_DA", "WF_ADS_PROMO_DA"]
  USR:
    ODS: ["WF_ODS_USR_DA"]
    DWD: ["WF_DWD_USR_DA"]
    DIM: []
    ADS: ["WF_ADS_USR_ANALYSIS_DA"]
2️⃣ Python 补数脚本
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Re-run (backfill) a chain of DolphinScheduler workflows for one domain.

The workflows of a domain are read from a YAML dependency file and started
stage by stage (ODS -> DWD -> DIM -> ADS). All workflows of a stage are
started first, then polled until every one of them has succeeded; the script
exits with status 1 as soon as one of them fails or is stopped.

Usage: python rerun_chain.py <domain> <biz_date>
"""
import json
import sys
import time

import requests
import yaml

# === Configuration ===
DS_HOST = "http://your_ds_host:port"
TOKEN = "your_ds_token"
PROJECT_CODE = "1234567890123"  # numeric project ID; look it up on the DS project list page
CONFIG_FILE = "./wf_dependency.yaml"
HEADERS = {"token": TOKEN}

# requests applies no timeout by default, so a stalled server would otherwise
# block the script forever.
REQUEST_TIMEOUT = 30  # seconds, per HTTP request
# How long to wait for a freshly started instance to show up in the listing.
INSTANCE_WAIT_INTERVAL = 2  # seconds between lookups
INSTANCE_WAIT_RETRIES = 30  # lookups before giving up


def load_config():
    """Load and return the parsed content of CONFIG_FILE."""
    with open(CONFIG_FILE, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def _api_get(url, params=None):
    """GET a DS endpoint and return the ``data`` field of its JSON reply.

    A missing or null ``data`` field is returned as an empty dict.

    Raises:
        requests.HTTPError: on a 4xx/5xx HTTP status.
        RuntimeError: when the reply's ``code`` field is not 0 (the same
            success check ``start_workflow`` applies).
    """
    res = requests.get(url, headers=HEADERS, params=params, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    body = res.json()
    if body.get("code") != 0:
        raise RuntimeError(f"接口调用失败: {body.get('msg')}")
    return body.get("data") or {}


def get_wf_code(wf_name):
    """Look up a workflow's code by its exact name.

    Only the first page (up to 100 entries) of the name search is inspected.

    Raises:
        ValueError: if no listed workflow has exactly this name.
    """
    url = f"{DS_HOST}/dolphinscheduler/api/v1/projects/{PROJECT_CODE}/process-definition"
    params = {"pageSize": 100, "pageNo": 1, "searchVal": wf_name}
    data = _api_get(url, params)
    # searchVal may also return other names, so require an exact name match.
    for wf in data.get("totalList") or []:
        if wf.get("name") == wf_name:
            return wf.get("code")
    raise ValueError(f"❌ 找不到工作流: {wf_name}, 请检查名称或 ProjectCode")


def start_workflow(wf_code, biz_date):
    """Ask DS to start a workflow, passing ``biz_date`` as a start parameter.

    The instance ID is not taken from the reply; callers have to look the
    new instance up afterwards.

    Returns:
        True once DS has accepted the start request.

    Raises:
        requests.HTTPError: on a 4xx/5xx HTTP status.
        RuntimeError: when the reply's ``code`` field is not 0.
    """
    url = f"{DS_HOST}/dolphinscheduler/api/v1/projects/{PROJECT_CODE}/executors/start-process-instance"
    # NOTE: the start-parameter format differs slightly between DS versions;
    # a standard JSON string is used here.
    payload = {
        "processDefinitionCode": wf_code,
        "commandType": "START_PROCESS",
        "failureStrategy": "CONTINUE",
        "warningType": "NONE",
        "warningGroupId": 0,
        "runMode": "RUN_MODE_SERIAL",
        "processInstancePriority": "MEDIUM",
        # json.dumps escapes quotes/backslashes in biz_date; the compact
        # separators keep the output identical to the former hand-built
        # string for ordinary dates.
        "startParams": json.dumps(
            {"biz_date": biz_date}, separators=(",", ":"), ensure_ascii=False
        ),
        "dryRun": 0,
    }
    # NOTE: sent as form data (data=), which this DS endpoint sometimes expects.
    res = requests.post(url, data=payload, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    # Some DS versions return the instance ID in the "data" field; it is not
    # relied upon here.
    result = res.json()
    if result.get("code") != 0:
        raise RuntimeError(f"启动失败: {result.get('msg')}")
    print(f"🚀 已下发启动指令: {wf_code}")
    return True  # asynchronous start: the newest instance ID is looked up separately


def _find_latest_instance_id(wf_code):
    """Return the ID of the first instance listed for ``wf_code``, or None."""
    url = f"{DS_HOST}/dolphinscheduler/api/v1/projects/{PROJECT_CODE}/process-instances"
    params = {"pageSize": 1, "pageNo": 1, "processDefineCode": wf_code}
    instances = _api_get(url, params).get("totalList") or []
    return instances[0]["id"] if instances else None


def get_latest_instance_id(wf_code):
    """Return the ID of the most recently created instance of a workflow.

    NOTE(review): relies on the API listing instances newest first - confirm
    against the DS version in use.

    Raises:
        IndexError: if the workflow has no instance yet (same exception type
            the original empty-list lookup produced).
    """
    ins_id = _find_latest_instance_id(wf_code)
    if ins_id is None:
        raise IndexError(f"工作流 {wf_code} 尚无任何实例")
    return ins_id


def _wait_for_new_instance(wf_code, previous_id):
    """Poll until the newest instance of ``wf_code`` differs from ``previous_id``.

    Raises:
        RuntimeError: if no new instance shows up within
            INSTANCE_WAIT_RETRIES * INSTANCE_WAIT_INTERVAL seconds.
    """
    for _ in range(INSTANCE_WAIT_RETRIES):
        # Sleeping first also spaces out the start requests, as the original
        # fixed 2-second pause did.
        time.sleep(INSTANCE_WAIT_INTERVAL)
        ins_id = _find_latest_instance_id(wf_code)
        if ins_id is not None and ins_id != previous_id:
            return ins_id
    raise RuntimeError(f"等待工作流 {wf_code} 的新实例超时, 请检查 DS 后台!")


def check_instances_status(instance_map):
    """Query the current state of every instance in ``instance_map``.

    Args:
        instance_map: mapping ``{instance_id: wf_name}``.

    Returns:
        ``(success_ids, failed_ids)``. Instances in any other state appear in
        neither list.
    """
    url_template = f"{DS_HOST}/dolphinscheduler/api/v1/projects/{PROJECT_CODE}/process-instances/{{}}"
    success_ids = []
    failed_ids = []
    for ins_id, name in instance_map.items():
        status = _api_get(url_template.format(ins_id))["state"]
        if status == "SUCCESS":
            print(f"✅ {name} (ID:{ins_id}) 成功")
            success_ids.append(ins_id)
        elif status in ["FAILURE", "STOP", "PAUSE"]:
            print(f"❌ {name} (ID:{ins_id}) 失败或停止,状态: {status}")
            failed_ids.append(ins_id)
    return success_ids, failed_ids


def rerun_chain(domain, biz_date):
    """Run all workflows of ``domain`` for ``biz_date``, one stage at a time.

    Prints a message and returns if the domain is not in the config. Calls
    ``sys.exit(1)`` when any started instance fails or is stopped.
    """
    cfg = load_config()
    # Stages run in data-warehouse layer order.
    stages = ["ODS", "DWD", "DIM", "ADS"]
    if domain not in cfg["domains"]:
        print(f"不存在的 Domain: {domain}")
        return

    for stage in stages:
        wf_names = cfg["domains"][domain].get(stage, [])
        if not wf_names:
            continue
        print(f"\n--- 正在处理阶段: {stage} ---")
        current_batch = {}  # {instance_id: wf_name}

        # 1. Start every workflow of the current stage.
        for name in wf_names:
            code = get_wf_code(name)
            # Remember the newest instance that exists before the start so a
            # leftover (possibly already successful) run is never mistaken
            # for the one being started now.
            previous_id = _find_latest_instance_id(code)
            start_workflow(code, biz_date)
            ins_id = _wait_for_new_instance(code, previous_id)
            current_batch[ins_id] = name

        # 2. Poll until every instance of the current stage has finished.
        while current_batch:
            success_ids, failed_ids = check_instances_status(current_batch)
            if failed_ids:
                print("🛑 检测到任务失败,脚本异常退出,请检查 DS 后台!")
                sys.exit(1)
            # Stop monitoring the instances that have completed.
            for sid in success_ids:
                current_batch.pop(sid)
            if current_batch:
                print(f"⏳ 阶段 {stage} 尚有 {len(current_batch)} 个任务运行中...")
                time.sleep(20)

    print("\n✨ 全链路补跑任务已圆满完成!")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python rerun_chain.py <domain> <biz_date>")
        sys.exit(1)
    target_domain = sys.argv[1]
    target_date = sys.argv[2]
    rerun_chain(target_domain, target_date)
浙公网安备 33010602011771号