完整代码实现:等两个数据域都跑完后,再触发下一个任务
第一步:在数据域 DAG 中定义“产出”
你需要在两个数据域的最后一个任务中增加一个 outlets 声明。
数据域 A (dag_sales.py):
Python
from airflow import DAG, Dataset
from airflow.operators.empty import EmptyOperator
from datetime import datetime
# Logical dataset that marks "sales domain finished"; the URI is a free-form
# identifier chosen by us.
sales_data = Dataset("doris://sales_complete")

# `schedule` is used instead of the deprecated `schedule_interval`: Dataset
# already requires Airflow 2.4+, where `schedule` is the supported argument.
# The cron expression itself is unchanged.
with DAG(
    'dag_sales_domain',
    start_date=datetime(2026, 5, 1),
    schedule='0 2 * * *',
    catchup=False,
) as dag:
    # Declaring the dataset as an outlet makes Airflow record a dataset
    # update once this task completes successfully.
    etl_sales = EmptyOperator(task_id='sales_etl', outlets=[sales_data])
数据域 B (dag_inventory.py):
Python
from airflow import DAG, Dataset
from airflow.operators.empty import EmptyOperator
from datetime import datetime
# Second logical dataset, marking "inventory domain finished".
inventory_data = Dataset("doris://inventory_complete")

# `schedule` is used instead of the deprecated `schedule_interval`: Dataset
# already requires Airflow 2.4+, where `schedule` is the supported argument.
# The cron expression itself is unchanged.
with DAG(
    'dag_inventory_domain',
    start_date=datetime(2026, 5, 1),
    schedule='0 3 * * *',
    catchup=False,
) as dag:
    # Declaring the dataset as an outlet makes Airflow record a dataset
    # update once this task completes successfully.
    etl_inv = EmptyOperator(task_id='inventory_etl', outlets=[inventory_data])
第二步:在备份 DAG 中定义“触发条件”
备份 DAG 不再设定时点,而是监听这两个数据集。
备份 DAG (doris_central_backup.py):
Python
import os
import yaml
from airflow import DAG, Dataset
from airflow.providers.mysql.operators.mysql import MySqlOperator
from datetime import datetime
# Reference the two datasets produced by the sales and inventory domain DAGs.
# The URIs here must be identical to the ones those DAGs list in `outlets`,
# since the URI string is the only thing linking producer and consumer.
sales_data = Dataset("doris://sales_complete")
inventory_data = Dataset("doris://inventory_complete")
# Configuration is read from a YAML file so that environment-specific values
# stay out of the DAG code.
def load_config():
    """Return the ``doris_backup`` section of the backup configuration file.

    The file is expected at ``config/backup_config.yaml`` relative to the
    directory containing this module. The DAG below reads the keys
    ``connection_id``, ``db_name`` and ``repo_name`` from the returned mapping.

    Raises:
        FileNotFoundError: if the configuration file does not exist.
        KeyError: if the parsed document has no ``doris_backup`` key.
    """
    config_path = os.path.join(os.path.dirname(__file__), 'config/backup_config.yaml')
    # Decode explicitly as UTF-8 so parsing does not depend on the worker's
    # locale default encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)['doris_backup']
cfg = load_config()

# Snapshot statement. The doubled braces survive the f-string as
# `{{ ds_nodash }}`, which Airflow then renders as a Jinja template at run
# time; the database and repository names are filled in from the config now.
backup_sql = (
    "\n"
    f"BACKUP SNAPSHOT {cfg['db_name']}.full_snap_{{{{ ds_nodash }}}}\n"
    f"TO {cfg['repo_name']};\n"
)

with DAG(
    'doris_central_backup',
    start_date=datetime(2026, 5, 1),
    # Key point: no cron schedule here. The DAG is triggered only after both
    # datasets have been updated.
    schedule=[sales_data, inventory_data],
    max_active_runs=1,
    catchup=False,
) as dag:
    # NOTE(review): the snapshot label contains only the run date, so a second
    # run on the same date would reuse the label -- confirm this is acceptable.
    execute_backup = MySqlOperator(
        task_id='execute_snapshot',
        mysql_conn_id=cfg['connection_id'],
        sql=backup_sql,
    )
浙公网安备 33010602011771号