DataX JSON 生成自动化脚本

Posted on 2026-03-30 15:03  飞行的蟒蛇  阅读(7)  评论(0)    收藏  举报

配置文件 config.yaml(YAML 格式):

# 1. Shared source-database connection info
source_db:
  host: "192.168.1.100"
  port: 1433
  user: "sa"
  password: "YourPassword123"
  database: "SalesDB"

# 2. Tables to extract, each with its target HDFS path
tables:
  - name: "Orders"
    hdfs_path: "/user/hive/warehouse/sales.db/orders"
  - name: "Customers"
    hdfs_path: "/user/hive/warehouse/sales.db/customers"

# 3. DataX job template; ${...} placeholders are filled by the generator script
datax_template:
  job:
    setting:
      speed:
        channel: 3
    content:
      - reader:
          name: "sqlserverreader"
          parameter:
            username: "${db_user}"
            password: "${db_pwd}"
            # Replaced with a list object (not a string) by fill_template
            column: ${reader_columns}
            connection:
              - querySql: ["SELECT ${column_str} FROM ${table_name}"]
                jdbcUrl: ["jdbc:sqlserver://${db_host}:${db_port};DatabaseName=${db_name}"]
        writer:
          name: "hdfswriter"
          parameter:
            defaultFS: "hdfs://namenode:8020"
            fileType: "orc"
            path: "${hdfs_path}"
            fileName: "${table_name}"
            # Replaced with a list of {name, type} dicts by fill_template
            column: ${writer_columns}
            writeMode: "truncate"
            fieldDelimiter: "\t"
            encoding: "UTF-8"

# 4. Hive DDL template
# NOTE: ${hive_columns_sql} is injected as a pre-joined "col_name TYPE" string
hive_ddl_template: |
  CREATE EXTERNAL TABLE IF NOT EXISTS dw_stage.${table_name} (
    ${hive_columns_sql}
  )
  ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
  STORED AS ORC
  LOCATION '${hdfs_path}';

 

生成脚本(Python):

import yaml
import pymssql
import json
import os
import copy

def get_type_mapping(mssql_type):
    """Map a SQL Server data type to its DataX and Hive equivalents.

    Args:
        mssql_type: SQL Server type name as reported by
            INFORMATION_SCHEMA.COLUMNS (case-insensitive), e.g. "varchar".

    Returns:
        dict with keys 'datax' and 'hive'. Unknown source types fall back
        to string so generation never fails on an unmapped type.
    """
    mapping = {
        'int': {'datax': 'long', 'hive': 'BIGINT'},
        'bigint': {'datax': 'long', 'hive': 'BIGINT'},
        'smallint': {'datax': 'long', 'hive': 'INT'},
        'decimal': {'datax': 'double', 'hive': 'DECIMAL(18,2)'},
        'numeric': {'datax': 'double', 'hive': 'DECIMAL(18,2)'},
        'float': {'datax': 'double', 'hive': 'DOUBLE'},
        'char': {'datax': 'string', 'hive': 'STRING'},
        'varchar': {'datax': 'string', 'hive': 'STRING'},
        'nvarchar': {'datax': 'string', 'hive': 'STRING'},
        'datetime': {'datax': 'date', 'hive': 'TIMESTAMP'},
        'date': {'datax': 'date', 'hive': 'DATE'},
        'bit': {'datax': 'boolean', 'hive': 'BOOLEAN'}
    }
    # Default to string for anything not explicitly mapped.
    return mapping.get(mssql_type.lower(), {'datax': 'string', 'hive': 'STRING'})

def fill_template(template, mapping):
    """Recursively substitute ${key} placeholders in a template structure.

    Args:
        template: a dict, list, str, or any scalar. Dicts and lists are
            walked recursively; other non-str values pass through unchanged.
        mapping: placeholder name -> replacement value. If a string consists
            of exactly one placeholder and the value is not a str, the value
            object itself is returned (used to inject column lists into the
            DataX template); otherwise str values are substituted textually.

    Returns:
        A new structure with placeholders resolved.
    """
    if isinstance(template, dict):
        return {k: fill_template(v, mapping) for k, v in template.items()}
    elif isinstance(template, list):
        return [fill_template(v, mapping) for v in template]
    elif isinstance(template, str):
        res = template
        for k, v in mapping.items():
            # A standalone placeholder with a non-str value yields the
            # object itself (e.g. a list of column dicts), not a string.
            if template == f"${{{k}}}" and not isinstance(v, str):
                return v
            # Otherwise do plain textual substitution for str values.
            # NOTE(review): a non-str value embedded inside a longer string
            # is intentionally left unreplaced.
            if isinstance(v, str):
                res = res.replace(f"${{{k}}}", v)
        return res
    return template

def run():
    """Generate one DataX job JSON and one Hive DDL script per configured table.

    Reads config.yaml, pulls column metadata for each listed table from
    SQL Server via INFORMATION_SCHEMA.COLUMNS, then renders the DataX job
    template and the Hive DDL template into output/datax_json and
    output/hive_sql respectively.
    """
    with open('config.yaml', 'r', encoding='utf-8') as f:
        conf = yaml.safe_load(f)

    db_info = conf['source_db']
    out_json_dir = "output/datax_json"
    out_sql_dir = "output/hive_sql"
    for d in (out_json_dir, out_sql_dir):
        os.makedirs(d, exist_ok=True)

    conn = pymssql.connect(
        server=db_info['host'], port=db_info['port'],
        user=db_info['user'], password=db_info['password'],
        database=db_info['database']
    )
    try:
        cursor = conn.cursor(as_dict=True)

        for table_item in conf['tables']:
            t_name = table_item['name']
            print(f"正在同步元数据: {t_name}...")

            # 1. Fetch SQL Server column metadata. Parameterized (%s) so a
            #    table name coming from the config file cannot inject SQL.
            cursor.execute(
                """
                SELECT COLUMN_NAME, DATA_TYPE
                FROM INFORMATION_SCHEMA.COLUMNS
                WHERE TABLE_NAME = %s
                ORDER BY ORDINAL_POSITION
                """,
                (t_name,)
            )
            rows = cursor.fetchall()

            # 2. Build the three column representations the templates need.
            reader_cols = []
            writer_cols = []
            hive_cols_list = []
            for r in rows:
                col_name = r['COLUMN_NAME']
                types = get_type_mapping(r['DATA_TYPE'])
                reader_cols.append(col_name)
                writer_cols.append({"name": col_name, "type": types['datax']})
                hive_cols_list.append(f"`{col_name}` {types['hive']}")

            # 3. Variable pool for placeholder substitution.
            replace_map = {
                "db_user": db_info['user'],
                "db_pwd": db_info['password'],
                "db_host": db_info['host'],
                "db_port": str(db_info['port']),
                "db_name": db_info['database'],
                "table_name": t_name,
                "hdfs_path": table_item['hdfs_path'],
                # [col] bracket-quoting protects T-SQL reserved words.
                "column_str": ", ".join(f"[{c}]" for c in reader_cols),
                "reader_columns": reader_cols,
                "writer_columns": writer_cols,
                "hive_columns_sql": ",\n ".join(hive_cols_list)
            }

            # 4. Render the DataX JSON (deepcopy keeps the shared template
            #    pristine across loop iterations).
            final_json = fill_template(copy.deepcopy(conf['datax_template']), replace_map)
            with open(f"{out_json_dir}/{t_name}.json", 'w', encoding='utf-8') as f:
                json.dump(final_json, f, indent=4, ensure_ascii=False)

            # 5. Render the Hive DDL.
            final_sql = fill_template(conf['hive_ddl_template'], replace_map)
            with open(f"{out_sql_dir}/{t_name}.sql", 'w', encoding='utf-8') as f:
                f.write(final_sql)
    finally:
        # Close the connection even if metadata extraction fails mid-way.
        conn.close()

    print("\n全部生成完毕!")
    print(f"JSON 路径: {out_json_dir}")
    print(f"SQL 路径: {out_sql_dir}")

# Script entry point: run the generator only when executed directly.
if __name__ == "__main__":
    run()