DataX基础环境配置
1、下载DataX
wget http://datax-opensource.oss-cn-hangzhou.aliyuncs.com/datax.tar.gz
tar -zxvf datax.tar.gz
cd datax
2、验证安装
python3.9 bin/datax.py job/job.json
![alt text]()
![alt text]()
3、配置DataX环境变量
sudo vim /etc/profile
# DataX
export DATAX_HOME=/opt/datax
export PATH=$PATH:$DATAX_HOME/bin
4、问题处理
4-1、SyntaxError: Missing parentheses in call to 'print'. Did you mean print(readerRef)?
这个错误是因为在 Python 3 中,print 是一个函数,必须使用括号 () 来调用;而在 Python 2 中 print 是一个语句,可以不带括号直接使用。DataX 自带的 bin/datax.py 等脚本是基于 Python 2 编写的,用 Python 3 直接运行就会出现这个错误,需要先把其中的 print 语句改写成 print() 函数调用。
4-2、添加 mysql 8 对应的java jar包
![alt text]()
plugin/reader/mysqlreader/lib
![alt text]()
plugin/writer/mysqlwriter/lib
![alt text]()
MySQL Connectors
下载官网: https://www.mysql.com/products/connector/
# 错误示例(对应 4-1 的 print 报错:Python 2 写法)
print "Hello, World!"
# 正确示例(Python 3 写法)
print("Hello, World!")
5、datax job 对应mysql数据类型
5-1、基本数据类型
string: 字符串类型,对应 MySQL 的 CHAR, VARCHAR, TEXT 等字符串类型。
long: 长整型,对应 MySQL 的 BIGINT, INTEGER 等整数类型。
int: 整型,对应 MySQL 的 INT 类型。
double: 浮点数类型,对应 MySQL 的 FLOAT, DOUBLE, DECIMAL 等浮点数类型。
bool: 布尔类型,对应 MySQL 的 BOOL, BIT 类型;注意 DataX 的 mysqlreader 文档说明 TINYINT(1) 会被视作整型处理。
date: 日期类型,对应 MySQL 的 DATE, DATETIME, TIMESTAMP 等日期时间类型。
bytes: 字节数组类型,对应 MySQL 的 BLOB, VARBINARY 等二进制类型。
5-2、其他常见类型
datetime: 日期时间类型,对应 MySQL 的 DATETIME, TIMESTAMP。
timestamp: 时间戳类型,对应 MySQL 的 TIMESTAMP。
time: 时间类型,对应 MySQL 的 TIME。
year: 年份类型,对应 MySQL 的 YEAR。
Pycharm 加载DataX + Python3.9
1、解释器配置+代码同步远程映射
pycharm 解释器配置
![alt text]()
![alt text]()
2、pycharm 文件服务器推送同步
2-1、进行数据同步配置
![alt text]()
2-2、配置远程连接
![alt text]()
2-3、配置本地--服务器文件映射
![alt text]()
2-4、验证成功
![alt text]()
3、pycharm 配置 Run/Debug Configurations
![alt text]()
3-1、创建 Python 运行配置
1、打开 PyCharm,点击顶部菜单栏 Run → Edit Configurations。
2、点击 + 号,选择 Python 添加 run_datax。
3-2、关键参数配置
| 参数项 | 配置说明 | DataX示例值 |
| --- | --- | --- |
| Script path | 指定调用 DataX 的 Python 脚本路径。 | /project/run_datax.py |
| Parameters | 传递给脚本的命令行参数(如 DataX 的 JSON 配置文件路径)。 | --job /datax/job/mysql2hdfs.json |
| Python interpreter | 选择 Python 3.9 解释器(需提前配置)。 | Conda (Python 3.9) 或系统解释器 |
| Working directory | 设置工作目录(影响 DataX 配置文件的相对路径解析)。 | /project |
| Environment variables | 添加环境变量(如 DataX 的安装路径)。 | DATAX_HOME=/opt/datax |
| Add content roots to PYTHONPATH | 勾选后,项目根目录会被加入 PYTHONPATH,方便导入自定义模块。 | [x] 勾选 |
4、git 版本控制 (pycharm 汉化)
![alt text]()
参考代码
作业 job/mysql2mysql_user.json 【mysql to mysql】
{
"job": {
"setting": {
"speed": {
"channel": 1
}
},
"content": [
{
"reader": {
"name": "mysqlreader",
"type": "mysql",
"parameter": {
"username": "root",
"password": "pW@123456",
"connection": [
{
"querySql": [
"SELECT user_id, name, age, gender, province, city, region, phone, birthday, hobby, register_date FROM t_user"
],
"jdbcUrl": [
"jdbc:mysql://192.168.124.224:3306/test_datax?useSSL=false&characterEncoding=utf8"
]
}
]
}
},
"writer": {
"name": "mysqlwriter",
"parameter": {
"username": "root",
"password": "pW@123456",
"column" : [
"user_id",
"name",
"age",
"gender",
"province",
"city",
"region",
"phone",
"birthday",
"hobby",
"register_date"
],
"connection": [
{
"jdbcUrl": "jdbc:mysql://192.168.124.224:3306/ods?useSSL=false&characterEncoding=utf8",
"table": ["t_user"]
}
],
"preSql": [
"TRUNCATE TABLE t_user"
],
"postSql": [
],
"writeMode": "insert",
"batchSize": 1024,
"print": false,
"encoding": "UTF-8"
}
}
}
]
}
}
主函数 bin/datax.py [python2 转python3 处理]
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import sys
import os
import signal
import subprocess
import time
import re
import socket
import json
from optparse import OptionParser
from optparse import OptionGroup
from string import Template
import codecs
import platform
def isWindows():
    """Return True when ``platform.system()`` reports 'Windows'."""
    system_name = platform.system()
    return system_name == 'Windows'
# Installation root: the parent of the directory that contains this script.
DATAX_HOME = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

DATAX_VERSION = 'DATAX-OPENSOURCE-3.0'

if isWindows():
    # Resolve the 'cp65001' codec name to the utf-8 codec; any other name
    # is left to the remaining codec search functions.
    codecs.register(
        lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
    CLASS_PATH = "%s/lib/*" % DATAX_HOME
else:
    CLASS_PATH = "%s/lib/*:." % DATAX_HOME

LOGBACK_FILE = "%s/conf/logback.xml" % DATAX_HOME

# JVM flags used as the base of every launch command.
DEFAULT_JVM = (
    "-Xms1g -Xmx1g -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=%s/log"
    % DATAX_HOME
)

DEFAULT_PROPERTY_CONF = (
    "-Dfile.encoding=UTF-8 -Dlogback.statusListenerClass=ch.qos.logback.core.status.NopStatusListener -Djava.security.egd=file:///dev/urandom -Ddatax.home=%s -Dlogback.configurationFile=%s"
    % (DATAX_HOME, LOGBACK_FILE)
)

# The ${...} placeholders are left in place here; buildStartCommand fills
# them in through string.Template.
ENGINE_COMMAND = (
    "java -server ${jvm} %s -classpath %s ${params} com.alibaba.datax.core.Engine -mode ${mode} -jobid ${jobid} -job ${job}"
    % (DEFAULT_PROPERTY_CONF, CLASS_PATH)
)

REMOTE_DEBUG_CONFIG = "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,address=9999"

# Exit codes used by this launcher.
RET_STATE = dict(KILL=143, FAIL=-1, OK=0, RUN=1, RETRY=2)
def getLocalIp():
    """Return the IP address the local host name resolves to.

    The lookup is best-effort: if the host name cannot be resolved the
    string "Unknown" is returned instead of raising.
    """
    try:
        return socket.gethostbyname(socket.getfqdn(socket.gethostname()))
    except Exception:
        # Stay best-effort for any lookup failure, but (unlike a bare
        # ``except:``) let KeyboardInterrupt / SystemExit propagate.
        return "Unknown"
def suicide(signum, e):
    """Signal handler: stop the child process and exit with the KILL code.

    Args:
        signum: Number of the signal that was received.
        e: Second argument passed by the ``signal`` module to handlers
            (unused here).

    The child, if any, is first sent SIGQUIT, given one second, and then
    killed. The function never returns: it ends with
    ``sys.exit(RET_STATE["KILL"])``.
    """
    global child_process
    # Python 3 form of the old ``print >> sys.stderr, ...`` statement, which
    # raises TypeError under Python 3 before the child can be stopped.
    print("[Error] DataX receive unexpected signal %d, starts to suicide." % (signum),
          file=sys.stderr)
    if child_process:
        child_process.send_signal(signal.SIGQUIT)
        time.sleep(1)
        child_process.kill()
    print("DataX Process was killed ! you did ?", file=sys.stderr)
    sys.exit(RET_STATE["KILL"])
def register_signal():
    """Install ``suicide`` as the handler for signals 2, 3 and 15.

    Nothing is registered when ``isWindows()`` is true.
    """
    if isWindows():
        return
    for signal_number in (2, 3, 15):
        signal.signal(signal_number, suicide)
def getOptionParser():
    """Build the command-line parser for the launcher.

    Returns:
        An ``OptionParser`` with two option groups: "Product Env Options"
        (jvm, jobid, mode, params, reader, writer) and
        "Develop/Debug Options" (debug, loglevel).
    """
    option_parser = OptionParser(usage="usage: %prog [options] job-url-or-path")

    # Each entry is (option flags, keyword arguments for add_option).
    product_options = [
        (("-j", "--jvm"),
         dict(metavar="<jvm parameters>", dest="jvmParameters", action="store",
              default=DEFAULT_JVM, help="Set jvm parameters if necessary.")),
        (("--jobid",),
         dict(metavar="<job unique id>", dest="jobid", action="store", default="-1",
              help="Set job unique id when running by Distribute/Local Mode.")),
        (("-m", "--mode"),
         dict(metavar="<job runtime mode>",
              action="store", default="standalone",
              help="Set job runtime mode such as: standalone, local, distribute. "
                   "Default mode is standalone.")),
        (("-p", "--params"),
         dict(metavar="<parameter used in job config>",
              action="store", dest="params",
              help='Set job parameter, eg: the source tableName you want to set it by command, '
                   'then you can use like this: -p"-DtableName=your-table-name", '
                   'if you have mutiple parameters: -p"-DtableName=your-table-name -DcolumnName=your-column-name".'
                   'Note: you should config in you job tableName with ${tableName}.')),
        (("-r", "--reader"),
         dict(metavar="<parameter used in view job config[reader] template>",
              action="store", dest="reader", type="string",
              help='View job config[reader] template, eg: mysqlreader,streamreader')),
        (("-w", "--writer"),
         dict(metavar="<parameter used in view job config[writer] template>",
              action="store", dest="writer", type="string",
              help='View job config[writer] template, eg: mysqlwriter,streamwriter')),
    ]
    debug_options = [
        (("-d", "--debug"),
         dict(dest="remoteDebug", action="store_true",
              help="Set to remote debug mode.")),
        (("--loglevel",),
         dict(metavar="<log level>", dest="loglevel", action="store",
              default="info", help="Set log level such as: debug, info, all etc.")),
    ]

    product_group = OptionGroup(option_parser, "Product Env Options",
                                "Normal user use these options to set jvm parameters, job runtime mode etc. "
                                "Make sure these options can be used in Product Env.")
    for flags, settings in product_options:
        product_group.add_option(*flags, **settings)
    option_parser.add_option_group(product_group)

    debug_group = OptionGroup(option_parser, "Develop/Debug Options",
                              "Developer use these options to trace more details of DataX.")
    for flags, settings in debug_options:
        debug_group.add_option(*flags, **settings)
    option_parser.add_option_group(debug_group)

    return option_parser
def generateJobConfigTemplate(reader, writer):
    """Print a job configuration skeleton for a reader/writer plugin pair.

    Args:
        reader: Reader plugin name, e.g. "mysqlreader".
        writer: Writer plugin name, e.g. "mysqlwriter".

    Prints the documentation links for both plugins, a short usage hint and
    then the job template as indented JSON. Each plugin section is loaded
    from ``plugin_job_template.json`` under the plugin's directory in
    DATAX_HOME; if a template cannot be loaded an error line is printed and
    that section is left as an empty object.
    """
    readerRef = "Please refer to the %s document:\n https://github.com/alibaba/DataX/blob/master/%s/doc/%s.md \n" % (reader, reader, reader)
    writerRef = "Please refer to the %s document:\n https://github.com/alibaba/DataX/blob/master/%s/doc/%s.md \n " % (writer, writer, writer)
    print(readerRef)
    print(writerRef)
    jobGuid = 'Please save the following configuration as a json file and use\n python {DATAX_HOME}/bin/datax.py {JSON_FILE_NAME}.json \nto run the job.\n'
    print(jobGuid)
    jobTemplate = {
        "job": {
            "setting": {
                "speed": {
                    "channel": ""
                }
            },
            "content": [
                {
                    "reader": {},
                    "writer": {}
                }
            ]
        }
    }
    readerTemplatePath = "%s/plugin/reader/%s/plugin_job_template.json" % (DATAX_HOME, reader)
    writerTemplatePath = "%s/plugin/writer/%s/plugin_job_template.json" % (DATAX_HOME, writer)
    # Start from empty sections so a failed load cannot leave these names
    # unbound (previously a NameError right after the error message).
    readerPar = {}
    writerPar = {}
    try:
        readerPar = readPluginTemplate(readerTemplatePath)
    except (OSError, ValueError):
        print("Read reader[%s] template error: can\'t find file %s" % (reader, readerTemplatePath))
    try:
        writerPar = readPluginTemplate(writerTemplatePath)
    except (OSError, ValueError):
        print("Read writer[%s] template error: : can\'t find file %s" % (writer, writerTemplatePath))
    jobTemplate['job']['content'][0]['reader'] = readerPar
    jobTemplate['job']['content'][0]['writer'] = writerPar
    print(json.dumps(jobTemplate, indent=4, sort_keys=True))
def readPluginTemplate(plugin):
    """Load a plugin job template and return the parsed JSON.

    Args:
        plugin: Path of the template JSON file.

    Returns:
        The object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not valid JSON.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default locale encoding.
    with open(plugin, 'r', encoding='utf-8') as f:
        return json.load(f)
def isUrl(path):
    """Tell whether *path* looks like an http:// or https:// URL.

    A falsy *path* (``None``, empty string) yields False. The scheme is
    matched case-insensitively because the value is lower-cased first.
    """
    if not path:
        return False
    assert isinstance(path, str)
    lowered = path.lower()
    return re.match(r"^http[s]?://\S+\w*", lowered) is not None
def buildStartCommand(options, args):
    """Build the ``java ...`` command line that starts the DataX engine.

    Args:
        options: Parsed option values; this function reads
            ``jvmParameters``, ``remoteDebug``, ``loglevel``, ``mode``,
            ``params`` and ``jobid``.
        args: Positional arguments; ``args[0]`` is the job resource, either
            an http(s) URL or a local path (relative, absolute, or written
            with a ``file://`` prefix).

    Returns:
        ENGINE_COMMAND with its ``${jvm}``, ``${params}``, ``${mode}``,
        ``${jobid}`` and ``${job}`` placeholders substituted.

    Raises:
        KeyError: From ``Template.substitute`` if ``mode`` or ``jobid`` is
            empty and therefore missing from the substitution map.
    """
    commandMap = {}

    # User-supplied JVM flags are appended after the defaults.
    tempJVMCommand = DEFAULT_JVM
    if options.jvmParameters:
        tempJVMCommand = tempJVMCommand + " " + options.jvmParameters
    if options.remoteDebug:
        tempJVMCommand = tempJVMCommand + " " + REMOTE_DEBUG_CONFIG
        print('local ip: ', getLocalIp())
    if options.loglevel:
        tempJVMCommand = tempJVMCommand + " " + ("-Dloglevel=%s" % (options.loglevel))
    if options.mode:
        commandMap["mode"] = options.mode

    # The job resource may be a URL or a local file path (relative or absolute).
    jobResource = args[0]
    if not isUrl(jobResource):
        # Strip the scheme BEFORE resolving the path: once abspath() has run
        # the value starts with the working directory, so a later
        # "file://" check could never match.
        if jobResource.lower().startswith("file://"):
            jobResource = jobResource[len("file://"):]
        jobResource = os.path.abspath(jobResource)

    # Log file name: last 20 characters of the job resource with '/' and '.'
    # replaced by '_'.
    jobParams = ("-Dlog.file.name=%s") % (jobResource[-20:].replace('/', '_').replace('.', '_'))
    if options.params:
        jobParams = jobParams + " " + options.params
    if options.jobid:
        commandMap["jobid"] = options.jobid

    commandMap["jvm"] = tempJVMCommand
    commandMap["params"] = jobParams
    commandMap["job"] = jobResource
    return Template(ENGINE_COMMAND).substitute(**commandMap)
def printCopyright():
    """Print the DataX version and copyright banner, then flush stdout."""
    banner_template = '''
DataX (%s), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved.
'''
    print(banner_template % DATAX_VERSION)
    sys.stdout.flush()
def main():
    """Script entry point: parse arguments and run the DataX engine.

    When both --reader and --writer are given, only the job template is
    printed and the process exits with RET_STATE['OK']. Otherwise exactly
    one positional argument (the job) is required; the engine command is
    printed, started through the shell, and this process exits with the
    child's return code.
    """
    # suicide() reads this module-level name to find the running child.
    global child_process

    printCopyright()
    option_parser = getOptionParser()
    options, args = option_parser.parse_args(sys.argv[1:])

    template_requested = options.reader is not None and options.writer is not None
    if template_requested:
        generateJobConfigTemplate(options.reader, options.writer)
        sys.exit(RET_STATE['OK'])

    if len(args) != 1:
        option_parser.print_help()
        sys.exit(RET_STATE['FAIL'])

    startCommand = buildStartCommand(options, args)
    print(startCommand)

    child_process = subprocess.Popen(startCommand, shell=True)
    register_signal()
    child_process.communicate()
    sys.exit(child_process.returncode)


if __name__ == "__main__":
    main()