SeaTunnel 2.3.11 + Web 1.0.3 Docker 部署实战:Kafka 同步 Hive/ES 完整指南

SeaTunnel 2.3.11 + Web 1.0.3 Docker 部署实战:Kafka 同步 Hive/ES 完整指南

关键词:SeaTunnel 2.3.11、Docker 部署、Kafka 同步、Hive、Elasticsearch、数据集成

本文档详细介绍如何使用 Docker 部署 SeaTunnel 2.3.11 和 SeaTunnel Web 1.0.3,
并配置 Kafka 虚拟表、数据源以及 Kafka 同步到 Hive 和 Elasticsearch 的完整实战案例。

安装准备

目录结构

seatunnel-docker/
├── docker-compose.yml              # 主编排文件
├── hive/                           # Hive 配置
│   ├── hive-site.xml
│   └── lib/                        # 依赖 jar 包
│       └── postgresql-42.5.1.jar
├── init-sql/                       # 初始化 SQL
│   └── seatunnel_server_mysql.sql
├── seatunnel/                      # SeaTunnel 服务端配置
│   ├── Dockerfile
│   └── apache-seatunnel-2.3.11/    # 解压后的二进制包
│       └── lib/                    # 依赖 jar 包
│           ├── hive-exec-3.1.3.jar
│           ├── hive-metastore-3.1.3.jar
│           ├── libfb303-0.9.3.jar
│           ├── mysql-connector-java-8.0.28.jar
│           └── seatunnel-hadoop3-3.1.4-uber.jar
└── seatunnel-web/                  # SeaTunnel Web 配置
    ├── Dockerfile
    └── apache-seatunnel-web-1.0.3-bin/  # 解压后的二进制包
        └── libs/                   # 依赖 jar 包
            └── mysql-connector-java-8.0.28.jar

下载 seatunnel

# seatunnel-2.3.11
https://dlcdn.apache.org/seatunnel/2.3.11/apache-seatunnel-2.3.11-bin.tar.gz

# 源码构建seatunnel-web-1.0.3
git clone https://github.com/apache/seatunnel-web.git
cd seatunnel-web
sh build.sh code

下载依赖包

#hive-metastore容器需要(PostgreSQL为Hive 元数据库)
https://jdbc.postgresql.org/download/postgresql-42.5.1.jar

#hive-metastore同步报错缺少依赖包(实际验证加前3个包即可)
https://repo1.maven.org/maven2/org/apache/hive/hive-exec/3.1.3/hive-exec-3.1.3.jar
https://repo1.maven.org/maven2/org/apache/hive/hive-metastore/3.1.3/hive-metastore-3.1.3.jar
https://repo.maven.apache.org/maven2/org/apache/thrift/libfb303/0.9.3/libfb303-0.9.3.jar
https://repo1.maven.org/maven2/org/apache/thrift/libthrift/0.12.0/libthrift-0.12.0.jar
https://repo1.maven.org/maven2/org/apache/hive/hive-common/3.1.3/hive-common-3.1.3.jar

创建项目目录

将准备好的相关文件存放到 seatunnel-docker 目录

mkdir seatunnel-docker
cd seatunnel-docker

Docker部署

docker-compose.yml配置

version: '3.9'

networks:
  seatunnel-network:
    driver: bridge
    ipam:
      config:
        - subnet: 172.16.0.0/24

services:
  # ===== Hive 相关服务 =====
  hive-metastore-db:
    image: postgres:15
    container_name: hive-metastore-db
    hostname: hive-metastore-db
    environment:
      POSTGRES_DB: metastore_db
      POSTGRES_USER: hive
      POSTGRES_PASSWORD: hive123456
    ports:
      - "5432:5432"
    volumes:
      - ./hive-metastore-db-data:/var/lib/postgresql/data
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.2
    healthcheck:  # 添加健康检查
      test: ["CMD-SHELL", "pg_isready -U hive -d metastore_db"]
      interval: 5s
      timeout: 5s
      retries: 10
      start_period: 10s
  hive-metastore:
    image: apache/hive:4.0.0
    container_name: hive-metastore
    hostname: hive-metastore
    depends_on:
      hive-metastore-db:
        condition: service_healthy  # 等待数据库健康后才启动
    environment:
      SERVICE_NAME: metastore
      DB_DRIVER: postgres
      SERVICE_OPTS: >-
        -Djavax.jdo.option.ConnectionDriverName=org.postgresql.Driver
        -Djavax.jdo.option.ConnectionURL=jdbc:postgresql://hive-metastore-db:5432/metastore_db
        -Djavax.jdo.option.ConnectionUserName=hive
        -Djavax.jdo.option.ConnectionPassword=hive123456
    ports:
      - "9083:9083"
    volumes:
      - ./hive/lib/postgresql-42.5.1.jar:/opt/hive/lib/postgresql-42.5.1.jar
      - ./hive/hive-site.xml:/opt/hive/conf/hive-site.xml
      - ./hive-warehouse:/opt/hive/data/warehouse
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.3

  hive-server2:
    image: apache/hive:4.0.0
    container_name: hive-server2
    hostname: hive-server2
    depends_on:
      - hive-metastore
    environment:
      HIVE_SERVER2_THRIFT_PORT: 10000
      SERVICE_NAME: hiveserver2
      IS_RESUME: "true"
      SERVICE_OPTS: "-Dhive.metastore.uris=thrift://hive-metastore:9083"
    ports:
      - "10000:10000"
      - "10002:10002"
    volumes:
      - ./hive-warehouse:/opt/hive/data/warehouse
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.4

  # ===== MySQL =====
  mysql-seatunnel:
    image: mysql:8.0.42
    container_name: mysql-seatunnel
    hostname: mysql-seatunnel
    environment:
      MYSQL_ROOT_PASSWORD: root123456
      MYSQL_DATABASE: seatunnel
      MYSQL_ROOT_HOST: '%'
    ports:
      - "3806:3306"
    volumes:
      - ./mysql_data:/var/lib/mysql
      - ./init-sql:/docker-entrypoint-initdb.d
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.5
    command: --default-authentication-plugin=mysql_native_password
    healthcheck:
      test: [ "CMD", "mysqladmin", "ping", "-h", "localhost" ]
      interval: 10s
      timeout: 5s
      retries: 5

  # ===== SeaTunnel =====
  seatunnel-master:
    build:
      context: ./seatunnel
      dockerfile: Dockerfile
    image: seatunnel:2.3.11
    container_name: seatunnel-master
    hostname: seatunnel-master
    extra_hosts:
      - "hive-metastore:172.16.0.3"
      - "hive-metastore-db:172.16.0.2"
    environment:
      - SEATUNNEL_HOME=/opt/seatunnel
    command: >
      sh -c "
      cd /opt/seatunnel &&
      exec bin/seatunnel-cluster.sh -r master
      "
    ports:
      - "5801:5801"
    volumes:
      - ./seatunnel/apache-seatunnel-2.3.11/:/opt/seatunnel/
      - ./logs/master:/opt/seatunnel/logs
      # [修改点] 挂载 Hive 仓库目录,确保数据写入宿主机共享目录
      - ./hive-warehouse:/opt/hive/data/warehouse
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.10

  seatunnel-worker1:
    image: seatunnel:2.3.11
    container_name: seatunnel-worker1
    hostname: seatunnel-worker1
    extra_hosts:
      - "hive-metastore:172.16.0.3"
      - "hive-metastore-db:172.16.0.2"
    environment:
      - SEATUNNEL_HOME=/opt/seatunnel
    command: >
      sh -c "
      cd /opt/seatunnel &&
      exec bin/seatunnel-cluster.sh -r worker
      "
    volumes:
      - ./seatunnel/apache-seatunnel-2.3.11/:/opt/seatunnel/
      - ./logs/worker1:/opt/seatunnel/logs
      # [修改点] 挂载 Hive 仓库目录,确保数据写入宿主机共享目录
      - ./hive-warehouse:/opt/hive/data/warehouse
    depends_on:
      - seatunnel-master
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.11

  seatunnel-worker2:
    image: seatunnel:2.3.11
    container_name: seatunnel-worker2
    hostname: seatunnel-worker2
    extra_hosts:
      - "hive-metastore:172.16.0.3"
      - "hive-metastore-db:172.16.0.2"
    environment:
      - SEATUNNEL_HOME=/opt/seatunnel
    command: >
      sh -c "
      cd /opt/seatunnel &&
      exec bin/seatunnel-cluster.sh -r worker
      "
    volumes:
      - ./seatunnel/apache-seatunnel-2.3.11/:/opt/seatunnel/
      - ./logs/worker2:/opt/seatunnel/logs
      # [修改点] 挂载 Hive 仓库目录,确保数据写入宿主机共享目录
      - ./hive-warehouse:/opt/hive/data/warehouse
    depends_on:
      - seatunnel-master
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.12

  seatunnel-web:
    build:
      context: ./seatunnel-web
      dockerfile: Dockerfile
    image: seatunnel-web:1.0.3
    container_name: seatunnel-web
    hostname: seatunnel-web
    extra_hosts:
      - "hive-metastore:172.16.0.3"
      - "hive-metastore-db:172.16.0.2"
    environment:
      - SEATUNNEL_HOME=/opt/seatunnel
      - SEATUNNEL_WEB_HOME=/opt/seatunnel-web
    ports:
      - "8801:8801"
    volumes:
      - ./seatunnel/apache-seatunnel-2.3.11/:/opt/seatunnel/
      - ./seatunnel-web/apache-seatunnel-web-1.0.3-bin/:/opt/seatunnel-web/
      - ./logs/web:/opt/seatunnel-web/logs
      # [修改点] 挂载 Hive 仓库目录,保持环境一致性
      - ./hive-warehouse:/opt/hive/data/warehouse
    depends_on:
      - seatunnel-master
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.13

SeaTunnel 配置

Dockerfile

FROM eclipse-temurin:8-jdk-ubi9-minimal

WORKDIR /opt/seatunnel/

# 设置环境变量
ENV SEATUNNEL_HOME=/opt/seatunnel
ENV PATH=$PATH:$SEATUNNEL_HOME/bin

# 暴露端口
EXPOSE 5801

# 启动命令
CMD ["sh", "bin/seatunnel-cluster.sh", "-r", "master"]

hazelcast-client.yaml 客户端配置

编辑 seatunnel/apache-seatunnel-2.3.11/config/hazelcast-client.yaml

hazelcast-client:
  cluster-name: seatunnel
  properties:
    hazelcast.logging.type: log4j2
  connection-strategy:
    connection-retry:
      cluster-connect-timeout-millis: 3000
  network:
    cluster-members:
      - seatunnel-master:5801

hazelcast-master.yaml 配置

编辑 seatunnel/apache-seatunnel-2.3.11/config/hazelcast-master.yaml

hazelcast:
  cluster-name: seatunnel
  network:
    rest-api:
      enabled: false
      endpoint-groups:
        CLUSTER_WRITE:
          enabled: true
        DATA:
          enabled: true
    join:
      tcp-ip:
        enabled: true
        member-list:
          - seatunnel-master:5801
          - seatunnel-worker1:5802
          - seatunnel-worker2:5802
    port:
      auto-increment: false
      port: 5801
  properties:
    hazelcast.invocation.max.retry.count: 20
    hazelcast.tcp.join.port.try.count: 30
    hazelcast.logging.type: log4j2
    hazelcast.operation.generic.thread.count: 50
    hazelcast.heartbeat.failuredetector.type: phi-accrual
    hazelcast.heartbeat.interval.seconds: 2
    hazelcast.max.no.heartbeat.seconds: 180
    hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10
    hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200
    hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100

hazelcast-worker.yaml 配置

编辑 seatunnel/apache-seatunnel-2.3.11/config/hazelcast-worker.yaml

hazelcast:
  cluster-name: seatunnel
  network:
    join:
      tcp-ip:
        enabled: true
        member-list:
          - seatunnel-master:5801
          - seatunnel-worker1:5802
          - seatunnel-worker2:5802
    port:
      auto-increment: false
      port: 5802
  properties:
    hazelcast.invocation.max.retry.count: 20
    hazelcast.tcp.join.port.try.count: 30
    hazelcast.logging.type: log4j2
    hazelcast.operation.generic.thread.count: 50
    hazelcast.heartbeat.failuredetector.type: phi-accrual
    hazelcast.heartbeat.interval.seconds: 2
    hazelcast.max.no.heartbeat.seconds: 180
    hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10
    hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200
    hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100

安装连接器依赖包

配置同步任务,点击Source组件,源名称下拉框没有数据,需要安装依赖才可以显示。

 cd seatunnel/apache-seatunnel-2.3.11/
 sh bin/install-plugin.sh

SeaTunnel Web 配置

Dockerfile 配置

FROM eclipse-temurin:8-jdk-ubi9-minimal

WORKDIR /opt/seatunnel-web/

# 设置环境变量
ENV SEATUNNEL_WEB_HOME=/opt/seatunnel-web
ENV SEATUNNEL_HOME=/opt/seatunnel

# 暴露端口
EXPOSE 8801

# 启动命令
CMD ["sh", "bin/seatunnel-backend-daemon.sh", "start"]

application.yml 配置

编辑 seatunnel-web/apache-seatunnel-web-1.0.3-bin/conf/application.yml

server:
  port: 8801

spring:
  main:
    allow-circular-references: true
  application:
    name: seatunnel
  jackson:
    date-format: yyyy-MM-dd HH:mm:ss
  datasource:
    driver-class-name: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://mysql-seatunnel:3306/seatunnel?useSSL=false&useUnicode=true&characterEncoding=utf-8&allowMultiQueries=true&allowPublicKeyRetrieval=true
    username: root
    password: root123456
jwt:
  expireTime: 86400
  # please add key when deploy(部署时必须配置 secretKey,长度不少于 32 位,否则启动后访问页面会报 secret key 异常)
  secretKey: a3f5c8d2e1b4098765432109abcdef1234567890abcdef
  algorithm: HS256

hazelcast-client.yaml 客户端配置

编辑 seatunnel-web/apache-seatunnel-web-1.0.3-bin/conf/hazelcast-client.yaml

hazelcast-client:
  cluster-name: seatunnel
  properties:
    hazelcast.logging.type: log4j2
  connection-strategy:
    connection-retry:
      cluster-connect-timeout-millis: 3000
  network:
    cluster-members:
      - seatunnel-master:5801

seatunnel-backend-daemon.sh

编辑 seatunnel-web/apache-seatunnel-web-1.0.3-bin/bin/seatunnel-backend-daemon.sh
去除后台模式 去掉 nohup 和最后的 &

$JAVA_HOME/bin/java $JAVA_OPTS \
  -cp "$CLASSPATH" $SPRING_OPTS \
  org.apache.seatunnel.app.SeatunnelApplication >> "${LOGDIR}/seatunnel.out" 2>&1
  echo "seatunnel-web started"

plugin-mapping.properties 配置

实际验证此步骤可省略。
拷贝seatunnel/apache-seatunnel-2.3.11/connectors/plugin-mapping.properties到
seatunnel-web/apache-seatunnel-web-1.0.3-bin/conf/plugin-mapping.properties

cd seatunnel-docker
cp seatunnel/apache-seatunnel-2.3.11/connectors/plugin-mapping.properties  seatunnel-web/apache-seatunnel-web-1.0.3-bin/conf/plugin-mapping.properties

Hive 配置

hive-site.xml 配置

<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://hive-metastore:9083</value>
    </property>
    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/opt/hive/data/warehouse</value>
    </property>
    <property>
        <!-- 注:属性名为 metastore.event.db.notification.api.auth(原文多写了一个 metastore. 前缀,会导致该配置被忽略) -->
        <name>metastore.event.db.notification.api.auth</name>
        <value>false</value>
    </property>
</configuration>

lib目录 依赖包

postgresql-42.5.1.jar

Mysql 配置

init-sql目录 初始SQL脚本

拷贝seatunnel-web/apache-seatunnel-web-1.0.3-bin/script/seatunnel_server_mysql.sql到
init-sql/seatunnel_server_mysql.sql

 cd seatunnel-docker
 cp seatunnel-web/apache-seatunnel-web-1.0.3-bin/script/seatunnel_server_mysql.sql init-sql/seatunnel_server_mysql.sql

docker启动

# 启动所有服务
docker compose up -d --build
# 访问web ui页面 默认登录账号:admin / admin
open http://localhost:8801

运行示例

seatunnel-docker-start

登录配置语言

登录页面

login

设置

setting

配置语言

lang

配置数据源

kafka数据源

测试kafka_datasource-detail

ES数据源

测试ES_datasource-detail

Hive-metastore本地数据源

配置为thrift://hive-metastore:9083也可以。

测试本地hive-metastore_datasource-detail

配置虚拟表

虚拟表列表

virtual-table-detail

创建虚拟表流程

  1. 进入「虚拟表」菜单,点击「创建」按钮
  2. 选择数据源,配置虚拟表信息
  3. 点击「下一步」配置字段映射
  4. 点击「下一步」确认信息并保存

配置同步任务

kafka-hive 同步任务

测试kafka-hive_task-definition

任务组件配置

Source 组件配置

测试kafka-hive_task-definition-source

FieldMapper 组件配置(模型视图)

测试kafka-hive_task-definition-fieldmapper

Sink 组件配置

测试kafka-hive_task-definition-sink

Kafka-Elasticsearch 同步任务

测试kafka-ES_task-definition

任务组件配置

Source 组件配置

测试kafka-ES_task-definition-source

FieldMapper 组件配置(模型视图)

测试kafka-ES_task-definition-fieldmapper

Sink 组件配置

测试kafka-ES_task-definition-sink

创建同步任务通用流程

  1. 进入「任务」→「同步任务定义」,点击「创建」按钮
  2. 拖拽或选择 Source、FieldMapper、Sink 组件构建任务流程
  3. 双击 Source 组件,配置数据源信息(选择已配置的 Kafka 数据源)
  4. 双击 FieldMapper 组件,点击「模型」按钮配置字段映射关系
  5. 双击 Sink 组件,配置目标数据源信息(Hive 或 Elasticsearch)
  6. 保存并启动任务
    需要配置job mode ,不然保存不了,报错job env can't be empty, please change config

job-mode

hive相关操作

创建表


# 进入 HiveServer2 容器
docker exec -it hive-server2 beeline -u jdbc:hive2://localhost:10000 -e "
CREATE TABLE IF NOT EXISTS default.test_user_data3 (
user_id STRING,
type STRING,
content STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
"

# 或创建 Parquet 格式(推荐)
docker exec -it hive-server2 beeline -u jdbc:hive2://localhost:10000 -e "
CREATE TABLE IF NOT EXISTS default.test_user_data3 (
user_id STRING,
type STRING,
content STRING
)
STORED AS PARQUET;
"

查看表结构

docker exec -it hive-server2 beeline -u jdbc:hive2://localhost:10000 -e "
SHOW TABLES IN default;
DESCRIBE default.test_user_data3;
"

查询表数据

docker exec -it hive-server2 beeline -u jdbc:hive2://localhost:10000 -e "
SELECT * FROM default.test_user_data3 LIMIT 10;
"

备注

seatunnel-web容器启动就退出

排查是否配置

seatunnel-backend-daemon.sh

编辑 seatunnel-web/apache-seatunnel-web-1.0.3-bin/bin/seatunnel-backend-daemon.sh
去除后台模式 去掉 nohup 和最后的 &

$JAVA_HOME/bin/java $JAVA_OPTS \
  -cp "$CLASSPATH" $SPRING_OPTS \
  org.apache.seatunnel.app.SeatunnelApplication >> "${LOGDIR}/seatunnel.out" 2>&1
  echo "seatunnel-web started"

seatunnel-web启动后访问页面报错Unknown exception. secret key byte array cannot be null or empty

排查application.yml是否配置

jwt:
  expireTime: 86400
  # please add key when deploy
  secretKey:  a3f5c8d2e1b4098765432109abcdef1234567890abcdef
  algorithm: HS256

hive地址解析异常

seatunnel seatunnel-web ERROR [qtp2135089262-20] [MetaStoreUtils.logAndThrowMetaException():166] - Got exception: java.net.URISyntaxException Illegal character in hostname at index 44: thrift://hive-metastore.seatunnel-docker_seatunnel-network:9083

docker-compose.yml对应容器加上ip绑定

 extra_hosts:
   - "hive-metastore:172.16.0.3"
   - "hive-metastore-db:172.16.0.2"

Hive同步报错 error java.lang.NoClassDefFoundError

seatunnel/apache-seatunnel-2.3.11/lib存放依赖包

hive-exec-3.1.3.jar
hive-metastore-3.1.3.jar
libfb303-0.9.3.jar

hive同步任务显示成功,实际没有数据写入

docker-compose.yml对应容器加上hive写入本地目录的配置

volumes:
  # [修改点] 挂载 Hive 仓库目录,确保数据写入宿主机共享目录
  - ./hive-warehouse:/opt/hive/data/warehouse

查看任务执行日志 will be executed on worker

./logs/master/seatunnel-engine-master.log

 Task [TaskGroupLocation{jobId=1080750681855361026, pipelineId=1, taskGroupId=2}] will be executed on worker [[seatunnel-worker2]:5801], slotID [2], resourceProfile [ResourceProfile{cpu=CPU{core=0}, heapMemory=Memory{bytes=0}}], sequence [db6b679c-67cc-43b8-b64a-acaa85c2a4c0], assigned [1080750681855361026]
posted @ 2026-03-05 10:18  云婷  阅读(5)  评论(0)    收藏  举报