容器镜像构建-Hadoop基础镜像

说明

有安全加固或其他需求的,可自定义基础镜像用于Hadoop镜像构建。Hadoop基础镜像用于构建Hadoop镜像。

目录结构

.
├── Dockerfile.ubuntu25 # 基于Ubuntu25.10,python3,JDK11.
├── README.md
└── scripts # 已改造适配python3。
    ├── envtoconf.py
    ├── krb5.conf
    ├── starter.sh
    └── transformation.py

构建基础镜像

参考Hadoop官方说明
在本目录进行构建。因Hadoop官方项目长期未更新Dockerfile,已做部分改动。请自行对比官方原代码判断是否要调整。
请在联网环境构建镜像,推荐美国网络,规避软件下载异常问题。

JDK版本选择说明

参考Hadoop官方说明,JDK8支持编译和运行hadoop当前所有版本,Hadoop v3.3及更高版本可使用JDK11运行。
基础镜像默认使用JDK11(见Dockerfile中的openjdk-11-jdk与JAVA_HOME设置),若需使用JDK8,需编辑Dockerfile修改JDK版本。

执行构建命令

命令格式

docker build -t hadoop:runner-v1 -f Dockerfile.ubuntu25 .

Dockerfile.ubuntu25内容如下

# 1. Base image: Ubuntu 25.10
FROM ubuntu:25.10

# Avoid interactive prompts from apt during the build.
# NOTE(review): set via ENV, so it also persists into the final image and
# any derived images — confirm that is intended (ARG would be build-only).
ENV DEBIAN_FRONTEND=noninteractive

# 2. Safely remove the built-in UID/GID 1000 account so the 'hadoop'
# user created below can take UID 1000. The getent lookup works even
# if the UID 1000 account is not named 'ubuntu'.
RUN if getent passwd 1000; then \
        userdel -f $(getent passwd 1000 | cut -d: -f1); \
    fi && \
    if getent group 1000; then \
        groupdel $(getent group 1000 | cut -d: -f1); \
    fi

# 3. Refresh the package index and install runtime dependencies.
RUN apt-get update && apt-get install -y \
    sudo \
    python3-pip \
    python3-venv \
    python-is-python3 \
    wget \
    curl \
    netcat-openbsd \
    jq \
    openjdk-11-jdk \
    krb5-user \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# 4. Install Robot Framework.
# python-is-python3 makes /usr/bin/python point at python3.
RUN pip install --break-system-packages robotframework

# 5. Download dumb-init (pinned to v1.2.5).
# NOTE(review): the x86_64 binary is hard-coded; building on arm64 hosts
# would produce a non-runnable init — confirm target platforms.
RUN wget -O /usr/local/bin/dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 && \
    chmod +x /usr/local/bin/dumb-init

# 6. Permissions and directory initialisation.
# NOTE(review): keytabs are pre-created under /opt/security/keytabs here,
# while starter.sh defaults KEYTAB_DIR to /etc/security/keytabs — verify
# the two paths are meant to differ.
RUN mkdir -p /opt/security/keytabs && chmod -R a+wr /opt/security/keytabs 
ADD https://repo.maven.apache.org/maven2/org/jboss/byteman/byteman/4.0.4/byteman-4.0.4.jar /opt/byteman.jar
RUN chmod o+r /opt/byteman.jar

# 7. Install async-profiler v1.5.
RUN mkdir -p /opt/profiler && \
    cd /opt/profiler && \
    curl -L https://github.com/jvm-profiling-tools/async-profiler/releases/download/v1.5/async-profiler-1.5-linux-x64.tar.gz | tar xvz

# 8. Runtime environment variables.
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
ENV PATH=$PATH:/opt/hadoop/bin
ENV PYTHONPATH=/opt/scripts

# 9. Create the hadoop user with UID/GID 1000.
RUN groupadd --gid 1000 hadoop && \
    useradd --uid 1000 hadoop --gid 1000 --home /opt/hadoop --create-home --shell /bin/bash

# Allow the hadoop user passwordless sudo (starter.sh needs it to
# rewrite /etc/krb5.conf and fix mounted-volume permissions).
RUN echo "hadoop ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

# 10. Add helper scripts.
# Create the directory first, then ADD, so permissions are not lost.
RUN mkdir -p /opt/scripts
ADD scripts /opt/scripts/
# Make the shell and Python helpers executable.
RUN chmod +x /opt/scripts/*.sh /opt/scripts/*.py

# 11. Prepare the Hadoop home and log directories.
RUN mkdir -p /opt/hadoop /var/log/hadoop && \
    chmod 1777 /opt/hadoop /var/log/hadoop && \
    chown -R hadoop:hadoop /opt/hadoop

ENV HADOOP_LOG_DIR=/var/log/hadoop
ENV HADOOP_CONF_DIR=/opt/hadoop

WORKDIR /opt/hadoop
RUN mkdir /data && chmod 1777 /data && chown hadoop:hadoop /data

# 12. Runtime configuration: drop privileges and run via dumb-init.
USER hadoop
# Entry point goes through the starter script at its absolute path.
ENTRYPOINT ["/usr/local/bin/dumb-init", "--", "/opt/scripts/starter.sh"]

scripts目录下
bashrc内容如下

#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Interactive shell prompt for the container user: "user@host: cwd> ".
PS1="\u@\h: \w> "

envtoconf.py内容如下

#!/usr/bin/python3
# -*- coding: utf-8 -*-

"""convert environment variables to config"""

import os
import re
import argparse
import sys
import transformation

class Simple(object):
    """Convert environment variables into Hadoop-style configuration files.

    Variables named ``<FILE>_<EXT>[!<FMT>]_<KEY>`` (separators ``_`` or
    ``.``) are collected into ``<file>.<ext>.raw`` staging files under the
    destination directory and then rendered by :meth:`transform` using the
    matching ``transformation.to_<fmt>`` function.
    """
    def __init__(self, args):
        parser = argparse.ArgumentParser()
        parser.add_argument("--destination", help="Destination directory", required=True)
        self.args = parser.parse_args(args=args)

        self.known_formats = ['xml', 'properties', 'yaml', 'yml', 'env', "sh", "cfg", 'conf']
        self.output_dir = self.args.destination
        self.excluded_envs = ['HADOOP_CONF_DIR']
        # Maps config-file name -> (extension, output format).
        self.configurables = {}

    def destination_file_path(self, name, extension):
        """Return the output path ``<output_dir>/<name>.<extension>``."""
        return os.path.join(self.output_dir, "{}.{}".format(name, extension))

    def write_env_var(self, name, extension, key, value):
        """Append one ``key: value`` line to the ``.raw`` staging file."""
        # Explicit utf-8 encoding; append mode so repeated keys accumulate.
        file_path = self.destination_file_path(name, extension) + ".raw"
        with open(file_path, "a", encoding='utf-8') as myfile:
            myfile.write("{}: {}\n".format(key, value))

    def process_envs(self):
        """Scan os.environ and stage recognised variables into .raw files."""
        # Sort the environment variables for a predictable execution order.
        for key in sorted(os.environ.keys()):
            if key in self.excluded_envs:
                continue

            pattern = re.compile("[_\\.]")
            parts = pattern.split(key)
            if not parts:
                continue

            extension = None
            name = parts[0].lower()

            if len(parts) > 1:
                extension = parts[1].lower()
                # Default config key: everything after '<NAME>_<EXT>_'.
                config_key = key[len(name) + len(extension) + 2:].strip()

            if extension and "!" in extension:
                # '<EXT>!<FMT>' selects an explicit output format.
                splitted = extension.split("!")
                extension = splitted[0]
                fmt = splitted[1]
                config_key = key[len(name) + len(extension) + len(fmt) + 3:].strip()
            else:
                fmt = extension

            if extension and extension in self.known_formats:
                if name not in self.configurables:
                    # Truncate the staging file on first use of this name.
                    with open(self.destination_file_path(name, extension) + ".raw", "w", encoding='utf-8') as myfile:
                        myfile.write("")
                self.configurables[name] = (extension, fmt)
                self.write_env_var(name, extension, config_key, os.environ[key])
            else:
                # No recognised format marker: append the variable to an
                # already-registered configurable whose name prefixes it.
                for configurable_name in self.configurables:
                    if key.lower().startswith(configurable_name.lower()):
                        ext, _ = self.configurables[configurable_name]  # only the extension is needed here
                        self.write_env_var(configurable_name,
                                           ext,
                                           key[len(configurable_name) + 1:],
                                           os.environ[key])

    def transform(self):
        """Render each staged .raw file into its final output format."""
        for configurable_name in sorted(self.configurables.keys()):
            name = configurable_name
            extension, fmt = self.configurables[name]

            destination_path = self.destination_file_path(name, extension)

            if not os.path.exists(destination_path + ".raw"):
                continue

            with open(destination_path + ".raw", "r", encoding='utf-8') as myfile:
                content = myfile.read()

            # Dispatch to transformation.to_<fmt> (e.g. to_xml, to_env).
            try:
                transformer_func = getattr(transformation, "to_" + fmt)
                content = transformer_func(content)
                with open(destination_path, "w", encoding='utf-8') as myfile:
                    myfile.write(content)
            except AttributeError:
                print("Error: No transformer found for format '{}'".format(fmt), file=sys.stderr)

    def main(self):
        """Run the full pipeline: stage env vars, then render them."""
        self.process_envs()
        self.transform()

def main():
    """CLI entry point: pass argv (minus the program name) to Simple."""
    cli_args = sys.argv[1:]
    Simple(cli_args).main()

if __name__ == '__main__':
    main()

krb5.conf内容如下

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

[logging]
 default = FILE:/var/log/krb5libs.log
 kdc = FILE:/var/log/krb5kdc.log
 admin_server = FILE:/var/log/kadmind.log

[libdefaults]
 dns_canonicalize_hostname = false
 dns_lookup_realm = false
 ticket_lifetime = 24h
 renew_lifetime = 7d
 forwardable = true
 rdns = false
 default_realm = EXAMPLE.COM

[realms]
 EXAMPLE.COM = {
  kdc = SERVER
  admin_server = SERVER
 }

[domain_realm]
 .example.com = EXAMPLE.COM
 example.com = EXAMPLE.COM

starter.sh内容如下

#!/usr/bin/env bash
# Container entrypoint: optionally waits for dependencies, sets up
# Kerberos, renders config files from environment variables, performs
# one-time HDFS/Ozone initialisation, then execs the CMD arguments.
set -e

# Directory this script resides in (for krb5.conf and envtoconf.py).
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# 1. Optional fixed startup delay.
if [ -n "$SLEEP_SECONDS" ]; then
   echo "Sleeping for $SLEEP_SECONDS seconds"
   sleep "$SLEEP_SECONDS"
fi

# 2. Optionally wait for a TCP dependency given as WAITFOR=host:port.
if [ -n "$WAITFOR" ]; then
  echo "Waiting for the service $WAITFOR"
  WAITFOR_HOST=$(printf "%s\n" "$WAITFOR"| cut -d : -f 1)
  WAITFOR_PORT=$(printf "%s\n" "$WAITFOR"| cut -d : -f 2)
  # Probe once per second until the port opens or the timeout elapses.
  result=1
  for i in $(seq ${WAITFOR_TIMEOUT:-300} -1 0) ; do
    set +e
    nc -z "$WAITFOR_HOST" "$WAITFOR_PORT" > /dev/null 2>&1
    result=$?
    set -e
    if [ $result -eq 0 ] ; then
      break
    fi
    sleep 1
  done
  # Judge success by the last probe result, not the loop counter:
  # testing "$i" -eq 0 wrongly reported a timeout when the final
  # attempt actually succeeded.
  if [ "$result" -ne 0 ]; then
      echo "Waiting for service $WAITFOR is timed out." >&2
      exit 1
  fi
fi

# 3. Kerberos setup: fetch keytabs from the issuer and install krb5.conf.
if [ -n "$KERBEROS_ENABLED" ]; then
  echo "Setting up kerberos!!"
  KERBEROS_SERVER=${KERBEROS_SERVER:-krb5}
  ISSUER_SERVER=${ISSUER_SERVER:-$KERBEROS_SERVER:8081}
  echo "KDC ISSUER_SERVER => $ISSUER_SERVER"

  if [ -n "$SLEEP_SECONDS" ]; then
    echo "Sleeping for ${SLEEP_SECONDS} seconds"
    sleep "${SLEEP_SECONDS}"
  fi

  # NOTE(review): the Dockerfile pre-creates /opt/security/keytabs, but
  # this default points at /etc/security/keytabs — confirm which path
  # is intended.
  KEYTAB_DIR=${KEYTAB_DIR:-/etc/security/keytabs}

  # Block until the KDC issuer service answers with HTTP 200.
  while true; do
      set +e
      STATUS=$(curl -s -o /dev/null -w '%{http_code}' http://"$ISSUER_SERVER"/keytab/test/test)
      set -e
      if [ "$STATUS" -eq 200 ]; then
        echo "Got 200, KDC service ready!!"
        break
      else
        echo "Got $STATUS :( KDC service not ready yet..."
      fi
      sleep 5
  done

  HOST_NAME=$(hostname -f)
  export HOST_NAME
  # Download one keytab per requested principal name.
  for NAME in ${KERBEROS_KEYTABS}; do
    echo "Download $NAME/$HOST_NAME@EXAMPLE.COM keytab file to $KEYTAB_DIR/$NAME.keytab"
    wget -q "http://$ISSUER_SERVER/keytab/$HOST_NAME/$NAME" -O "$KEYTAB_DIR/$NAME.keytab"
    klist -kt "$KEYTAB_DIR/$NAME.keytab"
  done

  # Install krb5.conf with the real KDC host substituted (Ubuntu path).
  sed "s/SERVER/$KERBEROS_SERVER/g" "$DIR"/krb5.conf | sudo tee /etc/krb5.conf > /dev/null
fi

# 4. Permission fix for the Docker-mounted data volume.
sudo chmod o+rwx /data

# 5. Render configuration files from environment variables (Python 3).
python3 "$DIR"/envtoconf.py --destination "${HADOOP_CONF_DIR:-/opt/hadoop/etc/hadoop}"

# 6. One-time HDFS namenode initialisation.
if [ -n "$ENSURE_NAMENODE_DIR" ]; then
  CLUSTERID_OPTS=""
  if [ -n "$ENSURE_NAMENODE_CLUSTERID" ]; then
    CLUSTERID_OPTS="-clusterid $ENSURE_NAMENODE_CLUSTERID"
  fi
  if [ ! -d "$ENSURE_NAMENODE_DIR" ]; then
    # $CLUSTERID_OPTS intentionally unquoted: when empty it must expand
    # to nothing, not to an empty argument.
    /opt/hadoop/bin/hdfs namenode -format -force $CLUSTERID_OPTS
  fi
fi

if [ -n "$ENSURE_STANDBY_NAMENODE_DIR" ]; then
  if [ ! -d "$ENSURE_STANDBY_NAMENODE_DIR" ]; then
    /opt/hadoop/bin/hdfs namenode -bootstrapStandby
  fi
fi

# One-time Ozone SCM/OM initialisation (the init flag spelling varies
# by release, so both forms are attempted).
if [ -n "$ENSURE_SCM_INITIALIZED" ]; then
  if [ ! -f "$ENSURE_SCM_INITIALIZED" ]; then
    /opt/hadoop/bin/ozone scm --init || /opt/hadoop/bin/ozone scm -init
  fi
fi

if [ -n "$ENSURE_OM_INITIALIZED" ]; then
  if [ ! -f "$ENSURE_OM_INITIALIZED" ]; then
    /opt/hadoop/bin/ozone om --init || /opt/hadoop/bin/ozone om -createObjectStore
  fi
fi

# 7. Optional Byteman agent injection.
if [ -n "$BYTEMAN_SCRIPT" ] || [ -n "$BYTEMAN_SCRIPT_URL" ]; then
  BYTEMAN_DIR=${BYTEMAN_DIR:-/opt/profiler}
  export PATH=$PATH:$BYTEMAN_DIR/bin

  if [ -n "$BYTEMAN_SCRIPT_URL" ]; then
    # Quote the URL so shell metacharacters ('&', '?') cannot split
    # or glob it.
    sudo wget -q "$BYTEMAN_SCRIPT_URL" -O /tmp/byteman.btm
    export BYTEMAN_SCRIPT=/tmp/byteman.btm
  fi

  if [ ! -f "$BYTEMAN_SCRIPT" ]; then
    echo "ERROR: The defined $BYTEMAN_SCRIPT does not exist!!!"
    exit 1
  fi

  AGENT_STRING="-javaagent:/opt/byteman.jar=script:$BYTEMAN_SCRIPT"
  export HADOOP_OPTS="$AGENT_STRING $HADOOP_OPTS"
  echo "Process is instrumented with $AGENT_STRING"
fi

# Hand over to the command passed by docker (CMD / docker run args).
exec "$@"

transformation.py内容如下

#!/usr/bin/python3
# -*- coding: utf-8 -*-

"""This module transform properties into different format"""

def render_yaml(yaml_root, prefix=""):
    """Recursively render a nested dict/list structure as YAML-like text.

    Dicts emit sorted ``key: value`` lines (preceded by a newline when
    nested), lists emit ``- item`` lines, and scalars are rendered with
    a trailing newline.
    """
    if isinstance(yaml_root, dict):
        # Nested mappings start on a fresh line; the top level does not.
        pieces = ["\n"] if prefix else []
        for field in sorted(yaml_root.keys()):
            rendered_child = render_yaml(yaml_root[field], prefix + "   ")
            pieces.append("{}{}: {}".format(prefix, field, rendered_child))
        return "".join(pieces)
    if isinstance(yaml_root, list):
        pieces = ["\n"]
        for element in yaml_root:
            pieces.append(prefix + " - " + render_yaml(element, prefix + " "))
        return "".join(pieces)
    # Scalar leaf value.
    return "{}\n".format(yaml_root)


def to_yaml(content):
    """Transform flat dotted properties into nested YAML text.

    Dotted key segments become nested dicts; purely numeric segments are
    treated as list indices (missing slots are padded with empty dicts).
    """
    props = process_properties(content)
    keys = sorted(props.keys())  # sort for stable output
    yaml_props = {}
    for key in keys:
        parts = key.split(".")
        node = yaml_props
        prev_part = None
        parent_node = {}
        for part in parts[:-1]:
            if part.isdigit():
                idx = int(part)
                if isinstance(node, dict):
                    # NOTE(review): replaces the dict node with a fresh
                    # list, discarding anything already stored there —
                    # confirm keys never mix dict and list children.
                    parent_node[prev_part] = []
                    node = parent_node[prev_part]
                while len(node) <= idx:
                    node.append({})
                parent_node = node
                node = node[idx]  # index into the list (upstream code wrongly used int(node))
            else:
                if part not in node:
                    node[part] = {}
                parent_node = node
                node = node[part]
            prev_part = part

        last_part = parts[-1]
        if last_part.isdigit():
            idx = int(last_part)
            if isinstance(node, dict):
                parent_node[prev_part] = []
                node = parent_node[prev_part]
            node.append(props[key])
        else:
            node[last_part] = props[key]

    return render_yaml(yaml_props)

def to_yml(content):
    """Render *content* as YAML; ``.yml`` is simply an alias for ``.yaml``."""
    rendered = to_yaml(content)
    return rendered

def to_properties(content):
    """Render parsed key/value pairs as ``key: value`` lines, sorted by key."""
    pairs = process_properties(content)
    lines = ["{}: {}\n".format(name, value) for name, value in sorted(pairs.items())]
    return "".join(lines)

def to_env(content):
    """Render parsed key/value pairs as ``KEY=VALUE`` lines, sorted by key."""
    entries = process_properties(content)
    return "".join("{}={}\n".format(name, value)
                   for name, value in sorted(entries.items()))

def to_sh(content):
    """Render parsed pairs as quoted ``export KEY="VALUE"`` shell lines."""
    entries = process_properties(content)
    return "".join("export {}=\"{}\"\n".format(name, value)
                   for name, value in sorted(entries.items()))

def to_cfg(content):
    """Render parsed key/value pairs as ``KEY=VALUE`` cfg lines, sorted."""
    entries = process_properties(content)
    rendered = [
        "{}={}\n".format(name, value)
        for name, value in sorted(entries.items())
    ]
    return "".join(rendered)

def to_conf(content):
    """Render parsed pairs as unquoted ``export KEY=VALUE`` lines, sorted."""
    entries = process_properties(content)
    return "".join("export {}={}\n".format(name, value)
                   for name, value in sorted(entries.items()))

def to_xml(content):
    """Render parsed pairs as a Hadoop-style ``<configuration>`` document."""
    pairs = process_properties(content)
    body = []
    for name in sorted(pairs.keys()):
        body.append(
            "<property><name>{0}</name><value>{1}</value></property>\n".format(
                name, pairs[name]))
    return "<configuration>\n" + "".join(body) + "</configuration>"

def process_properties(content, sep=': ', comment_char='#'):
    """Parse ``key: value`` lines into a dict.

    Blank lines and lines starting with *comment_char* are skipped.
    Only the first occurrence of *sep* splits the line; the value keeps
    any later separators and is stripped of surrounding whitespace and
    double quotes.
    """
    props = {}
    if not content:
        return props
    for raw_line in content.split("\n"):
        stripped = raw_line.strip()
        if not stripped or stripped.startswith(comment_char):
            continue
        if sep not in stripped:
            continue
        # partition() splits on the first separator only, so values may
        # themselves contain the separator.
        name, _, remainder = stripped.partition(sep)
        props[name.strip()] = remainder.strip().strip('"')
    return props
posted @ 2026-01-20 16:46  云上小朱  阅读(2)  评论(0)    收藏  举报