容器镜像构建-Hadoop基础镜像
说明
有安全加固或其他需求的,可自定义基础镜像用于Hadoop镜像构建。Hadoop基础镜像用于构建Hadoop镜像。
目录结构
.
├── Dockerfile.ubuntu25 # 基于Ubuntu25.10,python3,JDK11.
├── README.md
└── scripts # 已改造适配python3。
├── envtoconf.py
├── krb5.conf
├── starter.sh
└── transformation.py
构建基础镜像
参考Hadoop官方说明
在本目录进行构建。因Hadoop官方项目长期未更新Dockerfile,已做部分改动。请自行对比官方原代码判断是否要调整。
请在联网环境构建镜像。构建过程需要访问 Ubuntu 软件源、PyPI、GitHub 和 Maven 中央仓库,建议使用能稳定访问这些站点的网络(例如海外网络),以规避软件下载异常问题。
JDK版本选择说明
参考Hadoop官方说明,JDK8支持编译和运行hadoop当前所有版本,Hadoop v3.3及更高版本可使用JDK11运行。
本文提供的Dockerfile.ubuntu25默认使用JDK11(安装openjdk-11-jdk,JAVA_HOME指向java-11-openjdk-amd64);若需使用JDK8,需编辑Dockerfile同时修改JDK安装包与JAVA_HOME。
执行构建命令
命令格式
docker build -t hadoop:runner-v1 -f Dockerfile.ubuntu25 .
Dockerfile.ubuntu25内容如下
# 1. Base image: Ubuntu 25.10
FROM ubuntu:25.10

# Keep apt from prompting during the build
ENV DEBIAN_FRONTEND=noninteractive

# 2. Remove whatever built-in account/group currently owns UID/GID 1000.
#    The name is looked up with getent, so this works even when the
#    account is not called 'ubuntu'. UID/GID 1000 is reused for the
#    hadoop user created in step 9.
RUN if getent passwd 1000; then \
        userdel -f $(getent passwd 1000 | cut -d: -f1); \
    fi && \
    if getent group 1000; then \
        groupdel $(getent group 1000 | cut -d: -f1); \
    fi

# 3. Refresh the package index and install the runtime tooling
RUN apt-get update && apt-get install -y \
        sudo \
        python3-pip \
        python3-venv \
        python-is-python3 \
        wget \
        curl \
        netcat-openbsd \
        jq \
        openjdk-11-jdk \
        krb5-user \
        ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# 4. Install Robot Framework
#    (python-is-python3 makes /usr/bin/python point at python3)
RUN pip install --break-system-packages robotframework

# 5. Download dumb-init, used as PID 1 by the ENTRYPOINT below
RUN wget -O /usr/local/bin/dumb-init https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_x86_64 && \
    chmod +x /usr/local/bin/dumb-init

# 6. Keytab directories and Byteman agent
#    starter.sh defaults KEYTAB_DIR to /etc/security/keytabs, so that
#    directory must exist and be writable by the non-root hadoop user as
#    well; /opt/security/keytabs is kept for setups that point KEYTAB_DIR
#    at it.
RUN mkdir -p /opt/security/keytabs /etc/security/keytabs && \
    chmod -R a+wr /opt/security/keytabs /etc/security/keytabs
ADD https://repo.maven.apache.org/maven2/org/jboss/byteman/byteman/4.0.4/byteman-4.0.4.jar /opt/byteman.jar
RUN chmod o+r /opt/byteman.jar

# 7. Install async-profiler
RUN mkdir -p /opt/profiler && \
    cd /opt/profiler && \
    curl -L https://github.com/jvm-profiling-tools/async-profiler/releases/download/v1.5/async-profiler-1.5-linux-x64.tar.gz | tar xvz

# 8. Environment variables
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
ENV PATH=$PATH:/opt/hadoop/bin
ENV PYTHONPATH=/opt/scripts

# 9. Create the hadoop user and group (UID/GID 1000)
RUN groupadd --gid 1000 hadoop && \
    useradd --uid 1000 hadoop --gid 1000 --home /opt/hadoop --create-home --shell /bin/bash
# Passwordless sudo for hadoop: starter.sh uses sudo to write /etc/krb5.conf
RUN echo "hadoop ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

# 10. Add the helper scripts
#     Create the target directory first, then ADD the scripts into it.
RUN mkdir -p /opt/scripts
ADD scripts /opt/scripts/
# Make the shell and Python scripts executable
RUN chmod +x /opt/scripts/*.sh /opt/scripts/*.py

# 11. Prepare the Hadoop home, log and data directories
RUN mkdir -p /opt/hadoop /var/log/hadoop && \
    chmod 1777 /opt/hadoop /var/log/hadoop && \
    chown -R hadoop:hadoop /opt/hadoop
ENV HADOOP_LOG_DIR=/var/log/hadoop
ENV HADOOP_CONF_DIR=/opt/hadoop
WORKDIR /opt/hadoop
RUN mkdir /data && chmod 1777 /data && chown hadoop:hadoop /data

# 12. Runtime configuration: run as hadoop, with dumb-init wrapping starter.sh
USER hadoop
ENTRYPOINT ["/usr/local/bin/dumb-init", "--", "/opt/scripts/starter.sh"]
scripts目录下
bashrc内容如下
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Interactive prompt of the form "user@host: cwd> "
# (\u = user name, \h = host name, \w = current working directory).
PS1="\u@\h: \w> "
envtoconf.py内容如下
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""convert environment variables to config"""
import os
import re
import argparse
import sys
import transformation
class Simple(object):
    """Convert environment variables into configuration files.

    An environment variable named ``<NAME>.<EXT>_<key>`` (or
    ``<NAME>_<EXT>_<key>``) whose ``<EXT>`` is one of ``known_formats``
    contributes the entry ``<key>: <value>`` to ``<name>.<ext>.raw`` in the
    destination directory. ``<EXT>!<FMT>`` selects a transformer
    (``transformation.to_<fmt>``) different from the file extension.
    ``transform`` then renders every ``.raw`` file into its final file.
    """

    # Environment variable names are split on "_" and "."; compiled once
    # instead of once per variable.
    _KEY_SEPARATOR = re.compile("[_\\.]")

    def __init__(self, args):
        """Parse the command line.

        Args:
            args: argument list (without the program name); must contain
                ``--destination <dir>``.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument("--destination", help="Destination directory", required=True)
        self.args = parser.parse_args(args=args)
        self.known_formats = ['xml', 'properties', 'yaml', 'yml', 'env', "sh", "cfg", 'conf']
        self.output_dir = self.args.destination
        # HADOOP_CONF_DIR would otherwise parse as name "hadoop", format "conf".
        self.excluded_envs = ['HADOOP_CONF_DIR']
        # Maps configurable name -> (file extension, transformer format).
        self.configurables = {}

    def destination_file_path(self, name, extension):
        """Return ``<output_dir>/<name>.<extension>``."""
        return os.path.join(self.output_dir, "{}.{}".format(name, extension))

    def write_env_var(self, name, extension, key, value):
        """Append ``key: value`` to the ``.raw`` file of ``name.extension``."""
        file_path = self.destination_file_path(name, extension) + ".raw"
        # Explicit UTF-8 so the result does not depend on the container locale.
        with open(file_path, "a", encoding='utf-8') as myfile:
            myfile.write("{}: {}\n".format(key, value))

    def process_envs(self):
        """Scan the environment and write the matching entries to ``.raw`` files.

        Variables are visited in sorted order so the result is predictable;
        because "." sorts before "_", ``NAME.EXT_key`` variables are seen
        (and register the configurable ``name``) before ``NAME_key`` ones.
        """
        for key in sorted(os.environ):
            if key in self.excluded_envs:
                continue
            parts = self._KEY_SEPARATOR.split(key)
            name = parts[0].lower()
            extension = None
            config_key = None
            if len(parts) > 1:
                extension = parts[1].lower()
                # Skip "<name><sep><extension><sep>" to get the config key.
                config_key = key[len(name) + len(extension) + 2:].strip()
            if extension and "!" in extension:
                # "<ext>!<fmt>": file extension and transformer differ.
                splitted = extension.split("!")
                extension = splitted[0]
                fmt = splitted[1]
                config_key = key[len(name) + len(extension) + len(fmt) + 3:].strip()
            else:
                fmt = extension

            if extension and extension in self.known_formats:
                if name not in self.configurables:
                    # First entry for this configurable: start from an empty raw file.
                    raw_path = self.destination_file_path(name, extension) + ".raw"
                    with open(raw_path, "w", encoding='utf-8') as myfile:
                        myfile.write("")
                self.configurables[name] = (extension, fmt)
                self.write_env_var(name, extension, config_key, os.environ[key])
            else:
                # No known format marker: attach the variable to every
                # already-registered configurable whose name prefixes it.
                for configurable_name in self.configurables:
                    if key.lower().startswith(configurable_name.lower()):
                        ext, _ = self.configurables[configurable_name]
                        self.write_env_var(configurable_name,
                                           ext,
                                           key[len(configurable_name) + 1:],
                                           os.environ[key])

    def transform(self):
        """Render every ``.raw`` file into its final configuration file.

        A configurable whose format has no ``transformation.to_<fmt>``
        function is reported on stderr and skipped. Errors raised by a
        transformer itself are not swallowed.
        """
        for name in sorted(self.configurables):
            extension, fmt = self.configurables[name]
            destination_path = self.destination_file_path(name, extension)
            raw_path = destination_path + ".raw"
            if not os.path.exists(raw_path):
                continue
            # Only the lookup may legitimately fail; an AttributeError raised
            # inside a transformer must not be reported as "not found".
            transformer_func = getattr(transformation, "to_" + fmt, None)
            if transformer_func is None:
                print("Error: No transformer found for format '{}'".format(fmt), file=sys.stderr)
                continue
            with open(raw_path, "r", encoding='utf-8') as myfile:
                content = myfile.read()
            # Transform before opening the destination so a failing
            # transformer does not leave a truncated file behind.
            content = transformer_func(content)
            with open(destination_path, "w", encoding='utf-8') as myfile:
                myfile.write(content)

    def main(self):
        """Collect the environment into raw files, then render them."""
        self.process_envs()
        self.transform()
def main():
    """Command-line entry point: build a converter from argv and run it."""
    converter = Simple(sys.argv[1:])
    converter.main()


if __name__ == '__main__':
    main()
krb5.conf内容如下
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Log file locations for the Kerberos library, KDC and admin server.
[logging]
default = FILE:/var/log/krb5libs.log
kdc = FILE:/var/log/krb5kdc.log
admin_server = FILE:/var/log/kadmind.log
# Library defaults; the default realm is EXAMPLE.COM.
[libdefaults]
dns_canonicalize_hostname = false
dns_lookup_realm = false
ticket_lifetime = 24h
renew_lifetime = 7d
forwardable = true
rdns = false
default_realm = EXAMPLE.COM
# "SERVER" below is a placeholder: starter.sh substitutes it with
# $KERBEROS_SERVER (sed "s/SERVER/.../g") when it writes /etc/krb5.conf.
[realms]
EXAMPLE.COM = {
kdc = SERVER
admin_server = SERVER
}
# Map the example.com domain and its subdomains to the EXAMPLE.COM realm.
[domain_realm]
.example.com = EXAMPLE.COM
example.com = EXAMPLE.COM
starter.sh内容如下
#!/usr/bin/env bash
# Container entry script: optional waits, Kerberos keytab download,
# environment-to-config conversion, optional HDFS/Ozone initialisation and
# Byteman instrumentation, then exec of the command passed as arguments.
set -e

# Directory containing this script (envtoconf.py and krb5.conf live next to it)
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# 1. Optional start-up delay
if [ -n "$SLEEP_SECONDS" ]; then
    echo "Sleeping for $SLEEP_SECONDS seconds"
    sleep "$SLEEP_SECONDS"
fi

# 2. Optionally wait until WAITFOR (host:port) accepts connections
if [ -n "$WAITFOR" ]; then
    echo "Waiting for the service $WAITFOR"
    WAITFOR_HOST=$(printf "%s\n" "$WAITFOR"| cut -d : -f 1)
    WAITFOR_PORT=$(printf "%s\n" "$WAITFOR"| cut -d : -f 2)
    # Track success explicitly: judging by the loop counter would report a
    # timeout when the service comes up on the very last attempt.
    WAITFOR_READY=false
    for _ in $(seq "${WAITFOR_TIMEOUT:-300}" -1 0) ; do
        # A command tested by `if` does not trigger `set -e` on failure.
        if nc -z "$WAITFOR_HOST" "$WAITFOR_PORT" > /dev/null 2>&1; then
            WAITFOR_READY=true
            break
        fi
        sleep 1
    done
    if [ "$WAITFOR_READY" != true ]; then
        echo "Waiting for service $WAITFOR is timed out." >&2
        exit 1
    fi
fi

# 3. Kerberos setup: wait for the keytab issuer, download keytabs, write krb5.conf
if [ -n "$KERBEROS_ENABLED" ]; then
    echo "Setting up kerberos!!"
    KERBEROS_SERVER=${KERBEROS_SERVER:-krb5}
    ISSUER_SERVER=${ISSUER_SERVER:-$KERBEROS_SERVER:8081}
    echo "KDC ISSUER_SERVER => $ISSUER_SERVER"

    if [ -n "$SLEEP_SECONDS" ]; then
        echo "Sleeping for ${SLEEP_SECONDS} seconds"
        sleep "${SLEEP_SECONDS}"
    fi

    KEYTAB_DIR=${KEYTAB_DIR:-/etc/security/keytabs}

    # Poll the issuer until it answers HTTP 200
    while true; do
        set +e
        STATUS=$(curl -s -o /dev/null -w '%{http_code}' http://"$ISSUER_SERVER"/keytab/test/test)
        set -e
        if [ "$STATUS" -eq 200 ]; then
            echo "Got 200, KDC service ready!!"
            break
        else
            echo "Got $STATUS :( KDC service not ready yet..."
        fi
        sleep 5
    done

    HOST_NAME=$(hostname -f)
    export HOST_NAME
    # KERBEROS_KEYTABS is a whitespace-separated list, hence unquoted
    for NAME in ${KERBEROS_KEYTABS}; do
        echo "Download $NAME/$HOST_NAME@EXAMPLE.COM keytab file to $KEYTAB_DIR/$NAME.keytab"
        wget -q "http://$ISSUER_SERVER/keytab/$HOST_NAME/$NAME" -O "$KEYTAB_DIR/$NAME.keytab"
        klist -kt "$KEYTAB_DIR/$NAME.keytab"
    done

    # Fill the SERVER placeholder of the bundled krb5.conf and install it
    # (root-owned target, hence sudo tee)
    sed "s/SERVER/$KERBEROS_SERVER/g" "$DIR"/krb5.conf | sudo tee /etc/krb5.conf > /dev/null
fi

# 4. Make /data accessible (it may be a mounted Docker volume)
sudo chmod o+rwx /data

# 5. Convert environment variables into configuration files (Python 3)
python3 "$DIR"/envtoconf.py --destination "${HADOOP_CONF_DIR:-/opt/hadoop/etc/hadoop}"

# 6. Hadoop/Ozone initialisation
if [ -n "$ENSURE_NAMENODE_DIR" ]; then
    CLUSTERID_OPTS=""
    if [ -n "$ENSURE_NAMENODE_CLUSTERID" ]; then
        CLUSTERID_OPTS="-clusterid $ENSURE_NAMENODE_CLUSTERID"
    fi
    if [ ! -d "$ENSURE_NAMENODE_DIR" ]; then
        # CLUSTERID_OPTS is intentionally unquoted: it expands to two words or none
        /opt/hadoop/bin/hdfs namenode -format -force $CLUSTERID_OPTS
    fi
fi

if [ -n "$ENSURE_STANDBY_NAMENODE_DIR" ]; then
    if [ ! -d "$ENSURE_STANDBY_NAMENODE_DIR" ]; then
        /opt/hadoop/bin/hdfs namenode -bootstrapStandby
    fi
fi

# Ozone initialisation; the second form of each command is tried if the first fails
if [ -n "$ENSURE_SCM_INITIALIZED" ]; then
    if [ ! -f "$ENSURE_SCM_INITIALIZED" ]; then
        /opt/hadoop/bin/ozone scm --init || /opt/hadoop/bin/ozone scm -init
    fi
fi

if [ -n "$ENSURE_OM_INITIALIZED" ]; then
    if [ ! -f "$ENSURE_OM_INITIALIZED" ]; then
        /opt/hadoop/bin/ozone om --init || /opt/hadoop/bin/ozone om -createObjectStore
    fi
fi

# 7. Byteman instrumentation
if [ -n "$BYTEMAN_SCRIPT" ] || [ -n "$BYTEMAN_SCRIPT_URL" ]; then
    # Give BYTEMAN_DIR a default so the PATH entry below is well-formed
    BYTEMAN_DIR=${BYTEMAN_DIR:-/opt/profiler}
    export PATH=$PATH:$BYTEMAN_DIR/bin

    if [ -n "$BYTEMAN_SCRIPT_URL" ]; then
        # Quoted so URLs containing "&", "?" or spaces are passed intact
        sudo wget -q "$BYTEMAN_SCRIPT_URL" -O /tmp/byteman.btm
        export BYTEMAN_SCRIPT=/tmp/byteman.btm
    fi

    if [ ! -f "$BYTEMAN_SCRIPT" ]; then
        echo "ERROR: The defined $BYTEMAN_SCRIPT does not exist!!!" >&2
        exit 1
    fi

    AGENT_STRING="-javaagent:/opt/byteman.jar=script:$BYTEMAN_SCRIPT"
    export HADOOP_OPTS="$AGENT_STRING $HADOOP_OPTS"
    echo "Process is instrumented with $AGENT_STRING"
fi

# Replace this shell with the command passed as arguments (the image CMD)
exec "$@"
transformation.py内容如下
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""This module transform properties into different format"""
def render_yaml(yaml_root, prefix=""):
    """Render a nested dict/list/scalar structure as YAML-like text.

    Dict keys are emitted in sorted order so the output is stable. ``prefix``
    is the indentation string placed in front of each key or list marker.
    """
    if isinstance(yaml_root, dict):
        # A nested mapping starts on the line after its parent key.
        chunks = ["\n"] if prefix else []
        child_prefix = prefix + " "
        for name in sorted(yaml_root):
            rendered_child = render_yaml(yaml_root[name], child_prefix)
            chunks.append("{}{}: {}".format(prefix, name, rendered_child))
        return "".join(chunks)

    if isinstance(yaml_root, list):
        item_prefix = prefix + " "
        rendered_items = [
            prefix + " - " + render_yaml(entry, item_prefix)
            for entry in yaml_root
        ]
        return "\n" + "".join(rendered_items)

    # Scalar leaf: its string form followed by a newline.
    return "{}\n".format(yaml_root)
def to_yaml(content):
    """Transform ``key: value`` property lines into nested YAML text.

    Each dotted key is split on "."; non-numeric segments become mapping
    keys, numeric segments denote list positions. Values whose last segment
    is numeric are appended to the list in key order, so keys are processed
    with numeric segments compared as integers (``a.2`` before ``a.10``);
    a plain string sort would append item 10 before item 2.
    """
    def natural_order(dotted_key):
        # Numeric segments sort numerically and before textual segments.
        return [(0, int(part), "") if part.isdigit() else (1, 0, part)
                for part in dotted_key.split(".")]

    props = process_properties(content)
    yaml_props = {}
    for key in sorted(props, key=natural_order):
        parts = key.split(".")
        node = yaml_props
        prev_part = None
        parent_node = {}
        for part in parts[:-1]:
            if part.isdigit():
                idx = int(part)
                if isinstance(node, dict):
                    # First numeric segment under this key: turn the
                    # placeholder mapping into a list.
                    parent_node[prev_part] = []
                    node = parent_node[prev_part]
                while len(node) <= idx:
                    node.append({})
                parent_node = node
                node = node[idx]
            else:
                if part not in node:
                    node[part] = {}
                parent_node = node
                node = node[part]
            prev_part = part
        last_part = parts[-1]
        if last_part.isdigit():
            if isinstance(node, dict):
                parent_node[prev_part] = []
                node = parent_node[prev_part]
            node.append(props[key])
        else:
            node[last_part] = props[key]
    return render_yaml(yaml_props)
def to_yml(content):
    """Transform to YAML; same output as ``to_yaml`` (used for the ``yml`` format)."""
    rendered = to_yaml(content)
    return rendered
def to_properties(content):
    """Render the parsed properties as ``key: value`` lines, sorted by key."""
    props = process_properties(content)
    lines = ["{}: {}\n".format(name, props[name]) for name in sorted(props)]
    return "".join(lines)
def to_env(content):
    """Render the parsed properties as ``key=value`` lines, sorted by key."""
    props = process_properties(content)
    lines = ["{}={}\n".format(name, props[name]) for name in sorted(props)]
    return "".join(lines)
def to_sh(content):
    """Render the parsed properties as ``export key="value"`` lines, sorted by key.

    Values are inserted verbatim between the double quotes.
    """
    props = process_properties(content)
    lines = ["export {}=\"{}\"\n".format(name, props[name]) for name in sorted(props)]
    return "".join(lines)
def to_cfg(content):
    """Render the parsed properties as ``key=value`` lines, sorted by key."""
    props = process_properties(content)
    lines = ["{}={}\n".format(name, props[name]) for name in sorted(props)]
    return "".join(lines)
def to_conf(content):
    """Render the parsed properties as unquoted ``export key=value`` lines, sorted by key."""
    props = process_properties(content)
    lines = ["export {}={}\n".format(name, props[name]) for name in sorted(props)]
    return "".join(lines)
def to_xml(content):
    """Render the parsed properties as a ``<configuration>`` XML document.

    One ``<property>`` element is emitted per key, in sorted key order.
    NOTE(review): names and values are inserted verbatim, without XML
    escaping, so a value containing "&" or "<" must already be escaped by
    whoever sets it.
    """
    props = process_properties(content)
    row = "<property><name>{0}</name><value>{1}</value></property>\n"
    rows = [row.format(name, props[name]) for name in sorted(props)]
    return "<configuration>\n" + "".join(rows) + "</configuration>"
def process_properties(content, sep=': ', comment_char='#'):
    """Parse ``key<sep>value`` lines into a dict.

    Blank lines, lines starting with ``comment_char`` and lines without
    ``sep`` are skipped. Only the first ``sep`` splits a line; the value is
    stripped of surrounding whitespace and then of surrounding double
    quotes. Later occurrences of a key overwrite earlier ones.
    """
    parsed = {}
    if not content:
        return parsed
    for raw_line in content.split("\n"):
        entry = raw_line.strip()
        if not entry or entry.startswith(comment_char):
            continue
        if sep not in entry:
            continue
        name, _, remainder = entry.partition(sep)
        parsed[name.strip()] = remainder.strip().strip('"')
    return parsed

浙公网安备 33010602011771号