自定义制作 Docker 容器自动自愈(autoheal)镜像
包括:
- 完整的 autoheal.sh(支持每分钟检查一次、连续 5 次 unhealthy 才重启)
- Dockerfile
- docker-compose.yml
- 详细文档,包含参数说明、用法
1️⃣ autoheal.sh
#!/usr/bin/env bash
# autoheal.sh - configuration section.
#
# The script relies on `set -o pipefail`, `local` and `${var:offset:length}`,
# none of which plain POSIX sh guarantees, so it targets bash explicitly
# (the accompanying Dockerfile installs bash).
#
# Every setting below can be overridden through an environment variable of
# the same name; the value after `:-` is the default.

set -e          # abort on any unhandled command failure
set -o pipefail # a pipeline fails when any of its stages fails

# Path of the Docker daemon's Unix socket.
DOCKER_SOCK=${DOCKER_SOCK:-/var/run/docker.sock}
# Upper bound, in seconds, for a single curl request.
CURL_TIMEOUT=${CURL_TIMEOUT:-30}
# Optional webhook URL (empty by default).
WEBHOOK_URL=${WEBHOOK_URL:-""}

# Docker API base URL; the host part is a placeholder because requests are
# sent over the Unix socket given in UNIX_SOCK.
HTTP_ENDPOINT="http://localhost"
# curl option pair; expanded unquoted by docker_curl so it splits in two words.
UNIX_SOCK="--unix-socket ${DOCKER_SOCK}"

# Label selecting the monitored containers ("all" disables the label filter).
AUTOHEAL_CONTAINER_LABEL=${AUTOHEAL_CONTAINER_LABEL:-autoheal}
# Seconds to wait before the first check.
AUTOHEAL_START_PERIOD=${AUTOHEAL_START_PERIOD:-0}
# Seconds between checks (default: once per minute).
AUTOHEAL_INTERVAL=${AUTOHEAL_INTERVAL:-60}
# Stop timeout, in seconds, used when a container has no
# autoheal.stop.timeout label.
AUTOHEAL_DEFAULT_STOP_TIMEOUT=${AUTOHEAL_DEFAULT_STOP_TIMEOUT:-10}
# Number of unhealthy observations required before a restart (default: 5).
AUTOHEAL_MAX_FAILS=${AUTOHEAL_MAX_FAILS:-5}

# Counter store: one "<container id> <count>" line per container.
STATE_FILE="/tmp/autoheal_counts"
touch "$STATE_FILE"
# Run curl against the Docker daemon.
# Globals:   CURL_TIMEOUT (read), UNIX_SOCK (read)
# Arguments: any extra curl options, followed by the request URL
# Outputs:   whatever curl writes to stdout
# Returns:   curl's exit status
docker_curl() {
  # Prepend the fixed options to the caller's arguments.
  # shellcheck disable=SC2086 -- UNIX_SOCK must split into "--unix-socket" and the path
  set -- --max-time "${CURL_TIMEOUT}" --no-buffer -s ${UNIX_SOCK} "$@"
  curl "$@"
}
# Ask the Docker API for every container whose health status is "unhealthy".
# Globals:   AUTOHEAL_CONTAINER_LABEL (read), HTTP_ENDPOINT (read)
# Arguments: none
# Outputs:   the API response (as returned by docker_curl) on stdout
# Returns:   docker_curl's exit status
get_container_info() {
  local label_filter=""
  local url # kept local so the request URL does not leak into global scope

  # "all" means no label restriction; any other value selects only the
  # containers labelled <value>=true.
  if [ "$AUTOHEAL_CONTAINER_LABEL" != "all" ]; then
    label_filter=",\"label\":\[\"${AUTOHEAL_CONTAINER_LABEL}=true\"\]"
  fi

  # The backslashes before { } [ ] are deliberate and reach curl as-is:
  # they stop curl's URL globbing from interpreting the JSON punctuation.
  url="${HTTP_ENDPOINT}/containers/json?filters=\{\"health\":\[\"unhealthy\"\]${label_filter}\}"
  docker_curl "$url"
}
# Restart one container through the Docker API.
# Globals:   HTTP_ENDPOINT (read)
# Arguments: $1 - container id
#            $2 - value passed as the restart request's "t" query parameter
# Returns:   docker_curl's exit status (-f makes HTTP errors count as failure)
restart_container() {
  local id timeout endpoint
  id=$1
  timeout=$2
  endpoint="${HTTP_ENDPOINT}/containers/${id}/restart?t=${timeout}"
  docker_curl -f -X POST "$endpoint"
}
# Print the stored unhealthy counter for a container.
# Globals:   STATE_FILE (read)
# Arguments: $1 - container id
# Outputs:   the counter on stdout, or nothing when the id has no entry
# Returns:   0 whether or not an entry exists
#
# A single awk replaces the former `grep | awk`: grep exits 1 when nothing
# matches, which under `set -e` + `pipefail` made `COUNT=$(get_count ...)`
# abort the script the first time a container was seen unhealthy.
get_count() {
  # Appending "" forces a string comparison, so ids that happen to look like
  # numbers are never compared numerically.
  awk -v id="$1" '($1 "") == (id "") { print $2; exit }' "$STATE_FILE"
}
# Store the unhealthy counter for a container, replacing any previous entry.
# Globals:   STATE_FILE (read, replaced)
# Arguments: $1 - container id
#            $2 - new counter value
set_count() {
  local id="$1"
  local count="$2"
  local scratch="${STATE_FILE}.tmp"

  {
    # Copy every other entry. grep exits non-zero when no line is left
    # (or the file is unreadable); that is not an error here.
    grep -v "^${id} " "$STATE_FILE" 2>/dev/null || true
    echo "${id} ${count}"
  } > "$scratch"

  # Swap the rebuilt file into place.
  mv "$scratch" "$STATE_FILE"
}
# ---------------------------------------------------------------------------
# Entry point.
#
# When invoked as `autoheal.sh autoheal` and the Docker socket exists, poll
# the Docker API every AUTOHEAL_INTERVAL seconds and restart a container once
# it has been reported unhealthy in AUTOHEAL_MAX_FAILS consecutive polls.
# With any other arguments, exec them as a command instead.
# ---------------------------------------------------------------------------

# Post a notification to WEBHOOK_URL as the JSON body {"text": "<message>"}.
# Best effort: does nothing when WEBHOOK_URL is empty, and a failed delivery
# is logged but never stops the monitor.
# Arguments: $1 - message text
notify_webhook() {
  [ -n "$WEBHOOK_URL" ] || return 0
  local payload
  # jq builds the body so the message is always valid, escaped JSON.
  payload=$(jq -n --arg text "$1" '{text: $text}') || return 0
  curl --max-time "${CURL_TIMEOUT}" -s -o /dev/null \
    -X POST -H 'Content-Type: application/json' \
    -d "$payload" "$WEBHOOK_URL" \
    || echo "$(date '+%F %T') Webhook notification failed" >&2
}

# Keep only the STATE_FILE entries whose container id is listed (one id per
# line) in the given file. Dropping containers that were not reported in the
# current poll is what makes the counters "consecutive": a container that
# recovers starts again from zero.
# Arguments: $1 - file listing the ids seen in the current poll
prune_counts() {
  local seen_file="$1"
  awk -v seen_file="$seen_file" '
    BEGIN { while ((getline id < seen_file) > 0) seen[id] = 1 }
    ($1 in seen)
  ' "$STATE_FILE" > "${STATE_FILE}.tmp"
  mv "${STATE_FILE}.tmp" "$STATE_FILE"
}

# Sleep in the background and wait for it: a trapped signal interrupts `wait`
# immediately, whereas a foreground sleep would delay the trap until it ends.
# Arguments: $1 - seconds
interruptible_sleep() {
  sleep "$1" &
  wait "$!"
}

# Exit with 128 + 15 when asked to terminate.
trap 'exit 143' TERM

if [ "$1" = "autoheal" ] && [ -e "$DOCKER_SOCK" ]; then
  if [ "$AUTOHEAL_START_PERIOD" -gt 0 ]; then
    interruptible_sleep "$AUTOHEAL_START_PERIOD"
  fi

  SEEN_FILE="${STATE_FILE}.seen"
  # jq expression: the container's autoheal.stop.timeout label, or the default.
  STOP_TIMEOUT=".Labels[\"autoheal.stop.timeout\"] // $AUTOHEAL_DEFAULT_STOP_TIMEOUT"

  while true; do
    : > "$SEEN_FILE"

    # jq prints four lines per container (id, first name, state, timeout);
    # the loop consumes them four at a time.
    get_container_info \
      | jq -r ".[] | .Id, .Names[0], .State, (${STOP_TIMEOUT})" \
      | while read -r CONTAINER_ID \
          && read -r CONTAINER_NAME \
          && read -r CONTAINER_STATE \
          && read -r TIMEOUT; do
        CONTAINER_SHORT_ID=${CONTAINER_ID:0:12}
        DATE=$(date '+%F %T')
        # Remember the id so prune_counts keeps its counter for the next poll.
        echo "$CONTAINER_ID" >> "$SEEN_FILE"

        # A lookup that finds nothing must not abort the loop under `set -e`.
        COUNT=$(get_count "$CONTAINER_ID") || true
        COUNT=${COUNT:-0}

        if [ "$CONTAINER_NAME" = "null" ]; then
          echo "$DATE Container ${CONTAINER_SHORT_ID} name null, skip"
        elif [ "$CONTAINER_STATE" = "restarting" ]; then
          echo "$DATE Container $CONTAINER_NAME ($CONTAINER_SHORT_ID) restarting, skip"
        else
          COUNT=$((COUNT + 1))
          echo "$DATE Container $CONTAINER_NAME ($CONTAINER_SHORT_ID) unhealthy count=$COUNT"
          if [ "$COUNT" -ge "$AUTOHEAL_MAX_FAILS" ]; then
            echo "$DATE Container $CONTAINER_NAME ($CONTAINER_SHORT_ID) reached $AUTOHEAL_MAX_FAILS, restarting"
            if restart_container "$CONTAINER_ID" "$TIMEOUT"; then
              echo "$DATE Restarted container $CONTAINER_SHORT_ID ok"
              notify_webhook "Container $CONTAINER_NAME ($CONTAINER_SHORT_ID) was unhealthy and has been restarted"
            else
              echo "$DATE Restarted container $CONTAINER_SHORT_ID failed" >&2
              notify_webhook "Container $CONTAINER_NAME ($CONTAINER_SHORT_ID) is unhealthy and could not be restarted"
            fi
            # Start counting afresh after a restart attempt.
            COUNT=0
          fi
          set_count "$CONTAINER_ID" "$COUNT"
        fi
      done

    # Forget counters of containers that were not unhealthy in this poll.
    prune_counts "$SEEN_FILE"
    interruptible_sleep "$AUTOHEAL_INTERVAL"
  done
else
  exec "$@"
fi
2️⃣ Dockerfile
FROM alpine:3.19
# curl talks to the Docker API and jq parses its JSON responses; bash is
# installed alongside them.
RUN apk add --no-cache bash curl jq
COPY autoheal.sh /usr/local/bin/autoheal.sh
RUN chmod +x /usr/local/bin/autoheal.sh
# Health check for the autoheal container itself.
# The pattern is written as '[a]utoheal.sh' on purpose: this shell-form CMD is
# run through `/bin/sh -c "<command>"`, whose own command line would match a
# plain `pgrep -f autoheal` and make the check pass even when the script is
# gone. The bracketed form still matches the script but not the literal text
# of this command.
HEALTHCHECK --interval=5s --timeout=3s --retries=3 CMD pgrep -f '[a]utoheal.sh' || exit 1
ENTRYPOINT ["/usr/local/bin/autoheal.sh"]
CMD ["autoheal"]
构建:
docker build -t my-autoheal .
3️⃣ docker-compose.yml
version: "3.8"
services:
  autoheal:
    image: my-autoheal:latest
    container_name: autoheal
    restart: unless-stopped
    volumes:
      # The script reaches the Docker API through this socket.
      - /var/run/docker.sock:/var/run/docker.sock
    environment:
      # === Example settings ===
      - DOCKER_SOCK=/var/run/docker.sock
      - AUTOHEAL_CONTAINER_LABEL=all
      - AUTOHEAL_START_PERIOD=0
      - AUTOHEAL_INTERVAL=60
      - AUTOHEAL_MAX_FAILS=5
      - AUTOHEAL_DEFAULT_STOP_TIMEOUT=10
      # - WEBHOOK_URL=https://your-webhook
启动:
docker-compose up -d
4️⃣ 参数说明
| 环境变量 | 说明 | 默认值 |
|---|---|---|
| DOCKER_SOCK | Docker Daemon 的 Unix socket 路径(本脚本只实现了 Unix socket 方式;TCP 地址需自行扩展,见下文) | /var/run/docker.sock |
| AUTOHEAL_CONTAINER_LABEL | 要监控的容器标签:autoheal(只监控打了 autoheal=true 标签的容器),all(所有容器) | autoheal |
| AUTOHEAL_START_PERIOD | 容器启动后延迟多少秒再开始监控 | 0 |
| AUTOHEAL_INTERVAL | 检查间隔秒数 | 60 |
| AUTOHEAL_MAX_FAILS | 连续多少次 unhealthy 才重启 | 5 |
| AUTOHEAL_DEFAULT_STOP_TIMEOUT | 停止容器的超时时间(秒) | 10 |
| WEBHOOK_URL | 重启成功/失败时发送通知的 Webhook URL(可选) | 空 |
如果用 TCP 远程 Docker API,还可以设置证书路径(自己扩展 CA_CERT、CLIENT_CERT、CLIENT_KEY 等)。
5️⃣ 使用示例
你的业务容器:
services:
  myapp:
    image: myapp:latest
    # autoheal only queries containers whose health status is "unhealthy", so
    # the image (HEALTHCHECK) or this service (healthcheck:) must define a
    # health check for the container to ever be picked up.
    labels:
      - autoheal=true  # required when AUTOHEAL_CONTAINER_LABEL=autoheal
再用上面的 autoheal 服务即可实现:
- 每分钟检查一次容器健康
- 连续 5 次 unhealthy 才重启
- 重启成功/失败可通过 Webhook 通知

浙公网安备 33010602011771号