docker 容器健康检测与重启
docker 容器健康检测与重启
check_health.py
import docker
import time
import logging
# ============================
# Logger configuration
# ============================
# Dedicated "autoheal" logger at INFO level; every record is written through
# a single StreamHandler using the timestamped format below.
logger = logging.getLogger("autoheal")
logger.setLevel(logging.INFO)
fmt = logging.Formatter("%(asctime)s [%(levelname)s] [autoheal] %(message)s")
ch = logging.StreamHandler()
ch.setFormatter(fmt)
logger.addHandler(ch)
# ============================
# Docker client
# ============================
# Connects to the Docker daemon through the local Unix socket, so the socket
# must be reachable at this path from wherever the script runs.
client = docker.DockerClient(base_url='unix://var/run/docker.sock')
# Seconds slept before each re-check of a container that was found unhealthy.
CHECK_INTERVAL = 60
# Number of re-checks performed before an unhealthy container is restarted.
RETRY_TIMES = 3
def get_health(container):
    """Return a coarse health/state label for *container*.

    The container's attributes are refreshed first, then its ``State`` is
    mapped to one of:

    * ``"stopped"`` - container status is ``exited``, ``dead`` or ``created``
      (no point in evaluating health for a container that is not running);
    * ``"no-healthcheck"`` - the state carries no (or an empty) ``Health``
      entry;
    * the raw health status reported by Docker
      (``healthy`` / ``unhealthy`` / ``starting``);
    * ``"error: <message>"`` - anything went wrong while refreshing or
      reading the state (for example the container vanished meanwhile).

    Args:
        container: Object exposing ``reload()`` and an ``attrs`` mapping,
            as the Docker SDK container objects do.

    Returns:
        str: One of the labels described above. This function never raises.
    """
    try:
        container.reload()  # refresh the cached attributes
        state = container.attrs["State"]

        # A stopped container has no meaningful health to evaluate.
        if state.get("Status") in ("exited", "dead", "created"):
            return "stopped"

        # Treat a missing, null or empty Health entry alike: the container
        # simply has no health check to report on.
        health = state.get("Health")
        if not health:
            return "no-healthcheck"

        return health["Status"]  # healthy / unhealthy / starting
    except Exception as e:
        # Best effort by design: the caller only logs the label and moves on.
        return f"error: {e}"
def check_and_restart():
    """Run one health pass over every container and restart the stuck ones.

    All containers (running or not) are listed and classified with
    ``get_health``:

    * stopped containers and containers without a health check are logged
      and skipped;
    * healthy containers, and any other non-``unhealthy`` label (for example
      ``starting`` or an ``error: ...`` label), are left alone;
    * an unhealthy container is re-checked up to ``RETRY_TIMES`` times,
      sleeping ``CHECK_INTERVAL`` seconds before each re-check. It is
      restarted only if none of the re-checks reports ``healthy`` or
      ``stopped``.

    Note that the re-check sleeps run inline, so one unhealthy container
    delays the checks of the containers that follow it in the list.

    A failing restart is logged and does not abort the pass, so the
    remaining containers are still checked.
    """
    containers = client.containers.list(all=True)
    for container in containers:
        name = container.name
        status = get_health(container)

        # ------- report the current state -------
        logger.info(f"[{name}] Current health/state: {status}")

        # ------- stopped containers are not handled -------
        if status == "stopped":
            logger.info(f"[{name}] Container stopped (Exited). Skip.")
            continue

        # ------- no health check configured: skip -------
        if status == "no-healthcheck":
            logger.info(f"[{name}] No HEALTHCHECK. Skip.")
            continue

        # ------- healthy / starting / error: nothing to do -------
        if status != "unhealthy":
            continue

        # ------- unhealthy: re-check before restarting -------
        logger.warning(f"[{name}] Unhealthy! Starting retry checks...")
        for retry in range(1, RETRY_TIMES + 1):
            time.sleep(CHECK_INTERVAL)
            status = get_health(container)
            logger.info(f"[{name}] Retry {retry}/{RETRY_TIMES}: {status}")

            # The container may have been stopped in the meantime.
            if status == "stopped":
                logger.info(f"[{name}] Container is stopped during retry. Skip restart.")
                break
            if status == "healthy":
                logger.info(f"[{name}] Back to healthy. No restart needed.")
                break
        else:
            # for/else: reached only when no retry hit a `break`, i.e. the
            # container never recovered (or stopped) during the retries.
            logger.warning(f"[{name}] Still unhealthy after {RETRY_TIMES} retries. Restarting...")
            try:
                container.restart()
            except Exception:
                # One container failing to restart must not abort the pass
                # for the others; record the traceback and carry on.
                logger.exception(f"[{name}] Restart failed.")
            else:
                logger.info(f"[{name}] Restart issued.")
if __name__ == "__main__":
    # Service entry point: run health passes forever.
    logger.info("=== Autoheal service started ===")
    while True:
        try:
            check_and_restart()
        except Exception:
            # A transient failure (for example the Docker daemon being
            # briefly unreachable) must not kill a long-running service;
            # log the traceback and try again on the next iteration.
            logger.exception("Health check pass failed. Will retry.")
        time.sleep(10)  # pause 10 s between passes
Dockerfile
# Slim Python base image for the health-check script.
FROM python:3.10-slim
# The Docker SDK for Python is the script's only third-party dependency.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir docker
COPY check_health.py /check_health.py
CMD ["python", "/check_health.py"]
构建镜像
docker build -t health-check:v1.0 .
启动检测容器
# Start the checker detached under the name "health-checker".
# The Docker socket is mounted because the script connects to the daemon via
# unix://var/run/docker.sock; /etc/localtime is mounted presumably so that log
# timestamps follow the host's time zone (confirm).
docker run -d \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /etc/localtime:/etc/localtime \
--name health-checker \
health-check:v1.0

浙公网安备 33010602011771号