docker 容器健康检测与重启

docker 容器健康检测与重启

check_health.py

import docker
import time
import logging

# ============================
# Logger configuration
# ============================
# Dedicated "autoheal" logger; INFO level so every per-container status line
# emitted by the check loop is visible.
logger = logging.getLogger("autoheal")
logger.setLevel(logging.INFO)

# Each record is rendered as: timestamp [LEVEL] [autoheal] message
fmt = logging.Formatter("%(asctime)s [%(levelname)s] [autoheal] %(message)s")
ch = logging.StreamHandler()
ch.setFormatter(fmt)
logger.addHandler(ch)

# ============================
# Docker client
# ============================
# Talks to the Docker daemon through the local Unix socket.
client = docker.DockerClient(base_url='unix://var/run/docker.sock')

# Seconds to wait between two consecutive re-checks of an unhealthy container.
CHECK_INTERVAL = 60
# Number of re-checks an unhealthy container gets before it is restarted.
RETRY_TIMES = 3


def get_health(container):
    """Return a short status string describing *container*.

    The container is reloaded first so the decision is based on fresh state.

    Possible results:
      * ``"stopped"``        - the container state is exited, dead or created
      * ``"no-healthcheck"`` - the state carries no ``Health`` section
      * the reported health status (healthy / unhealthy / starting)
      * ``"error: <message>"`` - anything went wrong while inspecting it
    """
    try:
        container.reload()  # refresh the cached attributes
        info = container.attrs["State"]

        # A container that is not running has no meaningful health to report.
        if info.get("Status") in ("exited", "dead", "created"):
            return "stopped"

        if "Health" not in info:
            return "no-healthcheck"

        return info["Health"]["Status"]
    except Exception as exc:
        # Best effort: report the failure as a status instead of raising.
        return f"error: {exc}"


def check_and_restart():
    """Run one health pass over all containers, restarting persistent failures.

    For every container (running or not) the current status is logged.
    Stopped containers, containers without a HEALTHCHECK and healthy
    containers are left alone. An unhealthy container is re-checked up to
    ``RETRY_TIMES`` times, sleeping ``CHECK_INTERVAL`` seconds before each
    re-check, and is restarted only if it is still unhealthy on every one of
    them. The retries block the pass, so a single unhealthy container can
    hold it for up to ``RETRY_TIMES * CHECK_INTERVAL`` seconds.

    Docker API failures (listing containers, restarting) are logged and do
    not propagate, so one broken container cannot take the checker down.
    """
    try:
        containers = client.containers.list(all=True)
    except Exception:
        logger.exception("Failed to list containers. Skipping this pass.")
        return

    for container in containers:
        name = container.name

        status = get_health(container)

        # ------- report the current state -------
        logger.info(f"[{name}] Current health/state: {status}")

        # ------- stopped containers are not touched -------
        if status == "stopped":
            logger.info(f"[{name}] Container stopped (Exited). Skip.")
            continue

        # ------- no health check configured, nothing to evaluate -------
        if status == "no-healthcheck":
            logger.info(f"[{name}] No HEALTHCHECK. Skip.")
            continue

        # ------- healthy, still starting, or inspection error: no action -------
        if status != "unhealthy":
            continue

        # ------- unhealthy: re-check before deciding to restart -------
        logger.warning(f"[{name}] Unhealthy! Starting retry checks...")

        for retry in range(1, RETRY_TIMES + 1):
            time.sleep(CHECK_INTERVAL)
            status = get_health(container)

            logger.info(f"[{name}] Retry {retry}/{RETRY_TIMES}: {status}")

            # The container may have been stopped in the meantime.
            if status == "stopped":
                logger.info(f"[{name}] Container is stopped during retry. Skip restart.")
                break

            if status == "healthy":
                logger.info(f"[{name}] Back to healthy. No restart needed.")
                break

            # Any other non-unhealthy result (inspection error because the
            # container was removed, "starting" after a manual restart, ...)
            # is not evidence of continued unhealthiness, so do not restart.
            if status != "unhealthy":
                logger.warning(f"[{name}] Status changed to '{status}' during retry. Skip restart.")
                break
        else:
            # Every re-check was unhealthy: restart the container.
            logger.warning(f"[{name}] Still unhealthy after {RETRY_TIMES} retries. Restarting...")
            try:
                container.restart()
            except Exception:
                # Keep processing the remaining containers even if this fails.
                logger.exception(f"[{name}] Restart failed.")
            else:
                logger.info(f"[{name}] Restart issued.")


if __name__ == "__main__":
    logger.info("=== Autoheal service started ===")
    while True:
        try:
            check_and_restart()
        except Exception:
            # A failed pass (e.g. the Docker daemon is briefly unreachable)
            # must not end the service; log it and try again next cycle.
            # KeyboardInterrupt/SystemExit are not caught and still stop it.
            logger.exception("Health check pass failed. Will retry on next cycle.")
        time.sleep(10)   # pause 10s between passes of the main loop

Dockerfile

FROM python:3.10-slim

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir docker

COPY check_health.py /check_health.py

CMD ["python", "/check_health.py"]

构建镜像

# Build the image, tagged health-check:v1.0, using the current directory as the build context.
docker build -t health-check:v1.0 .

启动检测容器

# --restart keeps the checker itself running after a crash or a daemon restart.
# The Docker socket is mounted so the script can query and restart containers;
# /etc/localtime is mounted read-only because the container only reads it.
docker run -d \
  --restart unless-stopped \
  -v /var/run/docker.sock:/var/run/docker.sock \
  -v /etc/localtime:/etc/localtime:ro \
  --name health-checker \
  health-check:v1.0
posted @ 2025-12-09 10:08  蒲公英PGY  阅读(5)  评论(0)    收藏  举报