基于Prometheus-实现AWS EC2的实例异常自动重启

1.需求：1.当成百上千台EC2实例出现异常时，人工逐台重启需要花费大量时间 2.EC2自我恢复时间较长，需要人为干预 3.在发生重启和重启成功后发送告警提示，为故障排查保留充分证据。

2.目标：1.提升EC2自我恢复能力。2.降低人工运维成本，提升运维效率

3.重启前提条件：

log " - Prometheus 故障检测 + 60秒二次确认"

log " - 实例状态为 running"
log " - 系统状态不为 initializing"
log " - 环境标签在允许列表中：dev sit fat uat qa mirror等非生产环境标签"
log " - 5分钟内未重启过"

4. 实现方式

4.1 使用容器化运行

 1 # 使用轻量级 Alpine Linux 作为基础镜像
 2 FROM alpine:3.18
 3 
 4 # 安装必要的工具
 5 RUN apk update && apk add --no-cache \
 6     curl \
 7     jq \
 8     aws-cli \
 9     bash \
10     coreutils \
11     && rm -rf /var/cache/apk/*
12 
13 # 创建应用目录
14 RUN mkdir -p /data/api
15 WORKDIR /data/api
16 
17 # 复制脚本文件
18 COPY autorestartec2.sh /data/api/autorestartec2.sh
19 
20 # 设置脚本权限
21 RUN chmod +x /data/api/autorestartec2.sh
22 
23 
24 # 设置容器启动命令 - 直接运行脚本（单次执行）
25 CMD ["/data/api/autorestartec2.sh"]

Dockerfile

4.2 使用shell脚本实现其中逻辑(比较方便简单)

   1 #!/bin/bash
   2 
   3 # EC2 自动重启脚本 -调试使用方式:https://github.com/LiquidityTech/devops-infra-deploy-manifests
   4 
   5 set -e
   6 
   7 # 配置
   8 PROMETHEUS_URL="${PROMETHEUS_URL:-http://prometheus.test.com}"
   9 AWS_REGION="${AWS_REGION:-ap-northeast-1}"
  10 LOG_FILE="/tmp/ec2_auto_restart.log"
  11 RESTART_HISTORY_FILE="/tmp/ec2_restart_history.log"
  12 MINUTES_THRESHOLD=5  #5分钟内不重复重启
  13 
  14 # Lark Webhook 配置 - 在这里设置您的 Webhook URL
  15 LARK_WEBHOOK_URL="${LARK_WEBHOOK_URL:-https://open.larksuite.com/open-apis/bot/v2/hook/token}"
  16 LARK_NOTIFICATION_ENABLED="${LARK_NOTIFICATION_ENABLED:-true}"  # 默认启用通知
  17 
  18 # 允许重启的环境标签
  19 ALLOWED_ENVS=("dev" "sit" "fat" "qa" "uat" "mirror")
  20 
  21 # 等待配置
  22 MAX_WAIT_TIME=120
  23 WAIT_INTERVAL=10
  24 SECOND_CHECK_DELAY=60  # 第二次检测等待时间（秒）
  25 
  26 # 日志函数
  27 log() {
  28     echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" | tee -a "$LOG_FILE"
  29 }
  30 
  31 # 获取实例所有标签
  32 get_instance_all_tags() {
  33     local instance_id="$1"
  34     
  35     local tags_json
  36     tags_json=$(aws ec2 describe-tags \
  37         --region "$AWS_REGION" \
  38         --filters "Name=resource-id,Values=$instance_id" \
  39         --query 'Tags[].[Key,Value]' \
  40         --output json 2>/dev/null)
  41     
  42     if [ $? -ne 0 ] || [ -z "$tags_json" ]; then
  43         echo "无法ec2获取标签"
  44     fi
  45     
  46     # 将标签格式化为易读的字符串
  47     local tags_string
  48     tags_string=$(echo "$tags_json" | jq -r '.[] | "\(.[0]): \(.[1])"' 2>/dev/null | tr '\n' '; ' | sed 's/; $//')
  49     
  50     if [ -n "$tags_string" ]; then
  51         echo "$tags_string"
  52     else
  53         echo "无标签,请管理员配置"
  54     fi
  55     
  56     return 0
  57 }
  58 
  59 # Lark 通知函数
  60 send_lark_notification() {
  61     local title="$1"
  62     local content="$2"
  63     local notification_type="$3"  # info, success, warning, error
  64     
  65     if [ "$LARK_NOTIFICATION_ENABLED" != "true" ] || [ -z "$LARK_WEBHOOK_URL" ]; then
  66         log "Lark 通知未配置或已禁用，跳过发送通知"
  67         return 0
  68     fi
  69     
  70     # 根据通知类型设置颜色
  71     local color
  72     case "$notification_type" in
  73         "success")
  74             color="green"
  75             ;;
  76         "warning")
  77             color="orange"
  78             ;;
  79         "error")
  80             color="red"
  81             ;;
  82         *)
  83             color="blue"
  84             ;;
  85     esac
  86     
  87     local json_data
  88     json_data=$(cat <<EOF
  89 {
  90     "msg_type": "interactive",
  91     "card": {
  92         "config": {
  93             "wide_screen_mode": true
  94         },
  95         "header": {
  96             "title": {
  97                 "tag": "plain_text",
  98                 "content": "$title"
  99             },
 100             "template": "$color"
 101         },
 102         "elements": [
 103             {
 104                 "tag": "div",
 105                 "fields": [
 106                     {
 107                         "is_short": true,
 108                         "text": {
 109                             "tag": "lark_md",
 110                             "content": "**时间:**\n$(date '+%Y-%m-%d %H:%M:%S')"
 111                         }
 112                     },
 113                     {
 114                         "is_short": true,
 115                         "text": {
 116                             "tag": "lark_md",
 117                             "content": "**区域:**\n$AWS_REGION"
 118                         }
 119                     },
 120                     {
 121                         "is_short": false,
 122                         "text": {
 123                             "tag": "lark_md",
 124                             "content": "**详情:**\n$content"
 125                         }
 126                     }
 127                 ]
 128             }
 129         ]
 130     }
 131 }
 132 EOF
 133 )
 134     
 135     local response
 136     response=$(curl -s -X POST \
 137         -H "Content-Type: application/json" \
 138         -d "$json_data" \
 139         "$LARK_WEBHOOK_URL" 2>/dev/null)
 140     
 141     if [ $? -eq 0 ]; then
 142         log "✓ Lark 通知发送成功: $title"
 143     else
 144         log "✗ Lark 通知发送失败: $title"
 145     fi
 146 }
 147 
 148 # 发送重启开始通知
 149 send_restart_start_notification() {
 150     local instance_id="$1"
 151     local private_ip="$2"
 152     local env_tag="$3"
 153     local all_tags="$4"
 154     
 155     local title="🔄 EC2 实例重启开始"
 156     local content="实例 ID: $instance_id\n私有 IP: $private_ip\n环境: $env_tag\n实例标签: $all_tags\n状态: 开始重启流程"
 157     echo "🔄  EC2 实例重启开始: $content"
 158     
 159     send_lark_notification "$title" "$content" "warning"
 160 }
 161 
 162 # 发送重启成功通知
 163 send_restart_success_notification() {
 164     local instance_id="$1"
 165     local private_ip="$2"
 166     local env_tag="$3"
 167     local duration="$4"
 168     local all_tags="$5"
 169     
 170     local title="✅ EC2 实例重启成功"
 171     local content="实例 ID: $instance_id\n私有 IP: $private_ip\n环境: $env_tag\n实例标签: $all_tags\n耗时: ${duration}秒\n状态: 重启完成"
 172     echo "✅  EC2 实例重启成功: $content"
 173     
 174     send_lark_notification "$title" "$content" "success"
 175 }
 176 
 177 # 发送重启失败通知
 178 send_restart_failed_notification() {
 179     local instance_id="$1"
 180     local private_ip="$2"
 181     local env_tag="$3"
 182     local error_msg="$4"
 183     local all_tags="$5"
 184     
 185     local title="❌ EC2 实例重启失败"
 186     local content="实例 ID: $instance_id\n私有 IP: $private_ip\n环境: $env_tag\n实例标签: $all_tags\n错误: $error_msg\n状态: 重启失败"
 187     
 188     send_lark_notification "$title" "$content" "error"
 189 }
 190 
 191 # 检查实例状态
 192 check_instance_state() {
 193     local instance_id="$1"
 194     local desired_state="$2"
 195     
 196     local state
 197     state=$(aws ec2 describe-instances \
 198         --region "$AWS_REGION" \
 199         --instance-ids "$instance_id" \
 200         --query 'Reservations[0].Instances[0].State.Name' \
 201         --output text 2>/dev/null)
 202     
 203     if [ $? -eq 0 ] && [ "$state" = "$desired_state" ]; then
 204         return 0
 205     else
 206         return 1
 207     fi
 208 }
 209 
 210 # 获取实例当前状态
 211 get_instance_state() {
 212     local instance_id="$1"
 213     
 214     local state
 215     state=$(aws ec2 describe-instances \
 216         --region "$AWS_REGION" \
 217         --instance-ids "$instance_id" \
 218         --query 'Reservations[0].Instances[0].State.Name' \
 219         --output text 2>/dev/null)
 220     
 221     if [ $? -eq 0 ]; then
 222         echo "$state"
 223         return 0
 224     else
 225         echo "unknown"
 226         return 1
 227     fi
 228 }
 229 
 230 # 获取实例环境标签
 231 get_env_tag() {
 232     local instance_id="$1"
 233     
 234     local env_tag
 235     env_tag=$(aws ec2 describe-tags \
 236         --region "$AWS_REGION" \
 237         --filters "Name=resource-id,Values=$instance_id" "Name=key,Values=env" \
 238         --query 'Tags[0].Value' \
 239         --output text 2>/dev/null)
 240     
 241     if [ $? -eq 0 ] && [ -n "$env_tag" ]; then
 242         echo "$env_tag"
 243         return 0
 244     else
 245         echo "unknown"
 246         return 1
 247     fi
 248 }
 249 
 250 # 检查实例系统状态
 251 check_system_status() {
 252     local instance_id="$1"
 253     
 254     log "检查实例 $instance_id 的系统状态..."
 255     
 256     # 获取实例状态详情
 257     local status_result
 258     status_result=$(aws ec2 describe-instance-status \
 259         --region "$AWS_REGION" \
 260         --instance-ids "$instance_id" \
 261         --include-all-instances \
 262         --output json 2>/dev/null)
 263     
 264     if [ $? -ne 0 ]; then
 265         log "错误: 无法获取实例 $instance_id 的状态信息"
 266         return 1
 267     fi
 268     
 269     # 检查是否有状态数据
 270     local instance_count
 271     instance_count=$(echo "$status_result" | jq -r '.InstanceStatuses | length' 2>/dev/null)
 272     if [ "$instance_count" -eq 0 ]; then
 273         log "警告: 实例 $instance_id 的状态数据为空，可能实例正在初始化"
 274         return 1
 275     fi
 276     
 277     # 解析系统状态
 278     local system_status
 279     system_status=$(echo "$status_result" | jq -r '.InstanceStatuses[0].SystemStatus.Status' 2>/dev/null)
 280     
 281     if [ -z "$system_status" ] || [ "$system_status" = "null" ]; then
 282         log "警告: 无法获取实例 $instance_id 的系统状态"
 283         return 1
 284     fi
 285     
 286     log "实例 $instance_id 系统状态: $system_status"
 287     
 288     # 如果系统状态为 initializing，则跳过重启
 289     if [ "$system_status" = "initializing" ]; then
 290         log "实例 $instance_id 系统状态为 'initializing'，正在初始化中，跳过重启"
 291         return 1
 292     fi
 293     
 294     # 如果系统状态为 impaired，可以考虑重启，但这里我们只关注 initializing
 295     if [ "$system_status" = "impaired" ]; then
 296         log "实例 $instance_id 系统状态为 'impaired'，系统检查失败"
 297         # 这里可以返回 0 表示允许重启，或者返回 1 表示跳过，根据需求决定
 298         return 0
 299     fi
 300     
 301     # 系统状态为 ok 或其他状态，允许进一步检查
 302     if [ "$system_status" = "ok" ]; then
 303         log "✓ 实例 $instance_id 系统状态正常"
 304     else
 305         log "实例 $instance_id 系统状态为: $system_status"
 306     fi
 307     
 308     return 0
 309 }
 310 
 311 # 等待实例达到特定状态
 312 wait_for_instance_state() {
 313     local instance_id="$1"
 314     local desired_state="$2"
 315     local max_wait_time="${3:-$MAX_WAIT_TIME}"
 316     local wait_interval="${4:-$WAIT_INTERVAL}"
 317     
 318     local start_time
 319     start_time=$(date +%s)
 320     local elapsed_time=0
 321     
 322     log "等待实例 $instance_id 进入 '$desired_state' 状态..."
 323     
 324     while [ $elapsed_time -lt $max_wait_time ]; do
 325         if check_instance_state "$instance_id" "$desired_state"; then
 326             log "✓ 实例 $instance_id 已进入 '$desired_state' 状态"
 327             return 0
 328         fi
 329         
 330         local current_state
 331         current_state=$(get_instance_state "$instance_id")
 332         log "实例 $instance_id 当前状态为 '$current_state'，等待 ${wait_interval} 秒后重试..."
 333         sleep $wait_interval
 334         
 335         elapsed_time=$(($(date +%s) - start_time))
 336     done
 337     
 338     log "✗ 等待超时: 实例 $instance_id 在 ${max_wait_time} 秒内未进入 '$desired_state' 状态"
 339     return 1
 340 }
 341 
 342 # 停止实例（支持强制停止）
 343 stop_instance() {
 344     local instance_id="$1"
 345     local max_wait_time=300
 346     local wait_interval=10
 347     local start_time=$(date +%s)
 348     local elapsed_time=0
 349     local force_stop_used=false
 350 
 351     # 首先检查实例当前状态
 352     local current_state
 353     current_state=$(get_instance_state "$instance_id")
 354     if [ "$current_state" != "running" ]; then
 355         log "实例 $instance_id 当前状态为 '$current_state'，无需停止"
 356         return 0
 357     fi
 358 
 359     # 首先尝试普通停止
 360     log "正在尝试普通停止实例: $instance_id"
 361     local stop_output
 362     stop_output=$(aws ec2 stop-instances --region "$AWS_REGION" --instance-ids "$instance_id" 2>&1)
 363     local stop_exit_code=$?
 364 
 365     if [ $stop_exit_code -ne 0 ]; then
 366         log "普通停止命令失败，尝试强制停止..."
 367         stop_output=$(aws ec2 stop-instances --region "$AWS_REGION" --instance-ids "$instance_id" --force 2>&1)
 368         stop_exit_code=$?
 369         force_stop_used=true
 370     fi
 371 
 372     if [ $stop_exit_code -eq 0 ]; then
 373         local action_type="停止"
 374         if [ "$force_stop_used" = true ]; then
 375             action_type="强制停止"
 376         fi
 377         log "✓ ${action_type}实例 $instance_id 命令发送成功"
 378 
 379         # 等待实例完全停止
 380         log "等待实例 $instance_id 进入 'stopped' 状态..."
 381         while [ $elapsed_time -lt $max_wait_time ]; do
 382             if check_instance_state "$instance_id" "stopped"; then
 383                 log "✓ 实例 $instance_id 已完全停止"
 384                 return 0
 385             fi
 386             local current_state
 387             current_state=$(get_instance_state "$instance_id")
 388             log "实例 $instance_id 当前状态为 '$current_state'，等待 ${wait_interval} 秒后重试..."
 389             sleep $wait_interval
 390             elapsed_time=$(($(date +%s) - start_time))
 391         done
 392 
 393         if [ "$force_stop_used" = false ]; then
 394             log "普通停止超时，尝试强制停止..."
 395             stop_output=$(aws ec2 stop-instances --region "$AWS_REGION" --instance-ids "$instance_id" --force 2>&1)
 396             if [ $? -eq 0 ]; then
 397                 log "✓ 强制停止命令发送成功，重新等待停止状态..."
 398                 start_time=$(date +%s)
 399                 elapsed_time=0
 400                 force_stop_used=true
 401                 
 402                 while [ $elapsed_time -lt $max_wait_time ]; do
 403                     if check_instance_state "$instance_id" "stopped"; then
 404                         log "✓ 实例 $instance_id 已完全停止（强制停止生效）"
 405                         return 0
 406                     fi
 407                     local current_state
 408                     current_state=$(get_instance_state "$instance_id")
 409                     log "实例 $instance_id 当前状态为 '$current_state'，等待 ${wait_interval} 秒后重试..."
 410                     sleep $wait_interval
 411                     elapsed_time=$(($(date +%s) - start_time))
 412                 done
 413             fi
 414         fi
 415 
 416         log "✗ 实例 $instance_id 在 ${max_wait_time} 秒内未停止，请通过管理控制台进一步检查"
 417         return 1
 418     else
 419         log "✗ 停止实例 $instance_id 失败，退出码: $stop_exit_code"
 420         log "停止命令输出: $stop_output"
 421         
 422         if echo "$stop_output" | grep -q "IncorrectInstanceState"; then
 423             log "错误: 实例 $instance_id 状态不正确，可能实例已经停止或正在停止"
 424         elif echo "$stop_output" | grep -q "UnauthorizedOperation"; then
 425             log "错误: 没有停止实例 $instance_id 的权限"
 426         elif echo "$stop_output" | grep -q "InvalidInstanceID"; then
 427             log "错误: 实例ID无效"
 428         else
 429             log "错误: $instance_id  未知的停止失败原因"
 430         fi
 431         
 432         return 1
 433     fi
 434 }
 435 
 436 # 启动实例
 437 start_instance() {
 438     local instance_id="$1"
 439     
 440     log "正在启动实例: $instance_id"
 441     
 442     # 检查实例当前状态
 443     local current_state
 444     current_state=$(get_instance_state "$instance_id")
 445     if [ "$current_state" = "running" ]; then
 446         log "实例 $instance_id 已经在运行状态，无需启动"
 447         return 0
 448     fi
 449     
 450     local start_output
 451     start_output=$(aws ec2 start-instances \
 452         --region "$AWS_REGION" \
 453         --instance-ids "$instance_id" 2>&1)
 454     
 455     local start_exit_code=$?
 456     
 457     if [ $start_exit_code -eq 0 ]; then
 458         log "✓ 启动实例 $instance_id 命令发送成功"
 459         
 460         if wait_for_instance_state "$instance_id" "running"; then
 461             return 0
 462         else
 463             log "✗ 实例 $instance_id 启动操作超时"
 464             return 1
 465         fi
 466     else
 467         log "✗ 启动实例 $instance_id 失败，退出码: $start_exit_code"
 468         log "启动命令输出: $start_output"
 469         
 470         if echo "$start_output" | grep -q "IncorrectInstanceState"; then
 471             log "错误: 实例 $instance_id 状态不正确，可能实例已经在运行"
 472         elif echo "$start_output" | grep -q "UnauthorizedOperation"; then
 473             log "错误: 没有启动实例 $instance_id  的权限"
 474         elif echo "$start_output" | grep -q "InvalidInstanceID"; then
 475             log "错误: 实例ID无效"
 476         else
 477             log "错误: $instance_id 未知的启动失败原因"
 478         fi
 479         
 480         return 1
 481     fi
 482 }
 483 
 484 # 检查实例是否在近期内重启过
 485 check_recent_restart() {
 486     local instance_id="$1"
 487     
 488     if [ ! -f "$RESTART_HISTORY_FILE" ]; then
 489         return 1
 490     fi
 491     
 492     local last_restart_line
 493     last_restart_line=$(grep "^$instance_id:" "$RESTART_HISTORY_FILE" | tail -1)
 494     
 495     if [ -z "$last_restart_line" ]; then
 496         return 1
 497     fi
 498     
 499     local last_restart_time
 500     last_restart_time=$(echo "$last_restart_line" | cut -d: -f2)
 501     
 502     local current_time
 503     current_time=$(date +%s)
 504     local time_diff_minutes
 505     time_diff_minutes=$(( (current_time - last_restart_time) / 60 ))
 506     
 507     if [ $time_diff_minutes -lt $MINUTES_THRESHOLD ]; then
 508         log "实例 $instance_id 在 $time_diff_minutes 分钟前重启过，跳过本次重启（${MINUTES_THRESHOLD}分钟内不重复重启）"
 509         return 0
 510     else
 511         return 1
 512     fi
 513 }
 514 
 515 # 记录重启历史
 516 record_restart() {
 517     local instance_id="$1"
 518     local timestamp
 519     timestamp=$(date +%s)
 520     
 521     if [ -f "$RESTART_HISTORY_FILE" ]; then
 522         grep -v "^$instance_id:" "$RESTART_HISTORY_FILE" > "${RESTART_HISTORY_FILE}.tmp" 2>/dev/null || true
 523         mv "${RESTART_HISTORY_FILE}.tmp" "$RESTART_HISTORY_FILE" 2>/dev/null || true
 524     fi
 525     
 526     echo "$instance_id:$timestamp" >> "$RESTART_HISTORY_FILE"
 527     log "记录实例 $instance_id 的重启时间"
 528     
 529     local cleanup_timestamp
 530     cleanup_timestamp=$(date -d "7 days ago" +%s 2>/dev/null || date -v-7d +%s 2>/dev/null || echo "0")
 531     
 532     if [ -f "$RESTART_HISTORY_FILE" ]; then
 533         awk -v cutoff="$cleanup_timestamp" -F: '$2 >= cutoff' "$RESTART_HISTORY_FILE" > "${RESTART_HISTORY_FILE}.clean" 2>/dev/null
 534         mv "${RESTART_HISTORY_FILE}.clean" "$RESTART_HISTORY_FILE" 2>/dev/null
 535     fi
 536 }
 537 
 538 # 查询 Prometheus 获取故障实例
 539 query_prometheus() {
 540     local query="$1"
 541     local response
 542     response=$(curl -s -G \
 543         --data-urlencode "query=$query" \
 544         "$PROMETHEUS_URL/api/v1/query")
 545     
 546     if [ $? -ne 0 ] || [ -z "$response" ]; then
 547         log "错误: Prometheus 查询失败"
 548         return 1
 549     fi
 550     
 551     if ! echo "$response" | jq -e '.data.result' > /dev/null 2>&1; then
 552         log "错误: Prometheus 返回无效的 JSON 响应"
 553         return 1
 554     fi
 555     
 556     echo "$response" | jq -r '.data.result[]? | "\(.metric.PrivateIpAddress) \(.metric.instance)"' 2>/dev/null
 557 }
 558 
 559 # 检查单个实例是否仍在故障列表中
 560 check_instance_still_faulty() {
 561     local private_ip="$1"
 562     local instance_id="$2"
 563     
 564     log "对实例 $instance_id (IP: $private_ip) 进行第二次检测..."
 565     
 566     # 构建针对单个实例的查询
 567     local query
 568     query="up{job=\"aws-ec2-nodes\", PrivateIpAddress=\"$private_ip\"} == 0 and max_over_time(up{job=\"aws-ec2-nodes\", PrivateIpAddress=\"$private_ip\"}[1h]) == 1"
 569     
 570     local faulty_instances
 571     faulty_instances=$(query_prometheus "$query")
 572     
 573     if [ $? -ne 0 ]; then
 574         log "第二次检测查询失败，跳过实例 $instance_id"
 575         return 1
 576     fi
 577     
 578     if [ -z "$faulty_instances" ]; then
 579         log "实例 $instance_id 在第二次检测中已恢复，跳过重启"
 580         return 1
 581     fi
 582     
 583     # 检查返回的实例是否包含我们正在检查的实例
 584     if echo "$faulty_instances" | grep -q "$instance_id"; then
 585         log "✓ 实例 $instance_id 在第二次检测中仍然故障，继续重启流程"
 586         return 0
 587     else
 588         log "实例 $instance_id 在第二次检测中已恢复，跳过重启"
 589         return 1
 590     fi
 591 }
 592 
 593 # 检查环境标签是否允许重启
 594 check_env_tag() {
 595     local instance_id="$1"
 596     
 597     local env_tag
 598     env_tag=$(aws ec2 describe-tags \
 599         --region "$AWS_REGION" \
 600         --filters "Name=resource-id,Values=$instance_id" "Name=key,Values=env" \
 601         --query 'Tags[0].Value' \
 602         --output text 2>/dev/null)
 603     
 604     if [ $? -ne 0 ] || [ -z "$env_tag" ]; then
 605         log "警告: 无法获取实例 $instance_id 的 env 标签"
 606     fi
 607     
 608     for allowed_env in "${ALLOWED_ENVS[@]}"; do
 609         if [ "$env_tag" = "$allowed_env" ]; then
 610             log "实例 $instance_id 的 env 标签 '$env_tag' 在允许列表中"
 611             return 0
 612         fi
 613     done
 614     
 615     log "实例 $instance_id 的 env 标签 '$env_tag' 不在允许列表中，跳过重启"
 616     return 1
 617 }
 618 
 619 # 检查实例状态是否符合重启条件
 620 check_instance_state_for_restart() {
 621     local instance_id="$1"
 622     
 623     log "检查实例 $instance_id 的状态..."
 624     
 625     local instance_state
 626     instance_state=$(get_instance_state "$instance_id")
 627     
 628     if [ $? -ne 0 ]; then
 629         log "错误: 无法获取实例 $instance_id 的状态"
 630         return 1
 631     fi
 632     
 633     log "实例 $instance_id 当前状态: $instance_state"
 634     
 635     # 只有 running 状态的实例才符合重启条件
 636     if [ "$instance_state" = "running" ]; then
 637         log "✓ 实例状态检查通过（实例为运行状态）"
 638         return 0
 639     else
 640         log "实例 $instance_id 状态为 '$instance_state'，不是 'running'，跳过重启"
 641         return 1
 642     fi
 643 }
 644 
 645 # 检查实例是否符合重启条件（严格按照四个条件）
 646 check_restart_conditions() {
 647     local instance_id="$1"
 648     local private_ip="$2"
 649     
 650     log "严格检查实例 $instance_id 的重启条件..."
 651     
 652     # 条件1: 环境标签检查
 653     log "检查条件1: 环境标签..."
 654     if ! check_env_tag "$instance_id"; then
 655         log "✗ 条件1不满足: 环境标签检查失败"
 656         return 1
 657     fi
 658     log "✓ 条件1满足: 环境标签检查通过"
 659     
 660     # 条件2: 实例状态检查（必须为 running）
 661     log "检查条件2: 实例状态..."
 662     if ! check_instance_state_for_restart "$instance_id"; then
 663         log "✗ 条件2不满足: 实例状态检查失败"
 664         return 1
 665     fi
 666     log "✓ 条件2满足: 实例状态检查通过"
 667     
 668     # 条件3: 系统状态检查（不能为 initializing）
 669     log "检查条件3: 系统状态..."
 670     if ! check_system_status "$instance_id"; then
 671         log "✗ 条件3不满足: 系统状态检查失败"
 672         return 1
 673     fi
 674     log "✓ 条件3满足: 系统状态检查通过"
 675     
 676     # 条件4: 近期重启检查
 677     log "检查条件4: 近期重启检查..."
 678     if check_recent_restart "$instance_id"; then
 679         log "✗ 条件4不满足: 实例近期已重启过"
 680         return 1
 681     fi
 682     log "✓ 条件4满足: 近期无重启记录"
 683     
 684     log "✓ 所有重启条件均已满足"
 685     return 0
 686 }
 687 
 688 # 重启实例函数
 689 restart_instance() {
 690     local instance_id="$1"
 691     local private_ip="$2"
 692     
 693     log "开始处理实例: $instance_id (私有IP: $private_ip)"
 694     
 695     # 获取环境标签用于通知
 696     local env_tag
 697     env_tag=$(get_env_tag "$instance_id")
 698     
 699     # 获取所有标签用于通知
 700     local all_tags
 701     all_tags=$(get_instance_all_tags "$instance_id")
 702     
 703     # 记录开始时间
 704     local start_time
 705     start_time=$(date +%s)
 706     
 707     # 严格检查重启条件
 708     if ! check_restart_conditions "$instance_id" "$private_ip"; then
 709         log "实例 $instance_id 不满足重启条件，跳过重启"
 710         return 1
 711     fi
 712     
 713     # 所有检查通过，发送重启开始通知
 714     send_restart_start_notification "$instance_id" "$private_ip" "$env_tag" "$all_tags"
 715     
 716     # 执行重启
 717     log "所有检查通过，正在重启实例: $instance_id"
 718     
 719     # 第一步：停止实例
 720     if ! stop_instance "$instance_id"; then
 721         local error_msg="停止阶段失败"
 722         log "✗ 重启实例 $instance_id 失败：$error_msg"
 723         send_restart_failed_notification "$instance_id" "$private_ip" "$env_tag" "$error_msg" "$all_tags"
 724         return 1
 725     fi
 726     
 727     log "✓ 实例 $instance_id 停止成功"
 728     
 729     # 第二步：启动实例
 730     if ! start_instance "$instance_id"; then
 731         local error_msg="启动阶段失败"
 732         log "✗ 重启实例 $instance_id 失败：$error_msg"
 733         send_restart_failed_notification "$instance_id" "$private_ip" "$env_tag" "$error_msg" "$all_tags"
 734         return 1
 735     fi
 736     
 737     log "✓ 实例 $instance_id 启动成功"
 738     
 739     # 计算重启耗时
 740     local end_time
 741     end_time=$(date +%s)
 742     local duration=$((end_time - start_time))
 743     
 744     # 记录重启历史
 745     record_restart "$instance_id"
 746     
 747     # 发送重启成功通知
 748     send_restart_success_notification "$instance_id" "$private_ip" "$env_tag" "$duration" "$all_tags"
 749     
 750     log "✓ 成功重启实例: $instance_id，耗时 ${duration} 秒"
 751     return 0
 752 }
 753 
 754 # 验证 AWS 权限
 755 check_aws_permissions() {
 756     log "检查 AWS 权限..."
 757     
 758     if ! aws ec2 describe-instances \
 759         --region "$AWS_REGION" \
 760         --max-items 1 > /dev/null 2>&1; then
 761         log "错误: AWS ec2:DescribeInstances 权限不足"
 762         return 1
 763     fi
 764     
 765     if ! aws ec2 describe-tags \
 766         --region "$AWS_REGION" \
 767         --max-items 1 > /dev/null 2>&1; then
 768         log "错误: AWS ec2:DescribeTags 权限不足"
 769         return 1
 770     fi
 771     
 772     log "检查 EC2 停止权限..."
 773     if ! aws ec2 stop-instances \
 774         --region "$AWS_REGION" \
 775         --instance-ids "i-00000000000000000" 2>&1 | grep -q "InvalidInstanceID"; then
 776         log "错误: AWS ec2:StopInstances 权限不足"
 777         return 1
 778     fi
 779     
 780     log "检查 EC2 强制停止权限..."
 781     if ! aws ec2 stop-instances \
 782         --region "$AWS_REGION" \
 783         --instance-ids "i-00000000000000000" \
 784         --force 2>&1 | grep -q "InvalidInstanceID"; then
 785         log "错误: AWS ec2:StopInstances (强制停止) 权限不足"
 786         return 1
 787     fi
 788     
 789     log "检查 EC2 启动权限..."
 790     if ! aws ec2 start-instances \
 791         --region "$AWS_REGION" \
 792         --instance-ids "i-00000000000000000" 2>&1 | grep -q "InvalidInstanceID"; then
 793         log "错误: AWS ec2:StartInstances 权限不足"
 794         return 1
 795     fi
 796     
 797     log "✓ AWS 权限检查通过"
 798     return 0
 799 }
 800 
 801 # 验证 Prometheus 连接
 802 check_prometheus_connection() {
 803     log "检查 Prometheus 连接..."
 804     if curl -s "$PROMETHEUS_URL/api/v1/query?query=up" > /dev/null 2>&1; then
 805         log "✓ Prometheus 连接正常"
 806         return 0
 807     else
 808         log "错误: 无法连接到 Prometheus: $PROMETHEUS_URL"
 809         return 1
 810     fi
 811 }
 812 
 813 # 验证 Lark Webhook 配置
 814 check_lark_webhook() {
 815     if [ "$LARK_NOTIFICATION_ENABLED" = "true" ] && [ -z "$LARK_WEBHOOK_URL" ]; then
 816         log "警告: Lark 通知已启用但未配置 Webhook URL"
 817         log "请设置 LARK_WEBHOOK_URL 环境变量以启用通知功能"
 818         return 1
 819     fi
 820     
 821     # 检查是否为默认的 Webhook URL
 822     if [ "$LARK_NOTIFICATION_ENABLED" = "true" ] && [ "$LARK_WEBHOOK_URL" = "https://open.feishu.cn/open-apis/bot/v2/hook/your-webhook-token-here" ]; then
 823         log "警告: 使用的是默认的 Lark Webhook URL，请修改脚本中的 LARK_WEBHOOK_URL 配置"
 824         return 1
 825     fi
 826     
 827     if [ "$LARK_NOTIFICATION_ENABLED" = "true" ]; then
 828         log "✓ Lark 通知功能已启用"
 829     else
 830         log "Lark 通知功能已禁用"
 831     fi
 832     
 833     return 0
 834 }
 835 
 836 # 查看重启历史
 837 view_restart_history() {
 838     log "=== 重启历史 ==="
 839     if [ ! -f "$RESTART_HISTORY_FILE" ] || [ ! -s "$RESTART_HISTORY_FILE" ]; then
 840         log "没有重启历史记录"
 841         return
 842     fi
 843     
 844     local current_time
 845     current_time=$(date +%s)
 846     
 847     while IFS=: read -r instance_id timestamp; do
 848         if [ -n "$instance_id" ] && [ -n "$timestamp" ]; then
 849             local time_diff_minutes
 850             time_diff_minutes=$(( (current_time - timestamp) / 60 ))
 851             local human_time
 852             human_time=$(date -d "@$timestamp" "+%Y-%m-%d %H:%M:%S" 2>/dev/null || echo "未知时间")
 853             log "实例 $instance_id - 重启时间: $human_time (${time_diff_minutes}分钟前)"
 854         fi
 855     done < "$RESTART_HISTORY_FILE"
 856 }
 857 
 858 # 清理重启历史
 859 clean_restart_history() {
 860     if [ -f "$RESTART_HISTORY_FILE" ]; then
 861         rm -f "$RESTART_HISTORY_FILE"
 862         log "已清理重启历史记录"
 863     else
 864         log "没有找到重启历史记录文件"
 865     fi
 866 }
 867 
 868 # 显示配置信息
 869 show_config() {
 870     log "=== 脚本配置信息 ==="
 871     log "Prometheus URL: $PROMETHEUS_URL"
 872     log "AWS Region: $AWS_REGION"
 873     log "允许的环境标签: ${ALLOWED_ENVS[*]}"
 874     log "Lark 通知: $LARK_NOTIFICATION_ENABLED"
 875     
 876     if [ "$LARK_NOTIFICATION_ENABLED" = "true" ] && [ "$LARK_WEBHOOK_URL" != "https://open.feishu.cn/open-apis/bot/v2/hook/your-webhook-token-here" ]; then
 877         log "Lark Webhook URL: [已配置]"
 878     elif [ "$LARK_NOTIFICATION_ENABLED" = "true" ]; then
 879         log "Lark Webhook URL: [需要配置 - 使用默认占位符]"
 880     fi
 881     
 882     log "重启条件检查:"
 883     log "  - Prometheus 故障检测 + 60秒二次确认"
 884     log "  - 实例状态为 running"
 885     log "  - 系统状态不为 initializing"
 886     log "  - 环境标签在允许列表中"
 887     log "  - 5分钟内未重启过"
 888 }
 889 
 890 # 主逻辑
 891 main() {
 892     log "=== EC2 自动重启任务开始 ==="
 893     
 894     # 显示配置信息
 895     show_config
 896     
 897     # 前置检查
 898     if ! check_aws_permissions; then
 899         exit 1
 900     fi
 901     
 902     if ! check_prometheus_connection; then
 903         exit 1
 904     fi
 905     
 906     if ! check_lark_webhook; then
 907         # 不退出，只是警告
 908         log "继续执行任务，但通知功能可能不可用"
 909     fi
 910     
 911     # 显示当前重启历史
 912     view_restart_history
 913     
 914     # PromQL 查询：检测 node_exporter 故障的实例
 915     # 修改查询条件，只查询长时间运行但近期故障的实例
 916     local query
 917     query='''up{job="aws-ec2-nodes", PrivateIpAddress!=""} == 0
 918 and 
 919 max_over_time(up{job="aws-ec2-nodes", PrivateIpAddress!=""}[1h]) == 1
 920 '''    
 921     log "执行 Prometheus 查询检测故障实例..."
 922     log "查询语句: $query"
 923     
 924     # 获取故障实例列表
 925     local faulty_instances
 926     faulty_instances=$(query_prometheus "$query")
 927     
 928     if [ $? -ne 0 ] || [ -z "$faulty_instances" ]; then
 929         log "未发现故障实例（Prometheus 查询无结果）"
 930         return 0
 931     fi
 932     
 933     local instance_count
 934     instance_count=$(echo "$faulty_instances" | grep -v '^$' | wc -l)
 935     log "发现 $instance_count 个故障实例（基于 Prometheus 监控指标）"
 936     
 937     # 处理每个实例
 938     local processed_count=0
 939     local restarted_count=0
 940     local skipped_count=0
 941     local second_check_passed=0
 942     local second_check_failed=0
 943     
 944     local temp_file
 945     temp_file=$(mktemp)
 946     echo "$faulty_instances" > "$temp_file"
 947     
 948     while IFS=' ' read -r private_ip instance_id; do
 949         if [ -z "$private_ip" ] || [ -z "$instance_id" ]; then
 950             continue
 951         fi
 952         
 953         processed_count=$((processed_count + 1))
 954         log "--- 处理实例 [$processed_count/$instance_count]: $instance_id (IP: $private_ip) ---"
 955         
 956         # 第一次检测通过，等待后进行第二次检测
 957         log "第一次检测通过，等待 ${SECOND_CHECK_DELAY} 秒后进行第二次检测..."
 958         sleep $SECOND_CHECK_DELAY
 959         
 960         # 第二次检测
 961         if check_instance_still_faulty "$private_ip" "$instance_id"; then
 962             second_check_passed=$((second_check_passed + 1))
 963             log "第二次检测通过，继续检查其他重启条件..."
 964             
 965             # 重启实例
 966             if restart_instance "$instance_id" "$private_ip"; then
 967                 restarted_count=$((restarted_count + 1))
 968             else
 969                 skipped_count=$((skipped_count + 1))
 970             fi
 971         else
 972             second_check_failed=$((second_check_failed + 1))
 973             skipped_count=$((skipped_count + 1))
 974             log "实例 $instance_id 第二次检测未通过，跳过重启"
 975         fi
 976         
 977         # 添加延迟避免频繁操作
 978         sleep 30
 979         
 980     done < "$temp_file"
 981     
 982     rm -f "$temp_file"
 983     
 984     log "=== 任务完成 ==="
 985     log "处理实例总数: $processed_count"
 986     log "第二次检测通过: $second_check_passed 个实例"
 987     log "第二次检测失败: $second_check_failed 个实例"
 988     log "成功重启: $restarted_count 个实例"
 989     log "跳过重启: $skipped_count 个实例"
 990 }
 991 
 992 # 安全检查：防止并发执行
 993 LOCK_FILE="/tmp/ec2_auto_restart.lock"
 994 
 995 if [ -e "$LOCK_FILE" ]; then
 996     log "错误: 脚本已在运行中 (锁文件存在: $LOCK_FILE)"
 997     exit 1
 998 fi
 999 
1000 trap 'rm -f "$LOCK_FILE"; exit' INT TERM EXIT
1001 echo $$ > "$LOCK_FILE"
1002 
1003 # 处理命令行参数
1004 case "${1:-}" in
1005     "view-history")
1006         view_restart_history
1007         exit 0
1008         ;;
1009     "clean-history")
1010         clean_restart_history
1011         exit 0
1012         ;;
1013     "test-notification")
1014         log "测试 Lark 通知功能..."
1015         send_lark_notification "测试通知" "这是一个测试通知\n时间: $(date '+%Y-%m-%d %H:%M:%S')" "info"
1016         exit 0
1017         ;;
1018     "show-config")
1019         show_config
1020         exit 0
1021         ;;
1022     "")
1023         main "$@"
1024         ;;
1025     *)
1026         log "错误: 未知参数 '$1'"
1027         log "用法: $0 [view-history|clean-history|test-notification|show-config]"
1028         exit 1
1029         ;;
1030 esac
1031 
1032 rm -f "$LOCK_FILE"

autorestartec2.sh

4.3调试使用方法：

 1 # 1. 首先编辑脚本，将您的 Webhook token 填入配置部分
 2 # 修改这一行：
 3 # LARK_WEBHOOK_URL="${LARK_WEBHOOK_URL:-https://open.feishu.cn/open-apis/bot/v2/hook/your-webhook-token-here}"
 4 # 将 "your-webhook-token-here" 替换为您的实际 token
 5 
 6 # 2. 执行自动重启任务（使用内置的 Webhook URL）
 7 ./ec2_auto_restart.sh
 8 
 9 # 3. 查看配置信息
10 ./ec2_auto_restart.sh show-config
11 
12 # 4. 测试通知功能
13 ./ec2_auto_restart.sh test-notification
14 
15 # 5. 查看重启历史
16 ./ec2_auto_restart.sh view-history
17 
18 # 6. 清理重启历史
19 ./ec2_auto_restart.sh clean-history
20 
21 # 7. 如果需要临时覆盖 Webhook URL，仍然可以使用环境变量
22 export LARK_WEBHOOK_URL="https://open.feishu.cn/open-apis/bot/v2/hook/another-token"
23 ./ec2_auto_restart.sh

调试命令

注意：合规性要求避免在代码中使用 AK/SK 等密钥。请为 AWS IAM Role 绑定所需策略，并将该 Role 关联到 Kubernetes ServiceAccount，再由 Deployment 的 Pod 使用该 ServiceAccount，即可获得该 Role 对应的一系列权限。

5.实现效果

当实例出现故障后，经过2次以上检测后，实例处于running和非初始化状态，则进行重启。

posted @ 2025-11-23 14:15 meijinmeng 阅读(6) 评论(0) 收藏举报

刷新页面返回顶部

基于Prometheus-实现AWS EC2的实例异常自动重启

公告