Monit-基于非容器服务自恢复程序实践
1.需求:因为历史原因和软件程序原因,有上百台服务器及其上的服务未运行在容器中,需要在程序崩溃时自动拉起(以 Java、Python、C++ 为主)。
2.目的:无需人工干预即可快速自动恢复,要求检测频率为每 10s 一次。
3.实现方式
3.1 根据不同语言自己开发脚本实现自动拉起和通知(不够标准化-弃用)
#!/bin/bash
#
# Service watchdog: every 10 seconds, checks that each configured Java service
# has a live process (matched with `pgrep -f` on the jar name). A missing
# service is restarted through its start script and a Lark alert is sent.
#
# ========== Usage ==========
# 1. Adjust the environment settings below: fat/prod env variables, the nacos
#    address, the Lark webhook, etc.
# 2. Make sure every Java service has a matching start script
#    (e.g. deploy-xxx.sh).
# 3. Fill in the list of monitored services (SERVICES_PROCESS_ID etc. below).
# 4. Add this script to crontab, e.g. once a minute. A file lock guarantees a
#    single instance: if the monitor is already running the new invocation
#    exits, otherwise it becomes the monitor.
#    */1 * * * * /bin/bash /path/to/this/monitor.sh >> /path/to/monitor.log 2>&1
#
# Strict mode (set -euo pipefail) is deliberately not enabled: the script
# sources system profile files that are not written with it in mind.

# ========== Environment loading ==========
# cron starts with a minimal environment, so the profile files are sourced
# explicitly, in priority order (later files override earlier ones).
ENV_FILES=(
  "/etc/profile"
  "/etc/bashrc"
  "/root/.bash_profile"
  "/root/.bashrc"
  "$HOME/.bash_profile"
  "$HOME/.bashrc"
)

for env_file in "${ENV_FILES[@]}"; do
  if [ -f "$env_file" ]; then
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] 加载环境变量文件: $env_file" >> /tmp/monitor_env.log
    # shellcheck disable=SC1090
    source "$env_file"
  fi
done

# Fall back to a hard-coded JDK if none of the files above defined JAVA_HOME.
if [ -z "$JAVA_HOME" ]; then
  export JAVA_HOME="/opt/jdk-17.0.8"
  export CLASSPATH=".:$JAVA_HOME/lib/tools.jar:$JAVA_HOME/lib/dt.jar"
  export PATH="$JAVA_HOME/bin:$PATH"
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] 手动设置 JAVA_HOME: $JAVA_HOME" >> /tmp/monitor_env.log
fi

# Environment variables exported for the start scripts.
export env_nacos_address="nacos.test.com:8848"
export env_nacos_namespace="fat"
export ENV="fat"

# Dump the effective environment (debugging aid).
{
  echo "=== 环境变量检查 ==="
  echo "时间: $(date)"
  echo "JAVA_HOME: $JAVA_HOME"
  echo "PATH: $PATH"
  echo "ENV: $ENV"
  echo "env_nacos_address: $env_nacos_address"
  command -v java
  java -version 2>&1
  echo "=== 环境变量检查结束 ==="
} >> /tmp/monitor_env.log 2>&1

# ========== Script configuration ==========
# NOTE(review): relative path, so lark.log lands in whatever directory the
# script is started from (under cron that is usually $HOME) - confirm intent.
target_folder="./"

LARK_WEBHOOK="https://open.larksuite.com/open-apis/bot/v2/hook/token"
LARK_LOG_FILE="$target_folder/lark.log"
MONITOR_LOG_FILE="/tmp/check.logs"

LARK_MAX_RETRY=3     # total number of delivery attempts per alert
LARK_RETRY_DELAY=10  # seconds to wait between attempts
LARK_ENV="fat"

# The log directory must exist before the first write below.
mkdir -p "$(dirname "$MONITOR_LOG_FILE")"

# ========== File lock (single instance) ==========
LOCK_FILE="/tmp/service_monitor.lock"

# Open in append mode: a plain '>' would truncate the file on every cron
# attempt and erase the PID recorded by the instance that holds the lock.
exec 200>>"$LOCK_FILE"
flock -n 200 || {
  holder_pid=$(cat "$LOCK_FILE" 2>/dev/null)
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] 监控脚本已在运行,退出本次执行 (PID: ${holder_pid:-未知})" >> "$MONITOR_LOG_FILE"
  exit 0
}

# Lock acquired: record our PID for diagnostics.
echo $$ > "$LOCK_FILE"

# The lock file is intentionally NOT deleted on exit. The kernel drops the
# flock when this process ends, and unlinking the path would let a concurrent
# starter lock a fresh file while another still holds the old inode.

echo "[$(date '+%Y-%m-%d %H:%M:%S')] 启动服务监控进程 (PID: $$)" | tee -a "$MONITOR_LOG_FILE"

# ========== Lark alerting ==========

#######################################
# Send one alert card to the Lark webhook, retrying on failure.
# Globals:   LARK_ENV, LARK_WEBHOOK, LARK_MAX_RETRY, LARK_RETRY_DELAY,
#            LARK_LOG_FILE (all read)
# Arguments: $1 - alert text
# Outputs:   appends a success/failure line to LARK_LOG_FILE
# Returns:   0 once the webhook answers HTTP 200, 1 after LARK_MAX_RETRY
#            failed attempts
#######################################
send_lark_alert() {
  local message="$1"
  local timestamp
  timestamp=$(date "+%Y-%m-%d %H:%M:%S")

  # Escape characters that would break the hand-built JSON string below.
  message=${message//\\/\\\\}
  message=${message//\"/\\\"}
  message=${message//$'\n'/\\n}

  # The "\n" sequences stay literal here; the JSON parser turns them into
  # line breaks.
  local full_message="[$LARK_ENV][进程监控] $message\n"
  full_message+="服务器名: $(hostname)\n"
  full_message+="报警时间: $timestamp\n"
  #full_message+="<at id=ou_id1>Jame Mei</at>"
  #full_message+="<at id=ou_id2>Levi Li</at>"
  #full_message+="<at id=ou_id3>Bingbing Sun</at>"

  local json_data="{
    \"msg_type\": \"interactive\",
    \"card\": {
      \"elements\": [{
        \"tag\": \"div\",
        \"text\": {
          \"content\": \"$full_message\",
          \"tag\": \"lark_md\"
        }
      }]
    }
  }"

  local attempt http_status
  for ((attempt = 1; attempt <= LARK_MAX_RETRY; attempt++)); do
    http_status=$(curl -s -o /dev/null -w "%{http_code}" \
      -m 3 \
      -X POST "$LARK_WEBHOOK" \
      -H "Content-Type: application/json" \
      -d "$json_data")

    if [[ "$http_status" == "200" ]]; then
      echo "[$(date "+%F %T")] Lark报警发送成功" >> "$LARK_LOG_FILE"
      return 0
    fi

    # Back off only when another attempt will follow.
    if ((attempt < LARK_MAX_RETRY)); then
      sleep "$LARK_RETRY_DELAY"
    fi
  done

  echo "[$(date "+%F %T")] 错误: Lark报警发送失败,已达最大重试次数: $LARK_MAX_RETRY" >> "$LARK_LOG_FILE"
  return 1
}

#######################################
# Fire-and-forget wrapper around send_lark_alert so that webhook retries
# never delay the monitoring loop.
# Arguments: $1 - alert text
#######################################
send_alert_async() {
  # 200>&- : the background job must not inherit the monitor's lock fd.
  (
    send_lark_alert "$1"
  ) &> /dev/null 200>&- &
}

# ========== Service configuration ==========
# Directory the start commands are executed from.
WORK_DIR="/data/scripts"

# Three parallel arrays share one index per service:
#   SERVICES_PROCESS_ID    - pattern handed to `pgrep -f` (the jar file name)
#   SERVICES_FRIENDLY_NAME - display name used in logs and alerts
#   SERVICES_START_CMD     - command run from WORK_DIR to start the service
#SERVICES_PROCESS_ID[0]="pb-trading-engine-1.0-SNAPSHOT.jar"
#SERVICES_FRIENDLY_NAME[0]="交易引擎 (Trading Engine)"
#SERVICES_START_CMD[0]="./deploy-engine-new.sh start"

SERVICES_PROCESS_ID[1]="pb-trading-market-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[1]="行情服务 (Trading Market)"
SERVICES_START_CMD[1]="./pb-trading-market.sh start"

SERVICES_PROCESS_ID[2]="rapidx-trading-query-realtime-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[2]="实时查询 (Query Realtime)"
SERVICES_START_CMD[2]="./deploy-realtime-new.sh start"

SERVICES_PROCESS_ID[3]="router-server-1.0.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[3]="路由服务 (Router Server)"
SERVICES_START_CMD[3]="./router-server.sh start"

SERVICES_PROCESS_ID[4]="pb-trading-query-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[4]="交易查询 (Trading Query)"
SERVICES_START_CMD[4]="./pb-trading-query.sh start"

SERVICES_PROCESS_ID[5]="rapidx-trading-market-gateway-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[5]="行情网关 (Market Gateway)"
SERVICES_START_CMD[5]="./rapidx-trading-market-gateway.sh start"

SERVICES_PROCESS_ID[6]="ltp-ems-1.0.0-master-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[6]="订单管理 (EMS)"
SERVICES_START_CMD[6]="./deploy-ems-new.sh start"

SERVICES_PROCESS_ID[7]="pb-trading-push-0.0.1-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[7]="推送服务 (Trading Push)"
SERVICES_START_CMD[7]="./pb-trading-push.sh start"

SERVICES_PROCESS_ID[8]="pb-trading-gateway-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[8]="交易网关 (Trading Gateway)"
SERVICES_START_CMD[8]="./pb-trading-gateway.sh start"

SERVICES_PROCESS_ID[9]="exchange-data-server-1.0.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[9]="exchange-data-server"
SERVICES_START_CMD[9]="./ltp-exchange-data-server.sh start"

SERVICES_PROCESS_ID[10]="rapidtrade-storage-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[10]="rapidtrade-storage"
SERVICES_START_CMD[10]="./rapidtrade-storage.sh start"

SERVICES_PROCESS_ID[11]="rapidx-trading-algo-server-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[11]="rapidx-trading-algo-server"
SERVICES_START_CMD[11]="./rapidx-trading-algo-server.sh start"

SERVICES_PROCESS_ID[12]="rapidtrade-mock-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[12]="rapidtrade-mock"
SERVICES_START_CMD[12]="./rapidtrade-mock.sh start"

SERVICES_PROCESS_ID[13]="rapidx-ws-simulator-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[13]="rapidx-ws-simulator"
SERVICES_START_CMD[13]="./rapidx-ws-simulator.sh start"

SERVICES_PROCESS_ID[14]="pb-trading-statistics-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[14]="pb-trading-statistics"
SERVICES_START_CMD[14]="./pb-trading-statistics.sh start"

SERVICES_PROCESS_ID[15]="rapidx-trading-onezero-maker-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[15]="rapidx-trading-onezero-maker"
SERVICES_START_CMD[15]="./rapidx-trading-onezero-maker.sh start"

SERVICES_PROCESS_ID[16]="pb-trading-transfer-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[16]="pb-trading-transfer"
SERVICES_START_CMD[16]="./pb-trading-transfer.sh start"

SERVICES_PROCESS_ID[17]="rapidx-trading-clearing-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[17]="rapidx-trading-clearing"
SERVICES_START_CMD[17]="./rapidx-trading-clearing.sh start"

SERVICES_PROCESS_ID[18]="pb-trading-monitor-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[18]="pb-trading-monitor"
SERVICES_START_CMD[18]="./pb-trading-monitor.sh start"

SERVICES_PROCESS_ID[19]="rapidx-trading-query-persistent-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[19]="rapidx-trading-query-persistent"
SERVICES_START_CMD[19]="./deploy-persistent-new.sh start"

# Index 20 was an exact duplicate of index 10 (rapidtrade-storage). Keeping
# both made the loop launch the start script twice and send two alerts
# whenever that service was down.
#SERVICES_PROCESS_ID[20]="rapidtrade-storage-1.0-SNAPSHOT.jar"
#SERVICES_FRIENDLY_NAME[20]="rapidtrade-storage"
#SERVICES_START_CMD[20]="./rapidtrade-storage.sh start"

SERVICES_PROCESS_ID[21]="bitu-trade-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[21]="bitu-trade"
# NOTE(review): unlike every other entry this has no "./" prefix and no
# "start" argument, so it only resolves if bitu-trade.sh is on PATH - confirm.
SERVICES_START_CMD[21]=bitu-trade.sh
#SERVICES_START_CMD[21]="./deploy.sh start"

SERVICES_PROCESS_ID[22]="ltp-data-integration-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[22]="ltp-data-integration"
SERVICES_START_CMD[22]="./ltp-data-integration.sh start"

SERVICES_PROCESS_ID[23]="ingest-server-app-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[23]="ingest-server-app"
SERVICES_START_CMD[23]="./data-ingest-server.sh start"

SERVICES_PROCESS_ID[24]="ltp-data-visual-1.0-SNAPSHOT.jar"
SERVICES_FRIENDLY_NAME[24]="ltp-data-visual"
SERVICES_START_CMD[24]="./data-cam-visual.sh start"


# ========== Main loop ==========
while true; do
  for i in "${!SERVICES_PROCESS_ID[@]}"; do
    process_id="${SERVICES_PROCESS_ID[$i]}"
    friendly_name="${SERVICES_FRIENDLY_NAME[$i]}"
    start_cmd="${SERVICES_START_CMD[$i]}"

    if ! pgrep -f -- "$process_id" > /dev/null; then
      echo "[$(date '+%Y-%m-%d %H:%M:%S')] - 服务 [$friendly_name] (进程标识: $process_id) 未运行,正在重启..." | tee -a "$MONITOR_LOG_FILE"

      # Start the service in a background subshell so a slow start script
      # cannot stall the loop. 200>&- closes the lock fd for the subshell and
      # everything it spawns: a daemonised service that inherited it would
      # keep the flock held after the monitor died and block every relaunch.
      (
        # Re-source the profile files so the child sees a full environment.
        for env_file in "${ENV_FILES[@]}"; do
          if [ -f "$env_file" ]; then
            # shellcheck disable=SC1090
            source "$env_file"
          fi
        done

        # NOTE(review): these are set unconditionally here, whereas the
        # top-level block only sets JAVA_HOME when it is empty - confirm
        # which behavior is intended.
        export JAVA_HOME="/opt/jdk-17.0.8"
        export CLASSPATH=".:$JAVA_HOME/lib/tools.jar:$JAVA_HOME/lib/dt.jar"
        export PATH="$JAVA_HOME/bin:$PATH"
        export env_nacos_address="nacos.test.com:8848"
        export env_nacos_namespace="fat"
        export ENV="fat"

        # Never run a relative start command from the wrong directory.
        cd "$WORK_DIR" || {
          echo "[$(date '+%Y-%m-%d %H:%M:%S')] - 无法进入工作目录 $WORK_DIR,放弃启动 [$friendly_name]" >> "$MONITOR_LOG_FILE"
          exit 1
        }
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] - 启动目录: $(pwd), JAVA_HOME: $JAVA_HOME" >> "$MONITOR_LOG_FILE"

        # Split "script arg ..." into words without glob expansion.
        read -r -a start_argv <<< "$start_cmd"
        "${start_argv[@]}" >> "$MONITOR_LOG_FILE" 2>&1
      ) 200>&- &

      send_alert_async "服务 [$friendly_name] (进程标识: $process_id) 已停止运行!正在尝试重启。"
    else
      echo "[$(date '+%Y-%m-%d %H:%M:%S')] - 服务 [$friendly_name] 运行正常." >> "$MONITOR_LOG_FILE"
    fi
  done

  # Check interval. The sleep child does not need the lock fd either.
  sleep 10 200>&-
done
3.2 使用开源通用软件统一维护自动拉起(开发+运维都可以简单维护和使用)
# ---- Install Monit from source ----
# Build dependencies (dnf-based distributions).
dnf install -y gcc make openssl-devel bison flex zlib-devel
# Debian/Ubuntu:  apt install -y gcc make libssl-dev bison flex zlib1g-dev
# yum-based:      yum install -y gcc make openssl-devel bison flex zlib-devel
wget https://mmonit.com/monit/dist/monit-5.34.0.tar.gz
tar xf monit-5.34.0.tar.gz && cd monit-5.34.0/
./configure --prefix=/usr/local/monit --without-pam && make && make install
mkdir -p /usr/local/monit/etc
cp monitrc /usr/local/monit/etc/
# Main control file: defines the check interval and which config files are
# loaded. Monit refuses a control file readable by group or others.
chmod 600 /usr/local/monit/etc/monitrc
# Expose the binary on PATH. Symlinks (instead of a copy) always point at the
# installed build, and -f makes the step safe to re-run.
ln -sf /usr/local/monit/bin/monit /usr/sbin/monit
ln -sf /usr/local/monit/bin/monit /usr/bin/monit
monit --version
# One file per monitored process lives here; when a start script changes,
# only the matching file needs editing.
mkdir -p /etc/monit/conf.d/


# ---- systemd unit: /etc/systemd/system/monit.service ----
# systemd unit files have no inline comments: a '#' after a value becomes
# part of the value, so every comment below sits on its own line.
cat > /etc/systemd/system/monit.service <<'EOF'
[Unit]
Description=Monit process monitor
Documentation=https://mmonit.com/monit/
After=network.target

[Service]
Type=forking
ExecStart=/usr/bin/monit -c /usr/local/monit/etc/monitrc
ExecReload=/usr/bin/monit -c /usr/local/monit/etc/monitrc reload
ExecStop=/usr/bin/monit -c /usr/local/monit/etc/monitrc quit
# Must match the pid file Monit actually writes ("set pidfile" in monitrc).
PIDFile=/var/run/monit.pid
# Only signal the monit daemon itself on stop/restart. With the default
# control-group mode systemd would also kill every service that monit
# started, since those processes live in monit's cgroup.
KillMode=process
Restart=on-failure
User=root
Group=root

[Install]
# Start at boot in multi-user mode.
WantedBy=multi-user.target
EOF

# Make systemd read the new unit file, then enable and start the service.
systemctl daemon-reload
systemctl enable monit
systemctl start monit
# ---- Main control file: /usr/local/monit/etc/monitrc ----
# Run a check cycle every 10 seconds.
set daemon 10
set logfile /var/log/monit.log
# Load the per-service files; without this line nothing under
# /etc/monit/conf.d/ is read.
include /etc/monit/conf.d/*.conf

# ---- Per-service file: /etc/monit/conf.d/rapidtrade-mock.conf ----
# Process check: start the service whenever no process whose command line
# matches "rapidtrade-mock" exists.
check process rapidtrade_mock matching "rapidtrade-mock"
  start program = "/data/scripts/rapidtrade-mock.sh start"
  stop program = "/data/scripts/rapidtrade-mock.sh stop"
  if does not exist then start



# ---- Alternative: port-based check ----
# Service names must be unique within one Monit instance, so this check
# carries its own name and can be loaded next to the process check above.
check host rapidtrade_mock_port with address 127.0.0.1
  if failed
    port 7040
    type tcp
    timeout 5 seconds
    for 2 cycles
  then start
  start program = "/data/scripts/rapidtrade-mock.sh start" as uid root and gid root
  stop program = "/data/scripts/rapidtrade-mock.sh stop" as uid root and gid root
  if 3 restarts within 5 cycles then timeout




# ---- Health check on port + path ----
# Author's note: testing showed problems with this variant; it could not
# start the service and bring it back to normal.
# NOTE(review): the cause is not established here. Verify that the port and
# path match what the application really serves before relying on it.
check host my_web_service with address 127.0.0.1
  if failed
    port 80
    protocol http
    request "/actuator/prometheus"   # health-check endpoint path to probe
    with timeout 10 seconds
    for 3 cycles
  then restart
  start program = "/usr/bin/systemctl start my-service"
  stop program = "/usr/bin/systemctl stop my-service"
4.测试和使用
这样对于传统(非容器)服务,只需要该程序有对应的 start.sh、stop.sh 脚本,就可以通过简单配置进程健康检测来实现服务自动拉起,简单高效,不需要每个团队各自开发和维护很多脚本。
http://www.cnblogs.com/Jame-mei
浙公网安备 33010602011771号