系统巡检脚本
点击查看代码
#!/bin/bash
################################################################################
# 脚本名称: system_health_check.sh
# 功能描述: 系统健康状态全面检查(输出精简版)
# 版本信息: v2.2(修复语法错误+完整输出)
################################################################################
# -------------------------- Configuration (review before first use) --------------------------
# Host name shown in the report; fall back to `uname -n` when the `hostname`
# binary is missing (minimal containers).
HOSTNAME=$(hostname 2>/dev/null || uname -n)

# Report destination. /var/log is the default location, but it is normally
# writable by root only. When it is not writable, fall back to a private temp
# file so that every `tee -a "$REPORT_FILE"` in the script keeps working.
report_stamp=$(date +%Y%m%d_%H%M%S)
REPORT_FILE="/var/log/system_check_${report_stamp}.log"
if [[ ! -w /var/log ]]; then
  REPORT_FILE=$(mktemp "${TMPDIR:-/tmp}/system_check_${report_stamp}_XXXXXX") \
    || REPORT_FILE="${TMPDIR:-/tmp}/system_check_${report_stamp}.log"
fi

# Alert thresholds. Each one can be overridden from the environment, e.g.
#   CPU_WARNING=90 ./system_health_check.sh
CPU_WARNING="${CPU_WARNING:-80}"          # CPU usage alert threshold (%)
MEM_WARNING="${MEM_WARNING:-85}"          # memory usage alert threshold (%)
DISK_WARNING="${DISK_WARNING:-85}"        # disk usage alert threshold (%)
INODE_WARNING="${INODE_WARNING:-80}"      # inode usage alert threshold (%)
TIME_WAIT_WARN="${TIME_WAIT_WARN:-5000}"  # TIME_WAIT connection count alert threshold
# -------------------------------------------------------------------
# Colour definitions. These are backslash escape sequences stored literally;
# they are expanded by printf when a message is printed.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset colour
#######################################
# Timestamped log line, printed to stdout and appended to the report file.
# Usage:     log MESSAGE
#            log -e MESSAGE   (echo -e style: backslash escapes in MESSAGE are
#                              expanded; leading newlines become blank lines
#                              printed BEFORE the timestamp)
# Globals:   REPORT_FILE (read)
# Arguments: $1 - message, or the literal flag -e followed by the message in $2
# Outputs:   "[YYYY-mm-dd HH:MM:SS] MESSAGE" on stdout and in REPORT_FILE
#######################################
log() {
  local text="${1-}" lead=""

  # The flag is only honoured when a message follows it, so a bare `log "-e"`
  # still logs the literal text as before.
  if [[ $# -ge 2 && "$1" == "-e" ]]; then
    printf -v text '%b' "$2"
    while [[ "$text" == $'\n'* ]]; do
      lead+=$'\n'
      text="${text#$'\n'}"
    done
  fi

  # The message is always passed as a %s argument, never as the format string.
  printf '%s[%s] %s\n' "$lead" "$(date +"%Y-%m-%d %H:%M:%S")" "$text" \
    | tee -a "$REPORT_FILE"
}
#######################################
# Print a section banner: blank line, rule, indented title, rule.
# Both rules are generated from one 60-character string so they always match.
# Globals:   REPORT_FILE (read)
# Arguments: $1 - section title
# Outputs:   banner on stdout and appended to REPORT_FILE
#######################################
log_section() {
  local rule

  # Build a 60-character line of '=' without hand-typing it twice.
  printf -v rule '%*s' 60 ''
  rule=${rule// /=}

  printf '\n%s\n %s\n%s\n' "$rule" "$1" "$rule" | tee -a "$REPORT_FILE"
}
#######################################
# Emit a warning: coloured on stdout, plain text in the report file so the
# report does not contain raw ANSI escape sequences.
# Globals:   YELLOW, NC, REPORT_FILE (read)
# Arguments: $1 - message
#######################################
log_warning() {
  # %b expands the backslash escapes stored in the colour variables; the
  # message itself is passed through %s and never interpreted.
  printf '%b[WARNING] %s%b\n' "$YELLOW" "$1" "$NC"
  printf '[WARNING] %s\n' "$1" >> "$REPORT_FILE"
}
#######################################
# Emit an error: coloured on stdout, plain text in the report file so the
# report does not contain raw ANSI escape sequences.
# Globals:   RED, NC, REPORT_FILE (read)
# Arguments: $1 - message
#######################################
log_error() {
  # %b expands the backslash escapes stored in the colour variables; the
  # message itself is passed through %s and never interpreted.
  printf '%b[ERROR] %s%b\n' "$RED" "$1" "$NC"
  printf '[ERROR] %s\n' "$1" >> "$REPORT_FILE"
}
#######################################
# Emit a success line: coloured on stdout, plain text in the report file so
# the report does not contain raw ANSI escape sequences.
# Globals:   GREEN, NC, REPORT_FILE (read)
# Arguments: $1 - message
#######################################
log_ok() {
  # %b expands the backslash escapes stored in the colour variables; the
  # message itself is passed through %s and never interpreted.
  printf '%b[OK] %s%b\n' "$GREEN" "$1" "$NC"
  printf '[OK] %s\n' "$1" >> "$REPORT_FILE"
}
#######################################
# 1. Basic system information (compact).
# Logs host name, check time, OS release, kernel version, architecture and
# uptime.
# Globals:   HOSTNAME (read)
# Outputs:   through log_section / log
#######################################
check_basic_info() {
  local os_release up_for

  log_section "1. 系统基本信息"
  log "主机名: $HOSTNAME | 检查时间: $(date +"%Y-%m-%d %H:%M:%S")"

  # Prefer the RedHat release file; otherwise take the first three words of
  # the first /etc/issue line. Errors from a missing file are suppressed.
  os_release=$(cat /etc/redhat-release 2>/dev/null \
    || head -1 /etc/issue 2>/dev/null | awk '{print $1,$2,$3}')
  log "系统版本: $os_release"

  # Keep everything between "up" and the user count / load average, so an
  # uptime of "3 days,  7:34" is not truncated to "3 days" at the first comma.
  up_for=$(LC_ALL=C uptime \
    | sed -E 's/^.* up +//; s/, +[0-9]+ users?,.*$//; s/, +load average.*$//; s/ +/ /g')
  log "内核版本: $(uname -r) | 架构: $(uname -m) | 运行时长: $up_for"
}
#######################################
# 2. CPU usage and load check (compact).
# Samples top twice (one second apart) and uses the second sample's idle
# figure; compares usage with CPU_WARNING and the 1-minute load with twice the
# core count.
# Globals:   CPU_WARNING, REPORT_FILE (read)
#            CPU_CORES, CPU_IDLE, CPU_USAGE, LOAD_1, LOAD_THRESHOLD (written)
# Outputs:   through log_section / log / log_ok / log_warning
#######################################
check_cpu() {
  local num_re='^[0-9]+([.][0-9]+)?$'

  log_section "2. CPU使用率与负载检查"

  CPU_CORES=$(grep -c ^processor /proc/cpuinfo 2>/dev/null)
  if [[ ! "$CPU_CORES" =~ ^[1-9][0-9]*$ ]]; then
    # /proc/cpuinfo missing or without "processor" lines: ask getconf instead.
    CPU_CORES=$(getconf _NPROCESSORS_ONLN 2>/dev/null)
    [[ "$CPU_CORES" =~ ^[1-9][0-9]*$ ]] || CPU_CORES=1
  fi

  # Locate the idle value by its "id" label rather than by field position:
  # top prints "ni,100.0 id" (no space after the comma) on an idle machine,
  # which shifts whitespace-separated fields. LC_ALL=C keeps "." as the
  # decimal separator so splitting on "," is safe.
  CPU_IDLE=$(LC_ALL=C top -bn2 -d 1 | grep 'Cpu(s)' | tail -1 | awk -F',' '{
    for (i = 1; i <= NF; i++) {
      if ($i ~ /id *$/) {
        gsub(/[^0-9.]/, "", $i)
        print $i
        exit
      }
    }
  }')

  # awk does the floating-point arithmetic, so the check does not need bc.
  if [[ "$CPU_IDLE" =~ $num_re ]]; then
    CPU_USAGE=$(LC_ALL=C awk -v idle="$CPU_IDLE" 'BEGIN { printf "%.1f", 100 - idle }')
  else
    CPU_USAGE=""
  fi

  LOAD_1=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | awk -F',' '{print $1}' | xargs)
  LOAD_THRESHOLD=$(LC_ALL=C awk -v n="$CPU_CORES" 'BEGIN { printf "%.1f", n * 2 }')

  log "CPU核心数: $CPU_CORES | 使用率: ${CPU_USAGE:-N/A}% | 1分钟负载: ${LOAD_1}(阈值: ${LOAD_THRESHOLD})"

  if [[ -z "$CPU_USAGE" ]]; then
    log_warning "无法从top输出解析CPU空闲率,跳过CPU使用率检查"
  elif awk -v a="$CPU_USAGE" -v b="$CPU_WARNING" 'BEGIN { exit !(a + 0 > b + 0) }'; then
    log_warning "CPU使用率超阈值(${CPU_WARNING}%),当前${CPU_USAGE}%"
    log "TOP 5 CPU消耗进程(PID | 用户 | 使用率 | 命令):"
    ps aux | sort -rn -k3 | head -5 \
      | awk '{printf " %-8s %-10s %-5s %s\n", $2,$1,$3,$11}' | tee -a "$REPORT_FILE"
  else
    log_ok "CPU使用率正常"
  fi

  # Only compare the load when it parsed as a number.
  if [[ "$LOAD_1" =~ $num_re ]] \
    && awk -v a="$LOAD_1" -v b="$LOAD_THRESHOLD" 'BEGIN { exit !(a + 0 > b + 0) }'; then
    log_warning "系统负载超阈值,1分钟负载${LOAD_1}(阈值${LOAD_THRESHOLD})"
  fi
}
#######################################
# 3. Memory usage check (compact).
# Takes ONE snapshot of `free -m` and reads the rows by label, so the figures
# are consistent with each other and the Swap row is found even when an older
# free prints a "-/+ buffers/cache" row between Mem and Swap.
# Globals:   MEM_WARNING, REPORT_FILE (read)
#            MEM_TOTAL, MEM_USED, MEM_AVAILABLE, MEM_USAGE,
#            SWAP_TOTAL, SWAP_USED (written)
# Outputs:   through log_section / log / log_ok / log_warning
#######################################
check_memory() {
  local free_out

  log_section "3. 内存使用检查"

  # LC_ALL=C pins the row labels to "Mem:" / "Swap:".
  free_out=$(LC_ALL=C free -m)
  MEM_TOTAL=$(awk '/^Mem:/ {print $2}' <<< "$free_out")
  MEM_USED=$(awk '/^Mem:/ {print $3}' <<< "$free_out")
  # NOTE(review): column 7 is "available" on current procps; on the old layout
  # it is "cached" — confirm on legacy hosts.
  MEM_AVAILABLE=$(awk '/^Mem:/ {print $7}' <<< "$free_out")
  SWAP_TOTAL=$(awk '/^Swap:/ {print $2}' <<< "$free_out")
  SWAP_USED=$(awk '/^Swap:/ {print $3}' <<< "$free_out")

  # Without a positive numeric total the percentage cannot be computed.
  if [[ ! "$MEM_TOTAL" =~ ^[1-9][0-9]*$ || ! "$MEM_USED" =~ ^[0-9]+$ ]]; then
    log_warning "无法解析free输出,跳过内存检查"
    return 0
  fi
  [[ "$SWAP_TOTAL" =~ ^[0-9]+$ ]] || SWAP_TOTAL=0
  [[ "$SWAP_USED" =~ ^[0-9]+$ ]] || SWAP_USED=0

  # awk does the floating-point arithmetic, so the check does not need bc.
  MEM_USAGE=$(LC_ALL=C awk -v u="$MEM_USED" -v t="$MEM_TOTAL" \
    'BEGIN { printf "%.1f", u / t * 100 }')

  log "内存总量: ${MEM_TOTAL}MB | 已用: ${MEM_USED}MB | 可用: ${MEM_AVAILABLE}MB | 使用率: ${MEM_USAGE}%"
  if (( SWAP_TOTAL > 0 )); then
    log "Swap总量: ${SWAP_TOTAL}MB | 已用: ${SWAP_USED}MB"
  fi

  if awk -v a="$MEM_USAGE" -v b="$MEM_WARNING" 'BEGIN { exit !(a + 0 > b + 0) }'; then
    log_warning "内存使用率超阈值(${MEM_WARNING}%),当前${MEM_USAGE}%"
    log "TOP 5 内存消耗进程(PID | 用户 | 使用率 | 命令):"
    ps aux | sort -rn -k4 | head -5 \
      | awk '{printf " %-8s %-10s %-5s %s\n", $2,$1,$4,$11}' | tee -a "$REPORT_FILE"
  else
    log_ok "内存使用率正常"
  fi

  if (( SWAP_TOTAL > 0 && SWAP_USED > 100 )); then
    log_warning "Swap使用量较高(${SWAP_USED}MB),可能存在内存压力"
  fi
}
#######################################
# 4. Disk and inode usage check (compact).
# Prints the df tables (tmpfs/cdrom/loop rows filtered out) and warns for every
# mount whose space or inode usage exceeds its threshold. For a mount that is
# over the space threshold, the five largest top-level entries are listed.
# Globals:   DISK_WARNING, INODE_WARNING, REPORT_FILE (read)
#            HAS_DISK_WARNING (written)
# Outputs:   through log_section / log / log_ok / log_warning, plus raw tables
#######################################
check_disk() {
  local row_filter='^Filesystem|tmpfs|cdrom|loop'
  local disk_rows inode_rows
  local fs size used avail pct mount

  log_section "4. 磁盘与Inode使用检查"
  HAS_DISK_WARNING=0

  # One df call per table: the printed rows and the evaluated rows are the same
  # snapshot. -P keeps one line per filesystem; LC_ALL=C keeps the header as
  # "Filesystem" so the filter removes it.
  disk_rows=$(LC_ALL=C df -hP | grep -vE "$row_filter")

  log "磁盘分区使用情况(设备 | 总量 | 已用 | 可用 | 使用率 | 挂载点):"
  # `read` puts the remainder of the line into the last variable, so mount
  # points containing spaces stay intact.
  while read -r fs size used avail pct mount; do
    [[ -n "$fs" ]] || continue
    printf ' %-15s %-6s %-6s %-6s %-5s %s\n' "$fs" "$size" "$used" "$avail" "$pct" "$mount"
  done <<< "$disk_rows" | tee -a "$REPORT_FILE"

  while read -r fs size used avail pct mount; do
    pct=${pct%\%}
    # Skip rows whose usage is not a plain integer instead of failing the test.
    [[ "$pct" =~ ^[0-9]+$ ]] || continue
    if (( pct > DISK_WARNING )); then
      log_warning "磁盘分区 $mount 使用率${pct}%(超阈值${DISK_WARNING}%)"
      log " $mount 分区TOP5占用目录:"
      du -sh "${mount}"/* 2>/dev/null | sort -rh | head -5 \
        | awk '{printf " %-8s %s\n", $1,$2}' | tee -a "$REPORT_FILE"
      HAS_DISK_WARNING=1
    fi
  done <<< "$disk_rows"

  if (( HAS_DISK_WARNING == 0 )); then
    log_ok "所有磁盘分区使用率正常"
  fi

  inode_rows=$(LC_ALL=C df -iP | grep -vE "$row_filter")

  # Blank separator line, then the heading.
  printf '\n' | tee -a "$REPORT_FILE"
  log "Inode使用情况(设备 | 总量 | 已用 | 可用 | 使用率 | 挂载点):"
  while read -r fs size used avail pct mount; do
    [[ -n "$fs" ]] || continue
    printf ' %-15s %-8s %-8s %-8s %-5s %s\n' "$fs" "$size" "$used" "$avail" "$pct" "$mount"
  done <<< "$inode_rows" | tee -a "$REPORT_FILE"

  while read -r fs size used avail pct mount; do
    pct=${pct%\%}
    # df prints "-" for filesystems that do not report inode usage.
    [[ "$pct" =~ ^[0-9]+$ ]] || continue
    if (( pct > INODE_WARNING )); then
      log_warning "分区 $mount Inode使用率${pct}%(超阈值${INODE_WARNING}%)"
    fi
  done <<< "$inode_rows"
}
#######################################
# 5. Network status check (compact).
# Lists interfaces that are not DOWN, then (when netstat is installed) the TCP
# state histogram, the TIME_WAIT count and the listening ports.
# Globals:   TIME_WAIT_WARN, REPORT_FILE (read)
#            TIME_WAIT_COUNT (written)
# Outputs:   through log_section / log / log_warning, plus raw tables
# Returns:   0 when netstat is missing (the TCP part is skipped)
#######################################
check_network() {
  local conn_snapshot

  log_section "5. 网络状态检查"
  log "已启用网络接口(名称 | 状态 | IP):"
  ip -br addr | grep -v DOWN | awk '{printf " %-8s %-6s %s\n", $1,$2,$3}' | tee -a "$REPORT_FILE"

  if ! command -v netstat &> /dev/null; then
    log_warning "未安装netstat,跳过TCP连接与监听端口检查(可执行yum install net-tools安装)"
    return 0
  fi

  # One snapshot feeds both the histogram and the TIME_WAIT count, so the two
  # figures agree with each other.
  conn_snapshot=$(netstat -an)

  printf '\n' | tee -a "$REPORT_FILE"
  log "TCP连接状态统计:"
  awk '/^tcp/ {print $6}' <<< "$conn_snapshot" | sort | uniq -c | sort -rn \
    | awk '{printf " %-5s %s\n", $1,$2}' | tee -a "$REPORT_FILE"

  TIME_WAIT_COUNT=$(grep -c TIME_WAIT <<< "$conn_snapshot")
  log "TIME_WAIT连接数: $TIME_WAIT_COUNT"
  if (( TIME_WAIT_COUNT > TIME_WAIT_WARN )); then
    log_warning "TIME_WAIT连接数超阈值(${TIME_WAIT_WARN}),可优化TCP参数"
  fi

  printf '\n' | tee -a "$REPORT_FILE"
  log "关键监听端口(协议 | 本地地址:端口 | 状态):"
  netstat -tuln | grep LISTEN | awk '{printf " %-6s %-20s %s\n", $1,$4,$6}' | tee -a "$REPORT_FILE"
}
#######################################
# 6. Process and service check (compact).
# Counts processes and zombies from one header-less ps snapshot, lists zombies
# with their real parent PID, and reports critical services that are not
# running.
# Globals:   REPORT_FILE (read)
#            ZOMBIE_COUNT, CRITICAL_SERVICES (written)
# Outputs:   through log_section / log / log_ok / log_warning / log_error
#######################################
check_processes() {
  local proc_table proc_total
  local service unit running
  local -a candidates
  local all_ok=1

  log_section "6. 进程和服务检查"

  # "name=" suppresses the header, so the line count is the process count.
  # Columns: PID, parent PID, state, command name.
  proc_table=$(ps -eo pid=,ppid=,stat=,comm=)
  proc_total=$(awk 'NF { n++ } END { print n + 0 }' <<< "$proc_table")
  log "当前进程总数: $proc_total"

  # A zombie is a process whose STATE starts with Z; matching the state column
  # avoids false hits on user names or commands that contain a "Z".
  ZOMBIE_COUNT=$(awk '$3 ~ /^Z/ { n++ } END { print n + 0 }' <<< "$proc_table")
  log "僵尸进程数: $ZOMBIE_COUNT"
  if (( ZOMBIE_COUNT > 0 )); then
    log_warning "发现${ZOMBIE_COUNT}个僵尸进程(PID | 父PID | 命令):"
    awk '$3 ~ /^Z/ {
      cmd = $4
      for (i = 5; i <= NF; i++) cmd = cmd " " $i
      printf " %-8s %-8s %s\n", $1, $2, cmd
    }' <<< "$proc_table" | tee -a "$REPORT_FILE"
  else
    log_ok "无僵尸进程"
  fi

  printf '\n' | tee -a "$REPORT_FILE"
  log "关键服务状态(只显异常):"
  CRITICAL_SERVICES=("sshd" "crond" "rsyslog")
  for service in "${CRITICAL_SERVICES[@]}"; do
    # The same service has different unit / process names across distributions
    # (RedHat: sshd, crond; Debian/Ubuntu: ssh, cron). Accept any of them.
    case "$service" in
      sshd) candidates=(sshd ssh) ;;
      crond) candidates=(crond cron) ;;
      rsyslog) candidates=(rsyslog rsyslogd) ;;
      *) candidates=("$service") ;;
    esac

    running=0
    for unit in "${candidates[@]}"; do
      if systemctl is-active --quiet "$unit" 2>/dev/null \
        || pgrep -x "$unit" > /dev/null 2>&1; then
        running=1
        break
      fi
    done

    if (( running == 0 )); then
      log_error " $service: 未运行(需手动启动)"
      all_ok=0
    fi
  done

  if (( all_ok == 1 )); then
    log_ok "所有关键服务均正常运行"
  fi
}
#######################################
# 7. System log check (compact).
# Shows up to ten recent error lines (from the syslog file when it exists,
# otherwise from journalctl at priority "err") and any kernel OOM messages.
# Globals:   REPORT_FILE (read)
#            SYSLOG_FILE (read, optional; defaults to /var/log/messages)
#            OOM_COUNT (written)
# Outputs:   through log_section / log / log_ok / log_warning, plus raw lines
#######################################
check_logs() {
  local syslog_file="${SYSLOG_FILE:-/var/log/messages}"
  local errors oom_lines

  log_section "7. 系统日志检查"
  log "最近10条系统错误日志(含error/fail/critical):"
  if [[ -f "$syslog_file" ]]; then
    errors=$(grep -iE 'error|fail|critical' "$syslog_file" 2>/dev/null | tail -10)
  else
    # -n 10 lets journalctl stop at ten entries instead of dumping the whole
    # journal into tail. Lines starting with "-- " are journalctl's own markers
    # ("-- No entries --", "-- Boot … --"), not log entries, and must not count
    # as errors.
    errors=$(journalctl -p err -n 10 --no-pager 2>/dev/null | grep -v '^-- ')
  fi

  if [[ -n "$errors" ]]; then
    # Indent every line, not only the first one.
    sed 's/^/ /' <<< "$errors" | tee -a "$REPORT_FILE"
    log_warning "发现系统错误日志,需关注"
  else
    log_ok "无严重系统错误日志"
  fi

  printf '\n' | tee -a "$REPORT_FILE"
  log "OOM(内存溢出)事件检查:"
  # Read the kernel ring buffer once so the count and the listing agree.
  oom_lines=$(dmesg 2>/dev/null | grep -i "out of memory")
  if [[ -n "$oom_lines" ]]; then
    OOM_COUNT=$(grep -c '' <<< "$oom_lines")
    log_warning "发现${OOM_COUNT}次OOM事件(最近5条):"
    tail -5 <<< "$oom_lines" | awk '{printf " %s\n", $0}' | tee -a "$REPORT_FILE"
  else
    OOM_COUNT=0
    log_ok "无OOM事件"
  fi
}
#######################################
# 8. Report summary (compact).
# Counts the [WARNING] and [ERROR] lines already written to the report file
# and logs an overall verdict.
# Globals:   REPORT_FILE (read)
#            WARNING_COUNT, ERROR_COUNT (written)
# Outputs:   through log_section / log / log_ok / log_warning / log_error
#######################################
generate_summary() {
  local counts_known=1

  log_section "8. 巡检报告摘要"

  # Count before this function logs its own verdict, so the verdict line is
  # not counted.
  WARNING_COUNT=""
  ERROR_COUNT=""
  if [[ -r "$REPORT_FILE" ]]; then
    WARNING_COUNT=$(grep -c "\[WARNING\]" "$REPORT_FILE" 2>/dev/null)
    ERROR_COUNT=$(grep -c "\[ERROR\]" "$REPORT_FILE" 2>/dev/null)
  fi
  if [[ ! "$WARNING_COUNT" =~ ^[0-9]+$ || ! "$ERROR_COUNT" =~ ^[0-9]+$ ]]; then
    counts_known=0
  fi

  log "巡检完成时间: $(date +"%Y-%m-%d %H:%M:%S")"

  if (( counts_known == 0 )); then
    # Without a readable report the counts are unknown; say so rather than
    # reporting a healthy system.
    log_warning "无法读取报告文件 ${REPORT_FILE},告警/错误数量未知"
  else
    log "告警数量: $WARNING_COUNT | 错误数量: $ERROR_COUNT"
    if (( ERROR_COUNT > 0 )); then
      log_error "存在${ERROR_COUNT}个严重问题,需立即处理!"
    elif (( WARNING_COUNT > 0 )); then
      log_warning "存在${WARNING_COUNT}个告警,建议及时关注"
    else
      log_ok "系统状态良好,无异常"
    fi
  fi

  log "完整报告路径: $REPORT_FILE"
}
#######################################
# Main routine: prints the banner, warns when not running as root, runs every
# check in order, then prints the closing banner with the report path.
# Globals:   REPORT_FILE (read)
# Outputs:   banners on stdout; everything else through the check functions
#######################################
main() {
  local rule="=========================================="
  local step

  printf '%s\n %s\n%s\n\n' "$rule" "服务器健康状态巡检脚本 v2.2" "$rule"

  if [[ "$(id -u)" -ne 0 ]]; then
    log_warning "非root用户运行,部分检查可能受限(建议使用root执行)"
  fi

  # The checks run in report order; the summary must come last because it
  # counts what the earlier steps logged.
  for step in \
    check_basic_info \
    check_cpu \
    check_memory \
    check_disk \
    check_network \
    check_processes \
    check_logs \
    generate_summary; do
    "$step"
  done

  printf '\n%s\n 巡检完成! 结果已保存至 %s\n%s\n' "$rule" "$REPORT_FILE" "$rule"
}
# Script entry point: run main, forwarding all command-line arguments.
main "$@"
执行结果
点击查看代码
[root@k8s-master233 ~]# bash 1.sh
==========================================
服务器健康状态巡检脚本 v2.2
==========================================
============================================================
1. 系统基本信息
===========================================================
[2025-11-03 16:50:54] 主机名: k8s-master233 | 检查时间: 2025-11-03 16:50:54
[2025-11-03 16:50:54] 系统版本: Ubuntu 22.04.4 LTS
[2025-11-03 16:50:54] 内核版本: 5.15.0-153-generic | 架构: x86_64 | 运行时长: 7:34
============================================================
2. CPU使用率与负载检查
===========================================================
[2025-11-03 16:50:55] CPU核心数: 2 | 使用率: 32.0% | 1分钟负载: 1.05(阈值: 4.0)
[OK] CPU使用率正常
============================================================
3. 内存使用检查
===========================================================
[2025-11-03 16:50:55] 内存总量: 3875MB | 已用: 953MB | 可用: 2628MB | 使用率: 24.6%
[OK] 内存使用率正常
============================================================
4. 磁盘与Inode使用检查
===========================================================
[2025-11-03 16:50:55] 磁盘分区使用情况(设备 | 总量 | 已用 | 可用 | 使用率 | 挂载点):
/dev/mapper/ubuntu--vg-ubuntu--lv 48G 14G 32G 31% /
/dev/sda2 2.0G 247M 1.6G 14% /boot
overlay 48G 14G 32G 31% /var/lib/docker/overlay2/1fa1b54e7cc8a7eb436efb18053351a6d4b6fa2495a7c38642b9cbe737d8f8c4/merged
overlay 48G 14G 32G 31% /var/lib/docker/overlay2/6a08d0c81618d55b31e5d4ab53216b1e9972e3fb2e864574dbfcae9fef91b958/merged
overlay 48G 14G 32G 31% /var/lib/docker/overlay2/c7cf53e68ccaadb12f3219656340a2de575e1ce70443f2892515da84cce13dc0/merged
overlay 48G 14G 32G 31% /var/lib/docker/overlay2/efdf1427f3acf7503e9620fd61d9dea9afd5f13e83593670859fa05dd638e125/merged
overlay 48G 14G 32G 31% /var/lib/docker/overlay2/ff8f975d1e955fadfd9ba6f54249163c9ff893655863ecb421b9614169c56ad0/merged
shm 64M 0 64M 0% /var/lib/docker/containers/fbcf39d14513cd97cbd41c0deb5bfa1c1012d24abe43de7762a1e69fc8e052a4/mounts/shm
shm 64M 0 64M 0% /var/lib/docker/containers/8c762c28085202319457188150f73167351bd6f81976aac95937c83ccf6b9edd/mounts/shm
shm 64M 0 64M 0% /var/lib/docker/containers/2113afc79191840f6ddce8b7f5ab0546abb87259d14b2edf57d4c935cc9acca0/mounts/shm
shm 64M 0 64M 0% /var/lib/docker/containers/b9efbb0d21c068207ee60027561c000615332dc11cc93ca9d6e2af048e85b0d1/mounts/shm
overlay 48G 14G 32G 31% /var/lib/docker/overlay2/d3c8eddaec6517ec34044c018e6c4a41f13b353e040ab699423d3c1724f6b77b/merged
overlay 48G 14G 32G 31% /var/lib/docker/overlay2/c9234b66b457f79afde80ed9edd9993256ffb1cc946e341b43cfbe46bb874b12/merged
[OK] 所有磁盘分区使用率正常
[2025-11-03 16:50:55] -e
/dev/mapper/ubuntu--vg-ubuntu--lv 3211264 157214 3054050 5% /
/dev/sda2 131072 320 130752 1% /boot
overlay 3211264 157214 3054050 5% /var/lib/docker/overlay2/1fa1b54e7cc8a7eb436efb18053351a6d4b6fa2495a7c38642b9cbe737d8f8c4/merged
overlay 3211264 157214 3054050 5% /var/lib/docker/overlay2/6a08d0c81618d55b31e5d4ab53216b1e9972e3fb2e864574dbfcae9fef91b958/merged
overlay 3211264 157214 3054050 5% /var/lib/docker/overlay2/c7cf53e68ccaadb12f3219656340a2de575e1ce70443f2892515da84cce13dc0/merged
overlay 3211264 157214 3054050 5% /var/lib/docker/overlay2/efdf1427f3acf7503e9620fd61d9dea9afd5f13e83593670859fa05dd638e125/merged
overlay 3211264 157214 3054050 5% /var/lib/docker/overlay2/ff8f975d1e955fadfd9ba6f54249163c9ff893655863ecb421b9614169c56ad0/merged
shm 496106 1 496105 1% /var/lib/docker/containers/fbcf39d14513cd97cbd41c0deb5bfa1c1012d24abe43de7762a1e69fc8e052a4/mounts/shm
shm 496106 1 496105 1% /var/lib/docker/containers/8c762c28085202319457188150f73167351bd6f81976aac95937c83ccf6b9edd/mounts/shm
shm 496106 1 496105 1% /var/lib/docker/containers/2113afc79191840f6ddce8b7f5ab0546abb87259d14b2edf57d4c935cc9acca0/mounts/shm
shm 496106 1 496105 1% /var/lib/docker/containers/b9efbb0d21c068207ee60027561c000615332dc11cc93ca9d6e2af048e85b0d1/mounts/shm
overlay 3211264 157214 3054050 5% /var/lib/docker/overlay2/d3c8eddaec6517ec34044c018e6c4a41f13b353e040ab699423d3c1724f6b77b/merged
overlay 3211264 157214 3054050 5% /var/lib/docker/overlay2/c9234b66b457f79afde80ed9edd9993256ffb1cc946e341b43cfbe46bb874b12/merged
============================================================
5. 网络状态检查
===========================================================
[2025-11-03 16:50:55] 已启用网络接口(名称 | 状态 | IP):
lo UNKNOWN 127.0.0.1/8
eth0 UP 10.0.0.233/24
docker0 UP 172.17.0.1/16
vetha68cbca@if4 UP fe80::e06f:94ff:fe14:75db/64
[2025-11-03 16:50:55] -e
25520 CLOSE_WAIT
25382 FIN_WAIT2
7006 TIME_WAIT
700 SYN_SENT
105 ESTABLISHED
25 LAST_ACK
14 LISTEN
8 FIN_WAIT1
[2025-11-03 16:50:56] TIME_WAIT连接数: 6933
[WARNING] TIME_WAIT连接数超阈值(5000),可优化TCP参数
[2025-11-03 16:50:56] -e
tcp 0.0.0.0:6443 LISTEN
tcp 127.0.0.1:6010 LISTEN
tcp 0.0.0.0:22 LISTEN
tcp 127.0.0.1:10248 LISTEN
tcp 127.0.0.1:10259 LISTEN
tcp 127.0.0.1:10257 LISTEN
tcp 0.0.0.0:8080 LISTEN
tcp 127.0.0.1:35417 LISTEN
tcp 127.0.0.53:53 LISTEN
tcp6 :::9100 LISTEN
tcp6 ::1:6010 LISTEN
tcp6 :::10250 LISTEN
tcp6 :::22 LISTEN
tcp6 :::8080 LISTEN
============================================================
6. 进程和服务检查
===========================================================
[2025-11-03 16:50:57] 当前进程总数: 244
[2025-11-03 16:50:57] 僵尸进程数: 0
[OK] 无僵尸进程
[2025-11-03 16:50:57] -e
[ERROR] crond: 未运行(需手动启动)
============================================================
7. 系统日志检查
===========================================================
[2025-11-03 16:50:57] 最近10条系统错误日志(含error/fail/critical):
-- Boot 9618308e2ec34567841ca66817885a79 --
Sep 10 14:59:51 k8s-master233 kernel: piix4_smbus 0000:00:07.3: SMBus Host Controller not enabled!
Sep 10 14:59:51 k8s-master233 kernel: sd 32:0:0:0: [sda] Assuming drive cache: write through
Sep 10 15:00:08 k8s-master233 kernel: hub 2-2:1.0: hub_ext_port_status failed (err = -110)
Sep 10 19:56:06 k8s-master233 systemd[1]: Failed to start Refresh fwupd metadata and update motd.
Sep 13 21:21:47 k8s-master233 kernel: e1000 0000:02:01.0 eth0: Reset adapter
-- Boot be599bd54e694d1faf331b6a6118b460 --
Nov 03 09:16:48 k8s-master233 kernel: piix4_smbus 0000:00:07.3: SMBus Host Controller not enabled!
Nov 03 09:16:48 k8s-master233 kernel: sd 32:0:0:0: [sda] Assuming drive cache: write through
Nov 03 15:30:45 k8s-master233 kernel: e1000 0000:02:01.0 eth0: Reset adapter
[WARNING] 发现系统错误日志,需关注
[2025-11-03 16:50:57] -e
[OK] 无OOM事件
============================================================
8. 巡检报告摘要
===========================================================
[2025-11-03 16:50:57] 巡检完成时间: 2025-11-03 16:50:57
[2025-11-03 16:50:57] 告警数量: 2 | 错误数量: 1
[ERROR] 存在1个严重问题,需立即处理!
[2025-11-03 16:50:57] 完整报告路径: /var/log/system_check_20251103_165054.log
==========================================
巡检完成! 结果已保存至 /var/log/system_check_20251103_165054.log
==========================================

浙公网安备 33010602011771号