系统巡检脚本

点击查看代码
#!/bin/bash
################################################################################
# 脚本名称: system_health_check.sh
# 功能描述: 系统健康状态全面检查(输出精简版)
# 版本信息: v2.2(修复语法错误+完整输出)
################################################################################

# -------------------------- Configuration (review before first use) --------------------------
# ANSI colour sequences used by the log_* helpers.
NC='\033[0m'          # reset colour
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'

# Alert thresholds.
CPU_WARNING=80        # CPU usage (%)
MEM_WARNING=85        # memory usage (%)
DISK_WARNING=85       # disk usage (%)
INODE_WARNING=80      # inode usage (%)
TIME_WAIT_WARN=5000   # number of TIME_WAIT connections

# Host name shown in the report and the per-run report path
# (the timestamp gives every run its own file).
HOSTNAME="$(hostname)"
REPORT_FILE="/var/log/system_check_$(date '+%Y%m%d_%H%M%S').log"
# ---------------------------------------------------------------------------------------------

# Logging helpers (printf is used so output does not depend on echo's -e handling).

#######################################
# Print a timestamped message to stdout and append it to the report file.
# Globals:   REPORT_FILE (read; appended to through tee)
# Arguments: [-e] message
#   With a leading echo-style -e flag, backslash escapes in the message are
#   expanded and any leading newlines are emitted before the timestamp, so
#   the stamped line itself stays on one line.
#######################################
log() {
    local msg=${1-} lead=""
    # Treat -e as a flag only when a message follows it, so a literal "-e"
    # passed as the sole argument is still logged verbatim.
    if [ "$#" -ge 2 ] && [ "$msg" = "-e" ]; then
        printf -v msg '%b' "$2"
        while [[ "$msg" == $'\n'* ]]; do
            lead+=$'\n'
            msg=${msg#$'\n'}
        done
    fi
    printf "%s[%s] %s\n" "$lead" "$(date +"%Y-%m-%d %H:%M:%S")" "$msg" | tee -a "$REPORT_FILE"
}
# Print a section banner (blank line, rule, indented title, rule) to stdout
# and append the same text to REPORT_FILE.
# Arguments: $1 - section title
log_section() {
    local title=$1
    {
        printf "\n============================================================\n"
        printf " %s\n" "$title"
        printf "===========================================================\n"
    } | tee -a "$REPORT_FILE"
}
# Print "[WARNING] <message>" wrapped in the YELLOW/NC colour codes to stdout
# and append it to REPORT_FILE.
# Arguments: $1 - message text
log_warning() {
    local fmt="${YELLOW}[WARNING] %s${NC}\n"
    printf "$fmt" "$1" | tee -a "$REPORT_FILE"
}
# Print "[ERROR] <message>" wrapped in the RED/NC colour codes to stdout
# and append it to REPORT_FILE.
# Arguments: $1 - message text
log_error() {
    local fmt="${RED}[ERROR] %s${NC}\n"
    printf "$fmt" "$1" | tee -a "$REPORT_FILE"
}
# Print "[OK] <message>" wrapped in the GREEN/NC colour codes to stdout
# and append it to REPORT_FILE.
# Arguments: $1 - message text
log_ok() {
    local fmt="${GREEN}[OK] %s${NC}\n"
    printf "$fmt" "$1" | tee -a "$REPORT_FILE"
}

# 1. Basic system information (condensed)
# Logs host name, check time, OS release, kernel version, architecture and
# uptime. Reads the HOSTNAME global; writes through log_section/log.
check_basic_info() {
    local now os_release kernel arch up_for

    log_section "1. 系统基本信息"

    now=$(date +"%Y-%m-%d %H:%M:%S")
    log "主机名: $HOSTNAME | 检查时间: ${now}"

    # Prefer the Red Hat release file; otherwise use the first three words
    # of the first line of /etc/issue.
    os_release=$(cat /etc/redhat-release 2>/dev/null || cat /etc/issue | head -1 | awk '{print $1,$2,$3}')
    log "系统版本: ${os_release}"

    kernel=$(uname -r)
    arch=$(uname -m)
    # Text between "up " and the first comma of the uptime line.
    up_for=$(uptime | awk -F'up ' '{print $2}' | awk -F',' '{print $1}')
    log "内核版本: ${kernel} | 架构: ${arch} | 运行时长: ${up_for}"
}

# 2. CPU check (condensed)
#######################################
# Report core count, CPU usage and 1-minute load average; warn when usage
# exceeds CPU_WARNING or when the load exceeds twice the core count.
# Globals:   CPU_WARNING, REPORT_FILE (read);
#            CPU_CORES, CPU_IDLE, CPU_USAGE, LOAD_1, LOAD_THRESHOLD (written)
# Outputs:   report lines via log/log_ok/log_warning; top-5 table via tee
#######################################
check_cpu() {
    log_section "2. CPU使用率与负载检查"
    CPU_CORES=$(grep -c ^processor /proc/cpuinfo)
    # Use the last of two top samples and find the idle figure by its "id"
    # label rather than by column number: once a value reaches 100 top prints
    # "ni,100.0 id" without a space, which shifts whitespace-split columns.
    # LC_ALL=C keeps '.' as the decimal separator.
    CPU_IDLE=$(LC_ALL=C top -bn2 -d 1 | grep "Cpu(s)" | tail -1 \
        | awk -F'[ ,%]+' '{ for (i = 2; i <= NF; i++) if ($i == "id") { print $(i - 1); exit } }')
    LOAD_1=$(uptime | awk -F'load average:' '{print $2}' | awk -F',' '{print $1}' | xargs)
    LOAD_THRESHOLD=$(printf "%.1f" "$(echo "$CPU_CORES * 2" | bc -l)")

    if [[ "$CPU_IDLE" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
        CPU_USAGE=$(printf "%.1f" "$(echo "100 - $CPU_IDLE" | bc -l)")
        log "CPU核心数: $CPU_CORES | 使用率: ${CPU_USAGE}% | 1分钟负载: ${LOAD_1}(阈值: ${LOAD_THRESHOLD})"

        if (( $(echo "$CPU_USAGE > $CPU_WARNING" | bc -l) )); then
            log_warning "CPU使用率超阈值(${CPU_WARNING}%),当前${CPU_USAGE}%"
            log "TOP 5 CPU消耗进程(PID | 用户 | 使用率 | 命令):"
            ps aux | sort -rn -k3 | head -5 | awk '{printf "  %-8s %-10s %-5s %s\n", $2,$1,$3,$11}' | tee -a "$REPORT_FILE"
        else
            log_ok "CPU使用率正常"
        fi
    else
        # Never feed an unparsable idle value to bc; report and skip the
        # usage comparison, but still run the load check below.
        CPU_USAGE=""
        log_warning "无法解析CPU空闲率,跳过CPU使用率检查"
        log "CPU核心数: $CPU_CORES | 1分钟负载: ${LOAD_1}(阈值: ${LOAD_THRESHOLD})"
    fi

    if (( $(echo "$LOAD_1 > $LOAD_THRESHOLD" | bc -l) )); then
        log_warning "系统负载超阈值,1分钟负载${LOAD_1}(阈值${LOAD_THRESHOLD})"
    fi
}

# 3. Memory check (condensed)
#######################################
# Report memory and swap usage; warn when memory usage exceeds MEM_WARNING
# or when more than 100 MB of swap is in use.
# Globals:   MEM_WARNING, REPORT_FILE (read);
#            MEM_TOTAL, MEM_USED, MEM_AVAILABLE, MEM_USAGE,
#            SWAP_TOTAL, SWAP_USED (written)
# Returns:   1 when no usable memory total could be read, else 0
#######################################
check_memory() {
    log_section "3. 内存使用检查"

    # Take ONE `free` snapshot so all figures describe the same instant, and
    # select rows by their "Mem:"/"Swap:" labels instead of by row number
    # (some layouts insert a "-/+ buffers/cache" row before Swap).
    # LC_ALL=C keeps the labels untranslated. "+ 0" turns missing fields
    # into 0 so the integer tests below never see an empty string.
    # NOTE(review): column 7 of the Mem row is reported as "available";
    # this assumes the procps-ng column layout - confirm on older hosts.
    read -r MEM_TOTAL MEM_USED MEM_AVAILABLE SWAP_TOTAL SWAP_USED < <(
        LC_ALL=C free -m | awk '
            /^Mem:/  { total = $2; used = $3; avail = $7 }
            /^Swap:/ { swap_total = $2; swap_used = $3 }
            END      { print total + 0, used + 0, avail + 0, swap_total + 0, swap_used + 0 }'
    )

    # Without a positive total the percentage would divide by zero.
    if ! [[ "$MEM_TOTAL" =~ ^[0-9]+$ ]] || [ "$MEM_TOTAL" -eq 0 ]; then
        log_warning "无法获取内存信息,跳过内存检查"
        return 1
    fi
    MEM_USAGE=$(printf "%.1f" "$(echo "$MEM_USED / $MEM_TOTAL * 100" | bc -l)")

    log "内存总量: ${MEM_TOTAL}MB | 已用: ${MEM_USED}MB | 可用: ${MEM_AVAILABLE}MB | 使用率: ${MEM_USAGE}%"
    if [ "$SWAP_TOTAL" -gt 0 ]; then
        log "Swap总量: ${SWAP_TOTAL}MB | 已用: ${SWAP_USED}MB"
    fi

    if (( $(echo "$MEM_USAGE > $MEM_WARNING" | bc -l) )); then
        log_warning "内存使用率超阈值(${MEM_WARNING}%),当前${MEM_USAGE}%"
        log "TOP 5 内存消耗进程(PID | 用户 | 使用率 | 命令):"
        ps aux | sort -rn -k4 | head -5 | awk '{printf "  %-8s %-10s %-5s %s\n", $2,$1,$4,$11}' | tee -a "$REPORT_FILE"
    else
        log_ok "内存使用率正常"
    fi
    if [ "$SWAP_TOTAL" -gt 0 ] && [ "$SWAP_USED" -gt 100 ]; then
        log_warning "Swap使用量较高(${SWAP_USED}MB),可能存在内存压力"
    fi
}

# 4. Disk check (condensed)
#######################################
# Print space and inode usage tables and warn for every filesystem whose
# usage exceeds DISK_WARNING / INODE_WARNING. For an over-threshold
# filesystem the five largest top-level entries under its mount point are
# listed as well.
# Globals:   DISK_WARNING, INODE_WARNING, REPORT_FILE (read);
#            HAS_DISK_WARNING, USAGE, INODE_USAGE, MOUNT (written)
#######################################
check_disk() {
    log_section "4. 磁盘与Inode使用检查"
    HAS_DISK_WARNING=0
    # Rows dropped from both tables: the header plus any line mentioning
    # tmpfs, cdrom or loop.
    local -r skip_pattern='^Filesystem|tmpfs|cdrom|loop'
    local space_rows inode_rows

    log "磁盘分区使用情况(设备 | 总量 | 已用 | 可用 | 使用率 | 挂载点):"
    # LC_ALL=C keeps the header in English so the ^Filesystem filter matches
    # under any locale; -P keeps every filesystem on a single line. The
    # snapshot is reused for the table and the threshold loop.
    space_rows=$(LC_ALL=C df -hP | grep -vE "$skip_pattern")
    printf '%s\n' "$space_rows" \
        | awk 'NF {printf "  %-15s %-6s %-6s %-6s %-5s %s\n", $1,$2,$3,$4,$5,$6}' \
        | tee -a "$REPORT_FILE"

    # The last read variable receives the rest of the line, so a mount point
    # containing spaces stays intact.
    while read -r _ _ _ _ USAGE MOUNT; do
        USAGE=${USAGE%\%}
        # Skip rows whose usage is not a plain integer (e.g. "-").
        [[ "$USAGE" =~ ^[0-9]+$ ]] || continue
        if [ "$USAGE" -gt "$DISK_WARNING" ]; then
            log_warning "磁盘分区 $MOUNT 使用率${USAGE}%(超阈值${DISK_WARNING}%)"
            log "  $MOUNT 分区TOP5占用目录:"
            du -sh "${MOUNT}"/* 2>/dev/null | sort -rh | head -5 | awk '{printf "    %-8s %s\n", $1,$2}' | tee -a "$REPORT_FILE"
            HAS_DISK_WARNING=1
        fi
    done <<<"$space_rows"
    if [ "$HAS_DISK_WARNING" -eq 0 ]; then
        log_ok "所有磁盘分区使用率正常"
    fi

    # Blank separator line, then the heading (the heading used to be passed
    # behind an echo-style -e flag, which made the logger print "-e" instead).
    printf "\n" | tee -a "$REPORT_FILE"
    log "Inode使用情况(设备 | 总量 | 已用 | 可用 | 使用率 | 挂载点):"
    inode_rows=$(LC_ALL=C df -iP | grep -vE "$skip_pattern")
    printf '%s\n' "$inode_rows" \
        | awk 'NF {printf "  %-15s %-8s %-8s %-8s %-5s %s\n", $1,$2,$3,$4,$5,$6}' \
        | tee -a "$REPORT_FILE"

    while read -r _ _ _ _ INODE_USAGE MOUNT; do
        INODE_USAGE=${INODE_USAGE%\%}
        # df -i prints "-" where no inode figure is available; skip those.
        [[ "$INODE_USAGE" =~ ^[0-9]+$ ]] || continue
        if [ "$INODE_USAGE" -gt "$INODE_WARNING" ]; then
            log_warning "分区 $MOUNT Inode使用率${INODE_USAGE}%(超阈值${INODE_WARNING}%)"
        fi
    done <<<"$inode_rows"
}

# 5. Network check (condensed)
#######################################
# List interfaces that are not DOWN, summarise TCP connection states, warn
# when the TIME_WAIT count exceeds TIME_WAIT_WARN, and list listening ports.
# The TCP parts are skipped with a warning when netstat is not installed.
# Globals:   TIME_WAIT_WARN, REPORT_FILE (read); TIME_WAIT_COUNT (written)
#######################################
check_network() {
    log_section "5. 网络状态检查"
    log "已启用网络接口(名称 | 状态 | IP):"
    ip -br addr | grep -v DOWN | awk '{printf "  %-8s %-6s %s\n", $1,$2,$3}' | tee -a "$REPORT_FILE"

    if ! command -v netstat &> /dev/null; then
        log_warning "未安装netstat,跳过TCP连接与监听端口检查(可执行yum install net-tools安装)"
        return
    fi

    # One snapshot of the TCP state column feeds both the per-state table and
    # the TIME_WAIT count, so the two figures always agree.
    local tcp_states
    tcp_states=$(netstat -an | awk '/^tcp/ {print $6}')

    # Blank separator line, then the heading (the heading used to be passed
    # behind an echo-style -e flag, which made the logger print "-e" instead).
    printf "\n" | tee -a "$REPORT_FILE"
    log "TCP连接状态统计:"
    printf '%s\n' "$tcp_states" | awk 'NF' | sort | uniq -c | sort -rn \
        | awk '{printf "  %-5s %s\n", $1,$2}' | tee -a "$REPORT_FILE"

    # grep -c still prints 0 when nothing matches.
    TIME_WAIT_COUNT=$(printf '%s\n' "$tcp_states" | grep -c '^TIME_WAIT$')
    log "TIME_WAIT连接数: $TIME_WAIT_COUNT"
    if [ "$TIME_WAIT_COUNT" -gt "$TIME_WAIT_WARN" ]; then
        log_warning "TIME_WAIT连接数超阈值(${TIME_WAIT_WARN}),可优化TCP参数"
    fi

    printf "\n" | tee -a "$REPORT_FILE"
    log "关键监听端口(协议 | 本地地址:端口 | 状态):"
    netstat -tuln | grep LISTEN | awk '{printf "  %-6s %-20s %s\n", $1,$4,$6}' | tee -a "$REPORT_FILE"
}

# 6. Process and service check (condensed)
#######################################
# Report the process count, list zombie processes, and report every critical
# service that is not running.
# Globals:   REPORT_FILE (read); ZOMBIE_COUNT, CRITICAL_SERVICES (written)
#######################################
check_processes() {
    log_section "6. 进程和服务检查"
    local ps_rows proc_total service name running
    local -a candidates

    # One header-less snapshot (an empty "=" title suppresses the header) with
    # exactly the columns needed below: PID, parent PID, state, command name.
    ps_rows=$(ps -eo pid=,ppid=,stat=,comm=)
    proc_total=$(awk 'NF { n++ } END { print n + 0 }' <<<"$ps_rows")
    log "当前进程总数: $proc_total"

    # Count by the state column only, so a "Z" elsewhere on the line (user or
    # command name) is never mistaken for a zombie.
    ZOMBIE_COUNT=$(awk '$3 ~ /^Z/ { n++ } END { print n + 0 }' <<<"$ps_rows")
    log "僵尸进程数: $ZOMBIE_COUNT"
    if [ "$ZOMBIE_COUNT" -gt 0 ]; then
        log_warning "发现${ZOMBIE_COUNT}个僵尸进程(PID | 父PID | 命令):"
        awk '$3 ~ /^Z/ {printf "  %-8s %-8s %s\n", $1,$2,$4}' <<<"$ps_rows" | tee -a "$REPORT_FILE"
    else
        log_ok "无僵尸进程"
    fi

    # Blank separator line, then the heading (the heading used to be passed
    # behind an echo-style -e flag, which made the logger print "-e" instead).
    printf "\n" | tee -a "$REPORT_FILE"
    log "关键服务状态(只显异常):"
    CRITICAL_SERVICES=("sshd" "crond" "rsyslog")
    local all_ok=1
    for service in "${CRITICAL_SERVICES[@]}"; do
        # The same daemon is named differently across distributions
        # (e.g. the cron unit is "cron" on the Ubuntu host in the sample run),
        # so accept any of the known names.
        case "$service" in
            sshd)    candidates=(sshd ssh) ;;
            crond)   candidates=(crond cron) ;;
            rsyslog) candidates=(rsyslog rsyslogd) ;;
            *)       candidates=("$service") ;;
        esac
        running=0
        for name in "${candidates[@]}"; do
            # systemd first; otherwise look for a process with that exact name.
            if systemctl is-active --quiet "$name" 2>/dev/null \
                || awk -v want="$name" '$4 == want { found = 1 } END { exit !found }' <<<"$ps_rows"; then
                running=1
                break
            fi
        done
        if [ "$running" -eq 0 ]; then
            log_error "  $service: 未运行(需手动启动)"
            all_ok=0
        fi
    done
    if [ "$all_ok" -eq 1 ]; then
        log_ok "所有关键服务均正常运行"
    fi
}

# 7. System log check (condensed)
#######################################
# Show the last ten error-like log lines (from /var/log/messages when that
# file exists, otherwise from journalctl) and report kernel "out of memory"
# messages found in dmesg.
# Globals:   REPORT_FILE (read); errors, OOM_COUNT (written)
#######################################
check_logs() {
    log_section "7. 系统日志检查"
    local oom_lines

    log "最近10条系统错误日志(含error/fail/critical):"
    if [ -f /var/log/messages ]; then
        errors=$(grep -i "error\|fail\|critical" /var/log/messages | tail -10)
    else
        errors=$(journalctl -p err --no-pager | tail -10)
    fi
    if [ -n "$errors" ]; then
        # Indent every line; a single printf "  %s\n" only indents the first
        # line of a multi-line value.
        printf '%s\n' "$errors" | sed 's/^/  /' | tee -a "$REPORT_FILE"
        log_warning "发现系统错误日志,需关注"
    else
        log_ok "无严重系统错误日志"
    fi

    # Blank separator line, then the heading (the heading used to be passed
    # behind an echo-style -e flag, which made the logger print "-e" instead).
    printf "\n" | tee -a "$REPORT_FILE"
    log "OOM(内存溢出)事件检查:"
    # One dmesg pass serves both the count and the listing.
    oom_lines=$(dmesg 2>/dev/null | grep -i "out of memory")
    OOM_COUNT=$(printf '%s' "$oom_lines" | grep -c .)
    if [ "$OOM_COUNT" -gt 0 ]; then
        log_warning "发现${OOM_COUNT}次OOM事件(最近5条):"
        printf '%s\n' "$oom_lines" | tail -5 | awk '{printf "  %s\n", $0}' | tee -a "$REPORT_FILE"
    else
        log_ok "无OOM事件"
    fi
}

# 8. Report summary (condensed)
#######################################
# Count the [WARNING] and [ERROR] lines written to the report so far and log
# an overall verdict.
# Globals:   REPORT_FILE (read); WARNING_COUNT, ERROR_COUNT (written)
# Returns:   1 when the report file cannot be read (no verdict is given),
#            otherwise 0
#######################################
generate_summary() {
    log_section "8. 巡检报告摘要"
    # grep prints nothing when the file is missing; default to 0 so the
    # integer tests below never receive an empty string.
    WARNING_COUNT=$(grep -c "\[WARNING\]" "$REPORT_FILE" 2>/dev/null)
    ERROR_COUNT=$(grep -c "\[ERROR\]" "$REPORT_FILE" 2>/dev/null)
    WARNING_COUNT=${WARNING_COUNT:-0}
    ERROR_COUNT=${ERROR_COUNT:-0}

    log "巡检完成时间: $(date +"%Y-%m-%d %H:%M:%S")"

    # With no readable report nothing can be counted; say so instead of
    # declaring the system healthy on the basis of empty counts.
    if [ ! -r "$REPORT_FILE" ]; then
        log_warning "报告文件不可读,无法统计告警/错误数量: $REPORT_FILE"
        return 1
    fi

    log "告警数量: $WARNING_COUNT | 错误数量: $ERROR_COUNT"

    if [ "$ERROR_COUNT" -gt 0 ]; then
        log_error "存在${ERROR_COUNT}个严重问题,需立即处理!"
    elif [ "$WARNING_COUNT" -gt 0 ]; then
        log_warning "存在${WARNING_COUNT}个告警,建议及时关注"
    else
        log_ok "系统状态良好,无异常"
    fi
    log "完整报告路径: $REPORT_FILE"
}

# Main routine: print the banner, warn when not running as root, run every
# check in order, then print the closing banner with the report path.
main() {
    local uid step

    printf "==========================================\n"
    printf "    服务器健康状态巡检脚本 v2.2\n"
    printf "==========================================\n\n"

    uid=$(id -u)
    if [ "$uid" -ne 0 ]; then
        log_warning "非root用户运行,部分检查可能受限(建议使用root执行)"
    fi

    # The checks run in report-section order; the summary comes last because
    # it counts what the earlier checks wrote to the report.
    for step in \
        check_basic_info \
        check_cpu \
        check_memory \
        check_disk \
        check_network \
        check_processes \
        check_logs \
        generate_summary; do
        "$step"
    done

    printf "\n==========================================\n"
    printf "    巡检完成! 结果已保存至 %s\n" "$REPORT_FILE"
    printf "==========================================\n"
}

# Script entry point: forward all command-line arguments to main.
main "$@"

执行结果

点击查看代码
[root@k8s-master233 ~]# bash 1.sh
==========================================
    服务器健康状态巡检脚本 v2.2
==========================================


============================================================
 1. 系统基本信息
===========================================================
[2025-11-03 16:50:54] 主机名: k8s-master233 | 检查时间: 2025-11-03 16:50:54
[2025-11-03 16:50:54] 系统版本: Ubuntu 22.04.4 LTS
[2025-11-03 16:50:54] 内核版本: 5.15.0-153-generic | 架构: x86_64 | 运行时长:  7:34

============================================================
 2. CPU使用率与负载检查
===========================================================
[2025-11-03 16:50:55] CPU核心数: 2 | 使用率: 32.0% | 1分钟负载: 1.05(阈值: 4.0)
[OK] CPU使用率正常

============================================================
 3. 内存使用检查
===========================================================
[2025-11-03 16:50:55] 内存总量: 3875MB | 已用: 953MB | 可用: 2628MB | 使用率: 24.6%
[OK] 内存使用率正常

============================================================
 4. 磁盘与Inode使用检查
===========================================================
[2025-11-03 16:50:55] 磁盘分区使用情况(设备 | 总量 | 已用 | 可用 | 使用率 | 挂载点):
  /dev/mapper/ubuntu--vg-ubuntu--lv 48G    14G    32G    31%   /
  /dev/sda2       2.0G   247M   1.6G   14%   /boot
  overlay         48G    14G    32G    31%   /var/lib/docker/overlay2/1fa1b54e7cc8a7eb436efb18053351a6d4b6fa2495a7c38642b9cbe737d8f8c4/merged
  overlay         48G    14G    32G    31%   /var/lib/docker/overlay2/6a08d0c81618d55b31e5d4ab53216b1e9972e3fb2e864574dbfcae9fef91b958/merged
  overlay         48G    14G    32G    31%   /var/lib/docker/overlay2/c7cf53e68ccaadb12f3219656340a2de575e1ce70443f2892515da84cce13dc0/merged
  overlay         48G    14G    32G    31%   /var/lib/docker/overlay2/efdf1427f3acf7503e9620fd61d9dea9afd5f13e83593670859fa05dd638e125/merged
  overlay         48G    14G    32G    31%   /var/lib/docker/overlay2/ff8f975d1e955fadfd9ba6f54249163c9ff893655863ecb421b9614169c56ad0/merged
  shm             64M    0      64M    0%    /var/lib/docker/containers/fbcf39d14513cd97cbd41c0deb5bfa1c1012d24abe43de7762a1e69fc8e052a4/mounts/shm
  shm             64M    0      64M    0%    /var/lib/docker/containers/8c762c28085202319457188150f73167351bd6f81976aac95937c83ccf6b9edd/mounts/shm
  shm             64M    0      64M    0%    /var/lib/docker/containers/2113afc79191840f6ddce8b7f5ab0546abb87259d14b2edf57d4c935cc9acca0/mounts/shm
  shm             64M    0      64M    0%    /var/lib/docker/containers/b9efbb0d21c068207ee60027561c000615332dc11cc93ca9d6e2af048e85b0d1/mounts/shm
  overlay         48G    14G    32G    31%   /var/lib/docker/overlay2/d3c8eddaec6517ec34044c018e6c4a41f13b353e040ab699423d3c1724f6b77b/merged
  overlay         48G    14G    32G    31%   /var/lib/docker/overlay2/c9234b66b457f79afde80ed9edd9993256ffb1cc946e341b43cfbe46bb874b12/merged
[OK] 所有磁盘分区使用率正常
[2025-11-03 16:50:55] -e
  /dev/mapper/ubuntu--vg-ubuntu--lv 3211264  157214   3054050  5%    /
  /dev/sda2       131072   320      130752   1%    /boot
  overlay         3211264  157214   3054050  5%    /var/lib/docker/overlay2/1fa1b54e7cc8a7eb436efb18053351a6d4b6fa2495a7c38642b9cbe737d8f8c4/merged
  overlay         3211264  157214   3054050  5%    /var/lib/docker/overlay2/6a08d0c81618d55b31e5d4ab53216b1e9972e3fb2e864574dbfcae9fef91b958/merged
  overlay         3211264  157214   3054050  5%    /var/lib/docker/overlay2/c7cf53e68ccaadb12f3219656340a2de575e1ce70443f2892515da84cce13dc0/merged
  overlay         3211264  157214   3054050  5%    /var/lib/docker/overlay2/efdf1427f3acf7503e9620fd61d9dea9afd5f13e83593670859fa05dd638e125/merged
  overlay         3211264  157214   3054050  5%    /var/lib/docker/overlay2/ff8f975d1e955fadfd9ba6f54249163c9ff893655863ecb421b9614169c56ad0/merged
  shm             496106   1        496105   1%    /var/lib/docker/containers/fbcf39d14513cd97cbd41c0deb5bfa1c1012d24abe43de7762a1e69fc8e052a4/mounts/shm
  shm             496106   1        496105   1%    /var/lib/docker/containers/8c762c28085202319457188150f73167351bd6f81976aac95937c83ccf6b9edd/mounts/shm
  shm             496106   1        496105   1%    /var/lib/docker/containers/2113afc79191840f6ddce8b7f5ab0546abb87259d14b2edf57d4c935cc9acca0/mounts/shm
  shm             496106   1        496105   1%    /var/lib/docker/containers/b9efbb0d21c068207ee60027561c000615332dc11cc93ca9d6e2af048e85b0d1/mounts/shm
  overlay         3211264  157214   3054050  5%    /var/lib/docker/overlay2/d3c8eddaec6517ec34044c018e6c4a41f13b353e040ab699423d3c1724f6b77b/merged
  overlay         3211264  157214   3054050  5%    /var/lib/docker/overlay2/c9234b66b457f79afde80ed9edd9993256ffb1cc946e341b43cfbe46bb874b12/merged

============================================================
 5. 网络状态检查
===========================================================
[2025-11-03 16:50:55] 已启用网络接口(名称 | 状态 | IP):
  lo       UNKNOWN 127.0.0.1/8
  eth0     UP     10.0.0.233/24
  docker0  UP     172.17.0.1/16
  vetha68cbca@if4 UP     fe80::e06f:94ff:fe14:75db/64
[2025-11-03 16:50:55] -e
  25520 CLOSE_WAIT
  25382 FIN_WAIT2
  7006  TIME_WAIT
  700   SYN_SENT
  105   ESTABLISHED
  25    LAST_ACK
  14    LISTEN
  8     FIN_WAIT1
[2025-11-03 16:50:56] TIME_WAIT连接数: 6933
[WARNING] TIME_WAIT连接数超阈值(5000),可优化TCP参数
[2025-11-03 16:50:56] -e
  tcp    0.0.0.0:6443         LISTEN
  tcp    127.0.0.1:6010       LISTEN
  tcp    0.0.0.0:22           LISTEN
  tcp    127.0.0.1:10248      LISTEN
  tcp    127.0.0.1:10259      LISTEN
  tcp    127.0.0.1:10257      LISTEN
  tcp    0.0.0.0:8080         LISTEN
  tcp    127.0.0.1:35417      LISTEN
  tcp    127.0.0.53:53        LISTEN
  tcp6   :::9100              LISTEN
  tcp6   ::1:6010             LISTEN
  tcp6   :::10250             LISTEN
  tcp6   :::22                LISTEN
  tcp6   :::8080              LISTEN

============================================================
 6. 进程和服务检查
===========================================================
[2025-11-03 16:50:57] 当前进程总数: 244
[2025-11-03 16:50:57] 僵尸进程数: 0
[OK] 无僵尸进程
[2025-11-03 16:50:57] -e
[ERROR]   crond: 未运行(需手动启动)

============================================================
 7. 系统日志检查
===========================================================
[2025-11-03 16:50:57] 最近10条系统错误日志(含error/fail/critical):
  -- Boot 9618308e2ec34567841ca66817885a79 --
Sep 10 14:59:51 k8s-master233 kernel: piix4_smbus 0000:00:07.3: SMBus Host Controller not enabled!
Sep 10 14:59:51 k8s-master233 kernel: sd 32:0:0:0: [sda] Assuming drive cache: write through
Sep 10 15:00:08 k8s-master233 kernel: hub 2-2:1.0: hub_ext_port_status failed (err = -110)
Sep 10 19:56:06 k8s-master233 systemd[1]: Failed to start Refresh fwupd metadata and update motd.
Sep 13 21:21:47 k8s-master233 kernel: e1000 0000:02:01.0 eth0: Reset adapter
-- Boot be599bd54e694d1faf331b6a6118b460 --
Nov 03 09:16:48 k8s-master233 kernel: piix4_smbus 0000:00:07.3: SMBus Host Controller not enabled!
Nov 03 09:16:48 k8s-master233 kernel: sd 32:0:0:0: [sda] Assuming drive cache: write through
Nov 03 15:30:45 k8s-master233 kernel: e1000 0000:02:01.0 eth0: Reset adapter
[WARNING] 发现系统错误日志,需关注
[2025-11-03 16:50:57] -e
[OK] 无OOM事件

============================================================
 8. 巡检报告摘要
===========================================================
[2025-11-03 16:50:57] 巡检完成时间: 2025-11-03 16:50:57
[2025-11-03 16:50:57] 告警数量: 2 | 错误数量: 1
[ERROR] 存在1个严重问题,需立即处理!
[2025-11-03 16:50:57] 完整报告路径: /var/log/system_check_20251103_165054.log

==========================================
    巡检完成! 结果已保存至 /var/log/system_check_20251103_165054.log
==========================================

posted @ 2025-11-03 16:54  你腿别抖了  阅读(7)  评论(0)    收藏  举报