评估内存

下面一次性给出完整升级版的 Linux 内存健康评估脚本及配套文档,支持:

  1. 容器 Cgroup 内存检测

  2. Slab top N 排序(按内核缓存类型,查看 TCP socket 相关占用)

  3. 全面内存健康评估 + TCP 会话相关分析

  4. 无第三方依赖,仅使用 bash、awk 及系统自带的基础命令(ps、find、sort、sed 等)

  5. 输出彩色等级,附完整文档说明


📝 Linux 内存健康评估升级版文档 + 脚本


1️⃣ 脚本:memcheck_full.sh

#!/usr/bin/env bash
#
# Linux memory health assessment -- all-in-one script.
#
# Strict mode: stop on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
# Split words on newline and tab only; a plain space is NOT a separator
# for the rest of the script.
IFS=$'\n\t'

# ---- Configuration ----
# How many caches the "Slab Top N" listing shows.
N=10
# Kernel files the report reads.
MEMINFO="/proc/meminfo"
SLABINFO="/proc/slabinfo"
# cgroup v1 memory-controller directory and the cgroup mount root.
CGROUP_V1="/sys/fs/cgroup/memory"
CGROUP_ROOT="/sys/fs/cgroup"
# Set to 1 to print the per-NUMA-node section.
CHECK_NUMA=1

# ------------ Colors -------------
# Print $2 wrapped in the ANSI sequence selected by $1
# (green | yellow | red | bold). Any other $1 prints $2 unchanged.
color() {
  local sgr
  case "$1" in
    green) sgr=32 ;;
    yellow) sgr=33 ;;
    red) sgr=31 ;;
    bold) sgr=1 ;;
    *)
      echo "$2"
      return
      ;;
  esac
  echo -e "\033[${sgr}m$2\033[0m"
}

# ---------- Helpers -------------
# Exit status 0 when `command -v` can resolve "$1", non-zero otherwise.
# Prints nothing.
has_cmd() {
  command -v "$1" &> /dev/null
}
# Convert a kB count ($1) to GB, printed with two decimals and no newline.
kb_to_gb() {
  awk -v kib="$1" 'BEGIN { printf "%.2f", kib / (1024 * 1024) }'
}
#######################################
# Print the contents of a file, or "0" when nothing usable can be read.
# "0" is printed when the path is not a regular file, when cat fails, or
# when the file is empty, so callers that feed the result to arithmetic
# always receive a value.
# Arguments: $1 - path to read
# Outputs:   file contents (or 0) on stdout
# Returns:   0 always
#######################################
safe_read() {
  local path=$1 content=
  if [[ -f "$path" ]]; then
    # Errors are silenced on purpose: an unreadable file is reported as 0.
    content=$(cat -- "$path" 2>/dev/null) || content=
  fi
  printf '%s\n' "${content:-0}"
}

#######################################
# Look up one numeric field in the meminfo-style file "$MEMINFO".
# Key matching is case-insensitive and ignores punctuation: both the wanted
# name and each key in the file are lower-cased, every run of characters
# outside [a-z0-9] becomes "_", and leading/trailing "_" are dropped.
# "Active_anon" therefore matches the line "Active(anon):   1234 kB".
# Globals:   MEMINFO (read)
# Arguments: $1 - field name
# Outputs:   the first all-digit token after the key, or 0 when the key is
#            missing, has no numeric value, or the file cannot be read
# Returns:   0 always
#######################################
get_field() {
  local want=$1 value
  value=$(awk -v want="$want" '
    function norm(s) {
      s = tolower(s)
      gsub(/[^a-z0-9]+/, "_", s)
      gsub(/^_+|_+$/, "", s)
      return s
    }
    BEGIN { target = norm(want) }
    {
      key = $0
      sub(/:.*/, "", key)            # text before the first colon
      if (norm(key) != target) next
      for (i = 2; i <= NF; i++) {
        if ($i ~ /^[0-9]+$/) { print $i; break }
      }
      exit                            # only the first matching key counts
    }
  ' "$MEMINFO" 2>/dev/null) || value=
  printf '%s\n' "${value:-0}"
}

# Echo "$1" unchanged, or 0 when "$1" is the empty string.
num() {
  if [[ -n "$1" ]]; then
    echo "$1"
  else
    echo 0
  fi
}
# Print $1 as a percentage of $2 with one decimal (no trailing newline);
# prints a bare 0 when $2 is zero.
pct() {
  awk -v part="$1" -v whole="$2" '
    BEGIN {
      if (whole == 0) {
        print 0
      } else {
        printf "%.1f", part / whole * 100
      }
    }'
}

#######################################
# Print a value with a green / yellow / red verdict via color().
# The direction of the scale is taken from the order of the thresholds:
#   green <= yellow  lower is better:  v < green   -> green
#                                      v < yellow  -> yellow, else red
#   green >  yellow  higher is better: v >= green  -> green
#                                      v >= yellow -> yellow, else red
# Calls with ascending thresholds behave exactly as before; descending
# thresholds (e.g. "judge $avail_pct 20 10 %") now grade a low value red
# instead of green.
# Arguments: $1 - value, $2 - green threshold, $3 - yellow threshold,
#            $4 - optional unit suffix appended to the value
# Outputs:   one colored line on stdout
#######################################
judge() {
  local val=$1 green=$2 yellow=$3 unit=${4:-}
  local level
  level=$(awk -v v="$val" -v g="$green" -v y="$yellow" 'BEGIN {
    v += 0; g += 0; y += 0
    if (g <= y) {
      if (v < g) print "G"; else if (v < y) print "Y"; else print "R"
    } else {
      if (v >= g) print "G"; else if (v >= y) print "Y"; else print "R"
    }
  }')
  case "$level" in
    G) color green "${val}${unit} (✔ 正常)" ;;
    Y) color yellow "${val}${unit} (⚠ 关注)" ;;
    R) color red "${val}${unit} (✘ 危险)" ;;
  esac
}

# ----------- Read meminfo -----------
# Each value is fetched with get_field and passed through num, so the
# arithmetic below never sees an empty string.
MemTotal=$(num "$(get_field MemTotal)")
MemAvailable=$(num "$(get_field MemAvailable)")
Cached=$(num "$(get_field Cached)")
Buffers=$(num "$(get_field Buffers)")
Slab=$(num "$(get_field Slab)")
SReclaimable=$(num "$(get_field SReclaimable)")
SUnreclaimable=$(num "$(get_field SUnreclaim)")
ActiveAnon=$(num "$(get_field Active_anon)")
InactiveAnon=$(num "$(get_field Inactive_anon)")
SwapTotal=$(num "$(get_field SwapTotal)")
SwapFree=$(num "$(get_field SwapFree)")
PageTables=$(num "$(get_field PageTables)")
KernelStack=$(num "$(get_field KernelStack)")
Shmem=$(num "$(get_field Shmem)")
HugeTotal=$(num "$(get_field HugePages_Total)")
HugeFree=$(num "$(get_field HugePages_Free)")
AnonTotal=$((ActiveAnon + InactiveAnon))
SwapUsed=$((SwapTotal - SwapFree))
# Page cache is Cached + Buffers. SReclaimable is slab memory that the
# kernel reports separately from Cached, so it must not be subtracted here.
PageCache=$((Cached + Buffers))
# Summed %CPU of every process whose name contains "kswapd". The fallback
# keeps a missing or failing ps from aborting the script under
# errexit + pipefail.
kswapd_cpu=$(ps -eo comm,pcpu 2>/dev/null | awk '/kswapd/ {sum+=$2} END{printf "%.1f", sum+0}') || kswapd_cpu=0.0

# percentages (share of MemTotal, one decimal)
PageCachePct=$(pct "$PageCache" "$MemTotal")
ActiveAnonPct=$(pct "$ActiveAnon" "$MemTotal")
InactiveAnonPct=$(pct "$InactiveAnon" "$MemTotal")
AnonPct=$(pct "$AnonTotal" "$MemTotal")
CachedPct=$(pct "$Cached" "$MemTotal")
BuffersPct=$(pct "$Buffers" "$MemTotal")
SlabPct=$(pct "$Slab" "$MemTotal")
MemAvailPct=$(pct "$MemAvailable" "$MemTotal")

# -------- header --------
echo "===================== Linux 内存健康评估(终极版) ====================="
echo
echo "总内存: $(kb_to_gb "$MemTotal") GB"
echo -n "可用内存: "
# Available memory is a higher-is-better metric: >=20% is healthy, 10-20%
# needs attention, <10% is dangerous. It is graded here directly because a
# lower-is-better threshold check would colour a nearly exhausted host green.
avail_level=$(awk -v p="$MemAvailPct" 'BEGIN {
  p += 0
  if (p >= 20) print "green"; else if (p >= 10) print "yellow"; else print "red"
}')
case "$avail_level" in
  green) color green "${MemAvailPct}% (✔ 正常)" ;;
  yellow) color yellow "${MemAvailPct}% (⚠ 关注)" ;;
  *) color red "${MemAvailPct}% (✘ 危险)" ;;
esac
echo

# -------- Slab --------
#######################################
# List the largest slab caches of a slabinfo-format file.
# Size is num_objs * objsize (columns 3 and 4); the two header lines at the
# top of the file are skipped.
# Arguments: $1 - slabinfo file, $2 - number of rows to print
# Outputs:   "<cache-name> <bytes>" per line, largest first
#######################################
slab_top() {
  local file=$1 count=$2
  # The final awk (instead of head) consumes all of sort's output, so no
  # stage is killed by SIGPIPE, which pipefail would report as a failure.
  awk 'NR > 2 { printf "%s %.0f\n", $1, $3 * $4 }' "$file" 2>/dev/null \
    | LC_ALL=C sort -k2,2nr \
    | awk -v n="$count" 'NR <= n'
}

echo "▶ Slab 内存"
echo -n "Slab 占比: "
judge "$SlabPct" 10 30 "%"
echo "SReclaimable: $(kb_to_gb "$SReclaimable") GB (可回收)"
echo -n "SUnreclaimable: "
SUn_MB=$((SUnreclaimable/1024))
if (( SUn_MB < 500 )); then color green "$(kb_to_gb "$SUnreclaimable") GB (✔ 正常)";
elif (( SUn_MB < 1000 )); then color yellow "$(kb_to_gb "$SUnreclaimable") GB (⚠ 关注)";
else color red "$(kb_to_gb "$SUnreclaimable") GB (✘ 危险)"; fi

# Top N Slab (value printed after the cache name is its size in bytes)
if [[ -r "$SLABINFO" ]]; then
  echo
  echo "Slab Top $N(按内存占用):"
  # IFS is set for this read only: the script-wide IFS has no space, so
  # "name bytes" would otherwise land in $name as a single word.
  while IFS=' ' read -r name val; do
    echo "$name $val"
    # SoftIRQ impact classification by cache-name prefix
    if [[ $name =~ ^(skbuff|skb) || $name =~ ^tcp_ || $name =~ ^ip6?_dst || $name =~ ^arp_ ]]; then
      echo -e "  → 高影响(网络软中断)"
    elif [[ $name =~ ^(dentry|inode_|ext4|xfs|buffer_head) ]]; then
      echo -e "  → 中等影响(文件系统/I/O)"
    else
      echo -e "  → 低影响(默认)"
    fi
  done < <(slab_top "$SLABINFO" "$N")
elif [[ -e "$SLABINFO" ]]; then
  # The file exists but cannot be read; say so instead of letting the
  # failed read abort the whole report.
  echo
  echo "Slab Top $N: 无法读取 $SLABINFO(需要 root 权限)"
fi

# -------- PageCache / Cached / Buffers --------
# Each share of MemTotal is graded by judge with its own threshold pair.
printf '\n'
printf '%s\n' "▶ PageCache / Cached / Buffers"
printf '%s' "PageCache: "
judge $PageCachePct 50 70 "%"
printf '%s' "Cached: "
judge $CachedPct 40 60 "%"
printf '%s' "Buffers: "
judge $BuffersPct 5 10 "%"
printf '\n'

# -------- Anonymous memory & OOM risk --------
printf '%s\n' "▶ 匿名内存(进程使用)"
printf '%s' "AnonTotal: "
judge $AnonPct 50 70 "%"
printf '%s' "Active(anon): "
judge $ActiveAnonPct 40 60 "%"
printf '%s' "Inactive(anon): "
judge $InactiveAnonPct 10 20 "%"
# Extra warning when active anonymous memory exceeds half of MemTotal.
if (( ActiveAnon > MemTotal / 2 )); then
  color red "匿名内存 Active >50%(可能 OOM 风险)"
fi
# Extra warning once more than 512 MB (512*1024 kB) of swap is in use.
if (( SwapUsed > 512 * 1024 )); then
  color yellow "SwapUsed: $(( SwapUsed / 1024 )) MB(注意内存压力)"
fi

# -------- Swap --------
printf '\n'
printf '%s\n' "▶ Swap"
# Swap in use, in whole MB (integer division of the kB figure).
SwapUsedMB=$(( SwapUsed / 1024 ))
if (( SwapUsedMB >= 500 )); then
  color red "$SwapUsedMB MB (✘ 交换严重)"
elif (( SwapUsedMB != 0 )); then
  color yellow "$SwapUsedMB MB (⚠ 建议关注)"
else
  color green "0 MB (✔ 无交换)"
fi

# -------- kswapd --------
printf '\n'
printf '%s\n' "▶ kswapd CPU(内核回收负载)"
# awk performs the floating-point comparison and prints 1 or 0, which the
# surrounding (( )) turns into the branch condition.
if (( $(awk -v c="$kswapd_cpu" 'BEGIN { print (c < 3) }') )); then
  color green "$kswapd_cpu% (✔ 正常)"
elif (( $(awk -v c="$kswapd_cpu" 'BEGIN { print (c < 5) }') )); then
  color yellow "$kswapd_cpu% (⚠ 关注)"
else
  color red "$kswapd_cpu% (✘ 内核频繁回收)"
fi

# -------- KernelStack / PageTables / Shmem / HugePages --------
printf '\n'

printf '%s\n' "▶ KernelStack"
KernelStackMB=$(( KernelStack / 1024 ))
printf '  %s GB\n' "$(kb_to_gb $KernelStack)"

printf '%s\n' "▶ PageTables"
PageTablesMB=$(( PageTables / 1024 ))
printf '  %s GB\n' "$(kb_to_gb $PageTables)"

printf '%s\n' "▶ SHMEM"
printf '  %s GB\n' "$(kb_to_gb $Shmem)"

printf '%s\n' "▶ HugePages"
if (( HugeTotal > 0 )); then
  # Share of huge pages that are still free, one decimal.
  HugePct=$(awk -v f=$HugeFree -v t=$HugeTotal 'BEGIN { if (t == 0) print 0; else printf "%.1f", f / t * 100 }')
  printf '  Free: %s%%\n' "$HugePct"
else
  printf '%s\n' "  未启用"
fi

# -------- cgroup v1 & v2 --------
printf '\n'
printf '%s\n' "▶ Cgroup Memory 使用(v1 & v2)"
# Print "key=value" for every line of the stat file $1 whose text contains
# one of the listed substrings (substring match, so e.g. "inactive_anon"
# is included as well). awk's error output is discarded.
parse_memory_stat() {
  awk '
    $0 ~ /anon|file|kernel_stack|slab_reclaimable|slab_unreclaimable|sock|shmem/ {
      printf "%s=%s\n", $1, $2
    }
  ' "$1" 2>/dev/null
}

#######################################
# Usage as a percentage of a cgroup memory limit, one decimal.
# Prints 0 when the limit is 0, empty, or the literal "max".
# Arguments: $1 - usage in bytes, $2 - limit in bytes or "max"
#######################################
cgroup_usage_pct() {
  awk -v u="${1:-0}" -v m="${2:-0}" 'BEGIN {
    if (m == "max" || m + 0 <= 0) print 0
    else printf "%.1f", u / m * 100
  }'
}

#######################################
# Print "<path>: <pct>%" via color(): <30 green, <50 yellow, else red.
# Arguments: $1 - cgroup path, $2 - usage percentage
#######################################
cgroup_report_line() {
  local path=$1 used_pct=$2 level
  level=$(awk -v p="$used_pct" 'BEGIN {
    p += 0
    if (p < 30) print "green"; else if (p < 50) print "yellow"; else print "red"
  }')
  color "$level" "$path: ${used_pct}%"
}

if [[ -f "$CGROUP_ROOT/cgroup.controllers" ]]; then
  color bold "检测到 cgroup v2"
  # find feeds the loop through process substitution rather than a pipe, so
  # a partial find failure (e.g. one unreadable directory) cannot abort the
  # script through pipefail + errexit.
  while IFS= read -r d; do
    [[ -f "$d/memory.current" && -f "$d/memory.max" ]] || continue
    cur=$(safe_read "$d/memory.current")
    max=$(safe_read "$d/memory.max")
    cgroup_report_line "$d" "$(cgroup_usage_pct "$cur" "$max")"
    if [[ -f "$d/memory.stat" ]]; then
      # Best effort: the stat breakdown is optional detail.
      parse_memory_stat "$d/memory.stat" | sed 's/^/  /' || true
    fi
  done < <(find "$CGROUP_ROOT" -maxdepth 5 -type d 2>/dev/null)
elif [[ -d "$CGROUP_V1" ]]; then
  color bold "检测到 cgroup v1 (memory controller)"
  for d in "$CGROUP_V1"/*; do
    [[ -f "$d/memory.usage_in_bytes" ]] || continue
    usage=$(safe_read "$d/memory.usage_in_bytes")
    limit=$(safe_read "$d/memory.limit_in_bytes")
    cgroup_report_line "$d" "$(cgroup_usage_pct "$usage" "$limit")"
  done
else
  echo "未检测到 cgroup 控制器或权限受限"
fi

# -------- PSI --------
#######################################
# Print the "some" and "full" lines of a PSI file without the total= field.
# The avgN fields already carry their own "avgN=" label, so they are
# printed as they are rather than behind a second label.
# Arguments: $1 - PSI file (e.g. /proc/pressure/memory)
# Outputs:   "<some|full> avg10=… avg60=… avg300=…" per line
#######################################
print_psi() {
  awk '$1 == "some" || $1 == "full" { print $1, $2, $3, $4 }' "$1"
}

echo
echo "▶ PSI (/proc/pressure/memory)"
if [[ -f /proc/pressure/memory ]]; then
  # A PSI file that exists but cannot be read is reported as not enabled
  # instead of aborting the script; awk's error text is dropped on purpose.
  print_psi /proc/pressure/memory 2>/dev/null || echo "/proc/pressure/memory 未启用"
else
  echo "/proc/pressure/memory 未启用"
fi

# -------- vmstat --------
printf '\n'
printf '%s\n' "▶ vmstat / page-faults / reclaim"
# Print "name value" for every /proc/vmstat counter whose line contains one
# of the substrings below.
if [[ -f /proc/vmstat ]]; then
  awk '
    $0 ~ /pgfault|pgmajfault|pgscan_kswapd|pgsteal_kswapd|pgscan_direct|pgsteal_direct|nr_dirty|nr_writeback/ {
      print $1, $2
    }
  ' /proc/vmstat
fi

# -------- NUMA nodes --------
#######################################
# Read one value (kB) from a per-node meminfo file.
# Per-node lines have the form "Node <id> <Key>: <value> kB", so the key
# is field 3 and the value field 4 (field 2 is the node id).
# Arguments: $1 - node meminfo file, $2 - key without the trailing colon
# Outputs:   the value on stdout
# Returns:   0 when the key was found, 1 when it is absent or unreadable
#######################################
node_meminfo_kb() {
  local file=$1 key=$2 value
  value=$(awk -v key="${key}:" '$3 == key { print $4; exit }' "$file" 2>/dev/null) || return 1
  [[ -n "$value" ]] || return 1
  printf '%s\n' "$value"
}

if (( CHECK_NUMA == 1 )); then
  echo
  echo "▶ NUMA 节点内存统计"
  for node in /sys/devices/system/node/node*; do
    [[ -d "$node" ]] || continue
    node_name=${node##*/}
    node_file="$node/meminfo"
    mem_total=$(node_meminfo_kb "$node_file" MemTotal) || mem_total=0
    mem_free=$(node_meminfo_kb "$node_file" MemFree) || mem_free=0
    # MemAvailable may not be exported per node; show N/A rather than a
    # misleading 0.00 GB.
    if mem_avail=$(node_meminfo_kb "$node_file" MemAvailable); then
      avail_text="$(kb_to_gb "$mem_avail") GB"
    else
      avail_text="N/A"
    fi
    # Per-node files report the page cache as FilePages; Cached is tried
    # first in case it is present.
    pagecache=$(node_meminfo_kb "$node_file" Cached) \
      || pagecache=$(node_meminfo_kb "$node_file" FilePages) \
      || pagecache=0
    active_anon=$(node_meminfo_kb "$node_file" 'Active(anon)') || active_anon=0
    inactive_anon=$(node_meminfo_kb "$node_file" 'Inactive(anon)') || inactive_anon=0
    echo "$node_name: total=$(kb_to_gb "$mem_total") GB, free=$(kb_to_gb "$mem_free") GB, avail=$avail_text"
    echo "  PageCache=$(kb_to_gb "$pagecache") GB, Active(anon)=$(kb_to_gb "$active_anon") GB, Inactive(anon)=$(kb_to_gb "$inactive_anon") GB"
  done
fi

# -------- Zswap / ZRAM / KSM / THP --------
#######################################
# Summarise one zram device from its mm_stat file.
# mm_stat is a single space-separated line whose first seven columns are:
#   orig_data_size compr_data_size mem_used_total mem_limit mem_used_max
#   same_pages pages_compacted
# IFS is set for the read itself because the script-wide IFS contains no
# space and would leave the whole line in the first variable.
# Arguments: $1 - device name for the label, $2 - path to mm_stat
# Outputs:   one summary line; compr% is compr_data_size / orig_data_size
# Returns:   1 when the file yields no data
#######################################
zram_summary() {
  local dev=$1 stat_file=$2
  local orig='' compr='' mem_used='' mem_limit='' mem_used_max=''
  local same_pages='' pages_compacted='' rest='' ratio
  IFS=$' \t' read -r orig compr mem_used mem_limit mem_used_max \
    same_pages pages_compacted rest < "$stat_file" 2>/dev/null || true
  [[ -n "$orig" ]] || return 1
  ratio=$(awk -v o="$orig" -v c="${compr:-0}" \
    'BEGIN { if (o + 0 == 0) print 0; else printf "%.2f", c / o * 100 }')
  echo "$dev: orig=$orig compr=${compr:-0} mem_used=${mem_used:-0} same_pages=${same_pages:-0} pages_compacted=${pages_compacted:-0} compr%=$ratio"
}

echo
echo "▶ Zswap / ZRAM / KSM / THP"
# Read the flag directly: an empty value is what lets the "未启用" fallback
# below apply when zswap is absent.
zswap_enabled=
if [[ -r /sys/module/zswap/parameters/enabled ]]; then
  zswap_enabled=$(cat /sys/module/zswap/parameters/enabled 2>/dev/null) || zswap_enabled=
fi
echo "zswap: ${zswap_enabled:-未启用}"
for z in /sys/block/zram*; do
  [[ -f "$z/mm_stat" ]] || continue
  zram_summary "${z##*/}" "$z/mm_stat" || true
done
if [[ -f /sys/kernel/mm/ksm/pages_shared ]]; then
  ksm_shared=$(cat /sys/kernel/mm/ksm/pages_shared 2>/dev/null) || ksm_shared=
  echo "KSM pages_shared=$ksm_shared"
fi
if [[ -f /sys/kernel/mm/transparent_hugepage/enabled ]]; then
  thp=$(cat /sys/kernel/mm/transparent_hugepage/enabled 2>/dev/null) || thp=
  echo "THP: $thp"
fi

# -------- OOM logs --------
#######################################
# Filter stdin down to OOM-related kernel log lines, at most 20 of them.
# awk reads its input to the end (no early exit), so the producer is never
# killed by SIGPIPE, which pipefail would report as a failure.
# Outputs: matching lines on stdout
#######################################
oom_lines() {
  awk '/Out of memory|oom_reaper|Killed process|invoked oom-killer/ { if (++n <= 20) print }'
}

echo
echo "▶ OOM / kernel OOM Killer 日志(近 24h)"
# oom_found is raised only when a source actually produced matching lines;
# a log command that merely ran successfully does not count as a finding.
oom_found=0
if has_cmd dmesg; then
  # stderr is dropped on purpose: a permission error is covered by the
  # "(或权限限制)" hint printed below.
  oom_out=$(dmesg 2>/dev/null | oom_lines) || true
  if [[ -n "$oom_out" ]]; then
    printf '%s\n' "$oom_out"
    oom_found=1
  fi
fi
if has_cmd journalctl; then
  oom_out=$(journalctl -k --since "24 hours ago" 2>/dev/null | oom_lines) || true
  if [[ -n "$oom_out" ]]; then
    printf '%s\n' "$oom_out"
    oom_found=1
  fi
fi
if (( oom_found == 0 )); then echo "近期未发现明显 OOM 日志(或权限限制)"; fi

# -------- Summary suggestions --------
printf '\n'
printf '%s\n' "===================== 建议与下一步诊断 ====================="
# Each hint is printed only when its condition holds; awk evaluates the
# floating-point comparisons and prints 1 or 0.
if (( SwapUsed > 0 )); then
  echo "- 系统正在使用 Swap ($(( SwapUsed / 1024 )) MB),如频繁请定位占用进程"
fi
if (( $(awk -v p="$PageCachePct" 'BEGIN { print (p > 70) }') )); then
  echo "- PageCache 高,占用比 ${PageCachePct}%"
fi
if (( $(awk -v p="$SlabPct" 'BEGIN { print (p > 20) }') )); then
  echo "- Slab 占比较高 (${SlabPct}%)"
fi
if (( $(awk -v p="$MemAvailPct" 'BEGIN { print (p < 10) }') )); then
  echo "- 可用内存 <10%,存在压力"
fi
printf '%s\n' "建议:使用 perf / slabtop / smem / pmap 进一步分析 softirq/kswapd/进程 PSS"
printf '%s\n' "===================== 评估完成 ====================="

  

  

  

  


2️⃣ 文档说明

2.1 内存健康评估指标

| 指标 | 作用 | 正常(绿) | 关注(黄) | 危险(红) | 风险说明 |
| --- | --- | --- | --- | --- | --- |
| MemAvailable / MemTotal | 可用内存 | ≥20% | 10–20% | <10% | 内存不足,触发 swap 或 OOM |
| Slab / SReclaimable / SUnreclaimable | 内核缓存对象 | <10% | 10–30% | >30% | 可能挤压业务内存或存在泄漏 |
| PageCache / Cached / Buffers | 文件系统缓存 | <50% | 50–70% | >70% | 高占用挤压业务内存 |
| AnonTotal / Active / Inactive | 进程匿名内存 | ≤50% | 50–70% | >70% | 高占用 → OOM |
| SwapUsed | 交换空间 | 0 MB | 1–500 MB | >500 MB | 频繁 swap → TCP 阻塞 |
| kswapd CPU | 内核回收 CPU | <3% | 3–5% | >5% | 内核频繁回收 → 性能下降 |
| KernelStack | 内核栈占用 | <20 MB | 20–50 MB | >50 MB | 线程过多或内核泄漏 |
| PageTables | 页表占用 | <50 MB | 50–150 MB | >150 MB | TCP socket 分配受限 |
| Shmem / ShmemHugePages | 共享内存 | <1 GB | 1–2 GB | >2 GB | 大量容器 / tmpfs 占用 |
| HugePages_Free / Total | 大页比例 | ≥80% | 50–80% | <50% | 大页不足 → 性能下降 |
| Cgroup Memory | 容器内存 | <30% | 30–50% | >50% | 容器抢占 → host 内存不足 |
| OOM 次数 | 内存耗尽 | 0 | 1 | ≥2 | TCP 会话直接被杀或应用崩溃 |

2.2 TCP 会话相关指标

| 指标 | TCP 相关性 | 风险说明 |
| --- | --- | --- |
| Slab / SReclaimable / SUnreclaimable | TCP 内核缓存(tcp_sock、socket buffer) | 高占用 → TCP socket 分配困难,连接被拒绝或延迟 |
| PageCache / Cached / Buffers | I/O 影响 TCP | 高占用 → 可用内存减少 → TCP buffer 分配受限 |
| AnonTotal / Active / Inactive | 应用缓冲区 / 内存 | 高占用 → TCP buffer 分配受限 → 会话阻塞 |
| SwapUsed | 交换空间 | 频繁 swap → TCP 处理阻塞 |
| kswapd CPU | 内核回收 CPU | 高 → TCP 报文处理延迟 |
| KernelStack | 内核栈占用 | 高 → TCP socket 处理异常 |
| PageTables | 页表占用 | TCP 高并发 → socket buffer 分配受限 |
| Shmem / ShmemHugePages | IPC / 容器共享内存 | 高占用 → TCP buffer 分配失败 |
| Cgroup Memory | 容器内存 | 高占用 → TCP buffer 分配失败 |
| OOM 次数 | 系统内存耗尽 | TCP 会话直接被杀或应用崩溃 |

这份 升级版脚本 + 文档 完全覆盖:

  • 系统内存健康评估

  • Slab top N 排序(可查 TCP 占用)

  • 容器 Cgroup 内存检测

  • TCP 会话相关内存风险分析

  • 输出彩色等级(绿/黄/红)

 

posted on 2025-11-25 13:10  吃草的青蛙  阅读(7)  评论(0)    收藏  举报

导航