评估内存
明白了,我给你一次性交付 完整升级版 Linux 内存健康评估脚本 + 文档,支持:
- 容器 Cgroup 内存检测
- Slab Top N 排序(按内核缓存类型,查看 TCP socket 相关占用)
- 全面内存健康评估 + TCP 会话相关分析
- 无额外依赖:仅需 Bash、awk 以及系统自带的基础命令(ps、find、sort、sed 等)
- 输出彩色等级,附完整文档说明
📝 Linux 内存健康评估升级版文档 + 脚本
1️⃣ 脚本:memcheck_full.sh
#!/usr/bin/env bash
# ==========================================================
# Linux memory health assessment - full-featured script
# ==========================================================
# Strict mode: stop on errors, on unset variables and on failed pipeline stages.
set -o errexit -o nounset -o pipefail
# Word-split on newline and tab only (spaces do not split).
IFS=$'\n\t'

# ------------- Config -------------
N=10                               # how many slab caches the "Top N" list shows
MEMINFO="/proc/meminfo"            # file parsed by get_field
SLABINFO="/proc/slabinfo"          # file parsed for the slab ranking
CGROUP_ROOT="/sys/fs/cgroup"       # probed for cgroup.controllers (cgroup v2)
CGROUP_V1="${CGROUP_ROOT}/memory"  # cgroup v1 memory controller directory
CHECK_NUMA=1                       # 1 = print the per-NUMA-node section
# ------------ Colors -------------
#######################################
# Print a message wrapped in an ANSI colour/style escape sequence.
# Arguments:
#   $1 - style name: green | yellow | red | bold (anything else: no styling)
#   $2 - message text, printed literally
# Outputs:
#   The (optionally styled) message followed by a newline on stdout.
#######################################
color() {
  local style=$1 msg=${2-} code
  case "$style" in
    green)  code=32 ;;
    yellow) code=33 ;;
    red)    code=31 ;;
    bold)   code=1 ;;
    *)
      printf '%s\n' "$msg"
      return
      ;;
  esac
  # Only the escape codes live in the format string; the message is passed as
  # data so backslash sequences inside it (for example "\x2d" in systemd
  # cgroup paths) are shown as-is instead of being expanded.
  printf '\033[%sm%s\033[0m\n' "$code" "$msg"
}
# ---------- Helpers -------------
# Succeed (status 0) when the shell can resolve the command named by $1.
has_cmd() {
  command -v "$1" &>/dev/null
}
# Convert the kB amount in $1 to GB (divided by 1024 twice) and print it
# with two decimals and no trailing newline.
kb_to_gb() {
  local kib=$1
  awk -v kib="$kib" 'BEGIN { printf "%.2f", kib / (1024 * 1024) }'
}
# Print the contents of the file named by $1. When $1 is not a regular file,
# or cat fails on it, print 0 instead.
safe_read() {
  local path=$1
  if [[ -f "$path" ]] && cat "$path" 2>/dev/null; then
    return 0
  fi
  echo 0
}
#######################################
# Look up one numeric value in the meminfo-style file named by $MEMINFO.
# Key matching is case-insensitive and punctuation-insensitive: both the
# wanted name and each key are lower-cased, runs of non-alphanumerics become
# "_" and leading/trailing "_" are dropped, so "Active_anon" matches the
# "Active(anon):" line.
# Globals:
#   MEMINFO (read) - file to parse; falls back to /proc/meminfo when unset
# Arguments:
#   $1 - field name, e.g. MemTotal, Active_anon, HugePages_Total
# Outputs:
#   The first all-digit token of the matching line, or 0 when the key is
#   missing, has no numeric token, or the file cannot be read.
#######################################
get_field() {
  local want=$1
  awk -v want="$want" '
    function norm(s) {
      s = tolower(s)
      gsub(/[^a-z0-9]+/, "_", s)
      gsub(/^_+|_+$/, "", s)
      return s
    }
    BEGIN { target = norm(want); found = 0 }
    {
      key = $0
      sub(/:.*/, "", key)            # keep only the text before the colon
      if (norm(key) != target) next
      value = 0
      for (i = 2; i <= NF; i++) {
        if ($i ~ /^[0-9]+$/) { value = $i; break }
      }
      print value
      found = 1
      exit                           # END still runs and sets the status
    }
    END { exit (found ? 0 : 1) }
  ' "${MEMINFO:-/proc/meminfo}" 2>/dev/null || echo 0
}
# Echo $1 unchanged, or 0 when $1 is the empty string.
num() {
  local value=$1
  if [[ -n "$value" ]]; then
    echo "$value"
  else
    echo 0
  fi
}
# Print $1 as a percentage of $2 with one decimal (no trailing newline).
# Prints 0 when $2 compares equal to zero.
pct() {
  local part=$1 whole=$2
  awk -v part="$part" -v whole="$whole" 'BEGIN {
    if (whole == 0) {
      print 0
    } else {
      printf "%.1f", part / whole * 100
    }
  }'
}
#######################################
# Grade a numeric value against two thresholds and print it through color().
# The direction is taken from the order of the thresholds:
#   green <= yellow : lower is better  (v < green -> green, v < yellow -> yellow)
#   green >  yellow : higher is better (v >= green -> green, v >= yellow -> yellow)
# The second form is what a call such as `judge "$avail_pct" 20 10 "%"` needs;
# with a fixed "lower is better" rule it would grade plentiful memory as red.
# Arguments:
#   $1 - value to grade (integer or decimal)
#   $2 - green threshold
#   $3 - yellow threshold
#   $4 - optional unit suffix appended to the value
# Outputs:
#   One coloured line "<value><unit> (<verdict>)" via color().
#######################################
judge() {
  local val=$1 green=$2 yellow=$3 unit=${4:-} level
  # awk does the comparison because bash arithmetic cannot handle decimals.
  level=$(awk -v v="$val" -v g="$green" -v y="$yellow" 'BEGIN {
    v += 0; g += 0; y += 0
    if (g > y) {
      if (v >= g) print "G"; else if (v >= y) print "Y"; else print "R"
    } else {
      if (v < g) print "G"; else if (v < y) print "Y"; else print "R"
    }
  }')
  case "$level" in
    G) color green "${val}${unit} (✔ 正常)" ;;
    Y) color yellow "${val}${unit} (⚠ 关注)" ;;
    R) color red "${val}${unit} (✘ 危险)" ;;
  esac
}
# ----------- Read meminfo -----------
# Raw counters looked up through get_field; num turns an empty result into 0
# so the arithmetic below always sees a number.
MemTotal=$(num "$(get_field MemTotal)")
MemAvailable=$(num "$(get_field MemAvailable)")
Cached=$(num "$(get_field Cached)")
Buffers=$(num "$(get_field Buffers)")
Slab=$(num "$(get_field Slab)")
SReclaimable=$(num "$(get_field SReclaimable)")
SUnreclaimable=$(num "$(get_field SUnreclaim)")
ActiveAnon=$(num "$(get_field Active_anon)")
InactiveAnon=$(num "$(get_field Inactive_anon)")
SwapTotal=$(num "$(get_field SwapTotal)")
SwapFree=$(num "$(get_field SwapFree)")
PageTables=$(num "$(get_field PageTables)")
KernelStack=$(num "$(get_field KernelStack)")
Shmem=$(num "$(get_field Shmem)")
HugeTotal=$(num "$(get_field HugePages_Total)")
HugeFree=$(num "$(get_field HugePages_Free)")

# Derived totals.
AnonTotal=$(( ActiveAnon + InactiveAnon ))
SwapUsed=$(( SwapTotal - SwapFree ))
# Page cache is Cached + Buffers. SReclaimable is slab memory and is not
# contained in Cached, so it must not be subtracted here.
PageCache=$(( Cached + Buffers ))

# Summed %CPU of every process whose ps line mentions kswapd. A missing or
# failing ps must not abort the whole report under errexit + pipefail.
kswapd_cpu=$(ps -eo comm,pcpu 2>/dev/null \
  | awk '/kswapd/ {sum+=$2} END{printf "%.1f", sum+0}') || kswapd_cpu=0.0

# percentages of MemTotal
PageCachePct=$(pct "$PageCache" "$MemTotal")
ActiveAnonPct=$(pct "$ActiveAnon" "$MemTotal")
InactiveAnonPct=$(pct "$InactiveAnon" "$MemTotal")
AnonPct=$(pct "$AnonTotal" "$MemTotal")
CachedPct=$(pct "$Cached" "$MemTotal")
BuffersPct=$(pct "$Buffers" "$MemTotal")
SlabPct=$(pct "$Slab" "$MemTotal")
MemAvailPct=$(pct "$MemAvailable" "$MemTotal")
# -------- header --------
# Banner, total memory in GB, then the graded share of available memory
# (thresholds are handed to judge as green=20, yellow=10).
printf '%s\n' "===================== Linux 内存健康评估(终极版) ====================="
printf '\n'
printf '总内存: %s GB\n' "$(kb_to_gb "$MemTotal")"
printf '%s' "可用内存: "
judge "$MemAvailPct" 20 10 "%"
printf '\n'
# -------- Slab --------
#######################################
# List the largest slab caches of a slabinfo-format file.
# Arguments:
#   $1 - path of the slabinfo file
#   $2 - how many caches to list
# Outputs:
#   Up to $2 lines "name<TAB>bytes", largest first.
#######################################
slab_top() {
  local file=$1 limit=$2
  # Data columns: name active_objs num_objs objsize ...; a cache occupies
  # roughly num_objs * objsize bytes. The first two lines are headers.
  # %.0f keeps large products from being printed in exponent notation.
  awk 'NR > 2 && NF >= 4 { printf "%s\t%.0f\n", $1, $3 * $4 }' "$file" 2>/dev/null \
    | LC_ALL=C sort -t $'\t' -k2,2nr \
    | head -n "$limit"
}

echo "▶ Slab 内存"
echo -n "Slab 占比: "
judge "$SlabPct" 10 30 "%"
echo "SReclaimable: $(kb_to_gb "$SReclaimable") GB (可回收)"
echo -n "SUnreclaimable: "
SUn_MB=$(( SUnreclaimable / 1024 ))
if (( SUn_MB < 500 )); then
  color green "$(kb_to_gb "$SUnreclaimable") GB (✔ 正常)"
elif (( SUn_MB < 1000 )); then
  color yellow "$(kb_to_gb "$SUnreclaimable") GB (⚠ 关注)"
else
  color red "$(kb_to_gb "$SUnreclaimable") GB (✘ 危险)"
fi

# Top N Slab
# The file is normally readable by root only, so test readability rather
# than existence; otherwise the failing pipeline would end the script.
if [[ -r "$SLABINFO" ]]; then
  echo
  echo "Slab Top $N(按内存占用):"
  # Split explicitly on TAB: the script-wide IFS is not relied upon, and the
  # process substitution keeps a failing stage from tripping errexit.
  while IFS=$'\t' read -r name bytes; do
    echo "$name $(( bytes / 1024 )) KB"
    # SoftIRQ impact: socket/TCP/skb/route/ARP caches are network related.
    if [[ $name =~ ^(skbuff|skb|tcp_|TCP|tw_sock_|request_sock_|sock_inode_cache|ip6?_dst|arp_) ]]; then
      echo -e " → 高影响(网络软中断)"
    elif [[ $name =~ ^(dentry|inode_|ext4|xfs|buffer_head) ]]; then
      echo -e " → 中等影响(文件系统/I/O)"
    else
      echo -e " → 低影响(默认)"
    fi
  done < <(slab_top "$SLABINFO" "$N")
elif [[ -e "$SLABINFO" ]]; then
  echo "无法读取 $SLABINFO(需要 root 权限),跳过 Slab Top $N"
fi
# -------- PageCache / Cached / Buffers --------
# Each line is a label followed by the judge verdict for that percentage.
echo
echo "▶ PageCache / Cached / Buffers"
printf '%s' "PageCache: "
judge "$PageCachePct" 50 70 "%"
printf '%s' "Cached: "
judge "$CachedPct" 40 60 "%"
printf '%s' "Buffers: "
judge "$BuffersPct" 5 10 "%"
echo
# -------- Anonymous memory & OOM risk --------
echo "▶ 匿名内存(进程使用)"
printf '%s' "AnonTotal: "
judge "$AnonPct" 50 70 "%"
printf '%s' "Active(anon): "
judge "$ActiveAnonPct" 40 60 "%"
printf '%s' "Inactive(anon): "
judge "$InactiveAnonPct" 10 20 "%"
# Warn when active anonymous memory exceeds half of MemTotal.
if (( MemTotal / 2 < ActiveAnon )); then
  color red "匿名内存 Active >50%(可能 OOM 风险)"
fi
# Warn when used swap exceeds 512 * 1024 of the meminfo units.
if (( SwapUsed > 524288 )); then
  color yellow "SwapUsed: $(( SwapUsed / 1024 )) MB(注意内存压力)"
fi
# -------- Swap --------
# Used swap in whole MB: 0 is green, 500 and above is red, the rest yellow.
echo
echo "▶ Swap"
SwapUsedMB=$(( SwapUsed / 1024 ))
if (( SwapUsedMB >= 500 )); then
  color red "$SwapUsedMB MB (✘ 交换严重)"
elif (( SwapUsedMB == 0 )); then
  color green "0 MB (✔ 无交换)"
else
  color yellow "$SwapUsedMB MB (⚠ 建议关注)"
fi
# -------- kswapd --------
# kswapd_grade VALUE
# Print G when VALUE < 3, Y when VALUE < 5, otherwise R. The value is handed
# to awk with -v so it is treated as data and never spliced into awk program
# text; an empty value counts as 0.
kswapd_grade() {
  awk -v v="$1" 'BEGIN {
    v += 0
    if (v < 3) print "G"; else if (v < 5) print "Y"; else print "R"
  }'
}

echo
echo "▶ kswapd CPU(内核回收负载)"
kswapd_pct=${kswapd_cpu:-0}
case "$(kswapd_grade "$kswapd_pct")" in
  G) color green "$kswapd_pct% (✔ 正常)" ;;
  Y) color yellow "$kswapd_pct% (⚠ 关注)" ;;
  *) color red "$kswapd_pct% (✘ 内核频繁回收)" ;;
esac
# -------- KernelStack / PageTables / Shmem / HugePages --------
# KernelStack and PageTables are graded in MB and Shmem in GB, using the
# thresholds of the indicator table that accompanies this script
# (20/50 MB, 50/150 MB and 1/2 GB). All three are "lower is better".
echo
echo "▶ KernelStack"
KernelStackMB=$(( KernelStack / 1024 ))
echo -n " "
judge "$KernelStackMB" 20 50 " MB"
echo "▶ PageTables"
PageTablesMB=$(( PageTables / 1024 ))
echo -n " "
judge "$PageTablesMB" 50 150 " MB"
echo "▶ SHMEM"
echo -n " "
judge "$(kb_to_gb "$Shmem")" 1 2 " GB"
echo "▶ HugePages"
if (( HugeTotal > 0 )); then
  # Share of the huge page pool that is still free.
  HugePct=$(awk -v f="$HugeFree" -v t="$HugeTotal" 'BEGIN { printf "%.1f", f / t * 100 }')
  echo " Free: $HugePct%"
else
  echo " 未启用"
fi
# -------- cgroup v1 & v2 --------
echo
echo "▶ Cgroup Memory 使用(v1 & v2)"

# parse_memory_stat FILE
# Print the anon, file, kernel_stack, slab_reclaimable, slab_unreclaimable,
# sock and shmem counters of a memory.stat file as key=value lines. Keys are
# matched exactly, so active_anon, file_mapped and similar are not included.
parse_memory_stat() {
  local f=$1
  awk '$1 ~ /^(anon|file|kernel_stack|slab_reclaimable|slab_unreclaimable|sock|shmem)$/ { print $1 "=" $2 }' \
    "$f" 2>/dev/null
}

# report_cgroup_usage LABEL USAGE LIMIT
# Print "LABEL: <pct>%" through color(): green below 30 %, yellow below 50 %,
# red otherwise. A limit of 0 or "max" (unlimited) is reported as 0 %.
report_cgroup_usage() {
  local label=$1 usage=$2 limit=$3 verdict
  # One awk call yields "<colour> <pct>"; values go in via -v, never as code.
  verdict=$(awk -v u="$usage" -v m="$limit" 'BEGIN {
    if (m == "max" || m + 0 == 0) { print "green 0"; exit }
    p = u / m * 100
    if (p < 30) c = "green"; else if (p < 50) c = "yellow"; else c = "red"
    printf "%s %.1f", c, p
  }')
  color "${verdict%% *}" "$label: ${verdict#* }%"
}

if [[ -f "$CGROUP_ROOT/cgroup.controllers" ]]; then
  color bold "检测到 cgroup v2"
  # Process substitution: a find error (for example an unreadable
  # sub-directory) must not end the script under errexit + pipefail.
  while IFS= read -r d; do
    [[ -f "$d/memory.current" && -f "$d/memory.max" ]] || continue
    report_cgroup_usage "$d" \
      "$(safe_read "$d/memory.current")" "$(safe_read "$d/memory.max")"
    if [[ -f "$d/memory.stat" ]]; then
      # Best effort: the cgroup may disappear while it is being read.
      parse_memory_stat "$d/memory.stat" | sed 's/^/ /' || true
    fi
  done < <(find "$CGROUP_ROOT" -maxdepth 5 -type d 2>/dev/null)
elif [[ -d "$CGROUP_V1" ]]; then
  color bold "检测到 cgroup v1 (memory controller)"
  for d in "$CGROUP_V1"/*; do
    [[ -f "$d/memory.usage_in_bytes" ]] || continue
    report_cgroup_usage "$d" \
      "$(safe_read "$d/memory.usage_in_bytes")" "$(safe_read "$d/memory.limit_in_bytes")"
  done
else
  echo "未检测到 cgroup 控制器或权限受限"
fi
# -------- PSI --------
# format_psi
# Read pressure-stall lines on stdin and print the "some" and "full" lines
# reduced to their first four fields (kind, avg10=, avg60=, avg300=). The
# fields already carry their own "avgN=" labels, so none are added.
format_psi() {
  awk '$1 == "some" || $1 == "full" { print $1, $2, $3, $4 }'
}

echo
echo "▶ PSI (/proc/pressure/memory)"
# The read is captured first: when it fails, the "not enabled" line is
# printed instead of the script stopping under errexit.
if [[ -f /proc/pressure/memory ]] && psi_raw=$(cat /proc/pressure/memory 2>/dev/null); then
  format_psi <<<"$psi_raw"
else
  echo "/proc/pressure/memory 未启用"
fi
# -------- vmstat --------
# Print name and value of every /proc/vmstat line matching the pattern below.
echo
echo "▶ vmstat / page-faults / reclaim"
if [[ -f /proc/vmstat ]]; then
  vmstat_pattern='pgfault|pgmajfault|pgscan_kswapd|pgsteal_kswapd|pgscan_direct|pgsteal_direct|nr_dirty|nr_writeback'
  awk -v pat="$vmstat_pattern" '$0 ~ pat { print $1, $2 }' /proc/vmstat
fi
# -------- NUMA nodes --------
#######################################
# Read one counter from a per-node meminfo file.
# Lines there are prefixed with the node, e.g.
#   "Node 0 MemTotal:       32768000 kB"
# so the key is the third field and the value the fourth.
# Arguments:
#   $1 - path of the node meminfo file
#   $2 - key without the trailing colon, e.g. MemTotal or "Active(anon)"
# Outputs:
#   The value of the key, or 0 when the key or the file is missing.
#######################################
node_mem_kb() {
  local file=$1 key=$2 kb
  kb=$(awk -v key="${key}:" '$1 == "Node" && $3 == key { print $4; exit }' \
    "$file" 2>/dev/null) || kb=
  echo "${kb:-0}"
}

if (( CHECK_NUMA == 1 )); then
  echo
  echo "▶ NUMA 节点内存统计"
  for node in /sys/devices/system/node/node*; do
    [[ -d "$node" ]] || continue
    node_name=${node##*/}
    node_file="$node/meminfo"
    mem_total=$(node_mem_kb "$node_file" MemTotal)
    mem_free=$(node_mem_kb "$node_file" MemFree)
    # NOTE(review): per-node meminfo is expected to carry MemUsed and
    # FilePages rather than MemAvailable and Cached - confirm on the target
    # kernel; missing keys are shown as 0.
    mem_used=$(node_mem_kb "$node_file" MemUsed)
    pagecache=$(node_mem_kb "$node_file" FilePages)
    active_anon=$(node_mem_kb "$node_file" 'Active(anon)')
    inactive_anon=$(node_mem_kb "$node_file" 'Inactive(anon)')
    echo "$node_name: total=$(kb_to_gb "$mem_total") GB, free=$(kb_to_gb "$mem_free") GB, used=$(kb_to_gb "$mem_used") GB"
    echo " PageCache=$(kb_to_gb "$pagecache") GB, Active(anon)=$(kb_to_gb "$active_anon") GB, Inactive(anon)=$(kb_to_gb "$inactive_anon") GB"
  done
fi
# -------- Zswap / ZRAM / KSM / THP --------
#######################################
# Summarise one zram device from its mm_stat line.
# mm_stat columns, in order: orig_data_size compr_data_size mem_used_total
# mem_limit mem_used_max same_pages pages_compacted (newer kernels append
# more columns, which are ignored here).
# Arguments:
#   $1 - device name, e.g. zram0
#   $2 - the single line read from <device>/mm_stat
# Outputs:
#   One summary line; compr% is compr_data_size / orig_data_size * 100.
#######################################
zram_summary() {
  local dev=$1 line=$2
  local orig compr mem_used mem_limit mem_used_max same_pages pages_compacted _rest ratio
  # Split on spaces explicitly; the script-wide IFS contains no space.
  IFS=' ' read -r orig compr mem_used mem_limit mem_used_max same_pages \
    pages_compacted _rest <<<"$line" || true
  ratio=$(awk -v o="${orig:-0}" -v c="${compr:-0}" \
    'BEGIN { if (o + 0 == 0) print 0; else printf "%.2f", c / o * 100 }')
  echo "$dev: orig=$orig compr=$compr mem_used=$mem_used same_pages=$same_pages pages_compacted=$pages_compacted compr%=$ratio"
}

echo
echo "▶ Zswap / ZRAM / KSM / THP"
# Test the parameter file directly so that "not enabled" is reachable when
# the zswap module parameter does not exist.
zswap_param=/sys/module/zswap/parameters/enabled
if [[ -r "$zswap_param" ]]; then
  echo "zswap: $(cat "$zswap_param")"
else
  echo "zswap: 未启用"
fi
for z in /sys/block/zram*; do
  [[ -r "$z/mm_stat" ]] || continue
  zram_summary "${z##*/}" "$(cat "$z/mm_stat")"
done
if [[ -r /sys/kernel/mm/ksm/pages_shared ]]; then
  echo "KSM pages_shared=$(cat /sys/kernel/mm/ksm/pages_shared)"
fi
if [[ -r /sys/kernel/mm/transparent_hugepage/enabled ]]; then
  echo "THP: $(cat /sys/kernel/mm/transparent_hugepage/enabled)"
fi
# -------- OOM logs --------
# filter_oom_lines
# Copy at most 20 OOM-killer related lines from stdin to stdout.
filter_oom_lines() {
  awk '/Out of memory|oom_reaper|Killed process|invoked oom-killer/ {
    print
    if (++shown >= 20) exit
  }'
}

echo
echo "▶ OOM / kernel OOM Killer 日志(近 24h)"
# oom_found is raised only when a source really produced matching lines; a
# pipeline that merely ran successfully proves nothing.
oom_found=0
if has_cmd dmesg; then
  # "|| true": an unreadable kernel log or an early exit of the filter must
  # not stop the script under errexit + pipefail.
  oom_lines=$(dmesg 2>/dev/null | filter_oom_lines) || true
  if [[ -n "$oom_lines" ]]; then
    printf '%s\n' "$oom_lines"
    oom_found=1
  fi
fi
if has_cmd journalctl; then
  oom_lines=$(journalctl -k --since "24 hours ago" 2>/dev/null | filter_oom_lines) || true
  if [[ -n "$oom_lines" ]]; then
    printf '%s\n' "$oom_lines"
    oom_found=1
  fi
fi
if (( oom_found == 0 )); then
  echo "近期未发现明显 OOM 日志(或权限限制)"
fi
# -------- Summary suggestions --------
# float_gt A B / float_lt A B
# Succeed when A > B (respectively A < B). awk does the comparison because
# bash arithmetic is integer-only; operands are passed with -v so they are
# data and never become part of the awk program text.
float_gt() { awk -v a="$1" -v b="$2" 'BEGIN { exit !(a + 0 > b + 0) }'; }
float_lt() { awk -v a="$1" -v b="$2" 'BEGIN { exit !(a + 0 < b + 0) }'; }

echo
echo "===================== 建议与下一步诊断 ====================="
if (( SwapUsed > 0 )); then
  echo "- 系统正在使用 Swap ($(( SwapUsed / 1024 )) MB),如频繁请定位占用进程"
fi
if float_gt "$PageCachePct" 70; then
  echo "- PageCache 高,占用比 ${PageCachePct}%"
fi
if float_gt "$SlabPct" 20; then
  echo "- Slab 占比较高 (${SlabPct}%)"
fi
if float_lt "$MemAvailPct" 10; then
  echo "- 可用内存 <10%,存在压力"
fi
echo "建议:使用 perf / slabtop / smem / pmap 进一步分析 softirq/kswapd/进程 PSS"
echo "===================== 评估完成 ====================="
2️⃣ 文档说明
2.1 内存健康评估指标
| 指标 | 作用 | 正常(绿) | 关注(黄) | 危险(红) | 风险说明 |
|---|---|---|---|---|---|
| MemAvailable / MemTotal | 可用内存 | ≥20% | 10–20% | <10% | 内存不足,触发 swap 或 OOM |
| Slab / SReclaimable / SUnreclaimable | 内核缓存对象 | <10% | 10–30% | >30% | 可能挤压业务内存或存在泄漏 |
| PageCache / Cached / Buffers | 文件系统缓存 | <50% | 50–70% | >70% | 高占用压缩业务内存 |
| AnonTotal / Active / Inactive | 进程匿名内存 | ≤50% | 50–70% | >70% | 高占用 → OOM |
| SwapUsed | 交换空间 | 0 MB | 1–500 MB | >500 MB | 频繁 swap → TCP 阻塞 |
| kswapd CPU | 内核回收 CPU | <3% | 3–5% | >5% | 内核频繁回收 → 性能下降 |
| KernelStack | 内核栈占用 | <20 MB | 20–50 MB | >50 MB | 线程过多或内核泄漏 |
| PageTables | 页表占用 | <50 MB | 50–150 MB | >150 MB | TCP socket 分配受限 |
| Shmem / ShmemHugePages | 共享内存 | <1 GB | 1–2 GB | >2 GB | 大量容器 / tmpfs 占用 |
| HugePages_Free / Total | 大页比例 | ≥80% | 50–80% | <50% | 大页不足 → 性能下降 |
| Cgroup Memory | 容器内存 | <30% | 30–50% | >50% | 容器抢占 → host 内存不足 |
| OOM 次数 | 内存耗尽 | 0 | 1 | ≥2 | TCP 会话直接被杀或应用崩溃 |
2.2 TCP 会话相关指标
| 指标 | TCP 相关性 | 风险说明 |
|---|---|---|
| Slab / SReclaimable / SUnreclaimable | TCP 内核缓存(tcp_sock、socket buffer) | 高占用 → TCP socket 分配困难,连接被拒绝或延迟 |
| PageCache / Cached / Buffers | I/O 影响 TCP | 高占用 → 可用内存减少 → TCP buffer 分配受限 |
| AnonTotal / Active / Inactive | 应用缓冲区 / 内存 | 高占用 → TCP buffer 分配受限 → 会话阻塞 |
| SwapUsed | 交换空间 | 频繁 swap → TCP 处理阻塞 |
| kswapd CPU | 内核回收 CPU | 高 → TCP 报文处理延迟 |
| KernelStack | 内核栈占用 | 高 → TCP socket 处理异常 |
| PageTables | 页表占用 | TCP 高并发 → socket buffer 分配受限 |
| Shmem / ShmemHugePages | IPC / 容器共享内存 | 高占用 → TCP buffer 分配失败 |
| Cgroup Memory | 容器内存 | 高占用 → TCP buffer 分配失败 |
| OOM 次数 | 系统内存耗尽 | TCP 会话直接被杀或应用崩溃 |
这份 升级版脚本 + 文档 完全覆盖:
- 系统内存健康评估
- Slab Top N 排序(可查 TCP 占用)
- 容器 Cgroup 内存检测
- TCP 会话相关内存风险分析
- 输出彩色等级(绿/黄/红)
浙公网安备 33010602011771号