远程进程监控工具
背景
DHCP频繁请求IP导致CPR进程反复重启。为量化这一问题并精准捕捉重启时间点(便于与网络日志对齐),需开发外部监控工具。
需求分析
- 实时监控:检测进程重启事件
- 时间对齐:记录精确重启时间
- 多设备支持:监控多台远程服务器
- 日志过滤:避免正常进程干扰
解决方案
基于Python SSH连接远程设备,周期性执行ps -ef命令,通过比对前后两次采样的PID变化判断进程状态。
1. 配置文件 (config.json)
将可变参数抽离至配置文件,便于维护。
{
"ssh": {
"user": "root",
"password": "Jsst_***",
"port": 22
},
"router_ip": "192.168.7.150",
"targets": [
"192.168.8.80",
"192.168.8.90"
]
}
2. 监控脚本 (monitor.py)
import paramiko
import time
import logging
import json
import threading
import os
# ---- Tunable settings ----
CONFIG_FILE = 'config.json'  # JSON file read by load_config()
INTERVAL = 5  # seconds between two consecutive polls of a device
# Ignore list: these processes start and stop all the time during normal
# operation, so they are filtered out to keep the log readable.
IGNORE_LIST = [
    'ping',
    'jsm1689_ping_baidu.sh',
    'top',
    'sleep',
]

# ---- Logging setup ----
# Every record goes both to monitor.log (UTF-8) and to the console.
_log_handlers = [
    logging.FileHandler("monitor.log", encoding='utf-8'),
    logging.StreamHandler(),
]
logging.basicConfig(
    handlers=_log_handlers,
    format='%(asctime)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class DeviceMonitor(threading.Thread):
    """Poll one remote host over SSH and log process start/exit/restart events.

    Each instance is a thread bound to a single IP address. On every cycle it
    runs ``ps -ef`` on the remote host, groups the PIDs by cleaned process
    name, and compares the result with the previous sample.
    """

    # Shell interpreters: when one of these is argv[0] and the next argument
    # is not an option, the next argument is the script being run.
    _SHELLS = frozenset(
        {'sh', 'bash', 'ash', 'dash', 'zsh', 'ksh', 'csh', 'tcsh', 'fish'}
    )

    # Pause before reconnecting after a failed sample, so a host that accepts
    # the connection but rejects the command is not hammered in a tight loop.
    _RECONNECT_PAUSE = 1

    def __init__(self, ip, ssh_config):
        """Create a monitor for *ip*.

        Args:
            ip: Address of the remote host.
            ssh_config: Mapping with ``user`` and ``password`` keys and an
                optional ``port`` key (defaults to 22).
        """
        super().__init__()
        self.ip = ip
        self.ssh_conf = ssh_config
        self.client = None
        # Previous sample: {process name: {pid1, pid2, ...}}
        self.last_proc_map = {}
        self.running = True

    def stop(self):
        """Ask the polling loop to exit once the current iteration finishes."""
        self.running = False

    def _close(self):
        """Close and forget the current SSH client, if there is one."""
        if self.client is None:
            return
        try:
            self.client.close()
        except Exception:
            # Best effort: the link is usually already dead at this point.
            pass
        self.client = None

    def _is_connected(self):
        """Return True when a client exists and its transport is active."""
        if not self.client:
            return False
        transport = self.client.get_transport()
        return transport is not None and transport.is_active()

    def connect(self):
        """Open a fresh SSH connection with short timeouts.

        Any previous client is closed first. Short timeouts make a dropped
        link noticeable quickly.

        Returns:
            True on success, False if the connection could not be established
            (the failure is logged and no client is kept).
        """
        self._close()
        try:
            self.client = paramiko.SSHClient()
            # NOTE(security): AutoAddPolicy accepts unknown host keys without
            # verification; only appropriate on a trusted network.
            self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            self.client.connect(
                self.ip,
                port=self.ssh_conf.get('port', 22),
                username=self.ssh_conf['user'],
                password=self.ssh_conf['password'],
                timeout=5,
                banner_timeout=5,
            )
            return True
        except Exception as e:
            logger.error(f"[{self.ip}] 连接失败: {e}")
            # Release the half-open client so its transport does not linger.
            self._close()
            return False

    def clean_name(self, parts):
        """Derive a short process name from a split command line.

        1. Shell scripts: ``/bin/sh app.sh`` -> ``app.sh``
        2. Paths are stripped: ``/usr/bin/app`` -> ``app``

        Only real shell interpreters are unwrapped, so ``ssh host`` stays
        ``ssh`` and ``./app.sh start`` stays ``app.sh``.

        Args:
            parts: The command column split on whitespace.

        Returns:
            The cleaned name, or ``"unknown"`` when *parts* is empty.
        """
        if not parts:
            return "unknown"
        cmd = parts[0]
        # Login shells may show up as "-bash"; ignore the dash for detection.
        interpreter = cmd.rsplit('/', 1)[-1].lstrip('-')
        if (
            interpreter in self._SHELLS
            and len(parts) > 1
            and not parts[1].startswith('-')
        ):
            cmd = parts[1]
        return cmd.rsplit('/', 1)[-1]

    def _parse_ps(self, output):
        """Turn raw ``ps -ef`` text into ``{name: {pids}}``."""
        curr_map = {}
        for line in output.strip().split('\n')[1:]:  # skip the header row
            parts = line.split()
            # Sanity check: need at least 8 columns and a numeric PID.
            # Assumes the procps layout (UID PID PPID C STIME TTY TIME CMD)
            # -- TODO confirm for BusyBox-based targets.
            if len(parts) < 8 or not parts[1].isdigit():
                continue
            pid = int(parts[1])
            cmd_parts = parts[7:]
            full_cmd = " ".join(cmd_parts)
            # Skip kernel threads ([...]), sleep, and our own ps invocation.
            if (
                full_cmd.startswith('[')
                or cmd_parts[0] == 'sleep'
                or 'ps -ef' in full_cmd
            ):
                continue
            name = self.clean_name(cmd_parts)
            # Skip processes on the ignore list.
            if name in IGNORE_LIST:
                continue
            curr_map.setdefault(name, set()).add(pid)
        return curr_map

    def get_processes(self):
        """Run ``ps -ef`` over SSH and return the parsed process map.

        Returns:
            ``{name: {pids}}`` on success, or None when there is no client or
            the command failed; the caller treats None as a dropped link.
        """
        if not self.client:
            return None
        try:
            _, stdout, _ = self.client.exec_command('ps -ef', timeout=5)
            # errors='replace': a stray non-UTF-8 byte in some command line
            # must not be mistaken for a connection failure.
            output = stdout.read().decode('utf-8', errors='replace')
            return self._parse_ps(output)
        except Exception as e:
            logger.warning(f"[{self.ip}] 获取进程列表失败: {e}")
            return None

    def compare(self, new_map):
        """Compare *new_map* with the previous sample and log differences.

        The first sample after start-up or a reconnect only sets the baseline.
        For every process name afterwards:

        * both PID sets non-empty and disjoint -> RESTART (warning)
        * PIDs both added and removed          -> CHANGE (warning)
        * PIDs only added                      -> NEW (info)
        * PIDs only removed                    -> EXIT (info)
        """
        prefix = f"[{self.ip}]"
        if not self.last_proc_map:
            self.last_proc_map = new_map
            logger.info(f"{prefix} 初始化成功,监控进程数: {len(new_map)}")
            return
        # Sorted so the log order is stable from run to run.
        for cmd in sorted(self.last_proc_map.keys() | new_map.keys()):
            old_p = self.last_proc_map.get(cmd, set())
            new_p = new_map.get(cmd, set())
            if old_p == new_p:
                continue
            # Business rule: highlight anything whose name contains "cpr".
            alert_tag = " >>> CPR警报 <<<" if "cpr" in cmd.lower() else ""
            added, removed = new_p - old_p, old_p - new_p
            if added and removed:
                if old_p.isdisjoint(new_p):
                    # No PID survived: the whole service restarted.
                    logger.warning(f"{prefix}{alert_tag} [!] RESTART: {cmd} (PID: {old_p} -> {new_p})")
                else:
                    # Some workers of a multi-process service were replaced.
                    logger.warning(f"{prefix}{alert_tag} [*] CHANGE: {cmd} PID变化 (New:{added}, Old:{removed})")
            elif added:
                logger.info(f"{prefix} [+] NEW: {cmd} (PID: {added})")
            else:
                logger.info(f"{prefix} [-] EXIT: {cmd} (PID: {removed})")
        self.last_proc_map = new_map

    def run(self):
        """Thread body: keep the link alive, sample, compare, sleep."""
        logger.info(f"[{self.ip}] 启动监控线程...")
        try:
            while self.running:
                # 1. Keep-alive: (re)connect when the transport is gone.
                if not self._is_connected():
                    if not self.connect():
                        time.sleep(10)  # back off before the next attempt
                        continue
                    # Fresh connection: drop the baseline so the gap does not
                    # produce false alarms.
                    self.last_proc_map = {}
                # 2. Sample.
                procs = self.get_processes()
                # 3. Handle a dropped link, or compare.
                if procs is None:
                    logger.warning(f"[{self.ip}] 连接断开,尝试重连...")
                    self._close()
                    time.sleep(self._RECONNECT_PAUSE)
                    continue
                self.compare(procs)
                time.sleep(INTERVAL)
        finally:
            self._close()
def load_config():
    """Load and validate the JSON configuration file.

    Reads ``CONFIG_FILE`` as UTF-8 JSON. The file is opened directly instead
    of being checked for existence first, so there is no window in which it
    can disappear between the check and the read.

    Returns:
        The parsed configuration as a dict, or None when the file is missing,
        cannot be read or parsed, or its top level is not a JSON object. Every
        failure is logged.
    """
    try:
        with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except FileNotFoundError:
        logger.error(f"找不到配置文件 {CONFIG_FILE}")
        return None
    except Exception as e:
        logger.error(f"配置文件格式错误: {e}")
        return None
    # Callers use config.get(...); reject any other JSON root type here
    # rather than letting them fail with AttributeError.
    if not isinstance(config, dict):
        logger.error("配置文件格式错误: 顶层必须是 JSON 对象")
        return None
    return config
if __name__ == "__main__":
    # Entry point: start one monitor thread per configured target, then keep
    # the main thread alive until Ctrl-C or until no monitor is left running.
    config = load_config()
    if config:
        router_ip = config.get('router_ip', '')
        targets = config.get('targets', [])
        ssh_conf = config.get('ssh', {})
        threads = []
        logger.info(">>> 进程监控系统启动 <<<")
        logger.info(f"目标设备: {targets}")
        # One independent thread per target IP.
        for ip in targets:
            if ip == router_ip:
                logger.info(f"跳过路由器 IP: {ip}")
                continue
            t = DeviceMonitor(ip, ssh_conf)
            t.daemon = True  # monitor threads die with the main program
            t.start()
            threads.append(t)
        try:
            # Wait only while a monitor is alive, so an empty or fully
            # skipped target list does not block forever with nothing to do.
            while any(t.is_alive() for t in threads):
                time.sleep(1)
            logger.error("没有正在运行的监控线程,程序退出")
        except KeyboardInterrupt:
            logger.info("停止监控")
核心设计
检测算法
基于PID集合比较,识别以下状态变化:
| 状态 | 检测条件 | 日志级别 |
|---|---|---|
| 完全重启 | 新旧PID集合均非空且无交集 | WARNING |
| 部分新增 | 仅有新增PID(新PID集合真包含旧PID集合,含进程首次出现) | INFO |
| 部分退出 | 仅有退出PID(旧PID集合真包含新PID集合,含进程完全消失) | INFO |
| 同时增减 | 新旧PID集合仍有交集,且同时存在新增与退出的PID | WARNING |
特殊处理
- CPR进程高亮:进程名包含"cpr"时添加警报标记
- 进程名清洗:规范化脚本和路径名称
- 忽略名单:过滤频繁启停的正常进程
- 断线重连:自动恢复SSH连接
运行效果
启动脚本后,系统自动监控配置文件中的设备。CPR进程重启时,日志显示高亮警报:
2025-12-06 21:15:30 - [192.168.8.80] >>> CPR警报 <<< [!] RESTART: cpr (PID: {23727} -> {24005})
应用价值
- 故障定位:精确记录CPR重启时间点
- 日志关联:便于与网络日志进行时间对齐
- 量化分析:统计重启频率,辅助问题诊断
- 扩展性强:可适配其他关键进程监控场景
浙公网安备 33010602011771号