# 作者:https://www.cnblogs.com/honglicheng
import json
import math
import pytz
import requests
from datetime import datetime
class MonitoringAlert:
    """Check container CPU/memory usage in Prometheus and alert via webhook.

    For each configured namespace the monitor runs two PromQL instant
    queries (CPU usage vs. quota, memory working set vs. limit), collects the
    series whose usage percentage exceeds the configured threshold, and posts
    one Markdown message per offending series to the alert webhook.

    All constructor arguments are keyword-only and default to the values the
    script has always used, so ``MonitoringAlert()`` behaves as before.
    """

    # Namespaces inspected when the caller does not supply its own list.
    DEFAULT_NAMESPACES = (
        "apollo", "bhpc-admin-nginx", "bluehelix", "broker", "cert-manager",
        "chainnode", "clear", "elastic-system", "exchange", "gateway",
        "kube-node-lease", "kube-public", "kube-system", "log", "wallet", "rc",
    )
    # Prometheus instant-query endpoint.
    DEFAULT_API_URL = "https://prometheus.doex.io/api/v1/query"
    # Webhook that receives the alert messages.
    # NOTE(review): the key is embedded in source; consider loading it from
    # configuration or the environment instead.
    DEFAULT_ALERT_WEBHOOK = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxxxaf216f7"

    def __init__(self, *, cpu_threshold=80, mem_threshold=80,
                 namespace_list=None, api_url=None, alert_webhook=None):
        """Store the monitoring configuration.

        Args:
            cpu_threshold: CPU usage percentage above which an alert is sent.
            mem_threshold: Memory usage percentage above which an alert is sent.
            namespace_list: Iterable of namespaces to check; defaults to
                ``DEFAULT_NAMESPACES``.
            api_url: Prometheus query endpoint; defaults to ``DEFAULT_API_URL``.
            alert_webhook: Alert webhook URL; defaults to
                ``DEFAULT_ALERT_WEBHOOK``.
        """
        self.cpu_threshold = cpu_threshold
        self.mem_threshold = mem_threshold
        # Always keep a private list so callers can mutate it per instance.
        self.namespace_list = list(
            self.DEFAULT_NAMESPACES if namespace_list is None else namespace_list
        )
        self.api_url = self.DEFAULT_API_URL if api_url is None else api_url
        self.alert_webhook = (
            self.DEFAULT_ALERT_WEBHOOK if alert_webhook is None else alert_webhook
        )

    def _fetch_metrics(self, query):
        """Run a PromQL instant query and return its ``data.result`` list.

        Args:
            query: PromQL expression sent as the ``query`` parameter.

        Returns:
            The list found under ``data.result`` in the JSON response, or an
            empty list when the request fails, the body is not valid JSON, or
            the expected keys are missing.
        """
        try:
            response = requests.get(self.api_url, params={"query": query}, timeout=10)
            response.raise_for_status()  # turn HTTP error statuses into exceptions
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            print(f"获取监控数据失败:{str(e)}")
            return []
        if not isinstance(data, dict):
            return []
        # Tolerate a null "data"/"result" as well as missing keys.
        return (data.get("data") or {}).get("result") or []

    def _collect_over_threshold(self, query, threshold, usage_key, alert_type, label):
        """Return alert dicts for every series in *query* above *threshold*.

        Args:
            query: PromQL expression yielding a usage percentage per series.
            threshold: Percentage a series must exceed to be reported.
            usage_key: Key under which the rounded usage is stored
                (``"cpu_usage"`` or ``"mem_usage"``).
            alert_type: Value stored under ``"type"`` in each alert.
            label: Metric name interpolated into the parse-failure log line.

        Returns:
            A list of ``{"container", usage_key, "type"}`` dicts.
        """
        alerts = []
        for item in self._fetch_metrics(query):
            try:
                usage = float(item["value"][1])
                # Non-finite results (e.g. division by a zero quota/limit)
                # carry no usable percentage and are skipped.
                if not math.isfinite(usage) or usage <= threshold:
                    continue
                alerts.append({
                    # The alert is keyed by the series' "pod" label.
                    "container": item["metric"]["pod"],
                    usage_key: round(usage, 2),
                    "type": alert_type,
                })
            except (KeyError, IndexError, TypeError, ValueError) as e:
                # A single malformed sample must not abort the whole namespace.
                print(f"解析{label}数据失败:{str(e)}")
        return alerts

    def get_cpu_usage(self, namespace):
        """Return CPU alerts for containers in *namespace*.

        Args:
            namespace: Namespace name; it is interpolated into a PromQL
                regex matcher (``namespace=~"..."``).

        Returns:
            A list of ``{"container", "cpu_usage", "type": "cpu"}`` dicts for
            series whose usage exceeds ``self.cpu_threshold``.
        """
        # CPU seconds rate divided by the CPU quota (quota / 100000), as a
        # percentage, grouped per (container, pod).
        query = (
            f'sum(irate(container_cpu_usage_seconds_total{{container!="", container!="POD", namespace=~"{namespace}"}}[2m])) '
            f'by (container, pod) / '
            f'sum(container_spec_cpu_quota{{container!="", container!="POD", namespace=~"{namespace}"}} / 100000) '
            f'by (container, pod) * 100'
        )
        return self._collect_over_threshold(
            query, self.cpu_threshold, "cpu_usage", "cpu", "CPU"
        )

    def get_mem_usage(self, namespace):
        """Return memory alerts for containers in *namespace*.

        Args:
            namespace: Namespace name; it is interpolated into a PromQL
                regex matcher (``namespace=~"..."``).

        Returns:
            A list of ``{"container", "mem_usage", "type": "memory"}`` dicts
            for series whose usage exceeds ``self.mem_threshold``.
        """
        # Working-set bytes divided by the memory limit, as a percentage,
        # grouped per (container, pod).
        query = (
            f'sum(container_memory_working_set_bytes{{container!="", container!="POD", namespace=~"{namespace}"}}) '
            f'by (container, pod) / '
            f'sum(container_spec_memory_limit_bytes{{container!="", container!="POD", namespace=~"{namespace}"}}) '
            f'by (container, pod) * 100'
        )
        return self._collect_over_threshold(
            query, self.mem_threshold, "mem_usage", "memory", "内存"
        )

    def _build_alert_content(self, alert_info, alert_time):
        """Render the Markdown body for one alert.

        Any ``alert_info["type"]`` other than ``"cpu"`` is rendered as a
        memory alert.
        """
        if alert_info["type"] == "cpu":
            kind, current, threshold = "CPU", alert_info["cpu_usage"], self.cpu_threshold
        else:
            kind, current, threshold = "内存", alert_info["mem_usage"], self.mem_threshold
        return "\n".join([
            "# 容器资源报警通知",
            f'**容器名称:** {alert_info["container"]}',
            f"**监控类型:** {kind}使用率",
            f"**当前值:** {current}%",
            f"**报警阈值:** {threshold}%",
            f"**报警时间:** {alert_time}",
            f"**问题描述:** 容器{kind}使用率已超过阈值,请及时处理",
        ])

    def send_alert(self, alert_info):
        """Post one alert to the webhook as a Markdown message.

        Args:
            alert_info: Alert dict as produced by ``get_cpu_usage`` or
                ``get_mem_usage``.

        Network errors and unparsable responses are logged, not raised.
        """
        # Timestamp the alert in the Asia/Shanghai time zone.
        shanghai_tz = pytz.timezone("Asia/Shanghai")
        alert_time = datetime.now(shanghai_tz).strftime("%Y-%m-%d %H:%M:%S")
        content = self._build_alert_content(alert_info, alert_time)
        try:
            response = requests.post(
                self.alert_webhook,
                headers={"Content-Type": "application/json"},
                json={
                    "msgtype": "markdown",
                    "markdown": {"content": content},
                },
                timeout=10,
            )
            response.raise_for_status()
            result = response.json()  # parse once and reuse below
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"发送报警通知失败:{str(e)}")
            return
        # The webhook reports success with errcode == 0 in its JSON reply.
        if isinstance(result, dict) and result.get("errcode") == 0:
            print(f"[{alert_time}] {alert_info['container']} 报警通知发送成功")
        else:
            errmsg = result.get("errmsg") if isinstance(result, dict) else result
            print(f"发送失败:{errmsg}")

    def run_monitor(self):
        """Check every configured namespace and send an alert per violation."""
        print("开始执行容器资源监控...")
        for namespace in self.namespace_list:
            print(f"检查命名空间:{namespace}")
            # CPU alerts are sent first, then memory alerts.
            for alert in self.get_cpu_usage(namespace):
                self.send_alert(alert)
            for alert in self.get_mem_usage(namespace):
                self.send_alert(alert)
        print("监控执行完毕")
if __name__ == "__main__":
    # Script entry point: run a single monitoring pass with the default
    # configuration.
    monitor = MonitoringAlert()
    monitor.run_monitor()