青成林语

协助他人,成就彼此 ^_^
Python抓取Prometheus容器数据,并实现监控报警
# 作者:https://www.cnblogs.com/honglicheng
import json
import math
import pytz
import requests
from datetime import datetime


class MonitoringAlert:
    """Poll Prometheus for container CPU/memory usage and alert via webhook.

    For every configured namespace the monitor runs two instant PromQL
    queries (CPU usage vs. quota, memory working set vs. limit), collects
    the series whose value exceeds the configured threshold, and posts one
    markdown message per offending series to the alert webhook.

    All constructor arguments are optional; omitting them yields the same
    configuration the class has always used.
    """

    # Namespaces inspected when the caller does not supply a list.
    DEFAULT_NAMESPACES = (
        "apollo", "bhpc-admin-nginx", "bluehelix", "broker", "cert-manager",
        "chainnode", "clear", "elastic-system", "exchange", "gateway",
        "kube-node-lease", "kube-public", "kube-system", "log", "wallet", "rc",
    )
    # Prometheus instant-query endpoint.
    DEFAULT_API_URL = "https://prometheus.doex.io/api/v1/query"
    # Alert webhook endpoint.
    # NOTE(review): the key embedded here is a credential; prefer passing
    # ``alert_webhook`` explicitly (e.g. from the environment) in production.
    DEFAULT_ALERT_WEBHOOK = (
        "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxxxaf216f7"
    )

    def __init__(self, cpu_threshold=80, mem_threshold=80, namespace_list=None,
                 api_url=None, alert_webhook=None):
        """Store the monitoring configuration.

        Args:
            cpu_threshold: CPU usage percentage above which an alert fires.
            mem_threshold: Memory usage percentage above which an alert fires.
            namespace_list: Iterable of namespaces to check; defaults to
                ``DEFAULT_NAMESPACES``.
            api_url: Prometheus query endpoint; defaults to ``DEFAULT_API_URL``.
            alert_webhook: Webhook URL receiving alerts; defaults to
                ``DEFAULT_ALERT_WEBHOOK``.
        """
        self.cpu_threshold = cpu_threshold
        self.mem_threshold = mem_threshold
        # Copy so later mutation of the caller's list (or of ours) cannot
        # leak across instances.
        self.namespace_list = list(
            self.DEFAULT_NAMESPACES if namespace_list is None else namespace_list
        )
        self.api_url = self.DEFAULT_API_URL if api_url is None else api_url
        self.alert_webhook = (
            self.DEFAULT_ALERT_WEBHOOK if alert_webhook is None else alert_webhook
        )

    def _fetch_metrics(self, query):
        """Run an instant query and return the ``data.result`` list.

        Any transport error, HTTP error status, non-JSON body or unexpected
        payload shape is reported on stdout and yields an empty list, so a
        single failing query never aborts the monitoring run.
        """
        try:
            response = requests.get(self.api_url, params={"query": query}, timeout=10)
            response.raise_for_status()  # turn 4xx/5xx into an exception
            payload = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException.
            print(f"获取监控数据失败:{str(e)}")
            return []

        data = payload.get("data") if isinstance(payload, dict) else None
        result = data.get("result") if isinstance(data, dict) else None
        return result if isinstance(result, list) else []

    def _collect_over_threshold(self, query, threshold, value_key, alert_type, label):
        """Return alert dicts for every series in *query* above *threshold*.

        Args:
            query: PromQL expression yielding a percentage per (container, pod).
            threshold: Percentage a sample must exceed to be reported.
            value_key: Key under which the rounded usage is stored.
            alert_type: Value stored under the ``"type"`` key.
            label: Metric name inserted into the parse-failure message.
        """
        alerts = []
        for item in self._fetch_metrics(query):
            try:
                usage = float(item["value"][1])
                # NaN / ±Inf appear when the divisor (quota or limit) is
                # missing or zero; such samples carry no usable percentage.
                if not (math.isfinite(usage) and usage > threshold):
                    continue
                alerts.append({
                    # NOTE(review): the field is named "container" but holds
                    # the pod label; kept as-is for compatibility.
                    "container": item["metric"]["pod"],
                    value_key: round(usage, 2),  # two decimal places
                    "type": alert_type,
                })
            except (KeyError, IndexError, TypeError, ValueError) as e:
                # A malformed sample is skipped instead of aborting the run.
                print(f"解析{label}数据失败:{str(e)}")
        return alerts

    def get_cpu_usage(self, namespace):
        """Return CPU alerts for *namespace*.

        Usage is ``irate`` of consumed CPU seconds over 2m divided by the
        CPU quota (quota / 100000), times 100, grouped by (container, pod).

        Returns:
            A list of dicts with keys ``container``, ``cpu_usage``, ``type``
            (``"cpu"``) for every series above ``self.cpu_threshold``.
        """
        selector = f'{{container!="", container!="POD", namespace=~"{namespace}"}}'
        query = (
            f'sum(irate(container_cpu_usage_seconds_total{selector}[2m])) '
            f'by (container, pod) / '
            f'sum(container_spec_cpu_quota{selector} / 100000) '
            f'by (container, pod) * 100'
        )
        return self._collect_over_threshold(
            query, self.cpu_threshold, "cpu_usage", "cpu", "CPU"
        )

    def get_mem_usage(self, namespace):
        """Return memory alerts for *namespace*.

        Usage is the working-set bytes divided by the memory limit bytes,
        times 100, grouped by (container, pod).

        Returns:
            A list of dicts with keys ``container``, ``mem_usage``, ``type``
            (``"memory"``) for every series above ``self.mem_threshold``.
        """
        selector = f'{{container!="", container!="POD", namespace=~"{namespace}"}}'
        query = (
            f'sum(container_memory_working_set_bytes{selector}) '
            f'by (container, pod) / '
            f'sum(container_spec_memory_limit_bytes{selector}) '
            f'by (container, pod) * 100'
        )
        return self._collect_over_threshold(
            query, self.mem_threshold, "mem_usage", "memory", "内存"
        )

    def _build_alert_content(self, alert_info, alert_time):
        """Render the markdown body for one alert.

        The text is assembled line by line so that no source-code
        indentation ends up in the message. Body lines are joined with two
        trailing spaces plus a newline (markdown hard line break).
        """
        if alert_info["type"] == "cpu":
            label = "CPU"
            value = alert_info["cpu_usage"]
            threshold = self.cpu_threshold
        else:
            label = "内存"
            value = alert_info["mem_usage"]
            threshold = self.mem_threshold

        body = [
            f"**容器名称:** {alert_info['container']}",
            f"**监控类型:** {label}使用率",
            f"**当前值:** {value}%",
            f"**报警阈值:** {threshold}%",
            f"**报警时间:** {alert_time}",
            f"**问题描述:** 容器{label}使用率已超过阈值,请及时处理",
        ]
        return "# 容器资源报警通知\n" + "  \n".join(body)

    def send_alert(self, alert_info):
        """Post one alert to the webhook as a markdown message.

        Args:
            alert_info: Dict produced by ``get_cpu_usage`` / ``get_mem_usage``.

        Failures (transport, HTTP status, non-JSON reply, non-zero
        ``errcode``) are printed and otherwise ignored.
        """
        # Timestamp is rendered in the Asia/Shanghai time zone.
        alert_time = datetime.now(pytz.timezone("Asia/Shanghai")).strftime(
            "%Y-%m-%d %H:%M:%S"
        )
        content = self._build_alert_content(alert_info, alert_time)

        try:
            response = requests.post(
                self.alert_webhook,
                headers={"Content-Type": "application/json"},
                json={
                    "msgtype": "markdown",
                    "markdown": {"content": content},
                },
                timeout=10,
            )
            response.raise_for_status()
            reply = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"发送报警通知失败:{str(e)}")
            return

        if isinstance(reply, dict) and reply.get("errcode") == 0:
            print(f"[{alert_time}] {alert_info['container']} 报警通知发送成功")
        else:
            errmsg = reply.get("errmsg") if isinstance(reply, dict) else reply
            print(f"发送失败:{errmsg}")

    def run_monitor(self):
        """Check every configured namespace and send an alert per violation."""
        print("开始执行容器资源监控...")
        for namespace in self.namespace_list:
            print(f"检查命名空间:{namespace}")
            # CPU first, then memory; each alert is sent as soon as its
            # metric has been evaluated.
            for alert in self.get_cpu_usage(namespace):
                self.send_alert(alert)
            for alert in self.get_mem_usage(namespace):
                self.send_alert(alert)
        print("监控执行完毕")


# Script entry point: build a monitor with the default configuration and
# perform a single monitoring pass.
if __name__ == "__main__":
    MonitoringAlert().run_monitor()

 

posted on 2025-12-05 12:36  青成林语  阅读(0)  评论(0)    收藏  举报