Title

配置prometheus短信告警

本文使用的是阿里云的短信服务

需先创建AccessKey ID和AccessKey Secret,并授权短信服务的全部权限

创建签名

image

 

申请一个短信模板并通过审核

image

 

编写webhook短信接口脚本(bash脚本,内嵌python3代码;需要安装jq以及python3的requests库)

vim prometheus-sms-webhook.sh

#!/bin/bash

# Prometheus Alert Manager短信通知脚本(最终修复版)


# -------------------------- 1. 配置区 --------------------------
# Aliyun credentials. A value already present in the environment
# (ALIYUN_ACCESS_KEY / ALIYUN_SECRET_KEY) takes precedence, so real secrets do
# not have to be stored in this file; the placeholders are used otherwise.
ALIYUN_ACCESS_KEY="${ALIYUN_ACCESS_KEY:-xxxxxxxxxxxxxxxxxx}"
ALIYUN_SECRET_KEY="${ALIYUN_SECRET_KEY:-xxxxxxxxxxxxxxxxxxxx}"
# SMS signature name and template code sent as SignName / TemplateCode.
SMS_SIGN_NAME="青岛内分泌糖尿病医院"
SMS_TEMPLATE_CODE="SMS_493255124"
# NOTE: this array holds ONE element containing a comma-separated list, so the
# send loop runs once and hands the whole list to the API as PhoneNumbers.
# NOTE(review): presumably relies on SendSms accepting comma-separated numbers
# in a single request - confirm before splitting into separate elements.
ALERT_PHONES=("18504980030,15706390766,15706390788,18954271554,13655421819")
# File that log() appends to; its parent directory is created on demand.
LOG_PATH="/usr/local/prometheus-webhook-sms/logs/prometheus-sms-alert.log"

# -------------------------- 2. 工具函数 --------------------------
# Emit a timestamped message on stderr and append the same line to $LOG_PATH.
# Globals:   LOG_PATH (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-mm-dd HH:MM:SS] <message>" on stderr and in the log file
# Returns:   status of the final append to the log file
log() {
    local log_time log_msg log_dir
    log_time=$(date '+%Y-%m-%d %H:%M:%S')
    log_msg="[$log_time] $1"
    # printf instead of echo: a message such as "-n" must be printed verbatim.
    printf '%s\n' "$log_msg" >&2
    # Quote the directory so a LOG_PATH containing spaces or glob characters
    # yields exactly one directory instead of being word-split by the shell.
    log_dir=$(dirname -- "$LOG_PATH")
    mkdir -p -- "$log_dir"
    printf '%s\n' "$log_msg" >> "$LOG_PATH"
}

# Send one SMS request through the Aliyun SendSms RPC API (Version 2017-05-25).
# Globals:   ALIYUN_ACCESS_KEY, ALIYUN_SECRET_KEY, SMS_SIGN_NAME,
#            SMS_TEMPLATE_CODE (all read)
# Arguments: $1 - value sent as PhoneNumbers
#            $2 - JSON string sent as TemplateParam
# Outputs:   "success" on stdout when the API answers Code == "OK",
#            otherwise "error: <reason>"
# Requires:  python3 with the third-party `requests` package installed
send_sms_core() {
    local phone="$1"
    local template_param="$2"
    local utc_ts nonce

    utc_ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    nonce=$(date +%s%N | md5sum | head -c 10)

    # The secret key is handed over through the child's environment rather than
    # argv, because command-line arguments are visible to other users via ps.
    SMS_SECRET_KEY="$ALIYUN_SECRET_KEY" python3 - "$ALIYUN_ACCESS_KEY" "$phone" "$SMS_SIGN_NAME" "$SMS_TEMPLATE_CODE" "$utc_ts" "$nonce" "$template_param" << 'END_PYTHON'
import sys
import os
import urllib.parse
import base64
import hmac
import hashlib
import requests
import json

access_key = sys.argv[1]
secret_key = os.environ.get("SMS_SECRET_KEY", "")
phone_num = sys.argv[2]
sign_name = sys.argv[3]
template_code = sys.argv[4]
utc_timestamp = sys.argv[5]
nonce_str = sys.argv[6]
template_param = sys.argv[7]


def percent_encode(value):
    # Everything except unreserved characters is escaped. quote() on
    # Python < 3.7 also escapes "~", which has to stay literal for the
    # signature to match, so it is restored explicitly.
    return urllib.parse.quote(str(value), safe='').replace('%7E', '~')


try:
    params = {
        "AccessKeyId": access_key,
        "Action": "SendSms",
        "Format": "JSON",
        "PhoneNumbers": phone_num,
        "RegionId": "cn-hangzhou",
        "SignName": sign_name,
        "SignatureMethod": "HMAC-SHA1",
        "SignatureNonce": nonce_str,
        "SignatureVersion": "1.0",
        "TemplateCode": template_code,
        "TemplateParam": template_param,
        "Timestamp": utc_timestamp,
        "Version": "2017-05-25"
    }

    # Canonical query string: parameters sorted by name, each side encoded.
    query_str = "&".join(
        f"{percent_encode(k)}={percent_encode(v)}"
        for k, v in sorted(params.items())
    )

    # HMAC-SHA1 over "GET&%2F&<encoded query>", keyed with "<secret>&".
    string_to_sign = "GET&%2F&" + percent_encode(query_str)
    signature = base64.b64encode(
        hmac.new(
            (secret_key + "&").encode("utf-8"),
            string_to_sign.encode("utf-8"),
            hashlib.sha1
        ).digest()
    ).decode("utf-8")

    # HTTPS so the access key id, phone numbers and signature are not sent
    # in clear text.
    request_url = f"https://dysmsapi.aliyuncs.com/?{query_str}&Signature={percent_encode(signature)}"
    response = requests.get(request_url, timeout=15)
    resp_json = response.json()

    if resp_json.get("Code") == "OK":
        print("success")
    else:
        error_msg = resp_json.get("Message", "未知错误")
        print(f"error: {error_msg}")

except requests.exceptions.Timeout:
    print("error: 请求超时")
except Exception as e:
    print(f"error: {str(e)}")
END_PYTHON
}

# Extract the fields of the first alert from an Alertmanager webhook payload.
# Globals:   ALERT_NAME, ALERT_STATUS, ALERT_SEVERITY, ALERT_SUMMARY,
#            ALERT_INSTANCE (written; when the caller declared them `local`,
#            bash dynamic scoping makes these assignments land in the caller)
# Arguments: $1 - JSON document as a string
# Exits:     1 (after logging) when jq is not installed
# Notes:     missing fields fall back to the defaults embedded in the filters;
#            the summary falls back to the description annotation first.
parse_alert_json() {
    local json_data="$1"

    if ! command -v jq &> /dev/null; then
        log "错误:请安装jq工具(yum install jq 或 apt install jq)"
        exit 1
    fi

    # Name and summary are truncated inside jq: string slices there count
    # codepoints, whereas GNU `cut -c` counts bytes and could split a
    # multi-byte UTF-8 character, leaving invalid text in the SMS parameter.
    ALERT_NAME=$(jq -r '(.alerts[0].labels.alertname // "未知告警") | tostring | .[0:25]' <<< "$json_data")
    ALERT_STATUS=$(jq -r '.status // "unknown"' <<< "$json_data")
    ALERT_SEVERITY=$(jq -r '.alerts[0].labels.severity // "普通"' <<< "$json_data")
    ALERT_SUMMARY=$(jq -r '(.alerts[0].annotations.summary // .alerts[0].annotations.description // "无详情") | tostring | .[0:50]' <<< "$json_data")
    ALERT_INSTANCE=$(jq -r '.alerts[0].labels.instance // "未知实例"' <<< "$json_data")
}

# -------------------------- 3. 主逻辑 --------------------------
# Entry point: gather one alert, build the SMS template parameter, send it to
# every entry of ALERT_PHONES and report the outcome.
# Input modes (checked in this order):
#   1. --json-file <path> : JSON payload read from an existing file
#   2. non-tty stdin      : JSON payload read from the pipe (if non-empty)
#   3. five positional arguments: name, status, severity, summary, instance
# Globals:   ALERT_PHONES (read); uses log, parse_alert_json, send_sms_core
# Exits:     0 when every send reported "success", 1 otherwise or on bad usage
main() {
    # Everything is local so nothing leaks into the global scope; the ALERT_*
    # locals are filled in by parse_alert_json through dynamic scoping.
    local mode="direct"
    local alert_json=""
    local json_file=""
    local ALERT_NAME=""
    local ALERT_STATUS=""
    local ALERT_SEVERITY=""
    local ALERT_SUMMARY=""
    local ALERT_INSTANCE=""
    local status_cn="未知"
    local sms_param=""
    local success=0
    local fail=0
    local phone=""
    local result=""

    if [[ "$1" == "--json-file" && -n "$2" && -f "$2" ]]; then
        # File mode: the Flask receiver passes the payload via a temp file.
        mode="webhook"
        json_file="$2"
        alert_json=$(<"$json_file")
        log "从临时文件读取JSON:$json_file"
        parse_alert_json "$alert_json"
        shift 2
    elif [[ ! -t 0 ]]; then
        # Pipe mode: only taken when stdin actually delivered something.
        alert_json=$(cat)
        if [[ -n "$alert_json" ]]; then
            mode="webhook"
            log "从管道读取JSON"
            parse_alert_json "$alert_json"
        fi
    fi

    # Direct mode: exactly five command-line arguments are required.
    if [[ "$mode" == "direct" ]]; then
        if [[ $# -ne 5 ]]; then
            echo "用法1(直接调用):"
            echo "  $0 <告警名称> <状态> <级别> <详情> <实例>"
            echo "示例:"
            echo "  $0 'CPU过高' 'firing' '紧急' 'CPU>95%' '192.168.132.228'"
            echo "用法2(Webhook):"
            echo "  echo 'JSON' | $0 或 $0 --json-file 临时文件路径"
            exit 1
        fi
        ALERT_NAME="$1"
        ALERT_STATUS="$2"
        ALERT_SEVERITY="$3"
        ALERT_SUMMARY="$4"
        ALERT_INSTANCE="$5"
    fi

    # Map the status to its Chinese label; anything else keeps the default.
    case "$ALERT_STATUS" in
        firing)   status_cn="触发" ;;
        resolved) status_cn="恢复" ;;
    esac

    log "==================== 告警详情 ===================="
    log "名称:$ALERT_NAME"
    log "状态:$status_cn"
    log "级别:$ALERT_SEVERITY"
    log "实例:$ALERT_INSTANCE"
    log "详情:$ALERT_SUMMARY"
    log "=================================================="

    # Let jq build the JSON so quoting/escaping of the values is always valid;
    # newlines from jq's pretty-printing are stripped afterwards.
    sms_param=$(jq -n \
        --arg name "$ALERT_NAME" \
        --arg status "$status_cn" \
        --arg severity "$ALERT_SEVERITY" \
        --arg instance "$ALERT_INSTANCE" \
        --arg summary "$ALERT_SUMMARY" \
        '{msg: "告警名称:\($name),状态:\($status),级别:\($severity),实例:\($instance),详情:\($summary)"}' \
        | tr -d '\n'
    )

    # One request per array element, pausing one second between requests.
    for phone in "${ALERT_PHONES[@]}"; do
        log "开始向 $phone 发送短信"
        result=$(send_sms_core "$phone" "$sms_param")

        if [[ "$result" == "success" ]]; then
            log "短信发送成功:$phone"
            success=$((success + 1))
        else
            log "短信发送失败:$phone,原因:$result"
            fail=$((fail + 1))
        fi

        sleep 1
    done

    log "==================== 发送统计 ===================="
    log "总数量:${#ALERT_PHONES[@]},成功:$success,失败:$fail"
    log "=================================================="

    # Explicit if/else rather than `A && B || C`, which is not a true if/else.
    if (( fail == 0 )); then
        exit 0
    else
        exit 1
    fi
}

# Run the entry point, forwarding all command-line arguments unchanged.
main "$@"

 

编写webhook接收器

vim prometheus-sms-webhook-server.py

from flask import Flask, request, jsonify
import subprocess
import json
import logging
import os
import tempfile
import time

app = Flask(__name__)

# -------------------------- 1. Settings (kept Python 3.6 compatible) --------------------------
# Shell script that performs the actual SMS delivery.
SCRIPT_PATH = "/usr/local/prometheus-webhook-sms/prometheus-sms-webhook.sh"
# Maximum run time granted to the script, in seconds.
SCRIPT_TIMEOUT = 30
# Address of the AlertManager host.
ALLOWED_IPS = ["192.168.253.205"]
# Destination of the file log handler.
LOG_FILE = "/var/log/prometheus-webhook.log"

# -------------------------- 2. Logging setup --------------------------
# Records go to LOG_FILE and to the console stream, in that handler order.
_log_handlers = [logging.FileHandler(LOG_FILE)]
_log_handlers.append(logging.StreamHandler())
logging.basicConfig(
    handlers=_log_handlers,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(module)s:%(lineno)d - %(message)s",
    level=logging.INFO,
)

# -------------------------- 3. 工具函数 --------------------------
def is_allowed_ip(client_ip):
    """Decide whether a request from ``client_ip`` may use the webhook.

    The allow-list comparison (``client_ip in ALLOWED_IPS``) is switched off,
    so every address is accepted and ``client_ip`` is not inspected.

    :param client_ip: remote address of the incoming request
    :return: always ``True``
    """
    # NOTE(review): with the check disabled, any host that can reach the
    # listening port can trigger SMS sending - confirm this is intended.
    accept_everyone = True
    return accept_everyone

# -------------------------- 4. 核心Webhook接口(修复Python 3.6兼容性) --------------------------
@app.route('/webhook', methods=['POST'])
def webhook():
    """Receive an alert POST and hand it to the SMS shell script.

    Flow: check the caller's IP, parse the JSON body, write it to a temporary
    file, verify the script exists and is executable, run it with
    ``--json-file <tempfile>`` and translate its exit code into a response.

    Responses (JSON body with ``status`` and ``message``):
      * 200 - script exited with code 0
      * 400 - body missing, empty, not valid JSON, or not sent as JSON
      * 403 - caller rejected by ``is_allowed_ip``
      * 500 - script missing / not executable / non-zero exit / timeout /
              any unexpected exception

    The temporary file is removed in ``finally`` on every path.
    """
    start_time = time.time()
    client_ip = request.remote_addr
    logging.info(f"收到来自 {client_ip} 的告警请求,耗时统计开始")
    temp_file = None

    try:
        # 1. Validate the caller's address.
        if not is_allowed_ip(client_ip):
            logging.warning(f"拒绝非法IP {client_ip}(不在允许列表)")
            return jsonify({"status": "error", "message": "Forbidden: Invalid IP"}), 403

        # 2. Parse the JSON body. silent=True makes Flask return None for a
        #    malformed body or a non-JSON content type instead of raising an
        #    HTTP exception, which the generic handler below would have turned
        #    into a misleading 500; such client errors are answered with 400.
        alert_data = request.get_json(silent=True)
        if not alert_data:
            parse_err = "请求体为空、不是合法JSON或Content-Type非application/json"
            logging.error(f"JSON解析失败:{parse_err},原始请求体:{request.data[:200]}")
            return jsonify({"status": "error", "message": f"JSON解析失败:{parse_err}"}), 400
        logging.info(f"AlertManager请求JSON:{json.dumps(alert_data, ensure_ascii=False)[:800]}")

        # 3. Persist the payload to a temp file (delete=False so the script can
        #    still open it after this handle is closed; removed in finally).
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.json', delete=False) as f:
            json.dump(alert_data, f, ensure_ascii=False)
            temp_file = f.name
        logging.info(f"JSON已写入临时文件:{temp_file}")

        # 4. Make sure the script can be run at all.
        if not os.path.exists(SCRIPT_PATH):
            err_msg = f"脚本不存在:{SCRIPT_PATH}"
            logging.error(err_msg)
            return jsonify({"status": "error", "message": err_msg}), 500
        if not os.access(SCRIPT_PATH, os.X_OK):
            err_msg = f"脚本无执行权限:{SCRIPT_PATH}(执行 chmod +x {SCRIPT_PATH})"
            logging.error(err_msg)
            return jsonify({"status": "error", "message": err_msg}), 500

        # 5. Run the script. universal_newlines=True is the Python 3.6
        #    spelling of text=True; the argument list form avoids a shell.
        logging.info(f"调用脚本:{SCRIPT_PATH} --json-file {temp_file}")
        result = subprocess.run(
            [SCRIPT_PATH, "--json-file", temp_file],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            timeout=SCRIPT_TIMEOUT
        )

        # 6. Log what the script produced.
        stdout = result.stdout.strip() if result.stdout else ""
        stderr = result.stderr.strip() if result.stderr else ""
        logging.info(f"脚本返回码:{result.returncode}")
        logging.info(f"脚本stdout:{stdout}")
        logging.info(f"脚本stderr:{stderr}")

        # 7. Map the exit code to the HTTP response.
        if result.returncode == 0:
            logging.info(f"处理成功,耗时:{time.time()-start_time:.2f}秒")
            return jsonify({"status": "success", "message": "告警处理完成"}), 200
        else:
            err_msg = f"脚本执行失败:{stderr[:500]}"
            logging.error(f"处理失败,耗时:{time.time()-start_time:.2f}秒,原因:{err_msg}")
            return jsonify({"status": "error", "message": err_msg}), 500

    except subprocess.TimeoutExpired:
        err_msg = f"脚本超时(>{SCRIPT_TIMEOUT}秒)"
        logging.error(err_msg)
        return jsonify({"status": "error", "message": err_msg}), 500

    except Exception as e:
        err_msg = f"请求处理异常:{str(e)}"
        logging.error(err_msg, exc_info=True)
        return jsonify({"status": "error", "message": err_msg}), 500

    finally:
        # Always clean up the temp file, whichever path returned.
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)
            logging.info(f"临时文件已删除:{temp_file}")

# -------------------------- 5. 启动服务 --------------------------
if __name__ == '__main__':
    # Listen on every network interface, port 9095, with debug mode off.
    # (The call was missing its closing parenthesis, which made the whole
    # file fail with a SyntaxError.)
    app.run(host='0.0.0.0', port=9095, debug=False)

 

测试直接调用模式,直接使用脚本,不调用alertmanager,看能否收到短信

# Direct-call smoke test: the five positional arguments are, in order,
# alert name, status, severity, summary and instance.
/usr/local/prometheus-webhook-sms/prometheus-sms-webhook.sh \
"CPU使用率过高" \
"firing" \
"紧急" \
"CPU使用率超过95%,持续5分钟" \
"192.168.132.228"

 

测试 Webhook 模式

# Pipe-mode smoke test: feeds a sample payload to the script on stdin, with no
# positional arguments. The here-document body contains no "$", so the
# unquoted EOF delimiter causes no expansion here.
cat <<EOF | /usr/local/prometheus-webhook-sms/prometheus-sms-webhook.sh
{
  "status": "firing",
  "alerts": [
    {
      "labels": {
        "alertname": "CPU_Usage_High",
        "severity": "critical",
        "instance": "192.168.132.228"
      },
      "annotations": {
        "summary": "CPU使用率>95%",
        "description": "服务器CPU持续5分钟过高"
      }
    }
  ]
}
EOF

若直接调用和 Webhook 模式测试均正常,直接启动 Flask 服务(webhook接收器)即可

 

测试一下能否发送告警

# End-to-end test of the Flask receiver: POSTs a sample payload as
# application/json to the /webhook endpoint on port 9095.
curl -X POST \
  -H "Content-Type: application/json" \
  -d '{
    "status": "firing",
    "alerts": [
      {
        "labels": {"alertname": "Test_From_AlertManager", "severity": "critical", "instance": "192.168.132.228"},
        "annotations": {"summary": "测试AlertManager转发", "description": "从AlertManager服务器发起的测试"}
      }
    ]
  }' \
  http://192.168.132.228:9095/webhook

 添加alertmanager

 

图片

 

 

再使用alertmanager测试一下

生成一个告警

# Create a test alert in the Alertmanager listening on localhost:9093.
# NOTE(review): amtool presumably treats bare key=value arguments as labels,
# so summary/description here may arrive as labels rather than annotations
# (the SMS script reads .annotations.summary) - confirm, and consider
# --annotation=summary=... if the SMS shows no detail text.
amtool alert add \
  --alertmanager.url=http://localhost:9093 \
  alertname="TestAlert" \
  severity="critical" \
  namespace="test-namespace" \
  instance="192.168.253.205:62429" \
  summary="测试告警" \
  description="测试Webhook接口"

 

 取消告警(不取消会一直存在)

# Clear the test alert again, addressing it by its labels.
# NOTE(review): confirm that the installed amtool version offers an
# `alert resolve` subcommand and that this label set matches the alert
# created above (that one also carried summary/description).
amtool alert resolve \
  --alertmanager.url=http://localhost:9093 \
  alertname="TestAlert" \
  severity="critical" \
  namespace="test-namespace" \
  instance="192.168.253.205:62429"
posted @ 2025-08-26 11:45  Esurts~  阅读(103)  评论(0)    收藏  举报