配置prometheus短信告警
本文使用的是阿里云的短信服务
需先创建AccessKey ID和AccessKey Secret,并授权短信服务的全部权限
创建签名

申请一个短信模板并通过审核

编写webhook短信接口脚本(bash脚本,内嵌python代码,需提前安装jq、python3及python的requests库)
vim prometheus-sms-webhook.sh
#!/bin/bash
# Prometheus Alertmanager SMS notification script (Aliyun SMS "SendSms" API).
#
# Usage 1 (direct):  prometheus-sms-webhook.sh <name> <status> <severity> <summary> <instance>
# Usage 2 (webhook): echo 'JSON' | prometheus-sms-webhook.sh
#                    prometheus-sms-webhook.sh --json-file <path>
#
# Requires: jq, python3 with the "requests" package.
# Exit status: 0 when every send succeeded, 1 otherwise.

# -------------------------- 1. Configuration --------------------------
ALIYUN_ACCESS_KEY="xxxxxxxxxxxxxxxxxx"
ALIYUN_SECRET_KEY="xxxxxxxxxxxxxxxxxxxx"
SMS_SIGN_NAME="青岛内分泌糖尿病医院"
SMS_TEMPLATE_CODE="SMS_493255124"
# Each array element is sent as the PhoneNumbers value of ONE request, so a
# comma-separated element goes out as a single batch request.
# NOTE(review): the statistics at the end therefore count requests, not
# recipients. Split into separate elements for per-number requests.
ALERT_PHONES=("18504980030,15706390766,15706390788,18954271554,13655421819")
LOG_PATH="/usr/local/prometheus-webhook-sms/logs/prometheus-sms-alert.log"

# -------------------------- 2. Helpers --------------------------

# Write a timestamped message to stderr and append it to $LOG_PATH.
# Arguments: $1 - message text
log() {
  local log_time log_msg
  log_time=$(date '+%Y-%m-%d %H:%M:%S')
  log_msg="[$log_time] $1"
  printf '%s\n' "$log_msg" >&2
  # Quoted so a log directory containing spaces still works.
  mkdir -p "$(dirname "$LOG_PATH")"
  printf '%s\n' "$log_msg" >> "$LOG_PATH"
}

# Abort the script when jq is not installed (both modes build JSON with jq).
require_jq() {
  if ! command -v jq &> /dev/null; then
    log "错误:请安装jq工具(yum install jq 或 apt install jq)"
    exit 1
  fi
}

# Send one SendSms request through the embedded Python program.
# Arguments: $1 - PhoneNumbers value, $2 - TemplateParam JSON string
# Outputs:   "success" or "error: <reason>" on stdout
send_sms_core() {
  local phone="$1"
  local template_param="$2"
  local utc_ts nonce
  utc_ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  nonce=$(date +%s%N | md5sum | head -c 10)

  # The secret key travels in the environment rather than argv so it does not
  # show up in the process list.
  ALIYUN_SECRET_KEY="$ALIYUN_SECRET_KEY" python3 - \
    "$ALIYUN_ACCESS_KEY" "$phone" "$SMS_SIGN_NAME" "$SMS_TEMPLATE_CODE" \
    "$utc_ts" "$nonce" "$template_param" << 'END_PYTHON'
import base64
import hashlib
import hmac
import os
import sys
import urllib.parse

import requests

access_key = sys.argv[1]
phone_num = sys.argv[2]
sign_name = sys.argv[3]
template_code = sys.argv[4]
utc_timestamp = sys.argv[5]
nonce_str = sys.argv[6]
template_param = sys.argv[7]
secret_key = os.environ["ALIYUN_SECRET_KEY"]

try:
    params = {
        "AccessKeyId": access_key,
        "Action": "SendSms",
        "Format": "JSON",
        "PhoneNumbers": phone_num,
        "RegionId": "cn-hangzhou",
        "SignName": sign_name,
        "SignatureMethod": "HMAC-SHA1",
        "SignatureNonce": nonce_str,
        "SignatureVersion": "1.0",
        "TemplateCode": template_code,
        "TemplateParam": template_param,
        "Timestamp": utc_timestamp,
        "Version": "2017-05-25",
    }

    # Canonical query string: parameters sorted by name, percent-encoded.
    query_str = "&".join(
        f"{urllib.parse.quote(str(k), safe='')}={urllib.parse.quote(str(v), safe='')}"
        for k, v in sorted(params.items())
    )
    string_to_sign = "GET&%2F&" + urllib.parse.quote(query_str, safe='')

    # HMAC-SHA1 keyed with "<secret>&", base64-encoded.
    signature = base64.b64encode(
        hmac.new(
            (secret_key + "&").encode("utf-8"),
            string_to_sign.encode("utf-8"),
            hashlib.sha1,
        ).digest()
    ).decode("utf-8")

    # HTTPS so the access key id, phone numbers and signature are not sent in clear text.
    request_url = (
        f"https://dysmsapi.aliyuncs.com/?{query_str}"
        f"&Signature={urllib.parse.quote(signature, safe='')}"
    )
    response = requests.get(request_url, timeout=15)
    resp_json = response.json()

    if resp_json.get("Code") == "OK":
        print("success")
    else:
        error_msg = resp_json.get("Message", "未知错误")
        print(f"error: {error_msg}")
except requests.exceptions.Timeout:
    print("error: 请求超时")
except Exception as e:
    print(f"error: {str(e)}")
END_PYTHON
}

# Extract the alert fields from an Alertmanager webhook payload.
# Arguments: $1 - JSON payload
# Globals:   writes ALERT_NAME, ALERT_STATUS, ALERT_SEVERITY, ALERT_SUMMARY,
#            ALERT_INSTANCE (these resolve to main's locals when called from main)
parse_alert_json() {
  local json_data="$1"

  require_jq

  # Truncation is done by jq, which slices strings by Unicode codepoint.
  # Byte-based truncation (cut -c on GNU coreutils) could split a multi-byte
  # UTF-8 character and corrupt Chinese text.
  ALERT_NAME=$(jq -r '(.alerts[0].labels.alertname // "未知告警") | tostring | .[0:25]' <<< "$json_data")
  ALERT_STATUS=$(jq -r '.status // "unknown"' <<< "$json_data")
  ALERT_SEVERITY=$(jq -r '.alerts[0].labels.severity // "普通"' <<< "$json_data")
  ALERT_SUMMARY=$(jq -r '(.alerts[0].annotations.summary // .alerts[0].annotations.description // "无详情") | tostring | .[0:50]' <<< "$json_data")
  ALERT_INSTANCE=$(jq -r '.alerts[0].labels.instance // "未知实例"' <<< "$json_data")
}

# -------------------------- 3. Main logic --------------------------

# Entry point: pick the input mode, build the template parameter, send the SMS.
# Arguments: see the usage text at the top of the file.
main() {
  local mode="direct"
  local alert_json=""
  local json_file=""
  local ALERT_NAME=""
  local ALERT_STATUS=""
  local ALERT_SEVERITY=""
  local ALERT_SUMMARY=""
  local ALERT_INSTANCE=""
  local status_cn="未知"
  local sms_param=""
  local success=0
  local fail=0
  local phone result

  if [[ "$1" == "--json-file" && -n "$2" && -f "$2" ]]; then
    # Temp-file mode, used by the Flask receiver.
    mode="webhook"
    json_file="$2"
    alert_json=$(cat "$json_file")
    log "从临时文件读取JSON:$json_file"
    parse_alert_json "$alert_json"
    shift 2
  elif [[ $# -eq 0 && ! -t 0 ]]; then
    # Pipe mode. Stdin is consulted only when no arguments were given, so a
    # direct call from a non-interactive parent never blocks on, or swallows,
    # the parent's stdin.
    alert_json=$(cat)
    if [[ -n "$alert_json" ]]; then
      mode="webhook"
      log "从管道读取JSON"
      parse_alert_json "$alert_json"
    fi
  fi

  # Direct mode: five positional arguments.
  if [[ "$mode" == "direct" ]]; then
    if [[ $# -ne 5 ]]; then
      echo "用法1(直接调用):"
      echo "  $0 <告警名称> <状态> <级别> <详情> <实例>"
      echo "示例:"
      echo "  $0 'CPU过高' 'firing' '紧急' 'CPU>95%' '192.168.132.228'"
      echo "用法2(Webhook):"
      echo "  echo 'JSON' | $0  或  $0 --json-file 临时文件路径"
      exit 1
    fi
    ALERT_NAME="$1"
    ALERT_STATUS="$2"
    ALERT_SEVERITY="$3"
    ALERT_SUMMARY="$4"
    ALERT_INSTANCE="$5"
  fi

  # Direct mode also needs jq to build the template parameter below.
  require_jq

  # Translate the status for the SMS text.
  case "$ALERT_STATUS" in
    firing) status_cn="触发" ;;
    resolved) status_cn="恢复" ;;
  esac

  log "==================== 告警详情 ===================="
  log "名称:$ALERT_NAME"
  log "状态:$status_cn"
  log "级别:$ALERT_SEVERITY"
  log "实例:$ALERT_INSTANCE"
  log "详情:$ALERT_SUMMARY"
  log "=================================================="

  # Build the TemplateParam JSON: a single "msg" variable. jq handles the
  # escaping and -c keeps the output on one line.
  sms_param=$(jq -cn \
    --arg name "$ALERT_NAME" \
    --arg status "$status_cn" \
    --arg severity "$ALERT_SEVERITY" \
    --arg instance "$ALERT_INSTANCE" \
    --arg summary "$ALERT_SUMMARY" \
    '{msg: "告警名称:\($name),状态:\($status),级别:\($severity),实例:\($instance),详情:\($summary)"}')

  for phone in "${ALERT_PHONES[@]}"; do
    log "开始向 $phone 发送短信"
    result=$(send_sms_core "$phone" "$sms_param")

    if [[ "$result" == "success" ]]; then
      log "短信发送成功:$phone"
      success=$((success + 1))
    else
      log "短信发送失败:$phone,原因:$result"
      fail=$((fail + 1))
    fi
    sleep 1
  done

  log "==================== 发送统计 ===================="
  log "总数量:${#ALERT_PHONES[@]},成功:$success,失败:$fail"
  log "=================================================="

  if (( fail == 0 )); then
    exit 0
  fi
  exit 1
}

main "$@"
编写webhook接收器
vim prometheus-sms-webhook-server.py
"""Flask webhook receiver that forwards Alertmanager notifications to the SMS script.

Accepts POST /webhook with a JSON body, writes the payload to a temporary
file and runs SCRIPT_PATH with ``--json-file <tempfile>``.
"""
from flask import Flask, request, jsonify
import subprocess
import json
import logging
import os
import tempfile
import time

app = Flask(__name__)

# -------------------------- 1. Configuration (Python 3.6 compatible) --------------------------
SCRIPT_PATH = "/usr/local/prometheus-webhook-sms/prometheus-sms-webhook.sh"
SCRIPT_TIMEOUT = 30  # script timeout in seconds
ALLOWED_IPS = ["192.168.253.205"]  # AlertManager IP
LOG_FILE = "/var/log/prometheus-webhook.log"

# -------------------------- 2. Logging --------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(module)s:%(lineno)d - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(),
    ],
)


# -------------------------- 3. Helpers --------------------------
def is_allowed_ip(client_ip):
    """Return True when ``client_ip`` may call the webhook.

    NOTE(review): the allow-list check is disabled in the original and every
    caller is accepted. Re-enable the commented line to restrict access to
    ALLOWED_IPS.
    """
    # return client_ip in ALLOWED_IPS
    return True  # allow every IP


# -------------------------- 4. Webhook endpoint --------------------------
@app.route('/webhook', methods=['POST'])
def webhook():
    """Handle one Alertmanager notification.

    Returns a JSON body with HTTP 200 on success, 400 for an unusable request
    body, 403 for a rejected client IP, and 500 when the script is missing,
    not executable, fails, or times out.
    """
    start_time = time.time()
    client_ip = request.remote_addr
    logging.info(f"收到来自 {client_ip} 的告警请求,耗时统计开始")

    temp_file = None
    try:
        # 1. Check the caller's IP.
        if not is_allowed_ip(client_ip):
            logging.warning(f"拒绝非法IP {client_ip}(不在允许列表)")
            return jsonify({"status": "error", "message": "Forbidden: Invalid IP"}), 403

        # 2. Parse the JSON body. silent=True makes malformed JSON or a wrong
        #    Content-Type yield None instead of raising an HTTP exception that
        #    the generic handler below would turn into a 500.
        alert_data = request.get_json(silent=True)
        if not alert_data:
            reason = "请求体为空、JSON格式错误或Content-Type非application/json"
            logging.error(f"JSON解析失败:{reason},原始请求体:{request.data[:200]}")
            return jsonify({"status": "error", "message": f"JSON解析失败:{reason}"}), 400
        logging.info(f"AlertManager请求JSON:{json.dumps(alert_data, ensure_ascii=False)[:800]}")

        # 3. Hand the payload over in a temp file (avoids stdin escaping issues).
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.json', delete=False) as f:
            json.dump(alert_data, f, ensure_ascii=False)
            temp_file = f.name
        logging.info(f"JSON已写入临时文件:{temp_file}")

        # 4. Validate the script.
        if not os.path.exists(SCRIPT_PATH):
            err_msg = f"脚本不存在:{SCRIPT_PATH}"
            logging.error(err_msg)
            return jsonify({"status": "error", "message": err_msg}), 500
        if not os.access(SCRIPT_PATH, os.X_OK):
            err_msg = f"脚本无执行权限:{SCRIPT_PATH}(执行 chmod +x {SCRIPT_PATH})"
            logging.error(err_msg)
            return jsonify({"status": "error", "message": err_msg}), 500

        # 5. Run the script. universal_newlines=True is the Python 3.6
        #    spelling of text=True.
        logging.info(f"调用脚本:{SCRIPT_PATH} --json-file {temp_file}")
        result = subprocess.run(
            [SCRIPT_PATH, "--json-file", temp_file],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            timeout=SCRIPT_TIMEOUT,
        )

        # 6. Log the script output.
        stdout = result.stdout.strip() if result.stdout else "无"
        stderr = result.stderr.strip() if result.stderr else "无"
        logging.info(f"脚本返回码:{result.returncode}")
        logging.info(f"脚本stdout:{stdout}")
        logging.info(f"脚本stderr:{stderr}")

        # 7. Map the exit status to an HTTP response.
        if result.returncode == 0:
            logging.info(f"处理成功,耗时:{time.time()-start_time:.2f}秒")
            return jsonify({"status": "success", "message": "告警处理完成"}), 200
        else:
            err_msg = f"脚本执行失败:{stderr[:500]}"
            logging.error(f"处理失败,耗时:{time.time()-start_time:.2f}秒,原因:{err_msg}")
            return jsonify({"status": "error", "message": err_msg}), 500

    except subprocess.TimeoutExpired:
        err_msg = f"脚本超时(>{SCRIPT_TIMEOUT}秒)"
        logging.error(err_msg)
        return jsonify({"status": "error", "message": err_msg}), 500
    except Exception as e:
        err_msg = f"请求处理异常:{str(e)}"
        logging.error(err_msg, exc_info=True)
        return jsonify({"status": "error", "message": err_msg}), 500
    finally:
        # Always remove the temp file, whatever the outcome.
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)
            logging.info(f"临时文件已删除:{temp_file}")


# -------------------------- 5. Start the service --------------------------
if __name__ == '__main__':
    # Listen on all interfaces.
    app.run(host='0.0.0.0', port=9095, debug=False)
测试直接调用模式,直接使用脚本,不调用alertmanager,看能否收到短信
# Direct mode: five positional arguments (name, status, severity, summary, instance).
/usr/local/prometheus-webhook-sms/prometheus-sms-webhook.sh \
"CPU使用率过高" \
"firing" \
"紧急" \
"CPU使用率超过95%,持续5分钟" \
"192.168.132.228"
测试 Webhook 模式
# Webhook mode: feed a sample alert JSON payload to the script on stdin.
cat <<EOF | /usr/local/prometheus-webhook-sms/prometheus-sms-webhook.sh
{
"status": "firing",
"alerts": [
{
"labels": {
"alertname": "CPU_Usage_High",
"severity": "critical",
"instance": "192.168.132.228"
},
"annotations": {
"summary": "CPU使用率>95%",
"description": "服务器CPU持续5分钟过高"
}
}
]
}
EOF
若直接调用和 Webhook 模式测试均正常,直接启动 Flask 服务(webhook接收器)即可
测试一下能否发送告警
# POST a sample alert payload to the Flask receiver's /webhook endpoint (port 9095).
curl -X POST \
-H "Content-Type: application/json" \
-d '{
"status": "firing",
"alerts": [
{
"labels": {"alertname": "Test_From_AlertManager", "severity": "critical", "instance": "192.168.132.228"},
"annotations": {"summary": "测试AlertManager转发", "description": "从AlertManager服务器发起的测试"}
}
]
}' \
http://192.168.132.228:9095/webhook
在alertmanager中添加该webhook接收器(地址为 http://<webhook接收器IP>:9095/webhook)

再使用alertmanager测试一下
生成一个告警
# Fire a test alert. Positional name=value pairs are labels; summary and
# description are passed with --annotation so the webhook script can read them
# from .annotations.
amtool alert add \
--alertmanager.url=http://localhost:9093 \
--annotation=summary="测试告警" \
--annotation=description="测试Webhook接口" \
alertname="TestAlert" \
severity="critical" \
namespace="test-namespace" \
instance="192.168.253.205:62429"
取消告警(不取消会一直存在)
# amtool has no "alert resolve" subcommand. Re-post the SAME label set with an
# end time of "now" so Alertmanager marks the alert as resolved.
amtool alert add \
--alertmanager.url=http://localhost:9093 \
--end="$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
--annotation=summary="测试告警" \
--annotation=description="测试Webhook接口" \
alertname="TestAlert" \
severity="critical" \
namespace="test-namespace" \
instance="192.168.253.205:62429"

浙公网安备 33010602011771号