prometheus配置邮箱、钉钉和企业微信告警

配置邮箱告警

配置alertmanager

cat /data/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: '826849158@qq.com'
  smtp_auth_username: '826849158@qq.com'
  smtp_auth_password: '******'  # 授权码
  smtp_require_tls: false

templates:
- '/data/alertmanager/tmpl/email.tmpl'

route: 
  group_by: [group, alertname]
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'mail-receiver'

receivers:
- name: 'mail-receiver'
  email_configs:
  - to: '826849158@qq.com'
    send_resolved: true
    html: '{{ template "emailMessage" . }}'
    headers:
      subject: '[{{ .Status | toUpper }}] 告警: {{ .CommonLabels.alertname }} 发生于 {{ .CommonLabels.name }}'
  - to: 'zhangqifeng@jiagouyun.com'
    send_resolved: true
    html: '{{ template "emailMessage" . }}'
    headers:
      subject: '[{{ .Status | toUpper }}] 告警: {{ .CommonLabels.alertname }} 发生于 {{ .CommonLabels.name }}'

配置告警模版

cat /data/alertmanager/tmpl/email.tmpl 
{{ define "emailMessage" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 }}
------ 告警问题 ------<br>
告警状态:{{ .Status }}<br>
告警名称:{{ .Labels.alertname }}<br>
故障实例:{{ .Labels.instance }}<br>
告警概要:{{ .Annotations.summary }}<br>
告警详情:{{ .Annotations.description }}<br>
故障时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
------ END ------<br>
{{- end }}
{{- end }}
{{- end }}

{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 }}
------ 告警恢复 ------<br>
告警状态:{{ .Status }}<br>
告警名称:{{ .Labels.alertname }}<br>
恢复实例:{{ .Labels.instance }}<br>
告警概要:{{ .Annotations.summary }}<br>
告警详情:{{ .Annotations.description }}<br>
故障时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
恢复时间:{{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
------ END ------<br>
{{- end }}
{{- end }}
{{- end }}
{{- end }}

重启alertmanager

systemctl restart alertmanager.service
systemctl status alertmanager.service

测试

# 1. 配置测试的告警规则
cat /data/prometheus/rules/node_exporter.yml 
groups:
- name: node_usage_record_rules
  interval: 1m
  rules:
  - record: cpu:usage:rate1m
    expr: (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance,vendor,account,group,name)) * 100
  - record: mem:usage:rate1m
    expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100

- name: node-exporter
  rules:
  - alert: 主机内存使用率
    expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
    for: 1m
    labels:
      alertype: system
    annotations:
      summary: "内存使用率为{{ $value | humanize }}%"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

# 2. 热加载prometheus
curl -X POST http://localhost:9090/-/reload

# 3. 压测机器使内存使用率达到百分之90以上
stress --vm 1 --vm-bytes 200M --vm-hang 10000 --timeout 10000s

等待几分钟查看邮件:
20250826164605
告警恢复邮件:
20250826165109

配置钉钉告警

创建钉钉群

此次创建两个报警群,一个用来普通告警的群,一个用来dba告警的群
20250826165340
20250826165355

添加钉钉群机器人

两个群都需要添加
20250826165532
20250826165625

安装钉钉告警插件

wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
tar -xf  prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz -C /data
mv /data/prometheus-webhook-dingtalk-2.1.0.linux-amd64/ /data/prometheus-webhook-dingtalk
cd /data/prometheus-webhook-dingtalk
cp  config.example.yml  config.yml

# 注册为系统服务
cat /usr/lib/systemd/system/prometheus-webhook-dingtalk.service
[Service]
ExecStart=/data/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --config.file=/data/prometheus-webhook-dingtalk/config.yml

[Install]
WantedBy=multi-user.target

[Unit]
Description=prometheus-webhook-dingtalk
After=network.target


systemctl daemon-reload
systemctl enable prometheus-webhook-dingtalk
systemctl restart prometheus-webhook-dingtalk
systemctl status prometheus-webhook-dingtalk

配置钉钉告警插件与钉钉机器人集成

cat /data/prometheus-webhook-dingtalk/config.yml
# 自定义模板文件及位置
templates:
  - /data/prometheus-webhook-dingtalk/contrib/templates/dingtalk.tmpl
targets:
  webhook:
    # 配置钉钉机器人
    url: https://oapi.dingtalk.com/robot/send?access_token=eaea39bb73cc919ac63738153f4de83ee7e3b67865d32e10c5a10505f7019a5b
    # 配置加签SECRET
    secret: SEC3ca5a2f02eb75fad71557e6695a9cbff5330e7be9038dede354f******
    message:
      # 添加模板标题,在下面的模板文件title位置
      title: '{{ template "ops.title" . }}'
      # 添加模板内容,在下面的模板文件content位置
      text: '{{ template "ops.content" . }}'
  webhook_dba:
    url: https://oapi.dingtalk.com/robot/send?access_token=5b38ba2c457e971543c7eefbeab2fdb9b8e96d1a8a52ec6aaa12b647e6d63b17
    secret: SEC20450f3c8b61273de04fd86564f56ea4d8f9dc446e6100dd06b22******
    message:
      title: '{{ template "ops.title" . }}'
      text: '{{ template "ops.content" . }}'

配置告警模板文件

cat /data/prometheus-webhook-dingtalk/contrib/templates/dingtalk.tmpl
{{ define "__subject" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
{{ end }}


{{ define "__alert_list" }}{{ range . }}
---
    **告警状态**:{{ .Status }}
    **告警名称**:{{ .Labels.alertname }}
    **故障实例**:{{ .Labels.instance }}
    **告警概要**:{{ .Annotations.summary }}
    **告警详情**:{{ .Annotations.description }}
    **触发时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}

{{ define "__resolved_list" }}{{ range . }}
---
    **告警状态**:{{ .Status }}
    **告警名称**:{{ .Labels.alertname }}
    **恢复实例**:{{ .Labels.instance }}
    **告警概要**:{{ .Annotations.summary }}
    **告警详情**:{{ .Annotations.description }}
    **触发时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    **恢复时间**: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}


{{ define "ops.title" }}
{{ template "__subject" . }}
{{ end }}

{{ define "ops.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
**==== 侦测到{{ .Alerts.Firing | len  }}个故障====**
{{ template "__alert_list" .Alerts.Firing }}
---
{{ end }}

{{ if gt (len .Alerts.Resolved) 0 }}
**==== 恢复{{ .Alerts.Resolved | len  }}个故障====**
{{ template "__resolved_list" .Alerts.Resolved }}
{{ end }}
{{ end }}

{{ define "ops.link.title" }}{{ template "ops.title" . }}{{ end }}
{{ define "ops.link.content" }}{{ template "ops.content" . }}{{ end }}
{{ template "ops.title" . }}
{{ template "ops.content" . }}

赋予模版文件权限:

chmod 644 /data/prometheus-webhook-dingtalk/contrib/templates/dingtalk.tmpl

重启prometheus-webhook-dingtalk:

systemctl restart prometheus-webhook-dingtalk
systemctl status  prometheus-webhook-dingtalk

修改Alertmanager配置文件

cat /data/alertmanager/alertmanager.yml
global:
  resolve_timeout: 3m
templates:
  - '/data/prometheus-webhook-dingtalk/contrib/templates/dingtalk.tmpl'
route:
  group_by: ['group','alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: webhook
  routes:
    - receiver: webhook_dba
      group_wait: 30s
      match:
        alertype: "dba"
receivers:
- name: 'webhook'
  webhook_configs:
  - url: 'http://localhost:8060/dingtalk/webhook/send'
    send_resolved: true
- name: 'webhook_dba'
  webhook_configs:
  - url: 'http://localhost:8060/dingtalk/webhook_dba/send'
    send_resolved: true

重启alertmanager:

systemctl restart alertmanager
systemctl status alertmanager

测试告警

# 1. 定义测试告警规则
cat /data/prometheus/rules/node_exporter.yml 
groups:
- name: node_usage_record_rules
  interval: 1m
  rules:
  - record: cpu:usage:rate1m
    expr: (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance,vendor,account,group,name)) * 100
  - record: mem:usage:rate1m
    expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100

- name: node-exporter
  rules:
  - alert: 主机内存使用率
    expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
    for: 1m
    labels:
      alertype: system
    annotations:
      summary: "内存使用率为{{ $value | humanize }}%"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

cat /data/prometheus/rules/mysql_exporter.yml 
groups:
- name: MySQL-Alert
  rules:
  - alert: MySQL_is_down
    expr: mysql_up == 0
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "MySQL database is down"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

# 2. 热加载prometheus
curl -X POST http://localhost:9090/-/reload
  • 压测机器使内存使用率达到百分之90以上,可以发现告警打入普通告警群

    stress --vm 1 --vm-bytes 200M --vm-hang 10000 --timeout 10000s
    

    20250826171120
    告警恢复:
    20250826171545

  • 关闭mysql服务,可以发现告警打入dba告警群
    20250826172213
    告警恢复:
    20250826172417

配置企业微信告警

创建企业微信群

此次创建两个报警群,一个用来普通告警的群,一个用来dba告警的群
20250826172902

创建企业微信群机器人

两个群都创建
20250826173011
20250826173018

配置告警模版

cat  /data/alertmanager/tmpl/wechat.tmpl
{{ define "__subject" -}}
【{{ .Signature }}】
{{- if gt (.Alerts.Firing|len) 0 }}🚨 侦测到 {{ .Alerts.Firing|len }} 个故障{{ end }}
{{- if gt (.Alerts.Resolved|len) 0 }}✅ {{ .Alerts.Resolved|len }} 个告警已恢复{{ end }}
{{- end }}

{{ define "__alertinstance" -}}
{{- if ne .Labels.alertinstance nil -}}{{ .Labels.alertinstance }}
{{- else if ne .Labels.instance nil -}}{{ .Labels.instance }}
{{- else if ne .Labels.node nil -}}{{ .Labels.node }}
{{- else if ne .Labels.nodename nil -}}{{ .Labels.nodename }}
{{- else if ne .Labels.host nil -}}{{ .Labels.host }}
{{- else if ne .Labels.hostname nil -}}{{ .Labels.hostname }}
{{- else if ne .Labels.ip nil -}}{{ .Labels.ip }}
{{- else }}N/A{{- end -}}
{{- end }}

{{ define "prom.title" -}}
{{ template "__subject" . }}
{{- end }}

{{ define "prom.markdown" -}}
{{ range .Alerts -}}
---
**告警状态**: {{ if eq .Status "firing" }}`🚨 FIRING`{{ else }}`✅ RESOLVED`{{ end }}

**告警名称**: {{ if .Labels.alertname_cn }}{{ .Labels.alertname_cn }}{{ else if .Labels.alertname_custom }}{{ .Labels.alertname_custom }}{{ else if .Annotations.alertname }}{{ .Annotations.alertname }}{{ else }}{{ .Labels.alertname }}{{ end }}

**故障实例**: `{{ template "__alertinstance" . }}`

**告警概要**: {{ if .Annotations.summary_cn }}{{ .Annotations.summary_cn }}{{ else }}{{ .Annotations.summary }}{{ end }}

**告警详情**: {{ if .Annotations.description_cn }}{{ .Annotations.description_cn }}{{ else }}{{ .Annotations.description }}{{ end }}

**触发时间**: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{- if and (eq .Status "resolved") (.EndsAt.After .StartsAt) }}
**恢复时间**: {{ .EndsAt.Format "2006-01-02 15:04:05" }}
{{- end }}

{{ if .Labels.region }}**地域**: {{ .Labels.region }}{{ end }}
{{ if .Labels.zone }}**可用区**: {{ .Labels.zone }}{{ end }}
{{ if .Labels.product }}**产品**: {{ .Labels.product }}{{ end }}
{{ if .Labels.component }}**组件**: {{ .Labels.component }}{{ end }}

{{ end -}}
{{- end }}

{{ define "prom.text" -}}
{{ template "prom.markdown" . }}
{{- end }}

部署alertmanager-webhook-adapter

下载地址:https://github.com/bougou/alertmanager-webhook-adapter/releases

mv alertmanager-webhook-adapter-v1.1.8-linux-amd64 /data/alertmanager/alertmanager-webhook-adapter
chmod +x /data/alertmanager/alertmanager-webhook-adapter

cat /usr/lib/systemd/system/alertmanager-webhook-adapter.service
[Unit]
Description=alertmanager-webhook-adapter
After=network-online.target remote-fs.target nss-lookup.target
Wants=network-online.target

[Service]
Type=simple
ExecStart=/data/alertmanager/alertmanager-webhook-adapter  --listen-address=:8090 --signature=zqf-prometheus --tmpl-dir=/data/alertmanager/tmpl --tmpl-name=wechat
Restart=on-failure

[Install]
WantedBy=multi-user.target


systemctl daemon-reload
systemctl restart alertmanager-webhook-adapter.service 
systemctl enable alertmanager-webhook-adapter.service
systemctl status  alertmanager-webhook-adapter.service 

配置alertmanager

cat /data/alertmanager/alertmanager.yml
global:
  resolve_timeout: 3m
templates:
  - '/data/prometheus-webhook-dingtalk/contrib/templates/wechat.tmpl'
route:
  group_by: ['group','alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: wechat
  routes:
    - receiver: wechat_dba
      group_wait: 30s
      match:
        alertype: "dba"
receivers:
- name: 'wechat'
  webhook_configs:
  # 如下url只需将地址串里的token 替换为创建企业微信群机器人时webhook的token即可
  - url: http://127.0.0.1:8090/webhook/send?channel_type=weixin&token=12b4fd90-f91f-447f-854b-de8d13******
    # 是否发送告警恢复通知
    send_resolved: true 
- name: 'wechat_dba'
  webhook_configs:
  # 如下url只需将地址串里的token 替换为创建企业微信群机器人时webhook的token即可
  - url: http://127.0.0.1:8090/webhook/send?channel_type=weixin&token=d7310349-3a25-4c22-b919-bdfa31******
    # 是否发送告警恢复通知
    send_resolved: true  

重启alertmanager

systemctl restart alertmanager
systemctl status  alertmanager

测试

# 1. 定义测试告警规则
cat /data/prometheus/rules/node_exporter.yml
groups:
- name: node_usage_record_rules
  interval: 1m
  rules:
  - record: cpu:usage:rate1m
    expr: (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance,vendor,account,group,name)) * 100
  - record: mem:usage:rate1m
    expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100

- name: node-exporter
  rules:
  - alert: 主机内存使用率
    expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
    for: 1m
    labels:
      alertype: system
    annotations:
      summary: "内存使用率为{{ $value | humanize }}%"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

cat /data/prometheus/rules/mysql_exporter.yml 
groups:
- name: MySQL-Alert
  rules:
  - alert: MySQL_is_down
    expr: mysql_up == 0
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "MySQL database is down"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

# 2. 热加载prometheus
curl -X POST http://localhost:9090/-/reload
  • 压测机器使内存使用率达到百分之90以上,可以发现告警打入普通告警群

    stress --vm 1 --vm-bytes 200M --vm-hang 10000 --timeout 10000s
    

    20250826192919
    告警恢复:
    20250826193558

  • 关闭mysql服务,可以发现告警打入dba告警群
    20250826192134
    告警恢复:
    20250826192315

posted @ 2025-07-31 21:14  阿峰博客站  阅读(60)  评论(0)    收藏  举报