alertmanager学习链接:
路由中标签详解:
https://yunlzheng.gitbook.io/prometheus-book/parti-prometheus-ji-chu/alert/alert-manager-route
alertmanager的配置文件示例
https://blog.csdn.net/qq_22227087/article/details/96483009
alertmanager主配置文件中关键字(方法)详解
https://zhuanlan.zhihu.com/p/74932366
###########################################################
prometheus配置文件和alertmanager配置文件详解
prometheus的主配置文件通过rule_files引用rule.yml, rule.yml中的rules只负责定义报警的触发条件(下面有rule.yml配置文件的样例)。触发以后会将消息转发到alertmanager,alertmanager的主配置文件中根据路由选择转发给哪个接收者(下面有alertmanager.yml配置文件的样例)
#################################################################
prometheus主配置文件样例
# Sample prometheus.yml: global scrape/evaluation settings, the Alertmanager
# endpoint that alerts are sent to, and the rule files to load.
global:
  # Scrape targets every 15 seconds. Default is every 1 minute.
  scrape_interval: 15s
  # Evaluate rules every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s
  # Explicitly raised from the global default (10s); it must not exceed
  # scrape_interval.
  scrape_timeout: 15s

# Alertmanager configuration: associates Prometheus with the Alertmanager
# instance that receives fired alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093

# Load rules once and periodically evaluate them according to the global
# 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - /data/prometheus/alertmanager-0.17.0/netdata-rule.yml
  - /data/prometheus/alertmanager-0.17.0/mysql-rule.yml
  - /data/prometheus/alertmanager-0.17.0/bigdata-rule.yml
  - /data/prometheus/alertmanager-0.17.0/db-rule.yml
  - /data/prometheus/alertmanager-0.17.0/ops-rule.yml
  - /data/prometheus/alertmanager-0.17.0/game-backend-rule.yml
  - /data/prometheus/alertmanager-0.17.0/game-status-rule.yml
  - /data/prometheus/alertmanager-0.17.0/url-rule.yml
  - /data/prometheus/alertmanager-0.17.0/idc-rule.yml
############################################################
rule.yml配置样例
# Sample rule.yml: two rule groups, each defining a single alerting rule.
# Rules only define when an alert fires; delivery is handled by Alertmanager.
groups:
  - name: test-rule
    rules:
      # Fires when the active TCP connection average is above 1 for 1s.
      - alert: "tcp连接报警"
        expr: netdata_ipv4_tcpsock_active_connections_average{dimension="connections",family="tcp"} > 1
        for: 1s
        # Labels attached to the fired alert.
        labels:
          severity: warning
        # Annotations carry the templated text for the notification.
        annotations:
          summary: "服务名:{{$labels.alertname}}"
          description: "业务500报警: {{ $value }}"
          value: "{{ $value }}"
  - name: test-rule2
    rules:
      # Fires when the used-memory percentage is above 1 for 1s.
      - alert: "内存报警"
        expr: 100 - ((node_memory_MemAvailable * 100) / node_memory_MemTotal) > 1
        for: 1s
        labels:
          severity: test
        annotations:
          summary: "服务名:{{$labels.alertname}}"
          description: "业务500报警: {{ $value }}"
          value: "{{ $value }}"
##########################################################
# Sample alertmanager.yml: SMTP settings, notification templates, the routing
# tree, and the receivers that alerts are delivered to.
global:
  resolve_timeout: 5m
  # smtp_smarthost: SMTP server used to send email.
  smtp_smarthost: 'smtp.gmail.com:465'
  # smtp_from: sender mailbox for alert notifications.
  smtp_from: 'solomon02040@gmail.com'
  # smtp_auth_username: mailbox user name.
  smtp_auth_username: 'solomon02040@gmail.com'
  # smtp_auth_password: authorization password.
  # NOTE(review): plaintext credential; rotate it and keep it out of
  # version control.
  smtp_auth_password: '8899//zz'
  # smtp_require_tls: whether to enable TLS.
  smtp_require_tls: false

# Notification template files.
templates:
  - '/data/alertmanager/template/*.tmpl'

route:
  receiver: solomontest  # receiver name; it is user-defined
  group_by: ['alertname']
  group_wait: 10s  # group alert wait time
  group_interval: 10s  # group alert interval
  repeat_interval: 60s  # repeated alert interval
  routes:
    - receiver: solomontest
      group_wait: 10s
      # match_re matches label values by regular expression; plain `match`
      # would be a better fit for this literal value.
      # NOTE(review): the sample rules shown alongside only set a `severity`
      # label; confirm that alerts actually carry an `env` label.
      match_re:
        env: test  # env is a label on the alert; test is the label's value

receivers:
  - name: solomontest
    email_configs:
      - to: 'solomon02040@gmail.com'
        html: '{{ template "xx.html" . }}'  # template used for the body
        headers: { Subject: "[WARN] 报警邮件" }
  # NOTE(review): no route in this sample references this receiver; confirm
  # whether the child route above was meant to use it.
  - name: 'test'
    email_configs:
      - to: 'xxx@xx.xx'
        html: '{{ template "xx.html" . }}'
        # Plain-text subject. Header values are rendered as Go templates, so
        # literal text must not be wrapped in {{ }}.
        headers: { Subject: "第二路由匹配测试" }
########################################################