1,部署Alertmanager
Alertmanager 监听端口为:9093
2,配置Prometheus与Alertmanager通信
2-1 在Alertmanager 部署机器,设置Alertmanager告警的接收方式。
[root@centos7 alert]# cat alertmanager.yml
global:
resolve_timeout: 5m
#smtp_smarthost: 'smtp.163.com:25'
smtp_smarthost: 'smtp.qq.com:465'
smtp_from: '852xxxxx04@qq.com'
smtp_auth_username: '852xxxxx04@qq.com'
smtp_auth_password: 'xxxxx' ## 授权码
smtp_require_tls: false
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 1m
receiver: 'mail'
receivers:
- name: 'mail'
email_configs:
- to: '256xxxxxx9253@qq.com'
#inhibit_rules: #告警抑制
# - source_match:
# severity: 'critical'
# target_match:
# severity: 'warning'
# equal: ['alertname', 'dev', 'instance']
[root@centos7 alert]#
[root@centos7 alert]#
[root@centos7 alert]# ./amtool check-config ./alertmanager.yml
Checking './alertmanager.yml' SUCCESS
Found:
- global config
- route
- 0 inhibit rules
- 1 receivers
- 0 templates
[root@centos7 alert]#
2-2 在Prometheus server端设置与Alertmanager通信
[root@centos7 prometheus]# cat prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.0.14:9093
rule_files:
- "rules/*.yml"
scrape_configs:
- job_name: 'bj'
file_sd_configs:
- files: ['/usr/local/prometheus/sd_config/*.yml']
refresh_interval: 5s
[root@centos7 prometheus]#
2-3 编写告警规则
[root@centos7 prometheus]# cat /usr/local/prometheus/rules/first.yml
groups:
- name: general.rules
rules:
# Alert for any instance that is unreachable for >1 minute (see "for: 1m" below).
- alert: InstanceDown
expr: up == 0
for: 1m
labels:
severity: error
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
[root@centos7 prometheus]#
3,告警状态
Inactive:这里什么都没有发生。
Pending:已触发阈值,但未满足告警持续时间(即规则中 for 指定的时长)。
Firing:已触发阈值且满足告警持续时间,告警发送给接收者。