Kubernetes 之 Prometheus - Alertmanager - 钉钉告警 -企业微信告警【七】

prometheus 触发一条告警的过程


prometheus--->触发阈值--->超出持续时间--->alertmanager--->分组|抑制|静默--->媒体类型--->邮件|钉钉|微信等。

分组(group): 将类似性质的警报合并为单个通知,比如网络通知、主机通知、服务通知。

静默(Silences): 是一种简单的特定时间静音的机制,例如: 服务器要升级维护可以先设置这个时间段告警静默。
抑制(inhibition): 当警报发出后,停止重复发送由此警报引发的其他警报即合并一个故障引起的多个报警事件,可以消除冗余告警

部署安装Alertmanager

#下载地址
cd /apps
# 下载页面: https://prometheus.io/download/#alertmanager
wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz
[root@xianchaomaster1 apps]# tar zxvf alertmanager-0.25.0.linux-amd64.tar.gz
alertmanager-0.25.0.linux-amd64/
alertmanager-0.25.0.linux-amd64/alertmanager.yml
alertmanager-0.25.0.linux-amd64/NOTICE
alertmanager-0.25.0.linux-amd64/amtool
alertmanager-0.25.0.linux-amd64/alertmanager
alertmanager-0.25.0.linux-amd64/LICENSE
[root@xianchaomaster1 apps]# ln -sv /apps/alertmanager-0.25.0.linux-amd64 /apps/alertmanager
‘/apps/alertmanager’ -> ‘/apps/alertmanager-0.25.0.linux-amd64’

#配置service文件
vim /etc/systemd/system/alertmanager.service
[Unit]
Description=Prometheus alertmanager
After=network.target

[Service]
ExecStart=/apps/alertmanager/alertmanager --config.file=/apps/alertmanager/alertmanager.yml

[Install]
WantedBy=multi-user.target

systemctl daemon-reload && systemctl restart alertmanager && systemctl enable alertmanager

#配置文件解析
vim /apps/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m #告警超过该时间未再被更新(且未携带结束时间)时,Alertmanager 将其标记为已恢复
  smtp_smarthost: 'smtp.qq.com:465' #邮箱smtp地址
  smtp_from: '807722920@qq.com' #发件人邮箱地址
  smtp_auth_username: '807722920@qq.com' #发件人的登录用户名,默认和发件人地址一致
  smtp_auth_password: '' #发件人的登录密码,有时候是授权码
  smtp_hello: '@qq.com'  #向 SMTP 服务器标识自身的主机名(HELO/EHLO),默认是 localhost
  smtp_require_tls: false #是否需要tls协议。默认是true

route:
  group_by: ['alertname'] #按 alertname 标签的值对告警进行分组,alertname 即告警规则中 - alert: 的值,例如"物理节点 cpu 使用率"
  group_wait: 1s #一组告警第一次发送之前等待的延迟时间,即产生告警后先等待该时长(此处为 1 秒),将组内新产生的消息一起合并发送(一般设置为 0秒 ~ 几分钟)。
  group_interval: 1s #一组已发送过初始通知的告警接收到新告警后,下次发送通知前等待的延迟时间,一般设置为 5分钟或更多
  repeat_interval: 1s #一条告警通知成功发送后,若告警仍未恢复,再次重复发送该通知之前等待的时间(通常设置为 3 小时或更长时间)。
  receiver: 'wangyi-email' #默认接收者,未匹配任何子路由的告警都发送给 wangyi-email

receivers: #定义多个接收者
  - name: 'wangyi-email'
    email_configs:
    - to: 'xksbirkhoff@163.com'

inhibit_rules: #抑制的规则
  - source_match: #源告警匹配条件:已存在匹配该条件的告警时,'alertname'、'dev'、'instance' 标签值与其相同的 warning 级别告警通知将被抑制
      severity: 'critical' #报警的事件级别
    target_match:
      severity: 'warning' #被抑制的目标告警级别:如果已经有 'critical' 级别的告警,那么新产生的 'warning' 级别告警将被抑制
    equal: ['alertname', 'dev', 'instance']
    
#配置alertmanager配置文件
vim /apps/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: '807722920@qq.com'
  smtp_auth_username: '807722920@qq.com'
  smtp_auth_password: ''
  smtp_hello: '@qq.com'
  smtp_require_tls: false

route:
  group_by: ['alertname']
  group_wait: 1s
  group_interval: 1s
  repeat_interval: 1s
  receiver: 'wangyi-email'

receivers:
  - name: 'wangyi-email'
    email_configs:
    - to: 'xksbirkhoff@163.com'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

systemctl restart alertmanager

#配置Prometheus文件 打开alertmanager配置
[root@xianchaomaster1 rules]# vim /apps/prometheus/prometheus.yml
alerting:
  alertmanagers:
    - static_configs:
        - targets:
           - 192.168.40.180:9093

#配置rules
[root@xianchaomaster1 rules]# vim rule1.yaml
# Demo alerting rules used to exercise the Alertmanager notification channels.
# Several thresholds below are set artificially low so that alerts fire quickly;
# the commented-out expressions show the values intended for real use.
groups:
  - name: alertmanager_pod.rules
    rules:
    # Per-container CPU usage (percent of one core), averaged over 5 minutes.
    - alert: Pod_all_cpu_usage
      expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 10
      for: 2m
      labels:
        severity: critical
        service: pods
        project: myserver
      annotations:
        description: 容器 {{ $labels.name }} CPU 资源利用率大于 10% , (current value is {{ $value }})
        summary: Dev CPU 负载告警

    # Per-container memory usage above 2 GiB.
    - alert: Pod_all_memory_usage
      #expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 10  # memory above 10%
      # container_memory_usage_bytes is a gauge carrying the container "name" label,
      # so it is compared directly: irate() is only meaningful on counters, and the
      # node-level node_memory_MemFree_bytes metric has no "name" label to match.
      expr: sort_desc(avg by(name)(container_memory_usage_bytes{name!=""})) > 2*1024*1024*1024   # memory above 2G
      for: 2m
      labels:
        severity: critical
        project: myserver
      annotations:
        description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
        summary: Dev Memory 负载告警

    # Per-container network receive rate.
    - alert: Pod_all_network_receive_usage
      # Intended threshold (50 MiB/s):
      #expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 50*1024*1024
      # Demo threshold: any traffic at all triggers the alert.
      expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 0
      for: 2m
      labels:
        #severity: critical
        project: myserver
      annotations:
        description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M , (current value is {{ $value }})

    # Node free memory.
    - alert: node内存可用大小
      expr: node_memory_MemFree_bytes > 1024 # deliberately wrong so the alert always fires during the demo
      #expr: node_memory_MemFree_bytes < 524288000 # intended rule: free memory below 500 MB
      for: 30s
      labels:
        project: node
      annotations:
        description: node节点可用内存小于500兆

#配置Prometheus 加载rules文件

[root@xianchaomaster1 rules]# vim /apps/prometheus/prometheus.yml
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/apps/alertmanager/rules/rule1.yaml"

[root@xianchaomaster1 rules]# systemctl restart prometheus

DingDing 告警

配置钉钉机器人

复制token

配置dingding shell脚本测试

[root@xianchaomaster1 alertmananger_dingding]# cat dingding.sh
#!/bin/bash
# Send a plain-text message to a DingTalk robot webhook.
# Usage: bash dingding.sh '<message>'
#   $1 - message text; it is embedded as the "content" field of the JSON payload.
source   /etc/profile
#PHONE=$1
#SUBJECT=$2
MESSAGE=$1

# Escape backslashes and double quotes so the message remains a valid JSON string.
MESSAGE=${MESSAGE//\\/\\\\}
MESSAGE=${MESSAGE//\"/\\\"}

# The expansion is double-quoted so that a message containing spaces stays part
# of the single -d argument instead of being word-split into extra curl arguments.
/usr/bin/curl -X "POST"  'https://oapi.dingtalk.com/robot/send?access_token=a7dbcdf0f2f4faeecb9e5264c2259bb6ac2d54b3eabf368665fdb711f50906eb' \
-H 'Content-Type: application/json' \
-d '{"msgtype": "text",
    "text": {
         "content": "'"${MESSAGE}"'"
    }
  }'
  
[root@xianchaomaster1 alertmananger_dingding]# bash dingding.sh 'xksnode1=192.168.40.181:9100,alertname=node内存可用大小小于5G'
{"errcode":0,"errmsg":"ok"}

配置dingding python脚本测试

[root@xianchaomaster1 alertmananger_dingding]# cat dingding.py
#!/usr/bin/python3
"""Send a plain-text message to a DingTalk robot webhook.

Usage: python3 dingding.py '<message>'
"""
import sys
import requests
import json


# DingTalk alert:
def info(msg):
    """Post ``msg`` to the DingTalk robot as a text message.

    Args:
        msg: message content; converted with ``str()`` before sending.

    Returns:
        The raw response body returned by the webhook, as text.
    """
    url = 'https://oapi.dingtalk.com/robot/send?access_token=a7dbcdf0f2f4faeecb9e5264c2259bb6ac2d54b3eabf368665fdb711f50906eb'

    headers = {
        'Content-Type': 'application/json;charset=utf-8'
    }
    formdata = {
        "msgtype": "text",
        "text": {"content": str(msg)}
    }
    # Without a timeout, requests waits indefinitely if the endpoint is unreachable.
    resp = requests.post(url=url, data=json.dumps(formdata), headers=headers, timeout=10)
    return resp.text


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit("usage: dingding.py '<message>'")
    # Show the webhook reply so a failed send is visible, as in the curl test.
    print(info(sys.argv[1]))

[root@xianchaomaster1 alertmananger_dingding]# python dingding.py 'xianchaonode2=192.168.40.182:9100,  alertname=CPU too high need to look!'

部署webhook-dingtalk

#prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
tar zxvf prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
cd prometheus-webhook-dingtalk-1.4.0.linux-amd64
[root@xianchaomaster1 prometheus-webhook-dingtalk-1.4.0.linux-amd64]# ./prometheus-webhook-dingtalk ^C
./prometheus-webhook-dingtalk --web.listen-address="0.0.0.0:8060" --ding.profile="alertname=https://oapi.dingtalk.com/robot/send?access_token=a7dbcdf0f2f4faeecb9e5264c2259bb6ac2d54b3eabf368665fdb711f50906eb" &

[root@xianchaomaster1 prometheus-webhook-dingtalk-1.4.0.linux-amd64]# netstat -nltp | grep 8060
tcp6       0      0 :::8060                 :::*                    LISTEN      97792/./prometheus-


#配置alertmanager.yml
[root@xianchaomaster1 alertmanager]# vim alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: '807722920@qq.com'
  smtp_auth_username: '807722920@qq.com'
  smtp_auth_password: 'uwxmrjtkpneqbbjg'
  smtp_hello: '@qq.com'
  smtp_require_tls: false

route:
  group_by: ['alertname']
  group_wait: 1s
  group_interval: 1s
  repeat_interval: 1s
  receiver: 'dingding'

receivers:
  - name: 'wangyi-email'
    email_configs:
    - to: 'xksbirkhoff@163.com'
  - name: 'dingding'
    webhook_configs:
    - url: 'http://192.168.40.180:8060/dingtalk/alertname/send'
      send_resolved: true

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']


systemctl restart alertmanager
systemctl status alertmanager
systemctl restart prometheus

 

 

企业微信告警-此实验没有发送成功

 

global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: '807722920@qq.com'
  smtp_auth_username: '807722920@qq.com'
  smtp_auth_password: 'uwxmrjtkpneqbbjg'
  smtp_hello: '@qq.com'
  smtp_require_tls: false

route:
  group_by: ['alertname']
  group_wait: 1s
  group_interval: 1s
  repeat_interval: 10s
  receiver: 'wechat'

receivers:
  - name: 'wangyi-email'
    email_configs:
    - to: 'xksbirkhoff@163.com'
  - name: 'dingding'
    webhook_configs:
    - url: 'http://192.168.40.180:8060/dingtalk/alertname/send'
      send_resolved: true
  - name: 'wechat'
    wechat_configs:
    - corp_id: ww83d5cf599a033063
      #to_user: '@all'
      to_party: 2
      agent_id: 1000003
      api_secret: W-5uspMVqwKJnBKtzd5f77lsYd9zBftY8xBx1wCd70A
      send_resolved: true


inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

配置告警模板

cat ./template/wechat.tmpl
{{/*
  WeChat notification body. Overrides "wechat.default.message".
  One summary is rendered per section, using the first alert of that section:
    - firing section:   first entry of .Alerts.Firing
    - resolved section: first entry of .Alerts.Resolved
  Each section ranges over its own subset so that, in a group containing both
  firing and resolved alerts, the firing header is never filled with a resolved
  alert's data (and vice versa).
  Timestamps are shifted by 28800e9 ns (8 hours) before formatting.
*/}}
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts.Firing -}}
{{- if eq $index 0 }}
=========监控报警 =========
告警状态:{{   .Status }}
告警级别:{{ .Labels.severity }}
告警类型:{{ $alert.Labels.alertname }}
故障主机: {{ $alert.Labels.instance }} {{ $alert.Labels.pod }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
触发阀值:{{ .Annotations.value }}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end =  =========
{{- end }}
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts.Resolved -}}
{{- if eq $index 0 }}
=========异常恢复 =========
告警类型:{{ .Labels.alertname }}
告警状态:{{   .Status }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
========= = end =  =========
{{- end }}
{{- end }}
{{- end }}
{{- end }}


vim alertmanager.yml
templates:
  - './template/wechat.tmpl' #alertmanager 引用模板

alertmanager 高可用:

单机、基于负载均衡、基于Gossip机制

PrometheusAlert

下载地址:https://github.com/feiyu563/PrometheusAlert

 
posted @ 2023-04-29 01:02  しみずよしだ  阅读(177)  评论(1)    收藏  举报