Docker-compose 部署 Prometheus + Grafana + Alertmanager + DingTalk(钉钉)告警 (二)
部署 node-exporter, Prometheus, Alertmanager, Grafana
在 172.16.16.119 上操作
# Create the config and data directories for the monitoring stack.
mkdir -p /data/docker-compose/monitor/{conf,data}
mkdir -p /data/docker-compose/monitor/data/{grafana,prometheus}
# Hand the data directories to UID/GID 1001 (presumably the non-root user the
# Bitnami containers run as -- confirm). Use ':' as the owner/group separator;
# the '.' form is an obsolescent extension that newer chown versions warn about.
chown -R 1001:1001 /data/docker-compose/monitor/data/
cd /data/docker-compose/monitor/
# 编写 docker-compose.yml
vi docker-compose.yml
# docker-compose.yml -- monitoring stack: Prometheus, Alertmanager,
# node-exporter and Grafana, all attached to one shared bridge network.
version: '3.3'

services:
  prometheus:
    image: bitnami/prometheus:2.53.1
    container_name: prometheus
    hostname: prometheus
    restart: always
    volumes:
      - ./conf/prometheus.yml:/opt/bitnami/prometheus/prometheus.yml
      # Container path must match the rule_files entry in prometheus.yml.
      - ./conf/node.yml:/opt/prometheus/rules/node.yml
      - ./data/prometheus:/opt/bitnami/prometheus/data
    ports:
      - "9090:9090"
    environment:
      - TZ=Asia/Shanghai
    command:
      # Keep 3 days of Prometheus data. The plain '--storage.tsdb.retention'
      # flag is deprecated; '--storage.tsdb.retention.time' is its replacement.
      # NOTE(review): setting `command` replaces the image's default arguments;
      # presumably Prometheus then falls back to its built-in default config
      # and data paths relative to the working directory -- confirm.
      - '--storage.tsdb.retention.time=3d'
    networks:
      - monitor

  alertmanager:
    image: bitnami/alertmanager:0.27.0
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    volumes:
      - ./conf/alertmanager.yml:/etc/alertmanager/config.yml
    ports:
      - "9093:9093"
    environment:
      - TZ=Asia/Shanghai
    networks:
      - monitor

  node-exporter:
    image: bitnami/node-exporter:1.8.2
    container_name: node-exporter
    hostname: node-exporter
    restart: always
    ports:
      - "9100:9100"
    volumes:
      # Read-only views of the host's /proc, /sys and root filesystem.
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/host/root:ro
    command:
      # Point the exporter at the host mounts above; without these two flags
      # the /host/proc and /host/sys mounts are never read.
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/host/root'
      # '$$' is Compose's escape for a literal '$' in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    environment:
      - TZ=Asia/Shanghai
    # NOTE(review): on a bridge network the exporter presumably reports the
    # container's network interfaces rather than the host's -- confirm whether
    # host networking is needed for the bandwidth alerts.
    networks:
      - monitor

  grafana:
    image: swr.cn-north-4.myhuaweicloud.com/ddn-k8s/docker.io/bitnami/grafana:11.1.0
    container_name: grafana
    hostname: grafana
    restart: always
    volumes:
      - ./data/grafana:/opt/bitnami/grafana/data
    ports:
      - "3000:3000"
    networks:
      - monitor

networks:
  monitor:
    driver: bridge
# Note: Prometheus data is retained for 3 days, set via
# --storage.tsdb.retention.time=3d in the prometheus service above.
cd /data/docker-compose/monitor/conf
# prometheus 配置
vi prometheus.yml
# prometheus.yml -- scrape targets, rule files and Alertmanager endpoint.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Firing alerts are pushed to this Alertmanager instance.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

rule_files:
  - /opt/prometheus/rules/node.yml

scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets:
          - "prometheus:9090"

  - job_name: node
    static_configs:
      # IP and port of each node-exporter.
      - targets:
          - "172.16.16.119:9100"
          - "172.16.16.10:9100"

  - job_name: alertmanager
    static_configs:
      - targets:
          - "alertmanager:9093"
# alertmanager 配置
vi alertmanager.yml
# alertmanager.yml -- sends every alert to the 'devops' webhook receiver.
global:
  # Time after which an alert that carries no end time and has not been
  # updated is declared resolved. It is not a periodic recovery check.
  resolve_timeout: 3m

route:
  # Default receiver.
  receiver: 'devops'
  # Labels used to group alerts into one notification. The previous value
  # 'Service_Down' is not a label set by any rule, so every alert fell into
  # a single group; 'alertname' is present on every alert.
  group_by: ['alertname']
  # How long to wait after a new group appears, so that alerts of the same
  # group arriving within 10s are sent together.
  group_wait: 10s
  # How long to wait before notifying about new alerts added to a group
  # that has already been notified.
  group_interval: 10s
  # How long to wait before re-sending a notification that is still firing;
  # limits how often the same alert is repeated.
  repeat_interval: 1h
  routes:
    - receiver: devops
      group_wait: 10s
      # 'matchers' replaces the deprecated 'match' mapping.
      matchers:
        - 'team="DevOps"'

receivers:
  # Corresponds to the DingTalk alert group.
  - name: 'devops'
    webhook_configs:
      # Per the original author's note, the 'devops' path segment after
      # /dingtalk/ must be the same as the receiver name above.
      # NOTE(review): presumably it has to match the target name configured
      # in the DingTalk webhook service on port 8060 -- confirm.
      - url: http://172.16.16.119:8060/dingtalk/devops/send
        # Also notify when an alert is resolved.
        send_resolved: true
# 服务器资源告警策略
vi node.yml
# node.yml -- Prometheus alerting rules for host resources, evaluated on the
# metrics scraped from node-exporter.
groups:
  - name: 服务器资源监控
    rules:
      # A node-exporter target (any instance on port 9100) is down.
      - alert: Node实例已宕机
        expr: up{instance =~ ".*:9100"} == 0
        for: 30s
        labels:
          user: root
          severity: Emergency
        annotations:
          summary: "{{ $labels.instance }} 客户端已停止运行,请尽快处理!"
          description: "Node_Exporter客户端已停止运行,当前状态:{{ $value }} "

      # Memory usage above 80% for 3 minutes.
      - alert: 内存使用率过高
        expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 80
        for: 3m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{ $labels.instance }} 内存使用率过高, 请尽快处理!"
          description: "{{ $labels.instance }}内存使用率超过80%,当前使用率{{ $value }}%."

      # Average non-idle CPU above 90% for 5 minutes.
      - alert: CPU高负荷
        expr: 100 - (avg by (instance,job)(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} CPU使用率过高,请尽快处理!"
          description: "{{$labels.instance}} CPU使用大于90%,当前使用率{{ $value }}%. "

      # Inbound traffic summed over physical interfaces. Label regexes are
      # fully anchored, so 'virbr.*' is needed to match virbr0 and plain 'lo'
      # matches the loopback device (the old 'virbr*|lo*' matched neither
      # virbr0 nor only lo).
      # NOTE(review): the '/ 100 > 102400' threshold does not obviously equal
      # the "100M" stated in the description -- confirm the intended unit.
      - alert: 网络流入
        expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance,job)) / 100) > 102400
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入网络带宽过高,请尽快处理!"
          description: "{{$labels.instance}} 流入网络带宽持续5分钟高于100M. RX带宽使用量{{$value}}."

      # Outbound traffic; same device filter and threshold as the inbound rule.
      - alert: 网络流出
        expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance,job)) / 100) > 102400
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流出网络带宽过高,请尽快处理!"
          # Fixed the broken '{$value}}' template and the RX/TX mix-up.
          description: "{{$labels.instance}} 流出网络带宽持续5分钟高于100M. TX带宽使用量{{$value}}."

      # More than 10000 established TCP connections for 2 minutes.
      - alert: TCP连接数
        expr: node_netstat_Tcp_CurrEstab > 10000
        for: 2m
        labels:
          severity: 严重告警
        annotations:
          summary: " TCP_ESTABLISHED过高!"
          # The value is a connection count, not a percentage.
          description: "{{$labels.instance}} TCP_ESTABLISHED连接数大于10000,当前连接数{{ $value }}."

      # ext4/xfs filesystem more than 90% used for 1 minute.
      - alert: 磁盘容量
        expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 90
        for: 1m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.mountpoint}} 磁盘分区使用率过高,请尽快处理!"
          description: "{{$labels.instance}} 磁盘分区使用大于90%,当前使用率{{ $value }}%."
# Start the stack. The previous step left the shell in conf/, so return to
# the directory that holds docker-compose.yml first.
cd /data/docker-compose/monitor
docker-compose up -d
访问 Prometheus:http://172.16.16.119:9090
测试钉钉告警
在 172.16.16.119 上操作
cd /data/docker-compose/monitor
docker-compose stop node-exporter
钉钉收到告警信息




浙公网安备 33010602011771号