Docker-compose 部署 Prometheus + Grafana + Alertmanager + DingDing 告警 (二)

部署 node-exporter, Prometheus, Alertmanager, Grafana

在 172.16.16.119 上操作

mkdir -p /data/docker-compose/monitor/{conf,data}
mkdir -p /data/docker-compose/monitor/data/{grafana,prometheus}
chown -R 1001.1001 /data/docker-compose/monitor/data/

cd /data/docker-compose/monitor/

# 编写 docker-compose.yml
vi docker-compose.yml

version: '3.3'

services:
  # Prometheus server: scrapes node-exporter targets and evaluates the
  # alerting rules mounted from conf/node.yml.
  prometheus:
    image: bitnami/prometheus:2.53.1
    container_name: prometheus
    hostname: prometheus
    restart: always
    ports:
      - "9090:9090"
    volumes:
      # Main scrape/alerting configuration (written below as conf/prometheus.yml).
      - ./conf/prometheus.yml:/opt/bitnami/prometheus/prometheus.yml
      # Alert rules; this path must match rule_files in prometheus.yml.
      - ./conf/node.yml:/opt/prometheus/rules/node.yml
      # Persistent TSDB storage (chown'd to uid 1001 earlier in this guide).
      - ./data/prometheus:/opt/bitnami/prometheus/data
    environment:
      - TZ=Asia/Shanghai
    command:
      # Keep only 3 days of metrics on disk.
      # NOTE(review): --storage.tsdb.retention is deprecated in Prometheus 2.x
      # in favour of --storage.tsdb.retention.time — confirm before upgrading.
      - '--storage.tsdb.retention=3d'
    networks:
      - monitor

  # Alertmanager: receives alerts from Prometheus and forwards them to the
  # DingTalk webhook bridge (configured in conf/alertmanager.yml).
  alertmanager:
    image: bitnami/alertmanager:0.27.0
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    ports:
      - "9093:9093"
    volumes:
      # NOTE(review): bitnami images usually read their config from
      # /opt/bitnami/alertmanager/conf/ — verify the container actually picks
      # up this /etc/alertmanager/config.yml mount.
      - ./conf/alertmanager.yml:/etc/alertmanager/config.yml
    environment:
      - TZ=Asia/Shanghai
    networks:
      - monitor

  # Host metrics exporter; /proc, /sys and / are mounted read-only so the
  # container reports the host's (not its own) resources.
  node-exporter:
    image: bitnami/node-exporter:1.8.2
    container_name: node-exporter
    hostname: node-exporter
    restart: always
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/host/root:ro
    command:
      # $$ escapes a literal $ for docker-compose variable interpolation.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
      - '--path.rootfs=/host/root'
    environment:
      - TZ=Asia/Shanghai
    networks:
      - monitor

  # Grafana UI for dashboards; data dir persisted on the host.
  grafana:
    image: swr.cn-north-4.myhuaweicloud.com/ddn-k8s/docker.io/bitnami/grafana:11.1.0
    container_name: grafana
    hostname: grafana
    restart: always
    ports:
      - "3000:3000"
    volumes:
      - ./data/grafana:/opt/bitnami/grafana/data
    networks:
      - monitor

# Dedicated bridge network so containers resolve each other by service name.
networks:
  monitor:
    driver: bridge

# 注意这里 prometheus 数据保留3天,通过 --storage.tsdb.retention=3d 进行设置

cd /data/docker-compose/monitor/conf

# prometheus 配置
vi prometheus.yml

# How often targets are scraped and rules are evaluated.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Deliver fired alerts to the Alertmanager container (resolvable by service
# name on the shared compose network).
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

# Alert rule files; the path matches the conf/node.yml volume mount in
# docker-compose.yml.
rule_files:
  - "/opt/prometheus/rules/node.yml"

scrape_configs:
  # Prometheus scrapes its own metrics.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['prometheus:9090']

  # node-exporter targets — IP:port of each monitored host.
  - job_name: 'node'
    static_configs:
      - targets: ['172.16.16.119:9100', '172.16.16.10:9100']

  # Alertmanager's own metrics.
  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager:9093']


# alertmanager 配置
vi alertmanager.yml 

global:
  # How long to wait before declaring an alert resolved once it stops firing.
  resolve_timeout: 3m

route:
  # Default receiver for anything not matched by a child route.
  receiver: 'devops'
  # Group alerts by rule name so multiple instances of the same alert are
  # batched into one notification. (Was 'Service_Down', a label that none of
  # the rules in node.yml set, which lumped every alert into a single
  # catch-all group.)
  group_by: ['alertname']
  # Wait 10s after the first alert so alerts of the same group are sent together.
  group_wait: 10s
  # Minimum interval between notifications for successive batches of a group.
  group_interval: 10s
  # Re-send interval for an alert that keeps firing (limits duplicate pings).
  repeat_interval: 1h
  routes:
    # NOTE(review): none of the rules in node.yml set a `team` label, so this
    # child route never matches today; kept for future team-based routing.
    - receiver: devops
      group_wait: 10s
      match:
        team: DevOps

receivers:
  # Must match both the route's receiver name above and the `devops` path
  # segment in the dingtalk webhook URL.
  - name: 'devops'
    webhook_configs:
      - url: http://172.16.16.119:8060/dingtalk/devops/send
        # Also notify when an alert is resolved.
        send_resolved: true

# 服务器资源告警策略
vi node.yml 

groups:
- name: 服务器资源监控
  rules:
  # Fires when any node_exporter target (scraped on :9100) is down for 30s.
  - alert: Node实例已宕机
    expr: up{instance=~".*:9100"} == 0
    for: 30s
    labels:
      user: root
      severity: Emergency
    annotations:
      summary: "{{ $labels.instance }} 客户端已停止运行,请尽快处理!"
      description: "Node_Exporter客户端已停止运行,当前状态:{{ $value }} "

  # Used memory = 100 - available%; fires above 80% sustained for 3 minutes.
  - alert: 内存使用率过高
    expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 80
    for: 3m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{ $labels.instance }} 内存使用率过高, 请尽快处理!"
      description: "{{ $labels.instance }}内存使用率超过80%,当前使用率{{ $value }}%."

  # CPU busy% = 100 - idle%; fires above 90% sustained for 5 minutes.
  - alert: CPU高负荷
    expr: 100 - (avg by (instance,job)(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} CPU使用率过高,请尽快处理!"
      description: "{{$labels.instance}} CPU使用大于90%,当前使用率{{ $value }}%. "

  # Inbound bandwidth per host, excluding virtual interfaces.
  # Regex fix: Prometheus anchors label regexes, so the original 'virbr*|lo*'
  # never matched e.g. virbr0 ('virbr*' = "virb" + r*); now 'virbr.*|lo.*'.
  # NOTE(review): rate() yields bytes/sec, so /100 > 102400 fires above
  # ~10 MB/s (~82 Mbit/s), not exactly the "100M" the description claims —
  # confirm the intended threshold.
  - alert: 网络流入
    expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[5m])) by (instance,job)) / 100) > 102400
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流入网络带宽过高,请尽快处理!"
      description: "{{$labels.instance}} 流入网络带宽持续5分钟高于100M. RX带宽使用量{{$value}}."

  # Outbound bandwidth per host; same exclusions and threshold caveat as above.
  # Fixed the broken template '{$value}}' and the mislabeled "RX" (this alert
  # reports transmit/TX traffic).
  - alert: 网络流出
    expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*'}[5m])) by (instance,job)) / 100) > 102400
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流出网络带宽过高,请尽快处理!"
      description: "{{$labels.instance}} 流出网络带宽持续5分钟高于100M. TX带宽使用量{{$value}}."

  # Established TCP connection count; the expr counts connections, so the
  # description now reports a count (the original claimed "大于100%" with a
  # percentage, which did not match the rule).
  - alert: TCP连接数
    expr: node_netstat_Tcp_CurrEstab > 10000
    for: 2m
    labels:
      severity: 严重告警
    annotations:
      summary: " TCP_ESTABLISHED过高!"
      description: "{{$labels.instance}} TCP_ESTABLISHED大于10000,当前连接数{{ $value }}."

  # Disk usage per ext4/xfs filesystem: used% = 100 - free/size*100.
  - alert: 磁盘容量
    expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 90
    for: 1m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.mountpoint}} 磁盘分区使用率过高,请尽快处理!"
      description: "{{$labels.instance}} 磁盘分区使用大于90%,当前使用率{{ $value }}%."

# 启动
docker-compose up -d

访问 Prometheus

http://172.16.16.119:9090/

测试钉钉告警

在 172.16.16.119 上操作

cd /data/docker-compose/monitor
docker-compose stop node-exporter

钉钉收到告警信息

posted @ 2024-07-30 17:49  klvchen  阅读(376)  评论(0)    收藏  举报