Prometheus 钉钉告警实例

本文是按照实际的环境来进行部署,具体请按各自的操作环境进行修改即可;

环境配置

# 关闭防火墙
systemctl stop firewalld
systemctl disable firewalld

# 开放端口
firewall-cmd --permanent --zone=public --add-port=端口/tcp //永久

# 关闭selinux
sed -i 's/enforcing/disabled/' /etc/selinux/config  //永久
setenforce 0                                        //临时

Docker/Docker-compose安装

监控端部署

Prometheus 是当前一套非常流行的开源监控和报警系统,于 2016 年加入了 Clound Native Computing Foundation,是继 kubernates 之后的第二个托管项目。
本文讲述如何使用 docker 快速搭建 prometheus 监控系统。

概览

Prometheus负责收集数据,Grafana负责展示数据。其中采用Prometheus 中的 Exporter含:

  • prometheus:负责收集和存储时间序列数据
  • grafana:负责展示数据
  • node-exporter:负责暴露主机 metrics 数据给 prometheus,收集 host 硬件和操作系统数据。它将以容器方式运行在所有 host 上
  • cAdvisor,负责收集容器数据。它将以容器方式运行在所有 host 上
  • Alertmanager,负责告警。它将以容器方式运行在所有 host 上

配置文件

$ mkdir -p /home/prometheus/{config,grafana}

# 配置报警规则文件,Prometheus配置文件,告警信息配置文件
$ touch /home/prometheus/config/alertmanager-rule.yml
$ touch /home/prometheus/config/prometheus.yml
$ touch /home/prometheus/config/alertmanager.yml

# 赋予权限访问
$ chmod 777 -R /home/prometheus

prometheus.yml

# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["188.188.4.5:9093"]
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "alertmanager-rule.yml"

scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
    - targets: ['188.188.4.5:9090']
  - job_name: 'cadvisor'
    static_configs:
    - targets: ['188.188.4.5:8080']
  - job_name: 'node-exporter'
    scrape_interval: 8s
    static_configs:
    - targets: ['188.188.4.5:9100']

alertmanager.yml

global:
  resolve_timeout: 5m
route:
  group_by: ['service', 'alertname', 'cluster']
  group_interval: 5m
  group_wait: 10s
  repeat_interval: 5m
  receiver: webhook
  routes:
  - match:
      severity: 'Critical'
    receiver: 'webhook'
  - match_re:
      severity: ^(Warning|Disaster)$
    receiver: 'webhook'
receivers:
- name: 'webhook'
  webhook_configs:
  - url: http://188.188.4.5:8060/dingtalk/webhook1/send
    send_resolved: true

alertmanager-rule.yml

groups:
  - name: Warning
    rules:
      - alert: NodeMemoryUsage
        expr: 100 - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes*100 > 80
        for: 1m
        labels:
          status: Warning
        annotations:
          summary: "{{$labels.instance}}: 内存使用率过高"
          description: "{{$labels.instance}}: 内存使用率大于 80% (当前值: {{ $value }}"

      - alert: NodeCpuUsage
        expr: (1-((sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by (instance)) / (sum(increase(node_cpu_seconds_total[1m])) by (instance)))) * 100 > 70
        for: 1m
        labels:
          status: Warning
        annotations:
          summary: "{{$labels.instance}}: CPU使用率过高"
          description: "{{$labels.instance}}: CPU使用率大于 70% (当前值: {{ $value }}"

      - alert: NodeDiskUsage
        expr: 100 - node_filesystem_free_bytes{fstype=~"xfs|ext4"} / node_filesystem_size_bytes{fstype=~"xfs|ext4"} * 100 > 80
        for: 1m
        labels:
          status: Warning
        annotations:
          summary: "{{$labels.instance}}: 分区使用率过高"
          description: "{{$labels.instance}}: 分区使用大于 80% (当前值: {{ $value }}"

      - alert: Node-UP
        expr: up{job='node-exporter'} == 0
        for: 1m
        labels:
          status: Warning
        annotations:
          summary: "{{$labels.instance}}: 服务宕机"
          description: "{{$labels.instance}}: 服务中断超过1分钟"

      - alert: TCP
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 1m
        labels:
          status: Warning
        annotations:
          summary: "{{$labels.instance}}: TCP连接过高"
          description: "{{$labels.instance}}: 连接大于1000 (当前值: {{$value}})"

      - alert: IO
        expr: 100 - (avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 60
        for: 1m
        labels:
          status: Warning
        annotations:
          summary: "{{$labels.instance}}: 流入磁盘IO使用率过高"
          description: "{{$labels.instance}}:流入磁盘IO大于60%  (当前值:{{$value}})"

Docker-compose.yml

version: "2"
networks:
    monitor:
        driver: bridge
services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    hostname: prometheus
    restart: always
    volumes:
    - /home/prometheus/config/prometheus.yml:/etc/prometheus/prometheus.yml
    - ./config/alertmanager-rule.yml:/etc/prometheus/alertmanager-rule.yml
    - /etc/localtime:/etc/localtime
    command:
    - '--config.file=/etc/prometheus/prometheus.yml'    
    - '--storage.tsdb.path=/prometheus'
    ports:
    - "9090:9090"
    networks:
    - monitor
  prometheus-webhook-alert:
    image: timonwong/prometheus-webhook-dingtalk
    container_name: prometheus-webhook-alertmanagers
    hostname: webhook-alertmanagers
    restart: always
    volumes:
    - /etc/localtime:/etc/localtime
    ports:
    - "8060:8060"
    entrypoint: /bin/prometheus-webhook-dingtalk --ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=5ee7d33bd5d9da495bc5b33dccf803cb9c14f04c9f800c2c9e89b89b8a5b07fb"
    networks:
    - monitor
  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    volumes:
      - ./config/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - /etc/localtime:/etc/localtime
    ports:
      - "9093:9093"
    networks:
      - monitor
  grafana:
    image: grafana/grafana
    container_name: grafana
    hostname: grafana
    restart: always
    volumes:
    - /etc/localtime:/etc/localtime
    - ./grafana:/var/lib/grafana
    ports:
    - "3000:3000"
    networks:
    - monitor
  node-exporter:
    image: quay.io/prometheus/node-exporter
    container_name: node-exporter
    hostname: node-exporter
    restart: always
    ports:
    - "9100:9100"
    networks:
    - monitor
  cadvisor:
    image: google/cadvisor:latest
    container_name: cadvisor
    hostname: cadvisor
    restart: always
    volumes:
    - /:/rootfs:ro
    - /var/run:/var/run:rw
    - /sys:/sys:ro
    - /var/lib/docker/:/var/lib/docker:ro
    - /etc/localtime:/etc/localtime
    ports:
    - "8080:8080"
    networks:
    - monitor

启动docker-compose

# 启动容器:
docker-compose -f /home/prometheus/docker-compose-monitor.yml up -d
# 删除容器:
docker-compose -f /home/prometheus/docker-compose-monitor.yml down
# 重启容器:
docker-compose -f /home/prometheus/docker-compose-monitor.yml restart

客户端部署

发布程序 node-exporter 和 cadvisor

配置文件

version: "2"
networks:
    monitor:
        driver: bridge
services:
  node-exporter:
    image: prom/node-exporter
    container_name: node-exporter
    hostname: node-exporter
    restart: always
    ports:
    - "9100:9100"
    networks:
    - monitor
  cadvisor:
    image: google/cadvisor:latest
    container_name: cadvisor
    hostname: cadvisor
    restart: always
    volumes:
    - /:/rootfs:ro
    - /var/run:/var/run:rw
    - /sys:/sys:ro
    - /var/lib/docker/:/var/lib/docker:ro
    - /etc/localtime:/etc/localtime
    ports:
    - "8080:8080"
    networks:
    - monitor

服务端修改 prometheus.yml

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
    - targets: ['188.188.2.30:9090']
  - job_name: 'cadvisor'
    static_configs:
    - targets: ['188.188.2.30:8083']
    - targets: ['188.188.4.5:8080']    # 增加容器客户端采集
  - job_name: 'node-exporter'
    scrape_interval: 8s
    static_configs:
    - targets: ['188.188.2.30:9100']
    - targets: ['188.188.4.5:9100']    # 增加客户端物理采集
...                                    # 具体根据情况添加主机

posted @ 2021-01-13 17:05  YuiKuen_Yuen  阅读(213)  评论(0)    收藏  举报