备注:以下操作均在root权限下进行
安装前置环境:
已安装docker
已安装docker-compose
在/opt目录新建目录prom
在prom目录下分别创建prometheus、alertmanager、grafana相关目录
- mkdir -p /opt/prom/{prometheus,prometheus/data,alertmanager,grafana}
- #prom目录结构
- prom/
- ├── alertmanager
- ├── grafana
- └── prometheus
-     └── data
赋予相关目录对应的权限
chmod 777 /opt/prom/{prometheus/data,grafana}
编写docker-compose.yml文件,放在目录/opt/prom下
- touch /opt/prom/docker-compose.yml
- vim /opt/prom/docker-compose.yml
docker-compose.yml文件内容
---
# /opt/prom/docker-compose.yml
#
# Monitoring stack: node-exporter, a DingTalk webhook bridge, Alertmanager,
# Prometheus and Grafana. Every service joins the `prom` bridge network, so
# containers reach each other by service name on their *container* ports.
version: "3.3"

services:
  # NOTE(review): no host paths are mounted into this container, so
  # filesystem metrics presumably describe the container rather than the
  # host - confirm against the node_exporter Docker instructions.
  node-exporter:
    image: prom/node-exporter:latest
    networks:
      - prom
    ports:
      # node-exporter listens on 9100 by default; MeterSphere's node-exporter
      # already occupies 9100 on the host, so it is published on 9101 instead.
      - "9101:9100"

  # Bridge that forwards Alertmanager webhooks to a DingTalk robot.
  dingtalk:
    image: timonwong/prometheus-webhook-dingtalk:latest
    networks:
      - prom
    ports:
      - "8060:8060"
    volumes:
      - type: bind
        source: ./alertmanager/config.yml
        target: /etc/prometheus-webhook-dingtalk/config.yml
        read_only: true

  alertmanager:
    image: prom/alertmanager:latest
    depends_on:
      - dingtalk
    networks:
      - prom
    ports:
      - "9093:9093"
      - "9094:9094"
    volumes:
      - type: bind
        source: ./alertmanager/alertmanager.yml
        target: /etc/alertmanager/alertmanager.yml
        read_only: true

  prometheus:
    image: prom/prometheus:latest
    depends_on:
      - alertmanager
    networks:
      - prom
    ports:
      # Prometheus listens on 9090 by default; ms-prometheus already occupies
      # that host port, so it is published on 9200 instead.
      - "9200:9090"
    volumes:
      - type: bind
        source: ./prometheus/prometheus.yml
        target: /etc/prometheus/prometheus.yml
        read_only: true
      - type: bind
        source: ./prometheus/alert-rules.yml
        target: /etc/prometheus/alert-rules.yml
        read_only: true
      # Named volume `prometheus` (declared below) mounted at /prometheus.
      - type: volume
        source: prometheus
        target: /prometheus

  grafana:
    image: grafana/grafana:latest
    depends_on:
      - prometheus
    networks:
      - prom
    ports:
      - "3000:3000"
    volumes:
      # Named volume `grafana` (declared below) mounted at /var/lib/grafana.
      - type: volume
        source: grafana
        target: /var/lib/grafana

# Named volumes backed by host directories through the local driver's
# bind options.
volumes:
  prometheus:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/prom/prometheus/data
  grafana:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/prom/grafana

networks:
  prom:
    driver: bridge
编写prometheus的配置文件prometheus.yml,放在/opt/prom/prometheus目录下
- touch /opt/prom/prometheus/prometheus.yml
- vim /opt/prom/prometheus/prometheus.yml
---
# /opt/prom/prometheus/prometheus.yml
#
# Scrape targets are addressed by docker-compose service name and by the
# port the process listens on INSIDE the container, not by the host port
# published in docker-compose.yml.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

rule_files:
  # Glob that picks up alert-rules.yml, which docker-compose.yml mounts
  # next to this file under /etc/prometheus/.
  - "*rules.yml"

scrape_configs:
  # Prometheus scraping itself. The container listens on 9090; host port
  # 9200 only exists on the host side of the port mapping.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['prometheus:9090']

  # The job must be called `node`: every rule in alert-rules.yml selects
  # series with job="node". The target uses container port 9100; 9101 is
  # only the host-side published port.
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']

  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager:9093']
编写告警规则alert-rules.yml文件,放在目录/opt/prom/prometheus下
- touch /opt/prom/prometheus/alert-rules.yml
- vim /opt/prom/prometheus/alert-rules.yml
---
# /opt/prom/prometheus/alert-rules.yml
#
# Host-level alerting rules built on node-exporter metrics.
# Selectors with job="node" only match series scraped by a job named `node`.
# Each rule copies the instance label into its labels and exposes the
# triggering sample value through the `value` annotation.
groups:
  - name: node-alert
    rules:
      # ---- availability -------------------------------------------------
      - alert: NodeDown
        expr: up{job="node"} == 0
        for: 5m
        labels:
          severity: critical
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} down"
          description: "Instance: {{ $labels.instance }} 已经宕机 5分钟"
          value: "{{ $value }}"

      # ---- CPU / load ---------------------------------------------------
      - alert: NodeCpuHigh
        expr: (1 - avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="idle"}[5m]))) * 100 > 80
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} cpu使用率过高"
          description: "CPU 使用率超过 80%"
          value: "{{ $value }}"

      - alert: NodeCpuIowaitHigh
        expr: avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="iowait"}[5m])) * 100 > 50
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} cpu iowait 使用率过高"
          description: "CPU iowait 使用率超过 50%"
          value: "{{ $value }}"

      # The left side carries instance+job labels while the aggregated right
      # side carries only instance, so the comparison needs `on (instance)`;
      # without it the two vectors never match and the alert cannot fire.
      # The right side counts one mode="system" series per CPU, i.e. cores.
      - alert: NodeLoad5High
        expr: node_load5{job="node"} > on (instance) (count by (instance) (node_cpu_seconds_total{job="node",mode="system"}) * 1.2)
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} load(5m) 过高"
          description: "Load(5m) 过高,超出cpu核数 1.2倍"
          value: "{{ $value }}"

      # ---- memory -------------------------------------------------------
      - alert: NodeMemoryHigh
        expr: (1 - node_memory_MemAvailable_bytes{job="node"} / node_memory_MemTotal_bytes{job="node"}) * 100 > 90
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} memory 使用率过高"
          description: "Memory 使用率超过 90%"
          value: "{{ $value }}"

      # ---- disk space ---------------------------------------------------
      - alert: NodeDiskRootHigh
        expr: (1 - node_filesystem_avail_bytes{job="node",fstype=~"ext.*|xfs",mountpoint="/"} / node_filesystem_size_bytes{job="node",fstype=~"ext.*|xfs",mountpoint="/"}) * 100 > 90
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk(/ 分区) 使用率过高"
          description: "Disk(/ 分区) 使用率超过 90%"
          value: "{{ $value }}"

      - alert: NodeDiskBootHigh
        expr: (1 - node_filesystem_avail_bytes{job="node",fstype=~"ext.*|xfs",mountpoint="/boot"} / node_filesystem_size_bytes{job="node",fstype=~"ext.*|xfs",mountpoint="/boot"}) * 100 > 80
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk(/boot 分区) 使用率过高"
          description: "Disk(/boot 分区) 使用率超过 80%"
          value: "{{ $value }}"

      # ---- disk throughput (threshold 20 * 1024^2 bytes/s) ---------------
      - alert: NodeDiskReadHigh
        expr: irate(node_disk_read_bytes_total{job="node"}[5m]) > 20 * (1024 ^ 2)
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk 读取字节数 速率过高"
          description: "Disk 读取字节数 速率超过 20 MB/s"
          value: "{{ $value }}"

      - alert: NodeDiskWriteHigh
        expr: irate(node_disk_written_bytes_total{job="node"}[5m]) > 20 * (1024 ^ 2)
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk 写入字节数 速率过高"
          description: "Disk 写入字节数 速率超过 20 MB/s"
          value: "{{ $value }}"

      # ---- disk IOPS ----------------------------------------------------
      - alert: NodeDiskReadRateCountHigh
        expr: irate(node_disk_reads_completed_total{job="node"}[5m]) > 3000
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk iops 每秒读取速率过高"
          description: "Disk iops 每秒读取速率超过 3000 iops"
          value: "{{ $value }}"

      - alert: NodeDiskWriteRateCountHigh
        expr: irate(node_disk_writes_completed_total{job="node"}[5m]) > 3000
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk iops 每秒写入速率过高"
          description: "Disk iops 每秒写入速率超过 3000 iops"
          value: "{{ $value }}"

      # ---- inodes -------------------------------------------------------
      - alert: NodeInodeRootUsedPercentHigh
        expr: (1 - node_filesystem_files_free{job="node",fstype=~"ext4|xfs",mountpoint="/"} / node_filesystem_files{job="node",fstype=~"ext4|xfs",mountpoint="/"}) * 100 > 80
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk(/ 分区) inode 使用率过高"
          description: "Disk (/ 分区) inode 使用率超过 80%"
          value: "{{ $value }}"

      - alert: NodeInodeBootUsedPercentHigh
        expr: (1 - node_filesystem_files_free{job="node",fstype=~"ext4|xfs",mountpoint="/boot"} / node_filesystem_files{job="node",fstype=~"ext4|xfs",mountpoint="/boot"}) * 100 > 80
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk(/boot 分区) inode 使用率过高"
          description: "Disk (/boot 分区) inode 使用率超过 80%"
          value: "{{ $value }}"

      # ---- file descriptors ---------------------------------------------
      - alert: NodeFilefdAllocatedPercentHigh
        expr: node_filefd_allocated{job="node"} / node_filefd_maximum{job="node"} * 100 > 80
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} filefd 打开百分比过高"
          description: "Filefd 打开百分比 超过 80%"
          value: "{{ $value }}"

      # ---- network ------------------------------------------------------
      # Only the interfaces named in the device regex are considered.
      # Byte rates are multiplied by 8 on both sides to compare in bits/s.
      - alert: NodeNetworkNetinBitRateHigh
        expr: avg by (instance) (irate(node_network_receive_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8
        for: 3m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} network 接收比特数 速率过高"
          description: "Network 接收比特数 速率超过 20MB/s"
          value: "{{ $value }}"

      - alert: NodeNetworkNetoutBitRateHigh
        expr: avg by (instance) (irate(node_network_transmit_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8
        for: 3m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} network 发送比特数 速率过高"
          description: "Network 发送比特数 速率超过 20MB/s"
          value: "{{ $value }}"

      - alert: NodeNetworkNetinPacketErrorRateHigh
        expr: avg by (instance) (irate(node_network_receive_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15
        for: 3m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} 接收错误包 速率过高"
          description: "Network 接收错误包 速率超过 15个/秒"
          value: "{{ $value }}"

      # Uses the transmit *error* counter, mirroring the receive-side rule;
      # the total-packets counter would fire on ordinary traffic above
      # 15 packets/s.
      - alert: NodeNetworkNetoutPacketErrorRateHigh
        expr: avg by (instance) (irate(node_network_transmit_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15
        for: 3m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} 发送错误包 速率过高"
          description: "Network 发送错误包 速率超过 15个/秒"
          value: "{{ $value }}"

      # ---- processes ----------------------------------------------------
      - alert: NodeProcessBlockedHigh
        expr: node_procs_blocked{job="node"} > 10
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} 当前被阻塞的任务的数量过多"
          description: "Process 当前被阻塞的任务的数量超过 10个"
          value: "{{ $value }}"

      # ---- clock --------------------------------------------------------
      # Threshold is 3 * 60 seconds of absolute offset.
      - alert: NodeTimeOffsetHigh
        expr: abs(node_timex_offset_seconds{job="node"}) > 3 * 60
        for: 2m
        labels:
          severity: info
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} 时间偏差过大"
          description: "Time 节点的时间偏差超过 3m"
          value: "{{ $value }}"
编写alertmanager配置文件alertmanager.yml,放在/opt/prom/alertmanager目录下
- touch /opt/prom/alertmanager/alertmanager.yml
- vim /opt/prom/alertmanager/alertmanager.yml
---
# /opt/prom/alertmanager/alertmanager.yml
#
# All alerts go to the single `default` receiver, which notifies by e-mail
# and through the DingTalk webhook bridge.
global:
  resolve_timeout: 5m
  # SMTP relay of the mail provider; with SSL enabled the port is usually 465.
  smtp_smarthost: "smtp.163.com:465"
  # Sender address.
  smtp_from: "test@163.com"
  # Mailbox account name.
  smtp_auth_username: "test@163.com"
  # Mailbox password or authorization code (placeholder - replace it).
  smtp_auth_password: "test@123"
  smtp_require_tls: false

route:
  receiver: "default"
  group_by:
    - alertname
  group_wait: 10s
  group_interval: 1m
  repeat_interval: 1h

# A critical alert mutes warning alerts that share its alertname and
# instance labels.
# NOTE(review): if each alert name has exactly one severity, `alertname`
# in `equal` presumably prevents this rule from ever matching - confirm
# the intent.
inhibit_rules:
  - source_match:
      severity: "critical"
    target_match:
      severity: "warning"
    equal:
      - alertname
      - instance

receivers:
  - name: "default"
    email_configs:
      - to: "receiver@163.com"
        send_resolved: true
    webhook_configs:
      # Endpoint of the DingTalk bridge (compose service `dingtalk`).
      # NOTE(review): the `webhook` path segment presumably has to match a
      # target name in the bridge's config.yml - confirm in its docs.
      - url: "http://dingtalk:8060/dingtalk/webhook/send"
        send_resolved: true
编写钉钉告警相关配置文件config.yml放在/opt/prom/alertmanager/目录下
- touch /opt/prom/alertmanager/config.yml
- vim /opt/prom/alertmanager/config.yml
---
# /opt/prom/alertmanager/config.yml
# Configuration for the prometheus-webhook-dingtalk bridge.
targets:
  webhook:
    # Replace with the webhook URL of your own DingTalk robot.
    url: "https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx"
    mention:
      all: true
切换到/opt/prom目录下,使用docker-compose up -d启动相关服务
docker-compose up -d

浙公网安备 33010602011771号