常用的监控规则
# Top-level key of a Prometheus rule file: the list of rule groups.
groups:
# The single rule group in this file.
- name: 双鱼-帝国
# Alerting rules belonging to the group above.
# NOTE(review): Prometheus expects "rules" to be nested inside the group entry
# (indented under "- name"); the indentation appears to have been lost in this
# copy — confirm against the deployed file.
rules:
# Fires when an ext4/xfs filesystem on a 10.254.25.x host is more than 80% used.
# There is no "for:" clause, so the alert fires on the first evaluation that
# crosses the threshold.
- alert: disk
  # Label regexes are fully anchored and "." matches any character, so the dots
  # are escaped; otherwise 10.254.250.x-10.254.255.x would match as well.
  # The "> 80" comparison sits outside ceil(), like the mem rule; the result is
  # identical because ceil(x) > 80 holds exactly when x > 80.
  expr: >-
    ceil(100 - (node_filesystem_avail_bytes{fstype=~"ext4|xfs",serverip=~"10\\.254\\.25\\..*"}
    / node_filesystem_size_bytes{fstype=~"ext4|xfs",serverip=~"10\\.254\\.25\\..*"}) * 100) > 80
  labels:
    severity: warning
  annotations:
    info: "分区[{{ $labels.mountpoint }}]使用率超过80%"
    value: "{{ $value }}%"
    # Per-host query template; the %s placeholders are presumably filled in by
    # the alert consumer (mountpoint, serverip) — verify against that tool.
    query: 'ceil(100 - (node_filesystem_avail_bytes{fstype=~"ext4|xfs",mountpoint="%s",serverip="%s" } / node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint="%s",serverip="%s" }) * 100)'
# Fires when the average non-idle CPU percentage of a 10.254.25.x host stays
# above 80% for 5 minutes.
- alert: cpu
  # Dots in the serverip regex are escaped so that only 10.254.25.x matches
  # (an unescaped "10.254.25.*" also matches 10.254.250.x-10.254.255.x).
  # The "> 80" comparison sits outside ceil(); ceil(x) > 80 holds exactly when
  # x > 80, so the alert condition and reported value are unchanged.
  expr: >-
    ceil(100 - (avg(irate(node_cpu_seconds_total{mode="idle",serverip=~"10\\.254\\.25\\..*"}[5m]))
    by (serverip,hostname,env,os,type,department,business,label_dp,label_bu) * 100)) > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    info: "CPU使用率大于80%"
    value: "{{ $value }}%"
    # Per-host query template; %s is presumably replaced with the serverip by
    # the alert consumer — verify against that tool.
    query: 'ceil(100-(avg(irate(node_cpu_seconds_total{mode="idle",serverip="%s"}[5m])) by(serverip,hostname,env,os,type,department,business,label_dp,label_bu)* 100))'
# Fires when memory usage (100 - MemAvailable / MemTotal * 100) of a
# 10.254.25.x host stays above 80% for 5 minutes.
- alert: mem
  # Dots in the serverip regex are escaped so that only 10.254.25.x matches
  # (an unescaped "10.254.25.*" also matches 10.254.250.x-10.254.255.x).
  expr: >-
    ceil(100 - node_memory_MemAvailable_bytes{serverip=~"10\\.254\\.25\\..*"}
    / node_memory_MemTotal_bytes{serverip=~"10\\.254\\.25\\..*"} * 100) > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    info: "内存使用率超过80%"
    value: "{{ $value }}%"
    # Per-host query template; %s is presumably replaced with the serverip by
    # the alert consumer — verify against that tool.
    query: 'ceil(100 - node_memory_MemAvailable_bytes{serverip="%s"} / node_memory_MemTotal_bytes{serverip="%s"} * 100)'
# Fires as soon as the "up" series of a 10.254.25.x target equals 0.
# There is no "for:" clause, so a single failed scrape triggers the alert.
- alert: down
  # Dots in the serverip regex are escaped so that only 10.254.25.x matches
  # (an unescaped "10.254.25.*" also matches 10.254.250.x-10.254.255.x).
  expr: up{serverip=~"10\\.254\\.25\\..*"} == 0
  labels:
    severity: critical
  annotations:
    info: "{{ $labels.serverip }} 宕机"
    value: "{{ $value }}"
    # Per-host query template; %s is presumably replaced with the serverip by
    # the alert consumer — verify against that tool.
    query: 'up{serverip="%s"}'
# Fires when inbound traffic on a network interface of a 10.254.25.x host stays
# above 2 (bytes / 1024 / 1024 per second) for 10 minutes. tap/veth/br/docker/
# virbr devices and the loopback device are excluded.
- alert: receive
  # serverip must use the regex matcher "=~": with "=" the pattern was compared
  # as a literal string and could never match a real address. Dots are escaped
  # so that only 10.254.25.x matches.
  # Device regex: "virbr*" only matched "virb" followed by zero or more "r", so
  # virbr0 was not excluded; it is now "virbr.*". "lo*" is narrowed to "lo".
  expr: >-
    round(irate(node_network_receive_bytes_total{serverip=~"10\\.254\\.25\\..*",
    device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo"}[5m]) / 1024 / 1024, 0.01) > 2
  for: 10m
  labels:
    severity: warning
  annotations:
    # The message states 10 minutes to agree with "for: 10m" above.
    info: "流入网络带宽持续10分钟高于2MB/s"
    value: "{{ $value }}MB/s"
    # Per-host query template (sums all matching devices per host); %s is
    # presumably replaced with the serverip by the alert consumer — verify.
    # It carries the same corrected device regex as expr.
    query: 'round(sum(rate(node_network_receive_bytes_total{device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo",serverip="%s"}[5m])) by (instance,department,business,job,serverip,label_dp,label_bu,hostname) / 1024 / 1024 , 0.01 )'
# Fires when outbound traffic on a network interface of a 10.254.25.x host
# stays above 2 (bytes / 1024 / 1024 per second) for 10 minutes. tap/veth/br/
# docker/virbr devices and the loopback device are excluded.
- alert: transmit
  # serverip must use the regex matcher "=~": with "=" the pattern was compared
  # as a literal string and could never match a real address. Dots are escaped
  # so that only 10.254.25.x matches.
  # Device regex: "virbr*" only matched "virb" followed by zero or more "r", so
  # virbr0 was not excluded; it is now "virbr.*". "lo*" is narrowed to "lo".
  expr: >-
    round(irate(node_network_transmit_bytes_total{serverip=~"10\\.254\\.25\\..*",
    device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo"}[5m]) / 1024 / 1024, 0.01) > 2
  for: 10m
  labels:
    severity: warning
  annotations:
    # The message states 10 minutes to agree with "for: 10m" above.
    info: "流出网络带宽持续10分钟高于2MB/s"
    value: "{{ $value }}MB/s"
    # Per-host query template (sums all matching devices per host); %s is
    # presumably replaced with the serverip by the alert consumer — verify.
    # It carries the same corrected device regex as expr.
    query: 'round(sum(rate(node_network_transmit_bytes_total{device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo",serverip="%s"}[5m])) by (instance,department,business,job,serverip,label_dp,label_bu,hostname) / 1024 / 1024 , 0.01)'
# Fires when the number of established TCP connections on a 10.254.25.x host
# stays above 1000 for 10 minutes.
- alert: tcp_conn_num
  # Dots in the serverip regex are escaped so that only 10.254.25.x matches
  # (an unescaped "10.254.25.*" also matches 10.254.250.x-10.254.255.x).
  expr: node_netstat_Tcp_CurrEstab{serverip=~"10\\.254\\.25\\..*"} > 1000
  for: 10m
  labels:
    severity: warning
  annotations:
    # The message previously said 4000 while the rule fires above 1000; the
    # text now matches the enforced threshold.
    # NOTE(review): if 4000 was the intended limit, change expr instead.
    info: "TCP连接超过1000"
    value: "{{ $value }}"
    # Per-host query template; %s is presumably replaced with the serverip by
    # the alert consumer — verify against that tool.
    query: 'node_netstat_Tcp_CurrEstab{serverip="%s"}'