常用的监控规则

# Prometheus alerting rule groups. Every rule in this group selects targets
# through the `serverip` label.
groups:
  - name: 双鱼-帝国
    # Alerting rules belonging to this group.
    rules:
    # Fires when an ext4/xfs filesystem on a 10.254.25.x host is more than 80% used.
    # Usage is 100 - avail/size*100, rounded up to a whole percent.
    # Prometheus regex matchers are fully anchored and "." matches any character,
    # so the dots are escaped; the previous pattern "10.254.25.*" also selected
    # hosts in 10.254.250.x - 10.254.255.x.
    # The threshold sits outside ceil() for consistency with the `mem` rule
    # (the set of firing series and their values are unchanged).
    - alert: disk
      expr: >-
        ceil(100 - (
        node_filesystem_avail_bytes{fstype=~"ext4|xfs",serverip=~"10\\.254\\.25\\..*"}
        / node_filesystem_size_bytes{fstype=~"ext4|xfs",serverip=~"10\\.254\\.25\\..*"}
        ) * 100) > 80
      labels:
        severity: warning
      annotations:
        info: "分区[{{ $labels.mountpoint }}]使用率超过80%"
        value: "{{ $value }}%"
        # Query template with %s placeholders; presumably filled in by an
        # external tool (mountpoint / serverip) - not visible in this file.
        query: 'ceil(100 - (node_filesystem_avail_bytes{fstype=~"ext4|xfs",mountpoint="%s",serverip="%s" } / node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint="%s",serverip="%s" }) * 100)'

    # Fires when average non-idle CPU on a 10.254.25.x host stays above 80% for 5 minutes.
    # Usage is 100 - avg(idle rate)*100 per host, rounded up to a whole percent.
    # The serverip regex escapes its dots: Prometheus regexes are fully anchored,
    # and the previous "10.254.25.*" also selected 10.254.250.x - 10.254.255.x.
    # The threshold sits outside ceil() for consistency with the `mem` rule
    # (the set of firing series and their values are unchanged).
    - alert: cpu
      expr: >-
        ceil(100 - (
        avg(irate(node_cpu_seconds_total{mode="idle",serverip=~"10\\.254\\.25\\..*"}[5m]))
        by (serverip,hostname,env,os,type,department,business,label_dp,label_bu)
        * 100)) > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        info: "CPU使用率大于80%"
        value: "{{ $value }}%"
        # Query template with a %s placeholder; presumably filled in with the
        # server IP by an external tool - not visible in this file.
        query: 'ceil(100-(avg(irate(node_cpu_seconds_total{mode="idle",serverip="%s"}[5m])) by(serverip,hostname,env,os,type,department,business,label_dp,label_bu)* 100))'
  
    # Fires when memory usage on a 10.254.25.x host stays above 80% for 5 minutes.
    # Usage is 100 - MemAvailable/MemTotal*100, rounded up to a whole percent.
    # The serverip regex escapes its dots: Prometheus regexes are fully anchored,
    # and the previous "10.254.25.*" also selected 10.254.250.x - 10.254.255.x.
    - alert: mem
      expr: >-
        ceil(100 -
        node_memory_MemAvailable_bytes{serverip=~"10\\.254\\.25\\..*"}
        / node_memory_MemTotal_bytes{serverip=~"10\\.254\\.25\\..*"}
        * 100) > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        info: "内存使用率超过80%"
        value: "{{ $value }}%"
        # Query template with %s placeholders; presumably filled in with the
        # server IP by an external tool - not visible in this file.
        query: 'ceil(100 - node_memory_MemAvailable_bytes{serverip="%s"} / node_memory_MemTotal_bytes{serverip="%s"} * 100)'

    # Fires as soon as a scrape target on a 10.254.25.x host reports up == 0
    # (no `for` clause, so there is no grace period).
    # The serverip regex escapes its dots: Prometheus regexes are fully anchored,
    # and the previous "10.254.25.*" also selected 10.254.250.x - 10.254.255.x.
    - alert: down
      expr: up{serverip=~"10\\.254\\.25\\..*"} == 0
      labels:
        severity: critical
      annotations:
        info: "{{ $labels.serverip }} 宕机"
        value: "{{ $value }}"
        # Query template with a %s placeholder; presumably filled in with the
        # server IP by an external tool - not visible in this file.
        query: 'up{serverip="%s"}'

    # Fires when inbound traffic on a non-virtual interface of a 10.254.25.x host
    # stays above 2 MB/s for 10 minutes (value rounded to 0.01 MB/s).
    # Fixes:
    #  - serverip used the exact-match operator `=` with a regex pattern, so the
    #    selector never matched any series; it now uses `=~` with escaped dots.
    #  - the device exclusion ended in `virbr*|lo*`, which (fully anchored) does
    #    not exclude e.g. virbr0; it is now `virbr.*|lo`.
    #  - the message said 5 minutes while `for` is 10m; the message now matches.
    - alert: receive
      expr: >-
        round(
        irate(node_network_receive_bytes_total{serverip=~"10\\.254\\.25\\..*",device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo"}[5m])
        / 1024 / 1024, 0.01) > 2
      for: 10m
      labels:
        severity: warning
      annotations:
        info: "流入网络带宽持续10分钟高于2MB/s"
        value: "{{ $value }}MB/s"
        # Query template with a %s placeholder; presumably filled in with the
        # server IP by an external tool - not visible in this file.
        query: 'round(sum(rate(node_network_receive_bytes_total{device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo",serverip="%s"}[5m])) by (instance,department,business,job,serverip,label_dp,label_bu,hostname) / 1024 / 1024 , 0.01 )'

    # Fires when outbound traffic on a non-virtual interface of a 10.254.25.x host
    # stays above 2 MB/s for 10 minutes (value rounded to 0.01 MB/s).
    # Fixes:
    #  - serverip used the exact-match operator `=` with a regex pattern, so the
    #    selector never matched any series; it now uses `=~` with escaped dots.
    #  - the device exclusion ended in `virbr*|lo*`, which (fully anchored) does
    #    not exclude e.g. virbr0; it is now `virbr.*|lo`.
    #  - the message said 5 minutes while `for` is 10m; the message now matches.
    - alert: transmit
      expr: >-
        round(
        irate(node_network_transmit_bytes_total{serverip=~"10\\.254\\.25\\..*",device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo"}[5m])
        / 1024 / 1024, 0.01) > 2
      for: 10m
      labels:
        severity: warning
      annotations:
        info: "流出网络带宽持续10分钟高于2MB/s"
        value: "{{ $value }}MB/s"
        # Query template with a %s placeholder; presumably filled in with the
        # server IP by an external tool - not visible in this file.
        query: 'round(sum(rate(node_network_transmit_bytes_total{device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo",serverip="%s"}[5m])) by (instance,department,business,job,serverip,label_dp,label_bu,hostname) / 1024 / 1024 , 0.01)'

    # Fires when a 10.254.25.x host keeps more than 1000 established TCP
    # connections for 10 minutes.
    # Fixes:
    #  - the message said 4000 while the expression threshold is 1000; the
    #    message now states the threshold that is actually evaluated.
    #  - the serverip regex escapes its dots: Prometheus regexes are fully
    #    anchored, and "10.254.25.*" also selected 10.254.250.x - 10.254.255.x.
    - alert: tcp_conn_num
      expr: node_netstat_Tcp_CurrEstab{serverip=~"10\\.254\\.25\\..*"} > 1000
      for: 10m
      labels:
        severity: warning
      annotations:
        info: "TCP连接超过1000"
        value: "{{ $value }}"
        # Query template with a %s placeholder; presumably filled in with the
        # server IP by an external tool - not visible in this file.
        query: 'node_netstat_Tcp_CurrEstab{serverip="%s"}'

  

posted @ 2025-02-10 16:10  羊脂玉净瓶  阅读(12)  评论(0)    收藏  举报