报警规则配置大全

服务器告警规则

cat   /data/prometheus/rules/node_exporter.yml 
groups:
# Recording rules that precompute per-host CPU and memory usage percentages.
- name: node_usage_record_rules
  # Evaluation interval for this group.
  interval: 1m
  rules:
  # CPU usage in percent: 100 * (1 - average idle rate over one minute),
  # averaged per (instance, vendor, account, group, name).
  - record: 'cpu:usage:rate1m'
    expr: >-
      (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m]))
      by (instance,vendor,account,group,name)) * 100
  # Memory usage in percent: 100 * (1 - MemAvailable / MemTotal).
  - record: 'mem:usage:rate1m'
    expr: >-
      (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100

# Host-level alerts built on node_exporter metrics.
# Counter-based alerts use rate() rather than irate(): irate() only looks at
# the last two samples in the range, so one quiet scrape can reset the `for`
# timer and the alert never fires under sustained but bursty load.
- name: node-exporter
  rules:
  # Memory usage (100 - available/total * 100) above 90% for 5 minutes.
  - alert: 主机内存使用率
    expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "内存使用率为{{ $value | humanize }}%"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"

  # Average non-idle CPU time across all cores above 90% for 5 minutes.
  - alert: 主机CPU使用率
    expr: 100 - (avg by(instance,name,group,account) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "CPU使用率为{{ $value | humanize }}%"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"

  # 5-minute load average divided by the CPU count, above 2.0 for 5 minutes.
  # count(...) by (cpu, ...) yields one series per CPU; the outer sum() adds
  # them up to the number of CPUs on the host.
  - alert: 主机系统负载
    expr: node_load5 / on (instance,name,group,account) sum(count(node_cpu_seconds_total{mode='system'}) by (cpu,instance,name,group,account)) by(instance,name,group,account) > 2.0
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "系统5分钟平均负载与CPU核心数比值为 {{ $value | humanize }}倍"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"

  # Filesystem usage above 90% for 5 minutes. Only ext*/xfs filesystems are
  # checked; mount points containing "pods" or located under
  # /var/lib/docker/devicemapper/mnt/ are excluded.
  - alert: 主机磁盘使用率
    expr: |
      100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype=~"ext.?|xfs",mountpoint!~".*pods.*|/var/lib/docker/devicemapper/mnt/.*"} * 100) > 90
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "磁盘使用率为 {{ $value | humanize }}%"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}; 磁盘: {{ $labels.device }}"

  # Per-device write throughput above 100 MiB/s (bytes / 1024 / 1024)
  # for 5 minutes.
  - alert: 主机磁盘写IO
    expr: rate(node_disk_written_bytes_total[5m]) / 1024 / 1024 > 100
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "当前写IO为 {{ $value | humanize }}MB/s"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}; 磁盘: {{ $labels.device }}"

  # Per-device read throughput above 100 MiB/s for 5 minutes.
  - alert: 主机磁盘读IO
    expr: rate(node_disk_read_bytes_total[5m]) / 1024 / 1024 > 100
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "当前读IO为 {{ $value | humanize }}MB/s"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}; 磁盘: {{ $labels.device }}"

  # Inbound traffic per interface above 20 MiB/s for 5 minutes. Interfaces
  # whose name matches tap*, veth*, br*, docker*, virbr*, lo* or cni* are
  # excluded by the device regex.
  - alert: 主机网络流入(下载)数据过多
    expr: sum by(device,instance, name, group, account) (rate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 20
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "流入数据为 {{ $value | humanize }}MB/s"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"

  # Outbound traffic per interface above 20 MiB/s for 5 minutes, with the
  # same interface exclusions as the inbound rule.
  - alert: 主机网络流出(上传)数据过多
    expr: sum by(device,instance, name, group, account) (rate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 20
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "流出数据为 {{ $value | humanize }}MB/s"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"

  # Uptime (current time minus boot time) below 600 seconds, held for 1 minute.
  - alert: 主机重启
    expr: node_time_seconds - node_boot_time_seconds < 600
    for: 1m
    labels:
      alertype: system
    annotations:
      summary: "主机重启"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"

  # A target of the "node_exporter" scrape job has been down for 1 minute.
  - alert: node_exporter状态
    expr: up{job="node_exporter"} == 0
    for: 1m
    labels:
      alertype: system
    annotations:
      summary: "node_exporter异常"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"

MySQL 告警规则

cat /data/prometheus/rules/mysql_exporter.yml
groups:
# MySQL alerts; the rules below use mysql_* metrics.
- name: MySQL-Alert
  rules:
  # mysql_up has been 0 for 1 minute.
  - alert: MySQL_is_down
    expr: mysql_up == 0
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "MySQL database is down"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

  # More than 60 new slow queries within the last minute.
  # increase() is used because the slow-query metric is a counter: unlike
  # delta(), it handles counter resets (e.g. after a MySQL restart).
  - alert: MySQL_慢查询过多
    expr: increase(mysql_global_status_slow_queries[1m]) > 60
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "每分钟慢查询 {{ $value }} 个"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

  # More than 100 running threads for 1 minute.
  - alert: MySQL_当前活跃的连接数过多
    expr: mysql_global_status_threads_running > 100
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "当前活跃的连接数 {{ $value }} 个"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

  # Replication IO thread stopped. Only instances that report a master
  # server id greater than 0 are considered; matching is done on `instance`.
  - alert: MySQL_主从IO线程运行状态异常
    expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "MySQL Slave IO thread not running"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

  # Replication SQL thread stopped, with the same master-server-id condition.
  - alert: MySQL_主从SQL线程运行状态异常
    expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "MySQL Slave SQL thread not running"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

  # Replication lag above 3 seconds for 1 minute.
  - alert: MySQL_主从复制延迟过高
    expr: mysql_slave_status_seconds_behind_master > 3
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "主从复制延迟当前:{{ $value | humanize }}s"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

  # Server uptime below 600 seconds, held for 1 minute.
  - alert: MySQL_is_Restart
    expr: mysql_global_status_uptime < 600
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "MySQL database is Restart"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

Redis 告警规则

cat  /data/prometheus/rules/redis_exporter.yml 
groups:
# Redis alerts; the rules below use redis_* metrics.
- name: REDIS-Alert
  rules:
  # redis_up is 0; fires immediately (for: 0m).
  - alert: RedisDown
    expr: redis_up == 0
    for: 0m
    labels:
      alertype: dba
    annotations:
      summary: "Redis down"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

  # Compares the number of Redis series with the total of connected replicas
  # after aggregating away `instance` and `job`.
  # NOTE(review): `without (instance, job)` removes the instance label from
  # the result, so {{ $labels.instance }} in the description renders empty
  # for this alert.
  - alert: RedisDisconnectedSlaves
    expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1
    for: 0m
    labels:
      alertype: dba
    annotations:
      summary: "Redis 从节点断开连接,建议检查 Redis 复制状态。"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

  # The connected-replica gauge decreased within the last minute.
  - alert: RedisReplicationBroken
    expr: delta(redis_connected_slaves[1m]) < 0
    for: 0m
    labels:
      alertype: dba
    annotations:
      summary: "检测到 Redis 从节点复制中断"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

  # The connected-replica count changed more than once per minute,
  # sustained for 2 minutes.
  - alert: RedisClusterFlapping
    expr: changes(redis_connected_slaves[1m]) > 1
    for: 2m
    labels:
      alertype: dba
    annotations:
      summary: "检测到 Redis 从节点连接状态发生变化。这可能是由于从节点与主节点断开连接后又重新连接(即抖动)导致的"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

  # Last RDB save is older than 24 hours (60 * 60 * 24 seconds).
  - alert: RedisMissingBackup
    expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
    for: 0m
    labels:
      alertype: dba
    annotations:
      summary: "Redis 已超过24小时未进行RDB备份"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

  # Used memory above 90% of the configured maximum for 2 minutes.
  # The `> 0` filter drops instances whose redis_memory_max_bytes is 0;
  # dividing by 0 would yield +Inf and make the alert fire permanently.
  - alert: RedisOutOfConfiguredMaxmemory
    expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 90
    for: 2m
    labels:
      alertype: dba
    annotations:
      summary: "Redis 占用最大内存限制 {{ $value }}%"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

  # More than 100 connected clients for 2 minutes.
  - alert: RedisTooManyConnections
    expr: redis_connected_clients > 100
    for: 2m
    labels:
      alertype: dba
    annotations:
      summary: "Redis 连接数达到 {{ $value }} 个"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"

黑盒告警规则

cat /data/prometheus/rules/blackbox_exporter.yml 
groups:
  # Site availability alerts; the rules below use probe_* metrics.
  - name: Domain
    rules:
    # The probe has been failing for 1 minute.
    - alert: 站点可用性
      expr: probe_success == 0
      for: 1m
      labels:
        alertype: system
      annotations:
        summary: "站点无法访问"
        description: "环境: {{ $labels.env }} ;项目: {{ $labels.project }}; 站点: {{ $labels.instance }}"

    # Share of successful probes over the last hour, in percent, below 80.
    # The `< 80` comparison is required: an alert expression without a
    # comparison fires for every series it returns, regardless of the value.
    # avg_over_time() equals sum_over_time() / count_over_time().
    - alert: 站点1h可用性低于80%
      expr: avg_over_time(probe_success[1h]) * 100 < 80
      for: 3m
      labels:
        alertype: system
      annotations:
        summary: "站点1h可用性:{{ $value | humanize }}%"
        description: "环境: {{ $labels.env }} ;项目: {{ $labels.project }}; 站点: {{ $labels.instance }}"

    # Either the probe failed with an HTTP status above 499, or the reported
    # HTTP status code is 0.
    - alert: 站点状态异常
      expr: (probe_success == 0 and probe_http_status_code > 499) or probe_http_status_code == 0
      for: 1m
      labels:
        alertype: system
      annotations:
        summary: "站点状态异常:{{ $value }}"
        description: "环境: {{ $labels.env }} ;项目: {{ $labels.project }}; 站点: {{ $labels.instance }}"

    # Probe duration above 0.5 seconds for 2 minutes.
    - alert: 站点耗时过高
      expr: probe_duration_seconds > 0.5
      for: 2m
      labels:
        alertype: system
      annotations:
        summary: "当前站点耗时:{{ $value | humanize }}s"
        description: "环境: {{ $labels.env }} ;项目: {{ $labels.project }}; 站点: {{ $labels.instance }}"

    # Earliest certificate expiry is fewer than 15 days away
    # (seconds / 3600 / 24 = days).
    - alert: SSL证书有效期
      expr: (probe_ssl_earliest_cert_expiry-time()) / 3600 / 24 < 15
      for: 2m
      labels:
        alertype: system
      annotations:
        summary: "证书有效期剩余{{ $value | humanize }}天"
        description: "环境: {{ $labels.env }} ;项目: {{ $labels.project }}; 站点: {{ $labels.instance }}"

Docker 告警规则

cat /data/prometheus/rules/cadvisor.yml
groups:
# Container alerts: one rule on the "CAdvisor_docker" scrape job and three
# rules on container_* metrics.
- name: Docker.rules
  rules:
  # A target of the "CAdvisor_docker" scrape job is down.
  - alert: 'CAdvisor Down'
    expr: >-
      up{job="CAdvisor_docker"} == 0
    for: 0m
    labels:
      alertype: container
    annotations:
      summary: "CAdvisor down"
      description: 'CAdvisor实例: 【{{ $labels.instance }}】'

  # A named container was last seen more than 60 seconds ago.
  - alert: ContainerKilled
    expr: >-
      time() - container_last_seen{name!=""} > 60
    for: 1m
    labels:
      alertype: container
    annotations:
      summary: "容器被删除"
      description: '主机: 【{{ $labels.instance }}】; 容器名: 【{{ $labels.name }}】'

  # CPU usage rate per (instance, name) times 100 is above 80 for 2 minutes.
  - alert: ContainerCpuUsage
    expr: >-
      (sum by(instance, name)
      (rate(container_cpu_usage_seconds_total{name!=""}[3m])) * 100) > 80
    for: 2m
    labels:
      alertype: container
    annotations:
      summary: '容器CPU使用率已超过80%,当前值: {{ $value }}'
      description: '主机: 【{{ $labels.instance }}】; 容器名: 【{{ $labels.name }}】'

  # Working-set memory above 80% of the memory limit for 2 minutes.
  # Limit series with a value of 0 are filtered out by `> 0`.
  - alert: ContainerMemoryUsage
    expr: >-
      (sum by(instance, name) (container_memory_working_set_bytes{name!=""})
      / sum by(instance, name) (container_spec_memory_limit_bytes{name!=""} > 0)
      * 100)  > 80
    for: 2m
    labels:
      alertype: container
    annotations:
      summary: '容器内存使用率已超过80%,当前值: {{ $value }}'
      description: '主机: 【{{ $labels.instance }}】; 容器名: 【{{ $labels.name }}】'

注意:以下规则尚未经过验证,使用前请先自行测试。

Elasticsearch(ES)告警规则

cat > /data/prometheus/conf/rules/elasticsearch.yaml << 'EOF'
groups:
# Elasticsearch alerts; the rules below use elasticsearch_* metrics.
- name: Elasticsearch.rules
  rules:
  # JVM heap usage above 90% for 2 minutes.
  - alert: ElasticsearchHeapUsageTooHigh
    expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      title: "Elasticsearch Heap Usage Too High"
      description: "主机: 【{{ $labels.instance }}】, The heap usage is over 90%, Current Value: {{ $value }}"

  # JVM heap usage above 80% for 2 minutes.
  - alert: ElasticsearchHeapUsageWarning
    expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch Heap Usage warning'
      description: "主机: 【{{ $labels.instance }}】, The heap usage is over 80%, Current Value: {{ $value }}"

  # Data filesystem usage above 90%. The expression is written as usage
  # (1 - available/size) so that {{ $value }} matches the "usage is over
  # 90%" wording; it fires in the same cases as "available below 10%".
  - alert: ElasticsearchDiskOutOfSpace
    expr: (1 - elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes) * 100 > 90
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch disk out of space'
      description: "主机: 【{{ $labels.instance }}】, The disk usage is over 90%, Current Value: {{ $value }}"

  # Data filesystem usage above 80% for 2 minutes; written as usage for the
  # same reason as the rule above.
  - alert: ElasticsearchDiskSpaceLow
    expr: (1 - elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch disk space low'
      description: "主机: 【{{ $labels.instance }}】, The disk usage is over 80%, Current Value: {{ $value }}"

  # Cluster health status is red.
  - alert: ElasticsearchClusterRed
    expr: elasticsearch_cluster_health_status{color="red"} == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch Cluster Red'
      description: "主机: 【{{ $labels.instance }}】, Elastic Cluster Red status"

  # Cluster health status is yellow.
  - alert: ElasticsearchClusterYellow
    expr: elasticsearch_cluster_health_status{color="yellow"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch Cluster Yellow'
      description: "主机: 【{{ $labels.instance }}】, Elastic Cluster Yellow status"

  # Fewer than 3 nodes in the cluster.
  # NOTE(review): the threshold 3 hard-codes the expected cluster size;
  # adjust it to the actual node count of the monitored cluster.
  - alert: ElasticsearchHealthyNodes
    expr: elasticsearch_cluster_health_number_of_nodes < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch Healthy Nodes'
      description: "Missing node in Elasticsearch cluster"

  # Fewer than 3 data nodes in the cluster (same hard-coded size caveat).
  - alert: ElasticsearchHealthyDataNodes
    expr: elasticsearch_cluster_health_number_of_data_nodes < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch Healthy Data Nodes'
      description: "Missing data node in Elasticsearch cluster"

  # Informational: shards are currently being relocated.
  - alert: ElasticsearchRelocatingShards
    expr: elasticsearch_cluster_health_relocating_shards > 0
    for: 0m
    labels:
      severity: info
    annotations:
      title: 'Elasticsearch relocating shards'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch is relocating shards"

  # Shards have been relocating for 15 minutes.
  - alert: ElasticsearchRelocatingShardsTooLong
    expr: elasticsearch_cluster_health_relocating_shards > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch relocating shards too long'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch has been relocating shards for 15min"

  # Informational: shards are currently initializing.
  - alert: ElasticsearchInitializingShards
    expr: elasticsearch_cluster_health_initializing_shards > 0
    for: 0m
    labels:
      severity: info
    annotations:
      title: 'Elasticsearch initializing shards'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch is initializing shards"

  # Shards have been initializing for 15 minutes.
  - alert: ElasticsearchInitializingShardsTooLong
    expr: elasticsearch_cluster_health_initializing_shards > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch initializing shards too long'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch has been initializing shards for 15 min"

  # At least one shard is unassigned.
  - alert: ElasticsearchUnassignedShards
    expr: elasticsearch_cluster_health_unassigned_shards > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch unassigned shards'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch has unassigned shards"

  # Pending cluster tasks have been present for 15 minutes.
  - alert: ElasticsearchPendingTasks
    expr: elasticsearch_cluster_health_number_of_pending_tasks > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch pending tasks'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch has pending tasks. Cluster works slowly, Current Value: {{ $value }}"

  # The document count on data nodes grew by less than 1 over 10 minutes.
  - alert: ElasticsearchNoNewDocuments
    expr: increase(elasticsearch_indices_docs{es_data_node="true"}[10m]) < 1
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch no new documents'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch No new documents for 10 min!"
EOF

Kafka 告警规则

cat > /data/prometheus/conf/rules/kafka.yaml << 'EOF'
groups:
# Kafka alerts; the rules below use kafka_* metrics.
- name: kafka.rules
  rules:
  # Sum of in-sync replicas per topic is below 3.
  - alert: KafkaTopicsReplicas
    expr: >-
      sum(kafka_topic_partition_in_sync_replica) by (topic) < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: "Kafka topics replicas less than 3"
      description: 'Topic: {{ $labels.topic }} partition less than 3, Current Value: {{ $value }}'

  # Total lag per consumer group above 50 for 1 minute.
  - alert: KafkaConsumersGroupLag
    expr: >-
      sum(kafka_consumergroup_lag) by (consumergroup) > 50
    for: 1m
    labels:
      severity: critical
    annotations:
      title: "Kafka consumers group 消费滞后"
      description: 'Kafka consumers group 消费滞后 (Lag > 50), Lag值: {{ $value }}'

  # Total consumer lag per topic above 50 for 1 minute.
  - alert: KafkaConsumersTopicLag
    expr: >-
      sum(kafka_consumergroup_lag) by (topic) > 50
    for: 1m
    labels:
      severity: critical
    annotations:
      title: "Kafka Topic 消费滞后"
      description: 'Kafka Topic 消费滞后 (Lag > 50), Lag值: {{ $value }}'
EOF
posted @ 2025-08-25 17:33  阿峰博客站  阅读(18)  评论(0)    收藏  举报