报警规则配置大全
服务器告警规则
cat /data/prometheus/rules/node_exporter.yml
groups:
# Recording rules evaluated every minute: per-host CPU and memory usage (%).
- name: node_usage_record_rules
  interval: 1m
  rules:
  # CPU busy percentage: 100 * (1 - average idle rate over the last minute),
  # grouped by the instance/vendor/account/group/name labels.
  - record: cpu:usage:rate1m
    expr: >-
      (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m]))
      by (instance,vendor,account,group,name)) * 100
  # Memory used percentage: 100 * (1 - MemAvailable / MemTotal).
  - record: mem:usage:rate1m
    expr: >-
      (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
# Host-level alerts built on node_exporter metrics.
- name: node-exporter
  rules:
  # Memory usage above 90% for 5 minutes.
  - alert: 主机内存使用率
    expr: >-
      100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
      > 90
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "内存使用率为{{ $value | humanize }}%"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"
  # CPU usage (100 - average idle %) above 90% for 5 minutes.
  - alert: 主机CPU使用率
    expr: >-
      100 - (avg by(instance,name,group,account)
      (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "CPU使用率为{{ $value | humanize }}%"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"
  # 5-minute load average divided by the number of CPUs; fires above 2.0.
  # The inner count(...) by (cpu, ...) yields one series per CPU, and the
  # outer sum(...) adds them up into a per-host core count.
  - alert: 主机系统负载
    expr: >-
      node_load5 / on (instance,name,group,account)
      sum(count(node_cpu_seconds_total{mode='system'})
      by (cpu,instance,name,group,account))
      by(instance,name,group,account) > 2.0
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "系统5分钟平均负载与CPU核心数比值为 {{ $value | humanize }}倍"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"
  # Filesystem usage above 90% on ext*/xfs filesystems; mountpoints matching
  # the pods / docker devicemapper patterns are excluded.
  - alert: 主机磁盘使用率
    expr: |
      100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype=~"ext.?|xfs",mountpoint!~".*pods.*|/var/lib/docker/devicemapper/mnt/.*"} * 100) > 90
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "磁盘使用率为 {{ $value | humanize }}%"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}; 磁盘: {{ $labels.device }}"
  # Disk write throughput above 100 MiB/s (bytes / 1024 / 1024).
  - alert: 主机磁盘写IO
    expr: (irate(node_disk_written_bytes_total[5m]) ) /1024 /1024 > 100
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "当前写IO为 {{ $value | humanize }}MB/s"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}; 磁盘: {{ $labels.device }}"
  # Disk read throughput above 100 MiB/s (bytes / 1024 / 1024).
  - alert: 主机磁盘读IO
    expr: (irate(node_disk_read_bytes_total[5m]) ) /1024 /1024 > 100
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "当前读IO为 {{ $value | humanize }}MB/s"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}; 磁盘: {{ $labels.device }}"
  # Inbound traffic above 20 MiB/s per device; devices whose names match the
  # tap/veth/br/docker/virbr/lo/cni patterns are excluded.
  - alert: 主机网络流入(下载)数据过多
    expr: >-
      sum by(device,instance, name, group, account)
      (irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m]))
      / 1024 / 1024 > 20
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "流入数据为 {{ $value | humanize }}MB/s"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"
  # Outbound traffic above 20 MiB/s per device, same device exclusions.
  - alert: 主机网络流出(上传)数据过多
    expr: >-
      sum by(device,instance, name, group, account)
      (irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m]))
      / 1024 / 1024 > 20
    for: 5m
    labels:
      alertype: system
    annotations:
      summary: "流出数据为 {{ $value | humanize }}MB/s"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"
  # Host uptime (current time - boot time) below 600 seconds.
  - alert: 主机重启
    expr: node_time_seconds - node_boot_time_seconds < 600
    for: 1m
    labels:
      alertype: system
    annotations:
      summary: "主机重启"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"
  # Scrape target of job "node_exporter" has been down for 1 minute.
  - alert: node_exporter状态
    expr: up{job="node_exporter"} == 0
    for: 1m
    labels:
      alertype: system
    annotations:
      summary: "node_exporter异常"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; IP: {{ $labels.instance }}"
MySQL 告警规则
cat /data/prometheus/rules/mysql_exporter.yml
groups:
# MySQL alerts built on mysqld_exporter metrics.
- name: MySQL-Alert
  rules:
  # Exporter cannot reach the MySQL server.
  - alert: MySQL_is_down
    expr: mysql_up == 0
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "MySQL database is down"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
  # More than 60 new slow queries within the last minute.
  # The metric name (slow_queries status counter) indicates a cumulative
  # counter, so increase() is used instead of delta(): delta() is meant for
  # gauges and produces a large negative value when the counter resets
  # (e.g. after a server restart), whereas increase() handles resets.
  - alert: MySQL_慢查询过多
    expr: increase(mysql_global_status_slow_queries[1m]) > 60
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "每分钟慢查询 {{ $value }} 个"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
  # More than 100 threads currently running.
  - alert: MySQL_当前活跃的连接数过多
    expr: mysql_global_status_threads_running > 100
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "当前活跃的连接数 {{ $value }} 个"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
  # Replica (master_server_id > 0) whose IO thread is not running.
  - alert: MySQL_主从IO线程运行状态异常
    expr: >-
      mysql_slave_status_master_server_id > 0
      and ON (instance) mysql_slave_status_slave_io_running == 0
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "MySQL Slave IO thread not running"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
  # Replica (master_server_id > 0) whose SQL thread is not running.
  - alert: MySQL_主从SQL线程运行状态异常
    expr: >-
      mysql_slave_status_master_server_id > 0
      and ON (instance) mysql_slave_status_slave_sql_running == 0
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "MySQL Slave SQL thread not running"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
  # Replication lag above 3 seconds.
  - alert: MySQL_主从复制延迟过高
    expr: mysql_slave_status_seconds_behind_master > 3
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "主从复制延迟当前:{{ $value | humanize }}s"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
  # Server uptime below 600 seconds.
  - alert: MySQL_is_Restart
    expr: mysql_global_status_uptime <600
    for: 1m
    labels:
      alertype: dba
    annotations:
      summary: "MySQL database is Restart"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
Redis 告警规则
cat /data/prometheus/rules/redis_exporter.yml
groups:
# Redis alerts built on redis_exporter metrics.
- name: REDIS-Alert
  rules:
  # Exporter cannot reach the Redis instance; fires immediately (for: 0m).
  - alert: RedisDown
    expr: redis_up == 0
    for: 0m
    labels:
      alertype: dba
    annotations:
      summary: "Redis down"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
  # Compares the number of Redis series with the total of connected slaves.
  # NOTE(review): both aggregations use `without (instance, job)`, so the
  # resulting series carries no `instance` label and
  # `{{ $labels.instance }}` in the description will render empty — confirm
  # whether that is intended.
  - alert: RedisDisconnectedSlaves
    expr: >-
      count without (instance, job) (redis_connected_slaves)
      - sum without (instance, job) (redis_connected_slaves) - 1 > 1
    for: 0m
    labels:
      alertype: dba
    annotations:
      summary: "Redis 从节点断开连接,建议检查 Redis 复制状态。"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
  # Number of connected slaves dropped within the last minute.
  - alert: RedisReplicationBroken
    expr: delta(redis_connected_slaves[1m]) < 0
    for: 0m
    labels:
      alertype: dba
    annotations:
      summary: "检测到 Redis 从节点复制中断"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
  # Connected-slave count changed more than once per minute for 2 minutes.
  - alert: RedisClusterFlapping
    expr: changes(redis_connected_slaves[1m]) > 1
    for: 2m
    labels:
      alertype: dba
    annotations:
      summary: "检测到 Redis 从节点连接状态发生变化。这可能是由于从节点与主节点断开连接后又重新连接(即抖动)导致的"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
  # Last successful RDB save is older than 24 hours.
  - alert: RedisMissingBackup
    expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
    for: 0m
    labels:
      alertype: dba
    annotations:
      summary: "Redis 已超过24小时未进行RDB备份"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
  # Used memory above 90% of the configured maximum.
  # The denominator is filtered to `> 0` first: a max of 0 (presumably "no
  # maxmemory configured" — verify against the exporter) would otherwise
  # make the ratio +Inf and keep this alert firing permanently.
  - alert: RedisOutOfConfiguredMaxmemory
    expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) * 100 > 90
    for: 2m
    labels:
      alertype: dba
    annotations:
      summary: "Redis 占用最大内存限制 {{ $value }}%"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
  # More than 100 connected clients for 2 minutes.
  - alert: RedisTooManyConnections
    expr: redis_connected_clients > 100
    for: 2m
    labels:
      alertype: dba
    annotations:
      summary: "Redis 连接数达到 {{ $value }} 个"
      description: "项目: {{ $labels.group }}; 主机名: {{ $labels.name }}; 地址: {{ $labels.instance }}"
黑盒告警规则
cat /data/prometheus/rules/blackbox_exporter.yml
groups:
# Site availability alerts built on blackbox_exporter probe metrics.
- name: Domain
  rules:
  # Probe has been failing for 1 minute.
  - alert: 站点可用性
    expr: probe_success == 0
    for: 1m
    labels:
      alertype: system
    annotations:
      summary: "站点无法访问"
      description: "环境: {{ $labels.env }} ;项目: {{ $labels.project }}; 站点: {{ $labels.instance }}"
  # Success ratio over the last hour, in percent, below 80.
  # The `< 80` comparison is required: without it the expression returns a
  # sample for every probed target and the alert fires for all of them,
  # regardless of their actual availability.
  - alert: 站点1h可用性低于80%
    expr: >-
      sum_over_time(probe_success[1h])/count_over_time(probe_success[1h]) * 100
      < 80
    for: 3m
    labels:
      alertype: system
    annotations:
      summary: "站点1h可用性:{{ $value | humanize }}%"
      description: "环境: {{ $labels.env }} ;项目: {{ $labels.project }}; 站点: {{ $labels.instance }}"
  # Failed probe with an HTTP status above 499, or an HTTP status of 0.
  - alert: 站点状态异常
    expr: >-
      (probe_success == 0 and probe_http_status_code > 499)
      or probe_http_status_code == 0
    for: 1m
    labels:
      alertype: system
    annotations:
      summary: "站点状态异常:{{ $value }}"
      description: "环境: {{ $labels.env }} ;项目: {{ $labels.project }}; 站点: {{ $labels.instance }}"
  # Probe duration above 0.5 seconds for 2 minutes.
  - alert: 站点耗时过高
    expr: probe_duration_seconds > 0.5
    for: 2m
    labels:
      alertype: system
    annotations:
      summary: "当前站点耗时:{{ $value | humanize }}s"
      description: "环境: {{ $labels.env }} ;项目: {{ $labels.project }}; 站点: {{ $labels.instance }}"
  # Earliest certificate expiry is less than 15 days away
  # (seconds / 3600 / 24 converts the remaining time to days).
  - alert: SSL证书有效期
    expr: (probe_ssl_earliest_cert_expiry-time()) / 3600 / 24 < 15
    for: 2m
    labels:
      alertype: system
    annotations:
      summary: "证书有效期剩余{{ $value | humanize }}天"
      description: "环境: {{ $labels.env }} ;项目: {{ $labels.project }}; 站点: {{ $labels.instance }}"
Docker 告警规则
cat /data/prometheus/rules/cadvisor.yml
groups:
# Container alerts built on cAdvisor metrics.
- name: Docker.rules
  rules:
  # Scrape target of job "CAdvisor_docker" is down; fires immediately.
  - alert: CAdvisor Down
    expr: up{job="CAdvisor_docker"} == 0
    for: 0m
    labels:
      alertype: container
    annotations:
      summary: 'CAdvisor down'
      description: "CAdvisor实例: 【{{ $labels.instance }}】"
  # A named container has not been seen for more than 60 seconds.
  - alert: ContainerKilled
    expr: time() - container_last_seen{name!=""} > 60
    for: 1m
    labels:
      alertype: container
    annotations:
      summary: '容器被删除'
      description: "主机: 【{{ $labels.instance }}】; 容器名: 【{{ $labels.name }}】"
  # Per-container CPU usage rate over 3 minutes, times 100, above 80.
  - alert: ContainerCpuUsage
    expr: >-
      (sum by(instance, name)
      (rate(container_cpu_usage_seconds_total{name!=""}[3m])) * 100) > 80
    for: 2m
    labels:
      alertype: container
    annotations:
      summary: "容器CPU使用率已超过80%,当前值: {{ $value }}"
      description: "主机: 【{{ $labels.instance }}】; 容器名: 【{{ $labels.name }}】"
  # Working-set memory above 80% of the container's memory limit.
  # The `> 0` filter drops containers whose limit is not positive, which
  # avoids dividing by zero.
  - alert: ContainerMemoryUsage
    expr: >-
      (sum by(instance, name) (container_memory_working_set_bytes{name!=""})
      / sum by(instance, name) (container_spec_memory_limit_bytes{name!=""} > 0)
      * 100) > 80
    for: 2m
    labels:
      alertype: container
    annotations:
      summary: "容器内存使用率已超过80%,当前值: {{ $value }}"
      description: "主机: 【{{ $labels.instance }}】; 容器名: 【{{ $labels.name }}】"
注意:以下规则尚未经过验证
ES告警规则
cat > /data/prometheus/conf/rules/elasticsearch.yaml << 'EOF'
groups:
# Elasticsearch alerts built on elasticsearch_exporter metrics.
- name: Elasticsearch.rules
  rules:
  # ES alarm rules
  # JVM heap usage above 90% for 2 minutes.
  - alert: ElasticsearchHeapUsageTooHigh
    expr: >-
      (elasticsearch_jvm_memory_used_bytes{area="heap"}
      / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      title: "Elasticsearch Heap Usage Too High"
      description: "主机: 【{{ $labels.instance }}】, The heap usage is over 90%, Current Value: {{ $value }}"
  # JVM heap usage above 80% for 2 minutes.
  - alert: ElasticsearchHeapUsageWarning
    expr: >-
      (elasticsearch_jvm_memory_used_bytes{area="heap"}
      / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch Heap Usage warning'
      description: "主机: 【{{ $labels.instance }}】, The heap usage is over 80%, Current Value: {{ $value }}"
  # Available data-disk space below 10%; fires immediately.
  # NOTE(review): {{ $value }} here is the *available* percentage, while the
  # description text talks about usage — confirm the wording is acceptable.
  - alert: ElasticsearchDiskOutOfSpace
    expr: >-
      elasticsearch_filesystem_data_available_bytes
      / elasticsearch_filesystem_data_size_bytes * 100 < 10
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch disk out of space'
      description: "主机: 【{{ $labels.instance }}】, The disk usage is over 90%, Current Value: {{ $value }}"
  # Available data-disk space below 20% for 2 minutes.
  - alert: ElasticsearchDiskSpaceLow
    expr: >-
      elasticsearch_filesystem_data_available_bytes
      / elasticsearch_filesystem_data_size_bytes * 100 < 20
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch disk space low'
      description: "主机: 【{{ $labels.instance }}】, The disk usage is over 80%, Current Value: {{ $value }}"
  # Cluster health status is red.
  - alert: ElasticsearchClusterRed
    expr: elasticsearch_cluster_health_status{color="red"} == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch Cluster Red'
      description: "主机: 【{{ $labels.instance }}】, Elastic Cluster Red status"
  # Cluster health status is yellow.
  - alert: ElasticsearchClusterYellow
    expr: elasticsearch_cluster_health_status{color="yellow"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch Cluster Yellow'
      description: "主机: 【{{ $labels.instance }}】, Elastic Cluster Yellow status"
  # Fewer than 3 nodes in the cluster.
  - alert: ElasticsearchHealthyNodes
    expr: elasticsearch_cluster_health_number_of_nodes < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch Healthy Nodes'
      description: "Missing node in Elasticsearch cluster"
  # Fewer than 3 data nodes in the cluster.
  - alert: ElasticsearchHealthyDataNodes
    expr: elasticsearch_cluster_health_number_of_data_nodes < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch Healthy Data Nodes'
      description: "Missing data node in Elasticsearch cluster"
  # Shards are currently relocating (informational, fires immediately).
  - alert: ElasticsearchRelocatingShards
    expr: elasticsearch_cluster_health_relocating_shards > 0
    for: 0m
    labels:
      severity: info
    annotations:
      title: 'Elasticsearch relocating shards'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch is relocating shards"
  # Shards have been relocating for 15 minutes.
  - alert: ElasticsearchRelocatingShardsTooLong
    expr: elasticsearch_cluster_health_relocating_shards > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch relocating shards too long'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch has been relocating shards for 15min"
  # Shards are currently initializing (informational, fires immediately).
  - alert: ElasticsearchInitializingShards
    expr: elasticsearch_cluster_health_initializing_shards > 0
    for: 0m
    labels:
      severity: info
    annotations:
      title: 'Elasticsearch initializing shards'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch is initializing shards"
  # Shards have been initializing for 15 minutes.
  - alert: ElasticsearchInitializingShardsTooLong
    expr: elasticsearch_cluster_health_initializing_shards > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch initializing shards too long'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch has been initializing shards for 15 min"
  # At least one shard is unassigned.
  - alert: ElasticsearchUnassignedShards
    expr: elasticsearch_cluster_health_unassigned_shards > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch unassigned shards'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch has unassigned shards"
  # Pending cluster tasks have existed for 15 minutes.
  - alert: ElasticsearchPendingTasks
    expr: elasticsearch_cluster_health_number_of_pending_tasks > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch pending tasks'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch has pending tasks. Cluster works slowly, Current Value: {{ $value }}"
  # Document count on data nodes grew by less than 1 over the last 10 minutes.
  - alert: ElasticsearchNoNewDocuments
    expr: increase(elasticsearch_indices_docs{es_data_node="true"}[10m]) < 1
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch no new documents'
      description: "主机: 【{{ $labels.instance }}】, Elasticsearch No new documents for 10 min!"
EOF
Kafka 告警规则
cat > /data/prometheus/conf/rules/kafka.yaml << 'EOF'
groups:
# Kafka alerts built on kafka_exporter metrics.
- name: kafka.rules
  rules:
  # KAFKA alarm rules
  # Sum of in-sync replica samples per topic is below 3.
  - alert: KafkaTopicsReplicas
    expr: sum(kafka_topic_partition_in_sync_replica) by (topic) < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Kafka topics replicas less than 3'
      description: "Topic: {{ $labels.topic }} partition less than 3, Current Value: {{ $value }}"
  # Total lag of a consumer group above 50 for 1 minute.
  # The expression keeps only the `consumergroup` label, so the description
  # names it; otherwise the notification does not say which group is lagging.
  - alert: KafkaConsumersGroupLag
    expr: sum(kafka_consumergroup_lag) by (consumergroup) > 50
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Kafka consumers group 消费滞后'
      description: "Consumer group: {{ $labels.consumergroup }}, Kafka consumers group 消费滞后 (Lag > 50), Lag值: {{ $value }}"
  # Total consumer lag on a topic above 50 for 1 minute.
  # The expression keeps only the `topic` label, so the description names it.
  - alert: KafkaConsumersTopicLag
    expr: sum(kafka_consumergroup_lag) by (topic) > 50
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Kafka Topic 消费滞后'
      description: "Topic: {{ $labels.topic }}, Kafka Topic 消费滞后 (Lag > 50), Lag值: {{ $value }}"
EOF

浙公网安备 33010602011771号