一、prometheus.yml
# Prometheus server configuration: global timings, Alertmanager wiring,
# rule files, and the scrape jobs (direct exporters plus blackbox probes).
global:
  # Scrape targets every 15 seconds (Prometheus default is 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (Prometheus default is 1 minute).
  evaluation_interval: 15s
  # scrape_timeout is left at the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "xxxx:9093"

# Rule files are loaded once and then evaluated periodically according to
# the global 'evaluation_interval'.
rule_files:
  - "node_alerts.yml"
  # - "second_rules.yml"

# Scrape jobs. The job name is added as a label `job=<job_name>` to every
# time series scraped from that job.
scrape_configs:
  # Prometheus itself.
  # metrics_path defaults to '/metrics'; scheme defaults to 'http'.
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "xxxx:9090"
          - "xxxx:18104"

  - job_name: "alertmanager"
    static_configs:
      - targets:
          - "xxxx:9093"

  - job_name: "node"
    static_configs:
      - targets:
          - "xxxx:9100"

  # The blackbox exporter's own metrics (not probe results).
  - job_name: "blackbox"
    metrics_path: /metrics
    static_configs:
      - targets:
          - "xxxx:9115"

  # HTTP probes through the blackbox exporter.
  - job_name: "blackbox_http_2xx"
    scrape_interval: 45s
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    static_configs:
      - targets:
          - "https://www.baidu.com/"
          # NOTE(review): 172.0.0.1 is not the loopback address; this may be
          # a typo for 127.0.0.1 - confirm before relying on this probe.
          - "172.0.0.1:9090"
    relabel_configs:
      # Pass the listed target to the exporter as the ?target= parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed target visible as the `instance` label.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape to the blackbox exporter.
      - target_label: __address__
        replacement: "xxxx:9115"

  # TCP port probes through the blackbox exporter.
  # The job name previously ended in a stray "]" ("blackbox_telnet_port]").
  - job_name: "blackbox_telnet_port"
    scrape_interval: 5s
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets:
          - "xxxx:443"
        labels:
          group: 'xxxidc机房ip监控'
      - targets:
          - "xxxx:443"
        labels:
          group: 'Process status of nginx(main) server'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: "xxxx:9115"

  # HTTP POST probes through the blackbox exporter.
  - job_name: "blackbox_http_2xx_post"
    scrape_interval: 10s
    metrics_path: /probe
    params:
      module: [http_post_2xx_query]
    static_configs:
      - targets:
          - "https://xxx"
        labels:
          group: 'Interface monitoring'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: "xxxx:9115"  # The blackbox exporter's real hostname:port.

  - job_name: "blackboxa"
    metrics_path: /probe
    params:
      module: [http_2xx]  # Look for a HTTP 200 response.
    static_configs:
      - targets:
          - "baidu.com"  # Target to probe
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: "127.0.0.1:9115"  # Blackbox exporter.
二、node_alerts.yml
# [root@spug workfileprometheus]# cat node_alerts.yml
# Prometheus alerting rules: instance liveness, memory usage, disk usage.
groups:
  - name: 实例存活告警规则
    rules:
      # The rule name (exposed as the `alertname` label).
      - alert: 实例存活告警
        # `up` reports the state of every scraped exporter instance; the
        # alert condition is up == 0.
        expr: up == 0
        # `for` is how long the condition must keep holding before the alert
        # is sent to Alertmanager; shorter-lived conditions are not sent.
        # Keep it no smaller than scrape_interval: with scrape_interval 30s
        # and for 15s, a triggered rule would always fire once `for` elapses,
        # because no newer sample has been scraped yet and the evaluation at
        # 15s still uses the old value. The wait only applies the first time
        # the alert is triggered.
        for: 30s
        # Extra labels attached to the alert.
        labels:
          severity: Disaster
        # Annotations are not part of the alert's identity; they carry extra
        # information used when the alert is displayed.
        annotations:
          summary: "节点失联"
          # Wording matches `for: 30s` (it previously claimed 1 minute).
          description: "节点断联已超过30秒!"

  - name: 内存告警规则
    rules:
      - alert: "内存使用率告警"
        # Fires when memory usage is above 75%.
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 75
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: "服务器内存报警"
          description: "内存资源利用率大于75%!(当前值: {{ $value }}%)"

  - name: 磁盘报警规则
    rules:
      - alert: 磁盘使用率告警
        # Fires when any mount point is more than 80% used.
        expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "服务器 磁盘报警"
          description: "服务器磁盘设备使用超过80%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
三、alertmanager.yml
# [root@spug workfileprometheus]# cat alertmanager.yml
# Alertmanager configuration: SMTP settings, one route, one e-mail receiver,
# and a single inhibit rule.
global:
  resolve_timeout: 5m
  # SMTP settings used for e-mail notifications.
  # NOTE(review): the account, password and hello values look like
  # placeholders - replace them before use.
  smtp_from: '登录用户号'
  smtp_smarthost: 'smtp.126.com:465'
  smtp_auth_username: '登录用户号'
  smtp_auth_password: 'xxx'
  smtp_require_tls: false
  smtp_hello: '发送QQ'

# Every alert goes to the single receiver below, grouped by alert name.
route:
  group_by:
    - 'alertname'
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  # Despite the name, this receiver is configured for e-mail.
  - name: 'web.hook'
    email_configs:
      - to: '发送QQ'

# Mute 'warning' alerts while a 'critical' alert with the same alertname,
# dev and instance labels is firing.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal:
      - 'alertname'
      - 'dev'
      - 'instance'
四、config.yml
# [root@spug workfileprometheus]# cat config.yml
# Blackbox exporter probe modules.
modules:
  http_2xx:
    prober: http
    timeout: 10s
    http:
      # Set this when HTTP probes should use IPv4; IPv6 is still rarely
      # used in China.
      preferred_ip_protocol: "ip4"

  # Module for POST probes. Each endpoint takes different parameters, so
  # define one module per endpoint; this one (http_post_2xx_query) probes
  # the query.action endpoint.
  http_post_2xx_query:
    prober: http
    timeout: 15s
    http:
      preferred_ip_protocol: "ip4"  # use IPv4
      method: POST
      headers:
        Content-Type: application/json  # request header
      # Request payload.
      body: '{"hmac":"","params":{"publicFundsKeyWords":"xxx"}}'

  tcp_connect:
    prober: tcp

  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
        # The "+" is escaped so the pattern matches the literal "+OK"
        # greeting; unescaped it is a regex quantifier.
        - expect: '^\+OK'
      tls: true
      tls_config:
        insecure_skip_verify: false

  ssh_banner:
    prober: tcp
    tcp:
      query_response:
        - expect: "^SSH-2.0-"

  irc_banner:
    prober: tcp
    tcp:
      query_response:
        - send: "NICK prober"
        - send: "USER prober prober prober :prober"
        - expect: "PING :([^ ]+)"
          send: "PONG ${1}"
        - expect: "^:[^ ]+ 001"

  icmp:
    prober: icmp
    timeout: 5s
    # NOTE(review): no ICMP options are visible here, so this key is empty
    # (null); the original paste may have been truncated - confirm.
    icmp:
浙公网安备 33010602011771号