一、prometheus.yml

  1 # my global config
  2 global:
  3   scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  4   evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  5   # scrape_timeout is set to the global default (10s).
  6 
  7 # Alertmanager configuration
  8 alerting:
  9   alertmanagers:
 10     - static_configs:
 11         - targets:
 12           - xxxx:9093
 13 
 14 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
 15 rule_files:
 16   - "node_alerts.yml"
 17   # - "second_rules.yml"
 18 
 19 # A scrape configuration containing exactly one endpoint to scrape:
 20 # Here it's Prometheus itself.
 21 scrape_configs:
 22   # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
 23   - job_name: "prometheus"
 24 
 25     # metrics_path defaults to '/metrics'
 26     # scheme defaults to 'http'.
 27 
 28     static_configs:
 29       - targets: ["xxxx:9090","xxxx:18104"]
 30   - job_name: "alertmanager"
 31     static_configs:
 32       - targets: ["xxxx:9093"]
 33   - job_name: "node"
 34     static_configs:
 35       - targets: ["xxxx:9100"]
 36   - job_name: "blackbox"
 37     metrics_path: /metrics
 38     static_configs:
 39       - targets: ["xxxx:9115"]
 40   - job_name: 'blackbox_http_2xx'
 41     scrape_interval: 45s
 42     metrics_path: /probe
 43     params:
 44       module: [http_2xx]  # Look for a HTTP 200 response.
 45     static_configs:
 46         - targets:
 47           - https://www.baidu.com/
 48           - 172.0.0.1:9090
 49     relabel_configs:
 50         - source_labels: [__address__]
 51           target_label: __param_target
 52         - source_labels: [__param_target]
 53           target_label: instance
 54         - target_label: __address__
 55           replacement: xxxx:9115
 56   - job_name: "blackbox_telnet_port]"
 57     scrape_interval: 5s
 58     metrics_path: /probe
 59     params:
 60       module: [tcp_connect]
 61     static_configs:
 62         - targets: [ 'xxxx:443' ]
 63           labels:
 64             group: 'xxxidc机房ip监控'
 65         - targets: ['xxxx:443']
 66           labels:
 67             group: 'Process status of nginx(main) server'
 68     relabel_configs:
 69         - source_labels: [__address__]
 70           target_label: __param_target
 71         - source_labels: [__param_target]
 72           target_label: instance
 73         - target_label: __address__
 74           replacement: xxxx:9115
 75   - job_name: 'blackbox_http_2xx_post'
 76     scrape_interval: 10s
 77     metrics_path: /probe
 78     params:
 79       module: [http_post_2xx_query]
 80     static_configs:
 81         - targets:
 82           - https://xxx 83           labels:
 84             group: 'Interface monitoring'
 85     relabel_configs:
 86         - source_labels: [__address__]
 87           target_label: __param_target
 88         - source_labels: [__param_target]
 89           target_label: instance
 90         - target_label: __address__
 91           replacement: xxxx:9115  # The blackbox exporter's real hostname:port.
 92   - job_name: 'blackboxa'
 93     metrics_path: /probe
 94     params:
 95       module: [http_2xx]  # Look for a HTTP 200 response.
 96     static_configs:
 97       - targets:
 98         - baidu.com  # Target to probe
 99     relabel_configs:
100       - source_labels: [__address__]
101         target_label: __param_target
102       - source_labels: [__param_target]
103         target_label: instance
104       - target_label: __address__
105         replacement: 127.0.0.1:9115  # Blackbox exporter.

二、node_alerts.yml

[root@spug workfileprometheus]# cat node_alerts.yml
# Prometheus alerting rules, grouped by topic.
groups:
# Instance liveness: fires when a scraped target is down.
- name: 实例存活告警规则
  rules:
  - alert: 实例存活告警  # Name of the alert rule (the `alertname` label).
    # `up` reports the state of every scraped exporter instance;
    # the alert condition is up == 0.
    expr: up == 0
    # The condition must hold for this long before the alert is sent to
    # Alertmanager; shorter outages are not sent. Keep `for` no shorter than
    # the Prometheus scrape_interval: with a shorter `for`, no new sample has
    # been scraped by the time the wait ends, so the alert fires on the old
    # value anyway.
    for: 30s
    # Extra labels attached to the alert.
    labels:
      severity: Disaster
    # Annotations are not part of the alert's identity; they carry extra
    # information for display in notifications.
    annotations:
      summary: "节点失联"
      # Wording matches `for: 30s` above (it previously claimed 1 minute).
      description: "节点断联已超过30秒!"
# Memory usage: warn when used memory exceeds 75% of total.
- name: 内存告警规则
  rules:
  - alert: 内存使用率告警
    # Used memory = total - (free + buffers + cached), as a percentage of total.
    expr: |-
      (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 75
    for: 30s
    labels:
      severity: 'warning'
    annotations:
      summary: '服务器内存报警'
      description: '内存资源利用率大于75%!(当前值: {{ $value }}%)'

# Disk usage: warn when any mount point is more than 80% full.
- name: 磁盘报警规则
  rules:
  - alert: '磁盘使用率告警'
    # Used space = size - available, as a percentage of size, per filesystem series.
    expr: |-
      (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80
    for: 1m
    labels:
      severity: 'warning'
    annotations:
      summary: '服务器 磁盘报警'
      description: '服务器磁盘设备使用超过80%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)'

三、alertmanager.yml

[root@spug workfileprometheus]# cat alertmanager.yml

# Alertmanager global settings: resolve timeout and the SMTP account used for e-mail.
global:
  resolve_timeout: 5m
  # SMTP relay and credentials. The account and password values here appear
  # to be redacted placeholders.
  smtp_smarthost: "smtp.126.com:465"
  smtp_from: "登录用户号"
  smtp_auth_username: "登录用户号"
  smtp_auth_password: "xxx"
  smtp_require_tls: false
  # NOTE(review): smtp_hello is the hostname announced to the SMTP server;
  # this value looks like a placeholder - confirm.
  smtp_hello: "发送QQ"
# Root route: every alert goes to the 'web.hook' receiver, grouped by alert name.
route:
  receiver: "web.hook"
  group_by:
    - alertname
  group_wait: 30s  # Wait before the first notification for a new group.
  group_interval: 5m  # Wait before notifying about alerts added to an existing group.
  repeat_interval: 1h  # Wait before repeating a notification that was already sent.
# Notification targets. 'web.hook' delivers by e-mail despite its name.
receivers:
  - name: "web.hook"
    email_configs:
      # Recipient address; the value appears to be a redacted placeholder.
      - to: "发送QQ"
# Mute 'warning' alerts while a 'critical' alert with the same alertname,
# dev and instance labels is firing.
# NOTE(review): the alert rules shown in node_alerts.yml use severity
# 'Disaster' and 'warning', so the 'critical' source matcher may never
# match - confirm.
inhibit_rules:
  - equal:
      - alertname
      - dev
      - instance
    source_match:
      severity: "critical"
    target_match:
      severity: "warning"

四、config.yml

[root@spug workfileprometheus]# cat config.yml
# Blackbox exporter probe modules, keyed by module name.
modules:
  # HTTP probe over IPv4 with a 10-second timeout.
  http_2xx:
    timeout: 10s
    prober: http
    http:
      # Set this when HTTP probes should use IPv4; the author notes that
      # IPv6 is still rarely used in China.
      preferred_ip_protocol: 'ip4'
  # Module for POST probes. Request parameters differ per endpoint, so one
  # module can be defined per endpoint; this one is named http_post_2xx_query
  # and is used to monitor the query.action endpoint.
  http_post_2xx_query:
    timeout: 15s
    prober: http
    http:
      method: POST
      preferred_ip_protocol: 'ip4'  # Use IPv4.
      # Request headers.
      headers:
        Content-Type: 'application/json'
      # Request payload.
      body: '{"hmac":"","params":{"publicFundsKeyWords":"xxx"}}'
  # TCP probe with no module-specific `tcp:` options.
  tcp_connect:
    prober: "tcp"
  # POP3S banner probe: connect over TLS (certificate verification enabled)
  # and expect the response to start with "+OK".
  pop3s_banner:
    prober: tcp
    tcp:
      tls: true
      tls_config:
        insecure_skip_verify: false
      query_response:
        # The '+' must be escaped: unescaped, it is a regex quantifier applied
        # to '^', so the pattern would look for "OK" at the start of the
        # response and never match a literal "+OK".
        - expect: '^\+OK'
  # SSH banner probe: expect the response to start with "SSH-2.0-".
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
        # The dot is escaped so it matches a literal '.', not any character.
        - expect: '^SSH-2\.0-'
  # IRC probe: send NICK and USER, answer the server's PING, then expect a
  # line containing the 001 numeric.
  irc_banner:
    prober: tcp
    tcp:
      query_response:
        - send: 'NICK prober'
        - send: 'USER prober prober prober :prober'
        # Reply with the token captured from the PING line.
        - expect: 'PING :([^ ]+)'
          send: 'PONG ${1}'
        - expect: '^:[^ ]+ 001'
  # ICMP probe with a 5-second timeout.
  icmp:
    timeout: 5s
    prober: icmp
    # No ICMP-specific options are given; the empty value is written as an
    # explicit null.
    icmp: null

 

posted on 2022-07-20 20:08  砖头哥-  阅读(437)  评论(0)    收藏  举报