prometheus grafana dingtalk pushgateway alertmanager

Prometheus安装

 # Download Prometheus 2.12.0 into /usr/src, unpack it under /usr/local and
# expose it through the version-independent path /usr/local/prometheus.
# The steps are chained with && so a failed download or extraction stops the
# sequence instead of continuing with a missing or partial tree.
cd /usr/src/ &&
  wget https://github.com/prometheus/prometheus/releases/download/v2.12.0/prometheus-2.12.0.linux-amd64.tar.gz &&
  tar xf prometheus-2.12.0.linux-amd64.tar.gz -C /usr/local/ &&
  cd /usr/local/ &&
  # -f/-n replace an existing link in place, so re-running these steps does
  # not create a stray link inside the already-linked directory.
  ln -sfn prometheus-2.12.0.linux-amd64 prometheus

设置systemctl prometheus 启动

# Install a systemd unit that runs Prometheus with the configuration file in
# /usr/local/prometheus. The heredoc delimiter is quoted so the unit text is
# written verbatim, without any shell expansion.
# NOTE(review): neither WorkingDirectory= nor --storage.tsdb.path is set, so
# the data directory falls back to Prometheus' default location - consider
# pinning it explicitly.
cat > /usr/lib/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=Prometheus: the monitoring system
Documentation=http://prometheus.io/docs/
After=network-online.target
Wants=network-online.target

[Service]
ExecStart=/usr/local/prometheus/prometheus  --config.file=/usr/local/prometheus/prometheus.yml
Restart=always
StartLimitInterval=0
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload           ## make systemd read the new or changed unit file
systemctl enable prometheus       ## start Prometheus at boot
systemctl start prometheus        ## start Prometheus now
systemctl status prometheus       ## show the Prometheus service status

node_exporter安装部署 -> prometheus依赖node_exporter来采集信息

# Download node_exporter 0.18.1 into /usr/src, unpack it under /usr/local and
# expose it through the version-independent path /usr/local/node_exporter.
# The steps are chained with && so a failed download or extraction stops the
# sequence instead of continuing with a missing or partial tree.
cd /usr/src/ &&
  wget https://github.com/prometheus/node_exporter/releases/download/v0.18.1/node_exporter-0.18.1.linux-amd64.tar.gz &&
  tar xf node_exporter-0.18.1.linux-amd64.tar.gz -C /usr/local/ &&
  cd /usr/local/ &&
  # -f/-n replace an existing link in place, so re-running these steps does
  # not create a stray link inside the already-linked directory.
  ln -sfn node_exporter-0.18.1.linux-amd64 node_exporter

制作systemctl方式启动node_exporter

# Write the systemd unit for node_exporter. The unit runs the binary at
# /usr/local/node_exporter/node_exporter, restarts it on failure, and is
# ordered after the local-fs and network targets listed below.
cat >  /usr/lib/systemd/system/node_exporter.service  <<  EOF
[Unit]
Description=Prometheus node exporter
After=local-fs.target network-online.target network.target
Wants=local-fs.target network-online.target network.target

[Service]
Restart=on-failure
ExecStart="/usr/local/node_exporter/node_exporter"

[Install]
WantedBy=multi-user.target
EOF

制作node_exporter服务启动

systemctl enable node_exporter.service     ## start node_exporter at boot
systemctl start node_exporter.service          ## start node_exporter now
systemctl status node_exporter.service       ## show the node_exporter service status

访问方式

http://localhost:9090

获取主机信息

# Host metrics are served by node_exporter on port 9100; port 9090 is the
# Prometheus server itself and only returns Prometheus' own metrics.
curl http://localhost:9100/metrics

如需要设置报警面板显示和监控多台机器可参考142机器的prometheus.yml

# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
        - 127.0.0.1:9093
      # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# Scrape configurations: Prometheus itself, the node_exporter hosts,
# one JMX exporter and the Pushgateway.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
    - targets: ['192.168.1.140:9090']

  # Job name spelling is kept as-is: renaming it would change the `job`
  # label on the series this job produces.
  - job_name: 'promethues-node'
    static_configs:
    - targets:
        - 192.168.1.140:9100
        - 192.168.1.137:9100
        - 192.168.1.57:9100
        - 192.168.1.141:9100
        - 192.168.1.60:9100
        - 192.168.1.201:9100
    # This host gets a readable instance label instead of ip:port.
    - targets: ['192.168.1.59:9100']
      labels:
        instance: dataexa-insight-59

  - job_name: 'jmx'
    static_configs:
    - targets:
        - 192.168.1.59:3010

  # The Pushgateway must be its own entry in scrape_configs, at the same
  # indentation as the other jobs; nested under the jmx job's static_configs
  # it is not parsed as a scrape job.
  - job_name: pushgateway
    # Keep the job/instance labels that clients attach when pushing, instead
    # of renaming them to exported_job / exported_instance.
    honor_labels: true
    static_configs:
    - targets: ['192.168.191.159:9091']
      labels:
        instance: pushgateway

jvm 监控

# Source: https://www.jianshu.com/p/adada9c1f7dd
wget https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.3.1/jmx_prometheus_javaagent-0.3.1.jar
# Example launch line (commented out): the agent argument carries the port
# 3010 and the path to jmx_exporter.yml; replace yourJar.jar with the application jar.
# java -javaagent:/usr/local/prometheus/jmx_exporter/jmx_prometheus_javaagent-0.3.1.jar=3010:/usr/local/prometheus/jmx_exporter/jmx_exporter.yml -jar yourJar.jar

报警规则编写

需要在prometheus.yml的同级目录下创建rules  --> mkdir rules
cat warining.yml
# Host-level alert rules evaluated against node_exporter metrics.
# Annotations use {{$labels.instance}}: the CPU, IO and network expressions
# aggregate "by (instance)", so instance is the only label left on the result
# and {{$labels.mountpoint}} would render empty. Only filesystem series carry
# a mountpoint label.
groups:
    - name: 主机状态-监控告警
      rules:
      # Fires when a scrape target has been down for 1 minute.
      # NOTE(review): the description mentions 5 minutes while `for` is 1m -
      # confirm which one is intended.
      - alert: 主机状态
        expr: up == 0
        for: 1m
        labels:
          status: 非常严重
        annotations:
          summary: "{{$labels.instance}}:服务器宕机"
          description: "{{$labels.instance}}:服务器延时超过5分钟"

      # CPU usage = 100 - idle percentage, averaged over all cores of a host.
      - alert: CPU使用情况
        expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 80
        for: 1m
        labels:
          status: 一般告警
        annotations:
          summary: "{{$labels.instance}} CPU使用率过高!"
          description: "{{$labels.instance}} CPU使用大于80%(目前使用:{{$value}}%)"

      # Memory usage = 1 - (free + buffers + cached) / total.
      # The previous expression added buffers and cache to the used amount and
      # then subtracted the result from 100, which does not yield a usage
      # percentage.
      - alert: 内存使用
        expr: (1 - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes) * 100 > 80
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 内存使用率过高!"
          description: "{{$labels.instance}} 内存使用大于80%(目前使用:{{$value}}%)"

      # Disk busy percentage, averaged over the devices of a host. Written as
      # "utilisation > 60" so both the threshold and {{$value}} match the
      # message; the previous "100 - utilisation < 60" fired above 40% and
      # reported the idle share.
      - alert: IO性能
        expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance) * 100 > 60
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高!"
          description: "{{$labels.instance}} 流入磁盘IO大于60%(目前使用:{{$value}})"

      # Inbound traffic per host, virtual and loopback devices excluded.
      # The device regex is fully anchored, so "virbr*" never matched virbr0;
      # it is now "virbr.*", and loopback is matched as plain "lo".
      # NOTE(review): bytes/s divided by 100 and compared with 102400 is about
      # 10 MB/s, and `for` is 1m while the message says 2 minutes and 100M -
      # confirm the intended threshold and duration.
      - alert: 网络
        expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance)) / 100) > 102400
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入网络带宽过高!"
          description: "{{$labels.instance}}流入网络带宽持续2分钟高于100M. RX带宽使用率{{$value}}"

      # Outbound traffic per host; same device filter and the same review note
      # on threshold and duration as the inbound rule. The message now says TX.
      - alert: 网络
        expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance)) / 100) > 102400
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流出网络带宽过高!"
          description: "{{$labels.instance}}流出网络带宽持续2分钟高于100M. TX带宽使用率{{$value}}"

      # Established TCP connections; the value is a count, not a percentage.
      - alert: TCP会话
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} TCP_ESTABLISHED过高!"
          description: "{{$labels.instance}} TCP_ESTABLISHED大于1000(目前使用:{{$value}})"

      # Filesystem usage per mount point on ext4/xfs; the host is included in
      # the message so the mount point can be attributed to a machine.
      - alert: 磁盘容量
        expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
        for: 1m
        labels:
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} {{$labels.mountpoint}} 磁盘分区使用率过高!"
          description: "{{$labels.instance}} {{$labels.mountpoint }} 磁盘分区使用大于80%(目前使用:{{$value}}%)"

安装grafana

# Download the Grafana 5.4.3 RPM into /usr/local/src and install it with yum.
cd /usr/local/src/
wget https://dl.grafana.com/oss/release/grafana-5.4.3-1.x86_64.rpm
yum localinstall grafana-5.4.3-1.x86_64.rpm
# Start the service
systemctl start grafana-server
# Check that the service started correctly
systemctl status grafana-server
# Enable start at boot
systemctl enable grafana-server

访问

浏览器访问http://localhost:3000

grafana网页操作

https://www.cnblogs.com/zhaojiedi1992/p/zhaojiedi_liunx_64_prometheus_granafa.html

监控gpu

url:https://github.com/NVIDIA/gpu-monitoring-tools/tree/master/exporters/prometheus-dcgm
实际操作:
# Run the NVIDIA DCGM exporter container with /run/prometheus mounted from the host.
docker run -d --runtime=nvidia --name=nvidia-dcgm-exporter -v /run/prometheus:/run/prometheus nvidia/dcgm-exporter
# Run a node-exporter container that reuses the exporter's volumes read-only
# and points its textfile collector at /run/prometheus.
docker run -d --net="host" --pid="host" --volumes-from nvidia-dcgm-exporter:ro quay.io/prometheus/node-exporter --collector.textfile.directory="/run/prometheus"

启动的三个服务

# Start the three services one after another, in the same order as before.
for unit in prometheus node_exporter grafana-server; do
  systemctl start "$unit"
done

alertmanager报警插件安装

# Download Alertmanager 0.19.0, unpack it under /usr/local and rename the
# versioned directory to /usr/local/alertmanager. Absolute paths are used for
# the rename because the archive is extracted with -C /usr/local, not into
# the current directory.
wget https://github.com/prometheus/alertmanager/releases/download/v0.19.0/alertmanager-0.19.0.linux-amd64.tar.gz
tar xf alertmanager-0.19.0.linux-amd64.tar.gz -C /usr/local
mv /usr/local/alertmanager-0.19.0.linux-amd64  /usr/local/alertmanager
# Start Alertmanager. The release archive has the binary at its top level
# (there is no bin/ directory); the config file is passed explicitly so the
# command does not depend on the current directory.
/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml

# Alertmanager configuration file
cat alertmanager.yml
global:
  resolve_timeout: 5m

# Single route: alerts are grouped by alert name and all of them are sent to
# the 'web.hook' receiver.
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
# 'web.hook' posts notifications to a webhook on localhost:8060.
# NOTE(review): presumably this is the prometheus-webhook-dingtalk process
# with the "webhook" profile started later in these notes - confirm the port.
receivers:
- name: 'web.hook'
  webhook_configs:
  - url: 'http://localhost:8060/dingtalk/webhook/send'

钉钉报警

下载dingtalk进行报警
资料来源 https://www.codetd.com/article/6798984
下载好之后 选择使用markdown格式的报警格式
# Write the markdown notification template used by prometheus-webhook-dingtalk.
# The target is template/default.tmpl - the file later passed to
# --template.file - rather than the template directory itself.
# The delimiter is quoted so the template text is written verbatim, and the
# heredoc is closed with EOF so the commands that follow are not swallowed
# into the file.
cat > /usr/local/gocode/src/github.com/timonwong/prometheus-webhook-dingtalk/template/default.tmpl << 'EOF'
{{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
{{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}

{{ define "__text_alert_list" }}{{ range . }}
**Labels**
{{ range .Labels.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}
**Annotations**
{{ range .Annotations.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}
{{ end }}
**Source:** [{{ .Annotations.summary }}]({{ .GeneratorURL }})

{{ end }}{{ end }}

{{ define "ding.link.title" }}{{ template "__subject" . }}{{ end }}
{{ define "ding.link.content" }}#### \[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}\] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
{{ template "__text_alert_list" .Alerts.Firing }}
{{ end }}
EOF

# Start the dingtalk webhook plugin in the background with the template file
# and the DingTalk robot URL (replace access_token with your own robot token).
# stdout is redirected to dingding.log first and stderr is then duplicated
# onto it, so both streams end up in the log; with "2>&1 1>file" stderr kept
# pointing at the original stdout and never reached the log.
nohup prometheus-webhook-dingtalk --template.file="/usr/local/gocode/src/github.com/timonwong/prometheus-webhook-dingtalk/template/default.tmpl" --ding.profile="webhook=https://oapi.dingtalk.com/robot/send?access_token=64a517b7a1d0ad2dc23exxxx00fe18b0b4e15491f179456f94b6ff5" >dingding.log 2>&1 &

钉钉报警群设置

只需要设置好公网ip即可

自定义监控项pushgateway

# Download Pushgateway 0.10.0, unpack it under /usr/local and rename the
# versioned directory to /usr/local/pushgateway.
wget https://github.com/prometheus/pushgateway/releases/download/v0.10.0/pushgateway-0.10.0.linux-amd64.tar.gz
tar xf pushgateway-0.10.0.linux-amd64.tar.gz  -C /usr/local
mv /usr/local/pushgateway-0.10.0.linux-amd64  /usr/local/pushgateway
# Start Pushgateway. The release archive has the binary at its top level;
# there is no bin/ directory.
/usr/local/pushgateway/pushgateway

使用脚本来获取机器值

cat count_netstat_wait_connections.sh
#!/bin/bash
# Count the netstat lines that mention a "wait" state on this host and push
# the number to the Pushgateway on localhost:9091 as the metric
# count_netstat_wait_connections.

# Short host name (first dot-separated field of the FQDN); used as the
# instance label of the pushed metric.
instance_name=$(hostname -f | cut -d'.' -f1)
# Metric name.
label="count_netstat_wait_connections"
# grep -c counts the matching lines itself, so no separate wc is needed.
count_netstat_wait_connections=$(netstat -an | grep -ci wait)

echo "$label: $count_netstat_wait_connections"

# Send one "<name> <value>" line as the request body. --fail makes curl exit
# non-zero when the Pushgateway answers with an HTTP error instead of
# silently succeeding; the URL is quoted so the host name cannot be split or
# globbed by the shell.
printf '%s %s\n' "$label" "$count_netstat_wait_connections" \
  | curl --fail --data-binary @- "http://localhost:9091/metrics/job/pushgateway_test/instance/${instance_name}"
# "pushgateway_test" and the instance value in the URL become the job and
# instance labels (the grouping key) of the pushed metric. They do not have to
# match a job_name in prometheus.yml: Prometheus scrapes the Pushgateway as
# one target and, with honor_labels enabled on that scrape job, keeps these
# pushed labels as they are.

Prometheus页面查看值

在Prometheus页面查询这个变量 count_netstat_wait_connections  即可获取到值

posted on 2020-04-02 10:52  石圪节胡德禄  阅读(512)  评论(0)    收藏  举报

导航