[Monitoring] Prometheus monitoring installation
Deployment
wget https://github.com/prometheus/prometheus/releases/download/v2.28.0/prometheus-2.28.0.linux-amd64.tar.gz
tar xf prometheus-2.28.0.linux-amd64.tar.gz
mv prometheus-2.28.0.linux-amd64 /usr/local/prometheus-2.28.0
vim /usr/local/prometheus-2.28.0/prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ['localhost:9090']
vim /usr/lib/systemd/system/prometheus.service
[Unit]
Description=Prometheus Services
After=network.target remote-fs.target

[Service]
Type=simple
ExecStart=/usr/local/prometheus-2.28.0/prometheus --config.file=/usr/local/prometheus-2.28.0/prometheus.yml --storage.tsdb.path=/usr/local/prometheus-2.28.0/
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target
systemctl daemon-reload
systemctl restart prometheus.service
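A quick sanity check (optional): the bundled promtool validates the config file, and Prometheus answers on its default port 9090:
/usr/local/prometheus-2.28.0/promtool check config /usr/local/prometheus-2.28.0/prometheus.yml
curl http://localhost:9090/-/healthy
curl http://localhost:9090/api/v1/targets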
Node monitoring (node_exporter)
wget https://github.com/prometheus/node_exporter/releases/download/v1.1.2/node_exporter-1.1.2.linux-amd64.tar.gz
tar xf node_exporter-1.1.2.linux-amd64.tar.gz
mv node_exporter-1.1.2.linux-amd64 /usr/local/node_exporter
cat > /usr/lib/systemd/system/node_exporter.service << EOF
[Unit]
Description=Prometheus Node Exporter Services
After=network.target remote-fs.target
[Service]
Type=simple
ExecStart=/usr/local/node_exporter/node_exporter
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload
systemctl start node_exporter
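node_exporter listens on :9100 by default, so the endpoint can be checked and the host registered as a scrape target; the job name 'node-exporter' and the target below are example values for the local machine, adjust them for real nodes:
curl -s http://localhost:9100/metrics | head

# append under scrape_configs: in /usr/local/prometheus-2.28.0/prometheus.yml, then restart Prometheus
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']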
Telegram alerting
git clone https://github.com/nopp/alertmanager-webhook-telegram-python.git
yum install -y python3 python3-pip
cd alertmanager-webhook-telegram-python/
pip3 install -r requirements.txt
pip3 install python-dateutil
vim flaskAlert.py
import telegram, json, logging
from time import sleep
from dateutil import parser
from flask import Flask
from flask import request
from flask_basicauth import BasicAuth
from telegram.error import RetryAfter, TimedOut, NetworkError

app = Flask(__name__)
app.secret_key = 'lAlAlA123'
basic_auth = BasicAuth(app)

# Yes, it needs to start with '-', change it! (group chat IDs are negative)
chatID = ""  # change this

# Authentication conf, change it!
app.config['BASIC_AUTH_FORCE'] = True
app.config['BASIC_AUTH_USERNAME'] = ''  # change this
app.config['BASIC_AUTH_PASSWORD'] = ''  # change this

# Bot token, change it!
bot = telegram.Bot(token="")  # change this

@app.route('/alert', methods=['POST'])
def postAlertmanager():
    try:
        content = json.loads(request.get_data())
        for alert in content['alerts']:
            message = "Status: " + alert['status'] + "\n"
            if 'name' in alert['labels']:
                message += "Instance: " + alert['labels']['instance'] + "(" + alert['labels']['name'] + ")\n"
            else:
                message += "Instance: " + alert['labels']['instance'] + "\n"
            if 'info' in alert['annotations']:
                message += "Info: " + alert['annotations']['info'] + "\n"
            if 'summary' in alert['annotations']:
                message += "Summary: " + alert['annotations']['summary'] + "\n"
            if 'description' in alert['annotations']:
                message += "Description: " + alert['annotations']['description'] + "\n"
            if alert['status'] == "resolved":
                correctDate = parser.parse(alert['endsAt']).strftime('%Y-%m-%d %H:%M:%S')
                message += "Resolved: " + correctDate
            elif alert['status'] == "firing":
                correctDate = parser.parse(alert['startsAt']).strftime('%Y-%m-%d %H:%M:%S')
                message += "Started: " + correctDate
            bot.sendMessage(chat_id=chatID, text=message)
        return "Alert OK", 200
    except RetryAfter:
        sleep(30)
        bot.sendMessage(chat_id=chatID, text=message)
        return "Alert OK", 200
    except TimedOut:
        sleep(60)
        bot.sendMessage(chat_id=chatID, text=message)
        return "Alert OK", 200
    except NetworkError:
        sleep(60)
        bot.sendMessage(chat_id=chatID, text=message)
        return "Alert OK", 200
    except Exception as error:
        bot.sendMessage(chat_id=chatID, text="Error: " + str(error))
        app.logger.info("\t%s", error)
        return "Alert fail", 200

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    app.run(host='0.0.0.0', port=9119)
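The chat ID (a group chat ID starts with '-') can be looked up via the Telegram Bot API after sending the bot a message; <TOKEN> below is a placeholder for the bot token:
curl -s "https://api.telegram.org/bot<TOKEN>/getUpdates"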
nohup python3 flaskAlert.py &
Test
curl -XPOST --data '{"status":"resolved","groupLabels":{"alertname":"instance_down"},"commonAnnotations":{"description":"i-0d7188fkl90bac100 of job ec2-sp-node_exporter has been down for more than 2 minutes.","summary":"Instance i-0d7188fkl90bac100 down"},"alerts":[{"status":"resolved","labels":{"name":"olokinho01-prod","instance":"i-0d7188fkl90bac100","job":"ec2-sp-node_exporter","alertname":"instance_down","os":"linux","severity":"page"},"endsAt":"2019-07-01T16:16:19.376244942-03:00","generatorURL":"http://pmts.io:9090","startsAt":"2019-07-01T16:02:19.376245319-03:00","annotations":{"description":"i-0d7188fkl90bac100 of job ec2-sp-node_exporter has been down for more than 2 minutes.","summary":"Instance i-0d7188fkl90bac100 down"}}],"version":"4","receiver":"infra-alert","externalURL":"http://alm.io:9093","commonLabels":{"name":"olokinho01-prod","instance":"i-0d7188fkl90bac100","job":"ec2-sp-node_exporter","alertname":"instance_down","os":"linux","severity":"page"}}' http://username:password@flaskAlert:9119/alert
Install Alertmanager
wget https://github.com/prometheus/alertmanager/releases/download/v0.22.2/alertmanager-0.22.2.linux-amd64.tar.gz
tar xf alertmanager-0.22.2.linux-amd64.tar.gz
mv alertmanager-0.22.2.linux-amd64 /usr/local/alertmanager
cd /usr/local/alertmanager/
vim alertmanager.yml
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'alertmanager-bot'
receivers:
  - name: 'alertmanager-bot'
    webhook_configs:
      - send_resolved: true
        url: http://127.0.0.1:9119/alert
        http_config:
          basic_auth:
            username: 'goroutine'
            password: 'goroutine-12345'
templates:
  - '/usr/local/alertmanager/test.tmpl'
########### /usr/local/alertmanager/test.tmpl ############
{{ define "test.html" }}
{{ range .Alerts }}
<pre>
Instance: {{ .Labels.instance }}
Summary: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Severity: {{ .Labels.severity }}
Started at: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
</pre>
{{ end }}
{{ end }}
nohup /usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml --storage.path=/usr/local/alertmanager/data > /usr/local/alertmanager/alertmanager.log &
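Optionally, the config can be validated with amtool (shipped in the same tarball) and the service checked on its default port 9093:
/usr/local/alertmanager/amtool check-config /usr/local/alertmanager/alertmanager.yml
curl http://127.0.0.1:9093/-/healthy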
Test
#!/usr/bin/env bash
alerts_message='[
  {
    "labels": {
      "alertname": "DiskRunningFull",
      "dev": "sda1",
      "instance": "example1",
      "msgtype": "testing"
    },
    "annotations": {
      "info": "The disk sda1 is running full",
      "summary": "please check the instance example1"
    }
  },
  {
    "labels": {
      "alertname": "DiskRunningFull",
      "dev": "sda2",
      "instance": "example1",
      "msgtype": "testing"
    },
    "annotations": {
      "info": "The disk sda2 is running full",
      "summary": "please check the instance example1",
      "runbook": "the following link http://test-url should be clickable"
    }
  }
]'
curl -XPOST -d"$alerts_message" http://127.0.0.1:9093/api/v1/alerts
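If the chain works, the two test alerts should reach the Telegram chat after the group_wait window; they can also be listed from the Alertmanager API:
curl -s http://127.0.0.1:9093/api/v2/alerts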
Prometheus configuration changes
vim /usr/local/prometheus-2.28.0/prometheus.yml
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093 # Alertmanager installed above listens locally on 9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"
mkdir -p /usr/local/prometheus-2.28.0/rules
vim /usr/local/prometheus-2.28.0/rules/base_rules.yml
groups:
- name: node-exporter-alert
  rules:
  - alert: node-exporter-down
    expr: node_exporter:up == 0
    for: 1m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} is down"
      description: "instance: {{ $labels.instance }} \n- job: {{ $labels.job }} has been down for 1 minute."
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-cpu-high
    expr: node_exporter:cpu:total:percent > 80
    for: 3m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} CPU usage is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-cpu-iowait-high
    expr: node_exporter:cpu:iowait:percent >= 12
    for: 3m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} CPU iowait is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-load-load1-high
    expr: (node_exporter:load:load1) > (node_exporter:cpu:count) * 1.2
    for: 3m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} load1 is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-memory-high
    expr: node_exporter:memory:used:percent > 85
    for: 3m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} memory usage is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-disk-high
    expr: node_exporter:disk:used:percent > 88
    for: 10m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} disk usage is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-disk-read-count-high
    expr: node_exporter:disk:read:count:rate > 3000
    for: 2m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} read IOPS is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-disk-write-count-high
    expr: node_exporter:disk:write:count:rate > 3000
    for: 2m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} write IOPS is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-disk-read-mb-high
    expr: node_exporter:disk:read:mb:rate > 60
    for: 2m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} disk read throughput is above {{ $value }}"
      description: ""
      instance: "{{ $labels.instance }}"
      value: "{{ $value }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-disk-write-mb-high
    expr: node_exporter:disk:write:mb:rate > 60
    for: 2m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} disk write throughput is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-filefd-allocated-percent-high
    expr: node_exporter:filefd_allocated:percent > 80
    for: 10m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} open file descriptor usage is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-network-netin-error-rate-high
    expr: node_exporter:network:netin:error:rate > 4
    for: 1m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} inbound packet error rate is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-network-netin-packet-rate-high
    expr: node_exporter:network:netin:packet:rate > 35000
    for: 1m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} inbound packet rate is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-network-netout-packet-rate-high
    expr: node_exporter:network:netout:packet:rate > 35000
    for: 1m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} outbound packet rate is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-network-tcp-total-count-high
    expr: node_exporter:network:tcp:total:count > 40000
    for: 1m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} TCP connection count is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-process-zoom-total-count-high
    expr: node_exporter:process:zoom:total:count > 10
    for: 10m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} zombie process count is above {{ $value }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
  - alert: node-exporter-time-offset-high
    expr: node_exporter:time:offset > 0.03
    for: 2m
    labels:
      severity: info
    annotations:
      summary: "instance: {{ $labels.instance }} {{ $labels.desc }} {{ $value }} {{ $labels.unit }}"
      description: ""
      value: "{{ $value }}"
      instance: "{{ $labels.instance }}"
      grafana: "http://IP:3000/d/node-exporter/node-exporter?orgId=1&var-instance={{ $labels.instance }}"
      type: "google-cloud"
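Note: the expressions above reference recording rules (node_exporter:up, node_exporter:cpu:total:percent, and so on) that must be defined separately and are not shown here; a minimal sketch for one of them, assuming the scrape job is named 'node-exporter', could go into rules/record_rules.yml:

groups:
- name: node-exporter-record
  rules:
  - record: node_exporter:up
    expr: up{job="node-exporter"}

Rule files can be syntax-checked before restarting:
/usr/local/prometheus-2.28.0/promtool check rules /usr/local/prometheus-2.28.0/rules/*.yml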
systemctl restart prometheus
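After the restart, the loaded rule groups and any active alerts can be inspected in the web UI (Status -> Rules, Alerts) or via the API:
curl -s http://localhost:9090/api/v1/rules
curl -s http://localhost:9090/api/v1/alerts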
