监控:Prometheus + Grafana + Alertmanager
安装Prometheus【信息采集】
主进程安装
# Pull the image and start a throwaway container. It exists only so that the
# image's default files can be copied out to the host.
docker pull prom/prometheus
docker run -d --name prometheus -p 9090:9090 prom/prometheus
# Create the host directories that will be bind-mounted into the container.
sudo mkdir -p /docker/prometheus/config
sudo mkdir -p /docker/prometheus/data
sudo mkdir -p /docker/prometheus/consoles
sudo mkdir -p /docker/prometheus/console_libraries
# Copy the image defaults out of the throwaway container.
sudo docker cp prometheus:/etc/prometheus/. /docker/prometheus/config/
sudo docker cp prometheus:/prometheus/. /docker/prometheus/data
# NOTE(review): newer prom/prometheus images may no longer ship the two
# console directories below - confirm; if docker cp reports a missing path,
# skip these two copies and the matching -v mounts further down.
sudo docker cp prometheus:/usr/share/prometheus/consoles/. /docker/prometheus/consoles
sudo docker cp prometheus:/usr/share/prometheus/console_libraries/. /docker/prometheus/console_libraries
# Remove the throwaway container.
docker stop prometheus
docker rm prometheus
# Fix ownership AFTER copying: docker cp creates the host-side files with the
# UID:GID of the invoking user, so a chown done before the copy would leave
# the copied files unwritable by the container user.
# 65534:65534 is presumably the "nobody" account the image runs as - verify.
# (Alternative that was tried and left disabled: sudo chown -R 1000:1000 /docker/prometheus)
sudo chown -R 65534:65534 /docker/prometheus
sudo chmod -R 775 /docker/prometheus
# Start the real container on top of the host directories.
docker run -d \
--restart=always \
--name prometheus \
-p 9090:9090 \
-v /etc/localtime:/etc/localtime:ro \
-v /etc/timezone:/etc/timezone:ro \
-v /docker/prometheus/config/:/etc/prometheus \
-v /docker/prometheus/data/:/prometheus \
-v /docker/prometheus/consoles/:/usr/share/prometheus/consoles \
-v /docker/prometheus/console_libraries/:/usr/share/prometheus/console_libraries \
prom/prometheus \
--config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/prometheus \
--web.enable-lifecycle
Pushgateway(指标推送网关,供短生命周期任务推送指标)安装
# Fetch the Pushgateway image.
docker pull prom/pushgateway
# Start it in the background on port 9091, restarting automatically, with the
# host's clock/timezone files mounted read-only.
docker run \
  --detach \
  --restart always \
  --name pushgateway \
  --publish 9091:9091 \
  --volume /etc/localtime:/etc/localtime:ro \
  --volume /etc/timezone:/etc/timezone:ro \
  prom/pushgateway
Node Exporter(服务器资源监控)
# Download and unpack Node Exporter v1.9.1 under /opt.
cd /opt
sudo wget https://github.com/prometheus/node_exporter/releases/download/v1.9.1/node_exporter-1.9.1.linux-amd64.tar.gz
# Extraction needs sudo too: the download and the rename around it both use
# sudo, so the current user is not expected to have write access here.
# --no-same-owner keeps the extracted files owned by root rather than by the
# UID recorded inside the archive.
sudo tar --no-same-owner -xvf node_exporter-1.9.1.linux-amd64.tar.gz
sudo mv node_exporter-1.9.1.linux-amd64 node_exporter-1.9.1
cd node_exporter-1.9.1
# Manual foreground start (quick test only):
# sudo ./node_exporter
# Create the systemd unit so the exporter starts at boot.
sudo vim /usr/lib/systemd/system/node_exporter.service
[Unit]
# Human-readable name shown by systemctl.
Description=Node Exporter
Documentation=https://github.com/prometheus/node_exporter
After=network.target

[Service]
Type=simple
# Account the exporter runs as; change it if this host has no "ubuntu" user.
User=ubuntu
# Must match the directory the release tarball was unpacked/renamed to.
ExecStart=/opt/node_exporter-1.9.1/node_exporter
# Restart automatically when the process exits with an error.
Restart=on-failure

[Install]
WantedBy=multi-user.target
# Make systemd pick up the new unit file.
sudo systemctl daemon-reload
# Register the service for boot and start it right away in a single step.
sudo systemctl enable --now node_exporter.service
# Show the current state of the service.
sudo systemctl status node_exporter.service
# 访问以下地址,验证安装是否成功
http://192.168.0.9:9100/metrics
prometheus配置
prometheus.yml
# Global settings.
global:
  scrape_interval: 15s  # Scrape targets every 15 seconds (default: every 1 minute).
  evaluation_interval: 15s  # Evaluate rules every 15 seconds (default: every 1 minute).
  # scrape_timeout is left at the global default (10s).

# Alertmanager instances that alerts are sent to.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['192.168.0.33:9093']
          # - alertmanager:9093

# Rule files are loaded once and then evaluated periodically according to the
# global 'evaluation_interval'.
rule_files:
  - "node_alerts.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# Scrape configurations: one job per kind of endpoint.
scrape_configs:
  # The job name is attached as a label `job=<job_name>` to every time series
  # scraped from this config.
  # Prometheus itself.
  - job_name: "prometheus"
    static_configs:
      - targets: ["192.168.0.33:9090"]

  # Node Exporter (host metrics).
  - job_name: 'node_exporter'
    static_configs:
      - targets: ['192.168.0.33:9100', '192.168.0.9:9100']

  # Pushgateway.
  - job_name: 'pushgateway'
    static_configs:
      - targets: ['192.168.0.33:9091']
node_alerts.yml
# Alerting rules for host-level metrics exposed by Node Exporter.
groups:
  - name: node_alerts
    rules:
      # Non-idle CPU share per instance above 80% for 2 minutes.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "高 CPU 使用率 ({{ $labels.instance }})"
          description: "CPU 使用率超过 80%,持续 2 分钟。"

      # Used memory (total minus available) above 90% for 2 minutes.
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率高 ({{ $labels.instance }})"
          description: "内存使用率超过 90%,持续 2 分钟。"

      # Any scrape target reporting up == 0 for 1 minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例宕机 ({{ $labels.instance }})"
          description: "该实例在过去 1 分钟内无法访问,可能已宕机或网络故障。"

      # Used filesystem space above 90% for 2 minutes.
      # NOTE(review): no fstype/mountpoint filter, so every filesystem series
      # is evaluated - confirm whether pseudo filesystems should be excluded.
      - alert: HighDiskUsage
        expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes > 0.9
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "磁盘使用率高 ({{ $labels.instance }})"
          description: "磁盘使用率超过 90%,持续 2 分钟。"

      # Receive or transmit rate above 1,000,000 bytes/s for 2 minutes.
      - alert: HighNetworkTraffic
        expr: rate(node_network_receive_bytes_total[1m]) > 1000000 or rate(node_network_transmit_bytes_total[1m]) > 1000000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "网络流量过高 ({{ $labels.instance }})"
          description: "网络接收或发送流量超过 1MB/s,持续 2 分钟。"
安装Grafana[监控]
# Pull the image and start a throwaway container. It exists only so that the
# image's default files can be copied out to the host.
docker pull grafana/grafana
docker run -d --name grafana grafana/grafana
# Create the host directories that will be bind-mounted into the container.
sudo mkdir -p /docker/grafana/config
sudo mkdir -p /docker/grafana/data
sudo mkdir -p /docker/grafana/provisioning
# Copy the image defaults out of the throwaway container.
sudo docker cp grafana:/etc/grafana/. /docker/grafana/config/
sudo docker cp grafana:/var/lib/grafana/. /docker/grafana/data/
sudo docker cp grafana:/etc/grafana/provisioning/. /docker/grafana/provisioning/
# Remove the throwaway container.
docker stop grafana
docker rm grafana
# Fix ownership AFTER copying: docker cp creates the host-side files with the
# UID:GID of the invoking user (root under sudo), so a chown done before the
# copy would leave the copied files unwritable by the container user.
# 472:472 is presumably the account the grafana image runs as - verify.
sudo chown -R 472:472 /docker/grafana
sudo chmod -R 775 /docker/grafana
# Start the real container on top of the host directories.
docker run -d \
--restart=always \
--name grafana \
-p 3000:3000 \
-v /docker/grafana/config/:/etc/grafana \
-v /docker/grafana/data/:/var/lib/grafana \
-v /docker/grafana/provisioning/:/etc/grafana/provisioning \
-v /etc/localtime:/etc/localtime:ro \
-v /etc/timezone:/etc/timezone:ro \
grafana/grafana
可以在 https://grafana.com/grafana/dashboards/ 查找现成的仪表盘(Dashboard)模板
安装 Alertmanager【告警】
# Pull the image and start a throwaway container. It exists only so that the
# image's default files can be copied out to the host.
docker pull prom/alertmanager
docker run -d --name alertmanager prom/alertmanager
# Create the host directories that will be bind-mounted into the container.
sudo mkdir -p /docker/alertmanager/config
sudo mkdir -p /docker/alertmanager/data
# Copy the image defaults out of the throwaway container.
sudo docker cp alertmanager:/etc/alertmanager/. /docker/alertmanager/config/
sudo docker cp alertmanager:/alertmanager/. /docker/alertmanager/data/
# Remove the throwaway container.
docker stop alertmanager
docker rm alertmanager
# Fix ownership AFTER copying: docker cp creates the host-side files with the
# UID:GID of the invoking user (root under sudo), so a chown done before the
# copy would leave the copied files unwritable by the container user.
# 65534:65534 is presumably the "nobody" account the image runs as - verify.
sudo chown -R 65534:65534 /docker/alertmanager
sudo chmod -R 775 /docker/alertmanager
# Start the real container on top of the host directories.
# TZ is set so that timestamps rendered with .Local use Asia/Shanghai.
docker run -d \
--restart=always \
--name alertmanager \
-p 9093:9093 \
-v /docker/alertmanager/config/:/etc/alertmanager \
-v /docker/alertmanager/data/:/alertmanager \
-v /etc/localtime:/etc/localtime:ro \
-v /etc/timezone:/etc/timezone:ro \
-e TZ="Asia/Shanghai" \
prom/alertmanager
邮件模板
{{/*
  Defines the "email.html" template used as the HTML body of alert emails.
  It renders one card per firing alert under a red heading, then one card per
  resolved alert under a blue heading. Each card shows the alertname,
  severity and instance labels, the description annotation and the start
  time; resolved cards additionally show the end time. Timestamps are
  formatted in the process's local time zone as "YYYY-MM-DD hh:mm:ss".
*/}}
{{ define "email.html" }}
{{- /* Firing section: emitted only when at least one alert is firing. */ -}}
{{- if gt (len .Alerts.Firing) 0 -}}
<h1 style="color: #d9534f; text-align: center;">告警通知</h1>
<div style="padding: 20px; background-color: #f9f9f9; border-radius: 8px;">
{{ range $i, $alert := .Alerts.Firing }}
<div style="margin-bottom: 20px; border: 1px solid #ddd; padding: 10px; border-radius: 5px;">
<h2>{{ index $alert.Labels "alertname" }}</h2>
<p><strong>报警级别:</strong>{{ index $alert.Labels "severity" }}</p>
<p><strong>实例:</strong>{{ index $alert.Labels "instance" }}</p>
<p><strong>报警详情:</strong>{{ index $alert.Annotations "description" }}</p>
<p><strong>开始时间:</strong>{{ $alert.StartsAt.Local.Format "2006-01-02 15:04:05" }}</p>
</div>
{{ end }}
</div>
{{ end }}
{{- /* Resolved section: emitted only when at least one alert has resolved. */ -}}
{{- if gt (len .Alerts.Resolved) 0 -}}
<h1 style="color: #5bc0de; text-align: center;">恢复通知</h1>
<div style="padding: 20px; background-color: #f9f9f9; border-radius: 8px;">
{{ range $i, $alert := .Alerts.Resolved }}
<div style="margin-bottom: 20px; border: 1px solid #ddd; padding: 10px; border-radius: 5px;">
<h2>{{ index $alert.Labels "alertname" }}</h2>
<p><strong>报警级别:</strong>{{ index $alert.Labels "severity" }}</p>
<p><strong>实例:</strong>{{ index $alert.Labels "instance" }}</p>
<p><strong>报警详情:</strong>{{ index $alert.Annotations "description" }}</p>
<p><strong>开始时间:</strong>{{ $alert.StartsAt.Local.Format "2006-01-02 15:04:05" }}</p>
<p><strong>恢复时间:</strong>{{ $alert.EndsAt.Local.Format "2006-01-02 15:04:05" }}</p>
</div>
{{ end }}
</div>
{{ end }}
{{- end }}
配置文件
alertmanager.yml
global:
  # If an alert carries no end time, it is declared resolved once it has not
  # been updated for this long.
  resolve_timeout: 5m

route:
  # Alerts sharing the same alertname are batched into one notification.
  group_by: ['alertname']
  # Default receiver; must match a name under `receivers` below.
  receiver: 'mail'
  # How long to wait before sending the first notification for a new group,
  # so alerts arriving within this window are sent together.
  group_wait: 10s
  # How long to wait before notifying about new alerts added to a group that
  # has already been notified.
  group_interval: 1m
  # How long to wait before re-sending a notification for alerts that are
  # still firing.
  repeat_interval: 30m

# Notification template files to load.
templates:
  - '/etc/alertmanager/email_template.tmpl'

receivers:
  # Receiver name; referenced by route.receiver above.
  - name: 'mail'
    email_configs:
      # Mailbox that receives the alerts.
      - to: 'xxxx@gmail.com'
        # Sender address (SMTP must be enabled for this account).
        from: 'xxxx@qq.com'
        # SMTP server and port. 465 is the implicit-TLS (SMTPS) port;
        # 587 is the STARTTLS submission port.
        smarthost: 'smtp.qq.com:465'
        # SMTP login name (usually the same as the sender address).
        auth_username: 'xxxx@qq.com'
        # SMTP password / authorization code (QQ Mail requires enabling SMTP
        # and generating an authorization code). Never publish the real value:
        # keep a placeholder in shared copies and revoke any code that was
        # exposed. NOTE(review): recent Alertmanager releases also accept
        # auth_password_file to keep the secret out of this file - confirm for
        # the deployed version.
        auth_password: '<your-smtp-authorization-code>'
        # Whether STARTTLS is required. Left off here because the connection
        # uses the implicit-TLS port 465 rather than STARTTLS.
        require_tls: false
        # Also send an email when the alert resolves.
        send_resolved: true
        # HTML body rendered from the "email.html" template.
        html: '{{ template "email.html" . }}'
        headers:
          Subject: '{{ .CommonLabels.alertname }} - {{ .CommonLabels.severity }} - {{ if eq .Status "firing" }}告警通知{{ else }}告警恢复{{ end }}'

浙公网安备 33010602011771号