监控prometheus + Grafana + Alertmanager

安装Prometheus【信息采集】

主进程安装

# Pull the image and start a throwaway container. It is only used as a source
# for the default files that ship inside the image.
docker pull prom/prometheus

docker run -d --name prometheus -p 9090:9090 prom/prometheus

# Create the host directories that will later be bind-mounted.
sudo mkdir -p /docker/prometheus/config
sudo mkdir -p /docker/prometheus/data
sudo mkdir -p /docker/prometheus/consoles
sudo mkdir -p /docker/prometheus/console_libraries

# Copy the default files out of the throwaway container.
sudo docker cp prometheus:/etc/prometheus/. /docker/prometheus/config/
sudo docker cp prometheus:/prometheus/. /docker/prometheus/data
sudo docker cp prometheus:/usr/share/prometheus/consoles/. /docker/prometheus/consoles
sudo docker cp prometheus:/usr/share/prometheus/console_libraries/. /docker/prometheus/console_libraries

# Fix ownership and permissions AFTER copying: files written by `docker cp`
# belong to the user who ran the command, so a chown done before the copy
# would not cover them.
# NOTE(review): 65534:65534 is presumably the uid/gid the image runs as
# (an earlier draft of these notes tried 1000:1000) - confirm against the image.
sudo chown -R 65534:65534 /docker/prometheus
sudo chmod -R 775 /docker/prometheus

# Remove the throwaway container.
docker stop prometheus
docker rm prometheus

# Start the real container with config, data and console directories mounted
# from the host. --web.enable-lifecycle is passed so the lifecycle HTTP
# endpoints are switched on.
docker run -d \
  --restart=always \
  --name prometheus \
  -p 9090:9090 \
  -v /etc/localtime:/etc/localtime:ro \
  -v /etc/timezone:/etc/timezone:ro \
  -v /docker/prometheus/config/:/etc/prometheus \
  -v /docker/prometheus/data/:/prometheus \
  -v /docker/prometheus/consoles/:/usr/share/prometheus/consoles \
  -v /docker/prometheus/console_libraries/:/usr/share/prometheus/console_libraries \
  prom/prometheus \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/prometheus \
  --web.enable-lifecycle

Pushgateway(推送网关:供短生命周期的批处理任务推送指标,再由 Prometheus 抓取)安装

# Fetch the Pushgateway image.
docker pull prom/pushgateway
# Run it in the background, restarting automatically, with port 9091 published
# and the host's time settings mounted read-only.
docker run --detach \
  --name pushgateway \
  --restart always \
  --publish 9091:9091 \
  --volume /etc/localtime:/etc/localtime:ro \
  --volume /etc/timezone:/etc/timezone:ro \
  prom/pushgateway

Node Exporter(服务器资源监控)

# Download the node_exporter 1.9.1 release tarball into /opt.
cd /opt
sudo wget https://github.com/prometheus/node_exporter/releases/download/v1.9.1/node_exporter-1.9.1.linux-amd64.tar.gz
# Extract with sudo as well: the download above needed sudo to write into this
# directory, so an unprivileged tar would not be able to create files here.
sudo tar -xvf node_exporter-1.9.1.linux-amd64.tar.gz
# Shorten the directory name; the systemd unit refers to this path.
sudo mv node_exporter-1.9.1.linux-amd64 node_exporter-1.9.1
cd node_exporter-1.9.1

# Optional: start it manually in the foreground for a quick check.
# sudo ./node_exporter

# Install a systemd unit so node_exporter is started automatically at boot.
# The unit is written with a heredoc instead of an interactive editor so that
# this step can be pasted or scripted as-is.
# NOTE(review): the unit runs as user "ubuntu" - make sure that account exists
# on the target host, or change User= accordingly.
sudo tee /usr/lib/systemd/system/node_exporter.service > /dev/null <<'EOF'
[Unit]
Description=node_exporter
Documentation=https://github.com/prometheus/node_exporter
After=network.target

[Service]
Type=simple
User=ubuntu
ExecStart=/opt/node_exporter-1.9.1/node_exporter
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF

# Make systemd re-read its unit files so the new service is known.
sudo systemctl daemon-reload
# Enable the service at boot and start it immediately in one step.
sudo systemctl enable --now node_exporter
# Show the current state of the service.
sudo systemctl status node_exporter

# 访问以下地址,若能返回指标文本则说明安装成功
http://192.168.0.9:9100/metrics

prometheus配置

prometheus.yml

# Global defaults.
global:
  # Scrape targets every 15 seconds (the default is every 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the default is every 1 minute).
  evaluation_interval: 15s
  # scrape_timeout is left at the global default (10s).

# Alertmanager instances that alerts are sent to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '192.168.0.33:9093'
            # - alertmanager:9093

# Rule files: loaded once, then evaluated periodically according to the
# global 'evaluation_interval'.
rule_files:
  - 'node_alerts.yml'
  # - 'first_rules.yml'
  # - 'second_rules.yml'

# Scrape configuration.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to every time series
  # scraped from that job.

  # Prometheus itself.
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - '192.168.0.33:9090'

  # Node Exporter on each monitored host.
  - job_name: 'node_exporter'
    static_configs:
      - targets:
          - '192.168.0.33:9100'
          - '192.168.0.9:9100'

  # Pushgateway.
  - job_name: 'pushgateway'
    # Keep the job/instance labels attached by the clients that pushed the
    # metrics; without this they are renamed to exported_job/exported_instance
    # and replaced by the Pushgateway's own job and address.
    honor_labels: true
    static_configs:
      - targets:
          - '192.168.0.33:9091'

node_alerts.yml(与 prometheus.yml 放在同一目录,即宿主机的 /docker/prometheus/config/)

# Alerting rules for hosts scraped through node_exporter.
groups:
  - name: node_alerts
    rules:
      # CPU: 100 minus the per-instance average idle percentage, above 80.
      - alert: HighCPUUsage
        expr: >-
          100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: '高 CPU 使用率 ({{ $labels.instance }})'
          description: 'CPU 使用率超过 80%,持续 2 分钟。'

      # Memory: share of total memory that is not "available", above 0.9.
      - alert: HighMemoryUsage
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes > 0.9
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: '内存使用率高 ({{ $labels.instance }})'
          description: '内存使用率超过 90%,持续 2 分钟。'

      # Availability: the target's `up` series has been 0 for one minute.
      - alert: InstanceDown
        expr: >-
          up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: '实例宕机 ({{ $labels.instance }})'
          description: '该实例在过去 1 分钟内无法访问,可能已宕机或网络故障。'

      # Disk: used share of each reported filesystem, above 0.9.
      - alert: HighDiskUsage
        expr: >-
          (node_filesystem_size_bytes - node_filesystem_avail_bytes)
          / node_filesystem_size_bytes > 0.9
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: '磁盘使用率高 ({{ $labels.instance }})'
          description: '磁盘使用率超过 90%,持续 2 分钟。'

      # Network: receive or transmit rate above 1000000 bytes per second.
      - alert: HighNetworkTraffic
        expr: >-
          rate(node_network_receive_bytes_total[1m]) > 1000000
          or rate(node_network_transmit_bytes_total[1m]) > 1000000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: '网络流量过高 ({{ $labels.instance }})'
          description: '网络接收或发送流量超过 1MB/s,持续 2 分钟。'

安装Grafana[监控]

# Pull the image and start a throwaway container. It is only used as a source
# for the default files that ship inside the image.
docker pull grafana/grafana

docker run -d --name grafana grafana/grafana

# Create the host directories that will later be bind-mounted.
sudo mkdir -p /docker/grafana/config
sudo mkdir -p /docker/grafana/data
sudo mkdir -p /docker/grafana/provisioning

# Copy the default files out of the throwaway container.
sudo docker cp grafana:/etc/grafana/. /docker/grafana/config/
sudo docker cp grafana:/var/lib/grafana/. /docker/grafana/data/
sudo docker cp grafana:/etc/grafana/provisioning/. /docker/grafana/provisioning/

# Fix ownership and permissions AFTER copying: files written by
# `sudo docker cp` are owned by root, so a chown done before the copy would
# leave them unwritable for the container user.
# NOTE(review): 472:472 is presumably the uid/gid the image runs as - confirm.
sudo chown -R 472:472 /docker/grafana
sudo chmod -R 775 /docker/grafana

# Remove the throwaway container.
docker stop grafana
docker rm grafana

# Start the real container with config, data and provisioning directories
# mounted from the host and port 3000 published.
docker run -d \
  --restart=always \
  --name grafana \
  -p 3000:3000 \
  -v /docker/grafana/config/:/etc/grafana \
  -v /docker/grafana/data/:/var/lib/grafana \
  -v /docker/grafana/provisioning/:/etc/grafana/provisioning \
  -v /etc/localtime:/etc/localtime:ro \
  -v /etc/timezone:/etc/timezone:ro \
  grafana/grafana

可以在 https://grafana.com/grafana/dashboards/ 查找并导入现成的仪表盘(面板)。

安装 Alertmanager【告警】

# Pull the image and start a throwaway container. It is only used as a source
# for the default files that ship inside the image.
docker pull prom/alertmanager

docker run -d --name alertmanager prom/alertmanager

# Create the host directories that will later be bind-mounted.
sudo mkdir -p /docker/alertmanager/config
sudo mkdir -p /docker/alertmanager/data

# Copy the default files out of the throwaway container.
sudo docker cp alertmanager:/etc/alertmanager/. /docker/alertmanager/config/
sudo docker cp alertmanager:/alertmanager/. /docker/alertmanager/data/

# Fix ownership and permissions AFTER copying: files written by
# `sudo docker cp` are owned by root, so a chown done before the copy would
# leave them unwritable for the container user.
# NOTE(review): 65534:65534 is presumably the uid/gid the image runs as - confirm.
sudo chown -R 65534:65534 /docker/alertmanager
sudo chmod -R 775 /docker/alertmanager

# Remove the throwaway container.
docker stop alertmanager
docker rm alertmanager

# Start the real container with config and data mounted from the host and
# port 9093 published. TZ is set in addition to the mounted time files; the
# e-mail template formats timestamps with .Local.
docker run -d \
  --restart=always \
  --name alertmanager \
  -p 9093:9093 \
  -v /docker/alertmanager/config/:/etc/alertmanager \
  -v /docker/alertmanager/data/:/alertmanager \
  -v /etc/localtime:/etc/localtime:ro \
  -v /etc/timezone:/etc/timezone:ro \
  -e TZ="Asia/Shanghai" \
  prom/alertmanager

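邮件模板(保存为宿主机的 /docker/alertmanager/config/email_template.tmpl,即容器内的 /etc/alertmanager/email_template.tmpl,与 alertmanager.yml 中 templates 的路径一致)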

{{/*
  Defines the "email.html" template used as the HTML body of alert e-mails.

  - When .Alerts.Firing is non-empty, a "告警通知" section is rendered with one
    card per firing alert.
  - When .Alerts.Resolved is non-empty, a "恢复通知" section is rendered with
    one card per resolved alert.

  Each card shows the alertname, severity and instance labels, the description
  annotation and StartsAt; resolved cards additionally show EndsAt. Timestamps
  are converted with .Local and formatted as "2006-01-02 15:04:05".
*/}}
{{ define "email.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
<h1 style="color: #d9534f; text-align: center;">告警通知</h1>
<div style="padding: 20px; background-color: #f9f9f9; border-radius: 8px;">
        {{ range $i, $alert := .Alerts.Firing }}
                <div style="margin-bottom: 20px; border: 1px solid #ddd; padding: 10px; border-radius: 5px;">
                        <h2>{{ index $alert.Labels "alertname" }}</h2>
						<p><strong>报警级别:</strong>{{ index $alert.Labels "severity" }}</p>
                        <p><strong>实例:</strong>{{ index $alert.Labels "instance" }}</p>
                        <p><strong>报警详情:</strong>{{ index $alert.Annotations "description" }}</p>
                        <p><strong>开始时间:</strong>{{ $alert.StartsAt.Local.Format "2006-01-02 15:04:05" }}</p>
                </div>
        {{ end }}
</div>
{{ end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
<h1 style="color: #5bc0de; text-align: center;">恢复通知</h1>
<div style="padding: 20px; background-color: #f9f9f9; border-radius: 8px;">
        {{ range $i, $alert := .Alerts.Resolved }}
                <div style="margin-bottom: 20px; border: 1px solid #ddd; padding: 10px; border-radius: 5px;">
                        <h2>{{ index $alert.Labels "alertname" }}</h2>
						<p><strong>报警级别:</strong>{{ index $alert.Labels "severity" }}</p>
                        <p><strong>实例:</strong>{{ index $alert.Labels "instance" }}</p>
                        <p><strong>报警详情:</strong>{{ index $alert.Annotations "description" }}</p>
                        <p><strong>开始时间:</strong>{{ $alert.StartsAt.Local.Format "2006-01-02 15:04:05" }}</p>
                        <p><strong>恢复时间:</strong>{{ $alert.EndsAt.Local.Format "2006-01-02 15:04:05" }}</p>
                </div>
        {{ end }}
</div>
{{ end }}
{{- end }}

配置文件

alertmanager.yml

global:
  # Time after which an alert that carries no end time is treated as resolved
  # if it has not been updated in the meantime.
  resolve_timeout: 5m

route:
  # Alerts sharing the same alertname are grouped into one notification.
  group_by: ['alertname']
  # Default receiver; must match a `name` under `receivers` below.
  receiver: 'mail'
  # How long to wait before sending the first notification for a new group,
  # so that alerts arriving within 10 seconds are batched together.
  group_wait: 10s
  # How long to wait before notifying about new alerts added to a group that
  # has already been notified.
  group_interval: 1m
  # How long to wait before repeating a notification for alerts that are
  # still firing.
  repeat_interval: 30m

# Notification template files (this one provides "email.html").
templates:
  - '/etc/alertmanager/email_template.tmpl'

receivers:
  # Receiver name, referenced by route.receiver above.
  - name: 'mail'
    email_configs:
      # Mailbox that receives the alerts.
      - to: 'xxxx@gmail.com'
        # Sender address (its SMTP service must be enabled).
        from: 'xxxx@qq.com'
        # SMTP server host and port.
        # NOTE(review): port 465 is normally the implicit-TLS SMTP port,
        # which is presumably why require_tls (STARTTLS) is off below - confirm.
        smarthost: 'smtp.qq.com:465'
        # SMTP login name (usually the same as the sender address).
        auth_username: 'xxxx@qq.com'
        # SMTP password / authorization code (QQ Mail requires enabling SMTP
        # and generating an authorization code). Placeholder only: never
        # publish the real value; a code that has been published should be
        # revoked and regenerated.
        auth_password: 'xxxxxxxxxxxxxxxx'
        # Whether STARTTLS is required on the SMTP connection.
        require_tls: false
        # Also send a mail when the alert is resolved.
        send_resolved: true
        # HTML body rendered from the "email.html" template.
        html: '{{ template "email.html" . }}'
        headers:
          Subject: '{{ .CommonLabels.alertname }} - {{ .CommonLabels.severity }} - {{ if eq .Status "firing" }}告警通知{{ else }}告警恢复{{ end }}'
posted @ 2025-09-01 15:45  beamsoflight  阅读(6)  评论(0)    收藏  举报