Prometheus配置示例_基于docker


流量速率解析:以下几张流量截图的说明——京东云带宽上限为 3Mbps(3M比特[位]每秒 ≈ 384KB/s),此参数是指下载时的流量。云主机接收的瞬时流量会比此数值高(≈ 15Mb/s [ 1.83 MB/s ])。

image

image

 

image

 

image

 


告警规则示例
groups:
- name: host_alerts
  rules:
  # Fires when a scrape target has reported up == 0 for one minute.
  - alert: 节点宕机
    expr: up == 0
    for: 1m
    labels:
      severity: 好严重
    annotations:
      summary: "实例 {{ $labels.job }} {{ $labels.instance }} 宕机"
      description: |
        {{ $labels.job }} {{ $labels.instance }} 已宕机超过 1 分钟。

  # CPU usage = 100 - idle percentage, averaged over the cores of an instance.
  # rate() (not irate()) is used so the alert reflects the whole 5m window
  # instead of only the last two samples. `job` is kept in the aggregation
  # because the annotations print $labels.job.
  - alert: CPU使用率
    expr: 100 - (avg by(instance, job) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 1m
    labels:
      severity: 好严重
    annotations:
      summary: "实例 {{ $labels.job }} {{ $labels.instance }} CPU使用率过高"
      description: |
        {{ $labels.job }} {{ $labels.instance }} CPU使用率超过80%。当前值: {{ $value | printf "%.2f" }}%

  # Memory usage based on MemAvailable: (Total - Available) / Total.
  - alert: 内存使用率
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
    for: 1m
    labels:
      severity: 不用慌
    annotations:
      summary: "实例 {{ $labels.job }} {{ $labels.instance }} 内存使用率过高"
      description: |
        {{ $labels.job }} {{ $labels.instance }} 内存使用率超过80%。当前值: {{ $value | printf "%.2f" }}%。【计算公式:Used = Total - Available,从操作系统角度查看物理内存的总体分配情况。更精确的计算公式:内存使用率 = (总内存 - 空闲内存 - 缓冲内存 - 缓存内存) / 总内存 × 100】

  # Memory actually held by applications: Total - Free - Buffers - Cached.
  # This is a separate rule; it must start with its own "- alert:" item,
  # otherwise its keys duplicate those of the rule above.
  - alert: 内存使用率(Linux系统应用程序实际占用内存)
    expr: (node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / node_memory_MemTotal_bytes * 100 > 30
    for: 1m
    labels:
      severity: 好严重
    annotations:
      summary: "实例 {{ $labels.job }} {{ $labels.instance }} 内存使用率过高"
      description: |
        {{ $labels.job }} {{ $labels.instance }} 内存使用率超过30%。当前值: {{ $value | printf "%.2f" }}%。【Used = Total - Free - Buffers - Cached 这个公式,是基于Prometheus node_exporter数据计算Linux系统应用程序实际占用内存的一种经典方法。它的设计思路是排除掉可以被内核快速回收的内存,从而更真实地反映内存压力】

  # Used share of the root filesystem: (size - avail) / size.
  - alert: 根目录磁盘空间使用率
    expr: ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_avail_bytes{mountpoint="/"}) / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 60
    for: 1m
    labels:
      severity: 好严重
    annotations:
      summary: "实例 {{ $labels.job }} {{ $labels.instance }} 根目录磁盘空间使用率过高"
      description: |
        {{ $labels.job }} {{ $labels.instance }} 根目录磁盘空间使用率大于60%。当前值: {{ $value | printf "%.2f" }}%

  # 5-minute load average compared with the number of CPU cores, as the
  # description states. The core count is the number of per-CPU "idle"
  # series. Aggregating by both instance and job gives both sides of the
  # comparison the same label set, so they match one-to-one and the
  # resulting alert still carries $labels.job.
  - alert: 系统负载
    expr: node_load5 > count by(instance, job) (node_cpu_seconds_total{mode="idle"})
    for: 1m
    labels:
      severity: 好严重
    annotations:
      summary: "实例 {{ $labels.job }} {{ $labels.instance }} 5分钟负载"
      description: |
        {{ $labels.job }} {{ $labels.instance }} 5分钟负载超过CPU核心数1倍。当前值: {{ $value | printf "%.2f" }}。[若负载持续高于核心数(如4核CPU负载>4),则表示系统过载,任务需要排队]
 
grafana-image-renderer
## Install the grafana-image-renderer service with Docker
docker run -d \
  --name grafana-image-renderer \
  --net="host" \
  -e TZ=Asia/Shanghai \
  grafana/grafana-image-renderer:latest
---
## 二进制安装grafana(下载地址:https://grafana.com/grafana/download/11.6.2?pg=get&plcmt=selfmanaged-box1-cta1&edition=oss ,
    选择OSS版本,即社区版)
添加配置:
        # Grafana rendering settings.
        # NOTE(review): presumably belongs in grafana.ini - confirm.
        [rendering]
        # Presumably the render endpoint of the image renderer service started above.
        server_url = http://127.0.0.1:8081/render

        # Presumably the Grafana address the renderer calls back to.
        callback_url = http://127.0.0.1:3000/
---
# Copy the image's default configuration to the host with a throw-away container.
# - The host directory is mounted at /temp, so /temp is the copy destination.
# - The copy runs through `sh -c` so the source path is resolved inside the
#   container rather than globbed by the host shell.
# - The entrypoint is overridden so the container runs the copy instead of the
#   image's own startup script.
# - root is used so the copy can write into the host-owned directory.
mkdir -p /opt/grafana/conf
docker run --rm --user root --entrypoint sh \
  -v /opt/grafana/conf:/temp \
  grafana/grafana -c 'cp -r /etc/grafana/. /temp/'
# Hand the copied files to UID/GID 472, the same owner this guide gives the
# Grafana data directory.
chown -R 472:472 /opt/grafana/conf
## Edit the configuration, then start the container
docker run -d   --name=grafana   --net="host"   -v /opt/grafana/data:/var/lib/grafana  -v /opt/grafana/conf:/etc/grafana  grafana/grafana

image

 


发送告警到企业微信webhook机器人
## Official demo documentation: https://developer.work.weixin.qq.com/document/path/99110?version=5.0.2.6011&platform=win
# Replace <YOUR_WEBHOOK_KEY> with the key of your own robot. Anyone holding the
# key can post to the group, so never publish a real one.
curl 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=<YOUR_WEBHOOK_KEY>' \
   -H 'Content-Type: application/json' \
   -d '
   {
        "msgtype": "text",
        "text": {
            "content": "hello world"
        }
   }'
---
# Install Redis from the distribution package.
apt install redis-server
# Alternative: run Redis in Docker instead of installing the package.
# docker run -d --name my-redis --net="host" redis
# The delimiter is quoted so the file is written verbatim, with no shell expansion.
cat > alertmanager-webhook.yaml <<'EOF'
# WeCom (企业微信) robot key.
# Required when WeCom is used; leave empty otherwise.
# Replace the placeholder with your own key and never publish a real one.
qywechatKey: "<YOUR_WEBHOOK_KEY>"

# Feishu robot key.
# Required when Feishu is used; leave empty otherwise.
feishuKey:

# DingTalk robot key.
# Required when DingTalk is used; leave empty otherwise.
dingdingKey:

# Redis settings
redisServer: 127.0.0.1  # required
redisPort:  # optional; defaults to 6379 when empty
redisPassword:  # leave empty if Redis has no password; required if one is set

# Logging settings
logFileDir:   # optional; defaults to the program's working directory when empty
logFilePath: alertmanager-webhook.log # required

# Listener settings
port: 9095 # optional; defaults to 9095 when empty
host: 0.0.0.0 # optional; listens on 127.0.0.1 when empty
EOF
[root@lavm-e7epxxuqru /opt/alertmanager/alert_webhook/alertmanager-webhook]# ./alertmanager-webhook -c alertmanager-webhook.yaml
2025/11/28 16:54:57 the Process is Running
---
[root@lavm-e7epxxuqru /opt/prometheus/rules]# cat host_alerts.yml
groups:
- name: host_alerts
  rules:
  # Fires when a scrape target has reported up == 0 for one minute.
  - alert: InstanceDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "实例 {{ $labels.job }} {{ $labels.instance }} 宕机"
      # Text inside a "|" block scalar is literal, so it must not be wrapped
      # in quotes; they would show up in the notification.
      description: |
        {{ $labels.job }} {{ $labels.instance }} 已宕机超过 1 分钟。
---
[root@lavm-e7epxxuqru /opt/alertmanager]# cat alertmanager.yml
global:
  resolve_timeout: 5m

# Notification template files to load.
templates:
  - "/etc/alertmanager/*.tmpl"

route:
  receiver: "wechat-webhook"
  group_by:
    - alertname
  group_wait: 30s
  group_interval: 1m
  repeat_interval: 1h

receivers:
  - name: "wechat-webhook"
    webhook_configs:
      # Port 9095 is the port configured for the alertmanager-webhook
      # service in this guide.
      - url: "http://127.0.0.1:9095/qywechat"
        send_resolved: true
---

image

 

 


 

docker安装Prometheus
---如下内容已测试完成,告警通知到邮件---
# Run Node Exporter on the host network and PID namespace, with the host root
# filesystem mounted read-only at /host.
docker run -d \
  --net="host" \
  --pid="host" \
  -v "/:/host:ro,rslave" \
  quay.io/prometheus/node-exporter:latest \
  --path.rootfs=/host

mkdir -p /opt/prometheus/data && chown -R 65534:65534 /opt/prometheus/data
# The delimiter is quoted so the file is written verbatim, with no shell expansion.
cat > /opt/prometheus/prometheus.yml <<'EOF'
global:
  scrape_interval: 9s # global scrape interval

scrape_configs:
  # Prometheus scraping itself. The server in this guide is started with
  # --web.listen-address=":9091", so its own metrics are served on 9091.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9091']

  # Host metrics collected through Node Exporter
  - job_name: '京东云主机'
    static_configs:
      - targets: ['127.0.0.1:9100'] # Node Exporter port; 'localhost:9100' also works on the same host

alerting:
  alertmanagers:
    - static_configs:
        # Alertmanager in this guide is started with
        # --web.listen-address=:8181, so alerts must be sent to 8181.
        - targets: ['localhost:8181']

# Alert rule files to load
rule_files:
  - "/etc/prometheus/rules/*.yml"  # path of the rule files inside the container
EOF

[root@lavm-e7epxxuqru /opt/prometheus/rules]# cat host_alerts.yml
groups:
- name: host_alerts
  rules:
  # Fires when a scrape target has reported up == 0 for one minute.
  - alert: InstanceDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "实例 {{ $labels.instance }} 宕机"
      # Text inside a "|" block scalar is literal, so it must not be wrapped
      # in quotes; they would show up in the notification.
      description: |
        {{ $labels.instance }} 已宕机超过 1 分钟。

# Start Prometheus on the host network, listening on port 9091.
docker run -d   --net="host"   -v /opt/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml   -v /opt/prometheus/data:/prometheus -v /opt/prometheus/rules:/etc/prometheus/rules  prom/prometheus   --web.enable-admin-api   --web.enable-lifecycle --storage.tsdb.retention.time=16d --config.file=/etc/prometheus/prometheus.yml --web.listen-address=":9091"
# Reload the configuration through the lifecycle API enabled above. The port
# must match --web.listen-address (9091), not the default 9090.
curl -X POST http://localhost:9091/-/reload
mkdir -p /opt/grafana/data && chown -R 472:472 /opt/grafana/*
docker run -d   --name=grafana   --net="host"   -v /opt/grafana/data:/var/lib/grafana   grafana/grafana   ## default credentials: admin/admin
## Recommended dashboard IDs to try: 1860 (Node Exporter Full), 16098 (Node Exporter Dashboard 20240520)

image

 

 

mkdir -p /opt/alertmanager
cat > /opt/alertmanager/alertmanager.yml <<EOF
global:
  resolve_timeout: 5m
  # SMTP server of QQ Mail; change it to match your own mail provider
  smtp_smarthost: "smtp.qq.com:587"
  # Sender address
  smtp_from: "8888888888@qq.com"
  # Sender account, usually the same as smtp_from
  smtp_auth_username: "8888888888@qq.com"
  # This is the authorization code, not the mailbox login password
  smtp_auth_password: "*********"
  # Enable TLS
  smtp_require_tls: true

templates:
  - "/etc/alertmanager/*.tmpl"

route:
  # Default receiver
  receiver: "default-receiver"
  # Group alerts by alert name
  group_by:
    - alertname
  # Initial wait before the first notification of a group is sent
  group_wait: 30s
  # Interval before the same group is notified again
  group_interval: 5m
  # Interval for re-sending the same alert
  repeat_interval: 1h

receivers:
  - name: "default-receiver"
    email_configs:
      # Recipient address
      - to: "18888888888@163.com"
        # Also notify when the alert is resolved
        send_resolved: true
        html: '{{ template "custom.alert.message" . }}'
        # Optional: custom mail subject
        headers:
          Subject: '【监控告警】{{ .CommonLabels.alertname }} - {{ .CommonLabels.instance }}'
EOF

# Write the e-mail template next to alertmanager.yml so it can be mounted into
# the container below.
# The heredoc delimiter is quoted: with an unquoted EOF the shell expands
# $-prefixed names inside the template to empty strings and the template no
# longer parses.
cat > /opt/alertmanager/template.tmpl <<'EOF'
{{ define "custom.alert.message" }}
<!DOCTYPE html>
<html>
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <style type="text/css">
        body {
            font-family: 'Helvetica Neue', Arial, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 700px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f9f9f9;
        }
        .alert-card {
            border-radius: 8px;
            padding: 20px;
            margin-bottom: 20px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        .alert-critical {
            background: linear-gradient(135deg, #FFF6F6 0%, #FFEBEB 100%);
            border-left: 5px solid #FF5252;
        }
        .alert-resolved {
            background: linear-gradient(135deg, #F6FFF6 0%, #EBFFEB 100%);
            border-left: 5px solid #4CAF50;
        }
        .alert-title {
            font-size: 18px;
            font-weight: bold;
            margin-bottom: 15px;
            display: flex;
            align-items: center;
        }
        .alert-icon {
            width: 24px;
            height: 24px;
            margin-right: 10px;
        }
        .alert-field {
            margin-bottom: 8px;
            display: flex;
        }
        .field-label {
            font-weight: bold;
            min-width: 80px;
            color: #555;
        }
        .field-value {
            flex: 1;
        }
        .timestamp {
            color: #666;
            font-size: 13px;
            margin-top: 15px;
            text-align: right;
        }
        .divider {
            height: 1px;
            background: #eee;
            margin: 15px 0;
        }
    </style>
</head>
<body>
{{- /* Firing section: iterate only over the firing alerts of this notification. */ -}}
{{- if gt (len .Alerts.Firing) 0 -}}
    <div class="alert-header alert-critical">
        告警触发 - 请立即处理!
    </div>
    <div>
        <img src="https://img95.699pic.com/element/40114/9548.png_860.png" width="200px" height="200px">
    </div>
    {{- range .Alerts.Firing -}}
        <div class="alert-card alert-critical">
            <div class="alert-field">
                <span class="field-label">告警名称:</span>
                <span class="field-value">{{ .Labels.alertname }}</span>
            </div>
            <div class="alert-field">
                <span class="field-label">告警级别:</span>
                <span class="field-value">{{ .Labels.severity }}</span>
            </div>
            <div class="alert-field">
                <span class="field-label">目标机器:</span>
                <span class="field-value">{{ .Labels.instance }}</span>
            </div>
            <div class="alert-field">
                <span class="field-label">告警摘要:</span>
                <span class="field-value">{{ .Annotations.summary }}</span>
            </div>
            <div class="alert-field">
                <span class="field-label">触发时间:</span>
                {{- /* 28800e9 ns = 8 h; presumably shifts a UTC timestamp to UTC+8. */}}
                <span class="field-value">{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}</span>
            </div>
            {{- if .Annotations.description }}
            <div class="divider"></div>
            <div class="alert-field">
                <span class="field-label">详细描述:</span>
                <span class="field-value">{{ .Annotations.description }}</span>
            </div>
            {{- end }}
        </div>
    {{- end }}
{{- end }}

{{- /* Resolved section: iterate only over the resolved alerts of this notification. */ -}}
{{- if gt (len .Alerts.Resolved) 0 -}}
    {{- range .Alerts.Resolved -}}
    <div class="alert-card alert-resolved">
        <div class="alert-title">
            告警恢复通知
        </div>
        <div>
            <img src="https://tse2-mm.cn.bing.net/th/id/OIP-C.n7AyZv_wWXqFCc1mtlGhFgHaHa?rs=1&pid=ImgDetMain" width="300" height="300">
        </div>
        <div class="alert-field">
            <span class="field-label">告警名称:</span>
            <span class="field-value">{{ .Labels.alertname }}</span>
        </div>
        <div class="alert-field">
            <span class="field-label">目标机器:</span>
            <span class="field-value">{{ .Labels.instance }}</span>
        </div>
        <div class="alert-field">
            <span class="field-label">告警摘要:</span>
            <span class="field-value">[ {{ .Annotations.summary }}] 此告警已经恢复~</span>
        </div>
        <div class="alert-field">
            <span class="field-label">恢复时间:</span>
            <span class="field-value">{{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}</span>
        </div>
        {{- if .Annotations.description }}
        <div class="alert-field">
            <span class="field-label">详细描述:</span>
            <span class="field-value">{{ .Annotations.description }}</span>
        </div>
        {{- end }}
    </div>
    {{- end }}
{{- end }}

</body>
</html>
{{ end }}
EOF

# Start Alertmanager on the host network. The template file is mounted as well,
# because alertmanager.yml loads /etc/alertmanager/*.tmpl and the e-mail body
# refers to the "custom.alert.message" template defined in it.
docker run -d   --name=alertmanager   --net="host" \
  -v /opt/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  -v /opt/alertmanager/template.tmpl:/etc/alertmanager/template.tmpl \
  prom/alertmanager \
  --config.file=/etc/alertmanager/alertmanager.yml --web.listen-address=:8181 --cluster.listen-address=:8182

image

  

image

 

posted on 2026-01-29 11:41  我,在等待  阅读(0)  评论(0)    收藏  举报