prometheus 监控部署

1 架构

# 监控服务器 需要安装3个服务:192.168.164.110
    # Prometheus Server(监控主服务器 )
    # Grafana         (展示监控界面)
    # alertmanager    (报警功能)
# 被监控主机 需要安装3个服务:192.168.164.111
    # Node Exporter (收集 Host 硬件和操作系统信息)
    # cAdvisor     (收集 Host上运行的 容器信息)
    # gpu_exporter (收集 Host上运行的 GPU信息)

2 安装docker and docker-compose

2.1 安装docker依赖
# Refresh the package index, then install the packages needed to add an
# HTTPS apt repository and fetch its signing key.
apt-get update
apt-get install \
  apt-transport-https \
  ca-certificates \
  curl \
  software-properties-common
2.2 安装docker源
# Import the docker-ce signing key and register the Aliyun mirror as an apt
# source, then refresh the package index.
# Both URLs use HTTPS so the key and package lists cannot be altered in
# transit; apt-transport-https and ca-certificates were installed in step 2.1.
curl -fsSL https://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | sudo apt-key add -
add-apt-repository "deb [arch=amd64] https://mirrors.aliyun.com/docker-ce/linux/ubuntu $(lsb_release -cs) stable"
apt-get -y update
2.3 查看源
apt-cache madison docker-ce
2.4 安装特定版本
apt  install docker-ce=18.03.0~ce-0~ubuntu

3 基础配置文件启动prom容器

3.1 采集配置
mkdir -p /data/prom/conf
vim /data/prom/conf/prometheus.yml
# Global settings
global:
  scrape_interval:     15s  # interval between scrapes
  evaluation_interval: 15s  # interval between rule evaluations

# Alertmanager that receives the alerts
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '192.168.164.110:9093'

# Rule files to load
rule_files:
  - 'node_down.yml'

# Scrape jobs
scrape_configs:
  # 1.1 Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - 'localhost:9090'

  # 1.2 Host metrics (node exporter)
  - job_name: 'mon_node'
    static_configs:
      - targets:
          - '192.168.164.110:9100'
      - targets:
          - '192.168.164.111:9100'

  # 1.3 Container metrics (cAdvisor)
  - job_name: 'mon_docker'
    static_configs:
      - targets:
          - '192.168.164.110:8080'
      - targets:
          - '192.168.164.111:8080'

  # 1.4 GPU metrics (GPU exporter)
  - job_name: 'mon_gpu'
    static_configs:
      - targets:
          - '192.168.164.111:9445'
3.2 邮件告警配置文件
vim /data/prom/conf/alertmanager.yml

# Alertmanager configuration: send every alert by e-mail through 163.com SMTP.
global:
  smtp_smarthost: 'smtp.163.com:25'         # 163.com SMTP server
  smtp_from: 'litao_59@163.com'          # sender address
  smtp_auth_username: 'litao_59@163.com'    # SMTP login user (the sender mailbox)
  # NOTE(review): replace with your own credential and keep real secrets out of
  # published documents and version control.
  smtp_auth_password: 'AWRQAGIKTSVHUOAT'   # mailbox password; for 163.com this is the authorization code
  smtp_require_tls: false              # do not require TLS

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 10m
  # Must name a receiver defined under `receivers`; an undefined name makes
  # Alertmanager reject the configuration.
  receiver: prom-mon

receivers:
- name: 'prom-mon'
  email_configs:
  - to: 'litao_59@163.com'            # recipient address
3.3 报警规则
vim /data/prom/conf/node_down.yml

# Alerting rules; this file is referenced from rule_files in prometheus.yml.
groups:
- name: node_down
  rules:
  # Raised for any scrape target whose `up` value is 0.
  - alert: InstanceDown
    expr: up == 0
    for: 1m  # the expression must stay true for 1 minute before the alert fires
    labels:
      user: test
    annotations:
      # {{ $labels.* }} are filled in from the labels of the alerting series.
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
给权限

# Create all three host directories first: only /data/prom/conf was created
# earlier, and chmod fails on paths that do not exist yet.
mkdir -p /data/prom/{conf,data,grafana}
# Make them writable for the container processes that use these bind mounts
# (the permission errors this avoids are shown below).
chmod 777 -R /data/prom/{conf,data,grafana}
给权限原因
# prometheus  conf  data
level=error ts=2021-08-05T07:43:27.346Z caller=query_logger.go:87 component=activeQueryTracker msg="Error opening query log file" file=/prometheus/queries.active err="open /prometheus/queries.active: permission denied"
panic: Unable to create mmap-ed active query log

# grafana
mkdir: can't create directory '/var/lib/grafana/plugins': Permission denied
GF_PATHS_DATA='/var/lib/grafana' is not writable.

4 启动各个监控容器

4.1 启动客户端 主机硬件信息采集容器 mon_node_exporter(客户端)
# Start node exporter with the host's / and /sys mounted read-only.
# --path.rootfs must match the bind-mount target of the host root so that
# filesystem metrics are reported with the host's mount points rather than
# the container's view of them.
docker run -itd \
-p 9100:9100 \
-v /:/rootfs:ro  \
-v /sys:/sys:ro \
--name mon_node-exporter \
--hostname mon_node-exporter \
--restart always \
prom/node-exporter:latest \
--path.rootfs=/rootfs
可以测试 http://192.168.164.110:9100/metrics


4.2 启动客户端 容器信息采集容器 mon_cadvisor (客户端)
# Run cAdvisor detached, publish port 8080 and bind-mount the host paths
# listed below into the container.
docker run \
  --interactive --tty --detach \
  --name mon_cadvisor \
  --hostname mon_cadvisor \
  --restart always \
  --publish 8080:8080 \
  --volume /:/rootfs:ro \
  --volume /var/run:/var/run:rw \
  --volume /sys:/sys:ro \
  --volume /var/lib/docker:/var/lib/docker:ro \
  google/cadvisor:latest
4.3 启动客户端 GPU信息采集 容器(前提是安装显卡驱动和nvidia-docker2)
# Start the GPU exporter on the NVIDIA container runtime.
# nvidia-docker2 registers its runtime with the Docker daemon under the name
# "nvidia"; "nvidia-docker" is the wrapper command, not a runtime name, and
# is rejected as an unknown runtime.
docker run -itd \
--runtime nvidia \
-p 9445:9445 \
--name mon_gpu-node-exporter \
--restart always \
gpu_prometheus_node_exporter:v1.0.1
4.4 启动服务端 图形展示容器 mon_grafana (服务端)

# Run Grafana detached on port 3000, keeping its data directory on the host
# under /data/prom/grafana.
docker run \
  --interactive --tty --detach \
  --name mon_grafana \
  --hostname mon_grafana \
  --restart always \
  --publish 3000:3000 \
  --volume /data/prom/grafana:/var/lib/grafana \
  grafana/grafana:latest
4.5 启动服务端 报警容器 (服务端)
# Run Alertmanager detached on port 9093 with the host's alertmanager.yml
# mounted as its configuration file.
docker run \
  --interactive --tty --detach \
  --name mon_alertmanager \
  --hostname mon_alertmanager \
  --restart always \
  --publish 9093:9093 \
  --volume /data/prom/conf/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  prom/alertmanager:latest
4.6 启动服务端 主监控服务容器 mon_prometheus (服务端)
# Run the Prometheus server detached on port 9090. The scrape configuration
# and the alert rules come from /data/prom/conf; the host directory
# /data/prom/data is mounted at /prometheus.
docker run \
  --interactive --tty --detach \
  --name mon_prometheus \
  --hostname mon_prometheus \
  --restart always \
  --publish 9090:9090 \
  --volume /data/prom/conf/prometheus.yml:/etc/prometheus/prometheus.yml \
  --volume /data/prom/conf/node_down.yml:/etc/prometheus/node_down.yml \
  --volume /data/prom/data:/prometheus \
  prom/prometheus:latest

测试:

http://192.168.164.110:9090/graph

111

http://192.168.164.110:9090/targets

222

5 yml文件用于docker-compose部署

5.1 服务端yml文件编写 mon_prom_server.yml

# vim mon_prom_server.yml
# Compose file for the monitoring server: Prometheus, Alertmanager and Grafana.
version: "2"
services:
  # Main monitoring service
  prometheus:
    image: "prom/prometheus:latest"
    container_name: "mon_prometheus"
    hostname: "mon_prometheus"
    restart: "always"
    ports:
      - "9090:9090"
    expose:
      - "9090"
    volumes:
      - "/data/prom/conf/prometheus.yml:/etc/prometheus/prometheus.yml"
      - "/data/prom/conf/node_down.yml:/etc/prometheus/node_down.yml"
      - "/data/prom/data:/prometheus"

  # Alert delivery
  alertmanager:
    image: "prom/alertmanager"
    container_name: "alertmanager"
    hostname: "mon_alertmanager"
    restart: "always"
    ports:
      - "9093:9093"
    expose:
      - "9093"
    volumes:
      - "/data/prom/conf/alertmanager.yml:/etc/alertmanager/alertmanager.yml"

  # Dashboards
  grafana:
    image: "grafana/grafana:latest"
    container_name: "mon_grafana"
    hostname: "mon_grafana"
    restart: "always"
    ports:
      - "3000:3000"
    expose:
      - "3000"
    volumes:
      - "/data/prom/grafana:/var/lib/grafana"
5.2 客户端yml文件编写 mon_prom_client.yml
# vim mon_prom_client.yml
# Compose file for a monitored host: node exporter, cAdvisor and the GPU exporter.
# File format 2.3 is required for the `runtime` key used by the gpu service.
version: '2.3'
services:
  # Host hardware / OS metrics
  node-exporter:
    image: prom/node-exporter:latest
    container_name: mon_node_exporter
    hostname: mon_node_exporter
    restart: always
    # Same host mounts as the `docker run` variant of this exporter.
    volumes:
      - /:/rootfs:ro
      - /sys:/sys:ro
    # Must match the bind-mount target of the host root so filesystem metrics
    # show the host's mount points.
    command:
      - '--path.rootfs=/rootfs'
    expose:
      - 9100
    ports:
      - 9100:9100

  # Container metrics
  cadvisor:
    image: google/cadvisor:latest
    container_name: cadvisor
    hostname: mon_cadvisor
    restart: always
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    expose:
      - 8080
    ports:
      - 8080:8080

  # GPU metrics (needs the NVIDIA driver and nvidia-docker2 on the host)
  gpu:
    image: gpu_prometheus_node_exporter:v1.0.1
    container_name: mon_gpu
    hostname: mon_gpu
    restart: always
    # nvidia-docker2 registers its runtime under the name "nvidia".
    runtime: nvidia
    expose:
      - 9445
    ports:
      - 9445:9445
可以提前下载镜像
# Pull every image used in this guide ahead of time.
for image in \
  prom/prometheus \
  grafana/grafana \
  prom/alertmanager \
  prom/node-exporter \
  google/cadvisor:latest \
  gpu_prometheus_node_exporter:v1.0.1
do
  docker pull "$image"
done

6 测试grafana展示prometheus(第一次登录会让设置新密码)

http://192.168.164.110:3000/login 默认账号/密码:admin/admin
# 添加数据源(也就是prometheus的收集的数据)到grafana

333

# 添加prometheus监控服务器name and address

444

posted on 2025-10-22 11:20  luokeli  阅读(16)  评论(0)    收藏  举报

导航