prometheus(监控服务)+alertmanager(告警)二进制部署
目录
环境及包
- 操作系统:centos7.6
- 下载地址:https://prometheus.io/download/
- prometheus X86版本:prometheus-2.42.0.linux-amd64.tar.gz
- prometheus arm版本:prometheus-2.42.0.linux-arm64.tar.gz
- alertmanager X86版本:alertmanager-0.25.0.linux-amd64.tar.gz
- alertmanager arm版本:alertmanager-0.25.0.linux-arm64.tar.gz
1、准备
# Create the data/conf/var/opt directory layout for both services in one call
mkdir -p /home/data/apps/{prometheus,alertmanager}/{data,conf,var,opt}
# Unpack each release tarball into the opt directory of its service
tar --extract --verbose --file=prometheus-2.42.0.linux-amd64.tar.gz --directory=/home/data/apps/prometheus/opt
tar --extract --verbose --file=alertmanager-0.25.0.linux-amd64.tar.gz --directory=/home/data/apps/alertmanager/opt
2、复制默认配置文件
# Copy the sample configuration shipped in each release into the service's conf directory
cp -t /home/data/apps/prometheus/conf/ /home/data/apps/prometheus/opt/prometheus-2.42.0.linux-amd64/prometheus.yml
cp -t /home/data/apps/alertmanager/conf/ /home/data/apps/alertmanager/opt/alertmanager-0.25.0.linux-amd64/alertmanager.yml
3、生成prometheus服务启动管理配置
# Write the systemd unit for Prometheus.
# The heredoc delimiter is quoted so the shell writes $MAINPID literally; with an
# unquoted EOF the shell expands it to an empty string and ExecReload is written
# as "/bin/kill -HUP " with no PID.
cat > /usr/lib/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=prometheus
[Service]
ExecStart=/home/data/apps/prometheus/opt/prometheus-2.42.0.linux-amd64/prometheus --config.file=/home/data/apps/prometheus/conf/prometheus.yml --web.listen-address=:19091 --storage.tsdb.path=/home/data/apps/prometheus/data --storage.tsdb.retention.time=6w --storage.tsdb.retention.size=1500GB
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
4、生成alertmanager服务启动管理配置,注意:如果是集群模式必须把--cluster.advertise-address改为可对外的真实IP
# Write the systemd unit for Alertmanager.
# The heredoc delimiter is quoted so the shell writes $MAINPID literally; with an
# unquoted EOF the shell expands it to an empty string and ExecReload is written
# as "/bin/kill -HUP " with no PID.
cat > /usr/lib/systemd/system/alertmanager.service << 'EOF'
[Unit]
Description=alertmanager
[Service]
ExecStart=/home/data/apps/alertmanager/opt/alertmanager-0.25.0.linux-amd64/alertmanager --config.file=/home/data/apps/alertmanager/conf/alertmanager.yml --web.listen-address=:19093 --storage.path=/home/data/apps/alertmanager/data --cluster.listen-address=0.0.0.0:19094 --cluster.advertise-address=0.0.0.0:19094 --data.retention=720h
WorkingDirectory=/home/data/apps/alertmanager/
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
5、启动服务
# For each service in turn: reload unit files, enable at boot, then (re)start it
for svc in prometheus alertmanager; do
  systemctl daemon-reload
  systemctl enable "$svc"
  systemctl restart "$svc"
done
6、/home/data/apps/prometheus/conf/prometheus.yml 配置
global:
  scrape_interval: 30s  # interval between scrapes of the targets
  evaluation_interval: 15s  # interval between alert rule evaluations
  # scrape_timeout is set to the global default (10s).

# Alertmanager service that receives alerts
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:19093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
# Alert rule files (uncomment the entries once the rule files exist)
rule_files:
  # - "/home/data/apps/prometheus/conf/rules/node-rules.yml"
  # - "second_rules.yml"

# Scrape configurations; the first job is Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:19091"]

  - job_name: "node-exporter"  # monitoring type
    file_sd_configs:  # file-based discovery of targets
      - files:
          - /home/data/apps/prometheus/conf/targets/node-exporter.json  # target definition file
        refresh_interval: 5s  # interval at which the target file is re-read

  - job_name: "windows-exporter"  # monitoring type
    file_sd_configs:  # file-based discovery of targets
      - files:
          - /home/data/apps/prometheus/conf/targets/windows-exporter.json  # target definition file
        refresh_interval: 5s  # interval at which the target file is re-read
7. 检查配置文件是否正确
# Switch to the release directory and run its promtool against the deployed prometheus.yml
cd /home/data/apps/prometheus/opt/prometheus-2.42.0.linux-amd64
./promtool check config /home/data/apps/prometheus/conf/prometheus.yml
8. 添加信息抓取目标,注意: 按实际情况修改对应字段
- project:项目名称
- app: 应用系统
# Create the file_sd targets directory and switch into it
mkdir -p /home/data/apps/prometheus/conf/targets
cd /home/data/apps/prometheus/conf/targets
# Create both (initially empty) target files
touch {node,windows}-exporter.json
# node-exporter.json: scrape targets for Linux machines.
# Each entry lists target addresses plus the labels attached to them;
# adjust the address and label values to the real environment.
cat > node-exporter.json << EOF
[
{
"targets": [ "192.168.233.102:19100"],
"labels": {
"instance": "192.168.233.102",
"system": "linux",
"project": "XMMC",
"app": "YYXT",
"contact": "huangkun"
}
}
]
EOF
# windows-exporter.json: scrape targets for Windows machines.
# Each entry's "instance" label matches the host in its own "targets" address and
# "system" is "windows". Addresses, ports and label values are examples;
# adjust them to the real environment.
cat > windows-exporter.json << EOF
[
  {
    "targets": [ "192.168.233.102:19100"],
    "labels": {
      "instance": "192.168.233.102",
      "system": "windows",
      "project": "XMMC",
      "app": "YYXT",
      "contact": "huangkun"
    }
  },
  {
    "targets": [ "192.168.233.101:19100"],
    "labels": {
      "instance": "192.168.233.101",
      "system": "windows",
      "project": "XMMC",
      "app": "YYXT",
      "contact": "huangkun"
    }
  }
]
EOF
9、添加告警规则,记得修改prometheus.yml 对应的rule_files
# Create the rules directory under /home/data/apps/prometheus/conf (the same conf
# directory that holds prometheus.yml) and switch into it
mkdir -p /home/data/apps/prometheus/conf/rules
cd /home/data/apps/prometheus/conf/rules
# Create the rule file
touch node-rules.yml
# Example alert rules for Linux hosts; remember to list this file under rule_files in prometheus.yml.
# The heredoc delimiter is quoted so the shell leaves $labels and $value untouched
# for Prometheus' alert templating; an unquoted EOF would expand them to empty strings.
cat > node-rules.yml << 'EOF'
groups:
  - name: node-exporter
    rules:
      - alert: NodeStatus
        expr: up{job="node-exporter"} == 0
        for: 1s
        labels:
          severity: warning
        annotations:
          summary: "IP: {{ $labels.instance }}"
          description: "服务器连接异常!"
      - alert: NodeDisk
        expr: (1 - node_filesystem_avail_bytes{job="node-exporter",fstype=~"ext4|xfs"} / node_filesystem_size_bytes{job="node-exporter",fstype=~"ext4|xfs"}) * 100 > 10
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "IP: {{ $labels.instance }}"
          description: "服务器磁盘设备 {{ $labels.device }} 空间已使用 {{ printf \"%.0f\" $value }}%"
      - alert: NodeCPU
        expr: (1 - avg by (environment,instance,system,project,contact) (irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[3m]))) * 100 > 10
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "IP: {{ $labels.instance }}"
          description: "服务器CPU已使用 {{ printf \"%.0f\" $value }}%"
      - alert: NodeMemory
        expr: (1-(node_memory_MemAvailable_bytes{job="node-exporter"} / (node_memory_MemTotal_bytes{job="node-exporter"})))* 100 > 50
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "IP: {{ $labels.instance }}"
          description: "服务器内存已使用 {{ printf \"%.0f\" $value }}%"
EOF
# Example alert rules for Windows hosts; remember to list this file under rule_files in prometheus.yml.
# The heredoc delimiter is quoted so the shell leaves $labels and $value untouched
# for Prometheus' alert templating; an unquoted EOF would expand them to empty strings.
# NOTE(review): WindowsMemory uses the os/cs collector metric names; newer
# windows_exporter releases are believed to expose windows_memory_physical_free_bytes /
# windows_memory_physical_total_bytes instead - confirm against the deployed exporter.
cat > windows-rules.yml << 'EOF'
groups:
  - name: windows-exporter
    rules:
      - alert: WindowsStatus
        expr: up{job="windows-exporter"} == 0
        for: 1s
        labels:
          severity: warning
        annotations:
          summary: "IP: {{ $labels.instance }}"
          description: "服务器连接异常!"
      - alert: WindowsDisk
        expr: 100.0 - 100 * ((windows_logical_disk_free_bytes{volume!~"Harddisk.*"} / 1024 / 1024 ) / (windows_logical_disk_size_bytes{volume!~"Harddisk.*"} / 1024 / 1024)) > 10
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "IP: {{ $labels.instance }}"
          description: "服务器{{ $labels.volume }}磁盘空间已使用{{ printf \"%.0f\" $value }}%"
      - alert: WindowsCPU
        expr: 100 - (avg by (instance,system,project,contact) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 10
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "IP: {{ $labels.instance }}"
          description: "服务器CPU已使用{{ printf \"%.0f\" $value }}%"
      - alert: WindowsMemory
        expr: 100.0 - 100 * (windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) > 10
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "IP: {{ $labels.instance }}"
          description: "服务器内存已使用{{ printf \"%.0f\" $value }}%"
EOF
前事不忘,后事之师