Prometheus(二)
一、kubernetes 二进制部署的prometheus实现服务发现
1.1 kubernetes集群外部署prometheus
主机:10.0.0.61
1.1.1 下载二进制程序
mkdir /apps
cd /apps
wget https://github.com/prometheus/prometheus/releases/download/v2.40.7/prometheus-2.40.7.linux-amd64.tar.gz
tar -xvf prometheus-2.40.7.linux-amd64.tar.gz
ln -s /apps/prometheus-2.40.7.linux-amd64 /apps/prometheus
1.1.2 启动prometheus服务
# Write the systemd unit that runs the Prometheus server from /apps/prometheus.
# '>' (instead of '>>') makes the step repeatable: appending on a second run
# would duplicate every section of the unit file.
# The quoted 'EOF' delimiter keeps the body literal (no shell expansion).
cat > /etc/systemd/system/prometheus.service <<'EOF'
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/apps/prometheus/
ExecStart=/apps/prometheus/prometheus --config.file=/apps/prometheus/prometheus.yml --web.enable-lifecycle
[Install]
WantedBy=multi-user.target
EOF
启动服务
systemctl daemon-reload
systemctl enable --now prometheus.service
验证状态
# 查看服务状态
[root@prometheus-server apps]#systemctl is-active prometheus.service
active
# 查看端口
[root@prometheus-server apps]#ss -ntl|grep 9090
LISTEN 0 4096 *:9090 *:*

1.1.3 创建RBAC授权
允许从外访问k8s集群
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: monitoring
---
apiVersion: v1
kind: Secret
type: kubernetes.io/service-account-token
metadata:
name: monitoring-token
namespace: monitoring
annotations:
kubernetes.io/service-account.name: "prometheus"
---
# Cluster-wide read permissions for the "prometheus" ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
# Core-group objects; nodes/proxy allows scraping through the
# API server's node proxy path.
- apiGroups:
  - ""
  resources:
  - nodes
  - services
  - endpoints
  - pods
  - nodes/proxy
  verbs:
  - get
  - list
  - watch
# Ingress objects: "extensions" is kept for old clusters, while
# "networking.k8s.io" is the group that serves Ingress on current clusters.
- apiGroups:
  - "extensions"
  - "networking.k8s.io"
  resources:
  - ingresses
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - configmaps
  - nodes/metrics
  verbs:
  - get
# Non-resource endpoint, e.g. the API server's own /metrics.
- nonResourceURLs:
  - /metrics
  verbs:
  - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: monitoring
1.1.4 准备文件
# 准备token文件
## k8s集群中生成token
kubectl describe secrets monitoring-token -n monitoring|grep "token:"|awk '{print $2}' > k8s.token
## 复制文件至prometheus server服务器上,需提前在prometheus server上创建目录mkdir -p /apps/certs
ssh 10.0.0.61 'mkdir -p /apps/certs'
scp k8s.token 10.0.0.61:/apps/certs/
# 准备tls证书
## 复制k8s上ca.pem(或ca.crt)文件至prometheus server服务器上
scp /etc/kubernetes/ssl/ca.pem 10.0.0.61:/apps/certs/
查看prometheus server文件
[root@prometheus-server certs]#ls /apps/certs/
ca.pem k8s.token
1.2 实现node服务发现
1.2.1 配置node发现规则
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_timeout: 10s
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
- job_name: 'kubernetes-node'
kubernetes_sd_configs:
- role: node
api_server: https://10.0.0.10:6443 # k8s master VIP
tls_config:
ca_file: /apps/certs/ca.pem
bearer_token_file: /apps/certs/k8s.token
scheme: http
tls_config:
ca_file: /apps/certs/ca.pem
bearer_token_file: /apps/certs/k8s.token
relabel_configs:
- source_labels: [__address__]
target_label: __address__
regex: '(.*):10250'
replacement: '${1}:9100'
action: replace
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
1.2.2 验证node发现
标签替换、引用
1.2.3 grafana展示
- 创建新数据源
- 展示监控
模板:11704
1.2.4 node常见监控指标
node_cpu_ CPU相关指标
node_boot_time 系统启动时刻的时间戳(当前时间减去该值即为系统已运行时长)
node_disk* 磁盘IO
node_filesystem* 系统文件使用量
node_load1 系统CPU负载
node_memory* 内存使用量
node_network* 网络带宽指标
go_* node exporter中go相关指标
process_* node exporter自身进程相关运行指标
1.3 实现cadvisor服务发现
1.3.1 配置cadvisor发现规则
......
- job_name: 'kubernetes-nodes-cadvisor'
kubernetes_sd_configs:
- role: node
api_server: https://10.0.0.10:6443 # k8s master VIP
tls_config:
ca_file: /apps/certs/ca.pem
bearer_token_file: /apps/certs/k8s.token
scheme: https
tls_config:
ca_file: /apps/certs/ca.pem
bearer_token_file: /apps/certs/k8s.token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: '10.0.0.10:6443' # k8s VIP
#replacement: kubernetes.default.svc:443 # 以pod方式部署k8s集群内,可用service访问时
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
metric_relabel_configs:
- action: replace
source_labels: [id]
regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
target_label: rkt_container_name
replacement: '${2}-${1}'
- action: replace
source_labels: [id]
regex: '^/system\.slice/(.+)\.service$'
target_label: systemd_service_name
replacement: '${1}'
查看cadvisor数据
# Load the service-account token and fetch the cAdvisor metrics of node
# 10.0.0.12 through the API server's node proxy endpoint.
TOKEN=$(cat /apps/certs/k8s.token)
curl --cacert /apps/certs/ca.pem \
  -H "Authorization: Bearer $TOKEN" \
  https://10.0.0.10:6443/api/v1/nodes/10.0.0.12/proxy/metrics/cadvisor
1.3.2 验证cadvisor发现
标签替换
1.3.3 grafana展示
导入模板:14282
pod名称显示异常,可按如下方法修改,其中cadvisor版本为0.45.0:
- 查看prometheus label
name="f342b6b20fb6b87f546db34808b8b16d09c4809cd2e42c027ca8ff99780696bb"
pod="cadvisor-4ds2w"
- 修改模板
将name更改为pod,示例如下
## 原命令
sum(rate(container_cpu_usage_seconds_total{instance=~"$host",name=~"$container",name=~".+"}[5m])) by (name) *100
## 更改后命令
sum(rate(container_cpu_usage_seconds_total{instance=~"$host",name=~"$container",name=~".+"}[5m])) by (pod) *100
- 展示
pod名称显示正常
二、prometheus 基于consul、file实现服务发现
2.1 consul服务发现
consul是分布式k/v数据存储集群,目前常用于服务的服务注册和发现。
2.1.1 部署consul集群
下载地址:https://releases.hashicorp.com/consul/
主机清单
node1 10.0.0.71
node2 10.0.0.72
node3 10.0.0.73
2.1.1.1 下载consul二进制程序
# 下载
wget https://releases.hashicorp.com/consul/1.14.0/consul_1.14.0_linux_amd64.zip
unzip consul_1.14.0_linux_amd64.zip
scp consul 10.0.0.71:/usr/local/bin/
scp consul 10.0.0.72:/usr/local/bin/
scp consul 10.0.0.73:/usr/local/bin/
# 创建数据目录
ssh 10.0.0.71 "mkdir -p /data/consul"
ssh 10.0.0.72 "mkdir -p /data/consul"
ssh 10.0.0.73 "mkdir -p /data/consul"
2.1.1.2 启动服务
#node1
nohup consul agent -server -bootstrap -bind=10.0.0.71 -client=10.0.0.71 -data-dir=/data/consul -ui -node=10.0.0.71 &
#node2
nohup consul agent -bind=10.0.0.72 -client=10.0.0.72 -data-dir=/data/consul -node=10.0.0.72 -join=10.0.0.71 &
#node3
nohup consul agent -bind=10.0.0.73 -client=10.0.0.73 -data-dir=/data/consul -node=10.0.0.73 -join=10.0.0.71 &
参数说明
consul agent -server 使用server模式运行consul服务
-bootstrap 首次部署使用初始化模式
-bind 设置集群通信的监听地址
-client 设置客户端访问的监听地址
-data-dir 数据目录
-ui 启动内置静态web UI服务器
-node 此节点名称,集群中必须唯一
-datacenter=dc1 集群名称,默认为dc1
-join 加入到已有consul环境
2.1.1.3 验证集群
查看日志
[root@consul-server1 ~]#tail -f nohup.out
2023-03-03T23:47:21.201+0800 [INFO] agent.server: federation state anti-entropy synced
2023-03-03T23:47:21.202+0800 [INFO] agent.leader: stopping routine: routine="virtual IP version check"
2023-03-03T23:47:21.202+0800 [INFO] agent.leader: stopped routine: routine="virtual IP version check"
2023-03-03T23:47:22.549+0800 [INFO] agent: Synced node info
2023-03-03T23:47:22.687+0800 [ERROR] agent.server.autopilot: Failed to reconcile current state with the desired state
2023-03-03T23:47:26.947+0800 [INFO] agent.server.serf.lan: serf: EventMemberJoin: 10.0.0.72 10.0.0.72
2023-03-03T23:47:26.947+0800 [INFO] agent.server: member joined, marking health alive: member=10.0.0.72 partition=default
2023-03-03T23:47:34.219+0800 [INFO] agent.server.serf.lan: serf: EventMemberJoin: 10.0.0.73 10.0.0.73
2023-03-03T23:47:34.219+0800 [INFO] agent.server: member joined, marking health alive: member=10.0.0.73 partition=default
2023-03-03T23:47:40.891+0800 [INFO] agent: Newer Consul version available: new_version=1.15.0 current_version=1.14.0
查看监听端口
[root@consul-server1 ~]#netstat -ntlp|grep consul
tcp 0 0 10.0.0.71:8503 0.0.0.0:* LISTEN 3546/consul
tcp 0 0 10.0.0.71:8600 0.0.0.0:* LISTEN 3546/consul
tcp 0 0 10.0.0.71:8300 0.0.0.0:* LISTEN 3546/consul
tcp 0 0 10.0.0.71:8301 0.0.0.0:* LISTEN 3546/consul
tcp 0 0 10.0.0.71:8302 0.0.0.0:* LISTEN 3546/consul
tcp 0 0 10.0.0.71:8500 0.0.0.0:* LISTEN 3546/consul
查看web,访问IP:8500
2.1.1.4 测试数据
- 通过consul API写入数据
curl -X PUT -d '{"id":"node-exporter1","name":"node-exporter1","address":"10.0.0.11","port":9100,"tags":["node-exporter"],"checks":[{"http":"http://10.0.0.11:9100","interval":"5s"}]}' \
http://10.0.0.71:8500/v1/agent/service/register
curl -X PUT -d '{"id":"node-exporter2","name":"node-exporter2","address":"10.0.0.12","port":9100,"tags":["node-exporter"],"checks":[{"http":"http://10.0.0.12:9100","interval": "5s"}]}' \
http://10.0.0.71:8500/v1/agent/service/register
curl -X PUT -d '{"id":"node-exporter3","name":"node-exporter3","address":"10.0.0.13","port":9100,"tags":["node-exporter"],"checks":[{"http":"http://10.0.0.13:9100","interval": "5s"}]}' \
http://10.0.0.71:8500/v1/agent/service/register
- consul验证数据
说明,consul删除服务命令:
curl --request PUT http://10.0.0.71:8500/v1/agent/service/deregister/node-exporter1
2.1.2 consul服务发现
2.1.2.1 二进制prometheus配置发现规则
- job_name: 'consul'
honor_labels: true
metrics_path: "/metrics"
scheme: http
consul_sd_configs:
- server: 10.0.0.71:8500
services: [] # 发现的目标服务名称,空为所有服务,可以写servicea,serviceb,servicec
- server: 10.0.0.72:8500
services: []
- server: 10.0.0.73:8500
services: []
relabel_configs:
- source_labels: [__meta_consul_tags]
target_label: 'product'
- source_labels: [__meta_consul_dc]
target_label: 'idc'
- source_labels: [__meta_consul_service]
regex: 'consul'
action: drop # 删除consul本机监控
2.1.2.2 prometheus验证数据
未匹配删除__meta_consul_service='consul'时
2.2 file服务发现
2.2.1 准备文件
- yaml文件
- targets: ['10.0.0.11:9100','10.0.0.12:9100','10.0.0.13:9100']
- json文件
[
{
"targets": ["10.0.0.41:9100","10.0.0.42:9100","10.0.0.43:9100"]
}
]
2.2.2 prometheus配置规则
- job_name: 'file_sd'
file_sd_configs:
- files: # 支持yaml和json格式文件
- /apps/prometheus/file_sd/file_sd.yaml # 支持*模糊匹配
- /apps/prometheus/file_sd/file_sd.json # 支持*模糊匹配
refresh_interval: 10s # 重新读取文件的刷新时间,若file文件内容改动,会自动发现变更内容
2.2.3 验证数据
json和yaml格式文件都可以正常发现
三、prometheus 监控案例-kube-state-metrics
https://github.com/kubernetes/kube-state-metrics
kube-state-metrics:通过监听API server生成有关资源对象的状态指标,比如Deployment、Node、Pod,需要注意的是kube-state-metrics使用场景不是用于监控对方是否存活,而是周期性获取目标对象metrics指标数据并在web界面进行显示或被prometheus抓取(如pod状态是running还是terminating、pod的创建时间等),目前kube-state-metrics收集的指标数据可参见官方文档
https://github.com/kubernetes/kube-state-metrics/tree/main/docs,kube-state-metrics并不会存储这些指标数据,所以可以使用prometheus来抓取这些数据然后存储,主要关注的是业务相关的一些元数据,如deployment、pod、副本状态等,调度了多少个replicas,现在可用的有几个,多少个pod是running/stopped/terminated状态,pod重启了多少次,目前有多少job在运行中
镜像地址:
https://hub.docker.com/r/bitnami/kube-state-metrics
https://quay.io/repository/coreos/kube-state-metrics?tag=latest&tab=tags
指标:
https://xie.infoq.cn/article/9e1fff6306649e65480a96bb1
3.1 部署kube-state-metrics
下载镜像
docker pull bitnami/kube-state-metrics:2.8.0
docker tag bitnami/kube-state-metrics:2.8.0 harbor.chu.net/baseimages/kube-state-metrics:2.8.0
docker push harbor.chu.net/baseimages/kube-state-metrics:2.8.0
编写yaml文件
apiVersion: apps/v1
kind: Deployment
metadata:
name: kube-state-metrics
namespace: kube-system
spec:
replicas: 1
selector:
matchLabels:
app: kube-state-metrics
template:
metadata:
labels:
app: kube-state-metrics
spec:
serviceAccountName: kube-state-metrics
containers:
- name: kube-state-metrics
# image: bitnami/kube-state-metrics:2.8.0 # 网络镜像
image: harbor.chu.net/baseimages/kube-state-metrics:2.8.0 # 本地镜像
ports:
- containerPort: 8080
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-state-metrics
namespace: kube-system
---
# Read-only (list/watch) permissions for the kube-state-metrics ServiceAccount.
# NOTE(review): kube-state-metrics 2.x may list further resources by default
# (e.g. configmaps, secrets, ingresses) - compare with the upstream
# cluster-role.yaml and extend if "forbidden" errors show up in the pod log.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
- apiGroups: [""]
  resources: ["nodes", "pods", "services", "resourcequotas", "replicationcontrollers", "limitranges", "persistentvolumeclaims", "persistentvolumes", "namespaces", "endpoints"]
  verbs: ["list", "watch"]
# Legacy group, kept so the role still works on very old clusters.
- apiGroups: ["extensions"]
  resources: ["daemonsets", "deployments", "replicasets"]
  verbs: ["list", "watch"]
# Workload controllers are served from the "apps" group on current clusters,
# so the grant must exist here as well, not only under "extensions".
- apiGroups: ["apps"]
  resources: ["statefulsets", "daemonsets", "deployments", "replicasets"]
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources: ["cronjobs", "jobs"]
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources: ["horizontalpodautoscalers"]
  verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: kube-system
---
apiVersion: v1
kind: Service
metadata:
annotations:
prometheus.io/scrape: 'true'
name: kube-state-metrics
namespace: kube-system
labels:
app: kube-state-metrics
spec:
type: NodePort
ports:
- name: kube-state-metrics
port: 8080
targetPort: 8080
nodePort: 31666
protocol: TCP
selector:
app: kube-state-metrics
查看状态
# 查看pod
[root@k8s-deploy ~]#kubectl get pod -n kube-system
NAME READY STATUS RESTARTS AGE
...
kube-state-metrics-6bc4545d76-xbdb2 1/1 Running 0 52s
# 查看service
[root@k8s-deploy case]#kubectl get svc -n kube-system
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
kube-dns ClusterIP 10.100.0.2 <none> 53/UDP,53/TCP,9153/TCP 43d
kube-state-metrics NodePort 10.100.81.152 <none> 8080:31666/TCP 2m47s
kubelet ClusterIP None <none> 10250/TCP,10255/TCP,4194/TCP 11d
3.2 验证数据
metrics
healthz
3.3 配置prometheus采集数据
- job_name: 'kube-state-metrics'
static_configs:
- targets: ["10.0.0.10:31666"] # k8s VIP
3.4 验证prometheus状态
3.5 grafana展示
- 模板13824
修改查询条件,显示masters/nodes数据
count(node_load1{kubernetes_io_role='master'})
count(node_load1{kubernetes_io_role='node'})
- 模板14518
四、prometheus 监控案例-Tomcat、Redis、Mysql、Haproxy、Nginx
4.1 监控Tomcat
https://github.com/nlighten/tomcat_exporter
监控tomcat活跃连接数、堆栈内存等信息
# tomcat活跃连接数
tomcat_connections_active_total{name='http-nio-8080',}
# jvm内存
jvm_memory_bytes_used{area='heap',}
4.1.1 构建镜像
- 下载jar/war包
地址:https://repo1.maven.org/maven2/io/prometheus/
TOMCAT_SIMPLECLIENT_VERSION=0.8.0
TOMCAT_EXPORTER_VERSION=0.0.12
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_common/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_common-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_hotspot/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_hotspot-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_servlet/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_servlet-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_servlet_common/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_servlet_common-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/nl/nlighten/tomcat_exporter_client/${TOMCAT_EXPORTER_VERSION}/tomcat_exporter_client-${TOMCAT_EXPORTER_VERSION}.jar
curl -O https://repo1.maven.org/maven2/nl/nlighten/tomcat_exporter_servlet/${TOMCAT_EXPORTER_VERSION}/tomcat_exporter_servlet-${TOMCAT_EXPORTER_VERSION}.war
- 编写Dockerfile
FROM tomcat:8.5.73
ADD server.xml /usr/local/tomcat/conf/server.xml
RUN mkdir /data/tomcat/webapps -p
ADD myapp /data/tomcat/webapps/myapp
ADD metrics.war /data/tomcat/webapps
ADD simpleclient-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_common-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_hotspot-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_servlet-0.8.0.jar /usr/local/tomcat/lib/
ADD tomcat_exporter_client-0.0.12.jar /usr/local/tomcat/lib/
EXPOSE 8080 8443 8009
- 构建镜像
docker build -t harbor.chu.net/web/tomcat-app1:v1 .
docker push harbor.chu.net/web/tomcat-app1:v1
4.1.2 测试镜像
docker run -it --rm -p 8080:8080 harbor.chu.net/web/tomcat-app1:v1
验证/metrics页面
4.1.3 部署tomcat
apiVersion: apps/v1
kind: Deployment
metadata:
name: tomcat-deployment
namespace: default
spec:
selector:
matchLabels:
app: tomcat
replicas: 1
template:
metadata:
labels:
app: tomcat
annotations:
prometheus.io/scrape: 'true'
spec:
containers:
- name: tomcat
image: harbor.chu.net/web/tomcat-app1:v1
imagePullPolicy: Always
ports:
- containerPort: 8080
securityContext:
privileged: true
---
kind: Service
apiVersion: v1
metadata:
annotations:
prometheus.io/scrape: 'true'
name: tomcat-service
spec:
selector:
app: tomcat
ports:
- nodePort: 31080
port: 80
protocol: TCP
targetPort: 8080
type: NodePort
创建pod
[root@k8s-deploy yaml]#kubectl apply -f tomcat-deploy.yaml
[root@k8s-deploy yaml]#kubectl get svc
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
kubernetes ClusterIP 10.100.0.1 <none> 443/TCP 45d
nodeport-nginx-service NodePort 10.100.225.73 <none> 80:30120/TCP 37d
tomcat-service NodePort 10.100.53.196 <none> 80:31080/TCP 6s
4.1.4 配置prometheus采集数据
- job_name: 'tomcat-monitor-metrics'
static_configs:
- targets: ["10.0.0.13:31080"] # 可配置VIP
4.1.5 grafana展示
模板:https://github.com/nlighten/tomcat_exporter/blob/master/dashboard/example.json
4.2 监控Redis
https://github.com/oliver006/redis_exporter
4.2.1 部署redis
下载redis_exporter镜像
docker pull oliver006/redis_exporter:v1.45.0
docker tag oliver006/redis_exporter:v1.45.0 harbor.chu.net/web/redis_exporter:v1.45.0
docker push harbor.chu.net/web/redis_exporter:v1.45.0
编写yaml文件
apiVersion: apps/v1
kind: Deployment
metadata:
name: redis
namespace: web
spec:
replicas: 1
selector:
matchLabels:
app: redis
template:
metadata:
labels:
app: redis
spec:
containers:
- name: redis
image: redis:4.0.14
resources:
requests:
cpu: 100m
memory: 100Mi
ports:
- containerPort: 6379
- name: redis-exporter
# image: oliver006/redis_exporter:latest
image: harbor.chu.net/web/redis_exporter:v1.45.0
resources:
requests:
cpu: 100m
memory: 100Mi
ports:
- containerPort: 9121
---
kind: Service
apiVersion: v1
metadata:
annotations:
prometheus.io/scrape: 'true'
prometheus.io/port: "9121"
name: redis-exporter-service
namespace: web
spec:
selector:
app: redis
ports:
- nodePort: 39121
name: prom
port: 9121
protocol: TCP
targetPort: 9121
type: NodePort
---
kind: Service
apiVersion: v1
metadata:
# annotations:
# prometheus.io/scrape: 'false'
name: redis-redis-service
namespace: web
spec:
selector:
app: redis
ports:
- nodePort: 36379
name: redis
port: 6379
protocol: TCP
targetPort: 6379
type: NodePort
查看状态
[root@k8s-deploy yaml]#kubectl get pod -n web
NAME READY STATUS RESTARTS AGE
redis-6969686c88-qcc47 2/2 Running 0 50s
[root@k8s-deploy yaml]#kubectl get svc -n web
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
redis-exporter-service NodePort 10.100.199.125 <none> 9121:39121/TCP 6s
redis-redis-service NodePort 10.100.203.43 <none> 6379:36379/TCP 6s
4.2.2 验证metrics
4.2.3 配置prometheus
- 静态配置
- job_name: redis_exporter
static_configs:
- targets: ['10.0.0.13:39121'] # 可配置VIP
- 【可选】配置kubernetes API动态服务发现
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: instance
regex: (.*redis.*)
4.2.4 grafana展示
4.3 监控Mysql
https://github.com/prometheus/mysqld_exporter
4.3.1 安装mysql
- 安装mysql
[root@prometheus-server ~]#apt install mariadb-server -y
- 授权监控账户权限
# 登录mysql
[root@prometheus-server ~]#mysql
MariaDB [(none)]> CREATE USER 'mysql_exporter'@'localhost' IDENTIFIED BY '123456' WITH MAX_USER_CONNECTIONS 3;
Query OK, 0 rows affected (0.001 sec)
MariaDB [(none)]> GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'mysql_exporter'@'localhost';
Query OK, 0 rows affected (0.000 sec)
- 验证权限
[root@prometheus-server prometheus]#mysql -umysql_exporter -p123456 -hlocalhost
Welcome to the MariaDB monitor. Commands end with ; or \g.
Your MariaDB connection id is 38
Server version: 10.3.38-MariaDB-0ubuntu0.20.04.1 Ubuntu 20.04
Copyright (c) 2000, 2018, Oracle, MariaDB Corporation Ab and others.
Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.
MariaDB [(none)]> show databases;
+--------------------+
| Database |
+--------------------+
| information_schema |
| mysql |
| performance_schema |
+--------------------+
3 rows in set (0.001 sec)
MariaDB [(none)]>
4.3.2 准备mysql_exporter环境
- 下载exporter
wget https://github.com/prometheus/mysqld_exporter/releases/download/v0.14.0/mysqld_exporter-0.14.0.linux-amd64.tar.gz
tar xvf mysqld_exporter-0.14.0.linux-amd64.tar.gz
cp mysqld_exporter-0.14.0.linux-amd64/mysqld_exporter /usr/local/bin/
- 免密登录配置
cat >>/root/.my.cnf <<EOF
[client]
user=mysql_exporter
password=123456
EOF
- 验证权限
[root@prometheus-server apps]#mysql
Welcome to the MariaDB monitor. Commands end with ; or \g.
Your MariaDB connection id is 39
Server version: 10.3.38-MariaDB-0ubuntu0.20.04.1 Ubuntu 20.04
Copyright (c) 2000, 2018, Oracle, MariaDB Corporation Ab and others.
Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.
MariaDB [(none)]>
4.3.3 启动mysql_exporter
- 编写service
# Write the systemd unit for mysqld_exporter; credentials come from /root/.my.cnf.
# '>' (instead of '>>') makes the step repeatable: appending on a second run
# would duplicate every section of the unit file.
# The quoted 'EOF' delimiter keeps the body literal (no shell expansion).
cat > /etc/systemd/system/mysqld_exporter.service <<'EOF'
[Unit]
Description=Prometheus MySQL Exporter
After=network.target
[Service]
ExecStart=/usr/local/bin/mysqld_exporter --config.my-cnf=/root/.my.cnf
[Install]
WantedBy=multi-user.target
EOF
- 启动服务
systemctl daemon-reload
systemctl enable --now mysqld_exporter.service
4.3.4 验证metrics

4.3.5 配置prometheus
- job_name: mysql-exporter
static_configs:
- targets: ['10.0.0.61:9104']
验证Prometheus状态
4.3.6 grafana展示
模板:13106
4.4 监控Haproxy
https://github.com/prometheus/haproxy_exporter
4.4.1 部署haproxy
- 安装haproxy
# Show the haproxy versions offered by the configured apt repositories
# ("madison" is the apt-cache sub-command), then install the package.
apt-cache madison haproxy
apt install haproxy -y
- 编辑配置文件
[root@k8s-ha1 ~]#vim /etc/haproxy/haproxy.cfg
global
# 修改socket文件
stats socket /run/haproxy/admin.sock mode 660 level admin expose-fd listeners
......
listen stats
mode http
bind :9999
stats enable
stats uri /haproxy-status
stats realm HAProxy\ Stats\ Page
stats auth haadmin:123456
listen prometheus-server-9090
bind :9090
mode http
server 10.0.0.61 10.0.0.61:9090 check inter 3s fall 3 rise 5
- 重启服务
systemctl restart haproxy
4.4.2 部署haproxy_exporter
- 下载
wget https://github.com/prometheus/haproxy_exporter/releases/download/v0.14.0/haproxy_exporter-0.14.0.linux-amd64.tar.gz
tar xvf haproxy_exporter-0.14.0.linux-amd64.tar.gz
cp haproxy_exporter-0.14.0.linux-amd64/haproxy_exporter /usr/local/bin/
- 启动
# 方式一
haproxy_exporter --haproxy.scrape-uri=unix:/run/haproxy/admin.sock
# 方式二
haproxy_exporter --haproxy.scrape-uri="http://haadmin:123456@127.0.0.1:9999/haproxy-status;csv"
- 查看端口
[root@k8s-ha1 ~]#netstat -ntlp|grep 9101
tcp6 0 0 :::9101 :::* LISTEN 527956/haproxy_expo
- 验证状态页面
4.4.3 验证metrics数据
4.4.4 prometheus采集配置
- job_name: haproxy-monitor
static_configs:
- targets: ['10.0.0.31:9101']
4.4.5 grafana展示
模板:367
模板:2428
4.5 监控Nginx
需要在编译安装nginx时添加nginx-module-vts模块
GitHub地址:https://github.com/vozlt/nginx-module-vts
4.5.1 安装nginx
- 下载nginx源码、nginx-module-vts模块
wget http://nginx.org/download/nginx-1.20.2.tar.gz
wget https://github.com/vozlt/nginx-module-vts/archive/refs/tags/v0.2.0.tar.gz
tar xvf nginx-1.20.2.tar.gz -C /usr/local/src/
tar xvf v0.2.0.tar.gz -C /usr/local/src/
- 编译安装nginx
cd /usr/local/src/nginx-1.20.2
./configure --prefix=/apps/nginx \
--with-http_ssl_module \
--with-http_v2_module \
--with-http_realip_module \
--with-http_stub_status_module \
--with-http_gzip_static_module \
--with-pcre \
--with-file-aio \
--with-stream \
--with-stream_ssl_module \
--with-stream_realip_module \
--add-module=/usr/local/src/nginx-module-vts-0.2.0/
make && make install
- 编辑nginx配置文件
# http块配置
http {
vhost_traffic_status_zone; #启用状态页
# server块配置
server {
listen 10.0.0.31:80;
server_name localhost;
#charset koi8-r;
#access_log logs/host.access.log main;
location / {
root html;
index index.html index.htm;
proxy_pass http://10.0.0.31:9090;
}
location /status {
vhost_traffic_status_display;
vhost_traffic_status_display_format html;
}
...
}
...
}
检查配置
[root@k8s-ha1 apps]#/apps/nginx/sbin/nginx -t
nginx: the configuration file /apps/nginx/conf/nginx.conf syntax is ok
nginx: configuration file /apps/nginx/conf/nginx.conf test is successful
- 启动服务
/apps/nginx/sbin/nginx
- 验证web状态页
4.5.2 安装nginx_exporter
https://github.com/hnlq715/nginx-vts-exporter/releases
- 下载nginx_exporter
# Download nginx-vts-exporter, unpack it and put the binary on PATH.
wget https://github.com/hnlq715/nginx-vts-exporter/releases/download/v0.10.3/nginx-vts-exporter-0.10.3.linux-amd64.tar.gz
tar xvf nginx-vts-exporter-0.10.3.linux-amd64.tar.gz
cp nginx-vts-exporter-0.10.3.linux-amd64/nginx-vts-exporter /usr/local/bin/
- 启动exporter
nginx-vts-exporter -nginx.scrape_uri http://10.0.0.31/status/format/json
- 查看端口
[root@k8s-ha1 apps]#netstat -ntlp|grep nginx
tcp 0 0 10.0.0.100:80 0.0.0.0:* LISTEN 555495/nginx: maste
tcp 0 0 10.0.0.31:80 0.0.0.0:* LISTEN 555495/nginx: maste
tcp 0 0 10.0.0.100:443 0.0.0.0:* LISTEN 555495/nginx: maste
tcp6 0 0 :::9913 :::* LISTEN 559231/nginx-vts-ex # nginx_exporter 端口
4.5.3 验证metrics数据
4.5.4 prometheus采集配置
- job_name: nginx-monitor
static_configs:
- targets: ['10.0.0.31:9913']
4.5.5 grafana展示
模板:2949
五、基于blackbox_exporter实现对URL状态、IP可用性、端口状态、TLS证书的过期时间监控
https://github.com/prometheus/blackbox_exporter
blackbox_exporter是prometheus官方提供的一个exporter,可以监控HTTP、HTTPS、DNS、TCP、ICMP等目标实例,实现对被监控节点进行监控和数据采集。
HTTP/HTTPS: URL/API可用性检测
TCP:端口监听检测
ICMP:主机存活检测
DNS:域名解析
5.1 部署blackbox_exporter
https://prometheus.io/download/#blackbox_exporter
部署在LoadBalance(10.0.0.31)上
- 下载
cd /apps
wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.23.0/blackbox_exporter-0.23.0.linux-amd64.tar.gz
tar xvf blackbox_exporter-0.23.0.linux-amd64.tar.gz
ln -s /apps/blackbox_exporter-0.23.0.linux-amd64 /apps/blackbox_exporter
- 启动服务
# 创建服务
# Write the systemd unit for blackbox_exporter (web listener on :9115).
# '>' (instead of '>>') makes the step repeatable: appending on a second run
# would duplicate every section of the unit file.
# The delimiter is quoted so the shell leaves the trailing backslashes alone;
# with an unquoted delimiter the shell itself removes each backslash-newline.
# systemd joins the continued ExecStart lines when it parses the unit.
cat > /etc/systemd/system/blackbox_exporter.service <<'EOF'
[Unit]
Description=Prometheus Blackbox Exporter
After=network.target
[Service]
Type=simple
User=root
Group=root
ExecStart=/apps/blackbox_exporter/blackbox_exporter \
  --config.file=/apps/blackbox_exporter/blackbox.yml \
  --web.listen-address=:9115
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
# 启动服务
systemctl daemon-reload
systemctl enable --now blackbox_exporter.service
- 查看端口
[root@k8s-ha1 apps]#ss -ntl|grep 9115
LISTEN 0 4096 *:9115 *:*
- 验证web页面
5.2 URL监控
prometheus server配置URL监控
- job_name: "http_status"
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets: ['http://harbor.chu.net','www.baidu.com']
labels:
instance: http_status
group: web
relabel_configs:
- source_labels: [__address__] # 将__address__(当前监控目标URL地址的标签)修改为__param_target,用于传递给blackbox_exporter
target_label: __param_target # 标签key为__param_target、value为harbor.chu.net
- source_labels: [__param_target] # 基于__param_target获取监控目标
target_label: url # 将监控目标的值与url创建一个label
- target_label: __address__ # 新添加一个目标__address__,指向blackbox_exporter服务器地址,用于将监控请求发送给指定的blackbox_exporter服务器
replacement: 10.0.0.31:9115 # 指定blackbox_exporter服务器地址
检测配置文件
#/apps/prometheus/promtool check config /apps/prometheus/prometheus.yml
Checking /apps/prometheus/prometheus.yml
SUCCESS: /apps/prometheus/prometheus.yml is valid prometheus config file syntax
重启服务
systemctl restart prometheus.service
prometheus验证数据
blackbox_exporter页面验证数据
5.3 IP监控
prometheus配置IP监控
- job_name: "ping_status"
metrics_path: /probe
params:
module: [icmp]
static_configs:
- targets: ['10.0.0.2','223.6.6.6']
labels:
instance: ping_status
group: icmp
relabel_configs:
- source_labels: [__address__] # 将__address__(当前监控目标URL地址的标签)修改为__param_target,用于传递给blackbox_exporter
target_label: __param_target # 标签key为__param_target、value为'10.0.0.2','223.6.6.6'
- source_labels: [__param_target] # 基于__param_target获取监控目标
target_label: ip # 将监控目标的值与ip创建一个label
- target_label: __address__ # 新添加一个目标__address__,指向blackbox_exporter服务器地址,用于将监控请求发送给指定的blackbox_exporter服务器
replacement: 10.0.0.31:9115 # 指定blackbox_exporter服务器地址
重启服务
/apps/prometheus/promtool check config /apps/prometheus/prometheus.yml
systemctl restart prometheus.service
验证数据
5.4 端口监控
prometheus配置监控
- job_name: "port_status"
metrics_path: /probe
params:
module: [tcp_connect]
static_configs:
- targets: ['10.0.0.11:9100','10.0.0.61:9090','10.0.0.8:22']
labels:
instance: port_status
group: port
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: ip
- target_label: __address__
replacement: 10.0.0.31:9115
验证数据
5.5 TLS证书监控
prometheus配置监控
- job_name: "https_status"
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets: ['https://www.baidu.com']
labels:
instance: https_status
group: web
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: url
- target_label: __address__
replacement: 10.0.0.31:9115
5.6 grafana展示
导入模板:9965
显示https证书剩余天数
六、prometheus 结合钉钉实现告警通知、企业微信实现告警通知、告警模板的使用、告警分类发送
6.1 Alertmanager
https://github.com/prometheus/alertmanager
6.1.1 prometheus触发告警过程
prometheus-->触发阈值-->超出持续时间-->alertmanager-->分组|抑制|静默-->媒体类型-->邮件|钉钉|微信等
分组:将类似性质的警告合并为单个通知,如网络通知、主机通知、服务通知
静默:是一种简单的特定时间静音的机制,如服务器要升级维护可以先设置这个时间段告警静默
抑制:当警告发出后,停止重复发送由此警告引发的其他警告即合并为一个故障引起的多个报警事件,可以消除冗余告警
6.1.2 安装Alertmanager
与prometheus server/10.0.0.61安装在一起(也可分开安装)
# 下载Alertmanager
cd /apps
wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz
tar xvf alertmanager-0.25.0.linux-amd64.tar.gz
ln -s /apps/alertmanager-0.25.0.linux-amd64 /apps/alertmanager
# 创建服务
# Write the systemd unit that runs Alertmanager from /apps/alertmanager.
# '>' (instead of '>>') makes the step repeatable: appending on a second run
# would duplicate every section of the unit file.
# The quoted 'EOF' delimiter keeps the body literal (no shell expansion).
cat > /etc/systemd/system/alertmanager.service <<'EOF'
[Unit]
Description=Prometheus Alertmanager
After=network.target
[Service]
ExecStart=/apps/alertmanager/alertmanager --config.file=/apps/alertmanager/alertmanager.yml
[Install]
WantedBy=multi-user.target
EOF
# 启动服务
systemctl daemon-reload
systemctl enable --now alertmanager.service
查看服务
[root@prometheus-server alertmanager]#systemctl is-active alertmanager.service
active
# 查看端口
[root@prometheus-server alertmanager]#netstat -ntlp|grep alertmanager
tcp6 0 0 :::9093 :::* LISTEN 6301/alertmanager
tcp6 0 0 :::9094 :::* LISTEN 6301/alertmanager
6.1.3 配置文件说明
官方配置文档:https://prometheus.io/docs/alerting/latest/configuration/
alertmanager.yml配置详解
global: # 全局配置
resolve_timeout: 1m #告警在该时长内未被更新时即被标记为已恢复;与下方的resolve_timeout是同一配置项,实际配置时只保留一处
smtp_from: '1234567890@qq.com' #发件人邮箱地址
smtp_smarthost: 'smtp.qq.com:465' #邮箱smtp地址。
smtp_auth_username: '123456789@qq.com' #发件人的登陆用户名,默认和发件人地址一致。
smtp_auth_password: 'ptqdjqbhjudejf' #发件人的登陆密码,有时候是授权码。
smtp_hello: '@qq.com'
smtp_require_tls: false #是否需要tls协议。默认是true。
wechat_api_url: #企业微信API地址
wechat_api_secret: #企业微信API secret
wechat_api_corp_id: #企业微信corp id信息
resolve_timeout: 60s #当一个告警在Alertmanager持续多长时间未接收到新告警后就标记告警状态为resolved(已解决/已恢复)。
route: # 用来设置报警的分发策略
group_by: [alertname] #通过alertname的值对告警进行分类
group_wait: 10s #一组警告第一次发送之前等待的延迟时间,即产生告警后延迟10秒钟将组内新产生的消息一起合并发送(一般设置为0秒~几分钟)。
group_interval: 2m #一组已发送过初始通知的告警接收到新告警后,下次发送通知前等待的延迟时间(一般设置为5分钟或更多)
repeat_interval: 5m #一条告警通知成功发送后,若告警仍未恢复,再次重复发送该通知前等待的时间(通常设置为3小时或更长时间)
#间隔示例:
#group_wait: 10s #第一次产生告警,等待10s,组内有告警就一起发出,没有其它告警就单独发出。
#group_interval: 2m #第二次产生告警,先等待2分钟,2分钟后还没有恢复就进入repeat_interval。
#repeat_interval: 5m #在最终发送消息前再等待5分钟,5分钟后还没有恢复就发送第二次告警。
receiver: default-receiver #其它的告警发送给default-receiver
routes: #将critical的报警发送给myalertname
- receiver: myalertname
group_wait: 10s
match_re:
severity: critical
receivers: #定义多接收者
- name: 'default-receiver'
email_configs:
- to: '2403416792@qq.com'
send_resolved: true #通知已经恢复的告警
- name: myalertname
webhook_configs:
- url: 'http://172.30.7.101:8060/dingtalk/alertname/send'
send_resolved: true #通知已经恢复的告警
inhibit_rules: #抑制的规则
- source_match: #源匹配级别,当匹配成功发出通知,但是其它'alertname', 'dev', 'instance'产生的warning 级别的告警通知将被抑制
severity: 'critical' #报警的事件级别
target_match:
severity: 'warning' #与source_match的severity配合使用,即如果已经有'critical'级别的报警,那么新产生的'warning'级别的告警将被抑制
equal: ['alertname','dev','instance'] #源告警与目标告警中这些标签的值必须相同,抑制才会生效
6.2 钉钉告警
6.2.1 创建群组机器人
- 添加机器人
- 创建加签或关键词
- 查看机器人
复制Webhook地址
6.2.2 钉钉认证-关键字-脚本
mkdir -p /data/scripts
- shell脚本
dingding-keywords.sh
#!/bin/bash
# Send a plain-text message to a DingTalk group robot via its Webhook.
#
# Usage: dingding-keywords.sh "<message>"
#   $1 - message text. It is inserted verbatim into the JSON body, so JSON
#        escapes such as \n are honoured and any double quote inside the
#        message must already be escaped by the caller.
# Output: the HTTP response body printed by curl.
source /etc/profile
MESSAGE=$1
# The URL below is the robot's Webhook address.
# "${MESSAGE}" is double-quoted between the two single-quoted JSON fragments
# so a message containing spaces stays one argument; unquoted, the shell
# would split it into several words and curl would get a broken payload.
/usr/bin/curl -X "POST" "https://oapi.dingtalk.com/robot/send?access_token=a6026ee5fe935ad7d23bc3b4ba1f63c51b237e72160027a046720466fcfac4cf" \
  -H 'Content-Type: application/json' \
  -d '{"msgtype": "text", "text": {"content": "'"${MESSAGE}"'"}}'
- python脚本
dingding-keywords.py
#! /usr/bin/env python3
"""Send a plain-text message to a DingTalk group robot via its Webhook.

Usage: dingding-keywords.py "<message>"
"""
import sys
import requests
import json


def info(msg):
    """POST ``msg`` to the robot as a DingTalk "text" message.

    Args:
        msg: Message content; it is converted with ``str()`` before sending.
    """
    # url is the robot's Webhook address
    url = r"https://oapi.dingtalk.com/robot/send?access_token=a6026ee5fe935ad7d23bc3b4ba1f63c51b237e72160027a046720466fcfac4cf"
    headers = {
        "Content-Type": "application/json;charset=utf-8"
    }
    formdata = {
        "msgtype": "text",
        "text": {"content": str(msg)}
    }
    # Bound the request so an unreachable endpoint cannot hang the caller.
    requests.post(url=url, data=json.dumps(formdata), headers=headers, timeout=10)


if __name__ == "__main__":
    # The first command-line argument is the message to send.
    info(sys.argv[1])
6.2.3 验证脚本
测试发送消息
[root@prometheus-server ~]#bash /data/scripts/dingding-keywords.sh "namespace=default\ncpu=85%\nalertname=sh-pod"
{"errcode":0,"errmsg":"ok"}
[root@prometheus-server ~]#python3 /data/scripts/dingding-keywords.py "namespace=default cpu=85% alertname=python-pod"
验证
6.2.4 部署webhook-dingtalk
https://github.com/timonwong/prometheus-webhook-dingtalk/releases
# 下载
cd /apps
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
tar xvf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
ln -s /apps/prometheus-webhook-dingtalk-2.1.0.linux-amd64 /apps/prometheus-webhook-dingtalk
修改config.yaml
cp /apps/prometheus-webhook-dingtalk/config.example.yml /apps/prometheus-webhook-dingtalk/config.yml
[root@prometheus-server apps]#egrep -v "^#|^$" /apps/prometheus-webhook-dingtalk/config.yml
targets:
webhook1: # 可自定义,用于alertmanager调用
url: https://oapi.dingtalk.com/robot/send?access_token=a6026ee5fe935ad7d23bc3b4ba1f63c51b237e72160027a046720466fcfac4cf
secret: SEC8a7e5fe2bb03d383963c144a9cf8156fbbde20d9e754c4f0b43b6b3e04a2e892 # 钉钉机器人加签,secret认证
alertname: # 用于钉钉关键字认证
url: https://oapi.dingtalk.com/robot/send?access_token=a6026ee5fe935ad7d23bc3b4ba1f63c51b237e72160027a046720466fcfac4cf
webhook_legacy:
url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
# Customize template content
message:
# Use legacy template
title: '{{ template "legacy.title" . }}'
text: '{{ template "legacy.content" . }}'
webhook_mention_all:
url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
mention:
all: true # 给所有人发送
webhook_mention_users:
url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
mention: # 给指定人发送
mobiles: ['156xxxx8827', '189xxxx8325']
启动服务
# 后台启动
nohup /apps/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --web.listen-address=':8060' --config.file=/apps/prometheus-webhook-dingtalk/config.yml &
查看日志
ts=2023-03-14T10:10:47.289Z caller=main.go:113 component=configuration msg="Webhook urls for prometheus alertmanager" urls="http://localhost:8060/dingtalk/webhook_mention_all/send http://localhost:8060/dingtalk/webhook_mention_users/send http://localhost:8060/dingtalk/webhook1/send http://localhost:8060/dingtalk/alertname/send http://localhost:8060/dingtalk/webhook_legacy/send"
可看到http://localhost:8060/dingtalk/的URL分别对应config.yml中target
查看端口
[root@prometheus-server prometheus-webhook-dingtalk]#lsof -i :8060
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
prometheu 13078 root 3u IPv6 101524 0t0 TCP *:8060 (LISTEN)
6.2.5 配置alertmanager调用dingtalk
官方配置:https://prometheus.io/docs/alerting/latest/configuration/
vim /apps/alertmanager/alertmanager.yml
global: # 全局配置
resolve_timeout: 2m #告警持续多长时间未收到更新后,被标记为resolved(已恢复)
smtp_from: '2403416792@qq.com' #发件人邮箱地址
smtp_smarthost: 'smtp.qq.com:465' #邮箱smtp地址。
smtp_auth_username: '2403416792@qq.com' #发件人的登陆用户名,默认和发件人地址一致。
smtp_auth_password: 'ptqdjqbhjudejf' #发件人的登陆密码,有时候是授权码。
smtp_hello: '@qq.com'
smtp_require_tls: false #是否需要tls协议。默认是true。
route:
group_by: [alertname] #通过alertname的值对告警进行分类
group_wait: 10s
group_interval: 2m
repeat_interval: 10m
receiver: dingding
receivers: #定义多接收者
- name: 'default-receiver'
email_configs:
- to: '2403416792@qq.com'
send_resolved: true #通知已经恢复的告警
- name: dingding
webhook_configs:
- url: 'http://10.0.0.61:8060/dingtalk/webhook1/send' # 对应prometheus-webhook-dingtalk/config.yml文件中的target webhook1,即使用钉钉加签认证
send_resolved: true #通知已经恢复的告警
重启服务
systemctl restart alertmanager.service
查看端口
[root@prometheus-server ~]#netstat -ntlp|grep alert
tcp6 0 0 :::9093 :::* LISTEN 13570/alertmanager
tcp6 0 0 :::9094 :::* LISTEN 13570/alertmanager
查看web页面
6.2.6 prometheus配置
6.2.6.1 配置报警规则
mkdir -p /apps/prometheus/rules
/apps/prometheus/rules/server_rules.yaml内容如下:
groups:
- name: alertmanager_pod.rules
rules:
- alert: Pod_all_cpu_usage
expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 10
for: 2m
labels:
severity: critical
service: pods
project: myserver
annotations:
description: 容器 {{ $labels.name }} CPU 资源利用率大于 10% , (current value is {{ $value }})
summary: Dev CPU 负载告警
- alert: Pod_all_memory_usage
#expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 10 #内存大于10%
expr: sort_desc(avg by(name)(irate(node_memory_MemFree_bytes {name!=""}[5m]))) > 2*1024*1024*1024 #内存大于2G
for: 2m
labels:
severity: critical
project: myserver
annotations:
description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
summary: Dev Memory 负载告警
- alert: Pod_all_network_receive_usage
#expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 50*1024*1024
expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 0
for: 2m
labels:
#severity: critical
project: myserver
annotations:
description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M , (current value is {{ $value }})
- alert: node内存可用大小
expr: node_memory_MemFree_bytes > 1024 #故意写错的
#expr: node_memory_MemFree_bytes < 524288000 #内存小于500兆
for: 30s
labels:
project: node
annotations:
description: node节点可用内存小于500兆
6.2.6.2 加载报警规则
vim /apps/prometheus/prometheus.yml
...
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- 10.0.0.61:9093 # alertmanager地址
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "/apps/prometheus/rules/server_rules.yaml" # 告警规则文件,可配置多个文件
# - "second_rules.yml"
...
6.2.6.3 验证规则
[root@prometheus-server ~]#/apps/prometheus/promtool check rules /apps/prometheus/rules/server_rules.yaml
Checking /apps/prometheus/rules/server_rules.yaml
SUCCESS: 4 rules found
6.2.6.4 重启prometheus
systemctl restart prometheus.service
6.2.6.5 查看当前警告
[root@prometheus-server ~]#/apps/alertmanager/amtool alert --alertmanager.url=http://10.0.0.61:9093
Alertname Starts At Summary State
node内存可用大小 2023-03-14 16:13:48 UTC active
node内存可用大小 2023-03-14 16:13:48 UTC active
node内存可用大小 2023-03-14 16:13:48 UTC active
node内存可用大小 2023-03-14 16:13:48 UTC active
node内存可用大小 2023-03-14 16:13:48 UTC active
node内存可用大小 2023-03-14 16:13:48 UTC active
node内存可用大小 2023-03-14 16:13:48 UTC active
node内存可用大小 2023-03-14 16:13:48 UTC active
node内存可用大小 2023-03-14 16:13:48 UTC active
node内存可用大小 2023-03-14 16:13:48 UTC active
node内存可用大小 2023-03-14 16:13:48 UTC active
node内存可用大小 2023-03-14 16:13:48 UTC active
6.2.7 验证告警消息发送
6.2.7.1 prometheus告警状态
prometheus告警状态
inactive #没有异常
pending #已触发阈值,但未满足告警持续时间(即rule中的for字段)
firing #已触发阈值且满足条件并发送至alertmanager
6.2.7.2 dingtalk日志
[root@prometheus-server /]#tail -f nohup.out
ts=2023-03-14T16:42:08.318Z caller=main.go:59 level=info msg="Starting prometheus-webhook-dingtalk" version="(version=2.1.0, branch=HEAD, revision=8580d1395f59490682fb2798136266bdb3005ab4)"
ts=2023-03-14T16:42:08.318Z caller=main.go:60 level=info msg="Build context" (gogo1.18.1,userroot@177bd003ba4d,date20220421-08:19:05)=(MISSING)
ts=2023-03-14T16:42:08.319Z caller=coordinator.go:83 level=info component=configuration file=/apps/prometheus-webhook-dingtalk/config.yml msg="Loading configuration file"
ts=2023-03-14T16:42:08.319Z caller=coordinator.go:91 level=info component=configuration file=/apps/prometheus-webhook-dingtalk/config.yml msg="Completed loading of configuration file"
ts=2023-03-14T16:42:08.319Z caller=main.go:97 level=info component=configuration msg="Loading templates" templates=
ts=2023-03-14T16:42:08.320Z caller=main.go:113 component=configuration msg="Webhook urls for prometheus alertmanager" urls="http://localhost:8060/dingtalk/webhook_legacy/send http://localhost:8060/dingtalk/webhook1/send http://localhost:8060/dingtalk/alertname/send"
ts=2023-03-14T16:42:08.320Z caller=web.go:208 level=info component=web msg="Start listening for connections" address=:8060
# 告警发送消息
ts=2023-03-14T16:42:08.951Z caller=entry.go:26 level=info component=web http_scheme=http http_proto=HTTP/1.1 http_method=POST remote_addr=10.0.0.61:34118 user_agent=Alertmanager/0.25.0 uri=http://10.0.0.61:8060/dingtalk/webhook1/send resp_status=200 resp_bytes_length=2 resp_elapsed_ms=311.106098 msg="request complete"
6.2.7.3 钉钉告警消息
6.3 企业微信告警
先完成企业微信注册
6.3.1 创建企业微信通知应用
- 进入web页面,创建应用
- 添加应用信息
- 查看企业微信通知应用信息
保存AgentID和Secret
6.3.2 添加成员
手动或扫二维码添加成员
验证通讯录
查看企业信息
保存企业ID
6.3.3 测试消息
- 应用管理-->企业微信通知应用-->发送消息
- 选择发送范围,编写发送信息
- 验证企业微信通知
6.3.4 配置prometheus
vim /apps/prometheus/prometheus.yml
...
# 检查是否有alerting配置
alerting:
alertmanagers:
- static_configs:
- targets:
- 10.0.0.61:9093 # alertmanager地址
# 与配置钉钉告警规则相同
rule_files:
- "/apps/prometheus/rules/server_rules.yaml" # 告警规则文件,可配置多个文件
重启服务
systemctl restart prometheus.service
6.3.5 配置alertmanager
#cat /apps/alertmanager/alertmanager.yml
global: # 全局配置
resolve_timeout: 2m #告警持续多长时间未收到更新后,被标记为resolved(已恢复)
smtp_from: '2403416792@qq.com' #发件人邮箱地址
smtp_smarthost: 'smtp.qq.com:465' #邮箱smtp地址。
smtp_auth_username: '2403416792@qq.com' #发件人的登陆用户名,默认和发件人地址一致。
smtp_auth_password: 'ptqdjqbhjudejf' #发件人的登陆密码,有时候是授权码。
smtp_hello: '@qq.com'
smtp_require_tls: false #是否需要tls协议。默认是true。
route: # 用来设置报警的分发策略
group_by: [alertname] #通过alertname的值对告警进行分类
group_wait: 10s #一组警告第一次发送之前等待的延迟时间,即产生告警后延迟10秒钟将组内新产生的消息一起合并发送(一般设置为0秒~几分钟)。
group_interval: 2m #一组已发送过初始通知的告警接收到新告警后,下次发送通知前等待的延迟时间(一般设置为5分钟或更多)
repeat_interval: 10m #一条成功发送的告警,在最终发送通知之前等待的时间(通常设置为3小时或更长时间)
receiver: wechat #wechat告警
receivers: #定义多接收者
- name: 'default-receiver'
email_configs:
- to: '2403416792@qq.com'
send_resolved: true #通知已经恢复的告警
- name: dingding # 钉钉告警
webhook_configs:
- url: 'http://10.0.0.61:8060/dingtalk/alertname/send'
send_resolved: true #通知已经恢复的告警
- name: wechat # 微信告警
wechat_configs:
- corp_id: wwd2cebdb20b9c91d7 # 企业ID
to_user: '@all' # 发送给所有人
agent_id: 1000002 # 应用ID
api_secret: 5PwuWghboQCVEHe8DBQcGAyeU4oMHnaxevfQd84YpdA #应用secret
send_resolved: true #通知已经恢复的告警
重启服务
systemctl restart alertmanager.service
6.3.6 企业微信验证消息
出现60020错误码企业微信无法接收信息时
# 查看日志
/apps/alertmanager/alertmanager --config.file=/apps/alertmanager/alertmanager.yml --log.level=debug
...
caller=wechat.go:178 level=debug integration=wechat response="{\"errcode\":60020,\"errmsg\":\"not allow to access from your ip, hint: [1678823033431493982481539], from ip: 180.111.192.141, more info at https://open.work.weixin.qq.com/devtool/query?e=60020\"}" incident="{}:{alertname=\"node内存可用大小\"}"
错误码:60020
不安全的访问IP。请根据调用的应用类型分别按如下方法确认:
1)若调用者是企业自建应用或通讯录同步助手,请确认该IP是本企业服务器IP,并已经配置到应用详情的“企业可信IP”项目中。第三方服务商IP不能调用。
2)若调用者是第三方应用或服务商代开发应用,请确认该IP已经配置到“服务商管理后台”-“服务商信息”-“基本信息”-“IP白名单”。
3) 配置完可信IP之后,需要1分钟后才生效。
配置添加可信IP
根据实际设置可信域名或设置接收消息服务器URL,或联系企业微信技术团队解决
6.4 告警分类发送
根据消息中的属性信息设置规则,将消息分类发送,如以下将severity级别为critical的通知消息发送到钉钉,其他发送到邮件:
6.4.1 prometheus rules配置
# cat /apps/prometheus/rules/server2_rules.yaml
groups:
- name: alertmanager_pod.rules
rules:
- alert: Pod_all_cpu_usage # 警告
expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 20
for: 2m
labels:
severity: warning
service: pods
project: myserver
annotations:
description: 容器 {{ $labels.name }} CPU 资源利用率大于 20% , (current value is {{ $value }})
summary: Pod CPU 利用率超过20%
- alert: Pod_all_cpu_usage # 严重
expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 30
for: 2m
labels:
severity: critical
service: pods
project: myserver
annotations:
description: 容器 {{ $labels.name }} CPU 资源利用率大于 30% , (current value is {{ $value }})
summary: Pod CPU 利用率超过30%
- alert: Pod_all_memory_usage
#expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 10 #内存大于10%
expr: sort_desc(avg by(name)(irate(node_memory_MemFree_bytes {name!=""}[5m]))) > 2*1024*1024*1024 #内存大于2G
for: 2m
labels:
severity: critical
project: myserver
annotations:
description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
summary: Dev Memory 负载告警
- alert: Pod_all_network_receive_usage
#expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 50*1024*1024
expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 1
for: 2m
labels:
severity: critical
project: myserver
annotations:
description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M , (current value is {{ $value }})
- alert: node内存可用大小
expr: node_memory_MemFree_bytes > 1024 #故意写错进行测试
#expr: node_memory_MemFree_bytes < 524288000 #内存小于500兆
for: 30s
labels:
project: node
annotations:
description: node节点可用内存小于500M
重启服务
systemctl restart prometheus.service
6.4.2 alertmanager配置
#cat /apps/alertmanager/alertmanager.yml
global:
resolve_timeout: 5m #告警持续多长时间未收到更新后,被标记为resolved(已恢复)
smtp_from: '2403416792@qq.com' #发件人邮箱地址
smtp_smarthost: 'smtp.qq.com:465' #邮箱smtp地址。
smtp_auth_username: '2403416792@qq.com' #发件人的登陆用户名,默认和发件人地址一致。
smtp_auth_password: 'riudkgxfttjqecda' #发件人的登陆密码,有时候是授权码。
smtp_hello: '@qq.com'
smtp_require_tls: false #是否需要tls协议。默认是true。
route:
group_by: [alertname]
group_wait: 10s
group_interval: 10s
repeat_interval: 10m
receiver: 'dingding' #默认告警方式为钉钉
#添加消息路由
routes:
- receiver: 'dingding-worknode' #critical级别的发送钉钉
group_wait: 10s
match_re:
severity: critical # 匹配严重等级告警
- receiver: 'email-receiver' # 宿主机告警通过邮件发送
group_wait: 10s
match_re:
project: node # 匹配node告警
receivers:
- name: 'email-receiver'
email_configs:
- to: '2403416792@qq.com'
send_resolved: true
- name: 'dingding'
webhook_configs:
- url: 'http://10.0.0.61:8060/dingtalk/webhook1/send'
send_resolved: true
- name: 'dingding-worknode'
webhook_configs:
- url: 'http://10.0.0.61:8060/dingtalk/webhook1/send'
send_resolved: true
inhibit_rules: #抑制的规则
- source_match: #源匹配级别,当匹配成功发出通知,但是其它'alertname', 'dev', 'instance'产生的warning 级别的告警通知将被抑制
severity: 'critical' #报警的事件级别
target_match:
severity: 'warning' #目标匹配级别:如果已经存在'critical'级别的告警,那么新产生的、且equal中所列标签值相同的'warning'级别告警将被抑制
equal: ['alertname','dev','instance'] #匹配那些对象的告警
重启服务
systemctl restart alertmanager.service
6.4.3 验证消息发送
- node告警发送邮件
- critical严重等级告警发送至钉钉
钉钉消息
6.5 钉钉告警模板
默认的消息内容需要调整、而且消息是连接在一起的。
6.5.1 定义模板
# vim /apps/prometheus-webhook-dingtalk/message_template.templ
{{/* DingTalk message template: one section per firing alert, then one per resolved alert. Each section ranges only over its own subset (.Alerts.Firing / .Alerts.Resolved) so an alert is not rendered under both headers. Timestamps are shifted by 28800e9 ns (8 hours) before formatting. */}}
{{ define "dingding.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts.Firing -}}
========= **监控告警** =========
**告警程序:** Alertmanager
**告警类型:** {{ $alert.Labels.alertname }}
**告警级别:** {{ $alert.Labels.severity }} 级
**告警状态:** {{ .Status }}
**故障主机:** {{ $alert.Labels.instance }} {{ $alert.Labels.device }}
**告警主题:** {{ .Annotations.summary }}
**告警详情:** {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
**主机标签:** {{ range .Labels.SortedPairs }} </br> [{{ .Name }}: {{ .Value | markdown | html }} ]
{{- end }} </br>
**故障时间:** {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end = =========
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts.Resolved -}}
========= 告警恢复 =========
**告警程序:** Alertmanager
**告警主题:** {{ $alert.Annotations.summary }}
**告警主机:** {{ .Labels.instance }}
**告警类型:** {{ .Labels.alertname }}
**告警级别:** {{ $alert.Labels.severity }} 级
**告警状态:** {{ .Status }}
**告警详情:** {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
**故障时间:** {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
**恢复时间:** {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = **end** = =========
{{- end }}
{{- end }}
{{- end }}
6.5.2 dingtalk引用模板
#vim /apps/prometheus-webhook-dingtalk/config.yml
# 开启模板
templates:
- /apps/prometheus-webhook-dingtalk/message_template.templ # 引用模板
targets:
webhook1:
url: https://oapi.dingtalk.com/robot/send?access_token=a6026ee5fe935ad7d23bc3b4ba1f63c51b237e72160027a046720466fcfac4cf
secret: SEC8a7e5fe2bb03d383963c144a9cf8156fbbde20d9e754c4f0b43b6b3e04a2e892
message:
text: '{{ template "dingding.default.message" . }}' # 模板文件中模板名称
6.5.3 alertmanager引用dingtalk地址
#cat /apps/alertmanager/alertmanager.yml
global:
resolve_timeout: 5m #告警持续多长时间未收到更新后,被标记为resolved(已恢复)
smtp_from: '2403416792@qq.com' #发件人邮箱地址
smtp_smarthost: 'smtp.qq.com:465' #邮箱smtp地址。
smtp_auth_username: '2403416792@qq.com' #发件人的登陆用户名,默认和发件人地址一致。
smtp_auth_password: 'riudkgxfttjqecda' #发件人的登陆密码,有时候是授权码。
smtp_hello: '@qq.com'
smtp_require_tls: false #是否需要tls协议。默认是true。
templates:
- '/apps/alertmanager/message_template.templ' # Alertmanager引用模板
route:
group_by: [alertname]
group_wait: 10s
group_interval: 10s
repeat_interval: 10m
receiver: 'email-receiver' #默认告警方式为邮件
#添加消息路由
routes:
- receiver: 'dingding' #critical级别的发送钉钉
group_wait: 10s
match_re:
severity: critical # 匹配严重等级告警
receivers:
- name: 'email-receiver'
email_configs:
- to: '2403416792@qq.com'
send_resolved: true
- name: 'dingding'
webhook_configs:
- url: 'http://10.0.0.61:8060/dingtalk/webhook1/send'
send_resolved: true
重启服务
# Stop the running prometheus-webhook-dingtalk instance (if any) so that port
# 8060 is free; "|| true" because pkill exits non-zero when nothing matched.
pkill -f '/apps/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk' || true
# Start it again in the background so the new config.yml/template is loaded and
# the shell is free to run the next command.
nohup /apps/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --web.listen-address=':8060' --config.file=/apps/prometheus-webhook-dingtalk/config.yml &
# Restart Alertmanager so it picks up the updated alertmanager.yml.
systemctl restart alertmanager.service
6.5.4 验证告警内容

浙公网安备 33010602011771号