20220313作业 prometheus2

1.prometheus kube-state-metrics

通过监听API server生成有关资源对象的状态指标,比如Deployment、Node、Pod。需要关注的是kube-state-metrics只是简单的提供一个metrics数据,并不会存储这些指标数据,所以我们可以使用prometheus来抓取这些数据然后存储,主要关注的是业务相关的一些元数据,比如Deployment,pod,副本状态等,调度了多少个replicas?现在可用的有几个?多少个pod是running/stopped等?pod重启次数,目前运行多少job等

部署kube-state-metrics:
# kube-state-metrics: watches the API server and exposes resource-state
# metrics (replica counts, pod status, job status, ...) on :8080.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kube-state-metrics
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
      - name: kube-state-metrics
        image: bitnami/kube-state-metrics:2.2.4
        ports:
        - containerPort: 8080
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
# Core ("") API group resources.
- apiGroups: [""]
  resources: ["nodes", "pods", "services", "resourcequotas", "replicationcontrollers", "limitranges", "persistentvolumeclaims", "persistentvolumes", "namespaces", "endpoints"]
  verbs: ["list", "watch"]
# kube-state-metrics 2.x lists daemonsets/deployments/replicasets through
# apps/v1 (the "extensions" group was removed in Kubernetes 1.16), so the
# grant must be on the "apps" group or these resources are not collected.
- apiGroups: ["apps"]
  resources: ["daemonsets", "deployments", "replicasets", "statefulsets"]
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources: ["cronjobs", "jobs"]
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources: ["horizontalpodautoscalers"]
  verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: kube-system
---
# NodePort service so an external Prometheus can scrape :31666.
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: kube-state-metrics
  namespace: kube-system
  labels:
    app: kube-state-metrics
spec:
  type: NodePort
  ports:
  - name: kube-state-metrics
    port: 8080
    targetPort: 8080
    nodePort: 31666
    protocol: TCP
  selector:
    app: kube-state-metrics
prometheus.yml配置:
  - job_name: "kube-state-metrics"
    static_configs:
      - targets: ["192.168.1.72:31666"]
模版:13332、13824、14518

2.监控tomcat、redis、mysql、haproxy、nginx

2.1 tomcat
2.2 redis
redis-deployment.yaml
# Redis deployment with a redis_exporter sidecar in the same pod.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis
  namespace: studylinux-net 
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
      # Main Redis container.
      - name: redis
        image: redis:4.0.14 
        resources:
          requests:
            cpu: 100m
            memory: 100Mi
        ports:
        - containerPort: 6379
      # Sidecar: redis_exporter serves Prometheus metrics on :9121.
      # Presumably it reaches Redis on localhost:6379 (exporter default,
      # same pod network) — verify if the Redis port is changed.
      - name: redis-exporter
        image: oliver006/redis_exporter:latest
        resources:
          requests:
            cpu: 100m
            memory: 100Mi
        ports:
        - containerPort: 9121

redis-redis-svc.yaml
kind: Service  # resource type: Service (exposes the Redis data port)
apiVersion: v1
metadata:
# NOTE: the scrape annotations below are commented out — Prometheus
# discovers metrics through redis-exporter-service, not this Service.
#  annotations:
#    prometheus.io/scrape: 'false'
  name: redis-redis-service
  namespace: studylinux-net 
spec:
  selector:
    app: redis
  ports:
  - nodePort: 31081
    name: redis
    port: 6379
    protocol: TCP
    targetPort: 6379
  type: NodePort

redis-exporter-svc.yaml
kind: Service  # resource type: Service (exposes the metrics port)
apiVersion: v1
metadata:
  # Annotations for Prometheus service discovery: scrape this Service
  # on port 9121 (redis_exporter metrics endpoint).
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: "9121"
  name: redis-exporter-service
  namespace: studylinux-net 
spec:
  selector:
    app: redis
  ports:
  - nodePort: 31082
    name: prom
    port: 9121
    protocol: TCP
    targetPort: 9121
  type: NodePort

2.3 mysql
(1)先安装好mysql
(2)下载mysql_export:https://github.com/prometheus/mysqld_exporter
(3)安装mysql_export
tar -zxf mysqld_exporter-0.13.0.linux-amd64.tar.gz
ln -sf mysqld_exporter-0.13.0.linux-amd64 mysqld_exporter

vim /etc/systemd/system/mysqld_exporter.service 

[Unit]
# Fixed: description previously said "node_exporter" (copy-paste error).
Description=mysqld_exporter
After=network.target
[Service]
# MySQL credentials are read from the [client] section of /root/.my.cnf.
ExecStart=/apps/mysqld_exporter/mysqld_exporter --config.my-cnf=/root/.my.cnf
[Install]
WantedBy=multi-user.target

vim /root/.my.cnf
[client]
host=127.0.0.1
user=root
password=123456
systemctl daemon-reload&&systemctl restart mysqld_exporter&&systemctl enable mysqld_exporter
(4) 配置prometheus.yml
  - job_name: "mysql"
    static_configs:
      - targets: ["192.168.1.34:9104"]
(5) 导入模板:13106/11323

2.4 haproxy
(1)部署haproxy
(2)下载 haproxy_export :https://github.com/prometheus/haproxy_exporter
(3)修改haproxy配置:
#stats socket /var/lib/haproxy/stats
stats socket /var/lib/haproxy/haproxy.sock mode 600 level admin
(4)安装haproxy_export
tar -zxf haproxy_exporter-0.12.0.linux-amd64.tar.gz
ln -sf haproxy_exporter-0.12.0.linux-amd64 haproxy_exporter
启动haproxy_export
方式一:./haproxy_exporter --haproxy.scrape-uri=unix:/var/lib/haproxy/haproxy.sock
方式二,先启动状态页:
listen stats
    bind :9999
    stats enable
    stats uri /haproxy-status
    stats auth haadmin:123456
    stats auth admin:123456
haproxy_exporter --haproxy.scrape-uri="http://admin:123456@127.0.0.1:9999/haproxy-status;csv" &
(5)配置prometheus.yml
  - job_name: "haproxy"
    static_configs:
      - targets: ["192.168.1.76:9101"]
(6)导入模板:2428

2.5 nginx
编译安装nginx
git clone https://github.com/vozlt/nginx-module-vts.git
mv nginx-module-vts-master /usr/local/src/nginx-module-vts

wget https://nginx.org/download/nginx-1.20.2.tar.gz
tar -zxf nginx-1.20.2.tar.gz
cd nginx-1.20.2

./configure --prefix=/apps/nginx \
--with-http_ssl_module \
--with-http_v2_module \
--with-http_realip_module \
--with-http_stub_status_module  \
--with-http_gzip_static_module \
--with-pcre \
--with-file-aio \
--with-stream \
--with-stream_ssl_module \
--with-stream_realip_module \
--add-module=/usr/local/src/nginx-module-vts/

make
make install

cd /apps/nginx/conf

修改nginx配置:
http增加:
    vhost_traffic_status_zone;
server增加:
        location /status {
	    vhost_traffic_status_display;
   	    vhost_traffic_status_display_format html;	
        }
		
cd /apps/nginx/sbin
nginx -t 
./nginx

下载nginx-vts-exporter-0.10.0.linux-amd64.tar.gz
wget https://github.com/hnlq715/nginx-vts-exporter/releases/download/v0.10.0/nginx-vts-exporter-0.10.0.linux-amd64.tar.gz
tar -zxf nginx-vts-exporter-0.10.0.linux-amd64.tar.gz
cd nginx-vts-exporter-0.10.0.linux-amd64
./nginx-vts-exporter  -nginx.scrape_uri http://192.168.1.242/status/format/json

配置prometheus.yml
  - job_name: "nginx"
    static_configs:
      - targets: ["192.168.1.242:9913"]
	
导入模板:2949
2.6 blackbox_exporter
作用:
	http/https:url/api 可用性监测
	TCP:端口监听监测
	ICMP:主机存活监测
	DNS:域名解析
部署:
wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.19.0/blackbox_exporter-0.19.0.linux-amd64.tar.gz
tar -zxf blackbox_exporter-0.19.0.linux-amd64.tar.gz
ln -sf blackbox_exporter-0.19.0.linux-amd64 blackbox_exporter
创建启动文件:
[Unit]
Description=Prometheus Blackbox Exporter
After=network.target
[Service]
Type=simple
User=root
Group=root
ExecStart=/apps/blackbox_exporter/blackbox_exporter --config.file=/apps/blackbox_exporter/blackbox.yml --web.listen-address=:9115
Restart=on-failure
# Fixed: "[Instal]" is not a valid systemd section name; with the typo,
# "systemctl enable" cannot find WantedBy and enabling the unit fails.
[Install]
WantedBy=multi-user.target

systemctl daemon-reload;systemctl restart blackbox_exporter.service ;systemctl enable blackbox_exporter.service 
配置prometheus.yml:
# 网站监控
  - job_name: 'http_status'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets: ['http://www.xiaomi.com', 'http://www.magedu.com']
        labels:
          instance: http_status
          group: web
    relabel_configs:
      - source_labels: [__address__] #relabel通过将__address__(当前目标地址)写入__param_target标签来创建一个label。
        target_label: __param_target #监控目标www.xiaomi.com,作为__address__的value
      - source_labels: [__param_target] #监控目标
        target_label: url #将监控目标与url创建一个label
      - target_label: __address__
        replacement: 192.168.1.242:9115


# icmp 检测
  - job_name: 'ping_status'
    metrics_path: /probe
    params:
      module: [icmp]
    static_configs:
      - targets: ['192.168.1.1',"223.6.6.6"]
        labels:
          instance: 'ping_status'
          group: 'icmp'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: ip #将ip与__param_target创建一个label
      - target_label: __address__
        replacement: 192.168.1.242:9115


# 端口监控
  - job_name: 'port_status'
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets: ['192.168.1.72:9100', '192.168.1.241:3000','192.168.1.72:22']
        labels:
          instance: 'port_status'
          group: 'port'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: ip
      - target_label: __address__
        replacement: 192.168.1.242:9115
		
模版:
9965,13587

3.prometheus告警-邮件、钉钉关键字及微信告警

3.1 安装alertmanager
wget https://github.com/prometheus/alertmanager/releases/download/v0.23.0/alertmanager-0.23.0.linux-amd64.tar.gz
tar -zxf alertmanager-0.23.0.linux-amd64.tar.gz 
ln -sf alertmanager-0.23.0.linux-amd64 alertmanager
编辑启动文件
cat /etc/systemd/system/alertmanager.service
[Unit]
# Fixed: description previously said "Prometheus Blackbox Exporter"
# (copied from the blackbox unit file).
Description=Prometheus Alertmanager
After=network.target
[Service]
Type=simple
User=root
Group=root
ExecStart=/apps/alertmanager/alertmanager --config.file=/apps/alertmanager/alertmanager.yml
Restart=on-failure
# Fixed: "[Instal]" -> "[Install]" so the unit can be enabled at boot.
[Install]
WantedBy=multi-user.target

systemctl daemon-reload&&systemctl restart alertmanager.service&&systemctl enable alertmanager.service

3.2 配置邮箱告警
配置alertmanager.yml

global:
  # How long to wait before declaring an alert resolved after it stops firing.
  resolve_timeout: 5m
  # QQ mail SMTP settings; port 465 is implicit TLS, hence require_tls: false.
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: 'xxx@qq.com'
  smtp_auth_username: 'xxx@qq.com'
  smtp_auth_password: 'xxx'
  smtp_hello: '@qq.com'
  smtp_require_tls: false
route:
  # Alerts sharing the same alertname are batched into one notification.
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 30s
  receiver: 'web.hook'
receivers:
- name: 'web.hook'
  email_configs:
  - to: 'xxx@qq.com'
# Suppress 'warning' alerts while a 'critical' alert with the same
# alertname/dev/instance label values is firing.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

配置告警规则:pods_rule.yaml

groups:
  - name: alertmanager_pod.rules
    rules:
    # Fires when a container's CPU usage stays above 10% for 2 minutes.
    - alert: Pod_all_cpu_usage
      expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 10
      for: 2m
      labels:
        severity: critical
        service: pods
      annotations:
        description: 容器 {{ $labels.name }} CPU 资源利用率大于 10% , (current value is {{ $value }})
        summary: Dev CPU 负载告警

    - alert: Pod_all_memory_usage
      #expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 10  # alternative: memory usage above 10%
      # NOTE(review): node_memory_MemFree_bytes normally has no "name" label,
      # so the {name!=""} selector may match nothing — verify against targets.
      expr: sort_desc(avg by(name)(irate(node_memory_MemFree_bytes{name!=""}[5m]))) > 2147483648   # free memory above 2 GiB
      for: 2m
      labels:
        severity: critical
      annotations:
        description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
        summary: Dev Memory 负载告警

    # Fires when pod network receive rate exceeds 50 MiB/s.
    - alert: Pod_all_network_receive_usage
      expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 50*1024*1024
      for: 2m
      labels:
        severity: critical
      annotations:
        description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M , (current value is {{ $value }})

    - alert: node内存可用大小
      #expr: node_memory_MemFree_bytes < 512*1024*1024 # intentionally wrong (kept for testing)
      expr: node_memory_MemFree_bytes < 1 # intentionally wrong (kept for testing)
      for: 15s
      labels:
        severity: critical
      annotations:
        description: node可用内存小于4G

  - name: alertmanager_node.rules
    rules:
    # Critical: disk usage above 80%.
    - alert: 磁盘容量
      expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
      for: 2s
      labels:
        severity: critical
      annotations:
        summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
        description: "{{$labels.mountpoint }} 磁盘分区使用大于80%(目前使用:{{$value}}%)"

    # Warning: disk usage above 60%.
    - alert: 磁盘容量
      expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 60
      for: 2s
      labels:
        severity: warning
      annotations:
        summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
        # Fixed: the threshold is 60%, but the message said 80% (copied from
        # the critical rule above).
        description: "{{$labels.mountpoint }} 磁盘分区使用大于60%(目前使用:{{$value}}%)"
prometheus.yml增加报警规则:
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
           - 192.168.1.242:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
   - "/apps/prometheus/rules/pods_rule.yaml"
3.3 配置钉钉告警
下载prometheus-webhook-dingtalk
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v1.4.0/prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
tar -zxf prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
ln -sf prometheus-webhook-dingtalk-1.4.0.linux-amd64 prometheus-webhook-dingtalk
启动:
./prometheus-webhook-dingtalk --web.listen-address="0.0.0.0:8060" --ding.profile="alertname=https://oapi.dingtalk.com/robot/send?access_token=xxx" 
修改alertmanager.yml
- name: webhook
  webhook_configs:
  - url: http://192.168.1.242:8060/dingtalk/alertname/send  
    send_resolved: true
重启promethues

3.4 企业微信告警
配置机器人忽略
配置alertmanager.yml:
- name: 'wechat'
  wechat_configs:
  - corp_id: 'xxx'
    to_party: '2'
    agent_id: '1000002'
    api_secret: 'xxx'
    send_resolved: true
3.5 分类告警分发
配置alertmanager.yml:
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 20s
  repeat_interval: 2m
  receiver: 'wechat'
  routes:
  - receiver: 'webhook'
    group_wait: 10s
    match_re:
      instance: 192.168.1.72
3.6 告警抑制与静默
高等级告警自动抑制低等级告警
静默手动调整

4.prometheus联邦集群

prometheus联邦和prometheus server安装配置一样
prometheus采集联邦数据配置如下:

  # Pull selected series from the downstream Prometheus at 192.168.1.242
  # through its /federate endpoint.
  - job_name: 'prometheus-federate-1.242'
    scrape_interval: 10s
    # Keep the label values exposed by the federated server instead of
    # overwriting them with this server's target labels.
    honor_labels: true
    metrics_path: '/federate'
    params:
      # Series selectors: the "prometheus" job, recording rules named
      # "job:*", and all metrics whose names start with "node".
      'match[]':
       - '{job="prometheus"}'
       - '{__name__=~"job:.*"}'
       - '{__name__=~"node.*"}'
    static_configs:
    - targets:
      - '192.168.1.242:9090'

  # Same federation setup for the second downstream server (192.168.1.191).
  - job_name: 'prometheus-federate-1.191'
    scrape_interval: 10s
    honor_labels: true
    metrics_path: '/federate'
    params:
      'match[]':
       - '{job="prometheus"}'
       - '{__name__=~"job:.*"}'
       - '{__name__=~"node.*"}'
    static_configs:
    - targets:
      - '192.168.1.191:9090'

5.prometheus和grafana基于VictoriaMetrics存储集群实现数据读写分离

部署VictoriaMetrics集群
下载: wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/v1.71.0/victoria-metrics-amd64-v1.71.0-cluster.tar.gz
安装:
cat /etc/systemd/system/vminsert.service
[Unit]
# Fixed: description previously said "Vmstorage Server" (copy-paste).
Description=Vminsert Server
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/tmp
# Accepts writes on :8480 and shards them across the three storage nodes.
ExecStart=/usr/local/bin/vminsert-prod -httpListenAddr :8480 -storageNode=192.168.1.191:8400,192.168.1.192:8400,192.168.1.193:8400
# Fixed: "[Instal]" -> "[Install]" so the unit can be enabled.
[Install]
WantedBy=multi-user.target

cat  /etc/systemd/system/vmselect.service
[Unit]
# Fixed: description previously said "Vmstorage Server" (copy-paste).
Description=Vmselect Server
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/tmp
# Serves queries on :8481, reading from the three storage nodes.
ExecStart=/usr/local/bin/vmselect-prod -httpListenAddr :8481 -storageNode=192.168.1.191:8401,192.168.1.192:8401,192.168.1.193:8401
# Fixed: "[Instal]" -> "[Install]" so the unit can be enabled.
[Install]
WantedBy=multi-user.target

 cat vmstorage.service
[Unit]
Description=Vmstorage Server
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/tmp
# Listens on :8482 (HTTP), :8400 (vminsert) and :8401 (vmselect); note each
# flag value needs a space after the flag name (the earlier commented-out
# variant without spaces was removed).
ExecStart=/usr/local/bin/vmstorage-prod -loggerTimezone Asia/Shanghai -storageDataPath /data/vmstorage-data -httpListenAddr :8482 -vminsertAddr :8400 -vmselectAddr :8401
# Fixed: "[Instal]" -> "[Install]" so the unit can be enabled.
[Install]
WantedBy=multi-user.target

prometheus.yml配置:
remote_write:
  - url: http://192.168.1.191:8480/insert/0/prometheus
  - url: http://192.168.1.192:8480/insert/0/prometheus
  - url: http://192.168.1.193:8480/insert/0/prometheus

配置负载均衡器:
listen victoria-metrics
        bind 192.168.1.198:8481
        server 192.168.1.191 192.168.1.191:8481 check inter 2000 fall 3 rise 5
        server 192.168.1.192 192.168.1.192:8481 check inter 2000 fall 3 rise 5
        server 192.168.1.193 192.168.1.193:8481 check inter 2000 fall 3 rise 5
		
grafana配置:
http://192.168.1.198:8481/select/0/prometheus

posted @ 2022-03-14 23:51  没有猫的猫奴  阅读(32)  评论(0)    收藏  举报