k8s全方位监控中-常用rules配置

[root@VM_0_48_centos prometheus]# cat alertmanager-configmap.yaml
# ConfigMap "alertmanager-config" (namespace kube-system).
# Its single data key, alertmanager.yml, holds the Alertmanager configuration:
# global SMTP settings, one catch-all route and one e-mail receiver.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.163.com:25'
      smtp_from: 'xjq18125012766@163.com'
      smtp_auth_username: 'xjq18125012766@163.com'
      # NOTE(review): the SMTP password is stored in plain text in a ConfigMap;
      # consider sourcing it from a Kubernetes Secret instead.
      smtp_auth_password: 'test123'
      # NOTE(review): with TLS disabled the SMTP credentials presumably travel
      # unencrypted - confirm this is intended for this mail relay.
      smtp_require_tls: false

    route:
      # All alerts sharing an alertname are batched into one notification.
      group_by: ['alertname']
      group_wait: 10s
      group_interval: 30s
      # Re-notify for a still-firing alert every 4h (Alertmanager's documented
      # default). A value below group_interval would re-send the same e-mail
      # on nearly every group flush.
      repeat_interval: 4h
      receiver: 'mail'

    receivers:
      - name: 'mail'
        email_configs:
        - to: '2654071080@qq.com'
[root@VM_0_48_centos prometheus]# cat  prometheus-rules.yaml
# ConfigMap "prometheus-rules-config" (namespace kube-system).
# Each key under `data` is a Prometheus rule file stored as a literal block
# scalar; the text inside the scalar is the file content.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules-config
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
data:
  # pods.yml - rule group "pod.rules". Every rule exposes the triggering
  # sample through a `value` annotation ({{ $value }}).
  #
  #   alert               condition (hold time)                     severity
  #   InstanceDown        up == 0 (2m)                              error
  #   PodSvcDown          probe_success == 0 (1m)                   error
  #   MysqlCon            MysqlCon_metric > 40 (1m)                 warning
  #   PodCpuUsage         per-pod CPU rate over 1m * 100 > 80 (5m)  warning
  #   PodMemoryUsage      RSS / memory limit * 100 > 80 (5m)        warning
  #   PodFailed           any pod in phase "Failed" (1m)            error
  #   PodPending          any pod in phase "Pending" (1m)           error
  #   PodNetworkReceive   receive rate / 1000 > 30000 (5m)          warning
  #   PodNetworkTransmit  transmit rate / 1000 > 30000 (5m)         warning
  #   PodRestart          restart counter changed within 1m (5s)    warning
  #
  # PodMemoryUsage drops +inf results (`!= +inf`) before comparing with 80.
  # The network rules divide the byte rate by 1000, so the 30000 threshold
  # matches the "30MB/s" wording of their summaries.
  # NOTE(review): the container_* rules group by `pod_name` while the
  # kube_pod_* rules group by `pod` - presumably the label names differ per
  # exporter; verify against the labels actually present in this cluster.
  # NOTE(review): MysqlCon_metric is not defined in this file - presumably a
  # custom exporter metric; confirm it exists.
  pods.yml: |
    groups:
    - name: pod.rules
      rules:
      - alert: InstanceDown
        expr: up == 0
        for: 2m
        labels:
          severity: error 
        annotations:
          summary: "监控采集器{{ $labels.instance }}停止工作"
          value: "{{ $value }}"

      - alert: PodSvcDown
        expr: probe_success == 0
        for: 1m
        labels:
          severity: error 
        annotations:
          summary: "容器代理服务{{ $labels.instance }}停止工作"
          value: "{{ $value }}"

      - alert: MysqlCon
        expr: MysqlCon_metric > 40
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "mysql连接数过高"
          value: "{{ $value }}"

      - alert: PodCpuUsage
        expr: sum by(pod_name, namespace) (rate(container_cpu_usage_seconds_total{image!=""}[1m])) * 100 > 80
        for: 5m
        labels:
          severity: warning 
        annotations:
          summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod_name }} CPU使用率超过80%"
          value: "{{ $value }}"

      - alert: PodMemoryUsage
        expr: sum(container_memory_rss{image!=""}) by(pod_name, namespace) / sum(container_spec_memory_limit_bytes{image!=""}) by(pod_name, namespace) * 100 != +inf > 80
        for: 5m
        labels:
          severity: warning 
        annotations:
          summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod_name }} 内存使用率超过80%"
          value: "{{ $value }}"

      - alert: PodFailed
        expr: sum (kube_pod_status_phase{phase="Failed"}) by (pod,namespace) > 0
        for: 1m
        labels:
          severity: error 
        annotations:
          summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod }} pod status is Failed"
          value: "{{ $value }}"

      - alert: PodPending
        expr: sum (kube_pod_status_phase{phase="Pending"}) by (pod,namespace) > 0
        for: 1m
        labels:
          severity: error 
        annotations:
          summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod }} status is Pending"
          value: "{{ $value }}"

      - alert: PodNetworkReceive
        expr: sum (rate (container_network_receive_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod_name,namespace)  > 30000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod_name }} 接受到的网络入流量大于30MB/s"
          value: "{{ $value }}K/s"                

      - alert: PodNetworkTransmit
        expr: sum (rate (container_network_transmit_bytes_total{image!="",name=~"^k8s_.*"}[5m]) /1000) by (pod_name,namespace) > 30000
        for: 5m
        labels:
          severity: warning 
        annotations:
          summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod_name }} 传输的网络出流量大于30MB/s"
          value: "{{ $value }}K/s"

      - alert: PodRestart
        expr: sum (changes (kube_pod_container_status_restarts_total[1m])) by (pod,namespace) > 0
        for: 5s
        labels:
          severity: warning 
        annotations:
          summary: "容器ns: {{ $labels.namespace }} | pod: {{ $labels.pod }} pod is restart"
          value: "{{ $value }}"
  # nodes.yml - rule group "node.rules": host-level filesystem, memory and CPU
  # alerts. Every rule exposes the triggering sample through a `value`
  # annotation ({{ $value }}).
  nodes.yml: |
    groups:
    - name: node.rules
      rules:
      # Used percentage (100 - free/size * 100) of the filesystem whose
      # device label is "rootfs"; fires above 85% held for 1m.
      # NOTE(review): presumably not every host reports a "rootfs" device -
      # verify the selector matches series on the target nodes.
      - alert: NodeFilesystemUsage
        expr: 100 - (node_filesystem_free_bytes{device="rootfs"} / node_filesystem_size_bytes{device="rootfs"} * 100) > 85
        for: 1m
        labels:
          severity: warning
        annotations:
          # The message states the same 85% threshold the expression tests.
          summary: "主机 {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率超过85%"
          value: "{{ $value }}"

      # Used memory percentage, counting free + cached + buffers as unused;
      # fires above 80% held for 5m.
      - alert: NodeMemoryUsage
        expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "主机 {{ $labels.instance }} 内存使用率超过80%"
          value: "{{ $value }}"

      # 100 minus the per-instance average idle-mode CPU rate (as a
      # percentage); fires above 80% held for 5m.
      - alert: NodeCPUUsage
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "主机 {{ $labels.instance }} CPU使用率超过80%"
          value: "{{ $value }}"

2、展示结果

posted @ 2019-08-22 16:52  马里亚纳仰望星空  Views(2087)  Comments(0)  Edit  收藏  举报