Kubernetes Skywalking 告警
https://github.com/apache/skywalking/blob/master/docs/en/setup/backend/backend-alarm.md#list-of-all-
解析条目
#vim config/alarm-settings.ymlrules: #定义rule规则
service_cpm_rule: #唯一的规则名称,必须以_rule结尾
# Metrics value need to be long, double or int
metrics-name: service_cpm #指标名称
op:">#操作符,>,>=,<,<=,==
threshold:1#指标圆值
# The lenath of time to evaluate the metrics
period: 2 #评估指标的间隔周期
# How many times after the metrics match the condition, will trigger alarm
count: 1 #匹配成功多少次就会触发告警
# How many times of checks, the alarm keeps silence after alarm triggered, default as same as period.
#silence-period:3
silence-period:2 #触发告警后的静默时间
message: dubbo-provider service_cpm 大于1了 #告警信息
core.oal:获取参数配置文件
#容器中/skywalking/config/oal/core.oal
[root@xianchaonode2 ~]# docker exec -it 08020a474bfc bash
bash-5.0# cd /skywalking/config/oal/
browser.oal core.oal disable.oal dotnet-agent.oal java-agent.oal tcp.oal
bash-5.0# cd /skywalking/config/oal/
bash-5.0# cat core.oal
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
// For services using protocols HTTP 1/2, gRPC, RPC, etc., the cpm metrics means "calls per minute",
// for services that are built on top of TCP, the cpm means "packages per minute".
// All scope metrics
all_percentile = from(All.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
all_heatmap = from(All.latency).histogram(100, 20);
// Service scope metrics
service_resp_time = from(Service.latency).longAvg();
service_sla = from(Service.*).percent(status == true);
service_cpm = from(Service.*).cpm();
service_percentile = from(Service.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_apdex = from(Service.latency).apdex(name, status);
// Service relation scope metrics for topology
service_relation_client_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_relation_server_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_relation_client_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_relation_server_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_relation_client_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_relation_server_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_relation_client_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_relation_server_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
// Service Instance relation scope metrics for topology
service_instance_relation_client_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_instance_relation_server_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_instance_relation_client_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_instance_relation_server_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_instance_relation_client_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_instance_relation_server_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_instance_relation_client_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_instance_relation_server_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
// Service Instance Scope metrics
service_instance_sla = from(ServiceInstance.*).percent(status == true);
service_instance_resp_time= from(ServiceInstance.latency).longAvg();
service_instance_cpm = from(ServiceInstance.*).cpm();
// Endpoint scope metrics
endpoint_cpm = from(Endpoint.*).cpm();
endpoint_avg = from(Endpoint.latency).longAvg();
endpoint_sla = from(Endpoint.*).percent(status == true);
endpoint_percentile = from(Endpoint.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
// Endpoint relation scope metrics
endpoint_relation_cpm = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
endpoint_relation_resp_time = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).longAvg();
endpoint_relation_sla = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
endpoint_relation_percentile = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
database_access_resp_time = from(DatabaseAccess.latency).longAvg();
database_access_sla = from(DatabaseAccess.*).percent(status == true);
database_access_cpm = from(DatabaseAccess.*).cpm();
database_access_percentile = from(DatabaseAccess.latency).percentile(10);
配置告警参数
#钉钉机器人使用参考链接:https://help.aliyun.com/document_detail/251838.html
#docker-compose.yml
#本地配置文件 挂在到容器中
[root@xianchaonode2 skywalking]# cat docker-compose.yaml
version: '3.3'
services:
es7:
image: elasticsearch:7.10.1
container_name: es7
ports:
- 9200:9200
- 9300:9300
environment:
- discovery.type=single-node #单机模式
- bootstrap.memory_lock=true #锁定物理内存地址
- "ES_JAVA_OPTS=-Xms1048m -Xmx1048m" #堆内存大小
- TZ=Asia/Shanghai
ulimits:
memlock:
soft: -1
hard: -1
volumes:
- /data/elasticsearch/data:/usr/share/elasticsearch/data
skywalking-oap:
image: apache/skywalking-oap-server:8.6.0-es7
container_name: skywalking-oap
restart: always
volumes:
- ./alarm-settings.yml:/skywalking/config/alarm-settings.yml ##本地配置文件 挂在到容器中
- ./core.oal:/skywalking/config/oal/core.oal #本地配置文件 挂在到容器中
depends_on:
- es7
links:
- es7
ports:
- 11800:11800
- 12800:12800
environment:
TZ: Asia/Shanghai
SW_STORAGE: elasticsearch7
SW_STORAGE_ES_CLUSTER_NODES: es7:9200
skywalking-ui:
image: apache/skywalking-ui:8.6.0
container_name: skywalking-ui
restart: always
depends_on:
- skywalking-oap
links:
- skywalking-oap
ports:
- 8080:8080
environment:
TZ: Asia/Shanghai
SW_OAP_ADDRESS: skywalking-oap:12800
[root@xianchaonode2 skywalking]# cat core.oal
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
// For services using protocols HTTP 1/2, gRPC, RPC, etc., the cpm metrics means "calls per minute",
// for services that are built on top of TCP, the cpm means "packages per minute".
// All scope metrics
all_percentile = from(All.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
all_heatmap = from(All.latency).histogram(100, 20);
// Service scope metrics
service_resp_time = from(Service.latency).longAvg();
service_sla = from(Service.*).percent(status == true);
service_cpm = from(Service.*).cpm();
service_percentile = from(Service.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_apdex = from(Service.latency).apdex(name, status);
// Service relation scope metrics for topology
service_relation_client_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_relation_server_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_relation_client_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_relation_server_call_sla = from(ServiceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_relation_client_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_relation_server_resp_time = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_relation_client_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_relation_server_percentile = from(ServiceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
// Service Instance relation scope metrics for topology
service_instance_relation_client_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
service_instance_relation_server_cpm = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
service_instance_relation_client_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.CLIENT).percent(status == true);
service_instance_relation_server_call_sla = from(ServiceInstanceRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
service_instance_relation_client_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).longAvg();
service_instance_relation_server_resp_time = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).longAvg();
service_instance_relation_client_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.CLIENT).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_instance_relation_server_percentile = from(ServiceInstanceRelation.latency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
// Service Instance Scope metrics
service_instance_sla = from(ServiceInstance.*).percent(status == true);
service_instance_resp_time= from(ServiceInstance.latency).longAvg();
service_instance_cpm = from(ServiceInstance.*).cpm();
// Endpoint scope metrics
endpoint_cpm = from(Endpoint.*).cpm();
endpoint_avg = from(Endpoint.latency).longAvg();
endpoint_sla = from(Endpoint.*).percent(status == true);
endpoint_percentile = from(Endpoint.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
// Endpoint relation scope metrics xks
endpoint_relation_cpm = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
endpoint_relation_resp_time = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).longAvg();
endpoint_relation_sla = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).percent(status == true);
endpoint_relation_percentile = from(EndpointRelation.rpcLatency).filter(detectPoint == DetectPoint.SERVER).percentile(10); // Multiple values including p50, p75, p90, p95, p99
database_access_resp_time = from(DatabaseAccess.latency).longAvg();
database_access_sla = from(DatabaseAccess.*).percent(status == true);
database_access_cpm = from(DatabaseAccess.*).cpm();
database_access_percentile = from(DatabaseAccess.latency).percentile(10);
[root@xianchaonode2 skywalking]# cat alarm-settings.yml
rules:
service_cpm_rule: #监控所有服务的每分钟请求量,如果超过1就告警
metrics-name: service_cpm
op: ">"
threshold: 1 #为了测试,故意设置为1
period: 10
count: 1
silence-period: 5
message: Cpm is more than 1 in last 10 minutes.
dingtalkHooks: #钉钉接收者定义
textTemplate: |- #消息模板 内容必须有关键字 SkyWalking 钉钉上也设置SkyWalking
{
"msgtype": "text",
"text": {
"content": "Apache SkyWalking Alarm \n alertname: %s."
}
}
webhooks: #自己钉钉webhook地址
- url: https://oapi.dingtalk.com/robot/send?access_token=9464b2a09e03c6eb530ba51d422b7a89e442ae7c9018a13baa3c59736773a753
#重新启动项目
[root@xianchaonode2 skywalking]# docker-compose down
Stopping skywalking-ui ... done
Stopping skywalking-oap ... done
Stopping es7 ... done
Removing skywalking-ui ... done
Removing skywalking-oap ... done
Removing es7 ... done
Removing network skywalking_default
[root@xianchaonode2 skywalking]# docker-compose up -d
Creating network "skywalking_default" with the default driver
Creating es7 ... done
Creating skywalking-oap ... done
Creating skywalking-ui ... done
多次访问一下项目 超过1次就会报警