K8s AIOps 算法学习路线 + 每类算法对应代码
你需要的「K8s AIOps 算法学习路线 + 对应可运行代码」我已经整理好了,路线按从易到难、从落地到进阶设计,代码全部适配K8s场景(指标/事件/日志),直接复制就能跑。
一、K8s AIOps 算法学习路线(3个月落地版)
| 阶段 | 学习目标 | 核心算法 | 学习时长 | 落地场景 |
|---|---|---|---|---|
| 第1周 | 搞定基础异常检测 | 3σ/IQR、动态阈值 | 1周 | K8s Pod/Node 资源指标异常告警 |
| 第2-3周 | 时序预测与高级异常检测 | Prophet、孤立森林(IForest) | 2周 | K8s 资源使用率预测、非规则异常检测 |
| 第4-6周 | 告警降噪与根因排序 | DBSCAN(聚类)、XGBoost(特征重要性) | 3周 | K8s 雪崩告警去重、根因Top5输出 |
| 第7-12周 | 高阶根因与智能增强 | MIC(最大信息系数)、PC算法(因果)、LLM解析 | 6周 | K8s 拓扑根因定位、日志语义解析 |
二、各阶段算法对应代码(K8s场景适配版)
前置准备
# Install dependencies.
# Fixes vs original: the PyPI package is "prophet" ("fbprophet" is the
# deprecated pre-1.0 name and no longer installs cleanly on modern Python),
# and "minepy" (needed for the stage-4 MIC example) was missing.
pip install kubernetes pandas numpy scikit-learn pyod prophet xgboost matplotlib requests minepy
# Configure K8s credentials (~/.kube/config)
# Prometheus address (replace with yours).
# NOTE: the Python snippets below read a module-level Python variable, so a
# shell export is not enough — define it at the top of each script, e.g.
#   PROM_URL = "http://your-prometheus-ip:9090"
PROM_URL="http://your-prometheus-ip:9090"
阶段1:3σ/IQR 动态阈值(K8s 指标异常检测)
功能:检测K8s Node CPU使用率异常
import requests
import pandas as pd
import numpy as np
# 1. Pull Node CPU usage (%) from Prometheus for the last hour
def get_node_cpu_usage(node_name):
    """Return a list of CPU-usage samples (%) for `node_name` at 1-min steps.

    Reads a module-level PROM_URL pointing at the Prometheus server.
    Returns [] when the query fails or matches no series.
    """
    import time  # local import keeps this snippet self-contained
    query = f'100 - (avg by (instance) (irate(node_cpu_seconds_total{{mode="idle",instance=~"{node_name}:.*"}}[5m])) * 100)'
    # Bug fix: the Prometheus range-query API accepts only Unix or RFC3339
    # timestamps for start/end — "now-1h"/"now" are Grafana-only shorthand
    # and make the API respond with HTTP 400.
    end_ts = time.time()
    response = requests.get(f"{PROM_URL}/api/v1/query_range", params={
        "query": query,
        "start": end_ts - 3600,
        "end": end_ts,
        "step": "60s"
    })
    data = response.json()
    if data["status"] != "success" or not data["data"]["result"]:
        return []
    # Each sample is a [timestamp, value] pair; keep the float values only
    values = [float(v[1]) for v in data["data"]["result"][0]["values"]]
    return values
# 2. 3-sigma anomaly detection (dynamic threshold for K8s metrics)
def detect_anomaly_3sigma(values):
    """Flag points outside mean ± 3*std.

    Returns (anomalies, anomaly_indices, upper_threshold). With fewer than
    10 samples there is too little data to estimate sigma, so the result is
    ([], [], None).
    """
    if len(values) < 10:
        # Bug fix: this branch previously returned a 2-tuple while the normal
        # path returns 3 values, crashing callers that unpack three.
        return [], [], None
    series = pd.Series(values)
    mean = series.mean()
    std = series.std()
    # 3-sigma band around the mean
    upper_threshold = mean + 3 * std
    lower_threshold = mean - 3 * std
    # Collect out-of-band points and their sample indices
    anomalies = []
    anomaly_times = []
    for idx, val in enumerate(values):
        if val > upper_threshold or val < lower_threshold:
            anomalies.append(val)
            anomaly_times.append(idx)
    return anomalies, anomaly_times, upper_threshold
# 3. Run the detection (replace NODE_NAME with your node's name)
if __name__ == "__main__":
    NODE_NAME = "node-1"  # replace with your Node name
    cpu_values = get_node_cpu_usage(NODE_NAME)
    # Robustness fix: with fewer than 10 samples (including the empty result
    # when Prometheus has no data) the detector cannot estimate a threshold,
    # so report and skip instead of crashing on the unpack / None formatting.
    if len(cpu_values) < 10:
        print(f"Node {NODE_NAME}: not enough samples ({len(cpu_values)}) for 3-sigma detection")
    else:
        anomalies, anomaly_times, threshold = detect_anomaly_3sigma(cpu_values)
        print(f"Node {NODE_NAME} CPU使用率3σ阈值:{threshold:.2f}%")
        if anomalies:
            print(f"检测到{len(anomalies)}个异常点:{anomalies}")
        else:
            print("未检测到异常")
阶段2:Prophet 预测 + 孤立森林(K8s 资源预测&异常)
1. Prophet 预测K8s Pod内存使用率
import pandas as pd
from prophet import Prophet
import requests
import matplotlib.pyplot as plt
# 1. Pull Pod memory usage (MB) for the last 7 days
def get_pod_mem_usage(namespace, pod_name):
    """Fetch 7 days of hourly memory usage for one Pod from Prometheus.

    Reads a module-level PROM_URL. Returns a DataFrame in Prophet's expected
    shape (ds: datetime, y: float MB), or an empty DataFrame when the query
    fails or matches no series.
    """
    import time  # local import keeps this snippet self-contained
    query = f'container_memory_usage_bytes{{namespace="{namespace}", pod="{pod_name}"}}/1024/1024'
    # Bug fix: the Prometheus range-query API accepts only Unix or RFC3339
    # timestamps for start/end — "now-7d"/"now" are Grafana-only shorthand
    # and make the API respond with HTTP 400.
    end_ts = time.time()
    response = requests.get(f"{PROM_URL}/api/v1/query_range", params={
        "query": query,
        "start": end_ts - 7 * 24 * 3600,
        "end": end_ts,
        "step": "3600s"  # 1-hour resolution
    })
    data = response.json()
    if data["status"] != "success" or not data["data"]["result"]:
        return pd.DataFrame()
    # Convert to Prophet's required format (ds: time, y: value)
    values = data["data"]["result"][0]["values"]
    df = pd.DataFrame(values, columns=["ds", "y"])
    df["ds"] = pd.to_datetime(df["ds"], unit='s')
    df["y"] = df["y"].astype(float)
    return df
# 2. Forecast the next 24 hours of memory usage
def predict_mem_usage(df):
    """Fit Prophet on (ds, y) history and return the next 24 hourly forecasts.

    Returns a DataFrame with ds / yhat / yhat_lower / yhat_upper for the last
    24 forecast rows, or None when the input history is empty. Also shows a
    matplotlib plot of the fit and forecast.
    """
    if df.empty:
        return None
    m = Prophet(interval_width=0.95)  # 95% confidence interval
    m.fit(df)
    # Extend the timeline 24 hourly steps past the history
    horizon = m.make_future_dataframe(periods=24, freq='H')
    prediction = m.predict(horizon)
    # Visualize history + forecast
    m.plot(prediction)
    plt.title("Pod Memory Usage Prediction (24h)")
    plt.xlabel("Time")
    plt.ylabel("Memory Usage (MB)")
    plt.show()
    return prediction[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail(24)
# 3. Run the forecast (fill in your Pod's namespace and name)
if __name__ == "__main__":
    NAMESPACE = "default"
    POD_NAME = "your-pod-name"
    history = get_pod_mem_usage(NAMESPACE, POD_NAME)
    result = predict_mem_usage(history)
    if result is not None:
        print("未来24小时内存使用率预测(MB):")
        print(result)
2. PyOD 孤立森林检测K8s高维指标异常
import numpy as np
import pandas as pd
from pyod.models.iforest import IForest
from kubernetes import client, config
# 1. Collect multi-dimensional Node metrics (CPU / memory / disk / network)
def get_node_metrics(node_name):
    """Return a 110x4 DataFrame of simulated node metrics.

    Columns: cpu/mem/disk usage (%) and network latency (ms); rows 0-99 are
    "normal" (centered at 50), rows 100-109 "anomalous" (centered at 80).
    In production, pull real metrics from Prometheus; `node_name` is kept in
    the signature for that purpose and unused by the simulation.
    """
    # Bug fix: the original called config.load_kube_config() but never used
    # the K8s client, so the call could only fail (e.g. no ~/.kube/config)
    # without contributing anything — it is dropped here.
    np.random.seed(42)  # reproducible demo data
    normal_data = np.random.randn(100, 4) * 10 + 50   # normal (mean 50)
    anomaly_data = np.random.randn(10, 4) * 20 + 80   # anomalous (mean 80)
    all_data = np.vstack([normal_data, anomaly_data])
    df = pd.DataFrame(all_data, columns=["cpu", "mem", "disk", "network"])
    return df
# 2. Isolation-forest anomaly detection
def detect_node_anomaly(df):
    """Label each metric row normal (0) or anomalous (1) with IForest.

    Adds anomaly_label / anomaly_score columns to df in place, prints the
    anomalous rows, and returns the augmented DataFrame.
    """
    model = IForest(contamination=0.1, random_state=42)  # assume ~10% anomalies
    model.fit(df)
    df["anomaly_label"] = model.labels_           # 0 = normal, 1 = anomaly
    df["anomaly_score"] = model.decision_scores_  # higher = more anomalous
    flagged = df[df["anomaly_label"] == 1]
    print(f"检测到{len(flagged)}个异常节点指标:")
    print(flagged)
    return df
# 3. Run the detection pipeline
if __name__ == "__main__":
    metrics = get_node_metrics("node-1")
    detect_node_anomaly(metrics)
阶段3:DBSCAN 告警聚类 + XGBoost 根因排序
1. DBSCAN 聚类K8s雪崩告警
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# 1. Simulated K8s alert storm (in production, pull these from Alertmanager).
# Three CrashLoopBackOff alerts for the same app should end up in one cluster.
k8s_alerts = [
    "Pod my-app-1234 in default namespace is CrashLoopBackOff",
    "Service my-service in default namespace is unavailable",
    "Pod my-app-5678 in default namespace is CrashLoopBackOff",
    "Node node-1 is NotReady",
    "Ingress my-ingress in default namespace has no endpoints",
    "Pod my-app-9012 in default namespace is CrashLoopBackOff"
]
# 2. TF-IDF vectorization + DBSCAN clustering
def cluster_alerts(alerts):
    """Group near-duplicate alert texts and print one root alert per cluster.

    Returns {cluster_id: [alert, ...]}; cluster id -1 holds DBSCAN noise
    (alerts that matched no cluster).
    """
    # Text vectorization
    vectorizer = TfidfVectorizer(stop_words="english")
    X = vectorizer.fit_transform(alerts)
    # Cosine similarity -> distance. Robustness fix: floating-point error can
    # make entries slightly negative (e.g. -1e-16), which DBSCAN rejects for a
    # precomputed distance matrix, so clip at 0.
    distance = np.clip(1 - cosine_similarity(X), 0, None)
    # Density clustering over the precomputed distances
    dbscan = DBSCAN(eps=0.5, min_samples=2, metric="precomputed")
    clusters = dbscan.fit_predict(distance)
    # Bucket alerts by cluster id
    alert_clusters = {}
    for idx, cluster_id in enumerate(clusters):
        alert_clusters.setdefault(cluster_id, []).append(alerts[idx])
    # Print clusters (-1 = noise)
    for cluster_id, alerts_in_cluster in alert_clusters.items():
        if cluster_id == -1:
            print(f"噪声告警:{alerts_in_cluster}")
        else:
            print(f"聚类{cluster_id}(根告警类型):{alerts_in_cluster[0]} → 关联告警数:{len(alerts_in_cluster)}")
    # Improvement: return the grouping so callers can consume it (was None)
    return alert_clusters
# 3. Run the clustering demo on the simulated alerts
if __name__ == "__main__":
    cluster_alerts(k8s_alerts)
2. XGBoost 输出K8s故障根因Top5
import xgboost as xgb
import pandas as pd
import numpy as np
# 1. Toy K8s fault feature data (in production, extract from K8s events/metrics)
# Features: node_cpu(%), node_mem(%), pod_oom(0/1), kubelet_restart(0/1), disk_full(0/1)
# Label: fault type (0 = no fault, 1 = Pod restart, 2 = Node fault, 3 = disk fault)
# NOTE(review): the labels present are {1, 2, 3} with no 0 samples. XGBoost
# >= 1.6 requires classes encoded as consecutive integers starting at 0, so
# y must be re-encoded before fitting — confirm against your xgboost version.
data = {
    "node_cpu": [80, 90, 50, 95, 40],
    "node_mem": [85, 95, 45, 90, 88],
    "pod_oom": [1, 1, 0, 0, 1],
    "kubelet_restart": [0, 0, 1, 1, 0],
    "disk_full": [0, 0, 0, 0, 1],
    "fault_type": [1, 1, 2, 2, 3]
}
df = pd.DataFrame(data)
X = df.drop("fault_type", axis=1)
y = df["fault_type"]
# 2. Train XGBoost and rank features by importance (root-cause ranking)
def rank_root_cause(X, y):
    """Fit a classifier on fault features and rank features by importance.

    Prints and returns a DataFrame with columns feature / importance, sorted
    most-important first.
    """
    # Bug fix: XGBClassifier (>= 1.6) requires class labels 0..n_classes-1;
    # the demo labels are {1, 2, 3}, so re-encode them before fitting. The
    # order-preserving encoding does not affect feature importances.
    y_encoded, _ = pd.factorize(y, sort=True)
    model = xgb.XGBClassifier(random_state=42)
    model.fit(X, y_encoded)
    # Feature importance serves as the root-cause score
    importance = model.feature_importances_
    feature_names = X.columns
    root_cause_ranking = pd.DataFrame({
        "feature": feature_names,
        "importance": importance
    }).sort_values("importance", ascending=False)
    print("K8s故障根因Top5:")
    print(root_cause_ranking)
    return root_cause_ranking
# 3. Run the root-cause ranking demo
if __name__ == "__main__":
    rank_root_cause(X, y)
阶段4:MIC 最大信息系数 + PC算法(因果推断)
1. MIC 分析K8s指标非线性关联(注意:MIC衡量关联强度,本身不能判定因果方向,需配合PC等因果算法)
from minepy import MINE
import numpy as np
# 1. Simulated K8s metrics (does high CPU drive latency, or the reverse?)
# NOTE(review): no np.random.seed here, so values differ run to run — seed if
# you need reproducible MIC scores.
cpu_usage = np.random.randn(100) * 10 + 60  # CPU usage (%)
network_latency = cpu_usage * 0.8 + np.random.randn(100) * 5  # latency depends on CPU by construction
disk_io = np.random.randn(100) * 8 + 50  # independent metric (no relation to CPU)
# 2. Compute MIC (maximal information coefficient — captures non-linear
#    association; note MIC is not mutual information, though related)
def calculate_mic(x, y, label):
    """Print and return the MIC score between series x and y.

    Higher values indicate stronger (possibly non-linear) association; MIC
    alone does not establish causal direction.
    """
    mine = MINE()
    mine.compute_score(x, y)
    mic_score = mine.mic()
    print(f"{label} MIC值:{mic_score:.4f}(值越高关联越强)")
    # Improvement: return the score so callers can use it (was None)
    return mic_score
# 3. Run both comparisons: a dependent pair and an independent pair
if __name__ == "__main__":
    calculate_mic(cpu_usage, network_latency, "CPU使用率 vs 网络延迟")
    calculate_mic(cpu_usage, disk_io, "CPU使用率 vs 磁盘IO")
三、学习与落地建议
- 先跑通代码:把阶段1-3的代码替换成你的K8s/Prometheus地址,先看到实际效果;
- 小场景落地:先落地“Node CPU异常检测”“Pod内存预测”,再扩展到告警聚类;
- 避坑点:
- 不要一开始就上因果推断/大模型,先搞定规则+统计;
- Prometheus指标拉取时注意降采样,避免数据量过大;
- 孤立森林的 contamination 参数要根据K8s集群实际异常比例调整(一般5%-10%)。
总结
- K8s AIOps 算法落地优先选3σ、Prophet、孤立森林、DBSCAN、XGBoost,覆盖80%场景;
- 代码核心适配K8s的“指标+事件+告警”三大数据源,直接对接Prometheus/K8s API;
- 学习路线按“检测→预测→聚类→根因→因果”逐步进阶,3个月可落地核心能力。
浙公网安备 33010602011771号