K8s AIOps 算法学习路线 + 每类算法对应代码

你需要的「K8s AIOps 算法学习路线 + 对应可运行代码」我已经整理好了,路线按从易到难、从落地到进阶设计,代码全部适配K8s场景(指标/事件/日志),直接复制就能跑。


一、K8s AIOps 算法学习路线(3个月落地版)

阶段 学习目标 核心算法 学习时长 落地场景
第1周 搞定基础异常检测 3σ/IQR、动态阈值 1周 K8s Pod/Node 资源指标异常告警
第2-3周 时序预测与高级异常检测 Prophet、孤立森林(IForest) 2周 K8s 资源使用率预测、非规则异常检测
第4-6周 告警降噪与根因排序 DBSCAN(聚类)、XGBoost(特征重要性) 3周 K8s 雪崩告警去重、根因Top5输出
第7-12周 高阶根因与智能增强 MIC(最大信息系数)、PC算法(因果)、LLM解析 6周 K8s 拓扑根因定位、日志语义解析

二、各阶段算法对应代码(K8s场景适配版)

前置准备

# Install dependencies
# NOTE: the forecasting code uses "from prophet import Prophet", so install
# "prophet" (the current package name), not the deprecated "fbprophet".
# "minepy" is required by the stage-4 MIC example.
pip install kubernetes pandas numpy scikit-learn pyod prophet xgboost matplotlib requests minepy
# Configure K8s credentials (~/.kube/config)
# Set your Prometheus address (replace with yours)
PROM_URL="http://your-prometheus-ip:9090"

阶段1:3σ/IQR 动态阈值(K8s 指标异常检测)

功能:检测K8s Node CPU使用率异常

import requests
import pandas as pd
import numpy as np

# 1. Pull node CPU usage from Prometheus (last hour)
def get_node_cpu_usage(node_name):
    """Return a list of CPU-usage samples (%) for ``node_name``.

    Queries the Prometheus range API at 60s resolution over the last
    hour. Returns [] when the query fails or yields no series.

    NOTE(review): relies on a module-level PROM_URL — the article only
    defines it as a shell variable, so define it in Python before use.
    """
    import time  # local import keeps the snippet copy-paste friendly

    query = f'100 - (avg by (instance) (irate(node_cpu_seconds_total{{mode="idle",instance=~"{node_name}:.*"}}[5m])) * 100)'
    # The HTTP range API requires numeric (or RFC3339) timestamps;
    # "now-1h"/"now" are console shorthands the API rejects.
    end = time.time()
    start = end - 3600
    response = requests.get(
        f"{PROM_URL}/api/v1/query_range",
        params={"query": query, "start": start, "end": end, "step": "60s"},
        timeout=30,  # don't hang forever on an unreachable Prometheus
    )
    data = response.json()
    if data["status"] != "success" or not data["data"]["result"]:
        return []
    # Each sample is a [timestamp, value-string] pair; keep the values.
    return [float(v[1]) for v in data["data"]["result"][0]["values"]]

# 2. 3-sigma anomaly detection (dynamic threshold for K8s metrics)
def detect_anomaly_3sigma(values):
    """Flag samples outside mean ± 3·std.

    Args:
        values: sequence of float metric samples (e.g. CPU %).

    Returns:
        (anomalies, anomaly_indices, upper_threshold). All three are
        empty/None when fewer than 10 samples are available.
    """
    if len(values) < 10:
        # Too little history for a stable mean/std estimate.
        # Return a 3-tuple so callers can always unpack three values
        # (the original returned only 2 here, crashing the caller).
        return [], [], None
    series = pd.Series(values)
    mean = series.mean()
    std = series.std()
    # 3-sigma band around the mean
    upper_threshold = mean + 3 * std
    lower_threshold = mean - 3 * std
    # Collect out-of-band points and their positions
    anomalies = []
    anomaly_times = []
    for idx, val in enumerate(values):
        if val > upper_threshold or val < lower_threshold:
            anomalies.append(val)
            anomaly_times.append(idx)
    return anomalies, anomaly_times, upper_threshold

# 3. Run detection (replace NODE_NAME with a real node name)
if __name__ == "__main__":
    NODE_NAME = "node-1"  # replace with your node name
    cpu_values = get_node_cpu_usage(NODE_NAME)
    anomalies, anomaly_times, threshold = detect_anomaly_3sigma(cpu_values)
    # Guard: threshold is None when there is too little data, and
    # formatting None with :.2f would raise.
    if threshold is not None:
        print(f"Node {NODE_NAME} CPU使用率3σ阈值:{threshold:.2f}%")
    if anomalies:
        print(f"检测到{len(anomalies)}个异常点:{anomalies}")
    else:
        print("未检测到异常")

阶段2:Prophet 预测 + 孤立森林(K8s 资源预测&异常)

1. Prophet 预测K8s Pod内存使用率

import pandas as pd
from prophet import Prophet
import requests
import matplotlib.pyplot as plt

# 1. Pull pod memory usage (last 7 days)
def get_pod_mem_usage(namespace, pod_name):
    """Return a Prophet-ready DataFrame (ds, y) of pod memory in MB.

    Queries Prometheus at 1-hour resolution over the last 7 days.
    Returns an empty DataFrame when the query fails or has no series.

    NOTE(review): relies on a module-level PROM_URL; the article only
    sets it as a shell variable — define it in Python before calling.
    """
    import time  # local import keeps the snippet self-contained

    query = f'container_memory_usage_bytes{{namespace="{namespace}", pod="{pod_name}"}}/1024/1024'
    # The HTTP range API needs numeric timestamps, not "now-7d"/"now".
    end = time.time()
    start = end - 7 * 24 * 3600
    response = requests.get(
        f"{PROM_URL}/api/v1/query_range",
        params={"query": query, "start": start, "end": end, "step": "3600s"},  # 1h granularity
        timeout=30,
    )
    data = response.json()
    if data["status"] != "success" or not data["data"]["result"]:
        return pd.DataFrame()
    # Convert to Prophet's expected schema: ds = timestamp, y = value.
    values = data["data"]["result"][0]["values"]
    df = pd.DataFrame(values, columns=["ds", "y"])
    df["ds"] = pd.to_datetime(df["ds"], unit='s')
    df["y"] = df["y"].astype(float)
    return df

# 2. Forecast the next 24 hours of memory usage
def predict_mem_usage(df):
    """Fit Prophet on hourly history and forecast 24 hours ahead.

    Expects a DataFrame in Prophet's (ds, y) format. Shows a plot of
    the fit, then returns the final 24 forecast rows
    (ds, yhat, yhat_lower, yhat_upper) — or None when df is empty.
    """
    if df.empty:
        return None

    forecaster = Prophet(interval_width=0.95)  # 95% confidence band
    forecaster.fit(df)

    # Extend the timeline by 24 hourly steps past the observed data.
    horizon = forecaster.make_future_dataframe(periods=24, freq='H')
    forecast = forecaster.predict(horizon)

    # Visualize the observed points and the fitted/forecast curve.
    fig = forecaster.plot(forecast)
    plt.title("Pod Memory Usage Prediction (24h)")
    plt.xlabel("Time")
    plt.ylabel("Memory Usage (MB)")
    plt.show()

    wanted_columns = ["ds", "yhat", "yhat_lower", "yhat_upper"]
    return forecast[wanted_columns].tail(24)

# 3. Run the forecast (replace with your pod's details)
if __name__ == "__main__":
    NAMESPACE = "default"
    POD_NAME = "your-pod-name"
    history = get_pod_mem_usage(NAMESPACE, POD_NAME)
    prediction = predict_mem_usage(history)
    if prediction is not None:
        print("未来24小时内存使用率预测(MB):")
        print(prediction)

2. PyOD 孤立森林检测K8s高维指标异常

import numpy as np
import pandas as pd
from pyod.models.iforest import IForest
from kubernetes import client, config

# 1. Collect multi-dimensional node metrics (CPU/mem/disk/network)
def get_node_metrics(node_name):
    """Return a DataFrame of simulated node metrics.

    Columns: cpu, mem, disk, network (CPU %, mem %, disk %, latency ms).
    The data is synthetic; in production, pull these series from
    Prometheus. ``node_name`` is accepted for interface compatibility
    but is not used by the simulation.
    """
    # NOTE: the original called config.load_kube_config() here, which
    # fails without cluster credentials and is pointless since the data
    # below is simulated — removed.
    np.random.seed(42)  # fixed seed → deterministic demo data
    normal_data = np.random.randn(100, 4) * 10 + 50  # normal regime (mean ~50)
    anomaly_data = np.random.randn(10, 4) * 20 + 80   # anomalous regime (mean ~80)
    all_data = np.vstack([normal_data, anomaly_data])
    # Assemble the feature matrix
    df = pd.DataFrame(all_data, columns=["cpu", "mem", "disk", "network"])
    return df

# 2. Isolation-forest anomaly detection
def detect_node_anomaly(df):
    """Label each row of node metrics as normal (0) or anomalous (1).

    Mutates df in place by adding 'anomaly_label' and 'anomaly_score'
    columns, prints the flagged rows, and returns df.
    """
    # contamination=0.1 assumes ~10% anomalies; tune to your cluster.
    detector = IForest(contamination=0.1, random_state=42)
    detector.fit(df)
    # Attach labels and scores produced by the fitted model
    df["anomaly_label"] = detector.labels_           # 0 = normal, 1 = anomaly
    df["anomaly_score"] = detector.decision_scores_  # higher = more anomalous
    flagged = df[df["anomaly_label"] == 1]
    print(f"检测到{len(flagged)}个异常节点指标:")
    print(flagged)
    return df

# 3. Run the detection
if __name__ == "__main__":
    NODE_NAME = "node-1"
    metrics_df = get_node_metrics(NODE_NAME)
    detect_node_anomaly(metrics_df)

阶段3:DBSCAN 告警聚类 + XGBoost 根因排序

1. DBSCAN 聚类K8s雪崩告警

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Simulated K8s alert storm (in practice, pull from AlertManager).
# The three CrashLoopBackOff alerts share the same wording pattern and
# should land in one cluster; the others are singletons/noise.
k8s_alerts = [
    "Pod my-app-1234 in default namespace is CrashLoopBackOff",
    "Service my-service in default namespace is unavailable",
    "Pod my-app-5678 in default namespace is CrashLoopBackOff",
    "Node node-1 is NotReady",
    "Ingress my-ingress in default namespace has no endpoints",
    "Pod my-app-9012 in default namespace is CrashLoopBackOff"
]

# 2. TF-IDF vectorization + DBSCAN clustering
def cluster_alerts(alerts):
    """Group similar alert texts so an alert storm collapses to a few
    root alert types.

    Vectorizes the messages with TF-IDF, converts cosine similarity to
    a distance matrix, and clusters with DBSCAN. Prints each cluster
    (cluster id -1 is the noise bucket) and returns a dict mapping
    cluster id -> list of alert strings.
    """
    # Text vectorization
    vectorizer = TfidfVectorizer(stop_words="english")
    X = vectorizer.fit_transform(alerts)
    # Cosine similarity, converted to a distance matrix.
    similarity = cosine_similarity(X)
    # Float error can push similarity a hair above 1.0, which would make
    # the distance slightly negative and DBSCAN's precomputed metric
    # reject the matrix — clamp at 0.
    distance = np.clip(1 - similarity, 0.0, None)
    # DBSCAN on the precomputed distance matrix
    dbscan = DBSCAN(eps=0.5, min_samples=2, metric="precomputed")
    clusters = dbscan.fit_predict(distance)
    # Bucket the original alert texts by cluster id.
    alert_clusters = {}
    for idx, cluster_id in enumerate(clusters):
        alert_clusters.setdefault(cluster_id, []).append(alerts[idx])
    # Print clusters (-1 holds noise/unclustered alerts)
    for cluster_id, alerts_in_cluster in alert_clusters.items():
        if cluster_id == -1:
            print(f"噪声告警:{alerts_in_cluster}")
        else:
            print(f"聚类{cluster_id}(根告警类型):{alerts_in_cluster[0]} → 关联告警数:{len(alerts_in_cluster)}")
    return alert_clusters

# 3. Run the clustering
if __name__ == "__main__":
    cluster_alerts(k8s_alerts)

2. XGBoost 输出K8s故障根因Top5

import xgboost as xgb
import pandas as pd
import numpy as np

# 1. Synthetic K8s fault feature data (in practice, extract from K8s
#    events/metrics).
# Features: node_cpu(%), node_mem(%), pod_oom(0/1), kubelet_restart(0/1),
#           disk_full(0/1)
# Label: fault_type (0 = no fault, 1 = pod restart, 2 = node fault,
#        3 = disk fault)
data = {
    "node_cpu": [80, 90, 50, 95, 40],
    "node_mem": [85, 95, 45, 90, 88],
    "pod_oom": [1, 1, 0, 0, 1],
    "kubelet_restart": [0, 0, 1, 1, 0],
    "disk_full": [0, 0, 0, 0, 1],
    "fault_type": [1, 1, 2, 2, 3]
}
df = pd.DataFrame(data)
X = df.drop("fault_type", axis=1)  # feature matrix
y = df["fault_type"]  # fault-type labels (codes 1..3 in this sample)

# 2. Train XGBoost and rank features by importance (root-cause ranking)
def rank_root_cause(X, y):
    """Rank candidate root-cause features by XGBoost feature importance.

    Args:
        X: feature DataFrame (one column per candidate root cause).
        y: fault-type labels (any integer/categorical coding).

    Returns:
        DataFrame with 'feature' and 'importance' columns, sorted by
        importance descending (printed as the root-cause Top-N).
    """
    # XGBoost >= 1.6 requires class labels to be 0..k-1; the sample
    # data uses codes 1..3, so re-encode before fitting.
    y_encoded, _ = pd.factorize(y)
    model = xgb.XGBClassifier(random_state=42)
    model.fit(X, y_encoded)
    # Feature importances act as root-cause scores
    importance = model.feature_importances_
    root_cause_ranking = pd.DataFrame({
        "feature": X.columns,
        "importance": importance
    }).sort_values("importance", ascending=False)
    print("K8s故障根因Top5:")
    print(root_cause_ranking)
    return root_cause_ranking

# 3. Run the root-cause ranking
if __name__ == "__main__":
    rank_root_cause(X, y)

阶段4:MIC 互信息 + PC算法(因果推断)

1. MIC 分析K8s指标关联强度(MIC 衡量关联,不直接给出因果方向)

from minepy import MINE
import numpy as np

# 1. Simulated K8s metrics (does high CPU drive latency, or vice versa?)
# NOTE(review): no random seed is set, so values differ run-to-run.
cpu_usage = np.random.randn(100) * 10 + 60  # CPU usage (%)
network_latency = cpu_usage * 0.8 + np.random.randn(100) * 5  # latency derived from CPU
disk_io = np.random.randn(100) * 8 + 50  # independent (unrelated) metric

# 2. Compute MIC (maximal information coefficient — captures
#    non-linear association strength, not causal direction)
def calculate_mic(x, y, label):
    """Print and return the MIC score between series x and y.

    A higher MIC (closer to 1) means a stronger — possibly non-linear —
    association. The score was previously computed and printed but
    discarded; returning it lets callers use the value programmatically.
    """
    mine = MINE()
    mine.compute_score(x, y)
    mic_score = mine.mic()
    print(f"{label} MIC值:{mic_score:.4f}(值越高关联越强)")
    return mic_score

# 3. Run the MIC computations
if __name__ == "__main__":
    calculate_mic(cpu_usage, network_latency, "CPU使用率 vs 网络延迟")
    calculate_mic(cpu_usage, disk_io, "CPU使用率 vs 磁盘IO")

三、学习与落地建议

  1. 先跑通代码:把阶段1-3的代码替换成你的K8s/Prometheus地址,先看到实际效果;
  2. 小场景落地:先落地“Node CPU异常检测”“Pod内存预测”,再扩展到告警聚类;
  3. 避坑点
    • 不要一开始就上因果推断/大模型,先搞定规则+统计;
    • Prometheus指标拉取时注意降采样,避免数据量过大;
    • 孤立森林的contamination参数要根据K8s集群实际异常比例调整(一般5%-10%)。

总结

  1. K8s AIOps 算法落地优先选3σ、Prophet、孤立森林、DBSCAN、XGBoost,覆盖80%场景;
  2. 代码核心适配K8s的“指标+事件+告警”三大数据源,直接对接Prometheus/K8s API;
  3. 学习路线按“检测→预测→聚类→根因→因果”逐步进阶,3个月可落地核心能力。
posted @ 2026-02-11 15:50  wuyingchun1987  阅读(9)  评论(0)    收藏  举报