我将按照您的要求,将解决方案拆分为两个独立的Operator,并提供完整的实现细节。
## 第一部分:Metrics Collector Operator(Prometheus指标收集)
### 功能概述
1. 定期从Prometheus收集节点指标(CPU/内存/负载/延迟)
2. 将指标和过载状态写入节点注解
3. 识别需要迁移的Pods并标记
### 代码实现 (`metrics-collector-operator.py`)
```python
import os
import time
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional
import kopf
import pytz
from kubernetes import client, config
from kubernetes.client import V1Node, V1Pod
from prometheus_api_client import PrometheusConnect

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Environment-variable configuration
PROMETHEUS_URL = os.getenv('PROMETHEUS_URL', 'http://prometheus-operated.monitoring.svc:9090')
SCRAPE_INTERVAL = int(os.getenv('SCRAPE_INTERVAL', '30'))  # scrape every 30 seconds by default

# Resource thresholds (tunable via environment variables)
CPU_THRESHOLD = float(os.getenv('CPU_THRESHOLD', '0.8'))  # CPU usage threshold: 80%
MEM_THRESHOLD = float(os.getenv('MEM_THRESHOLD', '0.8'))  # memory usage threshold: 80%
LOAD_THRESHOLD = float(os.getenv('LOAD_THRESHOLD', '1.5'))  # load threshold: 1.5x core count
LATENCY_THRESHOLD = float(os.getenv('LATENCY_THRESHOLD', '500'))  # latency threshold: 500 ms
class PrometheusMetricsFetcher:
    """Thin wrapper around the Prometheus HTTP API for per-node metrics.

    All getters return 0.0 when the query yields no samples or fails;
    failures are logged, never raised.
    """

    def __init__(self):
        self.prom = PrometheusConnect(url=PROMETHEUS_URL, disable_ssl=True)

    def _query_first(self, query: str, label: str, node_name: str) -> float:
        """Run an instant query and return the first sample's value.

        Returns 0.0 when the result set is empty or the query errors out;
        errors are logged with *label* for context.
        """
        try:
            samples = self.prom.custom_query(query)
            if not samples:
                return 0.0
            return float(samples[0]['value'][1])
        except Exception as e:
            logger.error(f"Failed to get {label} for {node_name}: {e}")
            return 0.0

    def get_node_cpu_usage(self, node_name: str) -> float:
        """Node CPU utilization as a 0-1 fraction (1m rate of non-idle time)."""
        query = f'''
        100 - (avg by(instance) (
            rate(node_cpu_seconds_total{{
                mode="idle",
                instance=~"{node_name}:.*"
            }}[1m])
        ) * 100)
        '''
        # Query yields a percentage; normalize to 0-1.
        return self._query_first(query, "CPU", node_name) / 100

    def get_node_memory_usage(self, node_name: str) -> float:
        """Node memory utilization as a 0-1 fraction (MemTotal - MemAvailable)."""
        query = f'''
        (node_memory_MemTotal_bytes{{
            instance=~"{node_name}:.*"
        }} - node_memory_MemAvailable_bytes{{
            instance=~"{node_name}:.*"
        }}) / node_memory_MemTotal_bytes{{
            instance=~"{node_name}:.*"
        }} * 100
        '''
        # Query yields a percentage; normalize to 0-1.
        return self._query_first(query, "memory", node_name) / 100

    def get_node_load(self, node_name: str) -> float:
        """1-minute load average normalized by the node's CPU count."""
        query = f'''
        node_load1{{
            instance=~"{node_name}:.*"
        }} / on(instance) count by(instance) (
            node_cpu_seconds_total{{
                mode="system",
                instance=~"{node_name}:.*"
            }}
        )
        '''
        return self._query_first(query, "load", node_name)

    def get_service_latency(self, node_name: str) -> float:
        """Average p95 HTTP request latency on the node, in milliseconds."""
        query = f'''
        avg by(instance) (
            histogram_quantile(0.95, sum by(le, instance) (
                rate(http_request_duration_seconds_bucket{{
                    instance=~"{node_name}:.*"
                }}[1m])
            ))
        ) * 1000
        '''
        return self._query_first(query, "latency", node_name)
class NodeMetricsProcessor:
    """Collects per-node metrics and publishes them as node annotations."""

    def __init__(self):
        config.load_incluster_config()
        self.api = client.CoreV1Api()
        self.metrics_fetcher = PrometheusMetricsFetcher()

    def update_node_annotations(self):
        """Refresh the metric annotations on every node in the cluster."""
        for node in self.api.list_node().items:
            self._update_single_node_metrics(node.metadata.name)

    def _update_single_node_metrics(self, node_name: str):
        """Fetch metrics for one node, derive overload status, patch annotations."""
        fetcher = self.metrics_fetcher
        cpu = fetcher.get_node_cpu_usage(node_name)
        memory = fetcher.get_node_memory_usage(node_name)
        load = fetcher.get_node_load(node_name)
        latency = fetcher.get_service_latency(node_name)

        # A node counts as overloaded as soon as ANY single metric crosses
        # its configured threshold.
        overloaded = (
            cpu > CPU_THRESHOLD
            or memory > MEM_THRESHOLD
            or load > LOAD_THRESHOLD
            or latency > LATENCY_THRESHOLD
        )

        patch = {
            'metadata': {
                'annotations': {
                    'metrics.cpu': f"{cpu:.3f}",
                    'metrics.memory': f"{memory:.3f}",
                    'metrics.load': f"{load:.3f}",
                    'metrics.latency': f"{latency:.1f}",
                    'metrics.overloaded': str(overloaded).lower(),
                    'metrics.last-updated': datetime.now(pytz.UTC).isoformat(),
                }
            }
        }
        try:
            self.api.patch_node(node_name, patch)
            logger.info(f"Updated metrics for node {node_name}")
        except Exception as e:
            logger.error(f"Failed to update node {node_name}: {e}")
@kopf.on.startup()
def configure(settings: kopf.OperatorSettings, **_):
    """Apply operator-wide kopf settings at startup."""
    # Coordinate replicas through a named peering rather than standalone mode.
    settings.peering.standalone = False
    settings.peering.name = 'metrics-collector-operator'
    # Watch-stream server timeout (seconds) and event-posting log level.
    settings.watching.server_timeout = 60
    settings.posting.level = logging.INFO
@kopf.timer('', interval=SCRAPE_INTERVAL)
def update_metrics(**kwargs):
    """Timer handler: periodically refresh the metric annotations on all nodes.

    NOTE(review): kopf timers are normally bound to a concrete resource; the
    empty resource spec ('') looks suspicious — confirm this registers and
    fires as intended.
    """
    # Reuse a single processor across ticks instead of rebuilding it — the
    # original constructed a new NodeMetricsProcessor every SCRAPE_INTERVAL,
    # re-loading the in-cluster config and reconnecting to Prometheus each time.
    processor = getattr(update_metrics, '_processor', None)
    if processor is None:
        processor = update_metrics._processor = NodeMetricsProcessor()
    processor.update_node_annotations()
# Entry point when run as a script: start the kopf operator event loop.
if __name__ == '__main__':
    kopf.run()
```
### 部署说明
1. 创建RBAC权限 (`metrics-collector-rbac.yaml`):
```yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: metrics-collector
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: metrics-collector
rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: metrics-collector
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: metrics-collector
subjects:
- kind: ServiceAccount
name: metrics-collector
namespace: kube-system
```
2. 创建Deployment (`metrics-collector-deployment.yaml`)(注意:`python:3.9` 基础镜像未包含 kopf、kubernetes、prometheus-api-client 等依赖,需先安装或构建包含依赖的自定义镜像):
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: metrics-collector
namespace: kube-system
spec:
replicas: 2
selector:
matchLabels:
app: metrics-collector
template:
metadata:
labels:
app: metrics-collector
spec:
serviceAccountName: metrics-collector
containers:
- name: collector
image: python:3.9
command: ["python", "/app/metrics-collector.py"]
env:
- name: PROMETHEUS_URL
value: "http://prometheus-operated.monitoring.svc:9090"
- name: SCRAPE_INTERVAL
value: "30"
- name: CPU_THRESHOLD
value: "0.8"
- name: MEM_THRESHOLD
value: "0.8"
- name: LOAD_THRESHOLD
value: "1.5"
- name: LATENCY_THRESHOLD
value: "500"
```
## 第二部分:Pod Rescheduler Operator
### 功能概述
1. 监控节点注解中的指标数据
2. 识别过载节点上的Pods
3. 根据策略重新调度Pods
### 代码实现 (`pod-rescheduler-operator.py`)
```python
import os
import logging
# Bug fix: Optional was used (NodeSelector.get_least_loaded_node return
# annotation) but never imported, crashing the module with NameError on load.
from typing import Dict, List, Optional

import kopf
from kubernetes import client, config
from kubernetes.client import V1Node, V1Pod, V1Toleration

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Environment-variable configuration
RESCHEDULE_STRATEGY = os.getenv('RESCHEDULE_STRATEGY', 'least_loaded')  # scheduling strategy
MAX_RETRY = int(os.getenv('MAX_RETRY', '3'))  # maximum reschedule attempts
class NodeSelector:
    """Picks a target node for rescheduled pods based on annotated metrics."""

    def __init__(self):
        config.load_incluster_config()
        self.api = client.CoreV1Api()

    def get_least_loaded_node(self, exclude_nodes: List[str] = None) -> Optional[str]:
        """Return the name of the schedulable node with the lowest combined
        cpu + memory + load score (read from the metric annotations).

        Cordoned nodes and names listed in *exclude_nodes* are skipped;
        returns None when no candidate remains.
        """
        all_nodes = self.api.list_node().items
        if not all_nodes:
            return None

        excluded = set(exclude_nodes or [])
        candidates = [
            n for n in all_nodes
            if not n.spec.unschedulable and n.metadata.name not in excluded
        ]
        if not candidates:
            return None

        def score(node) -> float:
            # Unparseable annotation values rank the node last.
            ann = node.metadata.annotations or {}
            try:
                return (
                    float(ann.get('metrics.cpu', '0'))
                    + float(ann.get('metrics.memory', '0'))
                    + float(ann.get('metrics.load', '0'))
                )
            except ValueError:
                return float('inf')

        # First node with the minimal score — equivalent to a stable sort's head.
        return min(candidates, key=score).metadata.name
class PodRescheduler:
    """Evicts pods from overloaded nodes so their controllers recreate them."""

    def __init__(self):
        config.load_incluster_config()
        self.api = client.CoreV1Api()
        self.node_selector = NodeSelector()

    def get_pods_on_node(self, node_name: str) -> List[V1Pod]:
        """Return every pod (across all namespaces) bound to *node_name*."""
        field_selector = f"spec.nodeName={node_name}"
        return self.api.list_pod_for_all_namespaces(field_selector=field_selector).items

    def is_reschedulable(self, pod: V1Pod) -> bool:
        """Decide whether a pod is safe to evict.

        A pod is skipped when it opts out via the 'reschedule.skipped'
        annotation, is not Running/Pending, or mounts node-local storage
        (emptyDir / hostPath) that would be lost on another node.
        """
        annotations = pod.metadata.annotations or {}
        if annotations.get('reschedule.skipped', 'false').lower() == 'true':
            return False
        if pod.status.phase not in ['Running', 'Pending']:
            return False
        for vol in pod.spec.volumes or []:
            if vol.empty_dir or vol.host_path:
                return False
        return True

    def reschedule_pod(self, pod: V1Pod, target_node: str):
        """Annotate a pod with its reschedule history, then delete it.

        Returns True on success, False on any API error. Deletion hands the
        pod back to its controller; *target_node* is advisory only — binding
        to it is not enforced here.
        """
        pod_name = pod.metadata.name
        namespace = pod.metadata.namespace
        # Guard against pods with no annotations at all (metadata.annotations
        # is None, not {}) — the original crashed with AttributeError here.
        current = pod.metadata.annotations or {}
        patch = {
            'metadata': {
                'annotations': {
                    'reschedule.original-node': pod.spec.node_name,
                    'reschedule.attempt': str(int(current.get('reschedule.attempt', '0')) + 1),
                }
            }
        }
        try:
            # Bug fix: the original built this patch but never applied it, so
            # the origin-node / attempt-count audit trail was silently lost.
            self.api.patch_namespaced_pod(pod_name, namespace, patch)
            # Delete the pod; its controller (Deployment/StatefulSet/...) is
            # expected to recreate it elsewhere.
            self.api.delete_namespaced_pod(pod_name, namespace)
            logger.info(f"Rescheduled pod {namespace}/{pod_name} to node {target_node}")
            return True
        except Exception as e:
            logger.error(f"Failed to reschedule pod {namespace}/{pod_name}: {e}")
            return False
@kopf.on.startup()
def configure(settings: kopf.OperatorSettings, **_):
    """Apply operator-wide kopf settings at startup."""
    # Coordinate replicas through a named peering rather than standalone mode.
    settings.peering.standalone = False
    settings.peering.name = 'pod-rescheduler-operator'
    # Watch-stream server timeout (seconds) and event-posting log level.
    settings.watching.server_timeout = 60
    settings.posting.level = logging.INFO
@kopf.on.field('nodes', field='metadata.annotations')
def on_node_metrics_change(old, new, name, **kwargs):
    """React to node-annotation changes: when a node reports itself
    overloaded, evict its reschedulable pods toward the least-loaded node."""
    if new is None:
        return
    annotations = new or {}
    if annotations.get('metrics.overloaded', 'false') != 'true':
        return

    logger.info(f"Node {name} is overloaded, checking pods...")
    rescheduler = PodRescheduler()
    for pod in rescheduler.get_pods_on_node(name):
        if not rescheduler.is_reschedulable(pod):
            continue
        # Re-pick the destination for each pod, never the overloaded node itself.
        destination = rescheduler.node_selector.get_least_loaded_node(exclude_nodes=[name])
        if destination:
            rescheduler.reschedule_pod(pod, destination)
# Entry point when run as a script: start the kopf operator event loop.
if __name__ == '__main__':
    kopf.run()
```
### 部署说明
1. 创建RBAC权限 (`rescheduler-rbac.yaml`):
```yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: pod-rescheduler
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: pod-rescheduler
rules:
- apiGroups: [""]
resources: ["nodes", "pods"]
verbs: ["get", "list", "watch", "update", "patch", "delete"]
- apiGroups: ["apps"]
resources: ["deployments", "statefulsets"]
verbs: ["get", "list", "watch", "update", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: pod-rescheduler
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: pod-rescheduler
subjects:
- kind: ServiceAccount
name: pod-rescheduler
namespace: kube-system
```
2. 创建Deployment (`rescheduler-deployment.yaml`)(注意:`python:3.9` 基础镜像未包含 kopf、kubernetes 等依赖,需先安装或构建包含依赖的自定义镜像):
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: pod-rescheduler
namespace: kube-system
spec:
replicas: 2
selector:
matchLabels:
app: pod-rescheduler
template:
metadata:
labels:
app: pod-rescheduler
spec:
serviceAccountName: pod-rescheduler
containers:
- name: rescheduler
image: python:3.9
command: ["python", "/app/pod-rescheduler.py"]
env:
- name: RESCHEDULE_STRATEGY
value: "least_loaded"
- name: MAX_RETRY
value: "3"
```
## 使用说明
1. 部署Prometheus Operator(如果尚未部署)
2. 部署Metrics Collector Operator
```bash
kubectl apply -f metrics-collector-rbac.yaml
kubectl apply -f metrics-collector-deployment.yaml
```
3. 部署Pod Rescheduler Operator
```bash
kubectl apply -f rescheduler-rbac.yaml
kubectl apply -f rescheduler-deployment.yaml
```
4. 验证节点注解是否更新
```bash
kubectl get nodes -o jsonpath='{.items[*].metadata.annotations}'
```
5. 当过载发生时,观察Pod的重新调度日志
## 监控指标说明
节点上将添加以下注解:
- `metrics.cpu`: CPU使用率 (0-1)
- `metrics.memory`: 内存使用率 (0-1)
- `metrics.load`: 系统负载 (相对于核心数)
- `metrics.latency`: 服务延迟 (毫秒)
- `metrics.overloaded`: 是否过载 (true/false)
- `metrics.last-updated`: 最后更新时间
## 扩展建议
1. 添加自定义指标支持
2. 实现更复杂的调度策略(如基于亲和性)
3. 添加事件通知机制
4. 实现优雅迁移(如先启动新Pod再终止旧Pod)
5. 添加Prometheus监控指标跟踪调度决策