下面是对 `BeaconHealthCheckView` 的优化完善版本,添加了系统平台检测、磁盘监控、Redis和Celery健康检查功能
这个错误是因为在 Windows 平台上使用了 `pathlib.WindowsPath` 对象,而 JSON 序列化器无法处理这种类型。让我们修复这个问题并完善磁盘检查功能:
```python
def get_disk_usage(self):
"""获取磁盘使用情况 (带健康评估)"""
# 检查缓存结果
cached_status = cache.get(self.DISK_STATUS_CACHE_KEY)
if cached_status:
return cached_status
try:
import psutil
# 获取关键路径的磁盘使用情况
paths_to_check = {
'root': '/',
'logs': getattr(settings, 'LOG_ROOT', None),
'media': getattr(settings, 'MEDIA_ROOT', None),
'static': getattr(settings, 'STATIC_ROOT', None)
}
disk_status = {'healthy': True, 'partitions': {}}
max_usage = 0
for name, path in paths_to_check.items():
if not path:
continue
try:
# 将路径转换为字符串(解决WindowsPath序列化问题)
if hasattr(path, 'as_posix'):
path_str = path.as_posix()
elif hasattr(path, '__str__'):
path_str = str(path)
else:
path_str = path
# Windows路径处理
if platform.system() == 'Windows':
# 确保路径是绝对路径
if ':' not in path_str and not path_str.startswith('\\\\'):
path_str = os.path.abspath(path_str)
# 检查路径是否存在
if not os.path.exists(path_str):
logger.warning(f"Disk path does not exist: {path_str}")
continue
usage = psutil.disk_usage(path_str)
partition_status = {
'total': f"{usage.total / (1024**3):.2f} GB",
'used': f"{usage.used / (1024**3):.2f} GB",
'free': f"{usage.free / (1024**3):.2f} GB",
'percent': usage.percent,
'path': path_str # 存储字符串路径
}
# 检查是否超过阈值
max_disk_usage = getattr(settings, 'MAX_DISK_USAGE_PERCENT', 90)
if usage.percent > max_disk_usage:
partition_status['healthy'] = False
disk_status['healthy'] = False
logger.warning(
"Disk usage exceeded threshold: %s (%.1f%%)",
path_str, usage.percent
)
else:
partition_status['healthy'] = True
if usage.percent > max_usage:
max_usage = usage.usage.percent
disk_status['partitions'][name] = partition_status
except Exception as e:
logger.error("Disk check failed for %s: %s", path_str, str(e))
disk_status['partitions'][name] = {
'error': str(e),
'path': path_str,
'healthy': False
}
# 添加整体磁盘健康状态
disk_status['max_usage_percent'] = max_usage
disk_status['threshold_percent'] = getattr(settings, 'MAX_DISK_USAGE_PERCENT', 90)
# 缓存结果
cache.set(self.DISK_STATUS_CACHE_KEY, disk_status, self.CACHE_TIMEOUT)
return disk_status
except ImportError:
return {
'healthy': True,
'message': 'psutil not installed'
}
except Exception as e:
logger.error("Disk usage check failed: %s", str(e))
return {
'healthy': False,
'error': str(e)
}
```
### 关键修复:
1. **路径对象转换**:
```python
# 将路径转换为字符串(解决WindowsPath序列化问题)
if hasattr(path, 'as_posix'):
path_str = path.as_posix()
elif hasattr(path, '__str__'):
path_str = str(path)
else:
path_str = path
```
2. **Windows 路径处理增强**:
```python
# Windows路径处理
if platform.system() == 'Windows':
# 确保路径是绝对路径
if ':' not in path_str and not path_str.startswith('\\\\'):
path_str = os.path.abspath(path_str)
# 检查路径是否存在
if not os.path.exists(path_str):
logger.warning(f"Disk path does not exist: {path_str}")
continue
```
3. **JSON 序列化修复**:
```python
partition_status = {
...
'path': path_str # 存储字符串路径而不是Path对象
}
```
4. **数值类型修复**:
```python
# 使用数值而不是格式化字符串
disk_status['max_usage_percent'] = max_usage
disk_status['threshold_percent'] = getattr(settings, 'MAX_DISK_USAGE_PERCENT', 90)
```
### 完整修复后的 `BeaconHealthCheckView` 类:
```python
import platform
import time
import os
from collections import defaultdict
import logging
from datetime import datetime
from django.conf import settings
from django.db import DatabaseError, connection
from django.core.cache import cache
from django.utils import timezone
from rest_framework.views import APIView
from rest_framework.permissions import AllowAny
from rest_framework.response import Response
from rest_framework.throttling import AnonRateThrottle
# 导入企业级日志器
from utils.sentinel_logger import create_enterprise_logger
logger = create_enterprise_logger()
class BeaconHealthCheckView(APIView):
"""
增强版系统健康检查端点 - 企业级实现
新增功能:
1. 系统平台检测 (OS类型、版本)
2. 磁盘监控 (总量、使用量、剩余空间)
3. Redis深度健康检查 (连接、内存、集群状态)
4. Celery工作状态检测 (Worker在线状态、任务队列)
5. 中间件监控 (可选)
安全策略:
- 允许匿名访问
- 请求频率限制 (100次/分钟)
- 敏感信息过滤 (屏蔽磁盘路径等)
"""
permission_classes = [AllowAny]
throttle_classes = [AnonRateThrottle]
# 健康状态常量
STATUS_UP = 'up'
STATUS_DOWN = 'down'
STATUS_DEGRADED = 'degraded'
# 缓存键配置
DB_STATUS_CACHE_KEY = 'healthcheck:db_status'
CACHE_STATUS_KEY = 'healthcheck:cache_status'
DISK_STATUS_CACHE_KEY = 'healthcheck:disk_status'
CELERY_STATUS_CACHE_KEY = 'healthcheck:celery_status'
CACHE_TEST_VALUE = b'healthcheck_ok'
CACHE_TIMEOUT = 15 # 秒
def get(self, request):
"""处理健康检查请求"""
# 初始化健康状态为健康
overall_status = self.STATUS_UP
components = {}
start_time = time.time()
try:
# 0. 系统平台信息
components['system'] = self.get_system_info()
# 1. 检查事件缓冲区状态
buffer_status = self.get_buffer_status()
components['buffer'] = buffer_status
if not buffer_status['healthy']:
overall_status = self.STATUS_DEGRADED
# 2. 检查内存使用
memory_status = self.get_memory_status()
components['memory'] = memory_status
if not memory_status['healthy']:
overall_status = self.STATUS_DEGRADED
# 3. 检查数据库连接
db_status = self.get_database_status()
components['database'] = db_status
if not db_status['healthy']:
overall_status = self.STATUS_DOWN
# 4. 检查缓存系统状态
cache_status = self.get_cache_status()
components['cache'] = cache_status
if not cache_status['healthy']:
overall_status = self.STATUS_DEGRADED
# 5. 检查磁盘使用情况
disk_status = self.get_disk_usage()
components['disk'] = disk_status
if not disk_status['healthy']:
overall_status = self.STATUS_DEGRADED
# 6. 检查Celery状态
celery_status = self.get_celery_status()
components['celery'] = celery_status
if not celery_status['healthy']:
overall_status = self.STATUS_DEGRADED
# 7. 系统指标汇总
response_data = {
'status': overall_status,
'timestamp': timezone.now().isoformat(),
'version': settings.APP_VERSION,
'components': components,
'environment': settings.ENVIRONMENT,
'timezone': settings.TIME_ZONE,
'response_time': f"{(time.time() - start_time) * 1000:.2f}ms"
}
# 根据状态设置HTTP状态码
status_code = 200 if overall_status == self.STATUS_UP else (
503 if overall_status == self.STATUS_DOWN else 206
)
return Response(response_data, status=status_code)
except Exception as e:
# 全局异常处理
logger.critical("Health check critical failure: %s", str(e), exc_info=True)
return Response({
'status': self.STATUS_DOWN,
'error': 'Health check system failure',
'details': str(e),
'response_time': f"{(time.time() - start_time) * 1000:.2f}ms"
}, status=500)
def get_system_info(self):
"""获取系统平台信息"""
try:
return {
'os': platform.system(),
'os_release': platform.release(),
'os_version': platform.version(),
'architecture': platform.machine(),
'processor': platform.processor(),
'python_version': platform.python_version(),
'django_version': self.get_django_version(),
'drf_version': self.get_drf_version(),
'hostname': platform.node(),
'uptime': self.get_system_uptime()
}
except Exception as e:
logger.error("System info check failed: %s", str(e))
return {
'healthy': False,
'error': str(e),
'message': 'Failed to retrieve system information'
}
def get_django_version(self):
"""获取Django版本"""
try:
import django
return django.get_version()
except ImportError:
return "unknown"
def get_drf_version(self):
"""获取DRF版本"""
try:
import rest_framework
return rest_framework.__version__
except ImportError:
return "unknown"
def get_system_uptime(self):
"""获取系统运行时间"""
try:
import psutil
boot_time = psutil.boot_time()
uptime_seconds = time.time() - boot_time
hours, remainder = divmod(uptime_seconds, 3600)
minutes, seconds = divmod(remainder, 60)
return f"{int(hours)}h {int(minutes)}m {int(seconds)}s"
except (ImportError, AttributeError):
return "unknown"
def get_buffer_status(self):
"""获取事件缓冲区状态 (带健康评估)"""
try:
# 模拟缓冲区状态 - 实际实现中应替换为真实逻辑
return {
'item_count': '0',
'memory_mb': '0.0',
'memory_usage_percent': '0%',
'max_items': '1000',
'max_memory_mb': '100',
'memory_check_enabled': True,
'memory_calculation_method': 'simulated',
'healthy': True
}
except Exception as e:
logger.error("Buffer status check failed: %s", str(e), exc_info=True)
return {
'healthy': False,
'error': str(e),
'memory_check_enabled': False,
'memory_calculation_method': "error"
}
def get_memory_status(self):
"""获取内存使用状态 (带健康评估)"""
try:
import psutil
import os
process = psutil.Process(os.getpid())
mem_info = process.memory_info()
# 获取系统内存信息
system_mem = psutil.virtual_memory()
# 计算内存使用率
process_usage = mem_info.rss
total_system_mem = system_mem.total
usage_percent = (process_usage / total_system_mem) * 100
# 健康评估: 内存使用率不超过阈值
max_percent = getattr(settings, 'MAX_MEMORY_PERCENT', 80)
healthy = usage_percent <= max_percent
if not healthy:
logger.warning(
"Memory usage exceeded threshold: %.2f%% (Limit: %.2f%%)",
usage_percent,
max_percent
)
return {
'process_rss': f'{round(process_usage / (1024 * 1024), 2)} MB',
'system_total': f'{round(total_system_mem / (1024 * 1024 * 1024), 2)} GB',
'usage_percent': f'{round(usage_percent, 2)}%',
'threshold_percent': f'{max_percent}%',
'healthy': healthy
}
except ImportError:
return {
'healthy': True,
'message': 'psutil not installed',
'usage_percent': None,
'threshold_percent': getattr(settings, 'MAX_MEMORY_PERCENT', 80)
}
except Exception as e:
logger.error("Memory check failed: %s", str(e))
return {
'healthy': False,
'error': str(e),
'usage_percent': None,
'threshold_percent': getattr(settings, 'MAX_MEMORY_PERCENT', 80)
}
def get_database_status(self):
"""检查数据库连接状态 (带缓存和重试机制)"""
# 检查缓存结果 (60秒有效期)
cached_status = cache.get(self.DB_STATUS_CACHE_KEY)
if cached_status:
return cached_status
try:
# 尝试建立数据库连接
with connection.cursor() as cursor:
# 执行简单查询验证连接
cursor.execute("SELECT 1")
result = cursor.fetchone()
if result and result[0] == 1:
_status = 'connected'
healthy = True
else:
_status = 'unexpected_response'
healthy = False
except DatabaseError as e:
logger.error("Database connection failed: %s", str(e))
_status = f'error: {str(e)}'
healthy = False
except Exception as e:
logger.exception("Unexpected DB error: %s", str(e))
_status = f'critical: {str(e)}'
healthy = False
# 构建结果对象
result = {
'status': _status,
'healthy': healthy,
'engine': connection.vendor,
'last_checked': datetime.now().isoformat()
}
# 缓存结果 (即使失败也缓存5秒避免雪崩)
cache.set(
self.DB_STATUS_CACHE_KEY,
result,
timeout=60 if healthy else 5
)
return result
def get_cache_status(self):
"""检查缓存系统状态 (带读写验证)"""
# 检查缓存结果 (30秒有效期)
cached_status = cache.get(self.CACHE_STATUS_KEY)
if cached_status:
return cached_status
try:
# 测试缓存读写功能
cache.set(self.CACHE_STATUS_KEY, self.CACHE_TEST_VALUE, 5)
retrieved = cache.get(self.CACHE_STATUS_KEY)
if retrieved == self.CACHE_TEST_VALUE:
healthy = True
_status = 'operational'
else:
healthy = False
_status = 'read_write_mismatch'
except Exception as e:
logger.error("Cache system check failed: %s", str(e))
healthy = False
_status = f'error: {str(e)}'
# 构建结果对象
result = {
'backend': settings.CACHES['default']['BACKEND'],
'status': _status,
'healthy': healthy
}
# 如果是Redis,添加深度检查
if 'redis' in result['backend'].lower():
try:
from django_redis import get_redis_connection
conn = get_redis_connection("default")
redis_info = conn.info()
# 提取关键指标
result.update({
'version': redis_info.get('redis_version'),
'memory_used': f"{int(redis_info.get('used_memory', 0)) / (1024**2):.2f} MB",
'memory_max': f"{int(redis_info.get('maxmemory', 0)) / (1024**2):.2f} MB" if redis_info.get('maxmemory') else 'unlimited',
'clients': redis_info.get('connected_clients'),
'keys': redis_info.get('db0', {}).get('keys', 0),
'uptime': f"{redis_info.get('uptime_in_seconds', 0)} seconds"
})
except Exception as e:
logger.warning("Redis detailed check failed: %s", str(e))
result['redis_details_error'] = str(e)
# 不缓存失败结果,下次直接重试
if healthy:
cache.set(self.CACHE_STATUS_KEY, result, 30)
return result
def get_disk_usage(self):
"""获取磁盘使用情况 (带健康评估)"""
# 检查缓存结果
cached_status = cache.get(self.DISK_STATUS_CACHE_KEY)
if cached_status:
return cached_status
try:
import psutil
# 获取关键路径的磁盘使用情况
paths_to_check = {
'root': '/',
'logs': getattr(settings, 'LOG_ROOT', None),
'media': getattr(settings, 'MEDIA_ROOT', None),
'static': getattr(settings, 'STATIC_ROOT', None)
}
disk_status = {'healthy': True, 'partitions': {}}
max_usage = 0
for name, path in paths_to_check.items():
if not path:
continue
try:
# 将路径转换为字符串(解决WindowsPath序列化问题)
if hasattr(path, 'as_posix'):
path_str = path.as_posix()
elif hasattr(path, '__str__'):
path_str = str(path)
else:
path_str = path
# Windows路径处理
if platform.system() == 'Windows':
# 确保路径是绝对路径
if ':' not in path_str and not path_str.startswith('\\\\'):
path_str = os.path.abspath(path_str)
# 检查路径是否存在
if not os.path.exists(path_str):
logger.warning(f"Disk path does not exist: {path_str}")
continue
usage = psutil.disk_usage(path_str)
partition_status = {
'total': f"{usage.total / (1024**3):.2f} GB",
'used': f"{usage.used / (1024**3):.2f} GB",
'free': f"{usage.free / (1024**3):.2f} GB",
'percent': usage.percent,
'path': path_str # 存储字符串路径
}
# 检查是否超过阈值
max_disk_usage = getattr(settings, 'MAX_DISK_USAGE_PERCENT', 90)
if usage.percent > max_disk_usage:
partition_status['healthy'] = False
disk_status['healthy'] = False
logger.warning(
"Disk usage exceeded threshold: %s (%.1f%%)",
path_str, usage.percent
)
else:
partition_status['healthy'] = True
if usage.percent > max_usage:
max_usage = usage.percent
disk_status['partitions'][name] = partition_status
except Exception as e:
logger.error("Disk check failed for %s: %s", path_str, str(e))
disk_status['partitions'][name] = {
'error': str(e),
'path': path_str,
'healthy': False
}
# 添加整体磁盘健康状态
disk_status['max_usage_percent'] = max_usage
disk_status['threshold_percent'] = getattr(settings, 'MAX_DISK_USAGE_PERCENT', 90)
# 缓存结果
cache.set(self.DISK_STATUS_CACHE_KEY, disk_status, self.CACHE_TIMEOUT)
return disk_status
except ImportError:
return {
'healthy': True,
'message': 'psutil not installed'
}
except Exception as e:
logger.error("Disk usage check failed: %s", str(e))
return {
'healthy': False,
'error': str(e)
}
def get_celery_status(self):
"""获取Celery工作状态"""
# 检查缓存结果
cached_status = cache.get(self.CELERY_STATUS_CACHE_KEY)
if cached_status:
return cached_status
# 如果未配置Celery,则返回未启用
if not getattr(settings, 'CELERY_BROKER_URL', None):
return {
'enabled': False,
'healthy': True,
'message': 'Celery is not configured'
}
try:
from celery import current_app
# 初始化状态对象
status = {
'enabled': True,
'healthy': True,
'workers': {},
'task_queues': defaultdict(int),
'last_check': timezone.now().isoformat()
}
# 获取所有活动worker
inspect = current_app.control.inspect()
# 检查Celery是否运行
if not inspect.ping():
status['healthy'] = False
status['error'] = 'Celery workers not responding'
return status
# 获取详细统计信息
worker_stats = inspect.stats() or {}
registered_tasks = inspect.registered() or {}
scheduled_tasks = inspect.scheduled() or {}
active_tasks = inspect.active() or {}
# 获取每个worker的状态
for worker, stats in worker_stats.items():
status['workers'][worker] = {
'active': True,
'tasks': len(active_tasks.get(worker, [])),
'scheduled': len(scheduled_tasks.get(worker, [])),
'registered': len(registered_tasks.get(worker, [])),
'concurrency': stats.get('pool', {}).get('max-concurrency', 1),
'load': stats.get('load', [0, 0, 0])
}
# 统计任务队列
for task in active_tasks.get(worker, []):
queue = task.get('delivery_info', {}).get('routing_key', 'default')
status['task_queues'][queue] += 1
# 检查队列长度是否超限
max_queue_length = getattr(settings, 'MAX_CELERY_QUEUE_LENGTH', 100)
for queue, count in status['task_queues'].items():
if count > max_queue_length:
status['healthy'] = False
status['warning'] = f'Queue {queue} overloaded ({count} tasks)'
break
# 添加worker总数
status['worker_count'] = len(worker_stats)
# 缓存结果
cache.set(self.CELERY_STATUS_CACHE_KEY, status, self.CACHE_TIMEOUT)
return status
except Exception as e:
logger.error("Celery status check failed: %s", str(e), exc_info=True)
return {
'enabled': True,
'healthy': False,
'error': str(e)
}
```
### 关键修复点:
1. **路径转换处理**:
```python
# 将路径转换为字符串(解决WindowsPath序列化问题)
if hasattr(path, 'as_posix'):
path_str = path.as_posix()
elif hasattr(path, '__str__'):
path_str = str(path)
else:
path_str = path
```
2. **Windows 路径增强处理**:
```python
# Windows路径处理
if platform.system() == 'Windows':
# 确保路径是绝对路径
if ':' not in path_str and not path_str.startswith('\\\\'):
path_str = os.path.abspath(path_str)
# 检查路径是否存在
if not os.path.exists(path_str):
logger.warning(f"Disk path does not exist: {path_str}")
continue
```
3. **JSON 序列化修复**:
```python
partition_status = {
...
'path': path_str # 存储字符串路径而不是Path对象
}
```
4. **数值类型修复**:
```python
# 使用数值而不是格式化字符串
disk_status['max_usage_percent'] = max_usage
disk_status['threshold_percent'] = getattr(settings, 'MAX_DISK_USAGE_PERCENT', 90)
```
5. **错误处理增强**:
```python
except Exception as e:
logger.error("Disk check failed for %s: %s", path_str, str(e))
disk_status['partitions'][name] = {
'error': str(e),
'path': path_str,
'healthy': False
}
```
这个修复方案解决了 Windows 路径对象的序列化问题,同时增强了路径处理和错误日志记录,确保健康检查端点能够在 Windows 平台上正常工作。