第十章 可观测性与监控

第十章 可观测性与监控

在微服务架构中,可观测性不是锦上添花,而是生存必需品。当你的系统由几十个服务组成,一个用户请求跨越多个服务时,没有良好的可观测性,你就像是在黑暗中摸索的盲人。

10.1 集中式日志:系统的黑匣子

10.1.1 为什么传统日志管理不再适用

记得我第一次排查微服务问题时,需要SSH到5台不同的服务器,查看10个不同的日志文件,试图拼凑出一个完整的故事。那种感觉就像在破案,但手里只有零散的线索。

// ❌ 传统方式 - 分散的日志
// Server 1: /var/log/order-service/app.log
// Server 2: /var/log/payment-service/app.log  
// Server 3: /var/log/inventory-service/app.log
// Server 4: /var/log/shipping-service/app.log
// Server 5: /var/log/notification-service/app.log

// 问题:如何关联这些日志?如何快速找到问题根源?

集中式日志解决了这个问题,它就像是飞机的黑匣子,记录了系统的完整运行轨迹。

10.1.2 ELK Stack实战配置

// Program.cs - Serilog configuration
using Elastic.CommonSchema.Serilog;
using Serilog;
using Serilog.Events;
using Serilog.Formatting.Json;
using Serilog.Sinks.Elasticsearch;
using Serilog.Sinks.File;

var builder = WebApplication.CreateBuilder(args);

// Validate the Elasticsearch endpoint up front: a missing value would otherwise surface
// as an opaque ArgumentNullException thrown by the Uri constructor.
var elasticsearchUri = builder.Configuration["Elasticsearch:Uri"];
if (string.IsNullOrWhiteSpace(elasticsearchUri))
{
    throw new InvalidOperationException(
        "Configuration value 'Elasticsearch:Uri' is required to configure the Elasticsearch log sink.");
}

// Configure Serilog: console + rolling file + Elasticsearch, enriched with process/host metadata.
Log.Logger = new LoggerConfiguration()
    .MinimumLevel.Debug()
    .MinimumLevel.Override("Microsoft", LogEventLevel.Information)
    .Enrich.FromLogContext()
    .Enrich.WithMachineName()
    .Enrich.WithProcessId()
    .Enrich.WithThreadId()
    .Enrich.WithProperty("Application", "OrderService")
    .Enrich.WithProperty("Environment", builder.Environment.EnvironmentName)
    .WriteTo.Console(
        outputTemplate: "[{Timestamp:HH:mm:ss} {Level:u3}] {CorrelationId} {Message:lj}{NewLine}{Exception}")
    .WriteTo.File(
        path: "logs/order-service-.txt",
        rollingInterval: RollingInterval.Day,
        retainedFileCountLimit: 7,
        outputTemplate: "{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} [{Level:u3}] {CorrelationId} {Message:lj}{NewLine}{Exception}")
    .WriteTo.Elasticsearch(new ElasticsearchSinkOptions(new Uri(elasticsearchUri))
    {
        AutoRegisterTemplate = true,
        AutoRegisterTemplateVersion = AutoRegisterTemplateVersion.ESv7,
        IndexFormat = "order-service-{0:yyyy.MM.dd}",
        CustomFormatter = new EcsTextFormatter(),
        // Events that cannot be shipped are reported three ways: Serilog SelfLog,
        // the local failure file below, and the console callback.
        FailureCallback = e => Console.WriteLine($"Unable to submit event {e.RenderMessage()} to Elasticsearch"),
        EmitEventFailure = EmitEventFailureHandling.WriteToSelfLog |
                           EmitEventFailureHandling.WriteToFailureSink |
                           EmitEventFailureHandling.RaiseCallback,
        FailureSink = new FileSink("./failures.txt", new JsonFormatter(), null)
    })
    .CreateLogger();

try
{
    builder.Host.UseSerilog();

    var app = builder.Build();

    // Request logging middleware: attaches request metadata to the per-request completion event.
    app.UseSerilogRequestLogging(options =>
    {
        options.EnrichDiagnosticContext = (diagnosticContext, httpContext) =>
        {
            diagnosticContext.Set("RequestHost", httpContext.Request.Host.Value);
            diagnosticContext.Set("RequestScheme", httpContext.Request.Scheme);
            diagnosticContext.Set("RequestPath", httpContext.Request.Path);
            diagnosticContext.Set("QueryString", httpContext.Request.QueryString.Value);
            // Headers[...] returns StringValues; convert explicitly so the property is logged
            // as a single string rather than a collection.
            diagnosticContext.Set("UserAgent", httpContext.Request.Headers["User-Agent"].ToString());
            diagnosticContext.Set("IpAddress", httpContext.Connection.RemoteIpAddress?.ToString());
            // NOTE(review): GetCorrelationId() is not defined in this snippet; presumably a
            // project-specific HttpContext extension - confirm it is in scope.
            diagnosticContext.Set("CorrelationId", httpContext.GetCorrelationId());
        };
    });

    app.Run();
}
catch (Exception ex)
{
    Log.Fatal(ex, "Application terminated unexpectedly");
}
finally
{
    // Flush buffered events (notably the Elasticsearch batch) before the process exits.
    Log.CloseAndFlush();
}

10.1.3 结构化日志的最佳实践

// Records business events with structured logging
public class OrderService
{
    private readonly ILogger<OrderService> _logger;

    /// <summary>
    /// Creates the service with the logger supplied by dependency injection.
    /// </summary>
    /// <param name="logger">Logger used for all order events.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public OrderService(ILogger<OrderService> logger)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Creates an order from the command and logs the start, the result and the elapsed time.
    /// </summary>
    /// <param name="command">Command carrying the correlation id, customer id, order id and items.</param>
    /// <returns>The newly created order.</returns>
    /// <exception cref="BusinessRuleException">Logged as a warning and rethrown.</exception>
    public async Task<Order> CreateOrderAsync(CreateOrderCommand command)
    {
        // Started first so the reported duration covers the whole operation.
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();

        // Every log entry written inside this scope carries these properties.
        using var scope = _logger.BeginScope(new Dictionary<string, object>
        {
            ["CorrelationId"] = command.CorrelationId,
            ["CustomerId"] = command.CustomerId,
            ["OrderId"] = command.OrderId
        });

        _logger.LogInformation(
            "Starting order creation for customer {CustomerId} with {ItemCount} items",
            command.CustomerId,
            command.Items.Count);

        try
        {
            var order = Order.Create(command.CustomerId, command.Items);

            // Business event
            _logger.LogInformation(
                "Order {OrderId} created successfully. Total amount: {TotalAmount}",
                order.Id,
                order.TotalPrice);

            // Performance measurement
            _logger.LogInformation(
                "Order creation completed in {ElapsedMs}ms",
                stopwatch.ElapsedMilliseconds);

            return order;
        }
        catch (BusinessRuleException ex)
        {
            // Expected domain failure: warning level, but keep the exception so the
            // stack trace and exception type are available in the log store.
            _logger.LogWarning(
                ex,
                "Business rule violation while creating order: {ErrorMessage}",
                ex.Message);
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogError(
                ex,
                "Unexpected error creating order for customer {CustomerId}",
                command.CustomerId);
            throw;
        }
    }
}

// Custom log event identifiers, grouped by numeric range per area.
public static class LogEvents
{
    // 1xxx - orders
    public const int
        OrderCreated = 1001,
        OrderCancelled = 1002;

    // 2xxx - payments
    public const int
        PaymentProcessed = 2001,
        PaymentFailed = 2002;

    // 3xxx - stock
    public const int
        StockReserved = 3001,
        StockInsufficient = 3002;

    // 4xxx - external service calls
    public const int
        ExternalServiceCall = 4001,
        ExternalServiceError = 4002;
}

// Strongly typed helpers that pair each message template with its event id.
public static class LoggingExtensions
{
    /// <summary>Logs an informational "order created" event.</summary>
    /// <param name="logger">Logger to write to.</param>
    /// <param name="orderId">Identifier of the created order.</param>
    /// <param name="totalAmount">Total amount of the order.</param>
    public static void LogOrderCreated(this ILogger logger, Guid orderId, decimal totalAmount) =>
        logger.LogInformation(
            new EventId(LogEvents.OrderCreated),
            "Order {OrderId} created with total amount {TotalAmount}",
            orderId,
            totalAmount);

    /// <summary>Logs an error-level "payment failed" event.</summary>
    /// <param name="logger">Logger to write to.</param>
    /// <param name="orderId">Identifier of the order whose payment failed.</param>
    /// <param name="error">Failure description.</param>
    public static void LogPaymentFailed(this ILogger logger, Guid orderId, string error) =>
        logger.LogError(
            new EventId(LogEvents.PaymentFailed),
            "Payment failed for order {OrderId}: {Error}",
            orderId,
            error);
}

10.1.4 日志查询和分析

// 在Kibana中创建有用的查询

// 1. 查找特定用户的所有操作
{
  "query": {
    "bool": {
      "must": [
        { "term": { "CustomerId": "123e4567-e89b-12d3-a456-426614174000" } }
      ]
    }
  }
}

// 2. 查找响应时间超过1秒的所有请求
{
  "query": {
    "bool": {
      "must": [
        { "range": { "ElapsedMs": { "gt": 1000 } } },
        { "exists": { "field": "RequestPath" } }
      ]
    }
  }
}

// 3. 查找特定时间范围内的错误日志
{
  "query": {
    "bool": {
      "must": [
        { "range": { "@timestamp": { "gte": "2024-01-01T00:00:00", "lte": "2024-01-01T23:59:59" } } },
        { "term": { "Level": "Error" } }
      ]
    }
  }
}

// 4. 查找外部服务调用失败
{
  "query": {
    "bool": {
      "must": [
        { "term": { "EventId.Id": 4002 } },
        { "exists": { "field": "ExternalServiceName" } }
      ]
    }
  }
}

10.2 指标监控:系统的健康仪表盘

10.2.1 Prometheus + Grafana实战

// 安装NuGet包
dotnet add package prometheus-net.AspNetCore
dotnet add package prometheus-net.DotNetRuntime

// Program.cs - Prometheus metrics configuration
using System.Diagnostics;
using Prometheus;
using Prometheus.DotNetRuntime;

var builder = WebApplication.CreateBuilder(args);

// Register the Prometheus metric server
builder.Services.AddMetricServer(options =>
{
    // NOTE(review): 9090 is also the default port of the Prometheus server itself;
    // pick another port if both run on the same host.
    options.Port = 9090; // Prometheus metrics port
});

// Start collecting .NET runtime metrics. prometheus-net.DotNetRuntime is started through
// its builder; the returned collector is disposed when the program exits.
using var runtimeStatsCollector = DotNetRuntimeStatsBuilder.Default().StartCollecting();

var app = builder.Build();

// Prometheus middleware
app.UseMetricServer();
app.UseHttpMetrics(); // collects HTTP request metrics automatically

// Custom business metrics
var orderCounter = Metrics.CreateCounter(
    "orders_created_total",
    "Total number of orders created",
    new[] { "customer_type", "payment_method" });

var orderValueHistogram = Metrics.CreateHistogram(
    "order_value_usd",
    "Order value in USD",
    new HistogramConfiguration
    {
        Buckets = Histogram.LinearBuckets(start: 10, width: 50, count: 10),
        LabelNames = new[] { "customer_type" }
    });

var activeOrdersGauge = Metrics.CreateGauge(
    "active_orders_count",
    "Number of active orders being processed");

// Errors raised by the endpoint, by operation and exception type.
var errorCounter = Metrics.CreateCounter(
    "order_errors_total",
    "Total number of errors raised while handling order requests",
    new[] { "operation", "exception_type" });

// End-to-end handling time of the endpoint, by HTTP method and route.
var requestDuration = Metrics.CreateHistogram(
    "order_request_duration_seconds",
    "Time spent handling order requests in seconds",
    new HistogramConfiguration
    {
        LabelNames = new[] { "method", "endpoint" }
    });

// Using the custom metrics
app.MapPost("/api/orders", async (CreateOrderRequest request) =>
{
    var stopwatch = Stopwatch.StartNew();

    try
    {
        // NOTE(review): orderService is not declared in this snippet; it is presumably
        // resolved elsewhere (e.g. injected as a handler parameter) - confirm.
        var order = await orderService.CreateOrderAsync(request);

        // Label values are normalised with the invariant culture so they do not change
        // with the server locale.
        var customerType = order.CustomerType.ToString().ToLowerInvariant();
        var paymentMethod = order.PaymentMethod.ToString().ToLowerInvariant();

        // Count the created order
        orderCounter.WithLabels(customerType, paymentMethod).Inc();

        // Record the order value distribution
        orderValueHistogram.WithLabels(customerType)
            .Observe((double)order.TotalPrice.Amount);

        // Increase the number of active orders
        // NOTE(review): nothing in this snippet decrements the gauge; presumably that
        // happens when an order completes or is cancelled - confirm.
        activeOrdersGauge.Inc();

        return Results.Ok(order);
    }
    catch (Exception ex)
    {
        // Record the error
        errorCounter.WithLabels("order_creation", ex.GetType().Name).Inc();
        throw;
    }
    finally
    {
        // Record the handling time for both successful and failed requests
        requestDuration.WithLabels("POST", "/api/orders")
            .Observe(stopwatch.Elapsed.TotalSeconds);
    }
});

10.2.2 业务指标设计

// Business health metrics
public class BusinessMetrics
{
    // Label set shared by the per-order counters so they can be compared in queries.
    private static readonly string[] OrderLabelNames = { "customer_type", "channel", "region" };

    // Customer metrics
    private readonly Counter _customerRegisteredCounter;
    private readonly Counter _customerActivatedCounter;
    private readonly Gauge _activeCustomersGauge;

    // Order metrics
    private readonly Counter _ordersCreatedCounter;
    private readonly Counter _ordersCompletedCounter;
    private readonly Counter _ordersCancelledCounter;
    private readonly Histogram _orderProcessingTimeHistogram;

    // Revenue metrics
    private readonly Counter _revenueCounter;
    private readonly Gauge _dailyRevenueGauge;

    /// <summary>
    /// Registers every business metric with the default prometheus-net registry.
    /// </summary>
    public BusinessMetrics()
    {
        _customerRegisteredCounter = Metrics.CreateCounter(
            "customers_registered_total",
            "Total number of customers registered",
            new[] { "registration_source", "country" });

        // No recording method uses the next two metrics yet; they are registered so the
        // fields are never null once such methods are added.
        _customerActivatedCounter = Metrics.CreateCounter(
            "customers_activated_total",
            "Total number of customers activated");

        _activeCustomersGauge = Metrics.CreateGauge(
            "active_customers",
            "Number of currently active customers");

        // NOTE(review): another snippet in this chapter registers "orders_created_total"
        // with a different label set; if both run in one process the second registration
        // is expected to fail - confirm and align the labels.
        _ordersCreatedCounter = Metrics.CreateCounter(
            "orders_created_total",
            "Total number of orders created",
            OrderLabelNames);

        // Previously never initialised, which made RecordOrderCompletion throw.
        _ordersCompletedCounter = Metrics.CreateCounter(
            "orders_completed_total",
            "Total number of orders completed",
            OrderLabelNames);

        _ordersCancelledCounter = Metrics.CreateCounter(
            "orders_cancelled_total",
            "Total number of orders cancelled",
            OrderLabelNames);

        _revenueCounter = Metrics.CreateCounter(
            "revenue_total_usd",
            "Total revenue in USD",
            new[] { "product_category", "payment_method" });

        _dailyRevenueGauge = Metrics.CreateGauge(
            "daily_revenue_usd",
            "Revenue of the current day in USD");

        _orderProcessingTimeHistogram = Metrics.CreateHistogram(
            "order_processing_duration_seconds",
            "Order processing time in seconds",
            new HistogramConfiguration
            {
                // 10 buckets starting at 0.1s, each twice the previous one
                Buckets = Histogram.ExponentialBuckets(0.1, 2, 10),
                LabelNames = new[] { "order_type", "priority" }
            });
    }

    /// <summary>Counts one customer registration.</summary>
    /// <param name="source">Value for the registration_source label.</param>
    /// <param name="country">Value for the country label.</param>
    public void RecordCustomerRegistration(string source, string country)
    {
        _customerRegisteredCounter.WithLabels(source, country).Inc();
    }

    /// <summary>Counts one completed order and adds its total to the revenue counter.</summary>
    /// <param name="order">The completed order.</param>
    public void RecordOrderCompletion(Order order)
    {
        // Invariant-culture lowering keeps label values stable across server locales.
        _ordersCompletedCounter.WithLabels(
            order.CustomerType.ToString().ToLowerInvariant(),
            order.Channel.ToString().ToLowerInvariant(),
            order.Region.ToString().ToLowerInvariant()).Inc();

        // Counter.Inc takes a double, so the amount is converted explicitly.
        _revenueCounter.WithLabels(
            order.ProductCategory,
            order.PaymentMethod.ToString().ToLowerInvariant()).Inc((double)order.TotalPrice.Amount);
    }

    /// <summary>Observes how long an order took to process.</summary>
    /// <param name="order">The processed order; supplies the order_type and priority labels.</param>
    /// <param name="processingTime">Elapsed processing time.</param>
    public void RecordOrderProcessingTime(Order order, TimeSpan processingTime)
    {
        _orderProcessingTimeHistogram.WithLabels(
            order.Type.ToString().ToLowerInvariant(),
            order.Priority.ToString().ToLowerInvariant())
            .Observe(processingTime.TotalSeconds);
    }
}

10.2.3 Grafana仪表盘配置

// Grafana仪表盘配置示例
{
  "dashboard": {
    "id": null,
    "title": "Order Service Dashboard",
    "tags": ["order-service", "microservices"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Orders per Second",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(orders_created_total[5m])",
            "legendFormat": "Orders/sec",
            "refId": "A"
          }
        ],
        "yAxes": [
          {
            "label": "Orders/sec",
            "min": 0
          }
        ]
      },
      {
        "id": 2,
        "title": "Order Processing Time",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(order_processing_duration_seconds_bucket[5m]))",
            "legendFormat": "95th percentile",
            "refId": "A"
          },
          {
            "expr": "histogram_quantile(0.50, rate(order_processing_duration_seconds_bucket[5m]))",
            "legendFormat": "50th percentile",
            "refId": "B"
          }
        ],
        "yAxes": [
          {
            "label": "Seconds",
            "min": 0
          }
        ]
      },
      {
        "id": 3,
        "title": "Error Rate",
        "type": "singlestat",
        "targets": [
          {
            "expr": "sum(rate(http_requests_received_total{code=~\"5..\"}[5m])) / sum(rate(http_requests_received_total[5m])) * 100",
            "refId": "A"
          }
        ],
        "valueName": "current",
        "format": "percent",
        "thresholds": "1,5",
        "colorBackground": true
      },
      {
        "id": 4,
        "title": "Revenue per Hour",
        "type": "graph",
        "targets": [
          {
            "expr": "increase(revenue_total_usd[1h])",
            "legendFormat": "Revenue (USD)",
            "refId": "A"
          }
        ],
        "yAxes": [
          {
            "label": "USD",
            "min": 0
          }
        ]
      }
    ]
  }
}

10.3 分布式追踪:请求的完整故事

10.3.1 OpenTelemetry配置

// 安装NuGet包
dotnet add package OpenTelemetry.Extensions.Hosting
dotnet add package OpenTelemetry.Instrumentation.AspNetCore  
dotnet add package OpenTelemetry.Instrumentation.HttpClient
dotnet add package OpenTelemetry.Instrumentation.EntityFrameworkCore
dotnet add package OpenTelemetry.Exporter.Jaeger
dotnet add package OpenTelemetry.Exporter.Console

// Program.cs - OpenTelemetry configuration
using System.Diagnostics;
using OpenTelemetry;
using OpenTelemetry.Metrics;
using OpenTelemetry.Resources;
using OpenTelemetry.Trace;

var builder = WebApplication.CreateBuilder(args);

builder.Services.AddOpenTelemetry()
    .WithTracing(traceBuilder =>
    {
        traceBuilder
            .AddSource("OrderService") // subscribe to the custom ActivitySource
            .SetResourceBuilder(ResourceBuilder.CreateDefault()
                .AddService("order-service")
                .AddAttributes(new Dictionary<string, object>
                {
                    ["service.version"] = "1.0.0",
                    ["service.namespace"] = "ecommerce",
                    ["deployment.environment"] = builder.Environment.EnvironmentName
                }))
            .AddAspNetCoreInstrumentation(options =>
            {
                options.Filter = (httpContext) =>
                {
                    // Skip requests that do not need tracing, such as health checks
                    return !httpContext.Request.Path.StartsWithSegments("/health");
                };
                options.EnrichWithHttpRequest = (activity, httpRequest) =>
                {
                    activity.SetTag("http.request_content_length", httpRequest.ContentLength);
                };
                options.EnrichWithHttpResponse = (activity, httpResponse) =>
                {
                    activity.SetTag("http.response_content_length", httpResponse.ContentLength);
                };
            })
            .AddHttpClientInstrumentation(options =>
            {
                options.FilterHttpRequestMessage = (request) =>
                {
                    // Skip calls to the tracing backend itself. RequestUri can be null on
                    // a request message; such requests are traced rather than crashing the filter.
                    return request.RequestUri is not { } uri || !uri.Host.Contains("jaeger");
                };
            })
            .AddEntityFrameworkCoreInstrumentation(options =>
            {
                // NOTE(review): this records SQL text on spans; confirm that statements
                // cannot contain sensitive literals before enabling it in production.
                options.SetDbStatementForText = true;
                options.EnrichWithIDbCommand = (activity, command) =>
                {
                    activity.SetTag("db.statement_type", command.CommandType.ToString());
                    activity.SetTag("db.connection_id", command.Connection?.GetHashCode());
                };
            })
            .AddJaegerExporter(options =>
            {
                // Only override the exporter's built-in agent endpoint when a value is
                // configured; a missing or malformed setting no longer throws at startup.
                var agentHost = builder.Configuration["Jaeger:AgentHost"];
                if (!string.IsNullOrWhiteSpace(agentHost))
                {
                    options.AgentHost = agentHost;
                }

                if (int.TryParse(
                        builder.Configuration["Jaeger:AgentPort"],
                        System.Globalization.NumberStyles.Integer,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out var agentPort))
                {
                    options.AgentPort = agentPort;
                }

                options.ExportProcessorType = ExportProcessorType.Batch;
                options.BatchExportProcessorOptions = new BatchExportProcessorOptions<Activity>
                {
                    MaxQueueSize = 2048,
                    ScheduledDelayMilliseconds = 5000,
                    ExporterTimeoutMilliseconds = 30000,
                    MaxExportBatchSize = 512
                };
            });

        // Also print spans to the console in the development environment
        if (builder.Environment.IsDevelopment())
        {
            traceBuilder.AddConsoleExporter();
        }
    })
    .WithMetrics(metricsBuilder =>
    {
        metricsBuilder
            .AddAspNetCoreInstrumentation()
            .AddHttpClientInstrumentation()
            .AddRuntimeInstrumentation()
            .AddPrometheusExporter();
    });

10.3.2 自定义追踪

// Custom ActivitySource wrapper for the order service
public class OrderActivitySource
{
    // Single shared source; name and version are what trace subscribers match on.
    private static readonly ActivitySource OrderServiceSource =
        new ActivitySource("OrderService", "1.0.0");

    /// <summary>
    /// Starts an activity on the "OrderService" source.
    /// </summary>
    /// <param name="name">Operation name of the activity.</param>
    /// <param name="kind">Activity kind; defaults to <see cref="ActivityKind.Internal"/>.</param>
    /// <returns>Whatever <see cref="ActivitySource.StartActivity(string, ActivityKind)"/> returns, which may be null.</returns>
    public static Activity StartActivity(string name, ActivityKind kind = ActivityKind.Internal)
        => OrderServiceSource.StartActivity(name, kind);
}

// Using custom tracing in business code
public class OrderService
{
    private readonly ILogger<OrderService> _logger;

    /// <summary>
    /// Creates the service with the logger supplied by dependency injection.
    /// </summary>
    /// <param name="logger">Logger for this service.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public OrderService(ILogger<OrderService> logger)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs validation, inventory, payment and notification for an order, each inside
    /// its own child activity of a "ProcessOrder" activity.
    /// </summary>
    /// <param name="order">The order to process.</param>
    /// <returns>The same order instance once all steps have completed.</returns>
    public async Task<Order> ProcessOrderAsync(Order order)
    {
        // Parent activity for the whole operation
        using var activity = OrderActivitySource.StartActivity("ProcessOrder");

        activity?.SetTag("order.id", order.Id);
        activity?.SetTag("order.customer_id", order.CustomerId);
        activity?.SetTag("order.total_amount", order.TotalPrice.Amount);
        activity?.SetTag("order.item_count", order.Items.Count);

        try
        {
            // Each step gets its own using block so its activity is stopped as soon as the
            // step finishes. With using declarations every step activity stayed open until
            // the method returned, so later steps were recorded as children of earlier
            // ones and all durations ran to the end of the method.

            // Validate the order
            using (OrderActivitySource.StartActivity("ValidateOrder"))
            {
                await ValidateOrderAsync(order);
            }

            // Process inventory
            using (OrderActivitySource.StartActivity("ProcessInventory"))
            {
                await ProcessInventoryAsync(order);
            }

            // Process payment
            using (OrderActivitySource.StartActivity("ProcessPayment"))
            {
                await ProcessPaymentAsync(order);
            }

            // Send the notification
            using (OrderActivitySource.StartActivity("SendNotification"))
            {
                await SendOrderConfirmationAsync(order);
            }

            activity?.SetStatus(ActivityStatusCode.Ok);

            return order;
        }
        catch (Exception ex)
        {
            activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
            activity?.RecordException(ex);
            throw;
        }
    }

    // Tags the validation activity with quantity and price of every order item.
    private async Task ValidateOrderAsync(Order order)
    {
        using var activity = OrderActivitySource.StartActivity("ValidateOrderItems");

        foreach (var item in order.Items)
        {
            activity?.SetTag($"item.{item.ProductId}.quantity", item.Quantity);
            activity?.SetTag($"item.{item.ProductId}.price", item.Price.Amount);

            // Validation logic...
        }
    }
}

// Cross-service trace context propagation
public class TracingHttpMessageHandler : DelegatingHandler
{
    private const string TraceParentHeader = "traceparent";
    private const string TraceStateHeader = "tracestate";

    /// <summary>
    /// Copies the current activity's W3C trace context onto the outgoing request, unless
    /// the request already carries a traceparent header, then forwards the request.
    /// </summary>
    /// <param name="request">The outgoing request.</param>
    /// <param name="cancellationToken">Token forwarded to the inner handler.</param>
    /// <returns>The response produced by the inner handler.</returns>
    protected override async Task<HttpResponseMessage> SendAsync(
        HttpRequestMessage request,
        CancellationToken cancellationToken)
    {
        var activity = Activity.Current;

        // Only a W3C-format id is a valid traceparent value; a hierarchical-format id is
        // not. The header is skipped when something else in the pipeline has already
        // written it, because adding a second value would produce a malformed header.
        if (activity is { IdFormat: ActivityIdFormat.W3C, Id: { } traceParent }
            && !request.Headers.Contains(TraceParentHeader))
        {
            request.Headers.TryAddWithoutValidation(TraceParentHeader, traceParent);

            if (!string.IsNullOrEmpty(activity.TraceStateString)
                && !request.Headers.Contains(TraceStateHeader))
            {
                request.Headers.TryAddWithoutValidation(TraceStateHeader, activity.TraceStateString);
            }
        }

        return await base.SendAsync(request, cancellationToken);
    }
}

10.3.3 追踪数据可视化

// Jaeger追踪数据示例
{
  "traceId": "7a3f1e8b9c2d4f6a",
  "spans": [
    {
      "spanId": "1a2b3c4d5e6f",
      "operationName": "POST /api/orders",
      "startTime": "2024-01-01T10:00:00.000Z",
      "duration": 2500,
      "tags": {
        "http.method": "POST",
        "http.url": "/api/orders",
        "http.status_code": 200,
        "customer.id": "123e4567-e89b-12d3-a456-426614174000"
      },
      "logs": [
        {
          "timestamp": "2024-01-01T10:00:01.000Z",
          "fields": [
            { "key": "event", "value": "Order validation started" }
          ]
        }
      ]
    },
    {
      "spanId": "2b3c4d5e6f7a",
      "operationName": "ProcessInventory",
      "startTime": "2024-01-01T10:00:01.100Z",
      "duration": 800,
      "parentSpanId": "1a2b3c4d5e6f",
      "tags": {
        "db.system": "postgresql",
        "db.statement": "UPDATE inventory SET quantity = quantity - ? WHERE product_id = ?"
      }
    }
  ]
}

10.4 健康检查与告警

10.4.1 健康检查实现

// Health check registration
// The "/health/ready" endpoint selects checks by the "ready" tag, so every check that
// must pass before the service accepts traffic carries that tag. Without it the
// readiness endpoint would run no checks at all and always report healthy.
builder.Services.AddHealthChecks()
    // The service's own status
    .AddCheck("self", () => HealthCheckResult.Healthy("Service is running"))

    // Database connection
    .AddNpgSql(
        connectionString: builder.Configuration.GetConnectionString("Default"),
        name: "database",
        failureStatus: HealthStatus.Unhealthy,
        tags: new[] { "database", "critical", "ready" })

    // Redis connection
    .AddRedis(
        redisConnectionString: builder.Configuration["Redis:ConnectionString"],
        name: "redis",
        failureStatus: HealthStatus.Degraded,
        tags: new[] { "cache", "redis", "ready" })

    // External service. Deliberately not tagged "ready": an outage of the payment
    // service should not take this service out of rotation as well.
    .AddUrlGroup(
        uri: new Uri(builder.Configuration["Services:PaymentService"] + "/health"),
        name: "payment-service",
        failureStatus: HealthStatus.Unhealthy,
        tags: new[] { "external-service", "payment" })

    // Message queue
    .AddRabbitMQ(
        rabbitConnectionString: builder.Configuration["RabbitMQ:ConnectionString"],
        name: "rabbitmq",
        failureStatus: HealthStatus.Degraded,
        tags: new[] { "messaging", "rabbitmq", "ready" })

    // Custom health checks
    .AddCheck<DatabaseMigrationHealthCheck>("database-migrations", tags: new[] { "ready" })
    .AddCheck<MemoryUsageHealthCheck>("memory-usage");

// Custom health check: reports unhealthy while the database has pending EF Core migrations.
public class DatabaseMigrationHealthCheck : IHealthCheck
{
    private readonly IDbContextFactory<OrderDbContext> _contextFactory;

    /// <summary>
    /// Creates the check with the context factory supplied by dependency injection.
    /// </summary>
    /// <param name="contextFactory">Factory used to create a short-lived context per check.</param>
    /// <exception cref="ArgumentNullException"><paramref name="contextFactory"/> is null.</exception>
    public DatabaseMigrationHealthCheck(IDbContextFactory<OrderDbContext> contextFactory)
    {
        _contextFactory = contextFactory ?? throw new ArgumentNullException(nameof(contextFactory));
    }

    /// <summary>
    /// Checks whether any migrations are still pending.
    /// </summary>
    /// <param name="context">Health check context (not used).</param>
    /// <param name="cancellationToken">Token passed to context creation and the migration query.</param>
    /// <returns>
    /// Healthy when nothing is pending; Unhealthy when migrations are pending or the
    /// check itself throws.
    /// </returns>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        try
        {
            await using var dbContext = await _contextFactory.CreateDbContextAsync(cancellationToken);

            // Materialise once: the result is read for both the emptiness test and the count.
            var pendingMigrations =
                (await dbContext.Database.GetPendingMigrationsAsync(cancellationToken)).ToList();

            if (pendingMigrations.Count > 0)
            {
                return HealthCheckResult.Unhealthy(
                    $"Database has {pendingMigrations.Count} pending migrations");
            }

            return HealthCheckResult.Healthy("Database is up to date");
        }
        catch (Exception ex)
        {
            return HealthCheckResult.Unhealthy(
                "Database health check failed",
                exception: ex);
        }
    }
}

public class MemoryUsageHealthCheck : IHealthCheck
{
    /// <summary>
    /// Compares the managed memory reported by the GC against a fixed 1 GB budget.
    /// </summary>
    /// <param name="context">Health check context (not used).</param>
    /// <param name="cancellationToken">Cancellation token (not used).</param>
    /// <returns>
    /// Unhealthy above 90% of the budget, Degraded above 70%, Healthy otherwise. The
    /// result data carries the byte counts and per-generation GC collection counts.
    /// </returns>
    public Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        const long maxBytes = 1024L * 1024L * 1024L; // 1GB
        long allocatedBytes = GC.GetTotalMemory(forceFullCollection: false);

        var data = new Dictionary<string, object>
        {
            { "AllocatedBytes", allocatedBytes },
            { "MaxBytes", maxBytes },
            { "Generation0Collections", GC.CollectionCount(0) },
            { "Generation1Collections", GC.CollectionCount(1) },
            { "Generation2Collections", GC.CollectionCount(2) }
        };

        double percentUsed = (double)allocatedBytes / maxBytes * 100;
        string description = $"Memory usage is {percentUsed:F1}%";

        // Thresholds are tested from the most severe downwards.
        HealthCheckResult result = percentUsed switch
        {
            > 90 => HealthCheckResult.Unhealthy(description, data: data),
            > 70 => HealthCheckResult.Degraded(description, data: data),
            _ => HealthCheckResult.Healthy(description, data: data)
        };

        return Task.FromResult(result);
    }
}

// Health check endpoints
// "/health" runs every registered check and writes a detailed JSON report.
app.MapHealthChecks("/health", new HealthCheckOptions
{
    ResponseWriter = async (httpContext, healthReport) =>
    {
        httpContext.Response.ContentType = "application/json";

        // One element per registered check
        var entries =
            from entry in healthReport.Entries
            select new
            {
                key = entry.Key,
                status = entry.Value.Status.ToString(),
                description = entry.Value.Description,
                duration = entry.Value.Duration.TotalMilliseconds,
                data = entry.Value.Data
            };

        var payload = new
        {
            status = healthReport.Status.ToString(),
            duration = healthReport.TotalDuration.TotalMilliseconds,
            info = entries
        };

        var json = JsonSerializer.Serialize(payload);
        await httpContext.Response.WriteAsync(json);
    }
});

// "/health/ready" only runs checks tagged "ready".
app.MapHealthChecks("/health/ready", new HealthCheckOptions
{
    Predicate = registration => registration.Tags.Contains("ready")
});

// "/health/live" runs the check named "self" plus any check tagged "live".
app.MapHealthChecks("/health/live", new HealthCheckOptions
{
    Predicate = registration => registration.Name == "self" || registration.Tags.Contains("live")
});

10.4.2 告警规则配置

# Prometheus alerting rules
groups:
- name: order-service-alerts
  rules:
  # Share of 5xx responses over all responses. Both sides are aggregated with sum():
  # dividing the raw vectors would only match series with identical labels, i.e. each
  # 5xx series divided by itself, which is always 1.
  # Metric and label names are the ones exported by prometheus-net's UseHttpMetrics().
  - alert: HighErrorRate
    expr: |
      sum(rate(http_requests_received_total{code=~"5.."}[5m]))
        / sum(rate(http_requests_received_total[5m])) > 0.05
    for: 5m
    labels:
      severity: critical
      service: order-service
    annotations:
      summary: "High error rate detected for order service"
      description: "Error rate is {{ $value | humanizePercentage }} for the last 5 minutes"

  # 95th percentile of the request duration, evaluated per label combination.
  - alert: HighResponseTime
    expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
    for: 5m
    labels:
      severity: warning
      service: order-service
    annotations:
      summary: "High response time for order service"
      description: "95th percentile response time is {{ $value }}s for the last 5 minutes"

  # NOTE(review): assumes a scrape job named "order-service-database" exists - confirm.
  - alert: DatabaseConnectionFailure
    expr: up{job="order-service-database"} == 0
    for: 1m
    labels:
      severity: critical
      service: order-service
      component: database
    annotations:
      summary: "Database connection failed for order service"
      description: "Database has been down for more than 1 minute"

  # Working set converted from bytes to GB; fires above 0.8 GB.
  - alert: HighMemoryUsage
    expr: process_working_set_bytes{job="order-service"} / 1024 / 1024 / 1024 > 0.8
    for: 5m
    labels:
      severity: warning
      service: order-service
      component: memory
    annotations:
      summary: "High memory usage for order service"
      description: "Memory usage is {{ $value }}GB for order service"

  # Fires immediately (for: 0m) when a product's stock drops below 10 units.
  - alert: LowStockWarning
    expr: inventory_stock_quantity < 10
    for: 0m
    labels:
      severity: warning
      service: inventory-service
      component: stock
    annotations:
      summary: "Low stock warning"
      description: "Product {{ $labels.product_id }} has low stock: {{ $value }} units"

10.5 小结

可观测性是微服务架构的基石。一个良好的可观测性系统应该:

  1. 三个支柱缺一不可:日志、指标、追踪相互补充
  2. 结构化是关键:结构化的数据更容易分析和查询
  3. 监控业务指标:技术指标重要,业务指标更重要
  4. 自动化告警:及时的告警能减少MTTR(平均修复时间)
  5. 持续优化:根据实际使用情况不断优化监控策略

记住,可观测性不是为了监控而监控,而是为了更好地理解系统、快速定位问题、持续优化性能。投资于可观测性,就是投资于系统的长期健康。

在下一章中,我们将探讨测试策略,确保微服务系统的质量和可靠性。

posted @ 2026-01-22 21:41  高宏顺  阅读(3)  评论(0)    收藏  举报