第十章 可观测性与监控
在微服务架构中,可观测性不是锦上添花,而是生存必需品。当系统由几十个服务组成、一个用户请求需要跨越多个服务时,如果没有良好的可观测性,排查问题就如同在黑暗中摸索。
10.1 集中式日志:系统的黑匣子
10.1.1 为什么传统日志管理不再适用
记得我第一次排查微服务问题时,需要SSH到5台不同的服务器,查看10个不同的日志文件,试图拼凑出一个完整的故事。那种感觉就像在破案,但手里只有零散的线索。
// ❌ 传统方式 - 分散的日志
// Server 1: /var/log/order-service/app.log
// Server 2: /var/log/payment-service/app.log
// Server 3: /var/log/inventory-service/app.log
// Server 4: /var/log/shipping-service/app.log
// Server 5: /var/log/notification-service/app.log
// 问题:如何关联这些日志?如何快速找到问题根源?
集中式日志解决了这个问题,它就像是飞机的黑匣子,记录了系统的完整运行轨迹。
10.1.2 ELK Stack实战配置
// Program.cs - Serilog configuration
using Elastic.CommonSchema.Serilog;
using Serilog;
using Serilog.Events;
using Serilog.Formatting.Json;
using Serilog.Sinks.Elasticsearch;
using Serilog.Sinks.File;

var builder = WebApplication.CreateBuilder(args);

// Fail fast with an explicit message instead of letting `new Uri(null)` throw
// an ArgumentNullException that does not name the missing setting.
var elasticsearchUri = builder.Configuration["Elasticsearch:Uri"];
if (string.IsNullOrWhiteSpace(elasticsearchUri))
{
    throw new InvalidOperationException("Configuration value 'Elasticsearch:Uri' is required.");
}

// Configure Serilog: console + daily rolling file + Elasticsearch.
Log.Logger = new LoggerConfiguration()
    .MinimumLevel.Debug()
    .MinimumLevel.Override("Microsoft", LogEventLevel.Information)
    .Enrich.FromLogContext()
    .Enrich.WithMachineName()
    .Enrich.WithProcessId()
    .Enrich.WithThreadId()
    .Enrich.WithProperty("Application", "OrderService")
    .Enrich.WithProperty("Environment", builder.Environment.EnvironmentName)
    .WriteTo.Console(
        outputTemplate: "[{Timestamp:HH:mm:ss} {Level:u3}] {CorrelationId} {Message:lj}{NewLine}{Exception}")
    .WriteTo.File(
        path: "logs/order-service-.txt",
        rollingInterval: RollingInterval.Day,
        retainedFileCountLimit: 7,
        outputTemplate: "{Timestamp:yyyy-MM-dd HH:mm:ss.fff zzz} [{Level:u3}] {CorrelationId} {Message:lj}{NewLine}{Exception}")
    .WriteTo.Elasticsearch(new ElasticsearchSinkOptions(new Uri(elasticsearchUri))
    {
        AutoRegisterTemplate = true,
        AutoRegisterTemplateVersion = AutoRegisterTemplateVersion.ESv7,
        IndexFormat = "order-service-{0:yyyy.MM.dd}",
        CustomFormatter = new EcsTextFormatter(),
        // Events that cannot be shipped are reported three ways: Serilog's SelfLog,
        // the local failure sink below, and this console callback.
        FailureCallback = e => Console.WriteLine($"Unable to submit event {e.RenderMessage()} to Elasticsearch"),
        EmitEventFailure = EmitEventFailureHandling.WriteToSelfLog |
                           EmitEventFailureHandling.WriteToFailureSink |
                           EmitEventFailureHandling.RaiseCallback,
        FailureSink = new FileSink("./failures.txt", new JsonFormatter(), null)
    })
    .CreateLogger();

try
{
    builder.Host.UseSerilog();

    var app = builder.Build();

    // Request logging middleware: attach request details to each completion event.
    app.UseSerilogRequestLogging(options =>
    {
        options.EnrichDiagnosticContext = (diagnosticContext, httpContext) =>
        {
            diagnosticContext.Set("RequestHost", httpContext.Request.Host.Value);
            diagnosticContext.Set("RequestScheme", httpContext.Request.Scheme);
            diagnosticContext.Set("RequestPath", httpContext.Request.Path);
            diagnosticContext.Set("QueryString", httpContext.Request.QueryString.Value);
            // The header value is a StringValues; convert it so it is logged as a single string.
            diagnosticContext.Set("UserAgent", httpContext.Request.Headers["User-Agent"].ToString());
            diagnosticContext.Set("IpAddress", httpContext.Connection.RemoteIpAddress?.ToString());
            // NOTE(review): GetCorrelationId() is an extension method not shown in this snippet.
            diagnosticContext.Set("CorrelationId", httpContext.GetCorrelationId());
        };
    });

    app.Run();
}
catch (Exception ex)
{
    Log.Fatal(ex, "Application terminated unexpectedly");
}
finally
{
    // Flush buffered events before the process exits.
    Log.CloseAndFlush();
}
10.1.3 结构化日志的最佳实践
// Record business events with structured logging
public class OrderService
{
    private readonly ILogger<OrderService> _logger;

    /// <summary>Creates the service with the logger used for all order events.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public OrderService(ILogger<OrderService> logger)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Creates an order from <paramref name="command"/>, logging the start, the result,
    /// and the elapsed time as structured events.
    /// </summary>
    /// <param name="command">Customer, order and item data for the new order.</param>
    /// <returns>The created order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="command"/> is null.</exception>
    /// <exception cref="BusinessRuleException">Rethrown after being logged as a warning.</exception>
    public async Task<Order> CreateOrderAsync(CreateOrderCommand command)
    {
        ArgumentNullException.ThrowIfNull(command);

        // Started before any work so the elapsed-time log covers the whole operation.
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();

        // Every log entry written inside this scope carries these properties.
        using var scope = _logger.BeginScope(new Dictionary<string, object>
        {
            ["CorrelationId"] = command.CorrelationId,
            ["CustomerId"] = command.CustomerId,
            ["OrderId"] = command.OrderId
        });

        _logger.LogInformation(
            "Starting order creation for customer {CustomerId} with {ItemCount} items",
            command.CustomerId,
            command.Items.Count);

        try
        {
            var order = Order.Create(command.CustomerId, command.Items);

            // Business event. The numeric amount is logged (not the price object)
            // so the field can be used in range queries.
            _logger.LogInformation(
                "Order {OrderId} created successfully. Total amount: {TotalAmount}",
                order.Id,
                order.TotalPrice.Amount);

            // Performance measurement
            _logger.LogInformation(
                "Order creation completed in {ElapsedMs}ms",
                stopwatch.ElapsedMilliseconds);

            return order;
        }
        catch (BusinessRuleException ex)
        {
            // Pass the exception itself so its type and stack trace are kept in the log.
            _logger.LogWarning(
                ex,
                "Business rule violation while creating order: {ErrorMessage}",
                ex.Message);
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogError(
                ex,
                "Unexpected error creating order for customer {CustomerId}",
                command.CustomerId);
            throw;
        }
    }
}
// Custom log event identifiers, grouped by area through their thousands digit.
public static class LogEvents
{
    // 1xxx - orders
    public const int OrderCreated = 1001, OrderCancelled = 1002;

    // 2xxx - payments
    public const int PaymentProcessed = 2001, PaymentFailed = 2002;

    // 3xxx - stock
    public const int StockReserved = 3001, StockInsufficient = 3002;

    // 4xxx - calls to external services
    public const int ExternalServiceCall = 4001, ExternalServiceError = 4002;
}
// Strongly typed helpers so call sites cannot mistype an event id or message template.
public static class LoggingExtensions
{
    /// <summary>Logs an information event with id <c>LogEvents.OrderCreated</c>.</summary>
    public static void LogOrderCreated(this ILogger logger, Guid orderId, decimal totalAmount)
        => logger.LogInformation(
            new EventId(LogEvents.OrderCreated),
            "Order {OrderId} created with total amount {TotalAmount}",
            orderId,
            totalAmount);

    /// <summary>Logs an error event with id <c>LogEvents.PaymentFailed</c>.</summary>
    public static void LogPaymentFailed(this ILogger logger, Guid orderId, string error)
        => logger.LogError(
            new EventId(LogEvents.PaymentFailed),
            "Payment failed for order {OrderId}: {Error}",
            orderId,
            error);
}
10.1.4 日志查询和分析
// 在Kibana中创建有用的查询
// 1. 查找特定用户的所有操作
{
"query": {
"bool": {
"must": [
{ "term": { "CustomerId": "123e4567-e89b-12d3-a456-426614174000" } }
]
}
}
}
// 2. 查找响应时间超过1秒的所有请求
{
"query": {
"bool": {
"must": [
{ "range": { "ElapsedMs": { "gt": 1000 } } },
{ "exists": { "field": "RequestPath" } }
]
}
}
}
// 3. 查找特定时间范围内的错误日志
{
"query": {
"bool": {
"must": [
{ "range": { "@timestamp": { "gte": "2024-01-01T00:00:00", "lte": "2024-01-01T23:59:59" } } },
{ "term": { "Level": "Error" } }
]
}
}
}
// 4. 查找外部服务调用失败
{
"query": {
"bool": {
"must": [
{ "term": { "EventId.Id": 4002 } },
{ "exists": { "field": "ExternalServiceName" } }
]
}
}
}
10.2 指标监控:系统的健康仪表盘
10.2.1 Prometheus + Grafana实战
// 安装NuGet包
dotnet add package prometheus-net.AspNetCore
dotnet add package prometheus-net.DotNetRuntime
// Program.cs - Prometheus metrics configuration
using System.Diagnostics;
using Prometheus;
using Prometheus.DotNetRuntime;

var builder = WebApplication.CreateBuilder(args);

// Register the Prometheus metric server
builder.Services.AddMetricServer(options =>
{
    options.Port = 9090; // Prometheus metrics port
});

var app = builder.Build();

// Start collecting .NET runtime metrics. The collector is disposable and
// stays alive until the program exits.
using var runtimeStatsCollector = DotNetRuntimeStatsBuilder.Default().StartCollecting();

// Prometheus middleware
app.UseMetricServer();
app.UseHttpMetrics(); // collects HTTP request metrics automatically

// Custom business metrics
var orderCounter = Metrics.CreateCounter(
    "orders_created_total",
    "Total number of orders created",
    new[] { "customer_type", "payment_method" });

var orderValueHistogram = Metrics.CreateHistogram(
    "order_value_usd",
    "Order value in USD",
    new HistogramConfiguration
    {
        Buckets = Histogram.LinearBuckets(start: 10, width: 50, count: 10),
        LabelNames = new[] { "customer_type" }
    });

var activeOrdersGauge = Metrics.CreateGauge(
    "active_orders_count",
    "Number of active orders being processed");

// Errors raised by the endpoint, by operation and exception type.
var errorCounter = Metrics.CreateCounter(
    "order_errors_total",
    "Total number of errors by operation and exception type",
    new[] { "operation", "exception_type" });

// Handler duration, by HTTP method and route.
var requestDuration = Metrics.CreateHistogram(
    "order_request_duration_seconds",
    "Duration of order API requests in seconds",
    new HistogramConfiguration
    {
        LabelNames = new[] { "method", "endpoint" }
    });

// Use the custom metrics
// NOTE(review): `orderService` is not declared in this snippet; it is presumably
// resolved from DI (for example as an extra handler parameter) - confirm.
app.MapPost("/api/orders", async (CreateOrderRequest request) =>
{
    var stopwatch = Stopwatch.StartNew();
    try
    {
        var order = await orderService.CreateOrderAsync(request);

        // Label values are lowercased with the invariant culture so they do not
        // change with the server's locale.
        var customerType = order.CustomerType.ToString().ToLowerInvariant();
        var paymentMethod = order.PaymentMethod.ToString().ToLowerInvariant();

        // Count the created order
        orderCounter.WithLabels(customerType, paymentMethod).Inc();

        // Record the order value distribution
        orderValueHistogram.WithLabels(customerType)
            .Observe((double)order.TotalPrice.Amount);

        // One more active order.
        // NOTE(review): nothing in this snippet decrements the gauge; it must be
        // decremented wherever an order finishes processing.
        activeOrdersGauge.Inc();

        return Results.Ok(order);
    }
    catch (Exception ex)
    {
        // Record the error
        errorCounter.WithLabels("order_creation", ex.GetType().Name).Inc();
        throw;
    }
    finally
    {
        // Record the handling time for both successful and failed requests
        requestDuration.WithLabels("POST", "/api/orders")
            .Observe(stopwatch.Elapsed.TotalSeconds);
    }
});

app.Run();
10.2.2 业务指标设计
// Business health metrics
public class BusinessMetrics
{
    // Customer metrics
    private readonly Counter _customerRegisteredCounter;
    private readonly Counter _customerActivatedCounter;
    private readonly Gauge _activeCustomersGauge;

    // Order metrics
    private readonly Counter _ordersCreatedCounter;
    private readonly Counter _ordersCompletedCounter;
    private readonly Counter _ordersCancelledCounter;
    private readonly Histogram _orderProcessingTimeHistogram;

    // Revenue metrics
    private readonly Counter _revenueCounter;
    private readonly Gauge _dailyRevenueGauge;

    /// <summary>
    /// Creates every metric up front so no recording method can hit an
    /// unassigned (null) field.
    /// </summary>
    public BusinessMetrics()
    {
        _customerRegisteredCounter = Metrics.CreateCounter(
            "customers_registered_total",
            "Total number of customers registered",
            new[] { "registration_source", "country" });

        _customerActivatedCounter = Metrics.CreateCounter(
            "customers_activated_total",
            "Total number of customers activated");

        _activeCustomersGauge = Metrics.CreateGauge(
            "active_customers_count",
            "Number of currently active customers");

        _ordersCreatedCounter = Metrics.CreateCounter(
            "orders_created_total",
            "Total number of orders created",
            new[] { "customer_type", "channel", "region" });

        // Label names match the three values passed in RecordOrderCompletion.
        _ordersCompletedCounter = Metrics.CreateCounter(
            "orders_completed_total",
            "Total number of orders completed",
            new[] { "customer_type", "channel", "region" });

        _ordersCancelledCounter = Metrics.CreateCounter(
            "orders_cancelled_total",
            "Total number of orders cancelled");

        _revenueCounter = Metrics.CreateCounter(
            "revenue_total_usd",
            "Total revenue in USD",
            new[] { "product_category", "payment_method" });

        _dailyRevenueGauge = Metrics.CreateGauge(
            "daily_revenue_usd",
            "Revenue for the current day in USD");

        _orderProcessingTimeHistogram = Metrics.CreateHistogram(
            "order_processing_duration_seconds",
            "Order processing time in seconds",
            new HistogramConfiguration
            {
                Buckets = Histogram.ExponentialBuckets(0.1, 2, 10),
                LabelNames = new[] { "order_type", "priority" }
            });
    }

    /// <summary>Counts one customer registration for the given source and country.</summary>
    public void RecordCustomerRegistration(string source, string country)
    {
        _customerRegisteredCounter.WithLabels(source, country).Inc();
    }

    /// <summary>Counts a completed order and adds its total to the revenue counter.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="order"/> is null.</exception>
    public void RecordOrderCompletion(Order order)
    {
        ArgumentNullException.ThrowIfNull(order);

        // Invariant lowercasing keeps label values independent of the server locale.
        _ordersCompletedCounter.WithLabels(
            order.CustomerType.ToString().ToLowerInvariant(),
            order.Channel.ToString().ToLowerInvariant(),
            order.Region.ToString().ToLowerInvariant()).Inc();

        // The amount is cast to double explicitly, as the other metric calls in this chapter do.
        _revenueCounter.WithLabels(
            order.ProductCategory,
            order.PaymentMethod.ToString().ToLowerInvariant()).Inc((double)order.TotalPrice.Amount);
    }

    /// <summary>Observes how long an order took to process, in seconds.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="order"/> is null.</exception>
    public void RecordOrderProcessingTime(Order order, TimeSpan processingTime)
    {
        ArgumentNullException.ThrowIfNull(order);

        _orderProcessingTimeHistogram.WithLabels(
            order.Type.ToString().ToLowerInvariant(),
            order.Priority.ToString().ToLowerInvariant())
            .Observe(processingTime.TotalSeconds);
    }
}
10.2.3 Grafana仪表盘配置
// Grafana仪表盘配置示例
{
"dashboard": {
"id": null,
"title": "Order Service Dashboard",
"tags": ["order-service", "microservices"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "Orders per Second",
"type": "graph",
"targets": [
{
"expr": "rate(orders_created_total[5m])",
"legendFormat": "Orders/sec",
"refId": "A"
}
],
"yAxes": [
{
"label": "Orders/sec",
"min": 0
}
]
},
{
"id": 2,
"title": "Order Processing Time",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(order_processing_duration_seconds_bucket[5m]))",
"legendFormat": "95th percentile",
"refId": "A"
},
{
"expr": "histogram_quantile(0.50, rate(order_processing_duration_seconds_bucket[5m]))",
"legendFormat": "50th percentile",
"refId": "B"
}
],
"yAxes": [
{
"label": "Seconds",
"min": 0
}
]
},
{
"id": 3,
"title": "Error Rate",
"type": "singlestat",
"targets": [
{
"expr": "sum(rate(http_requests_received_total{code=~\"5..\"}[5m])) / sum(rate(http_requests_received_total[5m])) * 100",
"refId": "A"
}
],
"valueName": "current",
"format": "percent",
"thresholds": "1,5",
"colorBackground": true
},
{
"id": 4,
"title": "Revenue per Hour",
"type": "graph",
"targets": [
{
"expr": "increase(revenue_total_usd[1h])",
"legendFormat": "Revenue (USD)",
"refId": "A"
}
],
"yAxes": [
{
"label": "USD",
"min": 0
}
]
}
]
}
}
10.3 分布式追踪:请求的完整故事
10.3.1 OpenTelemetry配置
// 安装NuGet包
dotnet add package OpenTelemetry.Extensions.Hosting
dotnet add package OpenTelemetry.Instrumentation.AspNetCore
dotnet add package OpenTelemetry.Instrumentation.HttpClient
dotnet add package OpenTelemetry.Instrumentation.EntityFrameworkCore
dotnet add package OpenTelemetry.Exporter.Jaeger
dotnet add package OpenTelemetry.Exporter.Console
// Program.cs - OpenTelemetry configuration
var builder = WebApplication.CreateBuilder(args);

builder.Services.AddOpenTelemetry()
    .WithTracing(traceBuilder =>
    {
        traceBuilder
            .AddSource("OrderService") // register the custom ActivitySource by name
            .SetResourceBuilder(ResourceBuilder.CreateDefault()
                .AddService("order-service")
                .AddAttributes(new Dictionary<string, object>
                {
                    ["service.version"] = "1.0.0",
                    ["service.namespace"] = "ecommerce",
                    ["deployment.environment"] = builder.Environment.EnvironmentName
                }))
            .AddAspNetCoreInstrumentation(options =>
            {
                options.Filter = (httpContext) =>
                {
                    // Skip requests that do not need tracing, such as health checks
                    return !httpContext.Request.Path.StartsWithSegments("/health");
                };
                options.EnrichWithHttpRequest = (activity, httpRequest) =>
                {
                    activity.SetTag("http.request_content_length", httpRequest.ContentLength);
                };
                options.EnrichWithHttpResponse = (activity, httpResponse) =>
                {
                    activity.SetTag("http.response_content_length", httpResponse.ContentLength);
                };
            })
            .AddHttpClientInstrumentation(options =>
            {
                options.FilterHttpRequestMessage = (request) =>
                {
                    // Skip calls to the tracing backend itself. RequestUri can be null,
                    // in which case the request is traced.
                    var host = request.RequestUri?.Host;
                    return host is null || !host.Contains("jaeger", StringComparison.Ordinal);
                };
            })
            .AddEntityFrameworkCoreInstrumentation(options =>
            {
                options.SetDbStatementForText = true;
                options.EnrichWithIDbCommand = (activity, command) =>
                {
                    activity.SetTag("db.statement_type", command.CommandType.ToString());
                    activity.SetTag("db.connection_id", command.Connection?.GetHashCode());
                };
            })
            // NOTE(review): the OpenTelemetry.Exporter.Jaeger package has been deprecated
            // upstream in favour of the OTLP exporter - confirm before adopting.
            .AddJaegerExporter(options =>
            {
                // Override the exporter's defaults only when a value is configured, so a
                // missing setting neither nulls out the host nor crashes on int.Parse(null).
                var agentHost = builder.Configuration["Jaeger:AgentHost"];
                if (!string.IsNullOrWhiteSpace(agentHost))
                {
                    options.AgentHost = agentHost;
                }

                var agentPort = builder.Configuration["Jaeger:AgentPort"];
                if (!string.IsNullOrWhiteSpace(agentPort))
                {
                    // A configured but malformed port still throws, so misconfiguration is not hidden.
                    options.AgentPort = int.Parse(agentPort, System.Globalization.CultureInfo.InvariantCulture);
                }

                options.ExportProcessorType = ExportProcessorType.Batch;
                options.BatchExportProcessorOptions = new BatchExportProcessorOptions<Activity>
                {
                    MaxQueueSize = 2048,
                    ScheduledDelayMilliseconds = 5000,
                    ExporterTimeoutMilliseconds = 30000,
                    MaxExportBatchSize = 512
                };
            });

        // Also print spans to the console in the development environment
        if (builder.Environment.IsDevelopment())
        {
            traceBuilder.AddConsoleExporter();
        }
    })
    .WithMetrics(metricsBuilder =>
    {
        // NOTE(review): AddRuntimeInstrumentation and AddPrometheusExporter presumably come
        // from packages not in the install list above, and the Prometheus exporter likely
        // needs its scraping endpoint mapped on the app - confirm.
        metricsBuilder
            .AddAspNetCoreInstrumentation()
            .AddHttpClientInstrumentation()
            .AddRuntimeInstrumentation()
            .AddPrometheusExporter();
    });
10.3.2 自定义追踪
// Custom ActivitySource
public class OrderActivitySource
{
    // The name "OrderService" must match the name passed to AddSource(...) in the
    // tracing configuration for these activities to be collected.
    private static readonly ActivitySource Source = new("OrderService", "1.0.0");

    /// <summary>Starts an activity on the shared order-service source.</summary>
    /// <param name="name">Operation name of the activity.</param>
    /// <param name="kind">Activity kind; defaults to <see cref="ActivityKind.Internal"/>.</param>
    /// <returns>
    /// The started activity, or <c>null</c> when no listener is sampling this source.
    /// Callers must null-check, for example with <c>activity?.SetTag(...)</c>.
    /// </returns>
    public static Activity? StartActivity(string name, ActivityKind kind = ActivityKind.Internal)
    {
        return Source.StartActivity(name, kind);
    }
}
// Use custom tracing in business code
public class OrderService
{
    private readonly ILogger<OrderService> _logger;

    /// <summary>Creates the service with its logger.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public OrderService(ILogger<OrderService> logger)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs validation, inventory, payment and notification for an order. The whole
    /// operation is one "ProcessOrder" activity and each step is a child activity.
    /// </summary>
    /// <param name="order">The order to process.</param>
    /// <returns>The same order once every step has completed.</returns>
    /// <remarks>
    /// Any exception is recorded on the "ProcessOrder" activity and rethrown.
    /// ProcessInventoryAsync, ProcessPaymentAsync and SendOrderConfirmationAsync are
    /// not shown in this snippet.
    /// </remarks>
    public async Task<Order> ProcessOrderAsync(Order order)
    {
        ArgumentNullException.ThrowIfNull(order);

        // Start a custom activity for the whole operation
        using var activity = OrderActivitySource.StartActivity("ProcessOrder");
        activity?.SetTag("order.id", order.Id);
        activity?.SetTag("order.customer_id", order.CustomerId);
        activity?.SetTag("order.total_amount", order.TotalPrice.Amount);
        activity?.SetTag("order.item_count", order.Items.Count);

        try
        {
            // Each step gets its own using block so its activity ends as soon as the
            // step finishes. The steps then appear as siblings under "ProcessOrder"
            // with accurate durations instead of nesting inside one another.

            // Validate the order
            using (OrderActivitySource.StartActivity("ValidateOrder"))
            {
                await ValidateOrderAsync(order);
            }

            // Process inventory
            using (OrderActivitySource.StartActivity("ProcessInventory"))
            {
                await ProcessInventoryAsync(order);
            }

            // Process payment
            using (OrderActivitySource.StartActivity("ProcessPayment"))
            {
                await ProcessPaymentAsync(order);
            }

            // Send the notification
            using (OrderActivitySource.StartActivity("SendNotification"))
            {
                await SendOrderConfirmationAsync(order);
            }

            activity?.SetStatus(ActivityStatusCode.Ok);
            return order;
        }
        catch (Exception ex)
        {
            activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
            activity?.RecordException(ex);
            throw;
        }
    }

    // Tags the validation activity with each item's quantity and price.
    private async Task ValidateOrderAsync(Order order)
    {
        using var activity = OrderActivitySource.StartActivity("ValidateOrderItems");

        foreach (var item in order.Items)
        {
            activity?.SetTag($"item.{item.ProductId}.quantity", item.Quantity);
            activity?.SetTag($"item.{item.ProductId}.price", item.Price.Amount);

            // Validation logic...
        }
    }
}
// Cross-service trace propagation
public class TracingHttpMessageHandler : DelegatingHandler
{
    private const string TraceParentHeader = "traceparent";
    private const string TraceStateHeader = "tracestate";

    /// <summary>
    /// Adds the W3C trace headers for <see cref="Activity.Current"/> to the outgoing
    /// request, then forwards it to the inner handler.
    /// </summary>
    /// <remarks>
    /// Headers are only written when the current activity uses the W3C id format,
    /// because only then is <see cref="Activity.Id"/> a valid <c>traceparent</c> value.
    /// A <c>traceparent</c> that is already on the request is left untouched, so a
    /// retried request or one stamped by another component never carries two values.
    /// </remarks>
    protected override async Task<HttpResponseMessage> SendAsync(
        HttpRequestMessage request,
        CancellationToken cancellationToken)
    {
        // Inject the trace context
        var activity = Activity.Current;
        if (request is not null
            && activity is { IdFormat: ActivityIdFormat.W3C, Id: not null }
            && !request.Headers.Contains(TraceParentHeader))
        {
            request.Headers.TryAddWithoutValidation(TraceParentHeader, activity.Id);

            if (!string.IsNullOrEmpty(activity.TraceStateString)
                && !request.Headers.Contains(TraceStateHeader))
            {
                request.Headers.TryAddWithoutValidation(TraceStateHeader, activity.TraceStateString);
            }
        }

        // A null request is rejected by the base implementation.
        return await base.SendAsync(request!, cancellationToken);
    }
}
10.3.3 追踪数据可视化
// Jaeger追踪数据示例
{
"traceId": "7a3f1e8b9c2d4f60",
"spans": [
{
"spanId": "1a2b3c4d5e6f",
"operationName": "POST /api/orders",
"startTime": "2024-01-01T10:00:00.000Z",
"duration": 2500,
"tags": {
"http.method": "POST",
"http.url": "/api/orders",
"http.status_code": 200,
"customer.id": "123e4567-e89b-12d3-a456-426614174000"
},
"logs": [
{
"timestamp": "2024-01-01T10:00:01.000Z",
"fields": [
{ "key": "event", "value": "Order validation started" }
]
}
]
},
{
"spanId": "2b3c4d5e6f7a",
"operationName": "ProcessInventory",
"startTime": "2024-01-01T10:00:01.100Z",
"duration": 800,
"parentSpanId": "1a2b3c4d5e6f",
"tags": {
"db.system": "postgresql",
"db.statement": "UPDATE inventory SET quantity = quantity - ? WHERE product_id = ?"
}
}
]
}
10.4 健康检查与告警
10.4.1 健康检查实现
// Health check configuration
// Tag convention: "live" marks checks for the liveness endpoint and "ready" marks checks
// for the readiness endpoint. An endpoint that filters on a tag no check carries runs
// zero checks and always reports Healthy, so the infrastructure checks are tagged "ready".
builder.Services.AddHealthChecks()
    // The service itself
    .AddCheck(
        "self",
        () => HealthCheckResult.Healthy("Service is running"),
        tags: new[] { "live" })
    // Database connection
    .AddNpgSql(
        connectionString: builder.Configuration.GetConnectionString("Default"),
        name: "database",
        failureStatus: HealthStatus.Unhealthy,
        tags: new[] { "database", "critical", "ready" })
    // Redis connection
    .AddRedis(
        redisConnectionString: builder.Configuration["Redis:ConnectionString"],
        name: "redis",
        failureStatus: HealthStatus.Degraded,
        tags: new[] { "cache", "redis", "ready" })
    // External service. Deliberately not tagged "ready": tying this service's readiness
    // to a downstream service lets one outage cascade through the system.
    .AddUrlGroup(
        uri: new Uri(builder.Configuration["Services:PaymentService"] + "/health"),
        name: "payment-service",
        failureStatus: HealthStatus.Unhealthy,
        tags: new[] { "external-service", "payment" })
    // Message queue
    .AddRabbitMQ(
        rabbitConnectionString: builder.Configuration["RabbitMQ:ConnectionString"],
        name: "rabbitmq",
        failureStatus: HealthStatus.Degraded,
        tags: new[] { "messaging", "rabbitmq", "ready" })
    // Custom health checks
    .AddCheck<DatabaseMigrationHealthCheck>("database-migrations", tags: new[] { "ready" })
    .AddCheck<MemoryUsageHealthCheck>("memory-usage");
// Custom health check: reports Unhealthy while the database has unapplied migrations.
public class DatabaseMigrationHealthCheck : IHealthCheck
{
    private readonly IDbContextFactory<OrderDbContext> _contextFactory;

    /// <summary>Creates the check with the factory used to open a short-lived context.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="contextFactory"/> is null.</exception>
    public DatabaseMigrationHealthCheck(IDbContextFactory<OrderDbContext> contextFactory)
    {
        _contextFactory = contextFactory ?? throw new ArgumentNullException(nameof(contextFactory));
    }

    /// <summary>
    /// Returns Healthy when no migrations are pending. Returns Unhealthy when some are
    /// pending or when the database cannot be queried.
    /// </summary>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        try
        {
            await using var dbContext = _contextFactory.CreateDbContext();

            // Materialize once so the sequence is not enumerated twice (emptiness + count).
            var pendingMigrations =
                (await dbContext.Database.GetPendingMigrationsAsync(cancellationToken)).ToList();

            if (pendingMigrations.Count > 0)
            {
                return HealthCheckResult.Unhealthy(
                    $"Database has {pendingMigrations.Count} pending migrations");
            }

            return HealthCheckResult.Healthy("Database is up to date");
        }
        catch (Exception ex)
        {
            // A failed probe is reported as a result, not thrown, and carries the exception.
            return HealthCheckResult.Unhealthy(
                "Database health check failed",
                exception: ex);
        }
    }
}
public class MemoryUsageHealthCheck : IHealthCheck
{
    /// <summary>
    /// Compares the managed heap size reported by the GC against a fixed 1 GB budget:
    /// above 90% is Unhealthy, above 70% is Degraded, otherwise Healthy. Every result
    /// carries the same diagnostic data.
    /// </summary>
    public Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        const long budgetBytes = 1024L * 1024L * 1024L; // 1GB

        long managedBytes = GC.GetTotalMemory(forceFullCollection: false);

        var details = new Dictionary<string, object>
        {
            ["AllocatedBytes"] = managedBytes,
            ["MaxBytes"] = budgetBytes,
            ["Generation0Collections"] = GC.CollectionCount(0),
            ["Generation1Collections"] = GC.CollectionCount(1),
            ["Generation2Collections"] = GC.CollectionCount(2)
        };

        double usagePercent = (double)managedBytes / budgetBytes * 100;
        string summary = $"Memory usage is {usagePercent:F1}%";

        // Arms are tested top to bottom, so the 90% threshold wins over the 70% one.
        HealthCheckResult outcome = usagePercent switch
        {
            > 90 => HealthCheckResult.Unhealthy(summary, data: details),
            > 70 => HealthCheckResult.Degraded(summary, data: details),
            _ => HealthCheckResult.Healthy(summary, data: details)
        };

        return Task.FromResult(outcome);
    }
}
// Health check routes

// /health: JSON report covering every registered check.
var detailedReportOptions = new HealthCheckOptions
{
    ResponseWriter = async (httpContext, healthReport) =>
    {
        httpContext.Response.ContentType = "application/json";

        // One entry per check. The property names become the JSON keys.
        var entries =
            from entry in healthReport.Entries
            select new
            {
                key = entry.Key,
                status = entry.Value.Status.ToString(),
                description = entry.Value.Description,
                duration = entry.Value.Duration.TotalMilliseconds,
                data = entry.Value.Data
            };

        var payload = new
        {
            status = healthReport.Status.ToString(),
            duration = healthReport.TotalDuration.TotalMilliseconds,
            info = entries
        };

        await httpContext.Response.WriteAsync(JsonSerializer.Serialize(payload));
    }
};
app.MapHealthChecks("/health", detailedReportOptions);

// /health/ready: only checks tagged "ready".
var readinessOptions = new HealthCheckOptions
{
    Predicate = registration => registration.Tags.Contains("ready")
};
app.MapHealthChecks("/health/ready", readinessOptions);

// /health/live: the "self" check plus anything tagged "live".
var livenessOptions = new HealthCheckOptions
{
    Predicate = registration => registration.Name == "self" || registration.Tags.Contains("live")
};
app.MapHealthChecks("/health/live", livenessOptions);
10.4.2 告警规则配置
# Prometheus alerting rules
groups:
  - name: order-service-alerts
    rules:
      # Both sides are aggregated with sum() before dividing. Without it the division
      # matches series label-by-label, so only 5xx series pair up and the ratio is always 1.
      # Metric and label names follow prometheus-net's UseHttpMetrics()
      # (http_requests_received_total, label "code").
      - alert: HighErrorRate
        expr: sum(rate(http_requests_received_total{code=~"5.."}[5m])) / sum(rate(http_requests_received_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
          service: order-service
        annotations:
          summary: "High error rate detected for order service"
          description: "Error rate is {{ $value | humanizePercentage }} for the last 5 minutes"

      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
          service: order-service
        annotations:
          summary: "High response time for order service"
          description: "95th percentile response time is {{ $value }}s for the last 5 minutes"

      - alert: DatabaseConnectionFailure
        expr: up{job="order-service-database"} == 0
        for: 1m
        labels:
          severity: critical
          service: order-service
          component: database
        annotations:
          summary: "Database connection failed for order service"
          description: "Database has been down for more than 1 minute"

      # The working set is converted from bytes to GiB before comparing with 0.8.
      - alert: HighMemoryUsage
        expr: process_working_set_bytes{job="order-service"} / 1024 / 1024 / 1024 > 0.8
        for: 5m
        labels:
          severity: warning
          service: order-service
          component: memory
        annotations:
          summary: "High memory usage for order service"
          description: "Memory usage is {{ $value }}GB for order service"

      # "for: 0m" fires as soon as the expression is true.
      - alert: LowStockWarning
        expr: inventory_stock_quantity < 10
        for: 0m
        labels:
          severity: warning
          service: inventory-service
          component: stock
        annotations:
          summary: "Low stock warning"
          description: "Product {{ $labels.product_id }} has low stock: {{ $value }} units"
10.5 小结
可观测性是微服务架构的基石。一个良好的可观测性系统应该:
- 三个支柱缺一不可:日志、指标、追踪相互补充
- 结构化是关键:结构化的数据更容易分析和查询
- 监控业务指标:技术指标重要,业务指标更重要
- 自动化告警:及时的告警能减少MTTR(平均修复时间)
- 持续优化:根据实际使用情况不断优化监控策略
记住,可观测性不是为了监控而监控,而是为了更好地理解系统、快速定位问题、持续优化性能。投资于可观测性,就是投资于系统的长期健康。
在下一章中,我们将探讨测试策略,确保微服务系统的质量和可靠性。

浙公网安备 33010602011771号