Kubernetes Logging and Monitoring: Building an Observability Stack
Introduction
In a Kubernetes environment, logs and monitoring are essential for day-to-day operations and troubleshooting. A sound observability stack helps you localize problems quickly and optimize performance. This article takes a close look at log collection, metrics monitoring, and distributed tracing in Kubernetes.
1. Log Management
1.1 Container Logging Basics
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: logging-demo
spec:
  containers:
  - name: app
    image: myapp:1.0
    args: ["--log-level=info"]
    resources:
      limits:
        memory: "128Mi"
        cpu: "500m"
    volumeMounts:
    - name: log-volume
      mountPath: /var/log
  volumes:
  - name: log-volume
    emptyDir: {}
```
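Besides writing files into a mounted volume as above, remember that Kubernetes captures anything a container writes to stdout/stderr, which is what kubectl logs reads; emitting structured JSON on stdout keeps the collection pipeline simple. A minimal sketch using the standard library's log/slog (Go 1.21+); the log fields are illustrative:

```go
package main

import (
	"log/slog"
	"os"
)

func main() {
	// JSON logs on stdout are captured by the container runtime and can be
	// parsed by Filebeat/Fluentd without extra multiline rules.
	logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
		Level: slog.LevelInfo, // mirrors the pod's --log-level=info argument
	}))

	logger.Info("request handled",
		"method", "GET",
		"path", "/api/users",
		"status", 200,
	)
}
```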
1.2 Log Collection Architecture
```
┌─────────────────────────────────────────────────────┐
│                 Kubernetes Cluster                  │
│  ┌──────────────────┐     ┌──────────────────┐      │
│  │      Node 1      │     │      Node 2      │      │
│  │ ┌──────────────┐ │     │ ┌──────────────┐ │      │
│  │ │ Application  │ │     │ │ Application  │ │      │
│  │ │  Container   │ │     │ │  Container   │ │      │
│  │ └──────┬───────┘ │     │ └──────┬───────┘ │      │
│  │        ▼         │     │        ▼         │      │
│  │ ┌──────────────┐ │     │ ┌──────────────┐ │      │
│  │ │   Filebeat   │ │     │ │   Filebeat   │ │      │
│  │ │  (Sidecar)   │ │     │ │  (Sidecar)   │ │      │
│  │ └──────┬───────┘ │     │ └──────┬───────┘ │      │
│  └────────┼─────────┘     └────────┼─────────┘      │
│           └────────────┬───────────┘                │
│                        ▼                            │
├─────────────────────────────────────────────────────┤
│               Logstash / Fluentd                    │
│             (log processing pipeline)               │
├─────────────────────────────────────────────────────┤
│                        ▼                            │
│              Elasticsearch / Loki                   │
│                  (log storage)                      │
├─────────────────────────────────────────────────────┤
│                        ▼                            │
│                Kibana / Grafana                     │
│           (log query and visualization)             │
└─────────────────────────────────────────────────────┘
```
1.3 Sidecar Log Collection
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: app-with-logging
spec:
  containers:
  - name: app
    image: myapp:1.0
    volumeMounts:
    - name: logs
      mountPath: /var/log/app
  - name: filebeat
    image: elastic/filebeat:8.5.0
    args: ["-c", "/etc/filebeat/filebeat.yml"]
    volumeMounts:
    - name: logs
      mountPath: /var/log/app
    - name: config
      mountPath: /etc/filebeat
  volumes:
  - name: logs
    emptyDir: {}
  - name: config
    configMap:
      name: filebeat-config
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: filebeat-config
data:
  filebeat.yml: |
    filebeat.inputs:
    - type: log
      paths:
      - /var/log/app/*.log
      fields:
        app: myapp
    output.elasticsearch:
      hosts: ["elasticsearch:9200"]
    setup.kibana:
      host: "kibana:5601"
```
2. Metrics Monitoring
2.1 Prometheus Configuration
```yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: app-monitor
  labels:
    release: prometheus   # must match the Prometheus Operator's serviceMonitorSelector
spec:
  selector:
    matchLabels:
      app: myapp          # selects Services (not Pods) carrying this label
  endpoints:
  - port: http            # the name of the port on the selected Service
    path: /metrics
    interval: 30s
    scrapeTimeout: 10s
```
2.2 Exposing Application Metrics
```go
package metrics

import (
	"fmt"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

type AppMetrics struct {
	httpRequestsTotal   *prometheus.CounterVec
	httpRequestDuration *prometheus.HistogramVec
	activeConnections   prometheus.Gauge // NewGauge returns the Gauge interface, not a pointer
}

func NewAppMetrics() *AppMetrics {
	am := &AppMetrics{
		httpRequestsTotal: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Name: "app_http_requests_total",
				Help: "Total number of HTTP requests",
			},
			[]string{"method", "status", "endpoint"},
		),
		httpRequestDuration: prometheus.NewHistogramVec(
			prometheus.HistogramOpts{
				Name:    "app_http_request_duration_seconds",
				Help:    "Duration of HTTP requests",
				Buckets: []float64{0.001, 0.01, 0.1, 0.5, 1, 5},
			},
			[]string{"method", "endpoint"},
		),
		activeConnections: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Name: "app_active_connections",
				Help: "Number of active connections",
			},
		),
	}

	prometheus.MustRegister(am.httpRequestsTotal)
	prometheus.MustRegister(am.httpRequestDuration)
	prometheus.MustRegister(am.activeConnections)

	return am
}

func (am *AppMetrics) RecordRequest(method, endpoint string, status int, duration time.Duration) {
	am.httpRequestsTotal.WithLabelValues(method, fmt.Sprintf("%d", status), endpoint).Inc()
	am.httpRequestDuration.WithLabelValues(method, endpoint).Observe(duration.Seconds())
}

func (am *AppMetrics) SetActiveConnections(count int) {
	am.activeConnections.Set(float64(count))
}

func (am *AppMetrics) ServeMetrics() http.Handler {
	return promhttp.Handler()
}
```
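For the ServiceMonitor above to find something to scrape, the application has to serve this handler on the scraped port. A minimal wiring sketch; the module path example.com/myapp/metrics and port 8080 are assumptions:

```go
package main

import (
	"net/http"
	"time"

	"example.com/myapp/metrics" // hypothetical import path for the metrics package above
)

func main() {
	am := metrics.NewAppMetrics()

	mux := http.NewServeMux()
	// Instrumented business endpoint: record method, status, and duration per request.
	mux.HandleFunc("/api/users", func(w http.ResponseWriter, r *http.Request) {
		start := time.Now()
		w.Write([]byte("ok"))
		am.RecordRequest(r.Method, "/api/users", http.StatusOK, time.Since(start))
	})
	// Metrics endpoint scraped by Prometheus through the ServiceMonitor.
	mux.Handle("/metrics", am.ServeMetrics())

	http.ListenAndServe(":8080", mux)
}
```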
2.3 Prometheus Query Examples
```promql
# Total number of HTTP requests
app_http_requests_total

# Requests for a specific endpoint
app_http_requests_total{endpoint="/api/users"}

# Per-second request rate, averaged over the last 5 minutes
rate(app_http_requests_total[5m])

# Average response time
avg(app_http_request_duration_seconds_sum / app_http_request_duration_seconds_count)

# Active connections
app_active_connections
```
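The same queries can also be issued programmatically through the Prometheus HTTP API. A minimal sketch using the official Go client; the Prometheus address is an assumption that depends on how Prometheus is deployed in your cluster:

```go
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// "prometheus-operated" is the Service the Prometheus Operator typically
	// creates; adjust the address to your deployment.
	client, err := api.NewClient(api.Config{
		Address: "http://prometheus-operated:9090",
	})
	if err != nil {
		log.Fatalf("create client: %v", err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Run the same instant query as in the examples above.
	result, warnings, err := v1.NewAPI(client).Query(ctx, `rate(app_http_requests_total[5m])`, time.Now())
	if err != nil {
		log.Fatalf("query: %v", err)
	}
	if len(warnings) > 0 {
		log.Printf("warnings: %v", warnings)
	}
	fmt.Println(result) // a model.Vector for instant queries
}
```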
3. Distributed Tracing
3.1 Jaeger Configuration
```yaml
apiVersion: jaegertracing.io/v1
kind: Jaeger
metadata:
  name: jaeger
spec:
  strategy: allInOne   # single pod with in-memory storage; good for development, not production
  ingress:
    enabled: true
```
3.2 Application Tracing Integration
```go
package tracing

import (
	"fmt"
	"io"
	"net/http"

	"github.com/opentracing/opentracing-go"
	"github.com/uber/jaeger-client-go"
	"github.com/uber/jaeger-client-go/config"
)

func InitTracer(serviceName string) (opentracing.Tracer, io.Closer) {
	cfg := config.Configuration{
		ServiceName: serviceName,
		Sampler: &config.SamplerConfig{
			Type:  "const",
			Param: 1, // sample every trace
		},
		Reporter: &config.ReporterConfig{
			LogSpans:           true,
			LocalAgentHostPort: "jaeger-agent:6831",
		},
	}

	tracer, closer, err := cfg.NewTracer(config.Logger(jaeger.StdLogger))
	if err != nil {
		panic(fmt.Sprintf("failed to init tracer: %v", err))
	}

	opentracing.SetGlobalTracer(tracer)
	return tracer, closer
}

func TracingMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		span, ctx := opentracing.StartSpanFromContext(r.Context(), r.URL.Path)
		defer span.Finish()

		span.SetTag("http.method", r.Method)
		span.SetTag("http.url", r.URL.String())

		next.ServeHTTP(w, r.WithContext(ctx))
	})
}
```
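Wiring the tracer and middleware into a service then looks like this; a minimal sketch, assuming the tracing package above lives at the hypothetical module path example.com/myapp/tracing:

```go
package main

import (
	"net/http"

	"example.com/myapp/tracing" // hypothetical import path for the tracing package above
)

func main() {
	// InitTracer sets the global tracer; Close flushes buffered spans on shutdown.
	_, closer := tracing.InitTracer("myapp")
	defer closer.Close()

	mux := http.NewServeMux()
	mux.HandleFunc("/api/users", func(w http.ResponseWriter, r *http.Request) {
		// Spans started from r.Context() here become children of the middleware's span.
		w.Write([]byte("ok"))
	})

	// Every request gets a server-side span named after its URL path.
	http.ListenAndServe(":8080", tracing.TracingMiddleware(mux))
}
```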
4. Monitoring and Alerting
4.1 Prometheus Alerting Rules
```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: app-alerts
  labels:
    release: prometheus
spec:
  groups:
  - name: app.rules
    rules:
    - alert: HighErrorRate
      # sum() is needed so the 5xx series divide cleanly against the all-status total;
      # without it the label sets on both sides would not match.
      expr: sum(rate(app_http_requests_total{status=~"5.."}[5m])) / sum(rate(app_http_requests_total[5m])) > 0.1
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "High error rate detected"
        description: "Error rate is {{ $value | humanizePercentage }}"
    - alert: HighLatency
      expr: avg(app_http_request_duration_seconds_sum / app_http_request_duration_seconds_count) > 2
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "High latency detected"
        description: "Average latency is {{ $value }}s"
    - alert: PodNotReady
      expr: kube_pod_status_ready{condition="false"} == 1
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Pod {{ $labels.pod }} is not ready"
```
4.2 Alertmanager Configuration
```yaml
apiVersion: v1
kind: Secret
metadata:
  name: alertmanager-config
type: Opaque
stringData:   # stringData accepts plaintext; the API server base64-encodes it into data
  config.yaml: |
    global:
      resolve_timeout: 5m
    route:
      group_by: ['alertname']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 1h
      receiver: 'webhook'
    receivers:
    - name: 'webhook'
      webhook_configs:
      - url: 'https://webhook.example.com/alerts'
    inhibit_rules:
    - source_match:
        severity: 'critical'
      target_match:
        severity: 'warning'
      equal: ['alertname', 'dev', 'instance']
```
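The webhook receiver configured above needs an HTTP endpoint that accepts Alertmanager's JSON notification payload. A minimal sketch of such a receiver; the struct models only a subset of the payload, and the listen address is an assumption:

```go
package main

import (
	"encoding/json"
	"log"
	"net/http"
)

// Subset of the Alertmanager webhook notification payload.
type notification struct {
	Status string `json:"status"` // "firing" or "resolved"
	Alerts []struct {
		Status      string            `json:"status"`
		Labels      map[string]string `json:"labels"`
		Annotations map[string]string `json:"annotations"`
		StartsAt    string            `json:"startsAt"`
	} `json:"alerts"`
}

func main() {
	http.HandleFunc("/alerts", func(w http.ResponseWriter, r *http.Request) {
		var n notification
		if err := json.NewDecoder(r.Body).Decode(&n); err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		// Forward to a pager, chat channel, ticket system, etc.; here we just log.
		for _, a := range n.Alerts {
			log.Printf("[%s] %s: %s", a.Status, a.Labels["alertname"], a.Annotations["summary"])
		}
		w.WriteHeader(http.StatusOK)
	})
	log.Fatal(http.ListenAndServe(":9000", nil))
}
```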
5. Visualization and Dashboards
5.1 Grafana Dashboard
```yaml
# GrafanaDashboard is provided by the grafana-operator, not the
# monitoring.coreos.com API group used by the Prometheus Operator.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
  name: app-dashboard
spec:
  instanceSelector:        # must match the labels on your Grafana custom resource
    matchLabels:
      dashboards: grafana
  json: |
    {
      "title": "Application Dashboard",
      "panels": [
        {
          "title": "HTTP Requests",
          "type": "graph",
          "targets": [
            {
              "expr": "rate(app_http_requests_total[5m])",
              "legendFormat": "{{method}} {{status}}"
            }
          ],
          "yaxes": [{"format": "short"}]
        },
        {
          "title": "Response Time",
          "type": "graph",
          "targets": [
            {
              "expr": "avg(app_http_request_duration_seconds_sum / app_http_request_duration_seconds_count)",
              "legendFormat": "Average"
            }
          ],
          "yaxes": [{"format": "s"}]
        },
        {
          "title": "Active Connections",
          "type": "singlestat",
          "targets": [{"expr": "app_active_connections"}]
        }
      ]
    }
```
6. Summary
A Kubernetes observability stack covers three core dimensions:
- Log collection: Filebeat/Fluentd gather container logs and ship them to Elasticsearch/Loki
- Metrics monitoring: Prometheus collects application and cluster metrics
- Distributed tracing: Jaeger traces calls across service boundaries
By integrating these components, you can build a complete observability platform that provides:
- Real-time monitoring and alerting
- Historical data analysis
- Rapid fault localization
- Performance analysis and optimization