# Prometheus & Grafana: Monitoring and Alerting
## Architecture Overview
```
Application (/metrics endpoint)
    ↑ pull (scrape)
Prometheus Server
 ├── TSDB (time-series database)
 ├── PromQL (query language)
 └── Alertmanager (alerting)
      ├── Email
      ├── Slack
      └── PagerDuty

Grafana ──► Prometheus (data source) ──► visualization dashboards
```
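To make the pull model concrete: the application only exposes an HTTP endpoint, and Prometheus does the fetching. A minimal non-Spring sketch using Micrometer's Prometheus registry and the JDK's built-in HTTP server (the port, path, and the `io.micrometer.prometheus` package of the classic `micrometer-registry-prometheus` artifact are assumptions, not from the text above):

```java
import com.sun.net.httpserver.HttpServer;
import io.micrometer.prometheus.PrometheusConfig;
import io.micrometer.prometheus.PrometheusMeterRegistry;

import java.io.OutputStream;
import java.net.InetSocketAddress;

public class MetricsEndpoint {
    public static void main(String[] args) throws Exception {
        PrometheusMeterRegistry registry =
                new PrometheusMeterRegistry(PrometheusConfig.DEFAULT);
        registry.counter("demo_requests_total").increment();

        // Serve the Prometheus text exposition format; Prometheus pulls it.
        HttpServer server = HttpServer.create(new InetSocketAddress(8080), 0);
        server.createContext("/metrics", exchange -> {
            byte[] body = registry.scrape().getBytes();
            exchange.sendResponseHeaders(200, body.length);
            try (OutputStream os = exchange.getResponseBody()) {
                os.write(body);
            }
        });
        server.start();  // Prometheus then scrapes http://host:8080/metrics
    }
}
```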
## Data Model

Every sample follows this exposition format:

```
metric_name{label1="value1", label2="value2"} value timestamp
```

Example:

```
http_requests_total{method="GET", status="200", service="order"} 1234 1705000000
http_requests_total{method="POST", status="500", service="order"} 5 1705000000
```

Metric types:
| Type | Description | Example |
|---|---|---|
| Counter | Monotonically increasing count | Total requests, total errors |
| Gauge | Instantaneous value that can go up or down | Memory usage, connection count |
| Histogram | Bucketed distribution (with sum/count) | Request latency distribution |
| Summary | Quantiles computed client-side | P99 latency |
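For orientation, here is a minimal Micrometer sketch registering one meter of each kind (meter names such as `queue.size` are illustrative, not from the text above):

```java
import io.micrometer.core.instrument.*;
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;

import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.TimeUnit;

public class MeterTypesDemo {
    public static void main(String[] args) {
        MeterRegistry registry = new SimpleMeterRegistry();
        ConcurrentLinkedQueue<String> queue = new ConcurrentLinkedQueue<>();

        // Counter: monotonically increasing, e.g. total requests
        Counter requests = registry.counter("http.requests", "service", "order");
        requests.increment();

        // Gauge: samples a live value on each scrape, e.g. queue depth
        Gauge.builder("queue.size", queue, ConcurrentLinkedQueue::size)
             .register(registry);

        // Timer: records durations; backs Histogram/Summary-style output
        Timer latency = registry.timer("request.latency");
        latency.record(42, TimeUnit.MILLISECONDS);

        // DistributionSummary: distribution of arbitrary sizes, e.g. payload bytes
        DistributionSummary payload = registry.summary("payload.bytes");
        payload.record(512);
    }
}
```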
## Spring Boot Integration
```xml
<dependency>
    <groupId>io.micrometer</groupId>
    <artifactId>micrometer-registry-prometheus</artifactId>
</dependency>
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
```

```yaml
management:
  endpoints:
    web:
      exposure:
        include: prometheus,health,info
  endpoint:
    prometheus:
      enabled: true
  metrics:
    tags:
      application: ${spring.application.name}
      env: ${spring.profiles.active}
```
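The same common tags can also be applied in code. A sketch using Spring Boot's `MeterRegistryCustomizer` (the bean name and tag values are illustrative):

```java
import io.micrometer.core.instrument.MeterRegistry;
import org.springframework.boot.actuate.autoconfigure.metrics.MeterRegistryCustomizer;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class MetricsConfig {

    // Equivalent to the management.metrics.tags.* properties above:
    // every meter registered afterwards carries these tags.
    @Bean
    MeterRegistryCustomizer<MeterRegistry> commonTags() {
        return registry -> registry.config()
                .commonTags("application", "order-service", "env", "prod");
    }
}
```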
### Custom Metrics

```java
import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.Gauge;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Timer;
import org.springframework.stereotype.Component;

import java.util.function.Supplier;

@Component
public class OrderMetrics {

    private final MeterRegistry registry;
    private final Counter orderCreatedCounter;
    private final Timer orderProcessingTimer;

    public OrderMetrics(MeterRegistry registry, OrderRepository orderRepo) {
        this.registry = registry;

        this.orderCreatedCounter = Counter.builder("order.created.total")
                .description("Total orders created")
                .tag("service", "order")
                .register(registry);

        this.orderProcessingTimer = Timer.builder("order.processing.duration")
                .description("Order processing duration")
                .publishPercentiles(0.5, 0.95, 0.99)
                .register(registry);

        // Gauge: reads the current value dynamically on each scrape
        Gauge.builder("order.pending.count", orderRepo, OrderRepository::countPending)
                .description("Current pending orders count")
                .register(registry);
    }

    public void recordOrderCreated() {
        orderCreatedCounter.increment();
    }

    // A Counter's tags are fixed at registration, so a per-reason count
    // needs one counter per tag value; Micrometer caches these lookups.
    public void recordOrderFailed(String reason) {
        Counter.builder("order.failed.total")
                .description("Total failed orders")
                .tag("reason", reason)
                .register(registry)
                .increment();
    }

    public <T> T timeOrderProcessing(Supplier<T> supplier) {
        return orderProcessingTimer.record(supplier);
    }
}
```
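For context, a hypothetical caller showing how these metrics might be wired into business code (`OrderService` and its `persist` step are illustrative, not from the text above):

```java
import org.springframework.stereotype.Service;

@Service
public class OrderService {

    private final OrderMetrics metrics;

    public OrderService(OrderMetrics metrics) {
        this.metrics = metrics;
    }

    public String createOrder(String payload) {
        // The Timer wraps the whole operation; counters record the outcome.
        return metrics.timeOrderProcessing(() -> {
            try {
                String orderId = persist(payload); // hypothetical persistence step
                metrics.recordOrderCreated();
                return orderId;
            } catch (RuntimeException e) {
                metrics.recordOrderFailed(e.getClass().getSimpleName());
                throw e;
            }
        });
    }

    private String persist(String payload) {
        return "order-123"; // placeholder
    }
}
```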
## PromQL Query Language

### Basic Queries
```promql
# Current value of a metric
http_requests_total

# Filter by labels
http_requests_total{service="order", status="200"}

# Regex matching
http_requests_total{status=~"5.."}  # 5xx errors

# Exclude a label value
http_requests_total{status!="200"}
```
### Functions and Operations

```promql
# Request rate over the last 5 minutes (QPS)
rate(http_requests_total[5m])

# Error rate (aggregate both sides so their label sets match)
sum by (service) (rate(http_requests_total{status=~"5.."}[5m]))
  / sum by (service) (rate(http_requests_total[5m]))

# P99 latency (from Histogram buckets)
histogram_quantile(0.99,
  sum by (le, service) (rate(http_request_duration_seconds_bucket[5m]))
)

# QPS aggregated per service
sum(rate(http_requests_total[5m])) by (service)

# Memory usage percentage
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
  / node_memory_MemTotal_bytes * 100

# Predict whether the disk fills within 24h (based on the last 4 hours' trend)
predict_linear(node_filesystem_free_bytes[4h], 24*3600) < 0
```
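Note that `histogram_quantile` needs the `_bucket` series of a Prometheus histogram; a Micrometer `Timer` only emits those when percentile histograms are enabled. A minimal sketch (meter name and bounds are illustrative):

```java
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Timer;

import java.time.Duration;

public class LatencyHistogram {

    // Registers a timer that publishes _bucket series, so PromQL can
    // compute quantiles server-side with histogram_quantile().
    public static Timer register(MeterRegistry registry) {
        return Timer.builder("http.request.duration")
                .publishPercentileHistogram()
                .minimumExpectedValue(Duration.ofMillis(1))
                .maximumExpectedValue(Duration.ofSeconds(10))
                .register(registry);
    }
}
```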
## Alerting Rules

```yaml
# alert-rules.yaml
groups:
  - name: application-alerts
    rules:
      # High error rate
      - alert: HighErrorRate
        expr: |
          sum by (service) (rate(http_requests_total{status=~"5.."}[5m]))
            / sum by (service) (rate(http_requests_total[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High error rate on {{ $labels.service }}"
          description: "Error rate is {{ $value | humanizePercentage }} for {{ $labels.service }}"

      # High latency
      - alert: HighLatency
        expr: |
          histogram_quantile(0.99,
            sum by (le, service) (rate(http_request_duration_seconds_bucket[5m]))
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High P99 latency on {{ $labels.service }}"

      # Service down
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"

      # Low disk space
      - alert: DiskSpaceLow
        expr: |
          (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low on {{ $labels.instance }}"
          description: "Only {{ $value | humanizePercentage }} disk space remaining"
```
## Alertmanager Configuration

```yaml
# alertmanager.yml
global:
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alertmanager@example.com'

route:
  group_by: ['alertname', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'pagerduty'
    - match:
        severity: warning
      receiver: 'slack'

receivers:
  - name: 'default'
    email_configs:
      - to: 'team@example.com'
  - name: 'slack'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/xxx'
        channel: '#alerts'
        title: '{{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
  - name: 'pagerduty'
    pagerduty_configs:
      - routing_key: 'your-pagerduty-key'

inhibit_rules:
  # While a critical alert is firing, suppress warnings for the same service
  - source_match:
      severity: critical
    target_match:
      severity: warning
    equal: ['service']
```
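Beyond the built-in receivers, Alertmanager can also POST alerts to any HTTP endpoint via `webhook_configs`. A hedged Spring sketch of such a receiver; the DTO below models only a small subset of the webhook payload, and the field names should be verified against your Alertmanager version:

```java
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RestController;

import java.util.List;
import java.util.Map;

@RestController
public class AlertWebhookController {

    // Minimal subset of Alertmanager's webhook JSON payload (assumed shape)
    public record WebhookPayload(String status, List<Alert> alerts) {}
    public record Alert(String status,
                        Map<String, String> labels,
                        Map<String, String> annotations) {}

    @PostMapping("/alerts/webhook")
    public void receive(@RequestBody WebhookPayload payload) {
        for (Alert alert : payload.alerts()) {
            // Route, log, or page based on labels such as severity/service
            System.out.printf("[%s] %s: %s%n",
                    alert.labels().get("severity"),
                    alert.labels().get("alertname"),
                    alert.annotations().getOrDefault("summary", ""));
        }
    }
}
```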
## Grafana Dashboards

### Key Dashboards
Service SLO dashboard:
- Availability = successful requests / total requests
- Latency = P50/P95/P99
- Error rate = 5xx requests / total requests
- Throughput = QPS

Infrastructure dashboard:
- CPU usage
- Memory usage
- Disk I/O
- Network traffic

### Common Grafana Variables
```
# Dynamically select a service
$service = label_values(http_requests_total, service)

# Dynamic time window
$__rate_interval   # Grafana computes a suitable rate() window automatically
```
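Grafana panels ultimately send PromQL to Prometheus's HTTP API (`/api/v1/query`), and the same endpoint is handy for scripted checks. A minimal sketch using the JDK HTTP client, assuming Prometheus listens on localhost:9090:

```java
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;

public class PromQuery {
    public static void main(String[] args) throws Exception {
        String promql = "sum(rate(http_requests_total[5m])) by (service)";
        URI uri = URI.create("http://localhost:9090/api/v1/query?query="
                + URLEncoder.encode(promql, StandardCharsets.UTF_8));

        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(HttpRequest.newBuilder(uri).GET().build(),
                      HttpResponse.BodyHandlers.ofString());

        // JSON body: {"status":"success","data":{"resultType":"vector","result":[...]}}
        System.out.println(response.body());
    }
}
```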
## Troubleshooting Cases

### Case 1: Prometheus Out of Memory

Symptom: the Prometheus process is OOM-killed.

Causes:
- Metric cardinality is too high (too many distinct label values)
- Data retention window is too long

Fix:
```bash
# Retention is tuned with command-line flags (not in prometheus.yml):
prometheus \
  --storage.tsdb.retention.time=15d \
  --storage.tsdb.retention.size=50GB
# retention.time shortens how long history is kept;
# retention.size caps total on-disk TSDB size.
```

```promql
# Find the highest-cardinality metrics
topk(10, count by (__name__)({__name__=~".+"}))
```
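High cardinality usually originates in application code that tags metrics with unbounded values. A hedged Java sketch of the anti-pattern and a safer variant (meter and tag names are illustrative):

```java
import io.micrometer.core.instrument.MeterRegistry;

public class CardinalityExample {

    // BAD: userId is unbounded, so every user creates a new time series
    void recordBad(MeterRegistry registry, String userId) {
        registry.counter("checkout.total", "user_id", userId).increment();
    }

    // BETTER: tag with a small fixed set of values ("free"/"pro");
    // put the user id in logs or traces instead of metric labels
    void recordGood(MeterRegistry registry, String userTier) {
        registry.counter("checkout.total", "tier", userTier).increment();
    }
}
```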
### Case 2: Alert Storm

Symptom: a single failure fires hundreds of alerts, drowning out the ones that actually matter.

Fix:
- Use `inhibit_rules` to suppress alerts that are consequences of another alert
- Use `group_by` to aggregate alerts of the same kind into one notification
- Set a reasonable `for` duration so transient blips do not trigger alerts
### Case 3: Metric Scrape Failures

```bash
# Check scrape target status
curl http://prometheus:9090/api/v1/targets

# Inspect concrete errors in the UI:
# Prometheus UI → Status → Targets
```