| 指标类别 | 关键指标 |
|---|---|
| 集群 | Leader 选举、Raft 提交延迟 |
| 服务 | 注册数量、健康检查状态 |
| 性能 | RPC 延迟、Gossip 健康度 |
| 资源 | CPU、内存、磁盘 |
# 查看 Agent 信息
consul info
# 输出示例
agent:
check_monitors = 0
check_ttls = 0
checks = 2
services = 3
consul:
bootstrap = false
known_datacenters = 1
leader = true
leader_addr = 192.168.1.10:8300
server = true
raft:
applied_index = 12345
commit_index = 12345
fsm_pending = 0
last_log_index = 12345
last_log_term = 5
last_snapshot_index = 10000
num_peers = 2
state = Leader
# 查看集群成员
consul members
# 详细信息
consul members -detailed
# WAN 成员
consul members -wan
# 查看所有健康检查
curl http://localhost:8500/v1/health/state/any
# 查看失败的检查
curl http://localhost:8500/v1/health/state/critical
{
"telemetry": {
"prometheus_retention_time": "60s",
"disable_hostname": true
}
}
# prometheus.yml
scrape_configs:
- job_name: 'consul'
metrics_path: '/v1/agent/metrics'
params:
format: ['prometheus']
static_configs:
- targets:
- 'consul-1:8500'
- 'consul-2:8500'
- 'consul-3:8500'
relabel_configs:
- source_labels: [__address__]
target_label: instance
# Raft 相关
consul_raft_leader # 是否为 Leader
consul_raft_peers # Peer 数量
consul_raft_commitTime # 提交延迟
# RPC 相关
consul_rpc_request # RPC 请求数
consul_rpc_request_error # RPC 错误数
# 服务相关
consul_catalog_services # 服务数量
consul_catalog_service_instances # 服务实例数
# 健康检查
consul_health_service_status # 服务健康状态
{
"dashboard": {
"title": "Consul Cluster",
"panels": [
{
"title": "Leader Status",
"type": "stat",
"targets": [
{
"expr": "consul_raft_leader"
}
]
},
{
"title": "Service Count",
"type": "graph",
"targets": [
{
"expr": "consul_catalog_services"
}
]
}
]
}
}
{
"log_level": "INFO",
"log_file": "/var/log/consul/consul.log",
"log_rotate_duration": "24h",
"log_rotate_max_files": 7
}
| 级别 | 说明 |
|---|---|
| TRACE | 最详细 |
| DEBUG | 调试信息 |
| INFO | 一般信息 |
| WARN | 警告 |
| ERROR | 错误 |
# 临时调整
consul monitor -log-level=debug
# 通过 API(流式输出指定级别的日志)
curl "http://localhost:8500/v1/agent/monitor?loglevel=debug"
# 查看错误日志
grep -i error /var/log/consul/consul.log
# 查看 Leader 选举
grep -i "leader" /var/log/consul/consul.log
# 查看服务注册
grep -i "register" /var/log/consul/consul.log
# consul_alerts.yml
groups:
- name: consul
rules:
- alert: ConsulNoLeader
expr: consul_raft_leader == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Consul cluster has no leader"
- alert: ConsulServerDown
expr: up{job="consul"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Consul server {{ $labels.instance }} is down"
- alert: ConsulServiceUnhealthy
expr: consul_health_service_status{status="critical"} > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Service {{ $labels.service_name }} is unhealthy"
#!/bin/bash
# consul_health_check.sh
#
# Nagios-style health probe for a local Consul agent.
# Runs three checks and exits with the WORST status observed:
#   0 = OK, 1 = WARNING, 2 = CRITICAL
#
# Requires: curl, jq. CONSUL_ADDR may be overridden via the environment.
CONSUL_ADDR="${CONSUL_ADDR:-http://localhost:8500}"

# 检查 Leader — the cluster must have an elected leader.
check_leader() {
  local leader
  leader=$(curl -s "${CONSUL_ADDR}/v1/status/leader")
  # curl failure yields an empty string; a reachable cluster with no
  # leader returns the JSON empty string '""'. Both are critical.
  if [ -z "$leader" ] || [ "$leader" = '""' ]; then
    echo "CRITICAL: No leader"
    return 2
  fi
  echo "OK: Leader is $leader"
  return 0
}

# 检查节点数量 — fewer than 3 server peers cannot tolerate a node failure.
check_peers() {
  local peers
  peers=$(curl -s "${CONSUL_ADDR}/v1/status/peers" | jq length)
  # Guard against empty output (curl/jq failure) before the numeric test,
  # which would otherwise abort with a "-lt: unary operator expected" error.
  if [ -z "$peers" ] || [ "$peers" -lt 3 ]; then
    echo "WARNING: Only ${peers:-0} peers"
    return 1
  fi
  echo "OK: $peers peers"
  return 0
}

# 检查服务健康 — any check in "critical" state is a warning.
check_services() {
  local critical
  critical=$(curl -s "${CONSUL_ADDR}/v1/health/state/critical" | jq length)
  if [ -z "$critical" ] || [ "$critical" -gt 0 ]; then
    echo "WARNING: ${critical:-unknown} critical checks"
    return 1
  fi
  echo "OK: All services healthy"
  return 0
}

# Run every check and exit with the worst (highest) return code, so a
# CRITICAL from check_leader is not masked by later checks succeeding.
worst=0
for check in check_leader check_peers check_services; do
  "$check"
  rc=$?
  [ "$rc" -gt "$worst" ] && worst=$rc
done
exit "$worst"
{
"performance": {
"raft_multiplier": 1,
"rpc_hold_timeout": "7s"
}
}
{
"limits": {
"http_max_conns_per_client": 200,
"rpc_max_conns_per_client": 100
}
}
{
"dns_config": {
"node_ttl": "30s",
"service_ttl": {
"*": "30s"
},
"allow_stale": true,
"max_stale": "87600h"
}
}
#!/bin/bash
# backup_consul.sh
#
# Take a Consul snapshot into BACKUP_DIR and prune backups older than
# RETENTION_DAYS. Intended to run from cron; exits non-zero on failure.
set -euo pipefail

readonly BACKUP_DIR="/backup/consul"
readonly RETENTION_DAYS=7
DATE=$(date +%Y%m%d_%H%M%S)
readonly DATE

# Ensure the target directory exists before writing into it.
mkdir -p -- "$BACKUP_DIR"

# 创建备份 — under 'set -e' a failed snapshot aborts the script here,
# so we never prune old backups without having taken a fresh one first.
consul snapshot save "${BACKUP_DIR}/consul_${DATE}.snap"

# 清理旧备份
find "$BACKUP_DIR" -name "consul_*.snap" -mtime +"$RETENTION_DAYS" -delete

# 上传到 S3(可选)
# aws s3 cp "${BACKUP_DIR}/consul_${DATE}.snap" s3://backup-bucket/consul/
# crontab -e
0 */4 * * * /opt/scripts/backup_consul.sh >> /var/log/consul_backup.log 2>&1
# 1. 检查节点状态
consul members
# 2. 如果节点 failed,等待自动恢复或手动移除
consul operator raft remove-peer -address="故障节点:8300"
# 3. 添加新节点替换
# 1. 停止所有节点
systemctl stop consul
# 2. 在一个节点上恢复
consul snapshot restore backup.snap
# 3. 使用 bootstrap 模式启动
consul agent -config-dir=/etc/consul.d/ -bootstrap-expect=1
# 4. 逐个加入其他节点
# 1. 停止 Consul
systemctl stop consul
# 2. 备份当前数据
mv /opt/consul/data /opt/consul/data.bak
# 3. 创建新数据目录
mkdir -p /opt/consul/data
chown consul:consul /opt/consul/data
# 4. 恢复快照
consul snapshot restore backup.snap
# 5. 启动 Consul
systemctl start consul
# 查看成员
consul members
# 查看 Leader
consul operator raft list-peers
# 强制离开
consul force-leave <node>
# 重新加载配置
consul reload
# 列出服务
consul catalog services
# 注销服务
consul services deregister -id=<service-id>
# 维护模式
consul maint -enable -reason="升级维护"
consul maint -disable
# 导出 KV
consul kv export > kv_backup.json
# 导入 KV
consul kv import @kv_backup.json