完善中央与边缘部署、远程写入与监控文档

- 增加中央与边缘完整配置和部署脚本
- 引入 VictoriaMetrics 数据源与 remote_write 故障排查说明
- 新增 edge-agent 配置脚本、ONVIF 自建 exporter 与 ping 监控示例

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Super User
2026-02-25 04:24:40 -05:00
parent 9e37f79a36
commit 95a09fd9d8
52 changed files with 5978 additions and 0 deletions

329
central-server/deploy.sh Normal file
View File

@@ -0,0 +1,329 @@
#!/bin/bash
# Deployment script for the central server of a distributed Prometheus setup.
# Intended for Linux hosts.
#
# Configuration is read from a .env file that lives next to this script. When
# it is missing but env.example exists, env.example is copied to .env and the
# script stops so the operator can review the values before deploying.
set -e

echo "=== 分布式Prometheus中央服务器部署脚本 ==="
echo ""

# Work from the script's own directory so that .env / env.example are found
# regardless of the directory the script was invoked from.
cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1

#######################################
# Load configuration from <dir>/.env and export every variable it defines.
# Arguments: $1 - directory to look in (defaults to the current directory)
# Outputs:   progress messages on stdout
# Exits:     with status 0 right after bootstrapping .env from env.example,
#            so the operator can edit it and re-run the script
#######################################
load_env_file() {
  local env_dir=${1:-.}
  local env_file="${env_dir}/.env"
  local example_file="${env_dir}/env.example"

  if [ -f "$env_file" ]; then
    echo "📝 加载 .env 配置文件..."
    # set -a auto-exports every assignment made while the file is sourced
    # (comments and blank lines in the file are harmless).
    # The path always contains a slash, so `source` never searches $PATH.
    set -a
    # shellcheck disable=SC1090
    source "$env_file"
    set +a
    echo "✅ 环境变量加载完成"
  elif [ -f "$example_file" ]; then
    echo "⚠️ 未找到 .env 文件,从 env.example 创建..."
    cp "$example_file" "$env_file"
    echo "✅ 已创建 .env 文件,请根据需要修改配置"
    echo " 然后重新运行此脚本"
    exit 0
  else
    echo "⚠️ 未找到 .env 和 env.example 文件,使用默认配置"
  fi
}

load_env_file .
# Fallback values: each setting keeps whatever the environment / .env provided
# and only receives the default below when it is unset or empty.

# Published ports
: "${PROMETHEUS_PORT:=9091}"
: "${GRAFANA_PORT:=3000}"
: "${ALERTMANAGER_PORT:=9093}"
: "${VICTORIAMETRICS_PORT:=8428}"

# Host directories for persistent data
: "${PROMETHEUS_DATA_DIR:=./data/prometheus-data}"
: "${GRAFANA_DATA_DIR:=./data/grafana-data}"
: "${VICTORIAMETRICS_DATA_DIR:=./data/victoria-metrics-data}"

# Grafana admin credentials
: "${GRAFANA_ADMIN_PASSWORD:=admin123}"

# Prometheus scrape / evaluation / retention settings
: "${PROMETHEUS_SCRAPE_INTERVAL:=15}"
: "${PROMETHEUS_EVALUATION_INTERVAL:=15}"
: "${PROMETHEUS_CLUSTER_NAME:=central-monitoring}"
: "${PROMETHEUS_RETENTION_TIME:=30d}"
: "${VICTORIAMETRICS_RETENTION_PERIOD:=30d}"

# remote_write queue tuning
: "${PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES:=10000}"
: "${PROMETHEUS_REMOTE_WRITE_CAPACITY:=20000}"
: "${PROMETHEUS_REMOTE_WRITE_MAX_SHARDS:=10}"

# Grafana UI preferences
: "${GRAFANA_DEFAULT_LANGUAGE:=zh-Hans}"
: "${GRAFANA_DEFAULT_THEME:=light}"
# Convert relative data directories to absolute paths (Docker needs absolute
# host paths). Relative paths are resolved against this script's directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR" || exit 1

#######################################
# Print the absolute form of a path without requiring it to exist.
# Arguments: $1 - absolute path, or path relative to the current directory
# Outputs:   the absolute path on stdout
# Notes:     when the parent directory exists it is resolved with cd/pwd;
#            otherwise the path is simply anchored at $PWD. Without this
#            fallback a missing parent (e.g. ./data on a first run) made the
#            result collapse to "/<basename>", i.e. a directory at the
#            filesystem root.
#######################################
to_abs_path() {
  local path=$1
  local parent

  if [[ "$path" == /* ]]; then
    printf '%s\n' "$path"
    return 0
  fi

  if parent="$(cd "$(dirname "$path")" 2>/dev/null && pwd)"; then
    printf '%s/%s\n' "$parent" "$(basename "$path")"
  else
    printf '%s/%s\n' "$PWD" "${path#./}"
  fi
}

PROMETHEUS_DATA_DIR="$(to_abs_path "$PROMETHEUS_DATA_DIR")"
GRAFANA_DATA_DIR="$(to_abs_path "$GRAFANA_DATA_DIR")"
VICTORIAMETRICS_DATA_DIR="$(to_abs_path "$VICTORIAMETRICS_DATA_DIR")"
echo ""

# Docker itself must be available on PATH.
command -v docker > /dev/null 2>&1 || {
  echo "❌ Docker未安装请先安装Docker"
  exit 1
}

# Select the Compose command: the V2 plugin ("docker compose") is preferred,
# the standalone V1 binary ("docker-compose") is the fallback.
DOCKER_COMPOSE_CMD=""
if docker compose version > /dev/null 2>&1; then
  DOCKER_COMPOSE_CMD="docker compose"
  echo "✅ 检测到 Docker Compose V2"
else
  if command -v docker-compose > /dev/null 2>&1; then
    DOCKER_COMPOSE_CMD="docker-compose"
    echo "✅ 检测到 Docker Compose V1"
  else
    echo "❌ Docker Compose未安装请先安装Docker Compose"
    exit 1
  fi
fi

echo "✅ Docker环境检查通过"
echo ""
# Pre-flight disk space check: the partition holding this deployment (data
# storage) and the root partition (containerd temp files, system operation).
echo "💾 检查磁盘空间..."

#######################################
# Print the free space, in whole GB, of the filesystem containing a path.
# Arguments: $1 - path to inspect
# Outputs:   a non-negative integer on stdout; 0 when df fails or its output
#            cannot be parsed
#######################################
avail_gb() {
  local target=$1
  local value

  # -P keeps every filesystem on a single line (no wrapping for long device
  # names), so column 4 on line 2 is always the "Available" field.
  value="$(df -P -BG -- "$target" 2>/dev/null \
    | awk 'NR==2 {sub(/G$/, "", $4); print $4}')" || true

  if [[ "$value" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$value"
  else
    printf '0\n'
  fi
}

#######################################
# Ask whether to continue; exit 1 unless the answer is y/Y.
# A failed read (e.g. stdin closed) counts as "no" instead of silently
# aborting the script under `set -e`.
#######################################
confirm_or_abort() {
  local reply=""

  read -p "是否继续部署?(y/N): " -n 1 -r reply || true
  echo
  if [[ ! $reply =~ ^[Yy]$ ]]; then
    echo "部署已取消"
    exit 1
  fi
}

DATA_PARTITION_AVAIL="$(avail_gb "${SCRIPT_DIR:-.}")"
ROOT_AVAIL="$(avail_gb /)"

echo " 当前目录所在分区可用空间: ${DATA_PARTITION_AVAIL}GB"
echo " 根分区可用空间: ${ROOT_AVAIL}GB"

# The deployment partition should have at least 2GB free for monitoring data.
# NOTE(review): values are whole GB as reported by `df -BG`; confirm whether
# the rounding df applies is acceptable for these thresholds.
if [ "$DATA_PARTITION_AVAIL" -lt 2 ]; then
  echo ""
  echo "⚠️ 警告:当前目录所在分区空间不足!"
  echo " 分区可用空间: ${DATA_PARTITION_AVAIL}GB"
  echo " 数据存储路径: ${PROMETHEUS_DATA_DIR}"
  echo " 建议至少保留 2GB 空间(用于监控数据存储)"
  echo ""
  confirm_or_abort
fi

# The root partition should have at least 1GB free (containerd temp files and
# normal system operation).
if [ "$ROOT_AVAIL" -lt 1 ]; then
  echo ""
  echo "⚠️ 警告:根分区空间不足!"
  echo " 根分区可用空间: ${ROOT_AVAIL}GB"
  echo " 建议至少保留 1GB 空间用于containerd临时文件和系统运行"
  echo ""
  echo "💡 建议清理空间:"
  echo " 1. 清理Docker资源: docker system prune -a --volumes"
  echo " 2. 清理系统日志: journalctl --vacuum-time=3d"
  echo " 3. 清理包缓存: dnf clean all 或 apt-get clean"
  echo " 4. 检查大文件: du -h --max-depth=1 / | sort -hr | head -10"
  echo ""
  confirm_or_abort
fi
echo ""
# Render prometheus.yml from prometheus.yml.template when a template exists;
# otherwise an already present prometheus.yml is required.
if [ -f "prometheus.yml.template" ]; then
  echo "📝 从模板生成 prometheus.yml..."

  # Settings the template is expected to reference.
  template_vars=(
    PROMETHEUS_SCRAPE_INTERVAL
    PROMETHEUS_EVALUATION_INTERVAL
    PROMETHEUS_CLUSTER_NAME
    VICTORIAMETRICS_PORT
    PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES
    PROMETHEUS_REMOTE_WRITE_CAPACITY
    PROMETHEUS_REMOTE_WRITE_MAX_SHARDS
  )

  if command -v envsubst &> /dev/null; then
    # envsubst is a separate process and only sees *exported* variables.
    # Settings that merely received a shell-level default are not exported,
    # and would be replaced by empty strings. Export them inside a subshell
    # so the caller's environment is left as it was.
    # NOTE(review): envsubst is invoked without a variable list, so any other
    # $NAME / ${NAME} reference in the template is substituted as well.
    (
      export "${template_vars[@]}"
      envsubst < prometheus.yml.template > prometheus.yml
    )
    echo "✅ prometheus.yml 已生成"
  else
    echo "⚠️ envsubst 命令未找到,尝试使用 sed 替换..."
    # Plain ${NAME} replacement with sed. Values are escaped first so that
    # '/', '&' or '\' in a setting cannot corrupt the sed expression.
    sed_args=()
    for template_var in "${template_vars[@]}"; do
      escaped_value="$(printf '%s' "${!template_var}" | sed -e 's|[\\/&]|\\&|g')"
      sed_args+=(-e "s/\${${template_var}}/${escaped_value}/g")
    done
    sed "${sed_args[@]}" prometheus.yml.template > prometheus.yml
    echo "✅ prometheus.yml 已生成(使用 sed)"
  fi
elif [ ! -f "prometheus.yml" ]; then
  echo "❌ 配置文件 prometheus.yml 不存在,且未找到模板文件"
  exit 1
fi
# Verify that the remaining required configuration files are present; stop at
# the first one that is missing.
for required_file in "alert_rules.yml" "alertmanager/alertmanager.yml"; do
  if [ -f "$required_file" ]; then
    continue
  fi
  echo "❌ 配置文件 ${required_file} 不存在"
  exit 1
done

echo "✅ 配置文件检查通过"
echo ""
# Export the settings consumed by docker-compose. Per the original note, the
# data directories are expected to be absolute paths at this point.
export \
  PROMETHEUS_DATA_DIR \
  GRAFANA_DATA_DIR \
  VICTORIAMETRICS_DATA_DIR \
  PROMETHEUS_PORT \
  GRAFANA_PORT \
  ALERTMANAGER_PORT \
  VICTORIAMETRICS_PORT \
  GRAFANA_ADMIN_PASSWORD \
  GRAFANA_DEFAULT_LANGUAGE \
  GRAFANA_DEFAULT_THEME \
  GRAFANA_ROOT_URL \
  PROMETHEUS_RETENTION_TIME \
  VICTORIAMETRICS_RETENTION_PERIOD \
  TRAEFIK_ENABLED \
  TRAEFIK_NETWORK \
  TRAEFIK_ENTRYPOINT \
  GRAFANA_DOMAIN \
  PROMETHEUS_DOMAIN \
  ALERTMANAGER_DOMAIN
# Make sure the Traefik Docker network exists. Per the original note,
# docker-compose.yml always references this network, whether or not Traefik
# is enabled.
echo "🔍 检查 Traefik 网络..."
TRAEFIK_NET=${TRAEFIK_NETWORK:-traefik}

if docker network inspect "$TRAEFIK_NET" > /dev/null 2>&1; then
  echo "✅ Traefik 网络 '$TRAEFIK_NET' 已存在"
else
  echo "⚠️ Traefik 网络 '$TRAEFIK_NET' 不存在,正在创建..."
  if ! docker network create "$TRAEFIK_NET" 2>/dev/null; then
    # Creation failed: explain the options and let the operator decide.
    echo "❌ 无法创建 Traefik 网络 '$TRAEFIK_NET'"
    echo " 请确保:"
    echo " 1. Traefik 已运行并创建了网络"
    echo " 2. 或手动创建网络: docker network create $TRAEFIK_NET"
    echo ""
    read -p "是否继续部署?(y/N): " -n 1 -r
    echo
    case "$REPLY" in
      [Yy])
        ;;
      *)
        echo "部署已取消"
        exit 1
        ;;
    esac
  else
    echo "✅ Traefik 网络 '$TRAEFIK_NET' 已创建"
  fi
fi
echo ""
# Create the data directories (paths come from the configured variables) and
# the Grafana dashboard / provisioning directories.
echo "📁 创建数据目录..."
for data_dir in \
  "${PROMETHEUS_DATA_DIR}" \
  "${GRAFANA_DATA_DIR}" \
  "${VICTORIAMETRICS_DATA_DIR}"; do
  mkdir -p "$data_dir"
done
for grafana_dir in dashboards provisioning/datasources provisioning/dashboards; do
  mkdir -p "grafana/${grafana_dir}"
done

# Permissions.
# Prometheus needs write access; a chmod failure is tolerated.
if ! chmod 777 "${PROMETHEUS_DATA_DIR}" 2>/dev/null; then
  true
fi
# Grafana data should belong to UID 472 (the Grafana container user, per the
# original note); when chown is not possible, fall back to world-writable.
if ! chown -R 472:472 "${GRAFANA_DATA_DIR}" 2>/dev/null; then
  chmod -R 777 "${GRAFANA_DATA_DIR}"
fi
# VictoriaMetrics needs write access; a chmod failure is tolerated.
if ! chmod 777 "${VICTORIAMETRICS_DATA_DIR}" 2>/dev/null; then
  true
fi

printf '%s\n' \
  "✅ 数据目录创建完成" \
  " - Prometheus: ${PROMETHEUS_DATA_DIR}" \
  " - Grafana: ${GRAFANA_DATA_DIR}" \
  " - VictoriaMetrics: ${VICTORIAMETRICS_DATA_DIR}" \
  ""
# Stop, refresh and restart the stack.
# $DOCKER_COMPOSE_CMD is expanded unquoted on purpose: it may hold the two
# words "docker compose", which must split into command and subcommand.

# Stop whatever is currently running; failures are ignored.
echo "🛑 停止现有服务..."
$DOCKER_COMPOSE_CMD down 2>/dev/null || true

# Pull the images. A failed pull is reported but not fatal, because the
# images may already be available locally.
echo "📥 拉取Docker镜像..."
$DOCKER_COMPOSE_CMD pull || {
  printf '%s\n' \
    "" \
    "⚠️ 镜像拉取失败,可能的原因:" \
    " 1. 网络连接问题" \
    " 2. Docker Hub 速率限制" \
    " 3. 需要配置镜像加速器" \
    "" \
    "💡 建议:" \
    " 1. 检查网络连接" \
    " 2. 配置 Docker 镜像加速器(如阿里云、腾讯云等)" \
    " 3. 或稍后重试" \
    "" \
    "🔄 尝试继续启动(如果本地已有镜像)..." \
    ""
}

# Start the services in the background.
echo "🚀 启动服务..."
$DOCKER_COMPOSE_CMD up -d

# Give the containers a fixed grace period before inspecting them.
echo "⏳ 等待服务启动..."
sleep 15

# Show container status followed by the most recent log lines.
printf '%s\n' "" "📊 服务状态检查:"
$DOCKER_COMPOSE_CMD ps
printf '%s\n' "" "📋 服务日志:"
$DOCKER_COMPOSE_CMD logs --tail=20
echo ""
#######################################
# Print the post-deployment summary: access URLs, reminders, management
# commands and next steps.
# Globals (read): TRAEFIK_ENABLED, TRAEFIK_NETWORK, GRAFANA_DOMAIN,
#                 PROMETHEUS_DOMAIN, ALERTMANAGER_DOMAIN, GRAFANA_PORT,
#                 PROMETHEUS_PORT, VICTORIAMETRICS_PORT, ALERTMANAGER_PORT,
#                 GRAFANA_ADMIN_PASSWORD, GRAFANA_DEFAULT_LANGUAGE,
#                 DOCKER_COMPOSE_CMD
# Outputs:        the summary text on stdout
#######################################
print_deploy_summary() {
  # Number of the final, shared reminder; it continues the list printed by
  # whichever branch ran, so the numbering never repeats.
  local last_reminder_no

  echo "✅ 部署完成!"
  echo ""

  # Access URLs differ depending on whether Traefik fronts the services.
  if [ "${TRAEFIK_ENABLED:-false}" = "true" ]; then
    echo "🔗 访问地址(通过 Traefik 反向代理):"
    echo " - Grafana仪表板: http://${GRAFANA_DOMAIN:-grafana.example.com} (admin/${GRAFANA_ADMIN_PASSWORD}) [${GRAFANA_DEFAULT_LANGUAGE}界面]"
    echo " - Prometheus: http://${PROMETHEUS_DOMAIN:-prometheus.example.com} [英文界面]"
    echo " - Alertmanager: http://${ALERTMANAGER_DOMAIN:-alertmanager.example.com} [英文界面]"
    echo " - VictoriaMetrics: http://localhost:${VICTORIAMETRICS_PORT} [英文界面,边缘节点直接连接]"
    echo ""
    echo "⚠️ 请确保:"
    echo " 1. DNS 已正确解析域名到 Traefik 服务器"
    # Same default network name ("traefik") that this script uses when it
    # checks for / creates the Traefik network.
    # NOTE(review): confirm docker-compose.yml uses the same default.
    echo " 2. Traefik 网络 (${TRAEFIK_NETWORK:-traefik}) 已创建"
    echo " 3. 边缘节点可以访问此服务器的${VICTORIAMETRICS_PORT}端口VictoriaMetrics 不通过 Traefik"
    last_reminder_no=4
  else
    echo "🔗 访问地址:"
    echo " - Grafana仪表板: http://localhost:${GRAFANA_PORT} (admin/${GRAFANA_ADMIN_PASSWORD}) [${GRAFANA_DEFAULT_LANGUAGE}界面]"
    echo " - Prometheus: http://localhost:${PROMETHEUS_PORT} [英文界面]"
    echo " - VictoriaMetrics: http://localhost:${VICTORIAMETRICS_PORT} [英文界面]"
    echo " - Alertmanager: http://localhost:${ALERTMANAGER_PORT} [英文界面]"
    echo ""
    echo "⚠️ 请确保:"
    echo " 1. 防火墙已开放相应端口 (${GRAFANA_PORT}, ${PROMETHEUS_PORT}, ${VICTORIAMETRICS_PORT}, ${ALERTMANAGER_PORT})"
    echo " 2. 边缘节点可以访问此服务器的${VICTORIAMETRICS_PORT}端口"
    last_reminder_no=3
  fi
  echo " ${last_reminder_no}. 已配置好告警通知渠道"
  echo ""

  echo "📝 管理命令:"
  echo " - 查看日志: $DOCKER_COMPOSE_CMD logs -f"
  echo " - 重启服务: $DOCKER_COMPOSE_CMD restart"
  echo " - 停止服务: $DOCKER_COMPOSE_CMD down"
  echo ""

  echo "🔧 下一步:"
  echo " 1. 登录Grafana配置数据源"
  echo " 2. 导入ONVIF监控仪表板"
  echo " 3. 配置Alertmanager告警通知"
  echo ""
  echo "💡 提示:修改配置后,编辑 .env 文件并重新运行此脚本"
}

print_deploy_summary