#!/bin/bash
#
# Deployment script for the central server of a distributed Prometheus setup.
# Intended for Linux hosts.
#
# Configuration is read from a `.env` file that lives next to this script.
# Any variable missing from `.env` falls back to the defaults defined below.

set -e

echo "=== 分布式Prometheus中央服务器部署脚本 ==="
echo ""

# Locate the configuration files relative to the script itself rather than the
# caller's working directory, so the script behaves the same no matter where it
# is invoked from (the rest of the script also works from the script directory).
deploy_script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
deploy_env_file="${deploy_script_dir}/.env"
deploy_env_example="${deploy_script_dir}/env.example"

# Load environment configuration.
if [ -f "$deploy_env_file" ]; then
  echo "📝 加载 .env 配置文件..."
  # `set -a` marks every variable assigned while sourcing for export, so child
  # processes (docker compose, envsubst) see them. Comments and blank lines in
  # the file are handled by the shell itself.
  set -a
  # shellcheck source=/dev/null
  source "$deploy_env_file"
  set +a
  echo "✅ 环境变量加载完成"
elif [ -f "$deploy_env_example" ]; then
  # First run: seed .env from the example and stop so the operator can edit it.
  echo "⚠️ 未找到 .env 文件,从 env.example 创建..."
  cp "$deploy_env_example" "$deploy_env_file"
  echo "✅ 已创建 .env 文件,请根据需要修改配置"
  echo " 然后重新运行此脚本"
  exit 0
else
  echo "⚠️ 未找到 .env 和 env.example 文件,使用默认配置"
fi

# Defaults for anything the environment did not provide.
# They are exported (not just assigned): the template rendering further down
# uses envsubst, which only substitutes *environment* variables, and docker
# compose likewise only sees exported values.

# Published ports
export PROMETHEUS_PORT="${PROMETHEUS_PORT:-9091}"
export GRAFANA_PORT="${GRAFANA_PORT:-3000}"
export ALERTMANAGER_PORT="${ALERTMANAGER_PORT:-9093}"
export VICTORIAMETRICS_PORT="${VICTORIAMETRICS_PORT:-18428}"

# Data directories
export PROMETHEUS_DATA_DIR="${PROMETHEUS_DATA_DIR:-./data/prometheus-data}"
export GRAFANA_DATA_DIR="${GRAFANA_DATA_DIR:-./data/grafana-data}"
export VICTORIAMETRICS_DATA_DIR="${VICTORIAMETRICS_DATA_DIR:-./data/victoria-metrics-data}"

# Grafana
# NOTE(review): the default admin password is a well-known placeholder;
# override it in .env for any real deployment.
export GRAFANA_ADMIN_PASSWORD="${GRAFANA_ADMIN_PASSWORD:-admin123}"
export GRAFANA_DEFAULT_LANGUAGE="${GRAFANA_DEFAULT_LANGUAGE:-zh-Hans}"
export GRAFANA_DEFAULT_THEME="${GRAFANA_DEFAULT_THEME:-light}"

# Prometheus
export PROMETHEUS_SCRAPE_INTERVAL="${PROMETHEUS_SCRAPE_INTERVAL:-15}"
export PROMETHEUS_EVALUATION_INTERVAL="${PROMETHEUS_EVALUATION_INTERVAL:-15}"
export PROMETHEUS_CLUSTER_NAME="${PROMETHEUS_CLUSTER_NAME:-central-monitoring}"
export PROMETHEUS_RETENTION_TIME="${PROMETHEUS_RETENTION_TIME:-30d}"
export PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES="${PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES:-10000}"
export PROMETHEUS_REMOTE_WRITE_CAPACITY="${PROMETHEUS_REMOTE_WRITE_CAPACITY:-20000}"
export PROMETHEUS_REMOTE_WRITE_MAX_SHARDS="${PROMETHEUS_REMOTE_WRITE_MAX_SHARDS:-10}"

# VictoriaMetrics
export VICTORIAMETRICS_RETENTION_PERIOD="${VICTORIAMETRICS_RETENTION_PERIOD:-30d}"

# Traefik
export TRAEFIK_PROVIDER="${TRAEFIK_PROVIDER:-external}"

# vmauth credentials
# NOTE(review): the "change-me-*" values are placeholders, not real secrets.
export VMAUTH_WRITE_USER="${VMAUTH_WRITE_USER:-vm_write}"
export VMAUTH_WRITE_PASSWORD="${VMAUTH_WRITE_PASSWORD:-change-me-strong-write}"
export VMAUTH_READ_USER="${VMAUTH_READ_USER:-vm_read}"
export VMAUTH_READ_PASSWORD="${VMAUTH_READ_PASSWORD:-change-me-strong-read}"
export VMAUTH_ADMIN_USER="${VMAUTH_ADMIN_USER:-vm_admin}"
export VMAUTH_ADMIN_PASSWORD="${VMAUTH_ADMIN_PASSWORD:-change-me-strong-admin}"

# Network selection based on TRAEFIK_ENABLED / TRAEFIK_PROVIDER follows.
TRAEFIK_PROVIDER 设置网络 if [ "${TRAEFIK_ENABLED:-false}" = "true" ]; then case "${TRAEFIK_PROVIDER}" in internal) export NETWORK_NAME=${NETWORK_NAME:-central_default} export EXTERNAL_NETWORK=${EXTERNAL_NETWORK:-false} export TRAEFIK_NETWORK=${TRAEFIK_NETWORK:-central_default} export COMPOSE_PROFILES="${COMPOSE_PROFILES:-},traefik-internal" export COMPOSE_PROFILES="${COMPOSE_PROFILES#,}" ;; external) export NETWORK_NAME=${NETWORK_NAME:-traefik} export EXTERNAL_NETWORK=${EXTERNAL_NETWORK:-true} export TRAEFIK_NETWORK=${TRAEFIK_NETWORK:-traefik} ;; *) echo "⚠️ TRAEFIK_PROVIDER 应为 internal 或 external,当前为 ${TRAEFIK_PROVIDER}" export NETWORK_NAME=${NETWORK_NAME:-traefik} export EXTERNAL_NETWORK=${EXTERNAL_NETWORK:-true} export TRAEFIK_NETWORK=${TRAEFIK_NETWORK:-traefik} ;; esac else # 不使用 Traefik:使用 compose 默认网络,直连端口访问 export NETWORK_NAME=${NETWORK_NAME:-central_default} export EXTERNAL_NETWORK=${EXTERNAL_NETWORK:-false} export TRAEFIK_NETWORK=${TRAEFIK_NETWORK:-central_default} fi # 将相对路径转换为绝对路径(Docker 需要绝对路径) # 获取脚本所在目录的绝对路径 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "$SCRIPT_DIR" || exit 1 # 将相对路径转换为绝对路径 if [[ "$PROMETHEUS_DATA_DIR" != /* ]]; then PROMETHEUS_DATA_DIR="$(cd "$(dirname "$PROMETHEUS_DATA_DIR")" && pwd)/$(basename "$PROMETHEUS_DATA_DIR")" fi if [[ "$GRAFANA_DATA_DIR" != /* ]]; then GRAFANA_DATA_DIR="$(cd "$(dirname "$GRAFANA_DATA_DIR")" && pwd)/$(basename "$GRAFANA_DATA_DIR")" fi if [[ "$VICTORIAMETRICS_DATA_DIR" != /* ]]; then VICTORIAMETRICS_DATA_DIR="$(cd "$(dirname "$VICTORIAMETRICS_DATA_DIR")" && pwd)/$(basename "$VICTORIAMETRICS_DATA_DIR")" fi echo "" # 检查Docker是否安装 if ! 
command -v docker &> /dev/null; then echo "❌ Docker未安装,请先安装Docker" exit 1 fi # 检查Docker Compose (优先检查V2,然后检查V1) DOCKER_COMPOSE_CMD="" if docker compose version &> /dev/null; then DOCKER_COMPOSE_CMD="docker compose" echo "✅ 检测到 Docker Compose V2" elif command -v docker-compose &> /dev/null; then DOCKER_COMPOSE_CMD="docker-compose" echo "✅ 检测到 Docker Compose V1" else echo "❌ Docker Compose未安装,请先安装Docker Compose" exit 1 fi echo "✅ Docker环境检查通过" echo "" # 检查磁盘空间(检查当前目录所在分区) echo "💾 检查磁盘空间..." DATA_PARTITION_AVAIL=$(df -BG "$SCRIPT_DIR" 2>/dev/null | awk 'NR==2 {print $4}' | sed 's/G//' || echo "0") ROOT_AVAIL=$(df -BG / | awk 'NR==2 {print $4}' | sed 's/G//' || echo "0") if [ -z "$DATA_PARTITION_AVAIL" ]; then DATA_PARTITION_AVAIL=0 fi if [ -z "$ROOT_AVAIL" ]; then ROOT_AVAIL=0 fi echo " 当前目录所在分区可用空间: ${DATA_PARTITION_AVAIL}GB" echo " 根分区可用空间: ${ROOT_AVAIL}GB" # 检查当前目录所在分区空间(需要至少2GB用于数据存储) if [ "$DATA_PARTITION_AVAIL" -lt 2 ]; then echo "" echo "⚠️ 警告:当前目录所在分区空间不足!" echo " 分区可用空间: ${DATA_PARTITION_AVAIL}GB" echo " 数据存储路径: ${PROMETHEUS_DATA_DIR}" echo " 建议至少保留 2GB 空间(用于监控数据存储)" echo "" read -p "是否继续部署?(y/N): " -n 1 -r echo if [[ ! $REPLY =~ ^[Yy]$ ]]; then echo "部署已取消" exit 1 fi fi # 检查根分区空间(需要至少500MB用于containerd临时文件) if [ "$ROOT_AVAIL" -lt 1 ]; then echo "" echo "⚠️ 警告:根分区空间不足!" echo " 根分区可用空间: ${ROOT_AVAIL}GB" echo " 建议至少保留 1GB 空间(用于containerd临时文件和系统运行)" echo "" echo "💡 建议清理空间:" echo " 1. 清理Docker资源: docker system prune -a --volumes" echo " 2. 清理系统日志: journalctl --vacuum-time=3d" echo " 3. 清理包缓存: dnf clean all 或 apt-get clean" echo " 4. 检查大文件: du -h --max-depth=1 / | sort -hr | head -10" echo "" read -p "是否继续部署?(y/N): " -n 1 -r echo if [[ ! $REPLY =~ ^[Yy]$ ]]; then echo "部署已取消" exit 1 fi fi echo "" # 从模板生成 prometheus.yml if [ -f "config/prometheus/prometheus.yml.template" ]; then echo "📝 从模板生成 config/prometheus/prometheus.yml..." 
# 检查是否有 envsubst 命令 if command -v envsubst &> /dev/null; then envsubst < config/prometheus/prometheus.yml.template > config/prometheus/prometheus.yml echo "✅ config/prometheus/prometheus.yml 已生成" else echo "⚠️ envsubst 命令未找到,尝试使用 sed 替换..." # 使用 sed 进行简单的变量替换 sed -e "s/\${PROMETHEUS_SCRAPE_INTERVAL}/${PROMETHEUS_SCRAPE_INTERVAL}/g" \ -e "s/\${PROMETHEUS_EVALUATION_INTERVAL}/${PROMETHEUS_EVALUATION_INTERVAL}/g" \ -e "s/\${PROMETHEUS_CLUSTER_NAME}/${PROMETHEUS_CLUSTER_NAME}/g" \ -e "s/\${VICTORIAMETRICS_PORT}/${VICTORIAMETRICS_PORT}/g" \ -e "s/\${PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES}/${PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES}/g" \ -e "s/\${PROMETHEUS_REMOTE_WRITE_CAPACITY}/${PROMETHEUS_REMOTE_WRITE_CAPACITY}/g" \ -e "s/\${PROMETHEUS_REMOTE_WRITE_MAX_SHARDS}/${PROMETHEUS_REMOTE_WRITE_MAX_SHARDS}/g" \ config/prometheus/prometheus.yml.template > config/prometheus/prometheus.yml echo "✅ config/prometheus/prometheus.yml 已生成(使用 sed)" fi elif [ ! -f "config/prometheus/prometheus.yml" ]; then echo "❌ 配置文件 config/prometheus/prometheus.yml 不存在,且未找到模板文件" exit 1 fi generate_vmauth_and_grafana_datasources() { mkdir -p config/vmauth config/grafana/provisioning/datasources local tenants_file="config/vmauth/tenants.csv" local vmauth_out="config/vmauth/vmauth.yml" local grafana_out="config/grafana/provisioning/datasources/victoriametrics.yml" echo "📝 生成 vmauth 配置与 Grafana 数据源..." # vmauth header cat > "$vmauth_out" <<'EOF' unauthorized_user: url_map: [] users: EOF # grafana header cat > "$grafana_out" <<'EOF' apiVersion: 1 datasources: EOF if [ -f "$tenants_file" ]; then # CSV: tenant_id,edge_node_id,write_user,write_password,read_user,read_password # Skip header line tail -n +2 "$tenants_file" | while IFS=',' read -r tenant_id edge_node_id wuser wpass ruser rpass; do # Skip empty lines [ -z "$tenant_id" ] && continue # vmauth write user (route to per-tenant insert) cat >> "$vmauth_out" <> "$vmauth_out" <> "$grafana_out" <> "$vmauth_out" <> "$grafana_out" </prometheus/...) 
cat >> "$vmauth_out" < /dev/null; then echo "⚠️ Traefik 网络 '$TRAEFIK_NET' 不存在,正在创建..." if docker network create "$TRAEFIK_NET" 2>/dev/null; then echo "✅ Traefik 网络 '$TRAEFIK_NET' 已创建" else echo "❌ 无法创建 Traefik 网络 '$TRAEFIK_NET'" echo " 请确保:" echo " 1. 外部 Traefik 已运行并创建了网络" echo " 2. 或改为 TRAEFIK_PROVIDER=internal 由本编排启动 Traefik" echo " 3. 或手动创建网络: docker network create $TRAEFIK_NET" echo "" read -p "是否继续部署?(y/N): " -n 1 -r echo if [[ ! $REPLY =~ ^[Yy]$ ]]; then echo "部署已取消" exit 1 fi fi else echo "✅ Traefik 网络 '$TRAEFIK_NET' 已存在(外部 Traefik)" fi echo "" # 创建数据目录(使用环境变量中配置的路径) echo "📁 创建数据目录..." mkdir -p "${PROMETHEUS_DATA_DIR}" mkdir -p "${GRAFANA_DATA_DIR}" mkdir -p "${VICTORIAMETRICS_DATA_DIR}" mkdir -p config/grafana/dashboards mkdir -p config/grafana/provisioning/datasources mkdir -p config/grafana/provisioning/dashboards mkdir -p config/vmauth # 设置目录权限 # Prometheus 需要写权限 chmod 777 "${PROMETHEUS_DATA_DIR}" 2>/dev/null || true # Grafana 需要 UID 472 的权限(Grafana 容器用户) chown -R 472:472 "${GRAFANA_DATA_DIR}" 2>/dev/null || chmod -R 777 "${GRAFANA_DATA_DIR}" # VictoriaMetrics 需要写权限 chmod 777 "${VICTORIAMETRICS_DATA_DIR}" 2>/dev/null || true echo "✅ 数据目录创建完成" echo " - Prometheus: ${PROMETHEUS_DATA_DIR}" echo " - Grafana: ${GRAFANA_DATA_DIR}" echo " - VictoriaMetrics: ${VICTORIAMETRICS_DATA_DIR}" echo "" # 停止现有服务 echo "🛑 停止现有服务..." $DOCKER_COMPOSE_CMD down 2>/dev/null || true # 拉取最新镜像 echo "📥 拉取Docker镜像..." if ! $DOCKER_COMPOSE_CMD pull; then echo "" echo "⚠️ 镜像拉取失败,可能的原因:" echo " 1. 网络连接问题" echo " 2. Docker Hub 速率限制" echo " 3. 需要配置镜像加速器" echo "" echo "💡 建议:" echo " 1. 检查网络连接" echo " 2. 配置 Docker 镜像加速器(如阿里云、腾讯云等)" echo " 3. 或稍后重试" echo "" echo "🔄 尝试继续启动(如果本地已有镜像)..." echo "" fi # 启动服务 echo "🚀 启动服务..." $DOCKER_COMPOSE_CMD up -d # 等待服务启动 echo "⏳ 等待服务启动..." sleep 15 # 检查服务状态 echo "" echo "📊 服务状态检查:" $DOCKER_COMPOSE_CMD ps echo "" echo "📋 服务日志:" $DOCKER_COMPOSE_CMD logs --tail=20 echo "" echo "✅ 部署完成!" 
# Post-deployment summary: collect every line first, then print them in one go.
summary_lines=("")

# The access URLs depend on whether Traefik fronts the services.
if [ "${TRAEFIK_ENABLED:-false}" != "true" ]; then
  # Direct access through the published ports.
  summary_lines+=(
    "🔗 访问地址:"
    " - Grafana仪表板: http://localhost:${GRAFANA_PORT} (admin/${GRAFANA_ADMIN_PASSWORD}) [${GRAFANA_DEFAULT_LANGUAGE}界面]"
    " - Prometheus: http://localhost:${PROMETHEUS_PORT} [英文界面]"
    " - VictoriaMetrics: http://localhost:${VICTORIAMETRICS_PORT} [英文界面]"
    " - Alertmanager: http://localhost:${ALERTMANAGER_PORT} [英文界面]"
    ""
    "⚠️ 请确保:"
    " 1. 防火墙已开放相应端口 (${GRAFANA_PORT}, ${PROMETHEUS_PORT}, ${VICTORIAMETRICS_PORT}, ${ALERTMANAGER_PORT})"
    " 2. 边缘节点可以访问此服务器的${VICTORIAMETRICS_PORT}端口"
  )
else
  # Access through the Traefik reverse proxy (domain based).
  summary_lines+=("🔗 访问地址(通过 Traefik 反向代理):")
  if [ "${TRAEFIK_PROVIDER}" = "internal" ]; then
    summary_lines+=(" - Traefik 由本编排启动,监听 80/443 端口")
  fi
  summary_lines+=(
    " - Grafana仪表板: http://${GRAFANA_DOMAIN:-grafana.example.com} (admin/${GRAFANA_ADMIN_PASSWORD}) [${GRAFANA_DEFAULT_LANGUAGE}界面]"
    " - Prometheus: http://${PROMETHEUS_DOMAIN:-prometheus.example.com} [英文界面]"
    " - Alertmanager: http://${ALERTMANAGER_DOMAIN:-alertmanager.example.com} [英文界面]"
    " - VictoriaMetrics: http://localhost:${VICTORIAMETRICS_PORT} [英文界面,边缘节点直接连接]"
    ""
    "⚠️ 请确保:"
    " 1. DNS 已正确解析域名到本机"
    " 2. 边缘节点可以访问此服务器的${VICTORIAMETRICS_PORT}端口(VictoriaMetrics 不通过 Traefik)"
  )
fi

# Reminder, management commands and next steps shared by both modes.
summary_lines+=(
  " 3. 已配置好告警通知渠道"
  ""
  "📝 管理命令:"
  " - 查看日志: $DOCKER_COMPOSE_CMD logs -f"
  " - 重启服务: $DOCKER_COMPOSE_CMD restart"
  " - 停止服务: $DOCKER_COMPOSE_CMD down"
  ""
  "🔧 下一步:"
  " 1. 登录Grafana配置数据源"
  " 2. 导入ONVIF监控仪表板"
  " 3. 配置Alertmanager告警通知"
  ""
  "💡 提示:修改配置后,编辑 .env 文件并重新运行此脚本"
)

printf '%s\n' "${summary_lines[@]}"