Files
root c4825c2d27 feat: 引入 vmauth 鉴权与严格多租户
- 对外端口统一为 18428(vmauth 入口),VM 不再直接暴露宿主机端口
- 边缘 vmagent 与中央 Prometheus remote_write 增加 basic auth
- 支持 tenants.csv 驱动的 per-tenant 写入/查询隔离,并提供管理员跨租户只读查询
- 更新 Grafana provisioning 与部署/文档

Made-with: Cursor
2026-04-22 11:41:13 +00:00

514 lines
18 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# Deployment script for the central server of the distributed Prometheus stack.
# Intended for Linux hosts.
set -e
echo "=== 分布式Prometheus中央服务器部署脚本 ==="
echo ""

#######################################
# Load deployment settings from "<dir>/.env".
#
# The settings file is looked up in an explicit directory instead of the
# caller's working directory, so invoking the script from elsewhere does not
# silently skip the configuration and deploy with built-in defaults.
#
# Arguments:
#   $1 - directory expected to contain .env (or env.example)
# Globals:
#   Every variable assigned by .env is set and exported (allexport is enabled
#   only while the file is being sourced).
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 when .env was loaded or when neither file exists. Exits the whole script
#   with status 0 right after creating .env from env.example, so the operator
#   can edit it before deploying.
#######################################
load_env_file() {
  local _env_dir=$1
  if [ -f "${_env_dir}/.env" ]; then
    echo "📝 加载 .env 配置文件..."
    # allexport: every assignment made by .env becomes an exported variable.
    set -a
    # shellcheck disable=SC1090,SC1091
    source "${_env_dir}/.env"
    set +a
    echo "✅ 环境变量加载完成"
  elif [ -f "${_env_dir}/env.example" ]; then
    echo "⚠️ 未找到 .env 文件,从 env.example 创建..."
    cp "${_env_dir}/env.example" "${_env_dir}/.env"
    echo "✅ 已创建 .env 文件,请根据需要修改配置"
    echo " 然后重新运行此脚本"
    exit 0
  else
    echo "⚠️ 未找到 .env 和 env.example 文件,使用默认配置"
  fi
}

# Resolve the directory of this script without changing the working directory.
DEPLOY_ENV_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
load_env_file "$DEPLOY_ENV_DIR"
# Built-in defaults: each setting keeps its current value when it is already
# set to a non-empty string, otherwise it receives the value shown here.

# Published ports
: "${PROMETHEUS_PORT:=9091}"
: "${GRAFANA_PORT:=3000}"
: "${ALERTMANAGER_PORT:=9093}"
: "${VICTORIAMETRICS_PORT:=18428}"

# Host data directories
: "${PROMETHEUS_DATA_DIR:=./data/prometheus-data}"
: "${GRAFANA_DATA_DIR:=./data/grafana-data}"
: "${VICTORIAMETRICS_DATA_DIR:=./data/victoria-metrics-data}"

# Grafana admin credential
: "${GRAFANA_ADMIN_PASSWORD:=admin123}"

# Prometheus scrape / evaluation / identity / retention
: "${PROMETHEUS_SCRAPE_INTERVAL:=15}"
: "${PROMETHEUS_EVALUATION_INTERVAL:=15}"
: "${PROMETHEUS_CLUSTER_NAME:=central-monitoring}"
: "${PROMETHEUS_RETENTION_TIME:=30d}"
: "${VICTORIAMETRICS_RETENTION_PERIOD:=30d}"

# Prometheus remote_write queue tuning
: "${PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES:=10000}"
: "${PROMETHEUS_REMOTE_WRITE_CAPACITY:=20000}"
: "${PROMETHEUS_REMOTE_WRITE_MAX_SHARDS:=10}"

# Grafana UI preferences
: "${GRAFANA_DEFAULT_LANGUAGE:=zh-Hans}"
: "${GRAFANA_DEFAULT_THEME:=light}"

# Traefik mode
: "${TRAEFIK_PROVIDER:=external}"

# vmauth accounts (placeholder passwords; override them in .env)
: "${VMAUTH_WRITE_USER:=vm_write}"
: "${VMAUTH_WRITE_PASSWORD:=change-me-strong-write}"
: "${VMAUTH_READ_USER:=vm_read}"
: "${VMAUTH_READ_PASSWORD:=change-me-strong-read}"
: "${VMAUTH_ADMIN_USER:=vm_admin}"
: "${VMAUTH_ADMIN_PASSWORD:=change-me-strong-admin}"
# Choose the compose network settings from TRAEFIK_ENABLED / TRAEFIK_PROVIDER.
# Values that are already set always win; only the fallbacks differ per mode.
#   - Traefik disabled, or provider "internal": compose default network.
#   - provider "external" (or any unrecognised value): pre-existing "traefik"
#     network, marked as external.
_net_fallback="central_default"
_ext_fallback="false"
if [[ "${TRAEFIK_ENABLED:-false}" == "true" ]]; then
  if [[ "${TRAEFIK_PROVIDER}" == "internal" ]]; then
    # Activate the traefik-internal compose profile, appending it to any
    # profiles already requested and dropping a leading comma if there were none.
    COMPOSE_PROFILES="${COMPOSE_PROFILES:-},traefik-internal"
    export COMPOSE_PROFILES="${COMPOSE_PROFILES#,}"
  else
    if [[ "${TRAEFIK_PROVIDER}" != "external" ]]; then
      echo "⚠️ TRAEFIK_PROVIDER 应为 internal 或 external当前为 ${TRAEFIK_PROVIDER}"
    fi
    _net_fallback="traefik"
    _ext_fallback="true"
  fi
fi
export NETWORK_NAME="${NETWORK_NAME:-${_net_fallback}}"
export EXTERNAL_NETWORK="${EXTERNAL_NETWORK:-${_ext_fallback}}"
export TRAEFIK_NETWORK="${TRAEFIK_NETWORK:-${_net_fallback}}"
unset _net_fallback _ext_fallback
# Convert the data directories to absolute paths and make the script
# directory the working directory, so every relative path below resolves
# against it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR" || exit 1

#######################################
# Print the absolute form of a path, resolved against the current directory.
#
# Unlike a plain "cd <parent> && pwd", this also works when the parent
# directory does not exist yet (first deployment, before the data directories
# are created). In that case the failed cd used to leave an empty prefix and
# the result became "/<basename>" at the filesystem root.
#
# Arguments:
#   $1 - absolute or relative path (need not exist)
# Outputs:
#   The absolute path on stdout.
# Returns:
#   0 always.
#######################################
to_absolute_path() {
  local path=$1
  local parent
  if [[ "$path" == /* ]]; then
    printf '%s\n' "$path"
    return 0
  fi
  # CDPATH is cleared so cd can neither jump elsewhere nor print the directory.
  if parent="$(CDPATH= cd -- "$(dirname -- "$path")" 2> /dev/null && pwd)"; then
    printf '%s/%s\n' "${parent%/}" "$(basename -- "$path")"
  else
    # Parent is missing: build the path textually from the current directory.
    while [[ "$path" == ./* ]]; do
      path="${path#./}"
    done
    printf '%s/%s\n' "${PWD%/}" "$path"
  fi
}

PROMETHEUS_DATA_DIR="$(to_absolute_path "$PROMETHEUS_DATA_DIR")"
GRAFANA_DATA_DIR="$(to_absolute_path "$GRAFANA_DATA_DIR")"
VICTORIAMETRICS_DATA_DIR="$(to_absolute_path "$VICTORIAMETRICS_DATA_DIR")"
echo ""
# Docker itself must be on PATH; abort otherwise.
command -v docker > /dev/null 2>&1 || {
  echo "❌ Docker未安装请先安装Docker"
  exit 1
}
# Pick the Compose front end: the V2 plugin ("docker compose") is preferred,
# the standalone V1 binary ("docker-compose") is the fallback. The command is
# kept as a plain string and used unquoted where compose is invoked.
DOCKER_COMPOSE_CMD=""
if docker compose version > /dev/null 2>&1; then
  DOCKER_COMPOSE_CMD="docker compose"
  echo "✅ 检测到 Docker Compose V2"
else
  if command -v docker-compose > /dev/null 2>&1; then
    DOCKER_COMPOSE_CMD="docker-compose"
    echo "✅ 检测到 Docker Compose V1"
  else
    echo "❌ Docker Compose未安装请先安装Docker Compose"
    exit 1
  fi
fi
printf '%s\n' "✅ Docker环境检查通过" ""
# Pre-flight disk space check: the partition holding this script (where the
# data directories live by default) and the root partition.
echo "💾 检查磁盘空间..."

#######################################
# Read `df -Pk` output on stdin and print the available space of the first
# listed filesystem in whole GiB, rounded DOWN.
#
# GNU `df -BG` rounds UP, so 300 MB free is reported as "1G" and a
# "less than 1 GB" test can only fire at exactly zero bytes free. Flooring the
# kilobyte figure makes the thresholds below behave as intended.
#
# Outputs:
#   An integer on stdout; 0 when the input is empty or cannot be parsed.
#######################################
free_gib_from_df() {
  awk '
    NR == 2 && $4 ~ /^[0-9]+$/ { printf "%d\n", int($4 / 1048576); seen = 1 }
    END { if (!seen) print 0 }
  '
}

#######################################
# Ask the operator whether to continue; exit 1 unless the answer is y/Y.
# A closed stdin counts as "no", so the cancel message is still printed
# instead of the script dying silently under `set -e`.
#######################################
confirm_or_abort() {
  REPLY=""
  read -p "是否继续部署?(y/N): " -n 1 -r || REPLY=""
  echo
  if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    echo "部署已取消"
    exit 1
  fi
}

# -P forces one line per filesystem (no wrapping on long device names),
# -k reports in 1024-byte blocks.
DATA_PARTITION_AVAIL=$(df -Pk "${SCRIPT_DIR:-.}" 2> /dev/null | free_gib_from_df)
ROOT_AVAIL=$(df -Pk / 2> /dev/null | free_gib_from_df)
echo " 当前目录所在分区可用空间: ${DATA_PARTITION_AVAIL}GB"
echo " 根分区可用空间: ${ROOT_AVAIL}GB"

# Data partition: warn below 2 GB (space for the monitoring data).
if [ "$DATA_PARTITION_AVAIL" -lt 2 ]; then
  echo ""
  echo "⚠️ 警告:当前目录所在分区空间不足!"
  echo " 分区可用空间: ${DATA_PARTITION_AVAIL}GB"
  echo " 数据存储路径: ${PROMETHEUS_DATA_DIR}"
  echo " 建议至少保留 2GB 空间(用于监控数据存储)"
  echo ""
  confirm_or_abort
fi

# Root partition: warn below 1 GB (containerd temporary files, system use).
if [ "$ROOT_AVAIL" -lt 1 ]; then
  echo ""
  echo "⚠️ 警告:根分区空间不足!"
  echo " 根分区可用空间: ${ROOT_AVAIL}GB"
  echo " 建议至少保留 1GB 空间用于containerd临时文件和系统运行"
  echo ""
  echo "💡 建议清理空间:"
  echo " 1. 清理Docker资源: docker system prune -a --volumes"
  echo " 2. 清理系统日志: journalctl --vacuum-time=3d"
  echo " 3. 清理包缓存: dnf clean all 或 apt-get clean"
  echo " 4. 检查大文件: du -h --max-depth=1 / | sort -hr | head -10"
  echo ""
  confirm_or_abort
fi
echo ""
# Render config/prometheus/prometheus.yml from its template.
#
# Placeholders handled by both the envsubst path and the sed fallback.
# NOTE(review): if the template references further variables (for example
# vmauth credentials for remote_write), add them to this list - the template
# is not visible from this script.
prometheus_template_vars=(
  PROMETHEUS_SCRAPE_INTERVAL
  PROMETHEUS_EVALUATION_INTERVAL
  PROMETHEUS_CLUSTER_NAME
  VICTORIAMETRICS_PORT
  PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES
  PROMETHEUS_REMOTE_WRITE_CAPACITY
  PROMETHEUS_REMOTE_WRITE_MAX_SHARDS
)

#######################################
# Escape a value for the replacement side of a sed `s` command, so that
# "/", "&" and "\" in the value are inserted literally.
# Arguments: $1 - raw value
# Outputs:   escaped value on stdout
#######################################
sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's|[\\/&]|\\&|g'
}

if [ -f "config/prometheus/prometheus.yml.template" ]; then
  echo "📝 从模板生成 config/prometheus/prometheus.yml..."
  if command -v envsubst &> /dev/null; then
    # envsubst only substitutes *exported* variables. The defaults assigned by
    # this script are plain shell variables, so without the export their
    # placeholders would be rendered as empty strings. The export is confined
    # to a subshell to leave the environment of the rest of the script alone.
    (
      export "${prometheus_template_vars[@]}"
      envsubst < config/prometheus/prometheus.yml.template > config/prometheus/prometheus.yml
    )
    echo "✅ config/prometheus/prometheus.yml 已生成"
  else
    echo "⚠️ envsubst 命令未找到,尝试使用 sed 替换..."
    # Fallback: one literal ${NAME} -> value substitution per known variable.
    prometheus_sed_args=()
    for prometheus_var_name in "${prometheus_template_vars[@]}"; do
      prometheus_sed_args+=(
        -e "s/\${${prometheus_var_name}}/$(sed_escape_replacement "${!prometheus_var_name}")/g"
      )
    done
    sed "${prometheus_sed_args[@]}" \
      config/prometheus/prometheus.yml.template > config/prometheus/prometheus.yml
    unset prometheus_sed_args prometheus_var_name
    echo "✅ config/prometheus/prometheus.yml 已生成(使用 sed"
  fi
elif [ ! -f "config/prometheus/prometheus.yml" ]; then
  # Neither a template nor a hand-written config exists: nothing to deploy.
  echo "❌ 配置文件 config/prometheus/prometheus.yml 不存在,且未找到模板文件"
  exit 1
fi
#######################################
# Escape a value for use inside a double-quoted YAML scalar ("..."):
# backslashes and double quotes are prefixed with a backslash.
# Arguments: $1 - raw value
# Outputs:   escaped value on stdout (no trailing newline needed by callers)
#######################################
_yaml_dq_escape() {
  printf '%s' "$1" | sed -e 's/[\\"]/\\&/g'
}

#######################################
# Append a vmauth user that is routed, unrestricted, to one backend prefix.
# Arguments: $1 - vmauth.yml path, $2 - username, $3 - password, $4 - url_prefix
#######################################
_vmauth_append_writer() {
  cat >> "$1" << EOF
  - username: "$(_yaml_dq_escape "$2")"
    password: "$(_yaml_dq_escape "$3")"
    url_prefix: "$4"
EOF
}

#######################################
# Append a vmauth user limited to the query-style paths listed below.
# Arguments: $1 - vmauth.yml path, $2 - username, $3 - password, $4 - url_prefix
#######################################
_vmauth_append_reader() {
  cat >> "$1" << EOF
  - username: "$(_yaml_dq_escape "$2")"
    password: "$(_yaml_dq_escape "$3")"
    url_map:
      - src_paths:
          - "/api/v1/query"
          - "/api/v1/query_range"
          - "/api/v1/series"
          - "/api/v1/labels"
          - "/api/v1/label/.+/values"
          - "/api/v1/export"
          - "/federate"
        url_prefix: "$4"
EOF
}

#######################################
# Append one Grafana datasource that queries through vmauth with basic auth.
# Arguments: $1 - provisioning file, $2 - datasource name,
#            $3 - basic-auth user, $4 - basic-auth password
#######################################
_grafana_append_datasource() {
  cat >> "$1" << EOF
  - name: "$(_yaml_dq_escape "$2")"
    type: prometheus
    access: proxy
    url: http://vmauth:8427
    isDefault: false
    editable: true
    basicAuth: true
    basicAuthUser: "$(_yaml_dq_escape "$3")"
    jsonData:
      httpMethod: POST
      queryTimeout: 60s
      timeInterval: 15s
    secureJsonData:
      basicAuthPassword: "$(_yaml_dq_escape "$4")"
EOF
}

#######################################
# Generate config/vmauth/vmauth.yml and the Grafana datasource provisioning
# file config/grafana/provisioning/datasources/victoriametrics.yml.
#
# Both files are rewritten from scratch on every call.
#   - With config/vmauth/tenants.csv present: one write user, one read user
#     and one Grafana datasource per CSV row. Expected columns (first line is
#     a header and is skipped):
#       tenant_id,edge_node_id,write_user,write_password,read_user,read_password
#     Blank lines and lines starting with "#" are ignored; CRLF line endings
#     and a missing final newline are tolerated. Rows whose tenant_id is not
#     "<digits>" or "<digits>:<digits>", or that lack a credential, are skipped
#     with a warning on stderr instead of producing a broken route.
#   - Without it: single-tenant mode driven by the VMAUTH_WRITE_* and
#     VMAUTH_READ_* variables.
#   - In both modes a read-only admin user (VMAUTH_ADMIN_*) is appended whose
#     allowed paths are /select/<digits>/prometheus/... for any tenant number.
#
# Globals (read):
#   VMAUTH_WRITE_USER, VMAUTH_WRITE_PASSWORD, VMAUTH_READ_USER,
#   VMAUTH_READ_PASSWORD, VMAUTH_ADMIN_USER, VMAUTH_ADMIN_PASSWORD
# Outputs:
#   Progress messages on stdout, warnings on stderr, the two files on disk
#   (relative to the current directory).
# NOTE: the generated files contain the passwords in clear text.
#######################################
generate_vmauth_and_grafana_datasources() {
  mkdir -p config/vmauth config/grafana/provisioning/datasources
  local tenants_file="config/vmauth/tenants.csv"
  local vmauth_out="config/vmauth/vmauth.yml"
  local grafana_out="config/grafana/provisioning/datasources/victoriametrics.yml"
  local row tenant_id edge_node_id wuser wpass ruser rpass
  local row_no=1 # line 1 of the CSV is the header

  echo "📝 生成 vmauth 配置与 Grafana 数据源..."

  # vmauth header: unauthenticated requests get an empty route table.
  cat > "$vmauth_out" << 'EOF'
unauthorized_user:
  url_map: []
users:
EOF

  # Grafana provisioning header.
  cat > "$grafana_out" << 'EOF'
apiVersion: 1
datasources:
EOF

  if [ -f "$tenants_file" ]; then
    # `|| [ -n "$row" ]` keeps a final row that has no trailing newline.
    while IFS= read -r row || [ -n "$row" ]; do
      row_no=$((row_no + 1))
      row=${row%$'\r'} # tolerate CRLF files
      case "$row" in
        '' | '#'*) continue ;;
      esac
      IFS=',' read -r tenant_id edge_node_id wuser wpass ruser rpass <<< "$row"
      if [ -z "$tenant_id" ]; then
        continue
      fi
      # tenant_id is spliced into backend URLs, so only accept the numeric
      # forms that make sense there.
      if [[ ! "$tenant_id" =~ ^[0-9]+(:[0-9]+)?$ ]]; then
        echo "⚠️ tenants.csv line ${row_no}: invalid tenant_id '${tenant_id}', row skipped" >&2
        continue
      fi
      if [ -z "$wuser" ] || [ -z "$wpass" ] || [ -z "$ruser" ] || [ -z "$rpass" ]; then
        echo "⚠️ tenants.csv line ${row_no}: missing credential field, row skipped" >&2
        continue
      fi

      # Write user: routed to the per-tenant insert prefix.
      _vmauth_append_writer "$vmauth_out" "$wuser" "$wpass" \
        "http://victoria-metrics:8428/insert/${tenant_id}/prometheus"
      # Read user: routed to the per-tenant select prefix, query paths only.
      _vmauth_append_reader "$vmauth_out" "$ruser" "$rpass" \
        "http://victoria-metrics:8428/select/${tenant_id}/prometheus"
      # One datasource per tenant; the edge node id is added to the name when present.
      _grafana_append_datasource "$grafana_out" \
        "VictoriaMetrics (tenant ${tenant_id}${edge_node_id:+ - ${edge_node_id}})" \
        "$ruser" "$rpass"
    done < <(tail -n +2 "$tenants_file")
    echo "✅ 已根据 tenants.csv 生成 vmauth.yml 与 Grafana 多数据源"
  else
    # Single-tenant fallback using the .env-provided accounts.
    _vmauth_append_writer "$vmauth_out" "${VMAUTH_WRITE_USER}" "${VMAUTH_WRITE_PASSWORD}" \
      "http://victoria-metrics:8428"
    _vmauth_append_reader "$vmauth_out" "${VMAUTH_READ_USER}" "${VMAUTH_READ_PASSWORD}" \
      "http://victoria-metrics:8428"
    _grafana_append_datasource "$grafana_out" "VictoriaMetrics" \
      "${VMAUTH_READ_USER}" "${VMAUTH_READ_PASSWORD}"
    echo "✅ 未找到 tenants.csv已使用单租户 .env 变量生成 vmauth.yml 与 Grafana 数据源"
  fi

  # Admin read-only user: may query any numeric tenant by spelling out the
  # /select/<tenant>/prometheus/... path itself. `\$` keeps the regex
  # end-anchor literal inside the expanding here-document.
  cat >> "$vmauth_out" << EOF
  - username: "$(_yaml_dq_escape "${VMAUTH_ADMIN_USER}")"
    password: "$(_yaml_dq_escape "${VMAUTH_ADMIN_PASSWORD}")"
    url_map:
      - src_paths:
          - "^/select/[0-9]+/prometheus/api/v1/query\$"
          - "^/select/[0-9]+/prometheus/api/v1/query_range\$"
          - "^/select/[0-9]+/prometheus/api/v1/series\$"
          - "^/select/[0-9]+/prometheus/api/v1/labels\$"
          - "^/select/[0-9]+/prometheus/api/v1/label/.+/values\$"
          - "^/select/[0-9]+/prometheus/api/v1/export\$"
          - "^/select/[0-9]+/prometheus/federate\$"
        url_prefix: "http://victoria-metrics:8428"
EOF
  echo "✅ 已追加管理员只读账号(可跨租户查询)"
}
generate_vmauth_and_grafana_datasources
# The hand-maintained configuration files must exist; stop at the first one
# that is missing.
for required_cfg in \
  "config/prometheus/alert_rules.yml" \
  "config/alertmanager/alertmanager.yml"; do
  if [[ ! -f "$required_cfg" ]]; then
    echo "❌ 配置文件 ${required_cfg} 不存在"
    exit 1
  fi
done
printf '%s\n' "✅ 配置文件检查通过" ""
# Hand the settings to docker compose through the environment. The data
# directory variables hold absolute paths at this point. Names that are unset
# are only marked for export.
export \
  PROMETHEUS_DATA_DIR GRAFANA_DATA_DIR VICTORIAMETRICS_DATA_DIR \
  PROMETHEUS_PORT GRAFANA_PORT ALERTMANAGER_PORT VICTORIAMETRICS_PORT \
  GRAFANA_ADMIN_PASSWORD GRAFANA_DEFAULT_LANGUAGE GRAFANA_DEFAULT_THEME GRAFANA_ROOT_URL \
  PROMETHEUS_RETENTION_TIME VICTORIAMETRICS_RETENTION_PERIOD \
  TRAEFIK_ENABLED TRAEFIK_NETWORK TRAEFIK_ENTRYPOINT \
  GRAFANA_DOMAIN PROMETHEUS_DOMAIN ALERTMANAGER_DOMAIN
# Verify the Traefik network. Only the "external" provider needs a network
# that already exists; it is created on the fly when missing.
echo "🔍 检查 Traefik 网络..."
TRAEFIK_NET=${TRAEFIK_NETWORK:-traefik}
if [[ "${TRAEFIK_ENABLED:-false}" == "true" ]]; then
  if [[ "${TRAEFIK_PROVIDER}" == "internal" ]]; then
    echo "✅ 本编排启动 Traefik将自动创建网络 $TRAEFIK_NET"
  elif docker network inspect "$TRAEFIK_NET" > /dev/null 2>&1; then
    echo "✅ Traefik 网络 '$TRAEFIK_NET' 已存在(外部 Traefik"
  else
    echo "⚠️ Traefik 网络 '$TRAEFIK_NET' 不存在,正在创建..."
    if docker network create "$TRAEFIK_NET" 2> /dev/null; then
      echo "✅ Traefik 网络 '$TRAEFIK_NET' 已创建"
    else
      # Creation failed: explain the options and let the operator decide.
      printf '%s\n' \
        "❌ 无法创建 Traefik 网络 '$TRAEFIK_NET'" \
        " 请确保:" \
        " 1. 外部 Traefik 已运行并创建了网络" \
        " 2. 或改为 TRAEFIK_PROVIDER=internal 由本编排启动 Traefik" \
        " 3. 或手动创建网络: docker network create $TRAEFIK_NET" \
        ""
      read -p "是否继续部署?(y/N): " -n 1 -r
      echo
      case "$REPLY" in
        [Yy]) ;;
        *)
          echo "部署已取消"
          exit 1
          ;;
      esac
    fi
  fi
else
  echo "✅ 未启用 Traefik使用 compose 默认网络"
fi
echo ""
# Create the data directories (paths come from the environment) and the
# configuration directories mounted into the containers.
echo "📁 创建数据目录..."
for dir_to_create in \
  "${PROMETHEUS_DATA_DIR}" \
  "${GRAFANA_DATA_DIR}" \
  "${VICTORIAMETRICS_DATA_DIR}" \
  config/grafana/dashboards \
  config/grafana/provisioning/datasources \
  config/grafana/provisioning/dashboards \
  config/vmauth; do
  mkdir -p "$dir_to_create"
done

# Permissions.
# Prometheus and VictoriaMetrics need write access: open the directories up,
# best effort (failures are ignored).
for writable_dir in "${PROMETHEUS_DATA_DIR}" "${VICTORIAMETRICS_DATA_DIR}"; do
  chmod 777 "$writable_dir" 2> /dev/null || true
done
# Grafana: hand the tree to UID/GID 472 (the Grafana container user); when
# chown is not permitted, fall back to opening the whole tree up.
if ! chown -R 472:472 "${GRAFANA_DATA_DIR}" 2> /dev/null; then
  chmod -R 777 "${GRAFANA_DATA_DIR}"
fi

printf '%s\n' \
  "✅ 数据目录创建完成" \
  " - Prometheus: ${PROMETHEUS_DATA_DIR}" \
  " - Grafana: ${GRAFANA_DATA_DIR}" \
  " - VictoriaMetrics: ${VICTORIAMETRICS_DATA_DIR}" \
  ""
# Stop whatever is currently running; errors (e.g. nothing to stop) are ignored.
echo "🛑 停止现有服务..."
$DOCKER_COMPOSE_CMD down 2> /dev/null || true

# Pull the images. A failed pull is not fatal: print hints and carry on, the
# images may already be available locally.
echo "📥 拉取Docker镜像..."
if ! $DOCKER_COMPOSE_CMD pull; then
  printf '%s\n' \
    "" \
    "⚠️ 镜像拉取失败,可能的原因:" \
    " 1. 网络连接问题" \
    " 2. Docker Hub 速率限制" \
    " 3. 需要配置镜像加速器" \
    "" \
    "💡 建议:" \
    " 1. 检查网络连接" \
    " 2. 配置 Docker 镜像加速器(如阿里云、腾讯云等)" \
    " 3. 或稍后重试" \
    "" \
    "🔄 尝试继续启动(如果本地已有镜像)..." \
    ""
fi

# Start everything in the background.
echo "🚀 启动服务..."
$DOCKER_COMPOSE_CMD up -d

# Give the containers a fixed 15 seconds before reporting on them.
echo "⏳ 等待服务启动..."
sleep 15

# Container status followed by the last 20 log lines.
printf '%s\n' "" "📊 服务状态检查:"
$DOCKER_COMPOSE_CMD ps
printf '%s\n' "" "📋 服务日志:"
$DOCKER_COMPOSE_CMD logs --tail=20
printf '%s\n' "" "✅ 部署完成!" ""
# Final summary: access URLs (domain based behind Traefik, localhost ports
# otherwise), reminders, management commands and next steps.
if [[ "${TRAEFIK_ENABLED:-false}" == "true" ]]; then
  echo "🔗 访问地址(通过 Traefik 反向代理):"
  if [[ "${TRAEFIK_PROVIDER}" == "internal" ]]; then
    echo " - Traefik 由本编排启动,监听 80/443 端口"
  fi
  printf '%s\n' \
    " - Grafana仪表板: http://${GRAFANA_DOMAIN:-grafana.example.com} (admin/${GRAFANA_ADMIN_PASSWORD}) [${GRAFANA_DEFAULT_LANGUAGE}界面]" \
    " - Prometheus: http://${PROMETHEUS_DOMAIN:-prometheus.example.com} [英文界面]" \
    " - Alertmanager: http://${ALERTMANAGER_DOMAIN:-alertmanager.example.com} [英文界面]" \
    " - VictoriaMetrics: http://localhost:${VICTORIAMETRICS_PORT} [英文界面,边缘节点直接连接]" \
    "" \
    "⚠️ 请确保:" \
    " 1. DNS 已正确解析域名到本机" \
    " 2. 边缘节点可以访问此服务器的${VICTORIAMETRICS_PORT}端口VictoriaMetrics 不通过 Traefik"
else
  printf '%s\n' \
    "🔗 访问地址:" \
    " - Grafana仪表板: http://localhost:${GRAFANA_PORT} (admin/${GRAFANA_ADMIN_PASSWORD}) [${GRAFANA_DEFAULT_LANGUAGE}界面]" \
    " - Prometheus: http://localhost:${PROMETHEUS_PORT} [英文界面]" \
    " - VictoriaMetrics: http://localhost:${VICTORIAMETRICS_PORT} [英文界面]" \
    " - Alertmanager: http://localhost:${ALERTMANAGER_PORT} [英文界面]" \
    "" \
    "⚠️ 请确保:" \
    " 1. 防火墙已开放相应端口 (${GRAFANA_PORT}, ${PROMETHEUS_PORT}, ${VICTORIAMETRICS_PORT}, ${ALERTMANAGER_PORT})" \
    " 2. 边缘节点可以访问此服务器的${VICTORIAMETRICS_PORT}端口"
fi
# Common tail, printed in both modes.
printf '%s\n' \
  " 3. 已配置好告警通知渠道" \
  "" \
  "📝 管理命令:" \
  " - 查看日志: $DOCKER_COMPOSE_CMD logs -f" \
  " - 重启服务: $DOCKER_COMPOSE_CMD restart" \
  " - 停止服务: $DOCKER_COMPOSE_CMD down" \
  "" \
  "🔧 下一步:" \
  " 1. 登录Grafana配置数据源" \
  " 2. 导入ONVIF监控仪表板" \
  " 3. 配置Alertmanager告警通知" \
  "" \
  "💡 提示:修改配置后,编辑 .env 文件并重新运行此脚本"