refactor: config/apps 目录重组、文档重构、架构图收窄

- 中央:config/(prometheus,alertmanager,grafana)、apps/(tile-cache,topology-editor)
- 边缘:config/(vmagent,blackbox,targets)、apps/(onvif-exporter)
- env: TRAEFIK_PROVIDER、prometheus/env.example 详细说明
- 文档:README/doc 重构,EDGE_CACHE 合并到 EDGE_AGENT_CONFIG
- targets.csv 更新流程说明,ARCHITECTURE 图收窄

Made-with: Cursor
This commit is contained in:
2026-02-28 22:05:43 -05:00
parent 650e5145f1
commit ab1515dffb
48 changed files with 2071 additions and 509 deletions

View File

@@ -0,0 +1,22 @@
# Alertmanager configuration: one default route that sends every alert to a
# local webhook receiver, plus a rule that mutes warnings behind criticals.
global:
  # SMTP defaults for email receivers (no email receiver is defined below).
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alertmanager@example.com'

route:
  # Alerts sharing the same alertname are batched into one notification.
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/'

# Suppress warning-level alerts while a critical alert with identical
# alertname / dev / instance labels is firing.
# NOTE(review): source_match / target_match are deprecated in newer
# Alertmanager releases in favour of source_matchers / target_matchers —
# confirm the deployed version before migrating.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

View File

@@ -0,0 +1,76 @@
{
  "dashboard": {
    "id": null,
    "title": "ONVIF设备监控",
    "tags": ["onvif", "camera", "monitoring"],
    "style": "dark",
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "ONVIF设备状态",
        "type": "stat",
        "targets": [
          { "expr": "up{job=\"onvif-devices\"}", "refId": "A" }
        ],
        "fieldConfig": {
          "defaults": {
            "color": { "mode": "thresholds" },
            "thresholds": {
              "steps": [
                { "color": "red", "value": 0 },
                { "color": "green", "value": 1 }
              ]
            }
          }
        },
        "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }
      },
      {
        "id": 2,
        "title": "设备在线率",
        "type": "gauge",
        "targets": [
          {
            "expr": "sum(up{job=\"onvif-devices\"}) / count(up{job=\"onvif-devices\"}) * 100",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": { "min": 0, "max": 100, "unit": "percent" }
        },
        "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }
      }
    ],
    "time": { "from": "now-1h", "to": "now" },
    "refresh": "30s"
  }
}

View File

@@ -0,0 +1,12 @@
# Grafana dashboard provisioning: a single file-based provider named
# 'default' that loads dashboards for organization 1 from a directory on disk.
apiVersion: 1
providers:
  - name: 'default'
    orgId: 1
    # Empty folder name: dashboards are not placed in a named folder.
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /var/lib/grafana/dashboards

View File

@@ -0,0 +1,20 @@
# Global data source for administrators.
# It lets administrators view all data, without label-filter restrictions.
# Files placed under provisioning/datasources/ are loaded automatically.
apiVersion: 1
datasources:
  - name: Prometheus-All-Data
    type: prometheus
    access: proxy
    url: http://prometheus-central:9090
    isDefault: false
    # Not editable from the Grafana UI.
    editable: false
    jsonData:
      httpMethod: POST
      queryTimeout: 60s
      timeInterval: 15s
    # NOTE(review): the original note says this data source is visible to all
    # organizations (governed by permissions); no orgId is set here, so
    # confirm that this is how the deployment actually behaves.
    # Administrators can run queries without a label filter,
    # e.g. `up` instead of `up{user_group="xxx"}`.

View File

@@ -0,0 +1,9 @@
# Default Grafana data source: the central Prometheus server, queried through
# the Grafana backend (proxy access) and editable from the UI.
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus-central:9090
    isDefault: true
    editable: true

View File

@@ -0,0 +1,16 @@
# VictoriaMetrics data source (stores the data reported by edge nodes).
# Edge nodes push to the central VictoriaMetrics via remote_write; this data
# source is used to query that data from Grafana.
# Prerequisite: on each edge node, point remote_write at the central
# VictoriaMetrics address (e.g. http://<central-ip>:8428/api/v1/write).
apiVersion: 1
datasources:
  - name: VictoriaMetrics
    # Registered with the Prometheus data source type.
    type: prometheus
    access: proxy
    url: http://victoria-metrics:8428
    isDefault: false
    editable: true
    jsonData:
      httpMethod: POST
      queryTimeout: 60s
      timeInterval: 15s

View File

@@ -0,0 +1,164 @@
#!/bin/bash
# Grafana multi-user / organization setup script.
# Usage: ./setup-users.sh
#
# Environment overrides (defaults in parentheses):
#   GRAFANA_URL             base URL of the Grafana instance (http://localhost:3000)
#   GRAFANA_ADMIN_USER      admin login used for HTTP basic auth (admin)
#   GRAFANA_ADMIN_PASSWORD  admin password used for HTTP basic auth (admin123)
set -e

GRAFANA_URL="${GRAFANA_URL:-http://localhost:3000}"
GRAFANA_ADMIN_USER="${GRAFANA_ADMIN_USER:-admin}"
GRAFANA_ADMIN_PASSWORD="${GRAFANA_ADMIN_PASSWORD:-admin123}"

echo "=== Grafana 多用户配置脚本 ==="
echo ""

# jq is required: every API response below is parsed with it.
if ! command -v jq &> /dev/null; then
    echo "❌ jq 未安装,请先安装 jq:"
    echo " Ubuntu/Debian: sudo apt-get install jq"
    echo " CentOS/RHEL: sudo yum install jq"
    echo " Fedora: sudo dnf install jq"
    exit 1
fi

# Verify that Grafana is reachable before doing anything else.
# The former "/login" request was dropped: its response was never used (all
# API calls authenticate with basic auth), and because it ran before this
# check, `set -e` aborted the script on a connection failure before the
# friendly error below could be printed.
# -f makes curl fail on HTTP error statuses instead of reporting success.
if ! curl -sf "$GRAFANA_URL/api/health" > /dev/null; then
    echo "❌ 无法连接到 Grafana: $GRAFANA_URL"
    echo " 请确保 Grafana 服务正在运行"
    exit 1
fi
echo "✅ Grafana 连接成功"
echo ""
# Create a Grafana organization unless one with that name already exists.
# Arguments:
#   $1 - organization name
#   $2 - accepted for backward compatibility but unused (the ID is taken from
#        the API response)
# Reads the GRAFANA_URL / GRAFANA_ADMIN_USER / GRAFANA_ADMIN_PASSWORD globals
# and prints progress messages to stdout.
create_organization() {
    local org_name=$1
    local auth="$GRAFANA_ADMIN_USER:$GRAFANA_ADMIN_PASSWORD"
    local encoded_name existing_id payload response new_id

    echo "📁 创建组织: $org_name"

    # The name is placed in a URL path, so percent-encode it (names may
    # contain spaces or non-ASCII characters).
    encoded_name=$(jq -rn --arg v "$org_name" '$v | @uri')

    # Look the organization up by name; an empty result means "not found".
    existing_id=$(curl -s -u "$auth" \
        "$GRAFANA_URL/api/orgs/name/$encoded_name" | jq -r '.id // empty')
    if [ -n "$existing_id" ]; then
        echo " ⚠️ 组织 $org_name 已存在 (ID: $existing_id)"
        return
    fi

    # Build the request body with jq so quotes/backslashes in the name are
    # escaped correctly instead of breaking the JSON.
    payload=$(jq -cn --arg name "$org_name" '{name: $name}')
    response=$(curl -s -X POST \
        -u "$auth" \
        -H "Content-Type: application/json" \
        -d "$payload" \
        "$GRAFANA_URL/api/orgs")

    new_id=$(echo "$response" | jq -r '.orgId // empty')
    if [ -n "$new_id" ]; then
        echo " ✅ 组织创建成功 (ID: $new_id)"
    else
        echo " ❌ 组织创建失败: $response"
    fi
}
# Create a Grafana user inside an organization, or add an existing user to it.
# Arguments:
#   $1 - organization name (must already exist)
#   $2 - login / display name
#   $3 - password
#   $4 - email address
#   $5 - organization role (default: Viewer)
# Reads the GRAFANA_URL / GRAFANA_ADMIN_USER / GRAFANA_ADMIN_PASSWORD globals
# and prints progress messages to stdout.
create_user() {
    local org_name=$1
    local username=$2
    local password=$3
    local email=$4
    local role=${5:-Viewer}
    local auth="$GRAFANA_ADMIN_USER:$GRAFANA_ADMIN_PASSWORD"
    local encoded_org org_id user_id payload response

    echo "👤 创建用户: $username (组织: $org_name)"

    # Resolve the organization ID from its (percent-encoded) name.
    encoded_org=$(jq -rn --arg v "$org_name" '$v | @uri')
    org_id=$(curl -s -u "$auth" \
        "$GRAFANA_URL/api/orgs/name/$encoded_org" | jq -r '.id // empty')
    if [ -z "$org_id" ]; then
        echo " ❌ 组织 $org_name 不存在"
        return
    fi

    # Switch the admin's active organization to the target one.
    curl -s -X POST \
        -u "$auth" \
        "$GRAFANA_URL/api/user/using/$org_id" > /dev/null

    # Check whether the user already exists. -G with --data-urlencode keeps
    # characters such as '+' or '&' in the email from corrupting the query.
    user_id=$(curl -s -u "$auth" -G \
        --data-urlencode "loginOrEmail=$email" \
        "$GRAFANA_URL/api/users/lookup" | jq -r '.id // empty')
    if [ -n "$user_id" ]; then
        echo " ⚠️ 用户 $username 已存在"
        # Add the existing user to the organization with the requested role.
        payload=$(jq -cn --arg login "$email" --arg role "$role" \
            '{loginOrEmail: $login, role: $role}')
        curl -s -X POST \
            -u "$auth" \
            -H "Content-Type: application/json" \
            -d "$payload" \
            "$GRAFANA_URL/api/orgs/$org_id/users" > /dev/null
        echo " ✅ 用户已添加到组织"
        return
    fi

    # Create the user. The body is built with jq so special characters in any
    # field are escaped correctly.
    payload=$(jq -cn \
        --arg name "$username" \
        --arg email "$email" \
        --arg password "$password" \
        --argjson org "$org_id" \
        '{name: $name, email: $email, login: $name, password: $password, OrgId: $org}')
    response=$(curl -s -X POST \
        -u "$auth" \
        -H "Content-Type: application/json" \
        -d "$payload" \
        "$GRAFANA_URL/api/admin/users")

    user_id=$(echo "$response" | jq -r '.id // empty')
    if [ -n "$user_id" ]; then
        echo " ✅ 用户创建成功 (ID: $user_id)"
        # The create request carries no role, so without this call the role
        # argument would be ignored for newly created users.
        curl -s -X PATCH \
            -u "$auth" \
            -H "Content-Type: application/json" \
            -d "$(jq -cn --arg role "$role" '{role: $role}')" \
            "$GRAFANA_URL/api/orgs/$org_id/users/$user_id" > /dev/null
    else
        echo " ❌ 用户创建失败: $response"
    fi
}
# Example run: create the sample organizations and users, then print a summary.
printf '%s\n\n' "📝 开始创建组织和用户..."

# Sample organizations, written as "name:id".
for org_spec in "用户组A:2" "用户组B:3"; do
    create_organization "${org_spec%%:*}" "${org_spec##*:}"
done

# Sample users, written as "organization login role"; every sample account
# uses the same password and a <login>@example.com address.
sample_users=(
    "用户组A usera1 Viewer"
    "用户组A usera2 Editor"
    "用户组B userb1 Viewer"
    "用户组B userb2 Editor"
)
for user_spec in "${sample_users[@]}"; do
    read -r sample_org sample_login sample_role <<< "$user_spec"
    create_user "$sample_org" "$sample_login" "password123" \
        "${sample_login}@example.com" "$sample_role"
done

# Summary and next steps (quoted here-doc: printed verbatim, no expansion).
cat <<'EOF'

✅ 用户配置完成!

📋 创建的用户:
 用户组A:
 - usera1 (Viewer) - usera1@example.com / password123
 - usera2 (Editor) - usera2@example.com / password123
 用户组B:
 - userb1 (Viewer) - userb1@example.com / password123
 - userb2 (Editor) - userb2@example.com / password123

💡 下一步:
 1. 登录 Grafana 为每个组织配置数据源
 2. 创建组织专用的仪表板
 3. 配置数据源标签过滤(通过 Prometheus 标签)

EOF

View File

@@ -0,0 +1,49 @@
# Prometheus alerting rules: ONVIF device alerts and network probe alerts.
groups:
  - name: onvif_alerts
    rules:
      # Fires when an ONVIF scrape target has been down for one minute.
      - alert: ONVIFDeviceDown
        expr: up{job="onvif-devices"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "ONVIF设备离线"
          description: "ONVIF设备 {{ $labels.instance }} 已离线超过1分钟"

      # Fires when the reported device temperature stays above 70.
      - alert: ONVIFDeviceHighTemperature
        expr: onvif_device_temperature > 70
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "ONVIF设备温度过高"
          description: "设备 {{ $labels.instance }} 温度达到 {{ $value }}°C"

      # Fires when storage usage stays above 90 percent.
      - alert: ONVIFDeviceLowStorage
        expr: onvif_storage_usage_percent > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "ONVIF设备存储空间不足"
          description: "设备 {{ $labels.instance }} 存储使用率达到 {{ $value }}%"

  - name: network_alerts
    rules:
      # Fires when the network-ping probe has failed for two minutes.
      - alert: NetworkDeviceDown
        expr: probe_success{job="network-ping"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "网络设备离线"
          description: "网络设备 {{ $labels.instance }} 无法ping通"

      # Fires when the probe duration stays above one second.
      - alert: HighNetworkLatency
        expr: probe_duration_seconds{job="network-ping"} > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "网络延迟过高"
          description: "设备 {{ $labels.instance }} 延迟达到 {{ $value }}秒"

View File

@@ -0,0 +1,67 @@
# Prometheus central server configuration template
# ============================================
# Notes:
#   - This file is a configuration template containing shell-style ${...}
#     placeholders.
#   - At deploy time, deploy.sh reads variables from .env and uses envsubst
#     to generate prometheus.yml.
#   - Do not edit prometheus.yml directly; make changes in the template or
#     in .env.
#
# Variable source: central-server/.env (see env.example); this file lives in
# config/prometheus/.
# Variables involved: PROMETHEUS_SCRAPE_INTERVAL, PROMETHEUS_EVALUATION_INTERVAL,
#   PROMETHEUS_CLUSTER_NAME, VICTORIAMETRICS_PORT,
#   PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES, PROMETHEUS_REMOTE_WRITE_CAPACITY,
#   PROMETHEUS_REMOTE_WRITE_MAX_SHARDS
# NOTE(review): the values below are literals, so this appears to be the
# rendered output carrying the template's header — confirm.
# ============================================
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'central-monitoring'

# Remote write: push the local service metrics scraped by the central
# Prometheus to VictoriaMetrics.
# (Edge nodes remote_write to VictoriaMetrics directly via vmagent.)
remote_write:
  - url: http://victoria-metrics:8428/api/v1/write
    queue_config:
      max_samples_per_send: 10000
      capacity: 20000
      max_shards: 10

# Scrape configuration: only the central host's own Docker containers
# (Prometheus / VictoriaMetrics / Alertmanager / Grafana).
scrape_configs:
  # Scrape the central Prometheus itself.
  - job_name: 'prometheus-central'
    scrape_interval: 15s
    static_configs:
      - targets: ['prometheus-central:9090']

  # Scrape VictoriaMetrics (it exposes a /metrics endpoint).
  - job_name: 'victoria-metrics'
    scrape_interval: 15s
    metrics_path: '/metrics'
    static_configs:
      - targets: ['victoria-metrics:8428']

  # Scrape Alertmanager.
  - job_name: 'alertmanager'
    scrape_interval: 15s
    static_configs:
      - targets: ['alertmanager:9093']

  # Scrape Grafana (requires Grafana's metrics feature to be enabled).
  - job_name: 'grafana'
    scrape_interval: 15s
    metrics_path: '/metrics'
    static_configs:
      - targets: ['grafana:3000']

# Alert rules: alert_rules.yml lives in the same directory as prometheus.yml.
rule_files:
  - "alert_rules.yml"

# Alertmanager: alert routing and silencing.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

View File

@@ -0,0 +1,67 @@
# Prometheus central server configuration template
# ============================================
# Notes:
#   - This file is a configuration template containing shell-style ${...}
#     placeholders.
#   - At deploy time, deploy.sh reads variables from .env and uses envsubst
#     to generate prometheus.yml.
#   - Do not edit prometheus.yml directly; make changes in this template or
#     in .env.
#
# Variable source: central-server/.env (see env.example); this file lives in
# config/prometheus/.
# Variables involved: PROMETHEUS_SCRAPE_INTERVAL, PROMETHEUS_EVALUATION_INTERVAL,
#   PROMETHEUS_CLUSTER_NAME, VICTORIAMETRICS_PORT,
#   PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES, PROMETHEUS_REMOTE_WRITE_CAPACITY,
#   PROMETHEUS_REMOTE_WRITE_MAX_SHARDS
# ============================================
global:
  # Interval variables hold a bare number; the "s" unit suffix is appended here.
  scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s
  evaluation_interval: ${PROMETHEUS_EVALUATION_INTERVAL}s
  external_labels:
    cluster: '${PROMETHEUS_CLUSTER_NAME}'

# Remote write: push the local service metrics scraped by the central
# Prometheus to VictoriaMetrics.
# (Edge nodes remote_write to VictoriaMetrics directly via vmagent.)
remote_write:
  - url: http://victoria-metrics:${VICTORIAMETRICS_PORT}/api/v1/write
    queue_config:
      max_samples_per_send: ${PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES}
      capacity: ${PROMETHEUS_REMOTE_WRITE_CAPACITY}
      max_shards: ${PROMETHEUS_REMOTE_WRITE_MAX_SHARDS}

# Scrape configuration: only the central host's own Docker containers
# (Prometheus / VictoriaMetrics / Alertmanager / Grafana).
scrape_configs:
  # Scrape the central Prometheus itself.
  - job_name: 'prometheus-central'
    scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s
    static_configs:
      - targets: ['prometheus-central:9090']

  # Scrape VictoriaMetrics (it exposes a /metrics endpoint).
  - job_name: 'victoria-metrics'
    scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s
    metrics_path: '/metrics'
    static_configs:
      - targets: ['victoria-metrics:${VICTORIAMETRICS_PORT}']

  # Scrape Alertmanager.
  - job_name: 'alertmanager'
    scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s
    static_configs:
      - targets: ['alertmanager:9093']

  # Scrape Grafana (requires Grafana's metrics feature to be enabled).
  - job_name: 'grafana'
    scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s
    metrics_path: '/metrics'
    static_configs:
      - targets: ['grafana:3000']

# Alert rules: alert_rules.yml lives in the same directory as prometheus.yml.
rule_files:
  - "alert_rules.yml"

# Alertmanager: alert routing and silencing.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093