refactor: config/apps 目录重组、文档重构、架构图收窄
- 中央:config/(prometheus,alertmanager,grafana)、apps/(tile-cache,topology-editor)
- 边缘:config/(vmagent,blackbox,targets)、apps/(onvif-exporter)
- env: TRAEFIK_PROVIDER、prometheus/env.example 详细说明
- 文档:README/doc 重构,EDGE_CACHE 合并到 EDGE_AGENT_CONFIG
- targets.csv 更新流程说明,ARCHITECTURE 图收窄

Made-with: Cursor
This commit is contained in:
22
central-server/config/alertmanager/alertmanager.yml
Normal file
22
central-server/config/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,22 @@
|
||||
# Alertmanager configuration for the central server.

global:
  # SMTP settings for email receivers. No email receiver is defined in this
  # file, so these values are placeholders until one is added.
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alertmanager@example.com'

route:
  # Alerts that share an alertname are batched into a single notification.
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      # NOTE(review): 127.0.0.1 resolves to the Alertmanager process's own
      # host/container; confirm a webhook listener is actually reachable there.
      - url: 'http://127.0.0.1:5001/'

inhibit_rules:
  # While a critical alert is firing, mute warning alerts that carry the same
  # alertname, dev and instance labels.
  # `source_match` / `target_match` are deprecated; the matcher-list form
  # below is the supported replacement (Alertmanager >= 0.22).
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal: ['alertname', 'dev', 'instance']
|
||||
@@ -0,0 +1,76 @@
|
||||
{
  "id": null,
  "title": "ONVIF设备监控",
  "tags": ["onvif", "camera", "monitoring"],
  "style": "dark",
  "timezone": "browser",
  "panels": [
    {
      "id": 1,
      "title": "ONVIF设备状态",
      "type": "stat",
      "targets": [
        {
          "expr": "up{job=\"onvif-devices\"}",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "thresholds": {
            "steps": [
              {
                "color": "red",
                "value": 0
              },
              {
                "color": "green",
                "value": 1
              }
            ]
          }
        }
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 0
      }
    },
    {
      "id": 2,
      "title": "设备在线率",
      "type": "gauge",
      "targets": [
        {
          "expr": "sum(up{job=\"onvif-devices\"}) / count(up{job=\"onvif-devices\"}) * 100",
          "refId": "A"
        }
      ],
      "fieldConfig": {
        "defaults": {
          "min": 0,
          "max": 100,
          "unit": "percent"
        }
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 0
      }
    }
  ],
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "refresh": "30s"
}
|
||||
@@ -0,0 +1,12 @@
|
||||
# Grafana dashboard provisioning: load dashboard JSON files from disk.
apiVersion: 1

providers:
  - name: default
    orgId: 1
    # Empty folder name: dashboards are not placed in a named folder.
    folder: ""
    type: file
    disableDeletion: false
    # Re-scan the directory for changed files every 10 seconds.
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: "/var/lib/grafana/dashboards"
|
||||
@@ -0,0 +1,20 @@
|
||||
# Administrator-wide data source.
# Lets administrators query all data without any label filter applied.
# Files placed under provisioning/datasources/ are loaded automatically.

apiVersion: 1

datasources:
  - name: "Prometheus-All-Data"
    type: "prometheus"
    access: "proxy"
    url: "http://prometheus-central:9090"
    isDefault: false
    editable: false
    jsonData:
      httpMethod: "POST"
      queryTimeout: "60s"
      timeInterval: "15s"

# Intended to be visible to every organization (subject to permissions).
# NOTE(review): no orgId is set on this entry; confirm which organization(s)
# Grafana actually provisions it into.
# Administrators can run queries without a label filter to see all data,
# e.g. `up` instead of `up{user_group="xxx"}`.
|
||||
@@ -0,0 +1,9 @@
|
||||
# Default Grafana data source: the central Prometheus instance.
apiVersion: 1

datasources:
  - name: "Prometheus"
    type: "prometheus"
    # "proxy": queries are sent through the Grafana backend.
    access: "proxy"
    url: "http://prometheus-central:9090"
    isDefault: true
    editable: true
|
||||
@@ -0,0 +1,16 @@
|
||||
# VictoriaMetrics data source (stores the data reported by edge nodes).
# Edge nodes push to the central VictoriaMetrics via remote_write; this data
# source is used to query that data from Grafana.
# Prerequisite on each edge node: point remote_write at the central
# VictoriaMetrics address (e.g. http://<central-ip>:8428/api/v1/write).
apiVersion: 1

datasources:
  - name: "VictoriaMetrics"
    # Queried through Grafana's Prometheus data source type.
    type: "prometheus"
    access: "proxy"
    url: "http://victoria-metrics:8428"
    isDefault: false
    editable: true
    jsonData:
      httpMethod: "POST"
      queryTimeout: "60s"
      timeInterval: "15s"
|
||||
164
central-server/config/grafana/setup-users.sh
Normal file
164
central-server/config/grafana/setup-users.sh
Normal file
@@ -0,0 +1,164 @@
|
||||
#!/bin/bash

# Grafana multi-user / multi-organization setup script.
# Usage: ./setup-users.sh
#
# Environment overrides:
#   GRAFANA_URL             Grafana base URL  (default: http://localhost:3000)
#   GRAFANA_ADMIN_USER      admin login       (default: admin)
#   GRAFANA_ADMIN_PASSWORD  admin password    (default: admin123)

set -e

GRAFANA_URL="${GRAFANA_URL:-http://localhost:3000}"
GRAFANA_ADMIN_USER="${GRAFANA_ADMIN_USER:-admin}"
GRAFANA_ADMIN_PASSWORD="${GRAFANA_ADMIN_PASSWORD:-admin123}"

echo "=== Grafana 多用户配置脚本 ==="
echo ""

# jq is required to parse the API responses below.
if ! command -v jq &> /dev/null; then
    echo "❌ jq 未安装,请先安装 jq:"
    echo " Ubuntu/Debian: sudo apt-get install jq"
    echo " CentOS/RHEL: sudo yum install jq"
    echo " Fedora: sudo dnf install jq"
    exit 1
fi

# Verify Grafana is reachable before doing anything else.
# The former POST to /login was removed: its response was never used (every
# API call authenticates with basic auth), and under `set -e` a connection
# failure there aborted the script before this check could print its message.
# -f makes curl fail on HTTP error statuses so an unhealthy Grafana is caught.
if ! curl -sf "$GRAFANA_URL/api/health" > /dev/null; then
    echo "❌ 无法连接到 Grafana: $GRAFANA_URL"
    echo " 请确保 Grafana 服务正在运行"
    exit 1
fi

echo "✅ Grafana 连接成功"
echo ""
|
||||
|
||||
# Create a Grafana organization unless one with the same name already exists.
# Arguments:
#   $1  organization name
#   $2  accepted for backward compatibility and ignored (Grafana assigns the
#       organization id itself)
# Prints the outcome; always returns 0 so one failure does not abort the run.
create_organization() {
    local org_name=$1
    local encoded_name existing_id payload response new_id

    echo "📁 创建组织: $org_name"

    # Percent-encode the name: it is placed in a URL path and may contain
    # non-ASCII characters or spaces.
    encoded_name=$(jq -rn --arg v "$org_name" '$v | @uri')

    # Skip creation when the organization already exists.
    existing_id=$(curl -s -u "$GRAFANA_ADMIN_USER:$GRAFANA_ADMIN_PASSWORD" \
        "$GRAFANA_URL/api/orgs/name/$encoded_name" | jq -r '.id // empty')

    if [ -n "$existing_id" ]; then
        echo " ⚠️ 组织 $org_name 已存在 (ID: $existing_id)"
        return
    fi

    # Build the JSON body with jq so quotes/backslashes in the name are escaped.
    payload=$(jq -n --arg name "$org_name" '{name: $name}')

    response=$(curl -s -X POST \
        -u "$GRAFANA_ADMIN_USER:$GRAFANA_ADMIN_PASSWORD" \
        -H "Content-Type: application/json" \
        -d "$payload" \
        "$GRAFANA_URL/api/orgs")

    new_id=$(echo "$response" | jq -r '.orgId // empty')

    if [ -n "$new_id" ]; then
        echo " ✅ 组织创建成功 (ID: $new_id)"
    else
        echo " ❌ 组织创建失败: $response"
    fi
}
|
||||
|
||||
# Create a Grafana user inside an organization, or add an existing user to it.
# Arguments:
#   $1  organization name (must already exist)
#   $2  login / display name
#   $3  password
#   $4  email
#   $5  organization role (default: Viewer)
# Prints the outcome; always returns 0 so one failure does not abort the run.
create_user() {
    local org_name=$1
    local username=$2
    local password=$3
    local email=$4
    local role=${5:-Viewer}
    local auth="$GRAFANA_ADMIN_USER:$GRAFANA_ADMIN_PASSWORD"
    local encoded_org org_id user_id payload response

    echo "👤 创建用户: $username (组织: $org_name)"

    # Resolve the organization id; the name is percent-encoded because it is
    # placed in a URL path and may contain non-ASCII characters or spaces.
    encoded_org=$(jq -rn --arg v "$org_name" '$v | @uri')
    org_id=$(curl -s -u "$auth" \
        "$GRAFANA_URL/api/orgs/name/$encoded_org" | jq -r '.id // empty')

    if [ -z "$org_id" ]; then
        echo " ❌ 组织 $org_name 不存在"
        return
    fi

    # Switch the admin's active organization to the target one.
    curl -s -X POST \
        -u "$auth" \
        "$GRAFANA_URL/api/user/using/$org_id" > /dev/null

    # Look the user up by email; --data-urlencode keeps characters such as
    # '+' in the address from being mangled in the query string.
    user_id=$(curl -s -u "$auth" -G \
        --data-urlencode "loginOrEmail=$email" \
        "$GRAFANA_URL/api/users/lookup" | jq -r '.id // empty')

    if [ -n "$user_id" ]; then
        echo " ⚠️ 用户 $username 已存在"
        # Existing account: just attach it to the organization with the role.
        payload=$(jq -n --arg login "$email" --arg role "$role" \
            '{loginOrEmail: $login, role: $role}')
        curl -s -X POST \
            -u "$auth" \
            -H "Content-Type: application/json" \
            -d "$payload" \
            "$GRAFANA_URL/api/orgs/$org_id/users" > /dev/null
        echo " ✅ 用户已添加到组织"
        return
    fi

    # Build the JSON body with jq so special characters in any field
    # (notably the password) are escaped correctly.
    payload=$(jq -n \
        --arg name "$username" \
        --arg email "$email" \
        --arg password "$password" \
        --argjson org "$org_id" \
        '{name: $name, email: $email, login: $name, password: $password, OrgId: $org}')

    response=$(curl -s -X POST \
        -u "$auth" \
        -H "Content-Type: application/json" \
        -d "$payload" \
        "$GRAFANA_URL/api/admin/users")

    user_id=$(echo "$response" | jq -r '.id // empty')

    if [ -z "$user_id" ]; then
        echo " ❌ 用户创建失败: $response"
        return
    fi

    echo " ✅ 用户创建成功 (ID: $user_id)"

    # The create-user request carries no role, so the requested role was
    # previously dropped for new users. Set it explicitly on the membership.
    curl -s -X PATCH \
        -u "$auth" \
        -H "Content-Type: application/json" \
        -d "$(jq -n --arg role "$role" '{role: $role}')" \
        "$GRAFANA_URL/api/orgs/$org_id/users/$user_id" > /dev/null
}
|
||||
|
||||
# Example run: create the sample organizations and users.
echo "📝 开始创建组织和用户..."
echo ""

# Sample organizations.
create_organization "用户组A" 2
create_organization "用户组B" 3

# Sample users: organization, login, password, email, role.
create_user "用户组A" "usera1" "password123" "usera1@example.com" "Viewer"
create_user "用户组A" "usera2" "password123" "usera2@example.com" "Editor"
create_user "用户组B" "userb1" "password123" "userb1@example.com" "Viewer"
create_user "用户组B" "userb2" "password123" "userb2@example.com" "Editor"

# Summary and next steps, printed verbatim (quoted heredoc: no expansion).
cat <<'EOF'

✅ 用户配置完成!

📋 创建的用户:
 用户组A:
 - usera1 (Viewer) - usera1@example.com / password123
 - usera2 (Editor) - usera2@example.com / password123
 用户组B:
 - userb1 (Viewer) - userb1@example.com / password123
 - userb2 (Editor) - userb2@example.com / password123

💡 下一步:
 1. 登录 Grafana 为每个组织配置数据源
 2. 创建组织专用的仪表板
 3. 配置数据源标签过滤(通过 Prometheus 标签)

EOF
|
||||
49
central-server/config/prometheus/alert_rules.yml
Normal file
49
central-server/config/prometheus/alert_rules.yml
Normal file
@@ -0,0 +1,49 @@
|
||||
# Prometheus alerting rules for ONVIF devices and network probes.
# NOTE(review): these rules select jobs "onvif-devices" and "network-ping";
# confirm the Prometheus instance evaluating them actually holds those series
# (the neighbouring prometheus.yml only scrapes the central services).
groups:
  # ONVIF camera/device health.
  - name: onvif_alerts
    rules:
      # Device scrape target has been down for one minute.
      - alert: ONVIFDeviceDown
        expr: 'up{job="onvif-devices"} == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'ONVIF设备离线'
          description: 'ONVIF设备 {{ $labels.instance }} 已离线超过1分钟'

      # Reported temperature above 70 for two minutes.
      - alert: ONVIFDeviceHighTemperature
        expr: 'onvif_device_temperature > 70'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'ONVIF设备温度过高'
          description: '设备 {{ $labels.instance }} 温度达到 {{ $value }}°C'

      # Storage usage above 90 percent for five minutes.
      - alert: ONVIFDeviceLowStorage
        expr: 'onvif_storage_usage_percent > 90'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'ONVIF设备存储空间不足'
          description: '设备 {{ $labels.instance }} 存储使用率达到 {{ $value }}%'

  # Network reachability and latency (probe metrics).
  - name: network_alerts
    rules:
      # Ping probe has been failing for two minutes.
      - alert: NetworkDeviceDown
        expr: 'probe_success{job="network-ping"} == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: '网络设备离线'
          description: '网络设备 {{ $labels.instance }} 无法ping通'

      # Probe duration above one second for five minutes.
      - alert: HighNetworkLatency
        expr: 'probe_duration_seconds{job="network-ping"} > 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: '网络延迟过高'
          description: '设备 {{ $labels.instance }} 延迟达到 {{ $value }}秒'
|
||||
67
central-server/config/prometheus/prometheus.yml
Normal file
67
central-server/config/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,67 @@
|
||||
# Prometheus central-server configuration.
# ============================================
# Notes:
#   - The header carried over from the template states that deploy.sh reads
#     variables from .env and renders prometheus.yml with envsubst.
#   - Do not edit prometheus.yml directly; change the template or .env instead.
#
# Variable source: central-server/.env (see env.example); this file lives in
# config/prometheus/.
# Variables involved: PROMETHEUS_SCRAPE_INTERVAL, PROMETHEUS_EVALUATION_INTERVAL,
#   PROMETHEUS_CLUSTER_NAME, VICTORIAMETRICS_PORT,
#   PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES, PROMETHEUS_REMOTE_WRITE_CAPACITY,
#   PROMETHEUS_REMOTE_WRITE_MAX_SHARDS
# ============================================

global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: "central-monitoring"

# Remote write: push the local-service metrics scraped by the central
# Prometheus to VictoriaMetrics.
# (Edge nodes remote_write to VictoriaMetrics directly via vmagent.)
remote_write:
  - url: "http://victoria-metrics:8428/api/v1/write"
    queue_config:
      max_samples_per_send: 10000
      capacity: 20000
      max_shards: 10

# Scrape configuration: only the central host's Docker containers
# (Prometheus / VictoriaMetrics / Alertmanager / Grafana).
scrape_configs:
  # Central Prometheus itself.
  - job_name: "prometheus-central"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "prometheus-central:9090"

  # VictoriaMetrics (exposes a /metrics endpoint).
  - job_name: "victoria-metrics"
    scrape_interval: 15s
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "victoria-metrics:8428"

  # Alertmanager.
  - job_name: "alertmanager"
    scrape_interval: 15s
    static_configs:
      - targets:
          - "alertmanager:9093"

  # Grafana (its metrics feature must be enabled).
  - job_name: "grafana"
    scrape_interval: 15s
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "grafana:3000"

# Alert rules: alert_rules.yml sits in the same directory as prometheus.yml.
rule_files:
  - "alert_rules.yml"

# Alertmanager: alert routing and silencing.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"
|
||||
67
central-server/config/prometheus/prometheus.yml.template
Normal file
67
central-server/config/prometheus/prometheus.yml.template
Normal file
@@ -0,0 +1,67 @@
|
||||
# Prometheus central-server configuration template.
# ============================================
# Notes:
#   - This file is a template containing shell-style variable placeholders.
#   - At deploy time deploy.sh reads variables from .env and renders
#     prometheus.yml with envsubst.
#   - Do not edit prometheus.yml directly; change this template or .env.
#   - Keep dollar-sign variable syntax out of comments: envsubst would
#     substitute it as well.
#
# Variable source: central-server/.env (see env.example); this file lives in
# config/prometheus/.
# Variables involved: PROMETHEUS_SCRAPE_INTERVAL, PROMETHEUS_EVALUATION_INTERVAL,
#   PROMETHEUS_CLUSTER_NAME, VICTORIAMETRICS_PORT,
#   PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES, PROMETHEUS_REMOTE_WRITE_CAPACITY,
#   PROMETHEUS_REMOTE_WRITE_MAX_SHARDS
# ============================================

global:
  # The interval variables hold a bare number; the "s" unit is appended here.
  scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s
  evaluation_interval: ${PROMETHEUS_EVALUATION_INTERVAL}s
  external_labels:
    cluster: "${PROMETHEUS_CLUSTER_NAME}"

# Remote write: push the local-service metrics scraped by the central
# Prometheus to VictoriaMetrics.
# (Edge nodes remote_write to VictoriaMetrics directly via vmagent.)
remote_write:
  - url: "http://victoria-metrics:${VICTORIAMETRICS_PORT}/api/v1/write"
    queue_config:
      max_samples_per_send: ${PROMETHEUS_REMOTE_WRITE_MAX_SAMPLES}
      capacity: ${PROMETHEUS_REMOTE_WRITE_CAPACITY}
      max_shards: ${PROMETHEUS_REMOTE_WRITE_MAX_SHARDS}

# Scrape configuration: only the central host's Docker containers
# (Prometheus / VictoriaMetrics / Alertmanager / Grafana).
scrape_configs:
  # Central Prometheus itself.
  - job_name: "prometheus-central"
    scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s
    static_configs:
      - targets:
          - "prometheus-central:9090"

  # VictoriaMetrics (exposes a /metrics endpoint).
  - job_name: "victoria-metrics"
    scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "victoria-metrics:${VICTORIAMETRICS_PORT}"

  # Alertmanager.
  - job_name: "alertmanager"
    scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s
    static_configs:
      - targets:
          - "alertmanager:9093"

  # Grafana (its metrics feature must be enabled).
  - job_name: "grafana"
    scrape_interval: ${PROMETHEUS_SCRAPE_INTERVAL}s
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "grafana:3000"

# Alert rules: alert_rules.yml sits in the same directory as prometheus.yml.
rule_files:
  - "alert_rules.yml"

# Alertmanager: alert routing and silencing.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"
|
||||
Reference in New Issue
Block a user