Files
2026-03-29 09:08:01 +08:00

305 lines
11 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Verification entry point. ansible/playbooks/verify/<doc_id>.yml is the single
# source of truth for what gets executed:
# - run <XX-YY>: run the verify playbook of a single document
# - run-all: run every <doc_id>.yml present in the verify directory, in order
#   (execution domain only: XX>0 && YY>0)
# - full: preflight + run-all
set -euo pipefail
# Repository root, resolved two directories above the directory of this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# shellcheck disable=SC1091
source "${ROOT}/ansible/lib/lib-ansible-lab.sh"
ansible_lab_export_config
# VERIFY_TEARDOWN is deliberately NOT defaulted here. load_env (the first step
# of main) distinguishes "set by the caller" from "unset" so that a value from
# ansible/env/.env.verify can apply when the caller did not set one; defaulting
# the variable at this point made it look caller-provided every time and the
# env file value was always discarded. load_env applies the final default of 1,
# and every reader in this file uses ${VERIFY_TEARDOWN:-1}.
STATUS_DIR="${ROOT}/.status"
TEARDOWN_STATE_JSON="${STATUS_DIR}/verify-teardown-state.json"
#######################################
# Prepare the environment used by every sub-command.
# Globals:   ROOT (read); ANSIBLE_CONFIG, VERIFY_TEARDOWN (exported);
#            any variable assigned by ansible/env/.env.verify (exported)
# Outputs:   status lines on stdout
# Precedence for VERIFY_TEARDOWN: value already present in the environment
# when this function starts, then the value from .env.verify, then 1.
#######################################
load_env() {
  local dotenv_path="${ROOT}/ansible/env/.env.verify"
  # Remember whether VERIFY_TEARDOWN was set before the env file is sourced.
  local caller_teardown="${VERIFY_TEARDOWN-__unset__}"

  export ANSIBLE_CONFIG="${ANSIBLE_CONFIG:-${ROOT}/ansible/ansible.cfg}"

  if [[ ! -f "${dotenv_path}" ]]; then
    echo "[TIP] 未发现 ansible/env/.env.verify按默认变量继续"
  else
    # allexport: every assignment made by the env file becomes exported.
    set -a
    # shellcheck disable=SC1091
    source "${dotenv_path}"
    set +a
    echo "[OK] 已加载 ansible/env/.env.verify"
  fi

  if [[ "${caller_teardown}" == "__unset__" ]]; then
    export VERIFY_TEARDOWN="${VERIFY_TEARDOWN:-1}"
  else
    # A pre-existing value wins over whatever the env file assigned.
    export VERIFY_TEARDOWN="${caller_teardown}"
  fi

  echo "[INFO] ANSIBLE_CONFIG=${ANSIBLE_CONFIG}"
}
#######################################
# Persist the teardown mode of the current run so that a later run can detect
# that a previous one kept its resources.
# Globals:   STATUS_DIR, TEARDOWN_STATE_JSON, VERIFY_TEARDOWN (read)
# Outputs:   overwrites TEARDOWN_STATE_JSON with one JSON object:
#            {"updated_at":"<UTC timestamp>","verify_teardown":<value>}
#######################################
record_teardown_state() {
  mkdir -p "${STATUS_DIR}"
  local td="${VERIFY_TEARDOWN:-1}"
  # JSON numbers and the literals true/false/null are written as-is (same bytes
  # as before). Anything else used to produce an invalid JSON document, so it
  # is now written as a quoted string with backslashes and quotes escaped.
  local json_scalar_re='^(-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)?|true|false|null)$'
  local td_json
  if [[ "${td}" =~ ${json_scalar_re} ]]; then
    td_json="${td}"
  else
    td_json="${td//\\/\\\\}"
    td_json="\"${td_json//\"/\\\"}\""
  fi
  local ts
  ts="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
  printf '{"updated_at":"%s","verify_teardown":%s}\n' "${ts}" "${td_json}" \
    > "${TEARDOWN_STATE_JSON}"
}
#######################################
# Warn about teardown modes that can leave residue in the cluster.
# Globals:   VERIFY_TEARDOWN, TEARDOWN_STATE_JSON (read)
# Outputs:   human-readable warnings on stderr, machine-readable [OC] lines on
#            stdout
# Two independent checks:
#   1. the current run has VERIFY_TEARDOWN=0 (resources are kept);
#   2. the current run has VERIFY_TEARDOWN=1 but the state file written by the
#      previous run recorded verify_teardown=0.
#######################################
warn_teardown_mode() {
  local td="${VERIFY_TEARDOWN:-1}"
  if [[ "${td}" == "0" ]]; then
    echo "[WARN] VERIFY_TEARDOWN=0保留现场模式已启用可能污染后续 full/run-all" >&2
    echo "[TIP] 恢复建议:切回 VERIFY_TEARDOWN=1 并复跑主线;必要时手工清理残留命名空间/资源" >&2
    echo "[OC] doc_id=preflight result=verified phase=preflight assertion=teardown_mode verify_teardown=0"
  fi
  if [[ -f "${TEARDOWN_STATE_JSON}" && "${td}" == "1" ]]; then
    local last_td=""
    # Best effort: an unreadable or malformed state file (or a missing python3)
    # yields an empty value and no warning. The heredoc body must start at
    # column 0 and keep its Python indentation, otherwise the interpreter
    # rejects it and the check is silently skipped.
    last_td="$(TEARDOWN_STATE_JSON="${TEARDOWN_STATE_JSON}" python3 - <<'PY' 2>/dev/null || true
import json, os, pathlib
p = pathlib.Path(os.environ["TEARDOWN_STATE_JSON"])
try:
    d = json.loads(p.read_text(encoding="utf-8"))
    print(d.get("verify_teardown", ""))
except Exception:
    pass
PY
)"
    if [[ "${last_td}" == "0" ]]; then
      echo "[WARN] 检测到上次验证使用 VERIFY_TEARDOWN=0当前虽为 1但可能存在残留污染" >&2
      echo "[TIP] 建议VERIFY_TEARDOWN=1 ./ansible/bin/verify.sh full或 run-all以清理并回归" >&2
      echo "[OC] doc_id=preflight result=verified phase=preflight assertion=teardown_state last_verify_teardown=0 current_verify_teardown=1"
    fi
  fi
}
# Executable doc ids look like XX-YY where both parts are two digits and
# neither part is "00".
DOC_ID_EXEC_RE='^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])$'

#######################################
# Tell whether a doc id belongs to the executable domain.
# Globals:   DOC_ID_EXEC_RE (read)
# Arguments: $1 - candidate doc id
# Returns:   0 when the id matches DOC_ID_EXEC_RE, 1 otherwise
#######################################
is_exec_doc_id() {
  local candidate="$1"
  if [[ "${candidate}" =~ ${DOC_ID_EXEC_RE} ]]; then
    return 0
  fi
  return 1
}
#######################################
# Print the executable doc ids that have a verify playbook, one per line,
# sorted and de-duplicated.
# Globals:   ROOT (read)
# Arguments: $1 - series filter: keep only ids starting with "<series>-" (optional)
#            $2 - Python regular expression searched in the id (optional)
#            $3 - 1 to drop playbooks whose text contains "noop verify"
#            $4 - 1 to keep only playbooks whose text mentions VERIFY_TEARDOWN
#                 or verify_teardown
# Outputs:   doc ids on stdout; error messages on stderr
# Returns:   exit status of python3; 1 when the verify directory is missing
#            or the regular expression does not compile
#######################################
list_doc_ids_from_verify_dir() {
  local series="${1:-}"
  local id_regex="${2:-}"
  local exclude_noop="${3:-0}"
  local require_teardown="${4:-0}"
  # The heredoc body is Python: it must start at column 0 and keep its own
  # indentation regardless of how the surrounding shell code is indented.
  ROOT="${ROOT}" SERIES="${series}" ID_REGEX="${id_regex}" EXCLUDE_NOOP="${exclude_noop}" REQUIRE_TEARDOWN="${require_teardown}" python3 - <<'PY'
import os
import re
import sys
from pathlib import Path

root = Path(os.environ["ROOT"])
verify_dir = root / "ansible" / "playbooks" / "verify"
series = os.environ.get("SERIES", "").strip()
id_regex = os.environ.get("ID_REGEX", "").strip()
exclude_noop = os.environ.get("EXCLUDE_NOOP", "0") == "1"
require_teardown = os.environ.get("REQUIRE_TEARDOWN", "0") == "1"

# Fail with a readable message instead of a traceback; the exit status stays 1.
if not verify_dir.is_dir():
    print(f"[ERR] verify playbook directory not found: {verify_dir}", file=sys.stderr)
    sys.exit(1)
try:
    id_pat = re.compile(id_regex) if id_regex else None
except re.error as exc:
    print(f"[ERR] invalid --id-regex {id_regex!r}: {exc}", file=sys.stderr)
    sys.exit(1)

# Only file names of the form XX-YY.yml with XX>0 and YY>0 are considered.
pat = re.compile(r"^(?P<id>(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9]))\.yml$")
ids = []
for p in verify_dir.iterdir():
    m = pat.match(p.name)
    if not m:
        continue
    doc_id = m.group("id")
    if series and not doc_id.startswith(f"{series}-"):
        continue
    if id_pat and not id_pat.search(doc_id):
        continue
    if exclude_noop or require_teardown:
        # Content filters are plain substring checks on the playbook text.
        content = p.read_text(encoding="utf-8", errors="ignore")
        if exclude_noop and "noop verify" in content:
            continue
        if require_teardown and ("VERIFY_TEARDOWN" not in content and "verify_teardown" not in content):
            continue
    ids.append(doc_id)
for x in sorted(set(ids)):
    print(x)
PY
}
#######################################
# Control-side preflight: required commands, inventory, reachability of the
# k3s_server group, an optional cluster check, and a report of missing
# external dependencies.
# Globals:   ROOT, ANSIBLE_INVENTORY, VERIFY_TEARDOWN, VERIFY_PREFLIGHT_CLUSTER,
#            ACME_EMAIL, CF_API_TOKEN, NFS_SERVER_IP, NFS_EXPORT_PATH,
#            WORKSTATION_SSH, nginx_entry_base, nodejs_entry_base (read)
# Outputs:   progress and [OC] lines on stdout, errors on stderr
# Returns:   0 when preflight passes (including the "gated" outcome);
#            exits the shell with status 2 on any hard failure
# Note:      the helper functions defined below become global once this
#            function has run.
#######################################
run_preflight() {
  local inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}"

  # Emit an OC-like "failed" line: $1 is the assertion name, the remaining
  # arguments are appended as extra key=value fields.
  oc_failed() {
    echo "[OC] doc_id=preflight result=failed phase=preflight assertion=$1 ${*:2}"
  }

  # Emit an OC-like "gated" line: $1 is the missing-dependency list, $2 the
  # affected scopes.
  oc_gated() {
    echo "[OC] doc_id=preflight result=gated phase=preflight assertion=dependency_check missing_dependency=$1 skip_scope=\"$2\""
  }

  # Exit with status 2 when command $1 cannot be resolved.
  need_cmd_or_fail() {
    local cmd="$1"
    if command -v "$cmd" >/dev/null 2>&1; then
      return 0
    fi
    echo "[ERR] 未找到命令:$cmd" >&2
    oc_failed "missing_cmd" "missing_cmd=${cmd}"
    exit 2
  }

  local required_cmd
  for required_cmd in ansible-playbook ansible; do
    need_cmd_or_fail "${required_cmd}"
  done

  warn_teardown_mode
  record_teardown_state

  if [[ ! -f "$inv" ]]; then
    echo "[ERR] inventory 不存在:$inv" >&2
    oc_failed "missing_inventory" "inventory=${inv}"
    exit 2
  fi
  if ! ansible_lab_check_inventory_keys "$inv"; then
    oc_failed "inventory_keys" "inventory=${inv}"
    exit 2
  fi

  echo "[INFO] 变量边界inventory=$inv | group_vars=ansible/group_vars/all.yml | env=ansible/env/.env.verify"
  echo "[INFO] 关键变量VERIFY_TEARDOWN=${VERIFY_TEARDOWN:-1} nginx_entry_base=${nginx_entry_base:-<unset>} nodejs_entry_base=${nodejs_entry_base:-<unset>}"

  echo "[RUN] ansible k3s_server -m ping"
  if ! ansible k3s_server -i "$inv" -m ping; then
    echo "[ERR] ansible ping 失败k3s_server 不可达" >&2
    oc_failed "ansible_ping" "target_group=k3s_server"
    exit 2
  fi

  # Optional cluster-side check; when enabled, a failure is still a hard stop.
  if [[ "${VERIFY_PREFLIGHT_CLUSTER:-0}" == "1" ]]; then
    if ! ansible k3s_server -i "$inv" -b -m ansible.builtin.shell -a \
      'KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get nodes'; then
      echo "[ERR] kubectl 集群检查失败VERIFY_PREFLIGHT_CLUSTER=1" >&2
      oc_failed "kubectl_get_nodes"
      exit 2
    fi
  fi

  # External dependencies: a missing one does not fail preflight, it is
  # reported as "gated" so the dependent scopes are explicit; verification of
  # unrelated doc ids can still proceed.
  local -a missing_list=()
  local -a scope_list=()
  if [[ -z "${ACME_EMAIL:-}" ]]; then
    missing_list+=("acme")
    scope_list+=("acme/tls issuance")
  fi
  # Epic 4: Traefik ACME DNS-01 only needs CF_API_TOKEN (see 03-02 ensure
  # secret); ZONE_* is not hard-gated by preflight.
  if [[ -z "${CF_API_TOKEN:-}" ]]; then
    missing_list+=("cloudflare")
    scope_list+=("cloudflare api token / acme dns01")
  fi
  if [[ -z "${NFS_SERVER_IP:-}" || -z "${NFS_EXPORT_PATH:-}" ]]; then
    missing_list+=("nfs")
    scope_list+=("nfs pv/pvc")
  fi
  if [[ -z "${WORKSTATION_SSH:-}" ]]; then
    missing_list+=("third_party_probe")
    scope_list+=("third-party probe (WORKSTATION_SSH e.g. jack@ylc65)")
  fi

  if [[ ${#missing_list[@]} -eq 0 ]]; then
    echo "[OC] doc_id=preflight result=verified phase=preflight assertion=connectivity"
    echo "[OK] preflight 通过"
    return 0
  fi

  # "${array[*]}" joins with the first character of IFS only, so the
  # separators are "," for the dependency names and ";" for the scopes.
  local missing joined_scope
  missing="$(IFS=','; printf '%s\n' "${missing_list[*]}")"
  joined_scope="$(IFS=';'; printf '%s\n' "${scope_list[*]}")"
  echo "[GATE] preflight external deps missing: ${missing} (scopes: ${joined_scope})"
  oc_gated "${missing}" "${joined_scope}"
  echo "[OK] preflight 通过带门控gated"
  return 0
}
#######################################
# Run the verify playbook of every selected doc id, in listing order.
# Arguments: $1 - series filter (optional)
#            $2 - id regular expression (optional)
#            $3 - 1 to exclude noop playbooks (default 0)
#            $4 - 1 to require teardown support (default 0)
#            All four are forwarded to list_doc_ids_from_verify_dir.
# Outputs:   a banner line per doc id, followed by the output of ansible_verify
# Returns:   the status of the listing when it fails (nothing is verified in
#            that case); otherwise the status of the last command run
#######################################
run_all_verify() {
  local series="${1:-}"
  local id_regex="${2:-}"
  local exclude_noop="${3:-0}"
  local require_teardown="${4:-0}"

  # Collect the ids up front. Reading them from a process substitution hid a
  # failed listing (run-all would "succeed" with nothing run) and made the
  # list the stdin of the loop body, so any verify command that reads stdin
  # could swallow the remaining ids.
  local listing
  listing="$(list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown")" || return "$?"

  local -a ids=()
  local id
  while IFS= read -r id; do
    if [[ -n "$id" ]]; then
      ids+=("$id")
    fi
  done <<<"$listing"

  # Explicit guard: expanding an empty array trips `set -u` on older bash.
  if [[ ${#ids[@]} -eq 0 ]]; then
    return 0
  fi

  for id in "${ids[@]}"; do
    echo ""
    echo "########################################## $id"
    ansible_verify "$id"
  done
}
#######################################
# Print the command-line synopsis: usage line, available commands and the
# filter options.
# Outputs:   three lines on stdout
#######################################
usage() {
  printf '%s\n' \
    '用法ansible/bin/verify.sh <命令> [...]' \
    '命令flow | preflight | full | list | run <XX-YY> | run-all' \
    '筛选参数:--series <XX> | --id-regex <regex> | --exclude-noop | --require-teardown'
}
#######################################
# Print the four-step workflow overview (connect, clean up, deploy, assert).
# Outputs:   four lines on stdout
#######################################
print_flow() {
  printf '%s\n' \
    '1 接入目标环境 inventory + 仓库同步;可选 source ansible/env/.env.verify' \
    '2 环境与前置清理 轻量:各 verify playbook 的 teardown' \
    '3 部署 ./ansible/bin/deploy-lab.sh k3s|longhorn|nginx-matrix*' \
    '4 断言 ./ansible/bin/verify.sh run <XX-YY> / run-all'
}
#######################################
# Run the verify playbook of one document and summarise the outcome as an
# [OC] line.
# Globals:   ROOT, ANSIBLE_INVENTORY, VERIFY_TEARDOWN (read)
# Arguments: $1 - doc id (must satisfy is_exec_doc_id)
# Outputs:   playbook output (stdout and stderr merged) plus [OC] and
#            [OC-EVIDENCE] lines on stdout; errors on stderr
# Returns:   1 when the playbook pipeline fails; exits the shell with status 1
#            on an invalid doc id, a missing playbook or a missing inventory
#######################################
ansible_verify() {
  local doc_id="$1"
  if ! is_exec_doc_id "$doc_id"; then
    echo "[ERR] 非执行域 doc_id$doc_id(仅允许 XX>0 且 YY>0" >&2
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=invalid_doc_id"
    exit 1
  fi

  local inventory="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}"
  local playbook="${ROOT}/ansible/playbooks/verify/${doc_id}.yml"
  if [[ ! -f "${playbook}" ]]; then
    echo "[ERR] verify playbook 不存在:${playbook}" >&2
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=missing_playbook"
    exit 1
  fi
  if [[ ! -f "${inventory}" ]]; then
    echo "[ERR] inventory 不存在:${inventory}" >&2
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=missing_inventory"
    exit 1
  fi

  local teardown="${VERIFY_TEARDOWN:-1}"
  # The output is teed into a temp file so it can be scanned for a [GATE]
  # marker after the run.
  local log_file
  log_file="$(mktemp)"

  echo "[RUN] ansible-playbook -i ${inventory} -e VERIFY_TEARDOWN=${teardown} ${playbook}"
  if ! ansible-playbook -i "${inventory}" -e "VERIFY_TEARDOWN=${teardown}" "${playbook}" 2>&1 | tee "${log_file}"; then
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=playbook_failed"
    rm -f "${log_file}"
    return 1
  fi

  if grep -q '\[GATE\]' "${log_file}"; then
    echo "[OC] doc_id=${doc_id} result=gated phase=verify assertion=playbook_gated"
  else
    # OC1: stable parse fields. OC3: evidence points to playbook output sections.
    echo "[OC] doc_id=${doc_id} result=verified phase=verify assertion=playbook_success"
    echo "[OC-EVIDENCE] doc_id=${doc_id} kind=cluster summary=\"see kubectl/assert output in playbook logs\""
    echo "[OC-EVIDENCE] doc_id=${doc_id} kind=entry summary=\"see http/tls/assert output in playbook logs\""
  fi
  rm -f "${log_file}"
}
#######################################
# Command dispatcher.
# Arguments: $1 - command: flow | preflight | full | list | run | run-all
#                 (empty, -h or --help prints the usage text)
#            remaining arguments - the doc id for `run`; filter options for
#                 full / list / run-all:
#                 --series <XX> | --id-regex <regex> | --exclude-noop |
#                 --require-teardown
# Outputs:   whatever the selected command prints; errors on stderr
# Returns:   exits the shell with status 1 on an unknown command, an unknown
#            filter option or a filter option that lacks its value
#######################################
main() {
  load_env
  local cmd="${1:-}"
  shift || true
  local series=""
  local id_regex=""
  local exclude_noop=0
  local require_teardown=0

  # Parse filter options into the locals above (visible here through dynamic
  # scoping).
  parse_filter_args() {
    while [[ $# -gt 0 ]]; do
      case "$1" in
        --series | --id-regex)
          # A value-taking option given last used to make `shift 2` fail:
          # a silent exit under `set -e`, an endless loop without it.
          if [[ $# -lt 2 ]]; then
            echo "[ERR] missing value for $1" >&2
            exit 1
          fi
          if [[ "$1" == "--series" ]]; then
            series="$2"
          else
            id_regex="$2"
          fi
          shift 2
          ;;
        --exclude-noop)
          exclude_noop=1
          shift
          ;;
        --require-teardown)
          require_teardown=1
          shift
          ;;
        *)
          echo "[ERR] 未知参数:$1" >&2
          exit 1
          ;;
      esac
    done
  }

  case "$cmd" in
    "" | -h | --help)
      usage
      ;;
    flow)
      print_flow
      ;;
    preflight)
      run_preflight
      ;;
    full)
      parse_filter_args "$@"
      run_preflight
      run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown"
      ;;
    list)
      parse_filter_args "$@"
      list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown"
      ;;
    run)
      local doc_id="${1:?need doc_id like 02-05}"
      ansible_verify "$doc_id"
      ;;
    run-all)
      parse_filter_args "$@"
      run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown"
      ;;
    *)
      echo "[ERR] unknown cmd: $cmd" >&2
      usage
      exit 1
      ;;
  esac
}
# Script entry point: forward all command-line arguments to main.
main "$@"