#!/usr/bin/env bash
# Verification entry point. ansible/playbooks/verify/<doc_id>.yml is the single
# source of truth for what gets executed:
#   - run <doc_id> : run a single verification playbook
#   - run-all      : run the <doc_id>.yml files present in the verify directory,
#                    in sorted order (executable domain only: XX>0 && YY>0)
#   - full         : preflight + run-all
#
# NOTE(review): this file was recovered from a copy whose line breaks and
# "<...>" spans had been stripped. Places where text had to be reconstructed
# are marked with NOTE(review) below — confirm them against the original.
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# shellcheck disable=SC1091
source "${ROOT}/ansible/lib/lib-ansible-lab.sh"
ansible_lab_export_config

# Remember whether VERIFY_TEARDOWN was set before we default it, so load_env
# can tell "explicitly set" apart from "defaulted here". Without this the
# default below always looked like an override and the value from
# .env.verify could never take effect.
_VERIFY_TEARDOWN_CALLER="${VERIFY_TEARDOWN-__unset__}"
export VERIFY_TEARDOWN="${VERIFY_TEARDOWN:-1}"

STATUS_DIR="${ROOT}/.status"
TEARDOWN_STATE_JSON="${STATUS_DIR}/verify-teardown-state.json"

#######################################
# Load ansible/env/.env.verify (if present) and settle VERIFY_TEARDOWN.
# Precedence: value set before this script defaulted it > value from
# .env.verify > default "1".
# Globals:   ROOT (read), ANSIBLE_CONFIG, VERIFY_TEARDOWN (exported)
# Outputs:   status lines on stdout
#######################################
load_env() {
  export ANSIBLE_CONFIG="${ANSIBLE_CONFIG:-${ROOT}/ansible/ansible.cfg}"
  local td_override="${_VERIFY_TEARDOWN_CALLER-__unset__}"

  if [[ -f "${ROOT}/ansible/env/.env.verify" ]]; then
    # set -a exports every variable assigned by the sourced file.
    set -a
    # shellcheck disable=SC1091
    source "${ROOT}/ansible/env/.env.verify"
    set +a
    echo "[OK] 已加载 ansible/env/.env.verify"
  else
    echo "[TIP] 未发现 ansible/env/.env.verify,按默认变量继续"
  fi

  if [[ "${td_override}" != "__unset__" ]]; then
    # An empty pre-set value falls back to "1", as the top-level default did.
    export VERIFY_TEARDOWN="${td_override:-1}"
  else
    export VERIFY_TEARDOWN="${VERIFY_TEARDOWN:-1}"
  fi
  echo "[INFO] ANSIBLE_CONFIG=${ANSIBLE_CONFIG}"
}

#######################################
# Persist the teardown mode of this run to TEARDOWN_STATE_JSON so that the
# next run can detect a previous VERIFY_TEARDOWN=0 run.
# Globals:   STATUS_DIR, TEARDOWN_STATE_JSON, VERIFY_TEARDOWN (read)
#######################################
record_teardown_state() {
  mkdir -p "${STATUS_DIR}"
  local td="${VERIFY_TEARDOWN:-1}"
  local ts
  ts="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
  # NOTE(review): the here-doc body was lost in the recovered copy. The
  # "verify_teardown" key is what warn_teardown_mode reads back; the name of
  # the timestamp key is an assumption — confirm.
  cat > "${TEARDOWN_STATE_JSON}" <<EOF
{
  "verify_teardown": "${td}",
  "updated_at": "${ts}"
}
EOF
}

#######################################
# Warn when teardown is disabled now, or was disabled on the previous
# recorded run. Must be called before record_teardown_state, otherwise the
# state file already describes the current run.
# Globals:   VERIFY_TEARDOWN, TEARDOWN_STATE_JSON (read)
# Outputs:   warnings on stderr, [OC] lines on stdout
#######################################
warn_teardown_mode() {
  local td="${VERIFY_TEARDOWN:-1}"

  if [[ "${td}" == "0" ]]; then
    # NOTE(review): the text of this first warning was lost in the recovered
    # copy; the wording below is a reconstruction — confirm.
    echo "[WARN] VERIFY_TEARDOWN=0:验证结束后不会清理资源,环境可能残留污染" >&2
    echo "[TIP] 恢复建议:切回 VERIFY_TEARDOWN=1 并复跑主线;必要时手工清理残留命名空间/资源" >&2
    echo "[OC] doc_id=preflight result=verified phase=preflight assertion=teardown_mode verify_teardown=0"
  fi

  if [[ -f "${TEARDOWN_STATE_JSON}" && "${td}" == "1" ]]; then
    local last_td=""
    # Best effort: an unreadable or malformed state file yields an empty value.
    last_td="$(TEARDOWN_STATE_JSON="${TEARDOWN_STATE_JSON}" python3 - <<'PY' 2>/dev/null || true
import json, os, pathlib
p = pathlib.Path(os.environ["TEARDOWN_STATE_JSON"])
try:
    d = json.loads(p.read_text(encoding="utf-8"))
    print(d.get("verify_teardown", ""))
except Exception:
    pass
PY
)"
    if [[ "${last_td}" == "0" ]]; then
      echo "[WARN] 检测到上次验证使用 VERIFY_TEARDOWN=0:当前虽为 1,但可能存在残留污染" >&2
      echo "[TIP] 建议:VERIFY_TEARDOWN=1 ./ansible/bin/verify.sh full(或 run-all)以清理并回归" >&2
      echo "[OC] doc_id=preflight result=verified phase=preflight assertion=teardown_state last_verify_teardown=0 current_verify_teardown=1"
    fi
  fi
}

# Executable doc_id: XX-YY with both parts in 01..99.
DOC_ID_EXEC_RE='^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])$'

#######################################
# Arguments: $1 - doc_id
# Returns:   0 if $1 matches DOC_ID_EXEC_RE, non-zero otherwise
#######################################
is_exec_doc_id() {
  local doc_id="$1"
  [[ "$doc_id" =~ $DOC_ID_EXEC_RE ]]
}

#######################################
# Print the executable doc_ids found in ansible/playbooks/verify, sorted and
# de-duplicated, one per line.
# Arguments:
#   $1 - series prefix (keeps only ids starting with "<series>-"); optional
#   $2 - Python regex searched in the doc_id; optional
#   $3 - "1" to skip playbooks whose text contains "noop verify"
#   $4 - "1" to keep only playbooks whose text mentions VERIFY_TEARDOWN or
#        verify_teardown
# Outputs:   doc_ids on stdout
#######################################
list_doc_ids_from_verify_dir() {
  local series="${1:-}"
  local id_regex="${2:-}"
  local exclude_noop="${3:-0}"
  local require_teardown="${4:-0}"

  # ROOT is not exported, so it is handed to Python explicitly.
  ROOT="${ROOT}" SERIES="${series}" ID_REGEX="${id_regex}" \
    EXCLUDE_NOOP="${exclude_noop}" REQUIRE_TEARDOWN="${require_teardown}" \
    python3 - <<'PY'
import os
import re
from pathlib import Path

root = Path(os.environ["ROOT"])
verify_dir = root / "ansible" / "playbooks" / "verify"
series = os.environ.get("SERIES", "").strip()
id_regex = os.environ.get("ID_REGEX", "").strip()
exclude_noop = os.environ.get("EXCLUDE_NOOP", "0") == "1"
require_teardown = os.environ.get("REQUIRE_TEARDOWN", "0") == "1"

# The named group "id" is read below via m.group("id").
pat = re.compile(r"^(?P<id>(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9]))\.yml$")
id_pat = re.compile(id_regex) if id_regex else None

ids = []
for p in verify_dir.iterdir():
    m = pat.match(p.name)
    if not m:
        continue
    doc_id = m.group("id")
    if series and not doc_id.startswith(f"{series}-"):
        continue
    if id_pat and not id_pat.search(doc_id):
        continue
    if exclude_noop or require_teardown:
        content = p.read_text(encoding="utf-8", errors="ignore")
        if exclude_noop and "noop verify" in content:
            continue
        if require_teardown and ("VERIFY_TEARDOWN" not in content and "verify_teardown" not in content):
            continue
    ids.append(doc_id)

for x in sorted(set(ids)):
    print(x)
PY
}

#######################################
# Control-side preflight: required commands, inventory, ansible ping, the
# optional cluster check, and external-dependency gating.
# Globals:   ANSIBLE_INVENTORY, VERIFY_PREFLIGHT_CLUSTER, ACME_EMAIL,
#            CF_API_TOKEN, NFS_SERVER_IP, NFS_EXPORT_PATH, WORKSTATION_SSH (read)
# Outputs:   human-readable lines plus "[OC] ..." lines on stdout
# Returns:   0 when passed (possibly gated); exits 2 on hard failure
#######################################
run_preflight() {
  local inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}"

  oc_failed() {
    # OC-like preflight line for humans/tools (minimal; stdout is source of truth).
    local assertion="$1"
    shift || true
    echo "[OC] doc_id=preflight result=failed phase=preflight assertion=${assertion} $*"
  }

  oc_gated() {
    local missing="$1"
    local scope="$2"
    echo "[OC] doc_id=preflight result=gated phase=preflight assertion=dependency_check missing_dependency=${missing} skip_scope=\"${scope}\""
  }

  need_cmd_or_fail() {
    local cmd="$1"
    if ! command -v "$cmd" >/dev/null 2>&1; then
      echo "[ERR] 未找到命令:$cmd" >&2
      oc_failed "missing_cmd" "missing_cmd=${cmd}"
      exit 2
    fi
  }

  need_cmd_or_fail ansible-playbook
  need_cmd_or_fail ansible

  # Warn first (reads the previous run's state), then record this run's state.
  warn_teardown_mode
  record_teardown_state

  [[ -f "$inv" ]] || {
    echo "[ERR] inventory 不存在:$inv" >&2
    oc_failed "missing_inventory" "inventory=${inv}"
    exit 2
  }
  ansible_lab_check_inventory_keys "$inv" || {
    oc_failed "inventory_keys" "inventory=${inv}"
    exit 2
  }

  echo "[INFO] 变量边界:inventory=$inv | group_vars=ansible/group_vars/all.yml | env=ansible/env/.env.verify"
  echo "[INFO] 关键变量:VERIFY_TEARDOWN=${VERIFY_TEARDOWN:-1} nginx_entry_base=${nginx_entry_base:-} nodejs_entry_base=${nodejs_entry_base:-}"

  echo "[RUN] ansible k3s_server -m ping"
  if ! ansible k3s_server -i "$inv" -m ping; then
    echo "[ERR] ansible ping 失败:k3s_server 不可达" >&2
    oc_failed "ansible_ping" "target_group=k3s_server"
    exit 2
  fi

  # Optional cluster-side check (may still fail-fast: control-side hard failure).
  if [[ "${VERIFY_PREFLIGHT_CLUSTER:-0}" == "1" ]]; then
    if ! ansible k3s_server -i "$inv" -b -m ansible.builtin.shell -a \
      'KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get nodes'; then
      echo "[ERR] kubectl 集群检查失败(VERIFY_PREFLIGHT_CLUSTER=1)" >&2
      oc_failed "kubectl_get_nodes"
      exit 2
    fi
  fi

  # External dependencies: missing deps should not fail preflight (EC2) but must be explicit gated.
  # We gate only the dependent scopes; runtime verify can still proceed for non-dependent doc_ids.
  local gated=0
  local missing_list=()
  local scope_list=()

  if [[ -z "${ACME_EMAIL:-}" ]]; then
    gated=1
    missing_list+=("acme")
    scope_list+=("acme/tls issuance")
  fi
  # Epic 4: Traefik ACME DNS-01 only needs CF_API_TOKEN (see 03-02 ensure
  # secret); ZONE_* is not hard-gated by preflight.
  if [[ -z "${CF_API_TOKEN:-}" ]]; then
    gated=1
    missing_list+=("cloudflare")
    scope_list+=("cloudflare api token / acme dns01")
  fi
  if [[ -z "${NFS_SERVER_IP:-}" || -z "${NFS_EXPORT_PATH:-}" ]]; then
    gated=1
    missing_list+=("nfs")
    scope_list+=("nfs pv/pvc")
  fi
  if [[ -z "${WORKSTATION_SSH:-}" ]]; then
    gated=1
    missing_list+=("third_party_probe")
    scope_list+=("third-party probe (WORKSTATION_SSH e.g. jack@ylc65)")
  fi

  if [[ "$gated" == "1" ]]; then
    # Join arrays into readable strings.
    # "${arr[*]}" joins with the first character of IFS only, so the scopes
    # are separated by ";" (no space) even though IFS is set to '; '.
    local missing joined_scope
    missing="$(IFS=,; echo "${missing_list[*]}")"
    joined_scope="$(IFS='; '; echo "${scope_list[*]}")"
    echo "[GATE] preflight external deps missing: ${missing} (scopes: ${joined_scope})"
    oc_gated "${missing}" "${joined_scope}"
    echo "[OK] preflight 通过(带门控:gated)"
    return 0
  fi

  echo "[OC] doc_id=preflight result=verified phase=preflight assertion=connectivity"
  echo "[OK] preflight 通过"
}

#######################################
# Run ansible_verify for every doc_id selected by the filters, in order.
# A failing playbook stops the run (ansible_verify returns non-zero under
# set -e).
# Arguments: $1..$4 - same filters as list_doc_ids_from_verify_dir
#######################################
run_all_verify() {
  local series="${1:-}"
  local id_regex="${2:-}"
  local exclude_noop="${3:-0}"
  local require_teardown="${4:-0}"
  local id

  # The id list is read on fd 3 rather than stdin: ansible-playbook (and the
  # ssh processes it starts) inherit stdin and may drain it, which would end
  # the loop after the first playbook.
  while IFS= read -r -u 3 id; do
    echo ""
    echo "########################################## $id"
    ansible_verify "$id"
  done 3< <(list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown")
}

# Print command-line help.
# NOTE(review): the "<doc_id>", "<XX>" and "<regex>" placeholders were stripped
# from the recovered copy and are restored here as a best guess — confirm.
usage() {
  cat <<'EOF'
用法:ansible/bin/verify.sh <命令> [...]
命令:flow | preflight | full | list | run <doc_id> | run-all
筛选参数:--series <XX> | --id-regex <regex> | --exclude-noop | --require-teardown
EOF
}

# Print the recommended verification flow.
# NOTE(review): only the tail "/ run-all" of this here-doc survived in the
# recovered copy; the text below is a reconstruction — confirm.
print_flow() {
  cat <<'EOF'
推荐流程:preflight → list → run <doc_id> / run-all
EOF
}

#######################################
# Run one verification playbook and emit a machine-readable [OC] result line.
# Arguments: $1 - doc_id (must satisfy is_exec_doc_id)
# Globals:   ANSIBLE_INVENTORY, VERIFY_TEARDOWN, ROOT (read)
# Outputs:   playbook output plus [OC]/[OC-EVIDENCE] lines on stdout
# Returns:   0 on success or gated; 1 if the playbook failed.
#            Exits 1 on invalid doc_id, missing playbook or missing inventory.
#######################################
ansible_verify() {
  local doc_id="$1"
  if ! is_exec_doc_id "$doc_id"; then
    echo "[ERR] 非执行域 doc_id:${doc_id}(仅允许 XX>0 且 YY>0)" >&2
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=invalid_doc_id"
    exit 1
  fi

  local inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}"
  local pb_single="${ROOT}/ansible/playbooks/verify/${doc_id}.yml"
  [[ -f "$pb_single" ]] || {
    echo "[ERR] verify playbook 不存在:$pb_single" >&2
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=missing_playbook"
    exit 1
  }
  [[ -f "$inv" ]] || {
    echo "[ERR] inventory 不存在:$inv" >&2
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=missing_inventory"
    exit 1
  }

  local td="${VERIFY_TEARDOWN:-1}"
  local run_log
  run_log="$(mktemp)"

  echo "[RUN] ansible-playbook -i $inv -e VERIFY_TEARDOWN=$td $pb_single"
  # pipefail makes the pipeline fail when ansible-playbook fails, even though
  # tee itself succeeds.
  if ansible-playbook -i "$inv" -e "VERIFY_TEARDOWN=$td" "$pb_single" 2>&1 | tee "$run_log"; then
    # A successful run whose output contains "[GATE]" is reported as gated.
    if grep -q '\[GATE\]' "$run_log"; then
      echo "[OC] doc_id=${doc_id} result=gated phase=verify assertion=playbook_gated"
    else
      # OC1: stable parse fields. OC3 evidence points to playbook output sections.
      echo "[OC] doc_id=${doc_id} result=verified phase=verify assertion=playbook_success"
      echo "[OC-EVIDENCE] doc_id=${doc_id} kind=cluster summary=\"see kubectl/assert output in playbook logs\""
      echo "[OC-EVIDENCE] doc_id=${doc_id} kind=entry summary=\"see http/tls/assert output in playbook logs\""
    fi
  else
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=playbook_failed"
    rm -f "$run_log"
    return 1
  fi
  rm -f "$run_log"
}

#######################################
# Command dispatcher.
# Arguments: $1 - command (flow | preflight | full | list | run | run-all);
#            remaining arguments are filter flags, or the doc_id for "run"
#######################################
main() {
  load_env
  local cmd="${1:-}"
  shift || true

  local series=""
  local id_regex=""
  local exclude_noop=0
  local require_teardown=0

  # Fills the locals above through bash dynamic scoping.
  parse_filter_args() {
    while [[ $# -gt 0 ]]; do
      case "$1" in
        --series | --id-regex)
          # Fail with a message instead of letting "shift 2" abort silently
          # under set -e when the value is missing.
          if [[ $# -lt 2 ]]; then
            echo "[ERR] 参数缺少取值:$1" >&2
            exit 1
          fi
          if [[ "$1" == "--series" ]]; then
            series="$2"
          else
            id_regex="$2"
          fi
          shift 2
          ;;
        --exclude-noop)
          exclude_noop=1
          shift
          ;;
        --require-teardown)
          require_teardown=1
          shift
          ;;
        *)
          echo "[ERR] 未知参数:$1" >&2
          exit 1
          ;;
      esac
    done
  }

  case "$cmd" in
    "" | -h | --help)
      usage
      ;;
    flow)
      print_flow
      ;;
    preflight)
      run_preflight
      ;;
    full)
      parse_filter_args "$@"
      run_preflight
      run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown"
      ;;
    list)
      parse_filter_args "$@"
      list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown"
      ;;
    run)
      local doc_id="${1:?need doc_id like 02-05}"
      ansible_verify "$doc_id"
      ;;
    run-all)
      parse_filter_args "$@"
      run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown"
      ;;
    *)
      echo "[ERR] unknown cmd: $cmd" >&2
      usage
      exit 1
      ;;
  esac
}

main "$@"