Deploy-Laboratory/ansible/bin/verify.sh

#!/usr/bin/env bash
# Verification entry point (ansible/playbooks/verify/<doc_id>.yml is the single
# source of truth for what gets executed):
# - run <XX-YY>: run the verify playbook of a single document
# - run-all: run every <doc_id>.yml present in the verify directory, in order
#            (executable domain only: XX>0 && YY>0)
# - full: preflight + run-all
set -euo pipefail

# Repository root: two directory levels above the directory holding this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# shellcheck disable=SC1091
source "${ROOT}/ansible/lib/lib-ansible-lab.sh"
# NOTE(review): not defined in this file — presumably provided by the library
# sourced above; confirm what it exports in lib-ansible-lab.sh.
ansible_lab_export_config

# Do NOT default VERIFY_TEARDOWN here. load_env() tells "set by the caller" apart
# from "not set" so that ansible/env/.env.verify can supply the value when the
# caller did not; forcing a default at this point would always shadow the env
# file. load_env() applies the final default of 1. An empty value is treated the
# same as an unset one.
if [[ -z "${VERIFY_TEARDOWN:-}" ]]; then
  unset VERIFY_TEARDOWN
fi
# Directory and JSON file used to remember the teardown mode of the last run.
STATUS_DIR="${ROOT}/.status"
TEARDOWN_STATE_JSON="${STATUS_DIR}/verify-teardown-state.json"

#######################################
# Prepare the environment for a verification run.
# Globals:   ROOT (read), ANSIBLE_CONFIG (exported), VERIFY_TEARDOWN (exported),
#            plus every variable assigned by ansible/env/.env.verify (exported)
# Outputs:   status lines on stdout
#######################################
load_env() {
  local env_file="${ROOT}/ansible/env/.env.verify"
  # Snapshot the caller's value first; the sentinel marks "caller set nothing".
  local caller_td="${VERIFY_TEARDOWN-__unset__}"

  export ANSIBLE_CONFIG="${ANSIBLE_CONFIG:-${ROOT}/ansible/ansible.cfg}"

  if [[ ! -f "${env_file}" ]]; then
    echo "[TIP] 未发现 ansible/env/.env.verify，按默认变量继续"
  else
    # allexport: every assignment made by the env file becomes an exported variable.
    set -a
    # shellcheck disable=SC1091
    source "${env_file}"
    set +a
    echo "[OK] 已加载 ansible/env/.env.verify"
  fi

  # Precedence: caller's value > value from the env file > default of 1.
  case "${caller_td}" in
    __unset__) export VERIFY_TEARDOWN="${VERIFY_TEARDOWN:-1}" ;;
    *) export VERIFY_TEARDOWN="${caller_td}" ;;
  esac

  echo "[INFO] ANSIBLE_CONFIG=${ANSIBLE_CONFIG}"
}

#######################################
# Persist the teardown mode of this run so a later run can detect leftovers.
# Globals:   STATUS_DIR (read; created if missing), TEARDOWN_STATE_JSON (written),
#            VERIFY_TEARDOWN (read; defaults to 1)
# Outputs:   one-line JSON document {"updated_at":...,"verify_teardown":...}
#######################################
record_teardown_state() {
  mkdir -p "${STATUS_DIR}"
  local td="${VERIFY_TEARDOWN:-1}"

  # A plain non-negative integer is written as a JSON number (the historical
  # format). Anything else would make the file invalid JSON if written bare,
  # so it is written as an escaped JSON string instead.
  local td_json
  if [[ "${td}" =~ ^(0|[1-9][0-9]*)$ ]]; then
    td_json="${td}"
  else
    local escaped="${td//\\/\\\\}"
    escaped="${escaped//\"/\\\"}"
    td_json="\"${escaped}\""
  fi

  local ts
  ts="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
  printf '{"updated_at":"%s","verify_teardown":%s}\n' "${ts}" "${td_json}" \
    > "${TEARDOWN_STATE_JSON}"
}

#######################################
# Warn about keep-the-scene mode, now or in the previously recorded run.
# Globals:   VERIFY_TEARDOWN (read; defaults to 1), TEARDOWN_STATE_JSON (read)
# Outputs:   [WARN]/[TIP] on stderr, [OC] lines on stdout
#######################################
warn_teardown_mode() {
  local mode="${VERIFY_TEARDOWN:-1}"

  if [[ "${mode}" == "0" ]]; then
    echo "[WARN] VERIFY_TEARDOWN=0：保留现场模式已启用（可能污染后续 full/run-all）" >&2
    echo "[TIP] 恢复建议：切回 VERIFY_TEARDOWN=1 并复跑主线；必要时手工清理残留命名空间/资源" >&2
    echo "[OC] doc_id=preflight result=verified phase=preflight assertion=teardown_mode verify_teardown=0"
  fi

  # The look-back only applies when a state file exists and teardown is on now.
  [[ -f "${TEARDOWN_STATE_JSON}" && "${mode}" == "1" ]] || return 0

  # Best effort: a missing python3 or an unreadable file yields an empty value.
  local previous=""
  previous="$(
    TEARDOWN_STATE_JSON="${TEARDOWN_STATE_JSON}" python3 - <<'PY' 2>/dev/null || true
import json, os, pathlib
p = pathlib.Path(os.environ["TEARDOWN_STATE_JSON"])
try:
  d = json.loads(p.read_text(encoding="utf-8"))
  print(d.get("verify_teardown", ""))
except Exception:
  pass
PY
  )"

  [[ "${previous}" == "0" ]] || return 0

  echo "[WARN] 检测到上次验证使用 VERIFY_TEARDOWN=0：当前虽为 1，但可能存在残留污染" >&2
  echo "[TIP] 建议：VERIFY_TEARDOWN=1 ./ansible/bin/verify.sh full（或 run-all）以清理并回归" >&2
  echo "[OC] doc_id=preflight result=verified phase=preflight assertion=teardown_state last_verify_teardown=0 current_verify_teardown=1"
}

# Executable doc ids look like XX-YY where both halves are two digits in 01..99.
DOC_ID_EXEC_RE='^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])$'

#######################################
# Tell whether a doc id belongs to the executable domain.
# Globals:   DOC_ID_EXEC_RE (read)
# Arguments: $1 - doc id to check
# Returns:   0 when the id matches DOC_ID_EXEC_RE, non-zero otherwise
#######################################
is_exec_doc_id() {
  local candidate="$1"
  if [[ "${candidate}" =~ ${DOC_ID_EXEC_RE} ]]; then
    return 0
  fi
  return 1
}

#######################################
# Print the executable doc ids that have a verify playbook, one per line, sorted.
# Globals:   ROOT (read)
# Arguments: $1 - series filter (XX; a single digit such as 3 is read as 03), optional
#            $2 - regex searched within the doc id, optional
#            $3 - 1 to drop playbooks whose text contains "noop verify", default 0
#            $4 - 1 to keep only playbooks mentioning VERIFY_TEARDOWN/verify_teardown,
#                 default 0
# Outputs:   doc ids on stdout; [ERR] messages on stderr
# Returns:   non-zero when the verify directory is missing or the regex is invalid
#######################################
list_doc_ids_from_verify_dir() {
  local series="${1:-}"
  local id_regex="${2:-}"
  local exclude_noop="${3:-0}"
  local require_teardown="${4:-0}"
  # The filters travel through the environment so the quoted heredoc stays literal.
  ROOT="${ROOT}" SERIES="${series}" ID_REGEX="${id_regex}" EXCLUDE_NOOP="${exclude_noop}" REQUIRE_TEARDOWN="${require_teardown}" python3 - <<'PY'
import os
import re
import sys
from pathlib import Path

root = Path(os.environ["ROOT"])
verify_dir = root / "ansible" / "playbooks" / "verify"
series = os.environ.get("SERIES", "").strip()
id_regex = os.environ.get("ID_REGEX", "").strip()
exclude_noop = os.environ.get("EXCLUDE_NOOP", "0") == "1"
require_teardown = os.environ.get("REQUIRE_TEARDOWN", "0") == "1"

# Fail with a readable message instead of a traceback.
if not verify_dir.is_dir():
    print(f"[ERR] verify 目录不存在：{verify_dir}", file=sys.stderr)
    sys.exit(1)

# Doc ids always carry two digits per half, so "3" can only ever mean "03".
if re.fullmatch(r"[1-9]", series):
    series = "0" + series

pat = re.compile(r"^(?P<id>(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9]))\.yml$")
try:
    id_pat = re.compile(id_regex) if id_regex else None
except re.error as exc:
    print(f"[ERR] --id-regex 不是合法的正则表达式：{id_regex}（{exc}）", file=sys.stderr)
    sys.exit(1)

ids = set()
for p in verify_dir.iterdir():
    m = pat.match(p.name)
    # Only regular files (or symlinks to them) can be playbooks.
    if not m or not p.is_file():
        continue
    doc_id = m.group("id")
    if series and not doc_id.startswith(f"{series}-"):
        continue
    if id_pat and not id_pat.search(doc_id):
        continue
    if exclude_noop or require_teardown:
        content = p.read_text(encoding="utf-8", errors="ignore")
        if exclude_noop and "noop verify" in content:
            continue
        if require_teardown and ("VERIFY_TEARDOWN" not in content and "verify_teardown" not in content):
            continue
    ids.add(doc_id)

for x in sorted(ids):
    print(x)
PY
}

#######################################
# Control-side preflight: tools, inventory, reachability, optional cluster check,
# then external-dependency gating.
# Globals:   ROOT, ANSIBLE_INVENTORY, VERIFY_TEARDOWN, VERIFY_PREFLIGHT_CLUSTER,
#            ACME_EMAIL, CF_API_TOKEN, NFS_SERVER_IP, NFS_EXPORT_PATH,
#            WORKSTATION_SSH, nginx_entry_base, nodejs_entry_base (all read)
# Outputs:   progress and [OC] lines on stdout, errors on stderr
# Returns:   0 when preflight passes (also when only gated); exits 2 on hard failure
# Note:      the helpers below are defined when this function runs.
#######################################
run_preflight() {
  local inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}"
  # Print a failed [OC] line: $1 is the assertion, the rest are extra key=value fields.
  oc_failed() {
    # OC-like preflight line for humans/tools (minimal; stdout is source of truth).
    local assertion="$1"
    shift || true
    echo "[OC] doc_id=preflight result=failed phase=preflight assertion=${assertion} $*"
  }
  # Print a gated [OC] line: $1 is the missing-dependency list, $2 the affected scopes.
  oc_gated() {
    local missing="$1"
    local scope="$2"
    echo "[OC] doc_id=preflight result=gated phase=preflight assertion=dependency_check missing_dependency=${missing} skip_scope=\"${scope}\""
  }
  # Exit 2 unless command $1 can be resolved.
  need_cmd_or_fail() {
    local cmd="$1"
    if ! command -v "$cmd" >/dev/null 2>&1; then
      echo "[ERR] 未找到命令：$cmd" >&2
      oc_failed "missing_cmd" "missing_cmd=${cmd}"
      exit 2
    fi
  }

  need_cmd_or_fail ansible-playbook
  need_cmd_or_fail ansible

  # Warn first (it reads the state of the previous run), then record this run.
  warn_teardown_mode
  record_teardown_state

  [[ -f "$inv" ]] || { echo "[ERR] inventory 不存在：$inv" >&2; oc_failed "missing_inventory" "inventory=${inv}"; exit 2; }
  ansible_lab_check_inventory_keys "$inv" || { oc_failed "inventory_keys" "inventory=${inv}"; exit 2; }

  echo "[INFO] 变量边界：inventory=$inv | group_vars=ansible/group_vars/all.yml | env=ansible/env/.env.verify"
  echo "[INFO] 关键变量：VERIFY_TEARDOWN=${VERIFY_TEARDOWN:-1} nginx_entry_base=${nginx_entry_base:-<unset>} nodejs_entry_base=${nodejs_entry_base:-<unset>}"

  echo "[RUN] ansible k3s_server -m ping"
  if ! ansible k3s_server -i "$inv" -m ping; then
    echo "[ERR] ansible ping 失败：k3s_server 不可达" >&2
    oc_failed "ansible_ping" "target_group=k3s_server"
    exit 2
  fi

  # Optional cluster-side check (may still fail-fast: control-side hard failure).
  if [[ "${VERIFY_PREFLIGHT_CLUSTER:-0}" == "1" ]]; then
    if ! ansible k3s_server -i "$inv" -b -m ansible.builtin.shell -a \
      'KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get nodes'; then
      echo "[ERR] kubectl 集群检查失败（VERIFY_PREFLIGHT_CLUSTER=1）" >&2
      oc_failed "kubectl_get_nodes"
      exit 2
    fi
  fi

  # External dependencies: missing deps should not fail preflight (EC2) but must be explicit gated.
  # We gate only the dependent scopes; runtime verify can still proceed for non-dependent doc_ids.
  local gated=0
  local missing_list=()
  local scope_list=()

  if [[ -z "${ACME_EMAIL:-}" ]]; then
    gated=1; missing_list+=("acme"); scope_list+=("acme/tls issuance")
  fi
  # Epic 4: Traefik ACME DNS-01 only needs CF_API_TOKEN (see 03-02 ensure secret);
  # ZONE_* is not hard-gated by preflight.
  if [[ -z "${CF_API_TOKEN:-}" ]]; then
    gated=1; missing_list+=("cloudflare"); scope_list+=("cloudflare api token / acme dns01")
  fi
  if [[ -z "${NFS_SERVER_IP:-}" || -z "${NFS_EXPORT_PATH:-}" ]]; then
    gated=1; missing_list+=("nfs"); scope_list+=("nfs pv/pvc")
  fi
  if [[ -z "${WORKSTATION_SSH:-}" ]]; then
    gated=1; missing_list+=("third_party_probe"); scope_list+=("third-party probe (WORKSTATION_SSH e.g. jack@ylc65)")
  fi

  if [[ "$gated" == "1" ]]; then
    # Join arrays into readable strings.
    local missing joined_scope="" scope
    missing="$(IFS=,; echo "${missing_list[*]}")"
    # "${array[*]}" joins with the FIRST character of IFS only, so a two-character
    # separator ("; ") has to be assembled by hand.
    for scope in "${scope_list[@]}"; do
      joined_scope+="${joined_scope:+; }${scope}"
    done
    echo "[GATE] preflight external deps missing: ${missing} (scopes: ${joined_scope})"
    oc_gated "${missing}" "${joined_scope}"
    echo "[OK] preflight 通过（带门控：gated）"
    return 0
  fi

  echo "[OC] doc_id=preflight result=verified phase=preflight assertion=connectivity"
  echo "[OK] preflight 通过"
}

#######################################
# Run the verify playbook of every doc id selected by the filters, in listing order.
# Arguments: $1 - series filter, $2 - id regex, $3 - exclude-noop flag (0/1),
#            $4 - require-teardown flag (0/1); all optional
# Outputs:   a banner per doc id plus whatever ansible_verify prints
# Returns:   1 when the doc ids cannot be listed; 0 when nothing matches;
#            otherwise the status of the last ansible_verify call
#######################################
run_all_verify() {
  local series="${1:-}"
  local id_regex="${2:-}"
  local exclude_noop="${3:-0}"
  local require_teardown="${4:-0}"

  # Capture the listing up front. Feeding it to the loop through stdin would let
  # anything that reads stdin inside ansible_verify swallow the remaining ids,
  # and a failed listing inside a process substitution would go unnoticed.
  local listing
  if ! listing="$(list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown")"; then
    echo "[ERR] 枚举 verify playbook 失败，未执行任何验证" >&2
    return 1
  fi

  local -a ids=()
  local id
  while IFS= read -r id; do
    if [[ -n "$id" ]]; then
      ids+=("$id")
    fi
  done <<<"$listing"

  if (( ${#ids[@]} == 0 )); then
    echo "[WARN] 未匹配到任何 verify playbook，未执行任何验证" >&2
    return 0
  fi

  for id in "${ids[@]}"; do
    echo ""
    echo "########################################## $id"
    ansible_verify "$id"
  done
}

# Print the command-line synopsis on stdout.
usage() {
  printf '%s\n' \
    '用法：ansible/bin/verify.sh <命令> [...]' \
    '命令：flow | preflight | full | list | run <XX-YY> | run-all' \
    '筛选参数：--series <XX> | --id-regex <regex> | --exclude-noop | --require-teardown'
}

# Print the four-step workflow overview on stdout.
print_flow() {
  local -a steps=(
    '  1 接入目标环境     inventory + 仓库同步；可选 source ansible/env/.env.verify'
    '  2 环境与前置清理   轻量：各 verify playbook 的 teardown'
    '  3 部署             ./ansible/bin/deploy-lab.sh k3s|longhorn|nginx-matrix*'
    '  4 断言             ./ansible/bin/verify.sh run <XX-YY> / run-all'
  )
  printf '%s\n' "${steps[@]}"
}

#######################################
# Run the verify playbook of one doc id and report the outcome as [OC] lines.
# Globals:   ROOT, ANSIBLE_INVENTORY, VERIFY_TEARDOWN (read; teardown defaults to 1)
# Arguments: $1 - doc id (must satisfy is_exec_doc_id)
# Outputs:   playbook output (stdout and stderr merged) plus [RUN]/[OC] lines
# Returns:   0 when the playbook pipeline succeeds (verified or gated), 1 when it
#            fails; exits 1 on an invalid doc id, missing playbook or inventory
#######################################
ansible_verify() {
  local doc_id="$1"
  if ! is_exec_doc_id "$doc_id"; then
    echo "[ERR] 非执行域 doc_id：$doc_id（仅允许 XX>0 且 YY>0）" >&2
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=invalid_doc_id"
    exit 1
  fi

  local inventory="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}"
  local playbook="${ROOT}/ansible/playbooks/verify/${doc_id}.yml"
  if [[ ! -f "$playbook" ]]; then
    echo "[ERR] verify playbook 不存在：$playbook" >&2
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=missing_playbook"
    exit 1
  fi
  if [[ ! -f "$inventory" ]]; then
    echo "[ERR] inventory 不存在：$inventory" >&2
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=missing_inventory"
    exit 1
  fi

  local teardown="${VERIFY_TEARDOWN:-1}"
  local capture
  capture="$(mktemp)"
  echo "[RUN] ansible-playbook -i $inventory -e VERIFY_TEARDOWN=$teardown $playbook"

  # The copy kept by tee is only needed to look for a [GATE] marker afterwards.
  local rc=0
  ansible-playbook -i "$inventory" -e "VERIFY_TEARDOWN=$teardown" "$playbook" 2>&1 \
    | tee "$capture" || rc=$?

  if [[ "$rc" -ne 0 ]]; then
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=playbook_failed"
    rm -f "$capture"
    return 1
  fi

  if grep -q '\[GATE\]' "$capture"; then
    echo "[OC] doc_id=${doc_id} result=gated phase=verify assertion=playbook_gated"
  else
    # OC1: stable parse fields. OC3 evidence points to playbook output sections.
    echo "[OC] doc_id=${doc_id} result=verified phase=verify assertion=playbook_success"
    echo "[OC-EVIDENCE] doc_id=${doc_id} kind=cluster summary=\"see kubectl/assert output in playbook logs\""
    echo "[OC-EVIDENCE] doc_id=${doc_id} kind=entry summary=\"see http/tls/assert output in playbook logs\""
  fi
  rm -f "$capture"
}

#######################################
# Command dispatcher.
# Arguments: $1 - command (flow | preflight | full | list | run | run-all);
#            the rest are filter options, or the doc id for "run"
# Outputs:   whatever the selected command prints; usage when no command is given
# Returns:   exits 1 on an unknown command, unknown option, or missing option value
# Note:      load_env runs before dispatch, for every command including help.
#######################################
main() {
  load_env
  local cmd="${1:-}"
  shift || true
  local series=""
  local id_regex=""
  local exclude_noop=0
  local require_teardown=0
  # Parse the filter options into the locals above (visible here through bash's
  # dynamic scoping).
  parse_filter_args() {
    while [[ $# -gt 0 ]]; do
      case "$1" in
        --series | --id-regex)
          # A value-taking option given last used to make `shift 2` fail: a silent
          # exit under `set -e`, an endless loop without it. Report it instead.
          if [[ $# -lt 2 ]]; then
            echo "[ERR] 参数 $1 缺少取值" >&2
            exit 1
          fi
          if [[ "$1" == "--series" ]]; then
            series="$2"
          else
            id_regex="$2"
          fi
          shift 2
          ;;
        --exclude-noop) exclude_noop=1; shift ;;
        --require-teardown) require_teardown=1; shift ;;
        *) echo "[ERR] 未知参数：$1" >&2; exit 1 ;;
      esac
    done
  }
  case "$cmd" in
    ""|-h|--help) usage ;;
    flow) print_flow ;;
    preflight) run_preflight ;;
    full) parse_filter_args "$@"; run_preflight; run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown" ;;
    list) parse_filter_args "$@"; list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown" ;;
    run) local doc_id="${1:?need doc_id like 02-05}"; ansible_verify "$doc_id" ;;
    run-all) parse_filter_args "$@"; run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown" ;;
    *) echo "[ERR] unknown cmd: $cmd" >&2; usage; exit 1 ;;
  esac
}

# Script entry point: forward every command-line argument to main.
main "$@"