#!/usr/bin/env bash
# Verification entry point. ansible/playbooks/verify/<doc_id>.yml is the single
# source of truth for what gets executed:
#   - run <XX-YY>: run the verify playbook of a single document
#   - run-all:     run every <doc_id>.yml found in the verify directory, in
#                  order (execution domain only: XX>0 && YY>0)
#   - full:        preflight + run-all
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# shellcheck disable=SC1091
source "${ROOT}/ansible/lib/lib-ansible-lab.sh"
ansible_lab_export_config

# VERIFY_TEARDOWN is deliberately NOT defaulted here. load_env (always the
# first step of main) distinguishes a caller-supplied value from an unset one
# so that ansible/env/.env.verify can provide the value when the caller did
# not. An unconditional default at this point would always look like a caller
# override and silently discard the value from .env.verify. load_env applies
# the default of 1 itself.
STATUS_DIR="${ROOT}/.status"
TEARDOWN_STATE_JSON="${STATUS_DIR}/verify-teardown-state.json"
# Load optional overrides from ansible/env/.env.verify and settle the final
# value of VERIFY_TEARDOWN.
# Precedence for VERIFY_TEARDOWN: value already set by the caller, then the
# value assigned in .env.verify, then the default 1.
# Globals:  ROOT (read); ANSIBLE_CONFIG and VERIFY_TEARDOWN (exported); every
#           variable assigned by .env.verify is exported as well (set -a).
# Outputs:  status lines on stdout.
load_env() {
  export ANSIBLE_CONFIG="${ANSIBLE_CONFIG:-${ROOT}/ansible/ansible.cfg}"
  # Remember what the caller passed in. The sentinel marks "unset", so a value
  # that is set but empty still counts as a caller-supplied override.
  local td_override="${VERIFY_TEARDOWN-__unset__}"
  if [[ -f "${ROOT}/ansible/env/.env.verify" ]]; then
    # set -a exports everything the env file assigns.
    set -a
    # shellcheck disable=SC1091
    source "${ROOT}/ansible/env/.env.verify"
    set +a
    echo "[OK] 已加载 ansible/env/.env.verify"
  else
    echo "[TIP] 未发现 ansible/env/.env.verify,按默认变量继续"
  fi
  if [[ "${td_override}" != "__unset__" ]]; then
    # The caller's value wins over whatever the env file assigned.
    export VERIFY_TEARDOWN="${td_override}"
  else
    export VERIFY_TEARDOWN="${VERIFY_TEARDOWN:-1}"
  fi
  echo "[INFO] ANSIBLE_CONFIG=${ANSIBLE_CONFIG}"
}
# Persist the teardown mode of this run so that a later run can detect that the
# previous one kept its resources.
# Globals:  VERIFY_TEARDOWN (read, default 1), STATUS_DIR, TEARDOWN_STATE_JSON.
# Outputs:  rewrites TEARDOWN_STATE_JSON with a single-line JSON object
#           {"updated_at":"<UTC timestamp>","verify_teardown":<value>}.
record_teardown_state() {
  local td="${VERIFY_TEARDOWN:-1}"
  local int_re='^-?(0|[1-9][0-9]*)$'
  local ts td_json
  ts="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"

  if [[ "${td}" =~ ${int_re} ]]; then
    # Plain integers are written as JSON numbers (the usual 0 / 1 case).
    td_json="${td}"
  else
    # Anything else would be invalid JSON if spliced in raw; write it as an
    # escaped JSON string so the state file always stays parseable.
    td_json=${td//\\/\\\\}
    td_json=${td_json//\"/\\\"}
    td_json="\"${td_json}\""
  fi

  mkdir -p "${STATUS_DIR}"
  printf '{"updated_at":"%s","verify_teardown":%s}\n' "${ts}" "${td_json}" \
    > "${TEARDOWN_STATE_JSON}"
}
# Warn about keep-resources mode, both for the current run and for the
# previous run as recorded in TEARDOWN_STATE_JSON.
# Globals:  VERIFY_TEARDOWN (read, default 1), TEARDOWN_STATE_JSON (read).
# Outputs:  human-readable warnings on stderr; [OC] lines on stdout.
warn_teardown_mode() {
  local td="${VERIFY_TEARDOWN:-1}"
  if [[ "${td}" == "0" ]]; then
    echo "[WARN] VERIFY_TEARDOWN=0:保留现场模式已启用(可能污染后续 full/run-all)" >&2
    echo "[TIP] 恢复建议:切回 VERIFY_TEARDOWN=1 并复跑主线;必要时手工清理残留命名空间/资源" >&2
    echo "[OC] doc_id=preflight result=verified phase=preflight assertion=teardown_mode verify_teardown=0"
  fi

  if [[ -f "${TEARDOWN_STATE_JSON}" && "${td}" == "1" ]]; then
    local last_td=""
    # Best effort: a missing python3 or an unreadable/invalid state file just
    # leaves last_td empty (stderr is discarded and failures are ignored).
    last_td="$(TEARDOWN_STATE_JSON="${TEARDOWN_STATE_JSON}" python3 - <<'PY' 2>/dev/null || true
import json, os, pathlib

p = pathlib.Path(os.environ["TEARDOWN_STATE_JSON"])
try:
    d = json.loads(p.read_text(encoding="utf-8"))
    print(d.get("verify_teardown", ""))
except Exception:
    pass
PY
)"
    if [[ "${last_td}" == "0" ]]; then
      echo "[WARN] 检测到上次验证使用 VERIFY_TEARDOWN=0:当前虽为 1,但可能存在残留污染" >&2
      echo "[TIP] 建议:VERIFY_TEARDOWN=1 ./ansible/bin/verify.sh full(或 run-all)以清理并回归" >&2
      echo "[OC] doc_id=preflight result=verified phase=preflight assertion=teardown_state last_verify_teardown=0 current_verify_teardown=1"
    fi
  fi
}
# Executable doc ids have the form XX-YY where both parts are two digits in
# the range 01..99 (00 is excluded on either side).
DOC_ID_EXEC_RE='^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])$'

# Succeed when $1 is an executable doc id (see DOC_ID_EXEC_RE).
# Arguments: $1 - candidate doc id; a missing argument is treated as empty and
#                 therefore rejected instead of aborting under `set -u`.
# Returns:   0 when it matches, 1 otherwise.
is_exec_doc_id() {
  local doc_id="${1:-}"
  # The regex variable must stay unquoted so it is used as a pattern.
  [[ "${doc_id}" =~ ${DOC_ID_EXEC_RE} ]]
}
# Print the executable doc ids that have a playbook in
# ${ROOT}/ansible/playbooks/verify, one per line, sorted and de-duplicated.
# Arguments:
#   $1 - series filter: keep only ids starting with "<series>-" (optional)
#   $2 - Python regex searched within the doc id (optional)
#   $3 - 1 to skip playbooks whose text contains "noop verify" (default 0)
#   $4 - 1 to keep only playbooks whose text mentions VERIFY_TEARDOWN or
#        verify_teardown (default 0)
# Globals: ROOT (read).
# Returns: non-zero (with a one-line message on stderr) when python3 fails,
#          the verify directory is missing, or the regex does not compile.
list_doc_ids_from_verify_dir() {
  local series="${1:-}"
  local id_regex="${2:-}"
  local exclude_noop="${3:-0}"
  local require_teardown="${4:-0}"
  # Filters travel through the environment so no shell text is ever spliced
  # into the Python source (the heredoc delimiter is quoted).
  ROOT="${ROOT}" SERIES="${series}" ID_REGEX="${id_regex}" EXCLUDE_NOOP="${exclude_noop}" REQUIRE_TEARDOWN="${require_teardown}" python3 - <<'PY'
import os
import re
import sys
from pathlib import Path

root = Path(os.environ["ROOT"])
verify_dir = root / "ansible" / "playbooks" / "verify"
series = os.environ.get("SERIES", "").strip()
id_regex = os.environ.get("ID_REGEX", "").strip()
exclude_noop = os.environ.get("EXCLUDE_NOOP", "0") == "1"
require_teardown = os.environ.get("REQUIRE_TEARDOWN", "0") == "1"

# Messages are kept ASCII so they print under any locale.
if not verify_dir.is_dir():
    sys.exit(f"[ERR] verify playbook directory not found: {verify_dir}")

try:
    id_pat = re.compile(id_regex) if id_regex else None
except re.error as exc:
    sys.exit(f"[ERR] invalid --id-regex {id_regex!r}: {exc}")

# Same execution domain as the shell-side check: XX-YY, both parts 01..99.
pat = re.compile(r"^(?P<id>(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9]))\.yml$")

ids = set()
for p in verify_dir.iterdir():
    m = pat.match(p.name)
    if not m or not p.is_file():
        continue
    doc_id = m.group("id")
    if series and not doc_id.startswith(f"{series}-"):
        continue
    if id_pat and not id_pat.search(doc_id):
        continue
    if exclude_noop or require_teardown:
        # Content filters are plain substring checks on the playbook text.
        content = p.read_text(encoding="utf-8", errors="ignore")
        if exclude_noop and "noop verify" in content:
            continue
        if require_teardown and (
            "VERIFY_TEARDOWN" not in content and "verify_teardown" not in content
        ):
            continue
    ids.add(doc_id)

for x in sorted(ids):
    print(x)
PY
}
# Control-side preflight: required commands, teardown-mode warnings and state
# recording, inventory checks, an ansible ping of k3s_server, an optional
# kubectl check, and gating of external dependencies.
# Globals:  ROOT, ANSIBLE_INVENTORY, VERIFY_TEARDOWN, VERIFY_PREFLIGHT_CLUSTER,
#           ACME_EMAIL, CF_API_TOKEN, NFS_SERVER_IP, NFS_EXPORT_PATH,
#           WORKSTATION_SSH, nginx_entry_base, nodejs_entry_base (all read).
# Outputs:  progress and [OC] result lines on stdout, errors on stderr.
# Returns:  0 when preflight passes (possibly gated); hard failures call
#           `exit 2` and terminate the script.
# Note:     the helpers oc_failed, oc_gated and need_cmd_or_fail are defined
#           when this function runs and remain defined afterwards.
run_preflight() {
  local inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}"

  # OC-like preflight line for humans/tools (minimal; stdout is source of truth).
  # $1 is the assertion name; remaining arguments are appended verbatim.
  oc_failed() {
    local assertion="$1"
    shift || true
    echo "[OC] doc_id=preflight result=failed phase=preflight assertion=${assertion} $*"
  }

  # Report external dependencies that are missing ($1) and what is skipped ($2).
  oc_gated() {
    local missing="$1"
    local scope="$2"
    echo "[OC] doc_id=preflight result=gated phase=preflight assertion=dependency_check missing_dependency=${missing} skip_scope=\"${scope}\""
  }

  # Abort the script with status 2 when command $1 cannot be found.
  need_cmd_or_fail() {
    local cmd="$1"
    if ! command -v "$cmd" >/dev/null 2>&1; then
      echo "[ERR] 未找到命令:$cmd" >&2
      oc_failed "missing_cmd" "missing_cmd=${cmd}"
      exit 2
    fi
  }

  need_cmd_or_fail ansible-playbook
  need_cmd_or_fail ansible

  warn_teardown_mode
  record_teardown_state

  if [[ ! -f "$inv" ]]; then
    echo "[ERR] inventory 不存在:$inv" >&2
    oc_failed "missing_inventory" "inventory=${inv}"
    exit 2
  fi
  if ! ansible_lab_check_inventory_keys "$inv"; then
    oc_failed "inventory_keys" "inventory=${inv}"
    exit 2
  fi

  echo "[INFO] 变量边界:inventory=$inv | group_vars=ansible/group_vars/all.yml | env=ansible/env/.env.verify"
  echo "[INFO] 关键变量:VERIFY_TEARDOWN=${VERIFY_TEARDOWN:-1} nginx_entry_base=${nginx_entry_base:-<unset>} nodejs_entry_base=${nodejs_entry_base:-<unset>}"

  echo "[RUN] ansible k3s_server -m ping"
  if ! ansible k3s_server -i "$inv" -m ping; then
    echo "[ERR] ansible ping 失败:k3s_server 不可达" >&2
    oc_failed "ansible_ping" "target_group=k3s_server"
    exit 2
  fi

  # Optional cluster-side check (may still fail-fast: control-side hard failure).
  if [[ "${VERIFY_PREFLIGHT_CLUSTER:-0}" == "1" ]]; then
    if ! ansible k3s_server -i "$inv" -b -m ansible.builtin.shell -a \
      'KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get nodes'; then
      echo "[ERR] kubectl 集群检查失败(VERIFY_PREFLIGHT_CLUSTER=1)" >&2
      oc_failed "kubectl_get_nodes"
      exit 2
    fi
  fi

  # External dependencies: missing deps should not fail preflight (EC2) but must be explicit gated.
  # We gate only the dependent scopes; runtime verify can still proceed for non-dependent doc_ids.
  local gated=0
  local missing_list=()
  local scope_list=()

  if [[ -z "${ACME_EMAIL:-}" ]]; then
    gated=1
    missing_list+=("acme")
    scope_list+=("acme/tls issuance")
  fi
  # Epic 4: Traefik ACME DNS-01 only needs CF_API_TOKEN (see 03-02 ensure
  # secret); ZONE_* is not hard-gated by preflight.
  if [[ -z "${CF_API_TOKEN:-}" ]]; then
    gated=1
    missing_list+=("cloudflare")
    scope_list+=("cloudflare api token / acme dns01")
  fi
  if [[ -z "${NFS_SERVER_IP:-}" || -z "${NFS_EXPORT_PATH:-}" ]]; then
    gated=1
    missing_list+=("nfs")
    scope_list+=("nfs pv/pvc")
  fi
  if [[ -z "${WORKSTATION_SSH:-}" ]]; then
    gated=1
    missing_list+=("third_party_probe")
    scope_list+=("third-party probe (WORKSTATION_SSH e.g. jack@ylc65)")
  fi

  if [[ "$gated" == "1" ]]; then
    # Join the arrays into single strings. "${arr[*]}" joins with the FIRST
    # character of IFS only, so scopes are separated by a bare ';'. That is the
    # format this line has always emitted; it is kept for anything parsing it.
    local missing joined_scope
    missing="$(IFS=','; echo "${missing_list[*]}")"
    joined_scope="$(IFS=';'; echo "${scope_list[*]}")"
    echo "[GATE] preflight external deps missing: ${missing} (scopes: ${joined_scope})"
    oc_gated "${missing}" "${joined_scope}"
    echo "[OK] preflight 通过(带门控:gated)"
    return 0
  fi

  echo "[OC] doc_id=preflight result=verified phase=preflight assertion=connectivity"
  echo "[OK] preflight 通过"
}
# Run the verify playbook of every doc id selected by the filters, in the
# order produced by list_doc_ids_from_verify_dir.
# Arguments: $1 series, $2 id regex, $3 exclude-noop flag, $4 require-teardown
#            flag; all are passed through to list_doc_ids_from_verify_dir.
# Returns:   the lister's status when listing fails; 0 when nothing matched
#            (a warning goes to stderr). A failing ansible_verify stops the
#            run through the script-wide `set -e`.
run_all_verify() {
  local series="${1:-}"
  local id_regex="${2:-}"
  local exclude_noop="${3:-0}"
  local require_teardown="${4:-0}"

  # Collect the ids BEFORE running anything:
  #  - a failing lister is noticed here; inside a `< <(...)` process
  #    substitution its exit status would be dropped and the run would
  #    "succeed" without executing a single playbook;
  #  - the playbooks keep the script's own stdin; if the loop were fed from
  #    the id stream, ansible/ssh could read and swallow the remaining ids.
  local listing
  listing="$(list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown")" || return

  local -a ids=()
  local id
  while IFS= read -r id; do
    if [[ -n "$id" ]]; then
      ids+=("$id")
    fi
  done <<<"$listing"

  if (( ${#ids[@]} == 0 )); then
    echo "[WARN] 未匹配到任何 verify playbook(请检查筛选参数或 verify 目录)" >&2
    return 0
  fi

  for id in "${ids[@]}"; do
    echo ""
    echo "########################################## $id"
    # Called plainly (not in an `||` list) so `set -e` keeps applying inside
    # ansible_verify and a failure still aborts the whole run.
    ansible_verify "$id"
  done
}
# Print the command-line synopsis on stdout.
usage() {
  # Quoted delimiter: the text is printed literally, nothing is expanded.
  cat <<'EOF'
用法:ansible/bin/verify.sh <命令> [...]
命令:flow | preflight | full | list | run <XX-YY> | run-all
筛选参数:--series <XX> | --id-regex <regex> | --exclude-noop | --require-teardown
EOF
}
# Print the four-step lab workflow (connect, clean up, deploy, assert) on
# stdout.
print_flow() {
  # The text is static, so the delimiter is quoted to guarantee that nothing
  # in it is ever subject to parameter or command expansion.
  cat <<'EOF'
1 接入目标环境 inventory + 仓库同步;可选 source ansible/env/.env.verify
2 环境与前置清理 轻量:各 verify playbook 的 teardown
3 部署 ./ansible/bin/deploy-lab.sh k3s|longhorn|nginx-matrix*
4 断言 ./ansible/bin/verify.sh run <XX-YY> / run-all
EOF
}
# Run the verify playbook of one doc id and report the outcome as [OC] lines.
# Arguments: $1 - doc id (XX-YY, execution domain only).
# Globals:   ROOT, ANSIBLE_INVENTORY, VERIFY_TEARDOWN (read, default 1).
# Outputs:   playbook output plus [OC]/[OC-EVIDENCE] lines on stdout, errors on
#            stderr.
# Returns:   0 when the playbook succeeded (verified or gated), 1 when it
#            failed. An invalid doc id, a missing playbook or a missing
#            inventory terminates the script with `exit 1`.
ansible_verify() {
  local doc_id="${1:-}"
  if ! is_exec_doc_id "$doc_id"; then
    echo "[ERR] 非执行域 doc_id:$doc_id(仅允许 XX>0 且 YY>0)" >&2
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=invalid_doc_id"
    exit 1
  fi

  local inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}"
  local pb_single="${ROOT}/ansible/playbooks/verify/${doc_id}.yml"
  if [[ ! -f "$pb_single" ]]; then
    echo "[ERR] verify playbook 不存在:$pb_single" >&2
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=missing_playbook"
    exit 1
  fi
  if [[ ! -f "$inv" ]]; then
    echo "[ERR] inventory 不存在:$inv" >&2
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=missing_inventory"
    exit 1
  fi

  local td="${VERIFY_TEARDOWN:-1}"
  local run_log
  local rc=0
  # The output is mirrored to a temp file so it can be scanned for [GATE]
  # markers once the playbook has finished.
  run_log="$(mktemp)"

  echo "[RUN] ansible-playbook -i $inv -e VERIFY_TEARDOWN=$td $pb_single"
  # Relies on `set -o pipefail` (enabled at the top of the script): without it
  # the pipeline status would be tee's and a failed playbook would look fine.
  if ansible-playbook -i "$inv" -e "VERIFY_TEARDOWN=$td" "$pb_single" 2>&1 | tee "$run_log"; then
    if grep -q '\[GATE\]' "$run_log"; then
      echo "[OC] doc_id=${doc_id} result=gated phase=verify assertion=playbook_gated"
    else
      # OC1: stable parse fields. OC3 evidence points to playbook output sections.
      echo "[OC] doc_id=${doc_id} result=verified phase=verify assertion=playbook_success"
      echo "[OC-EVIDENCE] doc_id=${doc_id} kind=cluster summary=\"see kubectl/assert output in playbook logs\""
      echo "[OC-EVIDENCE] doc_id=${doc_id} kind=entry summary=\"see http/tls/assert output in playbook logs\""
    fi
  else
    echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=playbook_failed"
    rc=1
  fi

  # Single cleanup point for both outcomes.
  rm -f -- "$run_log"
  return "$rc"
}
# Command dispatcher.
# Arguments: $1 - command (flow | preflight | full | list | run | run-all);
#            remaining arguments are the doc id for `run`, or filter options
#            for `full`, `list` and `run-all`.
# Returns:   status of the selected command; unknown commands or options, and
#            options missing their value, terminate the script with `exit 1`.
main() {
  load_env
  local cmd="${1:-}"
  shift || true

  local series=""
  local id_regex=""
  local exclude_noop=0
  local require_teardown=0

  # Parse filter options into the locals above (visible here through bash's
  # dynamic scoping).
  parse_filter_args() {
    while [[ $# -gt 0 ]]; do
      case "$1" in
        --series | --id-regex)
          # Without this check `shift 2` fails when the value is missing and
          # the script stops under `set -e` without any explanation.
          if [[ $# -lt 2 ]]; then
            echo "[ERR] 参数缺少取值:$1" >&2
            exit 1
          fi
          if [[ "$1" == "--series" ]]; then
            series="$2"
          else
            id_regex="$2"
          fi
          shift 2
          ;;
        --exclude-noop)
          exclude_noop=1
          shift
          ;;
        --require-teardown)
          require_teardown=1
          shift
          ;;
        *)
          echo "[ERR] 未知参数:$1" >&2
          exit 1
          ;;
      esac
    done
  }

  case "$cmd" in
    "" | -h | --help)
      usage
      ;;
    flow)
      print_flow
      ;;
    preflight)
      run_preflight
      ;;
    full)
      parse_filter_args "$@"
      run_preflight
      run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown"
      ;;
    list)
      parse_filter_args "$@"
      list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown"
      ;;
    run)
      local doc_id="${1:?need doc_id like 02-05}"
      ansible_verify "$doc_id"
      ;;
    run-all)
      parse_filter_args "$@"
      run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown"
      ;;
    *)
      echo "[ERR] unknown cmd: $cmd" >&2
      usage
      exit 1
      ;;
  esac
}
main "$@"