diff --git a/.gitignore b/.gitignore index 0d28df6..1ac35fc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,10 @@ .cursor .ssh -# 本地填写的验证编排环境变量(从 scripts/.env.verify.example 复制) -scripts/.env.verify +# 本地填写的验证编排环境变量(从 ansible/env/.env.verify.example 复制) +ansible/env/.env.verify + +# 状态板本地缓存(不提交) +.status/ # 可选:export ANSIBLE_LOCAL_TMP=$PWD/.ansible-tmp(无写权限 ~/.ansible 时) .ansible-tmp/ _bmad diff --git a/README.md b/README.md index 0ea6532..92cd449 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ - 脚本主入口:`scripts/README.md` - 仓库契约(AI/贡献者必读):`project-context.md`(真源、验证框架、noop/gate、敏感信息约束) - 测试与验证框架设计:`docs/00-03-测试与验证框架.md` -- **验证入口**:`./scripts/verify.sh`(`full/run-all/run`) +- **验证入口**:`./ansible/bin/verify.sh`(`full/run-all/run`);单篇简写 `./scripts/cs `(与 `verify.sh run` 等价,**任意执行域 doc_id 通用**) +- **Helm**:家庭实验里装 Longhorn、监控栈等常用;CLI 与环境见 `docs/00-02-部署环境说明.md`,各篇索引见 `docs/00-03-测试与验证框架.md` §1.1。 编号语义(用于快速判断“是否必须可执行”): @@ -27,7 +28,7 @@ - `README.md`:新手入口,看「要做什么、按什么顺序做」; - `docs/00-00-构建总览.md`:文档导航 + **学习主线(6 步)**与**附录长单**; - `docs/00-01-k3s-基础概念.md`:概念速查,看「不懂的 K3s/Traefik/NetworkPolicy 术语」; -- `./scripts/verify.sh`:按 `doc_id` 的自动化验证入口(`full/run-all/run`;清单由 `ansible/playbooks/verify/` 自动生成,且仅包含执行域 `XX>0 && YY>0`)。 +- `./ansible/bin/verify.sh`:按 `doc_id` 的自动化验证入口(`full/run-all/run`;清单由 `ansible/playbooks/verify/` 自动生成,且仅包含执行域 `XX>0 && YY>0`)。 目录约定: @@ -40,9 +41,9 @@ 1. **总览与环境**:读 `docs/00-00-构建总览.md`;需要对照机器与版本时打开 `docs/00-02-部署环境说明.md`。 2. **概念速查(可跳过)**:读 `docs/00-01-k3s-基础概念.md`;时间紧可跳过,**碰壁再回来看**。 -3. **安装 K3s(二选一)**:**自动化** — `docs/01-06-节点初始化-ansible-实践.md`,或仓库根执行 `./scripts/deploy-lab.sh k3s`(可选 `K3S_PREPARE_STORAGE=true`,详见 `scripts/README.md`);**手动** — `docs/01-01-k3s-控制节点含traefik.md` 再 `docs/01-02-k3s-工作节点.md`。 +3. **安装 K3s(二选一)**:**自动化** — `docs/01-05-节点初始化-ansible-实践.md`,或仓库根执行 `./ansible/bin/deploy-lab.sh k3s`(可选 `K3S_PREPARE_STORAGE=true`,详见 `scripts/README.md`);**手动** — `docs/01-01-k3s-控制节点含traefik.md` 再 `docs/01-02-k3s-工作节点.md`。 4. 
**确认节点 Ready**:`kubectl get nodes`,全部 Ready。 -5. **Nginx 最小验证**:`docs/02-00-nginx-系列说明.md` → `docs/02-05-nginx-验证矩阵-一键部署.md`,先打通「能访问」;也可在装好集群并配置 `.env.verify` 后直接 `./scripts/verify.sh run 02-05`。 +5. **Nginx 最小验证**:`docs/02-00-nginx-系列说明.md` → `docs/02-05-nginx-验证矩阵-一键部署.md`,先打通「能访问」;也可在装好集群并配置 `.env.verify` 后直接 `./ansible/bin/verify.sh run 02-05`。 6. **Node.js 主线入口**:`docs/04-01-k3s-nodejs-高级部署.md`;`docs/04-02`~`04-14` 为分项,**按需展开**,不挤进主线编号。 **主线之后(按需,不占主线序号)**:Traefik 面板与证书(如 `docs/03-01-k3s-traefik-dashboard.md`、`docs/03-02-k3s-traefik-acme.md`)、存储与应用(`03-05` 起、`05-**`)等 — 见总览中的「主线之后的分叉」与专题导航。 @@ -53,7 +54,7 @@ 相当于**跳过主线第 2 步(概念)**并**压缩第 1 步(只抓总览要点)**;跑通再按 6 步补全。 -1. **装集群**:Ansible 按 `docs/01-06-节点初始化-ansible-实践.md`(推荐);或 `docs/01-01` + `docs/01-02` 手动装控制节点(61)与工作节点(62)。 +1. **装集群**:Ansible 按 `docs/01-05-节点初始化-ansible-实践.md`(推荐);或 `docs/01-01` + `docs/01-02` 手动装控制节点(61)与工作节点(62)。 2. `kubectl get nodes`,确认节点 Ready。 3. 按 `docs/02-05-nginx-验证矩阵-一键部署.md` 部署 nginx 矩阵并访问一次(可先读 `docs/02-00-nginx-系列说明.md`)。 4. 若访问不通,按 `scripts/README.md` 先跑 firewalld 基线与入口链路诊断脚本。 diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg index e16f105..f2cc517 100644 --- a/ansible/ansible.cfg +++ b/ansible/ansible.cfg @@ -3,3 +3,5 @@ host_key_checking = False # 使用 inventory 同目录 inventory = inventory.ini +# 允许 include_role 解析仓库内角色(例如 verify_common) +roles_path = roles diff --git a/ansible/bin/deploy-lab.sh b/ansible/bin/deploy-lab.sh new file mode 100755 index 0000000..23566f3 --- /dev/null +++ b/ansible/bin/deploy-lab.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" +# shellcheck disable=SC1091 +source "${ROOT}/ansible/lib/lib-ansible-lab.sh" +ansible_lab_export_config + +load_env() { + if [[ -f "${ROOT}/ansible/env/.env.verify" ]]; then + set -a + # shellcheck disable=SC1091 + source "${ROOT}/ansible/env/.env.verify" + set +a + echo "[OK] 已加载 ansible/env/.env.verify" + fi +} + +usage() { + cat <<'EOF' +用法:ansible/bin/deploy-lab.sh <子命令> +子命令:k3s | longhorn | nginx-matrix | nginx-matrix-tls +EOF +} + +ansible_wrap() { + local inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}" + [[ -f "$inv" ]] || { echo "[ERR] inventory 不存在:$inv" >&2; exit 1; } + command -v ansible-playbook >/dev/null 2>&1 || { echo "[ERR] 未找到 ansible-playbook" >&2; exit 1; } + ansible_lab_check_inventory_keys "$inv" || exit 1 + local td="${DEPLOY_VERIFY_TEARDOWN:-0}" + echo "[RUN] ansible-playbook -i $inv -e VERIFY_TEARDOWN=$td $*" + ansible-playbook -i "$inv" -e "VERIFY_TEARDOWN=$td" "$@" +} + +cmd_k3s() { + if [[ "${K3S_PREPARE_STORAGE:-false}" == "true" ]]; then + ansible_wrap "${ROOT}/ansible/playbooks/verify/01-05.yml" -e 'k3s_do_prepare_storage=true' -e 'k3s_prepare_storage=true' + fi + ansible_wrap "${ROOT}/ansible/playbooks/verify/01-05.yml" -e 'k3s_do_install=true' +} + +main() { + load_env + local sub="${1:-}" + case "$sub" in + ""|-h|--help) usage ;; + k3s) cmd_k3s ;; + longhorn) ansible_wrap "${ROOT}/ansible/playbooks/verify/03-07.yml" ;; + nginx-matrix) ansible_wrap "${ROOT}/ansible/playbooks/verify/02-05.yml" ;; + nginx-matrix-tls) ansible_wrap "${ROOT}/ansible/playbooks/verify/03-02.yml" -e 'nginx_matrix_tls_enable=true' ;; + *) echo "[ERR] 未知子命令:$sub" >&2; usage; exit 1 ;; + esac +} + +main "$@" diff --git a/ansible/bin/scaffold-doc-id.sh b/ansible/bin/scaffold-doc-id.sh new file mode 100755 index 0000000..7076249 --- /dev/null +++ b/ansible/bin/scaffold-doc-id.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +# 生成执行域 doc_id 最小骨架(docs + ansible/files + verify playbook)。参见 ansible/tools/scaffold_doc_id.py +set -euo pipefail + +ROOT="$(cd 
"$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +exec python3 "$ROOT/ansible/tools/scaffold_doc_id.py" "$@" diff --git a/ansible/bin/verify.sh b/ansible/bin/verify.sh new file mode 100755 index 0000000..6d92bb0 --- /dev/null +++ b/ansible/bin/verify.sh @@ -0,0 +1,304 @@ +#!/usr/bin/env bash +# 验证入口(以 ansible/playbooks/verify/.yml 为唯一执行真源): +# - run :执行单篇验证 playbook +# - run-all:按 verify 目录中存在的 .yml 顺序执行(仅执行域:XX>0 && YY>0) +# - full:preflight + run-all +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +# shellcheck disable=SC1091 +source "${ROOT}/ansible/lib/lib-ansible-lab.sh" +ansible_lab_export_config + +export VERIFY_TEARDOWN="${VERIFY_TEARDOWN:-1}" +STATUS_DIR="${ROOT}/.status" +TEARDOWN_STATE_JSON="${STATUS_DIR}/verify-teardown-state.json" + +load_env() { + export ANSIBLE_CONFIG="${ANSIBLE_CONFIG:-${ROOT}/ansible/ansible.cfg}" + local td_override="${VERIFY_TEARDOWN-__unset__}" + if [[ -f "${ROOT}/ansible/env/.env.verify" ]]; then + set -a + # shellcheck disable=SC1091 + source "${ROOT}/ansible/env/.env.verify" + set +a + echo "[OK] 已加载 ansible/env/.env.verify" + else + echo "[TIP] 未发现 ansible/env/.env.verify,按默认变量继续" + fi + if [[ "${td_override}" != "__unset__" ]]; then + export VERIFY_TEARDOWN="${td_override}" + else + export VERIFY_TEARDOWN="${VERIFY_TEARDOWN:-1}" + fi + echo "[INFO] ANSIBLE_CONFIG=${ANSIBLE_CONFIG}" +} + +record_teardown_state() { + mkdir -p "${STATUS_DIR}" + local td="${VERIFY_TEARDOWN:-1}" + local ts + ts="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" + cat > "${TEARDOWN_STATE_JSON}" <&2 + echo "[TIP] 恢复建议:切回 VERIFY_TEARDOWN=1 并复跑主线;必要时手工清理残留命名空间/资源" >&2 + echo "[OC] doc_id=preflight result=verified phase=preflight assertion=teardown_mode verify_teardown=0" + fi + + if [[ -f "${TEARDOWN_STATE_JSON}" && "${td}" == "1" ]]; then + local last_td="" + last_td="$(TEARDOWN_STATE_JSON="${TEARDOWN_STATE_JSON}" python3 - <<'PY' 2>/dev/null || true +import json, os, pathlib +p = pathlib.Path(os.environ["TEARDOWN_STATE_JSON"]) 
+try: + d = json.loads(p.read_text(encoding="utf-8")) + print(d.get("verify_teardown", "")) +except Exception: + pass +PY + )" + if [[ "${last_td}" == "0" ]]; then + echo "[WARN] 检测到上次验证使用 VERIFY_TEARDOWN=0:当前虽为 1,但可能存在残留污染" >&2 + echo "[TIP] 建议:VERIFY_TEARDOWN=1 ./ansible/bin/verify.sh full(或 run-all)以清理并回归" >&2 + echo "[OC] doc_id=preflight result=verified phase=preflight assertion=teardown_state last_verify_teardown=0 current_verify_teardown=1" + fi + fi +} + +DOC_ID_EXEC_RE='^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])$' + +is_exec_doc_id() { + local doc_id="$1" + [[ "$doc_id" =~ $DOC_ID_EXEC_RE ]] +} + +list_doc_ids_from_verify_dir() { + local series="${1:-}" + local id_regex="${2:-}" + local exclude_noop="${3:-0}" + local require_teardown="${4:-0}" + ROOT="${ROOT}" SERIES="${series}" ID_REGEX="${id_regex}" EXCLUDE_NOOP="${exclude_noop}" REQUIRE_TEARDOWN="${require_teardown}" python3 - <<'PY' +import os +import re +from pathlib import Path + +root = Path(os.environ["ROOT"]) +verify_dir = root / "ansible" / "playbooks" / "verify" +series = os.environ.get("SERIES", "").strip() +id_regex = os.environ.get("ID_REGEX", "").strip() +exclude_noop = os.environ.get("EXCLUDE_NOOP", "0") == "1" +require_teardown = os.environ.get("REQUIRE_TEARDOWN", "0") == "1" + +pat = re.compile(r"^(?P(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9]))\.yml$") +id_pat = re.compile(id_regex) if id_regex else None + +ids = [] +for p in verify_dir.iterdir(): + m = pat.match(p.name) + if not m: + continue + doc_id = m.group("id") + if series and not doc_id.startswith(f"{series}-"): + continue + if id_pat and not id_pat.search(doc_id): + continue + if exclude_noop or require_teardown: + content = p.read_text(encoding="utf-8", errors="ignore") + if exclude_noop and "noop verify" in content: + continue + if require_teardown and ("VERIFY_TEARDOWN" not in content and "verify_teardown" not in content): + continue + ids.append(doc_id) + +for x in sorted(set(ids)): + print(x) +PY +} + +run_preflight() { + local 
inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}" + oc_failed() { + # OC-like preflight line for humans/tools (minimal; stdout is source of truth). + local assertion="$1" + shift || true + echo "[OC] doc_id=preflight result=failed phase=preflight assertion=${assertion} $*" + } + oc_gated() { + local missing="$1" + local scope="$2" + echo "[OC] doc_id=preflight result=gated phase=preflight assertion=dependency_check missing_dependency=${missing} skip_scope=\"${scope}\"" + } + need_cmd_or_fail() { + local cmd="$1" + if ! command -v "$cmd" >/dev/null 2>&1; then + echo "[ERR] 未找到命令:$cmd" >&2 + oc_failed "missing_cmd" "missing_cmd=${cmd}" + exit 2 + fi + } + + need_cmd_or_fail ansible-playbook + need_cmd_or_fail ansible + + warn_teardown_mode + record_teardown_state + + [[ -f "$inv" ]] || { echo "[ERR] inventory 不存在:$inv" >&2; oc_failed "missing_inventory" "inventory=${inv}"; exit 2; } + ansible_lab_check_inventory_keys "$inv" || { oc_failed "inventory_keys" "inventory=${inv}"; exit 2; } + + echo "[INFO] 变量边界:inventory=$inv | group_vars=ansible/group_vars/all.yml | env=ansible/env/.env.verify" + echo "[INFO] 关键变量:VERIFY_TEARDOWN=${VERIFY_TEARDOWN:-1} nginx_entry_base=${nginx_entry_base:-} nodejs_entry_base=${nodejs_entry_base:-}" + + echo "[RUN] ansible k3s_server -m ping" + if ! ansible k3s_server -i "$inv" -m ping; then + echo "[ERR] ansible ping 失败:k3s_server 不可达" >&2 + oc_failed "ansible_ping" "target_group=k3s_server" + exit 2 + fi + + # Optional cluster-side check (may still fail-fast: control-side hard failure). + if [[ "${VERIFY_PREFLIGHT_CLUSTER:-0}" == "1" ]]; then + if ! ansible k3s_server -i "$inv" -b -m ansible.builtin.shell -a \ + 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get nodes'; then + echo "[ERR] kubectl 集群检查失败(VERIFY_PREFLIGHT_CLUSTER=1)" >&2 + oc_failed "kubectl_get_nodes" + exit 2 + fi + fi + + # External dependencies: missing deps should not fail preflight (EC2) but must be explicit gated. 
+ # We gate only the dependent scopes; runtime verify can still proceed for non-dependent doc_ids. + local gated=0 + local missing_list=() + local scope_list=() + + if [[ -z "${ACME_EMAIL:-}" ]]; then + gated=1; missing_list+=("acme"); scope_list+=("acme/tls issuance") + fi + # Epic 4:Traefik ACME DNS-01 仅需 CF_API_TOKEN(见 03-02 ensure secret);ZONE_* 不由 preflight 强门禁。 + if [[ -z "${CF_API_TOKEN:-}" ]]; then + gated=1; missing_list+=("cloudflare"); scope_list+=("cloudflare api token / acme dns01") + fi + if [[ -z "${NFS_SERVER_IP:-}" || -z "${NFS_EXPORT_PATH:-}" ]]; then + gated=1; missing_list+=("nfs"); scope_list+=("nfs pv/pvc") + fi + if [[ -z "${WORKSTATION_SSH:-}" ]]; then + gated=1; missing_list+=("third_party_probe"); scope_list+=("third-party probe (WORKSTATION_SSH e.g. jack@ylc65)") + fi + + if [[ "$gated" == "1" ]]; then + # Join arrays into readable strings. + local missing joined_scope + missing="$(IFS=,; echo "${missing_list[*]}")" + joined_scope="$(IFS='; '; echo "${scope_list[*]}")" + echo "[GATE] preflight external deps missing: ${missing} (scopes: ${joined_scope})" + oc_gated "${missing}" "${joined_scope}" + echo "[OK] preflight 通过(带门控:gated)" + return 0 + fi + + echo "[OC] doc_id=preflight result=verified phase=preflight assertion=connectivity" + echo "[OK] preflight 通过" +} + +run_all_verify() { + local series="${1:-}" + local id_regex="${2:-}" + local exclude_noop="${3:-0}" + local require_teardown="${4:-0}" + local id + while IFS= read -r id; do + echo "" + echo "########################################## $id" + ansible_verify "$id" + done < <(list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown") +} + +usage() { + cat <<'EOF' +用法:ansible/bin/verify.sh <命令> [...] +命令:flow | preflight | full | list | run | run-all +筛选参数:--series | --id-regex | --exclude-noop | --require-teardown +EOF +} + +print_flow() { + cat < / run-all +EOF +} + +ansible_verify() { + local doc_id="$1" + if ! 
is_exec_doc_id "$doc_id"; then + echo "[ERR] 非执行域 doc_id:$doc_id(仅允许 XX>0 且 YY>0)" >&2 + echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=invalid_doc_id" + exit 1 + fi + local inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}" + local pb_single="${ROOT}/ansible/playbooks/verify/${doc_id}.yml" + [[ -f "$pb_single" ]] || { echo "[ERR] verify playbook 不存在:$pb_single" >&2; echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=missing_playbook"; exit 1; } + [[ -f "$inv" ]] || { echo "[ERR] inventory 不存在:$inv" >&2; echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=missing_inventory"; exit 1; } + local td="${VERIFY_TEARDOWN:-1}" + local run_log + run_log="$(mktemp)" + echo "[RUN] ansible-playbook -i $inv -e VERIFY_TEARDOWN=$td $pb_single" + if ansible-playbook -i "$inv" -e "VERIFY_TEARDOWN=$td" "$pb_single" 2>&1 | tee "$run_log"; then + if grep -q '\[GATE\]' "$run_log"; then + echo "[OC] doc_id=${doc_id} result=gated phase=verify assertion=playbook_gated" + else + # OC1: stable parse fields. OC3 evidence points to playbook output sections. 
+ echo "[OC] doc_id=${doc_id} result=verified phase=verify assertion=playbook_success" + echo "[OC-EVIDENCE] doc_id=${doc_id} kind=cluster summary=\"see kubectl/assert output in playbook logs\"" + echo "[OC-EVIDENCE] doc_id=${doc_id} kind=entry summary=\"see http/tls/assert output in playbook logs\"" + fi + else + echo "[OC] doc_id=${doc_id} result=failed phase=verify assertion=playbook_failed" + rm -f "$run_log" + return 1 + fi + rm -f "$run_log" +} + +main() { + load_env + local cmd="${1:-}" + shift || true + local series="" + local id_regex="" + local exclude_noop=0 + local require_teardown=0 + parse_filter_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --series) series="${2:-}"; shift 2 ;; + --id-regex) id_regex="${2:-}"; shift 2 ;; + --exclude-noop) exclude_noop=1; shift ;; + --require-teardown) require_teardown=1; shift ;; + *) echo "[ERR] 未知参数:$1" >&2; exit 1 ;; + esac + done + } + case "$cmd" in + ""|-h|--help) usage ;; + flow) print_flow ;; + preflight) run_preflight ;; + full) parse_filter_args "$@"; run_preflight; run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown" ;; + list) parse_filter_args "$@"; list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown" ;; + run) local doc_id="${1:?need doc_id like 02-05}"; ansible_verify "$doc_id" ;; + run-all) parse_filter_args "$@"; run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown" ;; + *) echo "[ERR] unknown cmd: $cmd" >&2; usage; exit 1 ;; + esac +} + +main "$@" diff --git a/ansible/env/.env.verify.example b/ansible/env/.env.verify.example new file mode 100644 index 0000000..0616e70 --- /dev/null +++ b/ansible/env/.env.verify.example @@ -0,0 +1,128 @@ +# 验证矩阵 / 编排脚本用环境变量模板(example) +# --------------------------------------------------------------------------- +# 约定:本文件只写“值”(KEY=VALUE),不写默认展开/命令替换等执行逻辑。 +# +# 使用(必须在仓库根目录执行): +# cp ansible/env/.env.verify.example ansible/env/.env.verify +# set -a && source ansible/env/.env.verify && set 
+a +# --------------------------------------------------------------------------- + +# ========================= +# 1) 你只需要改这一段(值) +# ========================= + +# --- SSH --- +# K3S_SSH_KEY_DIR:存放 ssh key 的目录 +K3S_SSH_KEY_DIR=$HOME/.ssh +# K3S_SSH_KEY_PREFIX:key 前缀(脚本会按此前缀拼接具体节点 key) +K3S_SSH_KEY_PREFIX=id_ed25519_k3s_ +# SSH_USER:ssh 登录用户名 +SSH_USER=jack +# TIMEOUT_SEC:常用探测/连接超时时间(秒) +TIMEOUT_SEC=5 + +# --- Ansible(控制端)--- +# ANSIBLE_INVENTORY:inventory 路径(相对仓库根);一般无需改 +ANSIBLE_INVENTORY=ansible/inventory.ini +# ANSIBLE_LOCAL_TMP:可选;控制端无法写 ~/.ansible 时设为仓库内路径,如 $PWD/.ansible-tmp(见 docs/00-03) + +# --- 01-01 / 01-02:集群、kubectl 与 data-dir(k3s,共用)--- +# docs/01-01(控制面)与 docs/01-02(工作节点加入)共用此二项:K3S_SERVER_HOSTNAME = 控制面短主机名(inventory 中 k3s_server,01-02 手工 K3S_URL 指向该节点 6443);K3S_DATA_DIR = 各节点一致的 k3s --data-dir(与 group_vars k3s_data_dir 一致)。verify/01-01 以节点 kubeconfig 为准,playbook 不强制从 env 读此二项。 +K3S_SERVER_HOSTNAME=ylc61 +K3S_DATA_DIR=/storage + +# --- 01-03 / 01-04 armv7(docs/01-03、01-04;docs/00-03 §10.E)--- +# (可选)SKIP_ARMV7:默认 1 时 verify/01-03、01-04 仅矩阵基线;0 时须配 ARMV7_SSH 等并由 playbook lookup +SKIP_ARMV7=1 +# 01-03:SKIP_ARMV7=0 时需 ARMV7_SSH;verify 调用 ansible/tools/armv7-docker-verify-install.sh(先 docker info,失败再 get.docker.com)。 +# 01-04:SKIP_ARMV7=0 时需 ARMV7_NFS_SSH(或同机);ARMV7_NFS_EXPORT_PATH、ARMV7_NFS_CLIENT_SUBNET 见 verify/01-04.yml +ARMV7_SSH=YOUR_ARMV7_SSH +ARMV7_NFS_SSH=YOUR_ARMV7_NFS_SSH +ARMV7_NFS_EXPORT_PATH=/sdcard +ARMV7_NFS_CLIENT_SUBNET=192.168.2.0/24 + +# --- 01-05(docs/01-05:节点初始化与 k3s 安装)--- +# K3S_PREPARE_STORAGE:deploy-lab.sh k3s 是否在首段跑「准备数据盘」(ansible/bin/deploy-lab.sh 读此变量) +# K3S_DO_PREPARE_STORAGE / K3S_DO_INSTALL:verify/01-05.yml 从环境读取(与 ansible-playbook -e 等价)。真正分区还须 group_vars k3s_prepare_storage: true 且 k3s_data_disk_device=/dev/xxx +K3S_PREPARE_STORAGE=false +K3S_DO_PREPARE_STORAGE=false +K3S_DO_INSTALL=false +# k3s 安装脚本镜像/超时:在 ansible/group_vars/all.yml 设 k3s_install_mirror: cn、k3s_install_curl_max_time 等(非本 env 变量);或 ansible-playbook … 
-e k3s_install_mirror=cn + +# --- deploy-lab.sh(铺栈,docs/00-03 / project-context)--- +# DEPLOY_VERIFY_TEARDOWN:deploy-lab 传入 playbook 的 VERIFY_TEARDOWN,默认 0(保留已部署资源) +DEPLOY_VERIFY_TEARDOWN=0 + +# --- 01-06 集群外探测 SSH(docs/01-06、00-03)--- +# WORKSTATION_SSH:在 **Linux 工作机**上执行集群外 curl 等的一行 ssh(示例 ylc65,见 docs/00-02);可改为 user@<你的工作机主机名>。含空格必须加引号。(旧名 ONECLOUD_SSH 已弃用) +WORKSTATION_SSH="ssh -o BatchMode=yes jack@ylc65" + +# --- 验证入口与 preflight(02-xx / 04-xx 共用)--- +# nginx_entry_base、nodejs_entry_base:HTTP 验证入口基址(按 ingress/负载均衡填写)。nginx 矩阵(02-01~02-04)为集群内临时 Pod 直连 nginx-mX.default.svc。 +# VERIFY_TEARDOWN:verify.sh 默认 1(清理现场);调试可设 0(保留现场,可能污染后续用例) +VERIFY_TEARDOWN=1 +# VERIFY_PREFLIGHT_CLUSTER:默认 0(离线/轻量 preflight);设 1 时 preflight 在控制节点执行 kubectl get nodes(见 docs/00-03) +VERIFY_PREFLIGHT_CLUSTER=0 +nginx_entry_base=http://192.168.2.61 +nodejs_entry_base=http://192.168.2.61 + +# --- 03-08 k3s HA(可选·预留)--- +# (可选)SKIP_HA:当前 verify playbook 未 lookup;仅与 docs/03-08 及 HA 类备忘对齐 +SKIP_HA=1 + +# --- 03-09 GitOps(可选·预留)--- +# (可选)SKIP_GITOPS:当前 verify playbook 未 lookup;仅与 docs/03-09 备忘对齐 +SKIP_GITOPS=1 + +# --- Cloudflare(CF API / Zone)--- +# 03-02 / 03-03:ACME DNS-01 需 CF_API_TOKEN;ZONE_* 为手工/Token 校验辅助,preflight 不强门禁(verify.sh 注释) +# CF_API_TOKEN:Cloudflare API Token(敏感信息) +CF_API_TOKEN=YOUR_CF_API_TOKEN +# ZONE_NAME / ZONE_ID:Cloudflare Zone 信息(可选;预留,playbook 未 lookup) +ZONE_NAME=jackadam.top +ZONE_ID=YOUR_ZONE_ID + +# --- 03-02 / 03-03 ACME 与 Traefik --- +# 03-02 / 03-03:ACME_EMAIL 为 Let's Encrypt 注册邮箱(verify 必填)。ACME_CA_STAGING=1 使用 staging CA。 +# TRAEFIK_NAMESPACE:预留(playbook 中 Traefik 固定 kube-system,未读此 env) +ACME_EMAIL=you@example.com +ACME_CA_STAGING=0 +TRAEFIK_NAMESPACE=kube-system +# TRAEFIK_DASHBOARD_VERIFY_URL:03-03 验收 Dashboard HTTP 探针完整 URL(可选;未设则按 k3s_server_ip/dashboard/) +# TRAEFIK_DASHBOARD_VERIFY_URL=http://192.168.2.61/dashboard/ + +# --- TLS 域名(预留)--- +# VERIFY_TLS_HOSTS:手工 openssl/curl 对照用;03-02 TLS 矩阵域名为清单内嵌,playbook 未读此变量 
+VERIFY_TLS_HOSTS=test01.jackadam.top,test02.jackadam.top,test03.jackadam.top,test04.jackadam.top + +# --- 03-02(nginx 矩阵 TLS 可选)--- +# 03-02:NGINX_MATRIX_TLS_ENABLE=true 时部署 TLS+HTTP nginx 矩阵(改 default;与 deploy-lab.sh nginx-matrix-tls 一致)。 +NGINX_MATRIX_TLS_ENABLE=false + +# --- 03-04 Cloudflare Tunnel(docs/03-04)--- +# HTTPS 探针需 CF_TUNNEL_TEST_URL 或 CF_TUNNEL_TEST_HOST(二选一;皆缺则 [GATE])。TUNNEL_TOKEN 与集群 kube-system Secret cloudflared-credentials 二选一。 +TUNNEL_TOKEN=YOUR_TUNNEL_TOKEN +# CF_TUNNEL_TEST_URL=https://your-tunnel-host.example.com/ +CF_TUNNEL_TEST_HOST=traefik.jackadam.top +# CF_TUNNEL_CURL_INSECURE=1 — 03-04 探针 curl 使用 -k(排障用) + +# --- 03-05 --- +# 03-05:LOCAL_PATH_APPLY_LAB_CONFIG=true 时注入 local-path lab ConfigMap 并 rollout restart provisioner。 +LOCAL_PATH_APPLY_LAB_CONFIG=false + +# --- NFS(docs/03-06)--- +# verify/03-06.yml:NFS_SERVER_IP + NFS_EXPORT_PATH 均非空才跑 PV/PVC + Job;任一为空则 [GATE]。NFS_SERVER_HOST 仅与文档/运维备注对齐,playbook 不读取。 +NFS_SERVER_HOST=onecloud +NFS_SERVER_IP=onecloud +NFS_EXPORT_PATH=/export/k3s + +# --- Longhorn(预留)--- +# LONGHORN_NAMESPACE:与 docs/03-07 叙述对齐;命名空间以 playbook/group_vars 为准,当前 verify 未 lookup 此键 +LONGHORN_NAMESPACE=longhorn-system + +# --- 04-07 / 04-12 NodeJS TLS(docs/04-12、00-03)--- +# 04-12:NODEJS_TLS_ENTRY_BASE + NODEJS_TLS_HOST 时跑 tls-openssl-sni + HTTPS 断言;无 Secret 时可 CREATE_NODEJS_DEMO_TLS_SECRET=1 自签。 +# NODEJS_TLS_ENTRY_BASE=https://192.168.2.61:443 +# NODEJS_TLS_HOST=app.example.local +# CREATE_NODEJS_DEMO_TLS_SECRET=1 +# NODEJS_TLS_CURL_INSECURE=1 diff --git a/ansible/files/01-01/README.md b/ansible/files/01-01/README.md new file mode 100644 index 0000000..2b2acd4 --- /dev/null +++ b/ansible/files/01-01/README.md @@ -0,0 +1,9 @@ +# 01-01(单控制节点安装) + +| 文件 | 说明 | +|------|------| +| `k3s-server-install.example.sh` | 默认或 `--data-dir=/storage` 安装片段备忘 | + +- **手动**:在控制节点按 [docs/01-01-k3s-控制节点含traefik.md](../../../docs/01-01-k3s-控制节点含traefik.md) 执行;可与本目录示例对照。 +- **自动**:`./ansible/bin/verify.sh run 01-01`(专用 
playbook:`kubectl` 断言;与本目录 shell 示例共用真源路径)。 +- 本篇**无** Kubernetes 应用清单;扩展名 `.sh` 不会进入 `kubectl apply --dry-run` 列表。 diff --git a/ansible/files/01-01/k3s-server-install.example.sh b/ansible/files/01-01/k3s-server-install.example.sh new file mode 100644 index 0000000..cd539e2 --- /dev/null +++ b/ansible/files/01-01/k3s-server-install.example.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# 控制节点安装 k3s server(示例)— 真源目录 ansible/files/01-01/ +# 详见:docs/01-01-k3s-控制节点含traefik.md + +# 方案一:默认数据目录 +# curl -sfL https://get.k3s.io | sh - + +# 方案二:数据盘 +# curl -sfL https://get.k3s.io | sh -s - server --data-dir=/storage + +echo "[INFO] 取消注释其一并在节点上执行;token 路径随方案在文档中说明。" diff --git a/ansible/files/01-02/README.md b/ansible/files/01-02/README.md new file mode 100644 index 0000000..2f8b65b --- /dev/null +++ b/ansible/files/01-02/README.md @@ -0,0 +1,8 @@ +# 01-02(工作节点加入 + Traefik 基线) + +| 文件 | 说明 | +|------|------| +| `k3s-agent-join.example.sh` | worker 使用环境变量或 `agent --data-dir` 加入集群的片段 | + +- **手动**:按 [docs/01-02-k3s-工作节点.md](../../../docs/01-02-k3s-工作节点.md) 在 worker 执行;替换 `K3S_URL`、`TOKEN` 与 IP。 +- **自动**:`./ansible/bin/verify.sh run 01-02`(与手工步骤共用本目录作为命令真源备忘)。 diff --git a/ansible/files/01-02/k3s-agent-join.example.sh b/ansible/files/01-02/k3s-agent-join.example.sh new file mode 100644 index 0000000..7b4e891 --- /dev/null +++ b/ansible/files/01-02/k3s-agent-join.example.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# 工作节点加入 k3s(示例)— ansible/files/01-02/ +# 详见:docs/01-02-k3s-工作节点.md + +# 方案一:默认数据目录 +# curl -sfL https://get.k3s.io | \ +# K3S_URL=https://192.168.2.61:6443 \ +# K3S_TOKEN= \ +# sh - + +# 方案二:数据盘 +# curl -sfL https://get.k3s.io | sh -s - agent \ +# --data-dir=/storage \ +# --server https://192.168.2.61:6443 \ +# --token + +echo "[INFO] 将占位符替换为控制面地址与 token 后执行。" diff --git a/ansible/files/01-03/README.md b/ansible/files/01-03/README.md new file mode 100644 index 0000000..407f8d2 --- /dev/null +++ b/ansible/files/01-03/README.md @@ -0,0 +1,6 @@ +# 01-03(armv7 独立 Docker) + +- 
**手动**:在 armv7 主机按 [docs/01-03-armv7-standalone-docker.md](../../../docs/01-03-armv7-standalone-docker.md)(**get.docker.com** 官方脚本 + 先 `docker info` 再决定是否安装)。 +- **远程脚本**:仓库根执行 [ansible/tools/armv7-docker-verify-install.sh](../../tools/armv7-docker-verify-install.sh)(与文档流程一致;`ARMV7_SSH='ssh …' ./ansible/tools/...`)。 +- **自动**:`./ansible/bin/verify.sh run 01-03`(`SKIP_ARMV7=0` 时调用上述脚本)。 +- 本篇无通用 K8s 清单;若后续补充 compose 或单元脚本,请用 `.example.` 命名或放在非 `.yml`/`.yaml` 扩展名以避免误 dry-run。 diff --git a/ansible/files/01-04/.gitkeep b/ansible/files/01-04/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ansible/files/01-05/README.md b/ansible/files/01-05/README.md new file mode 100644 index 0000000..17ac644 --- /dev/null +++ b/ansible/files/01-05/README.md @@ -0,0 +1,6 @@ +# 01-05(Ansible 一键装集群) + +- **真源 playbook**:`ansible/playbooks/verify/01-05.yml`(与 `deploy-lab.sh` 调用一致)。 +- **文档**:[docs/01-05-节点初始化-ansible-实践.md](../../../docs/01-05-节点初始化-ansible-实践.md)。 +- **自动**:`./ansible/bin/verify.sh run 01-05` 或 `./ansible/bin/deploy-lab.sh k3s`。 +- 本目录用于与 `doc_id` 三元契约对齐;集群对象由 playbook / 其它 `ansible/files` 篇生成,此处可不放置额外 YAML。 diff --git a/ansible/files/01-06/README.md b/ansible/files/01-06/README.md new file mode 100644 index 0000000..c71a948 --- /dev/null +++ b/ansible/files/01-06/README.md @@ -0,0 +1,12 @@ +# 01-06(OpenWrt HAProxy) + +- **说明与选用**:[`docs/01-06-openwrt-haproxy.md`](../../../docs/01-06-openwrt-haproxy.md)(文首有各 `*.cfg` 对照表)。 +- **本目录**:HAProxy 示例配置(非 K8s YAML);复制到 OpenWrt 后改 IP/端口并 `haproxy -c -f …` 校验。 + +| 文件 | 用途摘要 | +|------|----------| +| `haproxy-no-check.cfg` | 最简,无 check | +| `haproxy-http.cfg` | 80 明文 httpchk | +| `haproxy-tls.cfg` | 443 TCP + ssl-hello-chk | +| `haproxy-https.cfg` | 443 应用层 HTTPS 检查(HAProxy 终结 TLS) | +| `haproxy-proxy-http-tls.cfg` | 检查 + PROXY v2 | diff --git a/ansible/files/01-07/haproxy-http.cfg b/ansible/files/01-06/haproxy-http.cfg similarity index 87% rename from ansible/files/01-07/haproxy-http.cfg rename to 
ansible/files/01-06/haproxy-http.cfg index e7fc0d3..7d5720d 100644 --- a/ansible/files/01-07/haproxy-http.cfg +++ b/ansible/files/01-06/haproxy-http.cfg @@ -1,6 +1,6 @@ -# 01-07 HAProxy - 3.2 HTTP 健康检查(80 明文) +# 01-06 HAProxy - 3.2 HTTP 健康检查(80 明文) # backend k3s_http 增加 option httpchk GET / -# 文档:docs/01-07-openwrt-haproxy.md 第 3.2 节 +# 文档:docs/01-06-openwrt-haproxy.md 第 3.2 节 global log /dev/log local0 maxconn 4096 diff --git a/ansible/files/01-07/haproxy-https.cfg b/ansible/files/01-06/haproxy-https.cfg similarity index 90% rename from ansible/files/01-07/haproxy-https.cfg rename to ansible/files/01-06/haproxy-https.cfg index e854eb0..1a13a76 100644 --- a/ansible/files/01-07/haproxy-https.cfg +++ b/ansible/files/01-06/haproxy-https.cfg @@ -1,8 +1,8 @@ -# 01-07 HAProxy - 3.4 HTTPS 健康检查(443 应用层,HAProxy 终结 TLS,由 HAProxy 提供证书) +# 01-06 HAProxy - 3.4 HTTPS 健康检查(443 应用层,HAProxy 终结 TLS,由 HAProxy 提供证书) # frontend 需 bind *:443 ssl,backend mode http 连 K3s:443 做 HTTP over TLS 检查 # 将 your-ingress.example.com 改为实际 Host;将 /etc/ssl/haproxy.pem 改为实际证书路径 # 自签/内网 CA 用 verify none,生产建议 ca-file -# 文档:docs/01-07-openwrt-haproxy.md 第 3.4 节 +# 文档:docs/01-06-openwrt-haproxy.md 第 3.4 节 global log /dev/log local0 maxconn 4096 diff --git a/ansible/files/01-07/haproxy-no-check.cfg b/ansible/files/01-06/haproxy-no-check.cfg similarity index 87% rename from ansible/files/01-07/haproxy-no-check.cfg rename to ansible/files/01-06/haproxy-no-check.cfg index 21fe808..84296c0 100644 --- a/ansible/files/01-07/haproxy-no-check.cfg +++ b/ansible/files/01-06/haproxy-no-check.cfg @@ -1,5 +1,5 @@ -# 01-07 OpenWrt HAProxy 负载均衡 - 原生最简(无健康检查) -# 文档:docs/01-07-openwrt-haproxy.md 第 2 节 +# 01-06 OpenWrt HAProxy 负载均衡 - 原生最简(无健康检查) +# 文档:docs/01-06-openwrt-haproxy.md 第 2 节 # 将 192.168.2.61~64 按实际 K3s 节点 IP 修改 # 如需健康检查,见第 3 节对应 cfg global diff --git a/ansible/files/01-07/haproxy-proxy-http-tls.cfg b/ansible/files/01-06/haproxy-proxy-http-tls.cfg similarity index 88% rename from 
ansible/files/01-07/haproxy-proxy-http-tls.cfg rename to ansible/files/01-06/haproxy-proxy-http-tls.cfg index 872cc0d..91bf84a 100644 --- a/ansible/files/01-07/haproxy-proxy-http-tls.cfg +++ b/ansible/files/01-06/haproxy-proxy-http-tls.cfg @@ -1,6 +1,6 @@ -# 01-07 HAProxy - 健康检查升级(HTTP+TLS)+ PROXY Protocol +# 01-06 HAProxy - 健康检查升级(HTTP+TLS)+ PROXY Protocol # 组合:k3s_http 用 option httpchk,k3s_https 用 ssl-hello-chk,均带 send-proxy-v2 -# 文档:docs/01-07-openwrt-haproxy.md 第 5 节「健康检查与 PROXY 组合」 +# 文档:docs/01-06-openwrt-haproxy.md 第 5 节「健康检查与 PROXY 组合」 global log /dev/log local0 maxconn 4096 diff --git a/ansible/files/01-07/haproxy-tls.cfg b/ansible/files/01-06/haproxy-tls.cfg similarity index 87% rename from ansible/files/01-07/haproxy-tls.cfg rename to ansible/files/01-06/haproxy-tls.cfg index 4f29380..c0803fa 100644 --- a/ansible/files/01-07/haproxy-tls.cfg +++ b/ansible/files/01-06/haproxy-tls.cfg @@ -1,6 +1,6 @@ -# 01-07 HAProxy - 3.3 TLS 健康检查(443 握手,mode tcp) +# 01-06 HAProxy - 3.3 TLS 健康检查(443 握手,mode tcp) # backend k3s_https 增加 option ssl-hello-chk -# 文档:docs/01-07-openwrt-haproxy.md 第 3.3 节 +# 文档:docs/01-06-openwrt-haproxy.md 第 3.3 节 global log /dev/log local0 maxconn 4096 diff --git a/ansible/files/01-07/README.md b/ansible/files/01-07/README.md new file mode 100644 index 0000000..ee4d36e --- /dev/null +++ b/ansible/files/01-07/README.md @@ -0,0 +1,9 @@ +# 01-07 双控制节点 HA(手工演练为主) + +本目录用于满足 `doc_id=01-07` 的真源目录一致性约束。 + +当前 `01-07` 主要是手工 runbook(切换/演练类),自动验证入口为: + +- `ansible/playbooks/verify/01-07.yml`(文档存在性与说明提示) + +如后续将 01-07 演练步骤自动化,可在本目录新增对应清单与配置文件。 diff --git a/ansible/files/02-01/01-control-ingress.yaml b/ansible/files/02-01/01-control-ingress.yaml new file mode 100644 index 0000000..55d9759 --- /dev/null +++ b/ansible/files/02-01/01-control-ingress.yaml @@ -0,0 +1,101 @@ +# 02-05: Nginx + 控制节点 + Ingress(M1) +# 路径 /demo-m1,随机一台控制节点(nodeSelector + toleration,控制节点常有 NoSchedule 污点) +# ConfigMap:首页 + default.conf(单文件 subPath 挂载,与 M2~M4 一致,便于 nginx 后续扩展) +--- 
+apiVersion: v1 # ConfigMap 使用的 API 版本 +kind: ConfigMap # 配置资源类型:ConfigMap +metadata: # 对该 ConfigMap 的标识信息 + name: nginx-m1-html # ConfigMap 名称 + namespace: default # 命名空间 +data: # ConfigMap 数据键值区 + index.html: | # HTML 内容:会挂载到 nginx 的网页目录 + + M1 +

M1

控制节点 + Ingress

Backend: M1

+ default.conf: | # nginx 配置:通过 subPath 单文件挂载到 conf.d/default.conf + server { listen 80 default_server; server_name _; root /usr/share/nginx/html; index index.html; location / { add_header X-Backend "M1"; try_files $uri $uri/ /index.html; } } +--- +apiVersion: apps/v1 # Deployment 使用的 API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 标识信息 + name: nginx-m1 # Deployment 名称 + namespace: default # 部署命名空间 + labels: # 额外标签(用于检索/筛选) + app: nginx-m1 # 应用标签 + matrix: "02-05-m1" # 矩阵编号标签(用于你后续调试/统计) +spec: # Deployment 期望状态 + replicas: 1 # 副本数:本例为 1(便于对应路径验证) + selector: # Deployment 用于选择 Pod 的条件 + matchLabels: # 标签匹配集合(用于选中模板 Pod) + app: nginx-m1 # 必须与 template.metadata.labels 对上 + template: # Pod 模板 + metadata: # Pod 的元信息 + labels: # Pod 标签 + app: nginx-m1 # Pod 标签 + spec: # Pod 规范 + nodeSelector: # 节点选择:固定跑在 control-plane 上 + node-role.kubernetes.io/control-plane: "" # 选择带 control-plane 角色标签的节点 + tolerations: # 容忍污点:让 Pod 能调度到 control-plane + - key: node-role.kubernetes.io/control-plane # 污点 key + operator: Exists # 存在即匹配 + effect: NoSchedule # 匹配 NoSchedule 污点效果 + volumes: # Pod 内卷定义 + - name: html # 卷名:给 volumeMounts 引用 + configMap: # 卷来源:ConfigMap + name: nginx-m1-html # 引用的 ConfigMap 名称 + containers: # 容器列表 + - name: nginx # 容器名 + image: nginx:alpine # nginx 镜像 + ports: # 容器端口列表 + - containerPort: 80 # nginx HTTP 端口 + volumeMounts: # 容器内挂载点列表 + - name: html # 对应 volumes[].name + mountPath: /usr/share/nginx/html/index.html # 挂载到网页文件路径 + subPath: index.html # 从 ConfigMap 里选取单个 key + readOnly: true # 只读挂载(配置文件更安全) + - name: html # 第二处也使用同一个卷 + mountPath: /etc/nginx/conf.d/default.conf # nginx 配置文件路径 + subPath: default.conf # 从 ConfigMap 里选取对应 key + readOnly: true # 只读挂载 +--- +apiVersion: v1 # Service 使用的 API 版本 +kind: Service # 网络抽象:把 Pod 暴露成稳定访问入口 +metadata: # Service 标识 + name: nginx-m1 # Service 名称 + namespace: default # Service 所在命名空间 +spec: # Service 期望状态 + selector: # Service 按标签选择后端 Pod + app: nginx-m1 # 选择 nginx-m1 Pod + ports: # Service 端口映射 + - 
port: 80 # Service 端口 + targetPort: 80 # 转发到 Pod 的端口 +--- +apiVersion: traefik.io/v1alpha1 # Traefik Middleware 使用的 API 版本 +kind: Middleware # 路由中间件:stripPrefix +metadata: # Middleware 标识 + name: stripprefix-m1 # Middleware 名称 + namespace: default # 命名空间 +spec: # Middleware 配置 + stripPrefix: # 去掉前缀 + prefixes: # 要剔除的前缀列表 + - /demo-m1 # 本矩阵的路径前缀 +--- +apiVersion: networking.k8s.io/v1 # Ingress 使用的 API 版本 +kind: Ingress # 入口资源:把路径转发到 Service +metadata: # Ingress 标识 + name: nginx-m1 # Ingress 名称 + namespace: default # 命名空间 + annotations: # Ingress 注解:Traefik 用来绑定中间件 + traefik.ingress.kubernetes.io/router.middlewares: default-stripprefix-m1@kubernetescrd # 绑定 stripprefix-m1 +spec: # Ingress 规则 + rules: # 规则列表 + - http: # HTTP 规则 + paths: # 路径匹配列表 + - path: /demo-m1 # 匹配路径 + pathType: Prefix # 前缀匹配类型 + backend: # 后端目标 + service: # 后端 Service + name: nginx-m1 # Service 名 + port: # Service 端口 + number: 80 # 端口号 + diff --git a/ansible/files/02-01/README.md b/ansible/files/02-01/README.md new file mode 100644 index 0000000..050fa5d --- /dev/null +++ b/ansible/files/02-01/README.md @@ -0,0 +1,6 @@ +# 02-01(nginx 分课) + +本目录 YAML 与 `ansible/files/02-05/` 中对应课节清单为**同构副本**,专供本 `doc_id` 解耦学习与手工复制。 + +- **手动**:将清单拷到目标机路径,按文档改字段后执行 `kubectl` / bash(不必使用 verify)。 +- **自动**:`./ansible/bin/verify.sh run 02-01`。 diff --git a/ansible/files/02-02/02-control-ingressroute.yaml b/ansible/files/02-02/02-control-ingressroute.yaml new file mode 100644 index 0000000..a9f5cea --- /dev/null +++ b/ansible/files/02-02/02-control-ingressroute.yaml @@ -0,0 +1,95 @@ +# 02-02: Nginx + 控制节点 + IngressRoute(M2) +# 路径 /demo-m2,指定一台控制节点(按实际 FQDN 修改 kubernetes.io/hostname) +# ConfigMap:首页 + default.conf,X-Backend: M2 便于区分 +--- +apiVersion: v1 # ConfigMap 使用的 API 版本 +kind: ConfigMap # 配置资源类型:ConfigMap +metadata: # ConfigMap 标识信息 + name: nginx-m2-html # ConfigMap 名称 + namespace: default # 命名空间 +data: # ConfigMap 数据区 + index.html: | # HTML 内容:会挂载到 nginx 的网页目录 + + M2 +

M2

控制节点 + IngressRoute

+ default.conf: | # nginx 配置:通过 subPath 单文件挂载到 conf.d/default.conf + server { listen 80; server_name localhost; root /usr/share/nginx/html; index index.html; location / { add_header X-Backend "M2"; try_files $uri $uri/ /index.html; } } +--- +apiVersion: apps/v1 # Deployment 使用的 API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 标识信息 + name: nginx-m2 # Deployment 名称 + namespace: default # 部署命名空间 + labels: # 标签集合 + app: nginx-m2 # 应用标签 + matrix: "02-05-m2" # 矩阵编号标签 +spec: # Deployment 期望状态 + replicas: 1 # 副本数:单副本便于验证 + selector: # Deployment 选择 Pod + matchLabels: # 标签匹配集合(用于选中模板 Pod) + app: nginx-m2 # 必须与 template.metadata.labels 对上 + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nginx-m2 # Pod 标签 + spec: # Pod 规范 + nodeSelector: # 固定调度节点(按实际修改) + kubernetes.io/hostname: ylc61 # 目标节点主机名 + volumes: # 卷定义 + - name: html # 卷名 + configMap: # 卷来源为 ConfigMap + name: nginx-m2-html # 引用的 ConfigMap 名称 + containers: # 容器列表 + - name: nginx # 容器名 + image: nginx:alpine # nginx 镜像 + ports: # 容器端口声明 + - containerPort: 80 # nginx 监听端口 + volumeMounts: # 容器内挂载点 + - name: html # 对应 volumes[].name + mountPath: /usr/share/nginx/html/index.html # 挂到网页文件 + subPath: index.html # 使用 ConfigMap 的 index.html key + readOnly: true # 配置只读挂载 + - name: html # 第二处配置仍复用该卷 + mountPath: /etc/nginx/conf.d/default.conf # 挂到 nginx 配置文件 + subPath: default.conf # 使用 ConfigMap 的 default.conf key + readOnly: true # 只读挂载 +--- +apiVersion: v1 # Service 使用的 API 版本 +kind: Service # 网络抽象:为 Pod 提供稳定访问地址 +metadata: # Service 标识 + name: nginx-m2 # Service 名称 + namespace: default # 命名空间 +spec: # Service 期望状态 + selector: # 通过标签选择后端 Pod + app: nginx-m2 # 选择 app 标签为 nginx-m2 的 Pod + ports: # Service 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 80 # 转发到 Pod 容器端口 +--- +apiVersion: traefik.io/v1alpha1 # Traefik Middleware 使用的 API 版本 +kind: Middleware # 中间件类型:stripPrefix +metadata: # Middleware 标识 + name: stripprefix-m2 # Middleware 名称 + namespace: default # 命名空间 +spec: # 中间件配置 + 
stripPrefix: # 去掉路径前缀 + prefixes: # 需要剔除的前缀列表 + - /demo-m2 # 本矩阵的路径前缀 +--- +apiVersion: traefik.io/v1alpha1 # IngressRoute 的 API 版本 +kind: IngressRoute # 路由资源类型 +metadata: # IngressRoute 标识 + name: nginx-m2 # 路由名称 + namespace: default # 命名空间 +spec: # 路由规则 + entryPoints: # Traefik 入口点列表 + - web # 使用 web entrypoint + routes: # 路由列表 + - match: PathPrefix(`/demo-m2`) # 匹配 /demo-m2 前缀 + kind: Rule # 规则类型:Rule + middlewares: # 绑定中间件(去前缀) + - name: stripprefix-m2 # 使用 stripprefix-m2 + services: # 匹配后转发的服务 + - name: nginx-m2 # 后端 Service 名称 + port: 80 # 后端 Service 端口 + diff --git a/ansible/files/02-02/README.md b/ansible/files/02-02/README.md new file mode 100644 index 0000000..8046d7d --- /dev/null +++ b/ansible/files/02-02/README.md @@ -0,0 +1,6 @@ +# 02-02(nginx 分课) + +本目录 YAML 与 `ansible/files/02-05/` 中对应课节清单为**同构副本**,专供本 `doc_id` 解耦学习与手工复制。 + +- **手动**:将清单拷到目标机路径,按文档改字段后执行 `kubectl` / bash(不必使用 verify)。 +- **自动**:`./ansible/bin/verify.sh run 02-02`。 diff --git a/ansible/files/02-03/03-worker-ingress.yaml b/ansible/files/02-03/03-worker-ingress.yaml new file mode 100644 index 0000000..a7b93da --- /dev/null +++ b/ansible/files/02-03/03-worker-ingress.yaml @@ -0,0 +1,97 @@ +# 02-03: Nginx + 工作节点 + Ingress(M3) +# 路径 /demo-m3,随机一台工作节点(nodeSelector: node-role.kubernetes.io/worker) +# ConfigMap:首页 + default.conf,X-Backend: M3 便于区分 +--- +apiVersion: v1 # ConfigMap 使用的 API 版本 +kind: ConfigMap # 配置资源类型:ConfigMap +metadata: # 对该 ConfigMap 的标识信息 + name: nginx-m3-html # ConfigMap 名称 + namespace: default # 命名空间 +data: # ConfigMap 数据键值区 + index.html: | # HTML 内容:会挂载到 nginx 网页目录(内部内容行不改动) + + M3 +

M3

工作节点 + Ingress

+ default.conf: | # nginx 配置:通过 subPath 单文件挂载到 conf.d/default.conf(内部内容行不改动) + server { listen 80; server_name localhost; root /usr/share/nginx/html; index index.html; location / { add_header X-Backend "M3"; try_files $uri $uri/ /index.html; } } +--- +apiVersion: apps/v1 # Deployment 使用的 API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 标识信息 + name: nginx-m3 # Deployment 名称 + namespace: default # 部署命名空间 + labels: # 额外标签(用于筛选/统计) + app: nginx-m3 # 应用标签 + matrix: "02-05-m3" # 矩阵编号标签 +spec: # Deployment 期望状态 + replicas: 1 # 副本数:这里为 1 + selector: # Deployment 用于选择 Pod 的条件 + matchLabels: # 标签匹配集合(用于选中模板 Pod) + app: nginx-m3 # 必须与 template.metadata.labels 对上 + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nginx-m3 # Pod 标签 + spec: # Pod 规范 + nodeSelector: # 固定跑到 worker 节点 + node-role.kubernetes.io/worker: "" # worker 节点 selector + volumes: # 卷定义 + - name: html # 卷名(供 volumeMounts 引用) + configMap: # 卷来源:ConfigMap + name: nginx-m3-html # 引用的 ConfigMap 名称 + containers: # 容器列表 + - name: nginx # 容器名 + image: nginx:alpine # nginx 镜像 + ports: # 容器端口声明 + - containerPort: 80 # nginx HTTP 端口 + volumeMounts: # 容器内挂载点 + - name: html # 对应 volumes[].name + mountPath: /usr/share/nginx/html/index.html # 挂到网页首页 + subPath: index.html # 从 ConfigMap 取该 key + readOnly: true # 配置只读 + - name: html # 第二处仍引用同一个卷 + mountPath: /etc/nginx/conf.d/default.conf # 挂到 nginx 配置文件路径 + subPath: default.conf # 从 ConfigMap 取该 key + readOnly: true # 配置只读 +--- +apiVersion: v1 # Service 使用的 API 版本 +kind: Service # 网络抽象:把 Pod 暴露成稳定访问入口 +metadata: # Service 标识 + name: nginx-m3 # Service 名称 + namespace: default # 命名空间 +spec: # Service 期望状态 + selector: # Service 通过标签选中后端 Pod + app: nginx-m3 # 选择 app 标签 + ports: # Service 端口映射列表 + - port: 80 # Service 暴露端口 + targetPort: 80 # 转发到 Pod 的容器端口 +--- +apiVersion: traefik.io/v1alpha1 # Traefik Middleware API 版本 +kind: Middleware # 中间件类型:stripPrefix +metadata: # Middleware 标识 + name: stripprefix-m3 # 名称 + namespace: default # 命名空间 +spec: 
# 中间件配置 + stripPrefix: # 去掉指定路径前缀 + prefixes: # 前缀列表 + - /demo-m3 # 本矩阵路径前缀 +--- +apiVersion: networking.k8s.io/v1 # Ingress 使用的 API 版本 +kind: Ingress # 入口资源:把路径转发到 Service +metadata: # Ingress 标识 + name: nginx-m3 # Ingress 名称 + namespace: default # 命名空间 + annotations: # Traefik 注解:绑定中间件 + traefik.ingress.kubernetes.io/router.middlewares: default-stripprefix-m3@kubernetescrd # 绑定 stripprefix-m3 中间件 +spec: # Ingress 规则 + rules: # 规则列表 + - http: # HTTP 规则 + paths: # 路径匹配列表 + - path: /demo-m3 # 匹配路径 + pathType: Prefix # 前缀匹配类型 + backend: # 后端目标 + service: # 后端是 Service + name: nginx-m3 # Service 名称 + port: # 后端端口 + number: 80 # 端口号 + diff --git a/ansible/files/02-03/README.md b/ansible/files/02-03/README.md new file mode 100644 index 0000000..4d4462d --- /dev/null +++ b/ansible/files/02-03/README.md @@ -0,0 +1,6 @@ +# 02-03(nginx 分课) + +本目录 YAML 与 `ansible/files/02-05/` 中对应课节清单为**同构副本**,专供本 `doc_id` 解耦学习与手工复制。 + +- **手动**:将清单拷到目标机路径,按文档改字段后执行 `kubectl` / bash(不必使用 verify)。 +- **自动**:`./ansible/bin/verify.sh run 02-03`。 diff --git a/ansible/files/02-04/04-worker-ingressroute.yaml b/ansible/files/02-04/04-worker-ingressroute.yaml new file mode 100644 index 0000000..8a02c6d --- /dev/null +++ b/ansible/files/02-04/04-worker-ingressroute.yaml @@ -0,0 +1,95 @@ +# 02-04: Nginx + 工作节点 + IngressRoute(M4) +# 路径 /demo-m4,指定一台工作节点(按实际 FQDN 修改 kubernetes.io/hostname) +# ConfigMap:首页 + default.conf,X-Backend: M4 便于区分 +--- +apiVersion: v1 # ConfigMap 使用的 API 版本 +kind: ConfigMap # 配置资源类型:ConfigMap +metadata: # ConfigMap 标识信息 + name: nginx-m4-html # ConfigMap 名称 + namespace: default # 命名空间 +data: # ConfigMap 数据区 + index.html: | # HTML 内容:挂载到 nginx 网页目录(内部内容行不改动) + + M4 +

M4

工作节点 + IngressRoute

+ default.conf: | # nginx 配置:通过 subPath 挂载到 conf.d/default.conf(内部内容行不改动) + server { listen 80; server_name localhost; root /usr/share/nginx/html; index index.html; location / { add_header X-Backend "M4"; try_files $uri $uri/ /index.html; } } +--- +apiVersion: apps/v1 # Deployment 使用的 API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 标识信息 + name: nginx-m4 # Deployment 名称 + namespace: default # 部署命名空间 + labels: # 应用标签/矩阵标签 + app: nginx-m4 # 应用标签 + matrix: "02-05-m4" # 矩阵编号 +spec: # Deployment 期望状态 + replicas: 1 # 副本数 + selector: # Deployment 选择器 + matchLabels: # 标签匹配集合(用于选中模板 Pod) + app: nginx-m4 # 必须与 template.metadata.labels 对上 + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nginx-m4 # Pod 标签 + spec: # Pod 规范 + nodeSelector: # 固定运行的工作节点 + kubernetes.io/hostname: ylc64 # worker 节点主机名 + volumes: # 卷定义 + - name: html # 卷名 + configMap: # 卷来源 + name: nginx-m4-html # 引用的 ConfigMap 名称 + containers: # 容器列表 + - name: nginx # 容器名 + image: nginx:alpine # nginx 镜像 + ports: # 容器端口 + - containerPort: 80 # HTTP 端口 + volumeMounts: # 容器内挂载 + - name: html # 引用 volumes[].name + mountPath: /usr/share/nginx/html/index.html # 挂到首页文件 + subPath: index.html # 取 ConfigMap 的 index.html key + readOnly: true # 只读 + - name: html # 仍复用同一个卷 + mountPath: /etc/nginx/conf.d/default.conf # 挂到 nginx 配置文件 + subPath: default.conf # 取 ConfigMap 的 default.conf key + readOnly: true # 只读 +--- +apiVersion: v1 # Service 使用的 API 版本 +kind: Service # 网络抽象:把 Pod 暴露为稳定入口 +metadata: # Service 标识 + name: nginx-m4 # Service 名称 + namespace: default # 命名空间 +spec: # Service 期望状态 + selector: # Service 选择器 + app: nginx-m4 # 选中后端 Pod + ports: # 端口映射列表 + - port: 80 # Service 端口 + targetPort: 80 # 转发到 Pod 容器端口 +--- +apiVersion: traefik.io/v1alpha1 # Traefik Middleware API 版本 +kind: Middleware # 中间件:stripPrefix +metadata: # Middleware 标识 + name: stripprefix-m4 # 名称 + namespace: default # 命名空间 +spec: # 中间件配置 + stripPrefix: # 去除路径前缀 + prefixes: # 前缀列表 + - /demo-m4 # 本矩阵路径前缀 +--- 
+apiVersion: traefik.io/v1alpha1 # IngressRoute API 版本 +kind: IngressRoute # Traefik 路由 CRD +metadata: # IngressRoute 标识 + name: nginx-m4 # 路由名称 + namespace: default # 命名空间 +spec: # IngressRoute 规则 + entryPoints: # 入口点列表 + - web # web(HTTP) + routes: # 路由列表 + - match: PathPrefix(`/demo-m4`) # 匹配 /demo-m4 前缀 + kind: Rule # 规则类型 + middlewares: # 绑定中间件 + - name: stripprefix-m4 # 需要去前缀 + services: # 后端服务列表 + - name: nginx-m4 # Service 名称 + port: 80 # Service 端口 + diff --git a/ansible/files/02-04/README.md b/ansible/files/02-04/README.md new file mode 100644 index 0000000..034410f --- /dev/null +++ b/ansible/files/02-04/README.md @@ -0,0 +1,6 @@ +# 02-04(nginx 分课) + +本目录 YAML 与 `ansible/files/02-05/` 中对应课节清单为**同构副本**,专供本 `doc_id` 解耦学习与手工复制。 + +- **手动**:将清单拷到目标机路径,按文档改字段后执行 `kubectl` / bash(不必使用 verify)。 +- **自动**:`./ansible/bin/verify.sh run 02-04`。 diff --git a/ansible/files/03-02/traefik-acme.yaml b/ansible/files/03-02/traefik-acme.yaml index e0c10ed..0b05d03 100644 --- a/ansible/files/03-02/traefik-acme.yaml +++ b/ansible/files/03-02/traefik-acme.yaml @@ -39,3 +39,8 @@ spec: # chart 注入配置的具体内容 nodeSelector: # 把 Traefik Pod 固定到指定节点(配合 RWO 本地存储更安全) kubernetes.io/hostname: ylc61 # 固定节点主机名(按你的实际节点修改) + # ping 绑定 websecure 时,chart 默认对 8080 做 HTTP /ping;须与 03-03 一致改为 HTTPS:8443 + deployment: + healthchecksPort: 8443 + healthchecksScheme: HTTPS + diff --git a/ansible/files/03-03/traefik-dashboard-acme.yaml b/ansible/files/03-03/traefik-dashboard-acme.yaml index 35ef0eb..2dd01d9 100644 --- a/ansible/files/03-03/traefik-dashboard-acme.yaml +++ b/ansible/files/03-03/traefik-dashboard-acme.yaml @@ -9,13 +9,17 @@ metadata: namespace: kube-system spec: valuesContent: |- + # chart 39.x:expose 须为表,布尔会与默认 values 合并冲突并导致 helm upgrade 模板失败 ports: web: - expose: true + expose: + default: true websecure: - expose: true + expose: + default: true traefik: - expose: true + expose: + default: true additionalArguments: # Dashboard @@ -48,13 +52,20 @@ spec: nodeSelector: 
kubernetes.io/hostname: ylc61 - # persistence:将 /data 持久化(local-path PVC),保证 acme.json 落盘 + # ping 绑定 websecure 时,chart 默认仍对 traefik(8080) 做 HTTP /ping → 404;与 chart 39 对齐探针 + deployment: + healthchecksPort: 8443 + healthchecksScheme: HTTPS + + # persistence:将 /data 持久化,保证 acme.json 落盘 + # 显式 local-path:避免集群默认 StorageClass 为 longhorn 等未就绪时 Pod 长期 Pending persistence: enabled: true name: data accessMode: ReadWriteOnce size: 128Mi path: /data + storageClass: local-path --- apiVersion: traefik.io/v1alpha1 diff --git a/ansible/files/03-04/cloudflared.yaml b/ansible/files/03-04/cloudflared.yaml index 9ee9973..0c6db5e 100644 --- a/ansible/files/03-04/cloudflared.yaml +++ b/ansible/files/03-04/cloudflared.yaml @@ -1,38 +1,31 @@ -# docs/03-04-k3s-cloudflare-tunnel-配置接入.md — 替换 TUNNEL_TOKEN 后应用 -apiVersion: v1 # Secret 使用的 Kubernetes API 版本 -kind: Secret # 资源类型:Secret(用于保存 Cloudflare Tunnel token) -metadata: # 元信息(名称/命名空间等) - name: cloudflared-credentials # Secret 名称(Deployment 中会引用) - namespace: kube-system # Secret 所在命名空间 -type: Opaque # Secret 类型(普通自定义键值) -stringData: # 以字符串方式提供 Secret 数据(便于直接写明文) - TUNNEL_TOKEN: "" # Cloudflare Tunnel Token(用你真实的 token 替换) +# docs/03-04-k3s-cloudflare-tunnel-配置接入.md +# Secret `cloudflared-credentials`(key=TUNNEL_TOKEN)由 verify playbook / 手工 kubectl create secret 创建,勿与此 Deployment 同 apply,避免覆盖 token。 +# 参考:ansible/playbooks/verify/03-04.yml → ensure-cloudflared-tunnel-secret --- -apiVersion: apps/v1 # Deployment 使用的 API 版本 -kind: Deployment # 工作负载:Deployment(管理 Pod 副本) -metadata: # Deployment 元信息 - name: cloudflared # Deployment 名称 - namespace: kube-system # 部署到的命名空间 -spec: # Deployment 期望状态 - replicas: 1 # 副本数(Tunnel 通常只跑一个副本即可) - selector: # Deployment 选择器:匹配 template 的 Pod - matchLabels: # 必须与 template.metadata.labels 对齐 - app: cloudflared # 应用标签 - template: # Pod 模板 - metadata: # Pod 元信息 - labels: # Pod 标签 - app: cloudflared # 与 selector.matchLabels 相同 - spec: # Pod 规范 - containers: # 容器列表 - - name: cloudflared # 容器名 - image: 
cloudflare/cloudflared:latest # cloudflared 镜像 - args: # 容器启动参数 - - tunnel # 命令子参数:tunnel - - run # 命令子参数:run - env: # 环境变量 - - name: TUNNEL_TOKEN # 容器内使用的环境变量名 - valueFrom: # 从某个来源取值 - secretKeyRef: # 从 Secret 的 key 取值 - name: cloudflared-credentials # Secret 名称 - key: TUNNEL_TOKEN # Secret 中的 key - +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cloudflared + namespace: kube-system +spec: + replicas: 1 + selector: + matchLabels: + app: cloudflared + template: + metadata: + labels: + app: cloudflared + spec: + containers: + - name: cloudflared + image: cloudflare/cloudflared:latest + args: + - tunnel + - run + env: + - name: TUNNEL_TOKEN + valueFrom: + secretKeyRef: + name: cloudflared-credentials + key: TUNNEL_TOKEN diff --git a/ansible/files/03-06/nfs-pv-pvc-demo.yaml b/ansible/files/03-06/nfs-pv-pvc-demo.yaml index 1a0b1d4..70aff5c 100644 --- a/ansible/files/03-06/nfs-pv-pvc-demo.yaml +++ b/ansible/files/03-06/nfs-pv-pvc-demo.yaml @@ -4,6 +4,7 @@ kind: PersistentVolume # 资源类型:持久卷(集群级) metadata: # PV 元信息 name: nfs-pv-demo # PV 名称 spec: # PV 规格 + storageClassName: "" # 显式禁用默认 StorageClass,供静态绑定 PVC 使用 capacity: # 容量声明 storage: 20Gi # PV 总容量 accessModes: # 访问模式列表 @@ -19,6 +20,7 @@ metadata: # PVC 元信息 name: nfs-pvc-demo # PVC 名称 namespace: default # PVC 所在命名空间 spec: # PVC 规格 + storageClassName: "" # 与 PV 对齐,避免被默认 longhorn class 注入导致绑定失败 accessModes: # 访问模式要求 - ReadWriteMany # 申请 RWX 访问模式 resources: # 资源请求 diff --git a/ansible/files/03-06/nfs-pvc-verify-job.yaml b/ansible/files/03-06/nfs-pvc-verify-job.yaml new file mode 100644 index 0000000..04c2310 --- /dev/null +++ b/ansible/files/03-06/nfs-pvc-verify-job.yaml @@ -0,0 +1,27 @@ +# docs/03-06-k3s-使用nfs存储.md — 自动化验收用:挂载 nfs-pvc-demo 并写文件(OC3 证据) +# 与 nfs-pv-pvc-demo.yaml 配合;手动学习 PV/PVC 时可不应用本文件。 +apiVersion: batch/v1 +kind: Job +metadata: + name: nfs-pvc-verify-demo + namespace: default +spec: + backoffLimit: 3 + template: + spec: + restartPolicy: Never + containers: + - name: verify-write + image: 
busybox:1.36 + command: + - /bin/sh + - -c + - echo "ok-$(date +%s)" > /data/.verify-nfs && sync && test -f /data/.verify-nfs + volumeMounts: + - name: data + mountPath: /data + readOnly: false + volumes: + - name: data + persistentVolumeClaim: + claimName: nfs-pvc-demo diff --git a/ansible/files/03-08/README.md b/ansible/files/03-08/README.md new file mode 100644 index 0000000..95f8cef --- /dev/null +++ b/ansible/files/03-08/README.md @@ -0,0 +1,9 @@ +# 03-08(K3s HA 配置与切换) + +| 文件 | 说明 | +|------|------| +| `k3s-server-ha-env.example.sh` | 外部 datastore、`tls-san`、第二节点 `K3S_URL`/`TOKEN` 等环境变量示例(**非** Kubernetes 清单) | + +- **手动**:将示例中的连接串、LB IP、token 替换为真实值;与 systemd/k3s 安装参数对照 [docs/03-08-k3s-ha-集群配置与切换.md](../../../docs/03-08-k3s-ha-集群配置与切换.md)。 +- **自动**:`./ansible/bin/verify.sh run 03-08`(noop + 基线;HA 步骤仍以文档与手工为准)。 +- 本篇**不提供**可 `kubectl apply` 的 HA 安装真源(控制平面由 k3s 与 datastore 决定)。 diff --git a/ansible/files/03-08/k3s-server-ha-env.example.sh b/ansible/files/03-08/k3s-server-ha-env.example.sh new file mode 100644 index 0000000..c0a0c54 --- /dev/null +++ b/ansible/files/03-08/k3s-server-ha-env.example.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# k3s 多 server + 外部 datastore 参数备忘(示例,勿直接 source 到生产) +# 完整步骤见:docs/03-08-k3s-ha-集群配置与切换.md +# 真源目录:ansible/files/03-08/ + +# --- 首个 server(示例 IP 请替换)--- +# export K3S_DATASTORE_ENDPOINT='postgres://k3s:CHANGE_ME@192.168.2.50:5432/k3s?sslmode=disable' +# sudo k3s server --datastore-endpoint="$K3S_DATASTORE_ENDPOINT" --tls-san 192.168.2.60 + +# --- 第二个 server(经 LB 加入,token 与控制端一致)--- +# export K3S_URL='https://192.168.2.60:6443' +# export K3S_TOKEN='' +# export K3S_DATASTORE_ENDPOINT='postgres://k3s:CHANGE_ME@192.168.2.50:5432/k3s?sslmode=disable' +# sudo k3s server --server "$K3S_URL" --token "$K3S_TOKEN" \ +# --datastore-endpoint="$K3S_DATASTORE_ENDPOINT" --tls-san 192.168.2.60 + +echo "[INFO] 编辑本文件中的占位符后,将命令复制到节点上执行;或写入 systemd ExecStart。" diff --git a/ansible/files/03-09/README.md b/ansible/files/03-09/README.md new file 
mode 100644 index 0000000..f6d64b7 --- /dev/null +++ b/ansible/files/03-09/README.md @@ -0,0 +1,9 @@ +# 03-09(GitOps 框架) + +| 文件 | 说明 | +|------|------| +| `argocd-namespace.example.yaml` | 仅创建 `argocd` 命名空间的极简示例(正式安装请用官方 `install.yaml` 或 Helm) | + +- **手动**:集群声明性配置的**主真源**建议在独立 GitOps 仓库(见 [docs/03-09-k3s-gitops-集群配置管理.md](../../../docs/03-09-k3s-gitops-集群配置管理.md));本目录只放与本仓库文档对齐的**示意**文件。 +- **自动**:`./ansible/bin/verify.sh run 03-09`(noop + 基线)。 +- 同一套 YAML 路径既可供你 `kubectl apply -f` 试跑,也与自动化验收引用同一目录,避免文档与仓库脱节。 diff --git a/ansible/files/03-09/argocd-namespace.example.yaml b/ansible/files/03-09/argocd-namespace.example.yaml new file mode 100644 index 0000000..a912206 --- /dev/null +++ b/ansible/files/03-09/argocd-namespace.example.yaml @@ -0,0 +1,8 @@ +# 极简示例:为后续 Argo CD 安装预留命名空间 +# 生产请改用官方清单: https://raw.githubusercontent.com/argoproj/argo-cd/stable/manifests/install.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: argocd + labels: + app.kubernetes.io/part-of: argocd diff --git a/ansible/files/04-02/04-02-nodejs-demo.yaml b/ansible/files/04-02/04-02-nodejs-demo.yaml new file mode 100644 index 0000000..8380f98 --- /dev/null +++ b/ansible/files/04-02/04-02-nodejs-demo.yaml @@ -0,0 +1,59 @@ +# 对应文档:docs/04-02-nodejs-镜像与运行命令.md +# 累积:04-01 + 固定镜像 tag、imagePullPolicy、command/args +apiVersion: apps/v1 # Deployment API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 +spec: # Deployment 规格 + replicas: 1 # 副本数 + selector: # Deployment 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # 固定 tag 的 Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略:本地有则不重复拉取 + command: ["node"] # 主命令 + args: # 命令参数 + - "-e" # 执行内联脚本 + - 
"require('http').createServer((req,res)=>res.end('Hello from pinned image')).listen(3000)" # Node.js 内联服务逻辑 + ports: # 容器端口 + - containerPort: 3000 # 应用监听端口 +--- +apiVersion: v1 # Service API 版本 +kind: Service # Service 资源 +metadata: # Service 元信息 + name: nodejs-demo # Service 名称 + namespace: default # 命名空间 +spec: # Service 规格 + selector: # 选择后端 Pod + app: nodejs-demo # 选中 app=nodejs-demo + ports: # 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 3000 # 转发到容器端口 +--- +apiVersion: networking.k8s.io/v1 # Ingress API 版本 +kind: Ingress # Ingress 资源 +metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: default # 命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 +spec: # Ingress 规则 + rules: # 规则列表 + - http: # HTTP 路由 + paths: # 路径列表 + - path: /node # 匹配路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 + diff --git a/ansible/files/04-02/README.md b/ansible/files/04-02/README.md new file mode 100644 index 0000000..99138ab --- /dev/null +++ b/ansible/files/04-02/README.md @@ -0,0 +1,6 @@ +# 04-02(nodejs 分课) + +本目录 `04-02-nodejs-demo.yaml` 由 `ansible/files/04-01/` 同名文件复制,与总集目录内文件**同构**,专供本课独立阅读;改本课清单时以本目录为准即可最小化影响其他篇。 + +- **手动**:`kubectl apply -f ansible/files/04-02/04-02-nodejs-demo.yaml`(按需改字段)。 +- **自动**:`./ansible/bin/verify.sh run 04-02`。 diff --git a/ansible/files/04-03/04-03-nodejs-demo.yaml b/ansible/files/04-03/04-03-nodejs-demo.yaml new file mode 100644 index 0000000..653dfd7 --- /dev/null +++ b/ansible/files/04-03/04-03-nodejs-demo.yaml @@ -0,0 +1,76 @@ +# 对应文档:docs/04-03-nodejs-环境变量与配置注入.md +# 累积:04-02 + ConfigMap + 通过 env 注入 APP_MSG(镜像仍用 18.20-alpine 与 04-02 一致) +apiVersion: v1 # ConfigMap API 版本 +kind: ConfigMap # 配置资源:ConfigMap +metadata: # ConfigMap 元信息 + name: nodejs-demo-config # ConfigMap 名称 + namespace: default # 命名空间 +data: # 配置键值 + APP_MSG: "Hello from ConfigMap" # 注入给应用的消息内容 +--- 
+apiVersion: apps/v1 # Deployment API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 +spec: # Deployment 规格 + replicas: 1 # 副本数 + selector: # Deployment 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略 + env: # 环境变量注入 + - name: APP_MSG # 环境变量名 + valueFrom: # 从资源引用取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 运行 node + - "-e" # 执行内联脚本 + - | # 多行 JS 脚本(内部内容不改动) + const http=require('http'); + const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(3000); + ports: # 容器端口 + - containerPort: 3000 # 应用监听端口 +--- +apiVersion: v1 # Service API 版本 +kind: Service # Service 资源 +metadata: # Service 元信息 + name: nodejs-demo # Service 名称 + namespace: default # 命名空间 +spec: # Service 规格 + selector: # 选择后端 Pod + app: nodejs-demo # 选中 app=nodejs-demo + ports: # 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 3000 # 转发到容器端口 +--- +apiVersion: networking.k8s.io/v1 # Ingress API 版本 +kind: Ingress # Ingress 资源 +metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: default # 命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 +spec: # Ingress 规则 + rules: # 规则列表 + - http: # HTTP 路由 + paths: # 路径列表 + - path: /node # 匹配路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 + diff --git a/ansible/files/04-03/README.md b/ansible/files/04-03/README.md new file mode 100644 index 0000000..49906fe --- /dev/null +++ b/ansible/files/04-03/README.md @@ -0,0 +1,6 
@@ +# 04-03(nodejs 分课) + +本目录 `04-03-nodejs-demo.yaml` 由 `ansible/files/04-01/` 同名文件复制,与总集目录内文件**同构**,专供本课独立阅读;改本课清单时以本目录为准即可最小化影响其他篇。 + +- **手动**:`kubectl apply -f ansible/files/04-03/04-03-nodejs-demo.yaml`(按需改字段)。 +- **自动**:`./ansible/bin/verify.sh run 04-03`。 diff --git a/ansible/files/04-04/04-04-nodejs-demo.yaml b/ansible/files/04-04/04-04-nodejs-demo.yaml new file mode 100644 index 0000000..0c0d789 --- /dev/null +++ b/ansible/files/04-04/04-04-nodejs-demo.yaml @@ -0,0 +1,76 @@ +# 对应文档:docs/04-04-nodejs-端口与Service.md +# 累积:04-03 + 容器与进程改监听 8080,Service targetPort 对齐 +apiVersion: v1 # ConfigMap API 版本 +kind: ConfigMap # 配置资源:ConfigMap +metadata: # ConfigMap 元信息 + name: nodejs-demo-config # ConfigMap 名称 + namespace: default # 命名空间 +data: # 配置键值 + APP_MSG: "Hello from ConfigMap" # 注入给应用的消息内容 +--- +apiVersion: apps/v1 # Deployment API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 +spec: # Deployment 规格 + replicas: 1 # 副本数 + selector: # Deployment 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略 + env: # 环境变量注入 + - name: APP_MSG # 环境变量名 + valueFrom: # 从资源引用取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 运行 node + - "-e" # 执行内联脚本 + - | # 多行 JS 脚本(内部内容不改动) + const http=require('http'); + const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(8080); + ports: # 容器端口 + - containerPort: 8080 # 应用监听端口 +--- +apiVersion: v1 # Service API 版本 +kind: Service # Service 资源 +metadata: # Service 元信息 + name: nodejs-demo # Service 名称 + namespace: default # 命名空间 +spec: # Service 规格 + selector: # 选择后端 
Pod + app: nodejs-demo # 选中 app=nodejs-demo + ports: # 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 8080 # 转发到容器端口 +--- +apiVersion: networking.k8s.io/v1 # Ingress API 版本 +kind: Ingress # Ingress 资源 +metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: default # 命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 +spec: # Ingress 规则 + rules: # 规则列表 + - http: # HTTP 路由 + paths: # 路径列表 + - path: /node # 匹配路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 + diff --git a/ansible/files/04-04/README.md b/ansible/files/04-04/README.md new file mode 100644 index 0000000..1571d58 --- /dev/null +++ b/ansible/files/04-04/README.md @@ -0,0 +1,6 @@ +# 04-04(nodejs 分课) + +本目录 `04-04-nodejs-demo.yaml` 由 `ansible/files/04-01/` 同名文件复制,与总集目录内文件**同构**,专供本课独立阅读;改本课清单时以本目录为准即可最小化影响其他篇。 + +- **手动**:`kubectl apply -f ansible/files/04-04/04-04-nodejs-demo.yaml`(按需改字段)。 +- **自动**:`./ansible/bin/verify.sh run 04-04`。 diff --git a/ansible/files/04-05/04-05-nodejs-demo.yaml b/ansible/files/04-05/04-05-nodejs-demo.yaml new file mode 100644 index 0000000..ca30089 --- /dev/null +++ b/ansible/files/04-05/04-05-nodejs-demo.yaml @@ -0,0 +1,83 @@ +# 对应文档:docs/04-05-nodejs-资源请求与限制.md +# 累积:04-04 + resources.requests/limits +apiVersion: v1 # ConfigMap API 版本 +kind: ConfigMap # 配置资源:ConfigMap +metadata: # ConfigMap 元信息 + name: nodejs-demo-config # ConfigMap 名称 + namespace: default # 命名空间 +data: # 配置键值 + APP_MSG: "Hello from ConfigMap" # 注入给应用的消息内容 +--- +apiVersion: apps/v1 # Deployment API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 +spec: # Deployment 规格 + replicas: 1 # 副本数 + selector: # Deployment 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: 
nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略 + env: # 环境变量注入 + - name: APP_MSG # 环境变量名 + valueFrom: # 从资源引用取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 运行 node + - "-e" # 执行内联脚本 + - | # 多行 JS 脚本(内部内容不改动) + const http=require('http'); + const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(8080); + ports: # 容器端口 + - containerPort: 8080 # 应用监听端口 + resources: # 资源请求与限制 + requests: # 最小资源请求 + cpu: "50m" # 请求 CPU + memory: "64Mi" # 请求内存 + limits: # 资源上限 + cpu: "500m" # CPU 限制 + memory: "256Mi" # 内存限制 +--- +apiVersion: v1 # Service API 版本 +kind: Service # Service 资源 +metadata: # Service 元信息 + name: nodejs-demo # Service 名称 + namespace: default # 命名空间 +spec: # Service 规格 + selector: # 选择后端 Pod + app: nodejs-demo # 选中 app=nodejs-demo + ports: # 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 8080 # 转发到容器端口 +--- +apiVersion: networking.k8s.io/v1 # Ingress API 版本 +kind: Ingress # Ingress 资源 +metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: default # 命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 +spec: # Ingress 规则 + rules: # 规则列表 + - http: # HTTP 路由 + paths: # 路径列表 + - path: /node # 匹配路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 + diff --git a/ansible/files/04-05/README.md b/ansible/files/04-05/README.md new file mode 100644 index 0000000..ec44355 --- /dev/null +++ b/ansible/files/04-05/README.md @@ -0,0 +1,6 @@ +# 04-05(nodejs 分课) + +本目录 `04-05-nodejs-demo.yaml` 由 `ansible/files/04-01/` 同名文件复制,与总集目录内文件**同构**,专供本课独立阅读;改本课清单时以本目录为准即可最小化影响其他篇。 + +- **手动**:`kubectl apply -f ansible/files/04-05/04-05-nodejs-demo.yaml`(按需改字段)。 +- 
**自动**:`./ansible/bin/verify.sh run 04-05`。 diff --git a/ansible/files/04-06/04-06-nodejs-demo.yaml b/ansible/files/04-06/04-06-nodejs-demo.yaml new file mode 100644 index 0000000..2db35a0 --- /dev/null +++ b/ansible/files/04-06/04-06-nodejs-demo.yaml @@ -0,0 +1,95 @@ +# 对应文档:docs/04-06-nodejs-探针与健康检查.md +# 累积:04-05 + livenessProbe/readinessProbe(端口 8080,路径 /) +apiVersion: v1 # ConfigMap API 版本 +kind: ConfigMap # 配置资源:ConfigMap +metadata: # ConfigMap 元信息 + name: nodejs-demo-config # ConfigMap 名称 + namespace: default # 命名空间 +data: # 配置键值 + APP_MSG: "Hello from ConfigMap" # 注入给应用的消息内容 +--- +apiVersion: apps/v1 # Deployment API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 +spec: # Deployment 规格 + replicas: 1 # 副本数 + selector: # Deployment 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略 + env: # 环境变量注入 + - name: APP_MSG # 环境变量名 + valueFrom: # 从资源引用取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 运行 node + - "-e" # 执行内联脚本 + - | # 多行 JS 脚本(内部内容不改动) + const http=require('http'); + const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(8080); + ports: # 容器端口 + - containerPort: 8080 # 应用监听端口 + resources: # 资源请求与限制 + requests: # 最小资源请求 + cpu: "50m" # 请求 CPU + memory: "64Mi" # 请求内存 + limits: # 资源上限 + cpu: "500m" # CPU 限制 + memory: "256Mi" # 内存限制 + livenessProbe: # 存活探针(判断容器是否需要重启) + httpGet: # 通过 HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 3 # 启动后首次探测延迟 + periodSeconds: 10 # 探测周期 + readinessProbe: # 就绪探针(判断是否接收流量) + httpGet: # 通过 HTTP 探测 + path: / # 探测路径 + port: 8080 # 
探测端口 + initialDelaySeconds: 2 # 启动后首次探测延迟 + periodSeconds: 5 # 探测周期 +--- +apiVersion: v1 # Service API 版本 +kind: Service # Service 资源 +metadata: # Service 元信息 + name: nodejs-demo # Service 名称 + namespace: default # 命名空间 +spec: # Service 规格 + selector: # 选择后端 Pod + app: nodejs-demo # 选中 app=nodejs-demo + ports: # 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 8080 # 转发到容器端口 +--- +apiVersion: networking.k8s.io/v1 # Ingress API 版本 +kind: Ingress # Ingress 资源 +metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: default # 命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 +spec: # Ingress 规则 + rules: # 规则列表 + - http: # HTTP 路由 + paths: # 路径列表 + - path: /node # 匹配路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 + diff --git a/ansible/files/04-06/README.md b/ansible/files/04-06/README.md new file mode 100644 index 0000000..115fa51 --- /dev/null +++ b/ansible/files/04-06/README.md @@ -0,0 +1,6 @@ +# 04-06(nodejs 分课) + +本目录 `04-06-nodejs-demo.yaml` 由 `ansible/files/04-01/` 同名文件复制,与总集目录内文件**同构**,专供本课独立阅读;改本课清单时以本目录为准即可最小化影响其他篇。 + +- **手动**:`kubectl apply -f ansible/files/04-06/04-06-nodejs-demo.yaml`(按需改字段)。 +- **自动**:`./ansible/bin/verify.sh run 04-06`。 diff --git a/ansible/files/04-07/04-07-nodejs-demo.yaml b/ansible/files/04-07/04-07-nodejs-demo.yaml new file mode 100644 index 0000000..b81be5b --- /dev/null +++ b/ansible/files/04-07/04-07-nodejs-demo.yaml @@ -0,0 +1,97 @@ +# 对应文档:docs/04-07-nodejs-调度与亲和.md +# 累积:04-06 + nodeSelector(默认 ylc62,请改为本集群节点短主机名) +apiVersion: v1 # ConfigMap API 版本 +kind: ConfigMap # 配置资源:ConfigMap +metadata: # ConfigMap 元信息 + name: nodejs-demo-config # ConfigMap 名称 + namespace: default # 命名空间 +data: # 配置键值 + APP_MSG: "Hello from ConfigMap" # 注入给应用的消息内容 +--- +apiVersion: apps/v1 # Deployment API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 元信息 + name: 
nodejs-demo # Deployment 名称 + namespace: default # 命名空间 +spec: # Deployment 规格 + replicas: 1 # 副本数 + selector: # Deployment 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + nodeSelector: # 调度到指定节点 + kubernetes.io/hostname: ylc62 # 节点主机名(按实际修改) + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略 + env: # 环境变量注入 + - name: APP_MSG # 环境变量名 + valueFrom: # 从资源引用取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 运行 node + - "-e" # 执行内联脚本 + - | # 多行 JS 脚本(内部内容不改动) + const http=require('http'); + const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(8080); + ports: # 容器端口 + - containerPort: 8080 # 应用监听端口 + resources: # 资源请求与限制 + requests: # 最小资源请求 + cpu: "50m" # 请求 CPU + memory: "64Mi" # 请求内存 + limits: # 资源上限 + cpu: "500m" # CPU 限制 + memory: "256Mi" # 内存限制 + livenessProbe: # 存活探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 3 # 启动后首次探测延迟 + periodSeconds: 10 # 探测周期 + readinessProbe: # 就绪探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 2 # 启动后首次探测延迟 + periodSeconds: 5 # 探测周期 +--- +apiVersion: v1 # Service API 版本 +kind: Service # Service 资源 +metadata: # Service 元信息 + name: nodejs-demo # Service 名称 + namespace: default # 命名空间 +spec: # Service 规格 + selector: # 选择后端 Pod + app: nodejs-demo # 选中 app=nodejs-demo + ports: # 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 8080 # 转发到容器端口 +--- +apiVersion: networking.k8s.io/v1 # Ingress API 版本 +kind: Ingress # Ingress 资源 +metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: default # 命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 +spec: # 
Ingress 规则 + rules: # 规则列表 + - http: # HTTP 路由 + paths: # 路径列表 + - path: /node # 匹配路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 + diff --git a/ansible/files/04-07/README.md b/ansible/files/04-07/README.md new file mode 100644 index 0000000..cfe33ca --- /dev/null +++ b/ansible/files/04-07/README.md @@ -0,0 +1,6 @@ +# 04-07(nodejs 分课) + +本目录 `04-07-nodejs-demo.yaml` 由 `ansible/files/04-01/` 同名文件复制,与总集目录内文件**同构**,专供本课独立阅读;改本课清单时以本目录为准即可最小化影响其他篇。 + +- **手动**:`kubectl apply -f ansible/files/04-07/04-07-nodejs-demo.yaml`(按需改字段)。 +- **自动**:`./ansible/bin/verify.sh run 04-07`。 diff --git a/ansible/files/04-08/04-08-nodejs-demo.yaml b/ansible/files/04-08/04-08-nodejs-demo.yaml new file mode 100644 index 0000000..361e2bf --- /dev/null +++ b/ansible/files/04-08/04-08-nodejs-demo.yaml @@ -0,0 +1,110 @@ +# 对应文档:docs/04-08-nodejs-安全上下文.md +# 累积:04-07 + pod securityContext.fsGroup、容器 securityContext、只读根、/tmp emptyDir +apiVersion: v1 # ConfigMap API 版本 +kind: ConfigMap # 配置资源:ConfigMap +metadata: # ConfigMap 元信息 + name: nodejs-demo-config # ConfigMap 名称 + namespace: default # 命名空间 +data: # 配置键值 + APP_MSG: "Hello from ConfigMap" # 注入给应用的消息内容 +--- +apiVersion: apps/v1 # Deployment API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 +spec: # Deployment 规格 + replicas: 1 # 副本数 + selector: # Deployment 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + nodeSelector: # 调度到指定节点 + kubernetes.io/hostname: ylc62 # 节点主机名(按实际修改) + securityContext: # Pod 级安全上下文 + fsGroup: 1000 # 挂载卷文件组 ID + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略 + securityContext: # 容器级安全上下文 + 
allowPrivilegeEscalation: false # 禁止提权 + runAsNonRoot: true # 强制非 root 运行 + runAsUser: 1000 # 运行用户 UID + readOnlyRootFilesystem: true # 根文件系统只读 + env: # 环境变量注入 + - name: APP_MSG # 环境变量名 + valueFrom: # 从资源引用取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 运行 node + - "-e" # 执行内联脚本 + - | # 多行 JS 脚本(内部内容不改动) + const http=require('http'); + const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(8080); + ports: # 容器端口 + - containerPort: 8080 # 应用监听端口 + resources: # 资源请求与限制 + requests: # 最小资源请求 + cpu: "50m" # 请求 CPU + memory: "64Mi" # 请求内存 + limits: # 资源上限 + cpu: "500m" # CPU 限制 + memory: "256Mi" # 内存限制 + livenessProbe: # 存活探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 3 # 启动后首次探测延迟 + periodSeconds: 10 # 探测周期 + readinessProbe: # 就绪探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 2 # 启动后首次探测延迟 + periodSeconds: 5 # 探测周期 + volumeMounts: # 卷挂载 + - name: tmp # 引用临时卷 + mountPath: /tmp # 容器内临时目录 + volumes: # 卷定义 + - name: tmp # 临时卷名称 + emptyDir: {} # 空目录卷(Pod 生命周期内) +--- +apiVersion: v1 # Service API 版本 +kind: Service # Service 资源 +metadata: # Service 元信息 + name: nodejs-demo # Service 名称 + namespace: default # 命名空间 +spec: # Service 规格 + selector: # 选择后端 Pod + app: nodejs-demo # 选中 app=nodejs-demo + ports: # 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 8080 # 转发到容器端口 +--- +apiVersion: networking.k8s.io/v1 # Ingress API 版本 +kind: Ingress # Ingress 资源 +metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: default # 命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 +spec: # Ingress 规则 + rules: # 规则列表 + - http: # HTTP 路由 + paths: # 路径列表 + - path: /node # 匹配路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 + diff --git 
a/ansible/files/04-08/README.md b/ansible/files/04-08/README.md new file mode 100644 index 0000000..8e3ca92 --- /dev/null +++ b/ansible/files/04-08/README.md @@ -0,0 +1,6 @@ +# 04-08(nodejs 分课) + +本目录 `04-08-nodejs-demo.yaml` 由 `ansible/files/04-01/` 同名文件复制,与总集目录内文件**同构**,专供本课独立阅读;改本课清单时以本目录为准即可最小化影响其他篇。 + +- **手动**:`kubectl apply -f ansible/files/04-08/04-08-nodejs-demo.yaml`(按需改字段)。 +- **自动**:`./ansible/bin/verify.sh run 04-08`。 diff --git a/ansible/files/04-09/04-09-nodejs-demo.yaml b/ansible/files/04-09/04-09-nodejs-demo.yaml new file mode 100644 index 0000000..433b86c --- /dev/null +++ b/ansible/files/04-09/04-09-nodejs-demo.yaml @@ -0,0 +1,128 @@ +# 对应文档:docs/04-09-nodejs-存储与卷.md +# 累积:04-08 + PVC nodejs-demo-data(默认 storageClassName: local-path,可按集群改为 longhorn 等)+ 挂载 /data +apiVersion: v1 # PVC API 版本 +kind: PersistentVolumeClaim # 持久卷声明 +metadata: # PVC 元信息 + name: nodejs-demo-data # PVC 名称 + namespace: default # 命名空间 +spec: # PVC 规格 + accessModes: # 访问模式 + - ReadWriteOnce # RWO:同一时间仅单节点挂载读写 + storageClassName: local-path # 存储类(按集群可改) + resources: # 资源请求 + requests: # 配额请求 + storage: 1Gi # 申请容量 +--- +apiVersion: v1 # ConfigMap API 版本 +kind: ConfigMap # 配置资源 +metadata: # ConfigMap 元信息 + name: nodejs-demo-config # ConfigMap 名称 + namespace: default # 命名空间 +data: # 配置键值 + APP_MSG: "Hello from ConfigMap" # 示例消息内容 +--- +apiVersion: apps/v1 # Deployment API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 +spec: # Deployment 规格 + replicas: 1 # 副本数 + selector: # Pod 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + nodeSelector: # 节点选择 + kubernetes.io/hostname: ylc62 # 固定到指定节点(按实际修改) + securityContext: # Pod 级安全上下文 + fsGroup: 1000 # 挂载卷文件组 ID + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + 
imagePullPolicy: IfNotPresent # 拉取策略 + securityContext: # 容器级安全上下文 + allowPrivilegeEscalation: false # 禁止提权 + runAsNonRoot: true # 非 root 运行 + runAsUser: 1000 # 运行用户 UID + readOnlyRootFilesystem: true # 根文件系统只读 + env: # 环境变量 + - name: APP_MSG # 环境变量名 + valueFrom: # 从引用源取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 执行 node + - "-e" # 执行内联脚本 + - | # 多行 JS 脚本(内容保持原样) + const http=require('http'); + const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(8080); + ports: # 容器端口 + - containerPort: 8080 # 监听端口 + resources: # 资源请求与限制 + requests: # 最小资源请求 + cpu: "50m" # 请求 CPU + memory: "64Mi" # 请求内存 + limits: # 资源上限 + cpu: "500m" # CPU 限制 + memory: "256Mi" # 内存限制 + livenessProbe: # 存活探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 3 # 初始延迟 + periodSeconds: 10 # 探测周期 + readinessProbe: # 就绪探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 2 # 初始延迟 + periodSeconds: 5 # 探测周期 + volumeMounts: # 卷挂载 + - name: tmp # 临时卷名称 + mountPath: /tmp # 容器内临时目录 + - name: data # 数据卷名称 + mountPath: /data # 容器内数据目录 + volumes: # 卷定义 + - name: tmp # 临时卷 + emptyDir: {} # 空目录卷 + - name: data # 数据卷 + persistentVolumeClaim: # 卷来源为 PVC + claimName: nodejs-demo-data # 绑定 PVC 名称 +--- +apiVersion: v1 # Service API 版本 +kind: Service # Service 资源 +metadata: # Service 元信息 + name: nodejs-demo # Service 名称 + namespace: default # 命名空间 +spec: # Service 规格 + selector: # 选择后端 Pod + app: nodejs-demo # 选中 app=nodejs-demo + ports: # 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 8080 # 转发到容器端口 +--- +apiVersion: networking.k8s.io/v1 # Ingress API 版本 +kind: Ingress # Ingress 资源 +metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: default # 命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 +spec: # Ingress 规则 + rules: # 规则列表 + - http: # HTTP 路由 + 
paths: # 路径列表 + - path: /node # 路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 + diff --git a/ansible/files/04-09/README.md b/ansible/files/04-09/README.md new file mode 100644 index 0000000..d46cd9f --- /dev/null +++ b/ansible/files/04-09/README.md @@ -0,0 +1,6 @@ +# 04-09(nodejs 分课) + +本目录 `04-09-nodejs-demo.yaml` 由 `ansible/files/04-01/` 同名文件复制,与总集目录内文件**同构**,专供本课独立阅读;改本课清单时以本目录为准即可最小化影响其他篇。 + +- **手动**:`kubectl apply -f ansible/files/04-09/04-09-nodejs-demo.yaml`(按需改字段)。 +- **自动**:`./ansible/bin/verify.sh run 04-09`。 diff --git a/ansible/files/04-10/04-10-nodejs-demo.yaml b/ansible/files/04-10/04-10-nodejs-demo.yaml new file mode 100644 index 0000000..085a5ef --- /dev/null +++ b/ansible/files/04-10/04-10-nodejs-demo.yaml @@ -0,0 +1,129 @@ +# 对应文档:docs/04-10-nodejs-Ingress与Traefik.md +# 累积:04-09 + Ingress 增加 host、path 改为 /api(访问需 Host: app.example.local) +apiVersion: v1 # PVC API 版本 +kind: PersistentVolumeClaim # 持久卷声明 +metadata: # PVC 元信息 + name: nodejs-demo-data # PVC 名称 + namespace: default # 命名空间 +spec: # PVC 规格 + accessModes: # 访问模式 + - ReadWriteOnce # RWO:同一时间仅单节点挂载读写 + storageClassName: local-path # 存储类 + resources: # 资源请求 + requests: # 配额请求 + storage: 1Gi # 申请容量 +--- +apiVersion: v1 # ConfigMap API 版本 +kind: ConfigMap # 配置资源 +metadata: # ConfigMap 元信息 + name: nodejs-demo-config # ConfigMap 名称 + namespace: default # 命名空间 +data: # 配置键值 + APP_MSG: "Hello from ConfigMap" # 示例消息内容 +--- +apiVersion: apps/v1 # Deployment API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 +spec: # Deployment 规格 + replicas: 1 # 副本数 + selector: # Pod 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + nodeSelector: # 节点选择 + kubernetes.io/hostname: ylc62 
# 固定到指定节点(按实际修改) + securityContext: # Pod 级安全上下文 + fsGroup: 1000 # 挂载卷文件组 ID + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略 + securityContext: # 容器级安全上下文 + allowPrivilegeEscalation: false # 禁止提权 + runAsNonRoot: true # 非 root 运行 + runAsUser: 1000 # 运行用户 UID + readOnlyRootFilesystem: true # 根文件系统只读 + env: # 环境变量 + - name: APP_MSG # 环境变量名 + valueFrom: # 从引用源取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 执行 node + - "-e" # 执行内联脚本 + - | # 多行 JS 脚本(内容保持原样) + const http=require('http'); + const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(8080); + ports: # 容器端口 + - containerPort: 8080 # 监听端口 + resources: # 资源请求与限制 + requests: # 最小资源请求 + cpu: "50m" # 请求 CPU + memory: "64Mi" # 请求内存 + limits: # 资源上限 + cpu: "500m" # CPU 限制 + memory: "256Mi" # 内存限制 + livenessProbe: # 存活探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 3 # 初始延迟 + periodSeconds: 10 # 探测周期 + readinessProbe: # 就绪探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 2 # 初始延迟 + periodSeconds: 5 # 探测周期 + volumeMounts: # 卷挂载 + - name: tmp # 临时卷名称 + mountPath: /tmp # 容器内临时目录 + - name: data # 数据卷名称 + mountPath: /data # 容器内数据目录 + volumes: # 卷定义 + - name: tmp # 临时卷 + emptyDir: {} # 空目录卷 + - name: data # 数据卷 + persistentVolumeClaim: # 卷来源为 PVC + claimName: nodejs-demo-data # 绑定 PVC 名称 +--- +apiVersion: v1 # Service API 版本 +kind: Service # Service 资源 +metadata: # Service 元信息 + name: nodejs-demo # Service 名称 + namespace: default # 命名空间 +spec: # Service 规格 + selector: # 选择后端 Pod + app: nodejs-demo # 选中 app=nodejs-demo + ports: # 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 8080 # 转发到容器端口 +--- +apiVersion: networking.k8s.io/v1 # Ingress API 版本 +kind: Ingress # Ingress 资源 +metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: default # 
命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 +spec: # Ingress 规则 + rules: # 规则列表 + - host: app.example.local # 主机名匹配 + http: # HTTP 路由 + paths: # 路径列表 + - path: /api # 匹配 API 路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 + diff --git a/ansible/files/04-10/README.md b/ansible/files/04-10/README.md new file mode 100644 index 0000000..e96960f --- /dev/null +++ b/ansible/files/04-10/README.md @@ -0,0 +1,6 @@ +# 04-10(nodejs 分课) + +本目录 `04-10-nodejs-demo.yaml` 由 `ansible/files/04-01/` 同名文件复制,与总集目录内文件**同构**,专供本课独立阅读;改本课清单时以本目录为准即可最小化影响其他篇。 + +- **手动**:`kubectl apply -f ansible/files/04-10/04-10-nodejs-demo.yaml`(按需改字段)。 +- **自动**:`./ansible/bin/verify.sh run 04-10`。 diff --git a/ansible/files/04-11/04-11-nodejs-demo.yaml b/ansible/files/04-11/04-11-nodejs-demo.yaml new file mode 100644 index 0000000..c4f84c1 --- /dev/null +++ b/ansible/files/04-11/04-11-nodejs-demo.yaml @@ -0,0 +1,134 @@ +# 对应文档:docs/04-11-nodejs-副本与滚动发布.md +# 累积:04-10 + replicas: 3 + RollingUpdate(maxSurge:1 maxUnavailable:0) +apiVersion: v1 # PVC API 版本 +kind: PersistentVolumeClaim # 持久卷声明 +metadata: # PVC 元信息 + name: nodejs-demo-data # PVC 名称 + namespace: default # 命名空间 +spec: # PVC 规格 + accessModes: # 访问模式 + - ReadWriteOnce # RWO:同一时间仅单节点挂载读写 + storageClassName: local-path # 存储类 + resources: # 资源请求 + requests: # 配额请求 + storage: 1Gi # 申请容量 +--- +apiVersion: v1 # ConfigMap API 版本 +kind: ConfigMap # 配置资源 +metadata: # ConfigMap 元信息 + name: nodejs-demo-config # ConfigMap 名称 + namespace: default # 命名空间 +data: # 配置键值 + APP_MSG: "Hello from ConfigMap" # 示例消息内容 +--- +apiVersion: apps/v1 # Deployment API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 +spec: # Deployment 规格 + replicas: 3 # 副本数(高可用) + strategy: # 更新策略 + type: RollingUpdate # 滚动更新 + rollingUpdate: # 
滚动更新参数 + maxSurge: 1 # 更新时最多额外增加 1 个 Pod + maxUnavailable: 0 # 更新时不可用 Pod 数为 0 + selector: # Pod 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + nodeSelector: # 节点选择 + kubernetes.io/hostname: ylc62 # 固定到指定节点(按实际修改) + securityContext: # Pod 级安全上下文 + fsGroup: 1000 # 挂载卷文件组 ID + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略 + securityContext: # 容器级安全上下文 + allowPrivilegeEscalation: false # 禁止提权 + runAsNonRoot: true # 非 root 运行 + runAsUser: 1000 # 运行用户 UID + readOnlyRootFilesystem: true # 根文件系统只读 + env: # 环境变量 + - name: APP_MSG # 环境变量名 + valueFrom: # 从引用源取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 执行 node + - "-e" # 执行内联脚本 + - | # 多行 JS 脚本(内容保持原样) + const http=require('http'); + const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(8080); + ports: # 容器端口 + - containerPort: 8080 # 监听端口 + resources: # 资源请求与限制 + requests: # 最小资源请求 + cpu: "50m" # 请求 CPU + memory: "64Mi" # 请求内存 + limits: # 资源上限 + cpu: "500m" # CPU 限制 + memory: "256Mi" # 内存限制 + livenessProbe: # 存活探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 3 # 初始延迟 + periodSeconds: 10 # 探测周期 + readinessProbe: # 就绪探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 2 # 初始延迟 + periodSeconds: 5 # 探测周期 + volumeMounts: # 卷挂载 + - name: tmp # 临时卷名称 + mountPath: /tmp # 容器内临时目录 + - name: data # 数据卷名称 + mountPath: /data # 容器内数据目录 + volumes: # 卷定义 + - name: tmp # 临时卷 + emptyDir: {} # 空目录卷 + - name: data # 数据卷 + persistentVolumeClaim: # 卷来源为 PVC + claimName: nodejs-demo-data # 绑定 PVC 名称 +--- +apiVersion: v1 # Service API 版本 +kind: Service # Service 资源 +metadata: # Service 元信息 + name: nodejs-demo # Service 
名称 + namespace: default # 命名空间 +spec: # Service 规格 + selector: # 选择后端 Pod + app: nodejs-demo # 选中 app=nodejs-demo + ports: # 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 8080 # 转发到容器端口 +--- +apiVersion: networking.k8s.io/v1 # Ingress API 版本 +kind: Ingress # Ingress 资源 +metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: default # 命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 +spec: # Ingress 规则 + rules: # 规则列表 + - host: app.example.local # 主机名匹配 + http: # HTTP 路由 + paths: # 路径列表 + - path: /api # 匹配 API 路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 + diff --git a/ansible/files/04-11/README.md b/ansible/files/04-11/README.md new file mode 100644 index 0000000..8a477e2 --- /dev/null +++ b/ansible/files/04-11/README.md @@ -0,0 +1,6 @@ +# 04-11(nodejs 分课) + +本目录 `04-11-nodejs-demo.yaml` 由 `ansible/files/04-01/` 同名文件复制,与总集目录内文件**同构**,专供本课独立阅读;改本课清单时以本目录为准即可最小化影响其他篇。 + +- **手动**:`kubectl apply -f ansible/files/04-11/04-11-nodejs-demo.yaml`(按需改字段)。 +- **自动**:`./ansible/bin/verify.sh run 04-11`。 diff --git a/ansible/files/04-12/04-12-nodejs-demo.yaml b/ansible/files/04-12/04-12-nodejs-demo.yaml new file mode 100644 index 0000000..bf40a44 --- /dev/null +++ b/ansible/files/04-12/04-12-nodejs-demo.yaml @@ -0,0 +1,141 @@ +# 对应文档:docs/04-12-nodejs-TLS与证书.md +# 累积:04-11 + Ingress TLS(websecure、secretName: nodejs-demo-tls) +# 应用前请先创建 TLS Secret,例如: +# kubectl create secret tls nodejs-demo-tls --cert=fullchain.pem --key=privkey.pem -n default +# 证书 SAN 须覆盖 app.example.local(与 rules.host / tls.hosts 一致) +apiVersion: v1 # PVC API 版本 +kind: PersistentVolumeClaim # 持久卷声明 +metadata: # PVC 元信息 + name: nodejs-demo-data # PVC 名称 + namespace: default # 命名空间 +spec: # PVC 规格 + accessModes: # 访问模式 + - ReadWriteOnce # RWO:同一时间仅单节点挂载读写 + storageClassName: local-path # 存储类 + resources: # 资源请求 + requests: # 配额请求 + 
storage: 1Gi # 申请容量 +--- +apiVersion: v1 # ConfigMap API 版本 +kind: ConfigMap # 配置资源 +metadata: # ConfigMap 元信息 + name: nodejs-demo-config # ConfigMap 名称 + namespace: default # 命名空间 +data: # 配置键值 + APP_MSG: "Hello from ConfigMap" # 示例消息内容 +--- +apiVersion: apps/v1 # Deployment API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 +spec: # Deployment 规格 + replicas: 3 # 副本数 + strategy: # 更新策略 + type: RollingUpdate # 滚动更新 + rollingUpdate: # 滚动更新参数 + maxSurge: 1 # 更新时最多额外增加 1 个 Pod + maxUnavailable: 0 # 更新时不可用 Pod 数为 0 + selector: # Pod 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + nodeSelector: # 节点选择 + kubernetes.io/hostname: ylc62 # 固定到指定节点(按实际修改) + securityContext: # Pod 级安全上下文 + fsGroup: 1000 # 挂载卷文件组 ID + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略 + securityContext: # 容器级安全上下文 + allowPrivilegeEscalation: false # 禁止提权 + runAsNonRoot: true # 非 root 运行 + runAsUser: 1000 # 运行用户 UID + readOnlyRootFilesystem: true # 根文件系统只读 + env: # 环境变量 + - name: APP_MSG # 环境变量名 + valueFrom: # 从引用源取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 执行 node + - "-e" # 执行内联脚本 + - | # 多行 JS 脚本(内容保持原样) + const http=require('http'); + const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(8080); + ports: # 容器端口 + - containerPort: 8080 # 监听端口 + resources: # 资源请求与限制 + requests: # 最小资源请求 + cpu: "50m" # 请求 CPU + memory: "64Mi" # 请求内存 + limits: # 资源上限 + cpu: "500m" # CPU 限制 + memory: "256Mi" # 内存限制 + livenessProbe: # 存活探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 3 # 初始延迟 + periodSeconds: 10 # 探测周期 + readinessProbe: # 
就绪探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 2 # 初始延迟 + periodSeconds: 5 # 探测周期 + volumeMounts: # 卷挂载 + - name: tmp # 临时卷名称 + mountPath: /tmp # 容器内临时目录 + - name: data # 数据卷名称 + mountPath: /data # 容器内数据目录 + volumes: # 卷定义 + - name: tmp # 临时卷 + emptyDir: {} # 空目录卷 + - name: data # 数据卷 + persistentVolumeClaim: # 卷来源为 PVC + claimName: nodejs-demo-data # 绑定 PVC 名称 +--- +apiVersion: v1 # Service API 版本 +kind: Service # Service 资源 +metadata: # Service 元信息 + name: nodejs-demo # Service 名称 + namespace: default # 命名空间 +spec: # Service 规格 + selector: # 选择后端 Pod + app: nodejs-demo # 选中 app=nodejs-demo + ports: # 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 8080 # 转发到容器端口 +--- +apiVersion: networking.k8s.io/v1 # Ingress API 版本 +kind: Ingress # Ingress 资源 +metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: default # 命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: websecure # 使用 websecure(HTTPS) 入口 +spec: # Ingress 规则 + tls: # TLS 配置 + - hosts: # 证书覆盖域名 + - app.example.local # 域名 + secretName: nodejs-demo-tls # 引用的 TLS Secret 名称 + rules: # 路由规则列表 + - host: app.example.local # 主机名匹配 + http: # HTTP 路由 + paths: # 路径列表 + - path: /api # 匹配 API 路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 + diff --git a/ansible/files/04-12/README.md b/ansible/files/04-12/README.md new file mode 100644 index 0000000..dd86291 --- /dev/null +++ b/ansible/files/04-12/README.md @@ -0,0 +1,6 @@ +# 04-12(nodejs 分课) + +本目录 `04-12-nodejs-demo.yaml` 由 `ansible/files/04-01/` 同名文件复制,与总集目录内文件**同构**,专供本课独立阅读;改本课清单时以本目录为准即可最小化影响其他篇。 + +- **手动**:`kubectl apply -f ansible/files/04-12/04-12-nodejs-demo.yaml`(按需改字段)。 +- **自动**:`./ansible/bin/verify.sh run 04-12`。 diff --git a/ansible/files/04-13/04-13-nodejs-demo.yaml b/ansible/files/04-13/04-13-nodejs-demo.yaml new file mode 100644 index 0000000..8ab244b --- /dev/null +++ 
b/ansible/files/04-13/04-13-nodejs-demo.yaml @@ -0,0 +1,158 @@ +# 对应文档:docs/04-13-nodejs-HPA.md +# 累积:04-12 + HorizontalPodAutoscaler(CPU 50%,min 1 max 5) +apiVersion: v1 # PVC API 版本 +kind: PersistentVolumeClaim # 持久卷声明 +metadata: # PVC 元信息 + name: nodejs-demo-data # PVC 名称 + namespace: default # 命名空间 +spec: # PVC 规格 + accessModes: # 访问模式 + - ReadWriteOnce # RWO:同一时间仅单节点挂载读写 + storageClassName: local-path # 存储类 + resources: # 资源请求 + requests: # 配额请求 + storage: 1Gi # 申请容量 +--- +apiVersion: v1 # ConfigMap API 版本 +kind: ConfigMap # 配置资源 +metadata: # ConfigMap 元信息 + name: nodejs-demo-config # ConfigMap 名称 + namespace: default # 命名空间 +data: # 配置键值 + APP_MSG: "Hello from ConfigMap" # 示例消息内容 +--- +apiVersion: apps/v1 # Deployment API 版本 +kind: Deployment # 工作负载:Deployment +metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 +spec: # Deployment 规格 + replicas: 3 # 副本数 + strategy: # 更新策略 + type: RollingUpdate # 滚动更新 + rollingUpdate: # 滚动更新参数 + maxSurge: 1 # 更新时最多额外增加 1 个 Pod + maxUnavailable: 0 # 更新时不可用 Pod 数为 0 + selector: # Pod 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + nodeSelector: # 节点选择 + kubernetes.io/hostname: ylc62 # 固定到指定节点(按实际修改) + securityContext: # Pod 级安全上下文 + fsGroup: 1000 # 挂载卷文件组 ID + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略 + securityContext: # 容器级安全上下文 + allowPrivilegeEscalation: false # 禁止提权 + runAsNonRoot: true # 非 root 运行 + runAsUser: 1000 # 运行用户 UID + readOnlyRootFilesystem: true # 根文件系统只读 + env: # 环境变量 + - name: APP_MSG # 环境变量名 + valueFrom: # 从引用源取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 执行 node + - "-e" # 执行内联脚本 + - | # 多行 JS 脚本(内容保持原样) + const http=require('http'); + 
const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(8080); + ports: # 容器端口 + - containerPort: 8080 # 监听端口 + resources: # 资源请求与限制 + requests: # 最小资源请求 + cpu: "50m" # 请求 CPU + memory: "64Mi" # 请求内存 + limits: # 资源上限 + cpu: "500m" # CPU 限制 + memory: "256Mi" # 内存限制 + livenessProbe: # 存活探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 3 # 初始延迟 + periodSeconds: 10 # 探测周期 + readinessProbe: # 就绪探针 + httpGet: # HTTP 探测 + path: / # 探测路径 + port: 8080 # 探测端口 + initialDelaySeconds: 2 # 初始延迟 + periodSeconds: 5 # 探测周期 + volumeMounts: # 卷挂载 + - name: tmp # 临时卷名称 + mountPath: /tmp # 容器内临时目录 + - name: data # 数据卷名称 + mountPath: /data # 容器内数据目录 + volumes: # 卷定义 + - name: tmp # 临时卷 + emptyDir: {} # 空目录卷 + - name: data # 数据卷 + persistentVolumeClaim: # 卷来源为 PVC + claimName: nodejs-demo-data # 绑定 PVC 名称 +--- +apiVersion: v1 # Service API 版本 +kind: Service # Service 资源 +metadata: # Service 元信息 + name: nodejs-demo # Service 名称 + namespace: default # 命名空间 +spec: # Service 规格 + selector: # 选择后端 Pod + app: nodejs-demo # 选中 app=nodejs-demo + ports: # 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 8080 # 转发到容器端口 +--- +apiVersion: networking.k8s.io/v1 # Ingress API 版本 +kind: Ingress # Ingress 资源 +metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: default # 命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: websecure # 使用 websecure(HTTPS) 入口 +spec: # Ingress 规则 + tls: # TLS 配置 + - hosts: # 证书覆盖域名 + - app.example.local # 域名 + secretName: nodejs-demo-tls # 引用的 TLS Secret 名称 + rules: # 路由规则列表 + - host: app.example.local # 主机名匹配 + http: # HTTP 路由 + paths: # 路径列表 + - path: /api # 匹配 API 路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 +--- +apiVersion: autoscaling/v2 # HPA API 版本 +kind: HorizontalPodAutoscaler # 水平自动扩缩容资源 +metadata: # HPA 元信息 + name: nodejs-demo # HPA 名称 + namespace: 
default # 命名空间 +spec: # HPA 规格 + scaleTargetRef: # 伸缩目标引用 + apiVersion: apps/v1 # 目标 API 版本 + kind: Deployment # 目标资源类型 + name: nodejs-demo # 目标 Deployment 名称 + minReplicas: 1 # 最小副本数 + maxReplicas: 5 # 最大副本数 + metrics: # 伸缩指标 + - type: Resource # 资源指标类型 + resource: # 资源指标配置 + name: cpu # 指标资源:CPU + target: # 目标值 + type: Utilization # 目标类型:利用率 + averageUtilization: 50 # 目标平均 CPU 利用率(%) + diff --git a/ansible/files/04-13/README.md b/ansible/files/04-13/README.md new file mode 100644 index 0000000..1b247e2 --- /dev/null +++ b/ansible/files/04-13/README.md @@ -0,0 +1,6 @@ +# 04-13(nodejs 分课) + +本目录 `04-13-nodejs-demo.yaml` 由 `ansible/files/04-01/` 同名文件复制,与总集目录内文件**同构**,专供本课独立阅读;改本课清单时以本目录为准即可最小化影响其他篇。 + +- **手动**:`kubectl apply -f ansible/files/04-13/04-13-nodejs-demo.yaml`(按需改字段)。 +- **自动**:`./ansible/bin/verify.sh run 04-13`。 diff --git a/ansible/files/04-14/README.md b/ansible/files/04-14/README.md new file mode 100644 index 0000000..eaa0741 --- /dev/null +++ b/ansible/files/04-14/README.md @@ -0,0 +1,5 @@ +# 04-14 truth source placeholder + +`04-14` 当前以流程说明(GitOps/CI 流水线)为主,暂无独立可直接 `kubectl apply` 的单一清单。 + +本目录用于满足 `doc_id -> ansible/files//` 一致性约束。 diff --git a/ansible/files/05-03/values-gitlab.example.yaml b/ansible/files/05-03/values-gitlab.example.yaml new file mode 100644 index 0000000..2c3cb22 --- /dev/null +++ b/ansible/files/05-03/values-gitlab.example.yaml @@ -0,0 +1,69 @@ +# GitLab Helm Chart 示例 values(实验室用) +# +# 使用方式: +# cp ansible/files/05-03/values-gitlab.example.yaml values-gitlab.yaml +# 按需修改域名、资源、Ingress;完整键名以当前 Chart 为准: +# helm show values gitlab/gitlab +# +# Chart 文档:https://docs.gitlab.com/charts/ + +global: + # 实验室:固定调度到指定节点,PVC(local-path)与 Pod 同节点 → 数据落在该节点本地盘 + # 本仓库 inventory 中 192.168.2.63 对应节点名 ylc63;若你的集群节点名不同,请 kubectl get nodes 后改写 + nodeSelector: + kubernetes.io/hostname: ylc63 + hosts: + # 根域占位;请改为你的域名(内网实验可配合 hosts / split-horizon DNS) + domain: example.com + # GitLab Web 主机名片段:最终为 .,例如 git.example.com + gitlab: + name: git + 
registry: + name: registry + ingress: + # 集群已用 Traefik 或在外层终止 TLS 时,通常不启用 Chart 内置 cert-manager + configureCertmanager: false + tls: + enabled: true + # 若使用已有 Secret(例如 Traefik / 手工证书),取消注释并填写: + # secretName: gitlab-wildcard-tls + +# 实验室缩小副本(可选;生产请按官方 sizing 与监控数据调整) +gitlab: + webservice: + minReplicas: 1 + sidekiq: + minReplicas: 1 + # Gitaly 仓库数据盘:与 global.nodeSelector 同节点时,local-path 卷落在该节点本地 + gitaly: + persistence: + storageClass: local-path + size: 50Gi + +# Bitnami PostgreSQL / Redis、MinIO 子 chart **不**继承 global.nodeSelector 时需单独写(见 GitLab Chart 文档 Node Selector 节) +postgresql: + primary: + nodeSelector: + kubernetes.io/hostname: ylc63 + persistence: + storageClass: local-path + size: 8Gi + +redis: + master: + nodeSelector: + kubernetes.io/hostname: ylc63 + persistence: + storageClass: local-path + size: 5Gi + +minio: + nodeSelector: + kubernetes.io/hostname: ylc63 + persistence: + storageClass: local-path + size: 10Gi + +# 资源紧张时可考虑关闭捆绑 Prometheus(按需取消注释) +# prometheus: +# install: false diff --git a/ansible/files/05-05/README.md b/ansible/files/05-05/README.md new file mode 100644 index 0000000..bfdb6a7 --- /dev/null +++ b/ansible/files/05-05/README.md @@ -0,0 +1,9 @@ +# 05-05(Prometheus + Grafana) + +| 文件 | 说明 | +|------|------| +| `kube-prometheus-stack-values.example.yaml` | Helm `kube-prometheus-stack` 的实验室向 values 示例(**非** `kubectl apply` 对象) | + +- **手动**:按 [docs/05-05-prometheus与grafana.md](../../../docs/05-05-prometheus与grafana.md) 执行 `helm repo add` / `helm upgrade --install`,可附加 `-f ansible/files/05-05/kube-prometheus-stack-values.example.yaml`。 +- **自动**:`./ansible/bin/verify.sh run 05-05`(noop + 集群基线;与本目录示例共用真源,便于对照文档改 values)。 +- 文件名含 `example.`,`verify_common` 不会对其实施 `kubectl apply --dry-run`。 diff --git a/ansible/files/05-05/kube-prometheus-stack-values.example.yaml b/ansible/files/05-05/kube-prometheus-stack-values.example.yaml new file mode 100644 index 0000000..488bd41 --- /dev/null +++ 
b/ansible/files/05-05/kube-prometheus-stack-values.example.yaml @@ -0,0 +1,22 @@ +# Helm values for prometheus-community/kube-prometheus-stack +# 用法(仓库根): +# helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +# helm repo update +# kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f - +# helm upgrade --install monitoring prometheus-community/kube-prometheus-stack -n monitoring \ +# -f ansible/files/05-05/kube-prometheus-stack-values.example.yaml +# +# 注意:本文件是 Helm values,不要用 kubectl apply。 + +prometheus: + prometheusSpec: + retention: 7d + resources: + requests: + cpu: 100m + memory: 256Mi + +grafana: + enabled: true + persistence: + enabled: false diff --git a/ansible/files/06-01/README.md b/ansible/files/06-01/README.md new file mode 100644 index 0000000..5678ffb --- /dev/null +++ b/ansible/files/06-01/README.md @@ -0,0 +1,10 @@ +# 06-01(NetworkPolicy 与连通性排障) + +| 文件 | 说明 | +|------|------| +| `networkpolicy-traefik-egress.example.yaml` | Traefik 出站示例:后端命名空间、Service CIDR、DNS | +| `networkpolicy-backend-ingress.example.yaml` | 后端仅允许来自 `kube-system`(Traefik)的入站示例 | + +- **手动**:复制为正式名后 `kubectl apply -f ...`,并按集群实际 **namespace / 标签 / CIDR** 修改(见 [docs/06-01-k3s-networkpolicy-故障排查.md](../../../docs/06-01-k3s-networkpolicy-故障排查.md))。 +- **自动**:`./ansible/bin/verify.sh run 06-01`(noop + 基线;策略真源以本目录为准,与手工 `kubectl` 一致)。 +- 示例文件名含 `example.`,默认验证流程跳过对其的 `kubectl dry-run`。 diff --git a/ansible/files/06-01/networkpolicy-backend-ingress.example.yaml b/ansible/files/06-01/networkpolicy-backend-ingress.example.yaml new file mode 100644 index 0000000..42b2f8b --- /dev/null +++ b/ansible/files/06-01/networkpolicy-backend-ingress.example.yaml @@ -0,0 +1,21 @@ +# 示例:后端 Namespace 仅允许来自 kube-system(Traefik)的入站 +# 将 namespace、podSelector、端口改为你的应用标签与 Service 端口。 +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: backend-from-traefik-example + namespace: default +spec: + podSelector: + matchLabels: + 
app: nginx + policyTypes: + - Ingress + ingress: + - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + ports: + - protocol: TCP + port: 80 diff --git a/ansible/files/06-01/networkpolicy-traefik-egress.example.yaml b/ansible/files/06-01/networkpolicy-traefik-egress.example.yaml new file mode 100644 index 0000000..3eca781 --- /dev/null +++ b/ansible/files/06-01/networkpolicy-traefik-egress.example.yaml @@ -0,0 +1,32 @@ +# 示例:为 Traefik 放行出站(按实际 namespace 与标签调整) +# 适用场景:后端在其它命名空间、需访问集群 DNS 与 Service VIP。 +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: traefik-egress-lab-example + namespace: kube-system +spec: + podSelector: + matchLabels: + app.kubernetes.io/name: traefik + policyTypes: + - Egress + egress: + # 访问任意命名空间内 Pod(可按需收窄为 namespaceSelector + podSelector) + - to: + - namespaceSelector: {} + ports: + - protocol: TCP + port: 8080 + - protocol: TCP + port: 8000 + # Service CIDR(k3s 默认常为 10.43.0.0/16,请与集群一致) + - to: + - ipBlock: + cidr: 10.43.0.0/16 + # 集群 DNS + - ports: + - protocol: UDP + port: 53 + - protocol: TCP + port: 53 diff --git a/ansible/files/06-02/README.md b/ansible/files/06-02/README.md new file mode 100644 index 0000000..3a4fa2e --- /dev/null +++ b/ansible/files/06-02/README.md @@ -0,0 +1,5 @@ +# 06-02(运维小结) + +- **文档**:[docs/06-02-运维小结.md](../../../docs/06-02-运维小结.md)(检查项、命令速查、变更记录建议)。 +- **自动**:`./ansible/bin/verify.sh run 06-02`(noop + 集群基线)。 +- 本篇无独立 Kubernetes 清单;运维动作以 `kubectl`/脚本为主。 diff --git a/ansible/files/07-01/README.md b/ansible/files/07-01/README.md new file mode 100644 index 0000000..86c223f --- /dev/null +++ b/ansible/files/07-01/README.md @@ -0,0 +1,5 @@ +# 07-01(Calico 双栈实验) + +- **文档**:[docs/07-01-k3s-calico-dualstack.md](../../../docs/07-01-k3s-calico-dualstack.md)。 +- **自动**:`./ansible/bin/verify.sh run 07-01`(实验环境与主线 Flannel 可能冲突,仅在隔离环境操作)。 +- 清单与安装步骤以文档为准;本目录占位满足契约,避免与生产集群误用。 diff --git a/ansible/files/07-02/README.md b/ansible/files/07-02/README.md new file 
mode 100644 index 0000000..adee5a4 --- /dev/null +++ b/ansible/files/07-02/README.md @@ -0,0 +1,5 @@ +# 07-02(Cilium 双栈 / eBPF 实验) + +- **文档**:[docs/07-02-k3s-cilium-dualstack-ebpf.md](../../../docs/07-02-k3s-cilium-dualstack-ebpf.md)。 +- **自动**:`./ansible/bin/verify.sh run 07-02`。 +- 实验性 CNI 切换风险高;YAML 真源随文档演进时再纳入本目录。 diff --git a/ansible/group_vars/all.yml b/ansible/group_vars/all.yml index f15eb1f..e960d06 100644 --- a/ansible/group_vars/all.yml +++ b/ansible/group_vars/all.yml @@ -1,4 +1,8 @@ --- +# 变量边界约定: +# - inventory.ini:主机拓扑/SSH +# - group_vars/all.yml:长期基线参数 +# - ansible/env/.env.verify:运行时上下文与外部依赖(ACME/NFS/Cloudflare 等) # 使用 root SSH 连接(setup-k3s-workers-ssh.sh 已将同一公钥写入各节点 root) ansible_user: root @@ -8,11 +12,17 @@ timezone: "Asia/Shanghai" k3s_version: "" # 为空表示用 get.k3s.io 默认最新 k3s_data_dir: "/storage" k3s_server_ip: "192.168.2.61" +# 安装脚本网络:GitHub 慢/不可达时设 cn(等价于安装脚本环境变量 INSTALL_K3S_MIRROR);也可用 ansible-playbook -e k3s_install_mirror=cn +k3s_install_mirror: "" +# curl 连接/整体超时(秒),避免 get.k3s.io 卡住时 Ansible 长时间无反馈;-e k3s_install_curl_max_time=900 可调大 +k3s_install_curl_max_time: 600 +# 安装 shell 在控制面上的硬超时(秒),应略大于 k3s_install_curl_max_time +k3s_install_task_timeout: 720 # 安装 k3s 前校验:/storage 为挂载点且与 / 不同设备(实验室 10G+32G 建议 true;「目录式假 /storage」旧环境可 false) k3s_verify_storage_mount: true -# 可选:由 playbooks/verify/01-06.yml(-e k3s_do_prepare_storage=true)对第二块整盘分区、格式化并挂载到 k3s_data_dir(会清空该盘,见 01-06) +# 可选:由 playbooks/verify/01-05.yml(-e k3s_do_prepare_storage=true)对第二块整盘分区、格式化并挂载到 k3s_data_dir(会清空该盘,见 01-05) k3s_prepare_storage: false # k3s_data_disk_device: "/dev/vdb" # NVMe 整盘一般为 /dev/nvme0n1,首分区为 /dev/nvme0n1p1,playbook 会按设备名自动加 1 或 p1 diff --git a/scripts/lib-ansible-lab.sh b/ansible/lib/lib-ansible-lab.sh similarity index 96% rename from scripts/lib-ansible-lab.sh rename to ansible/lib/lib-ansible-lab.sh index 496f384..c2d13e8 100644 --- a/scripts/lib-ansible-lab.sh +++ b/ansible/lib/lib-ansible-lab.sh @@ -17,7 +17,7 @@ ansible_lab_check_inventory_keys() { if 
[[ ! -f "$exp" ]]; then echo "[ERR] SSH 私钥不存在:$exp(inventory 中为 $path)" >&2 echo " 将密钥放到该路径并 chmod 600,或改 ansible/inventory.ini 中的 ansible_ssh_private_key_file。" >&2 - echo " 生成/分发可参考:scripts/ssh/setup-k3s-workers-ssh.sh、docs/01-06-节点初始化-ansible-实践.md" >&2 + echo " 生成/分发可参考:scripts/ssh/setup-k3s-workers-ssh.sh、docs/01-05-节点初始化-ansible-实践.md" >&2 return 1 fi # OpenSSH 拒绝 group/other 可读的私钥(常见误为 0644),须 600 或 400 diff --git a/ansible/playbooks/verify/01-03.yml b/ansible/playbooks/verify/01-03.yml index b996325..fc62c17 100644 --- a/ansible/playbooks/verify/01-03.yml +++ b/ansible/playbooks/verify/01-03.yml @@ -1,5 +1,5 @@ # SKIP_ARMV7=1(默认):仅 noop(文档 + ansible/files)。 -# SKIP_ARMV7=0 且设置 ARMV7_SSH:经 SSH 在 armv7/arm32 主机上 dnf 安装 docker 并校验(Fedora/RHEL 系,见 docs/01-03)。 +# SKIP_ARMV7=0 且设置 ARMV7_SSH:经 SSH 调用 ansible/tools/armv7-docker-verify-install.sh(先 docker info,失败再 get.docker.com,见 docs/01-03)。 - name: 01-03 armv7 Docker(矩阵 + 可选远程安装) hosts: localhost gather_facts: false @@ -102,7 +102,7 @@ - name: Fail when SKIP_ARMV7=0 but ARMV7_SSH empty ansible.builtin.fail: - msg: "SKIP_ARMV7=0 但未设置 ARMV7_SSH(见 scripts/.env.verify.example)" + msg: "SKIP_ARMV7=0 但未设置 ARMV7_SSH(见 ansible/env/.env.verify.example)" when: skip_armv7 == '0' and armv7_ssh | length == 0 - name: Note skipping remote arm install @@ -110,19 +110,14 @@ msg: "SKIP_ARMV7={{ skip_armv7 }}:跳过 arm 远程安装。若需安装:SKIP_ARMV7=0 且 export ARMV7_SSH='ssh -o BatchMode=yes user@arm-host'" when: skip_armv7 != '0' or armv7_ssh | length == 0 - - name: Remote Docker install (dnf on arm) + - name: Remote Docker verify-or-install (get.docker.com) when: skip_armv7 == '0' and armv7_ssh | length > 0 - block: - - name: Check docker on armv7 host - ansible.builtin.shell: "{{ armv7_ssh }} docker version" - register: armv7_docker_check - changed_when: false - failed_when: false - - - name: Install Docker and enable service (dnf) - ansible.builtin.shell: "{{ armv7_ssh }} 'sudo dnf install -y docker && sudo systemctl enable --now docker'" 
- when: armv7_docker_check.rc != 0 - - - name: Verify docker version and ps - ansible.builtin.shell: "{{ armv7_ssh }} docker version && {{ armv7_ssh }} docker ps -a" - changed_when: false + ansible.builtin.shell: | + set -euo pipefail + exec bash "{{ repo_root }}/ansible/tools/armv7-docker-verify-install.sh" + environment: + ARMV7_SSH: "{{ armv7_ssh }}" + args: + executable: /bin/bash + register: armv7_docker_verify_install + changed_when: "'skip_install=0' in (armv7_docker_verify_install.stdout | default(''))" diff --git a/ansible/playbooks/verify/01-04.yml b/ansible/playbooks/verify/01-04.yml new file mode 100644 index 0000000..90c7efd --- /dev/null +++ b/ansible/playbooks/verify/01-04.yml @@ -0,0 +1,145 @@ +# SKIP_ARMV7=1(默认):仅 noop。 +# SKIP_ARMV7=0 且 ARMV7_NFS_SSH 或 ARMV7_SSH:经 SSH 在 arm 上 dnf 装 nfs-utils、写 /etc/exports、exportfs(见 docs/01-04)。 +# 导出路径/网段:ARMV7_NFS_EXPORT_PATH(默认 /sdcard)、ARMV7_NFS_CLIENT_SUBNET(默认 192.168.2.0/24) +- name: 01-04 armv7 NFS(矩阵 + 可选远程安装) + hosts: localhost + gather_facts: false + vars: + repo_root: "{{ playbook_dir }}/../../.." 
+ doc_id: "01-04" + doc_filename: "01-04-armv7-nfs服务安装.md" + skip_armv7: "{{ lookup('env', 'SKIP_ARMV7') | default('1', true) | trim }}" + armv7_ssh: "{{ lookup('env', 'ARMV7_SSH') | default('', true) | trim }}" + armv7_nfs_export_path: "{{ lookup('env', 'ARMV7_NFS_EXPORT_PATH') | default('/sdcard', true) | trim }}" + armv7_nfs_client_subnet: "{{ lookup('env', 'ARMV7_NFS_CLIENT_SUBNET') | default('192.168.2.0/24', true) | trim }}" + tasks: + - name: Resolve ARMV7_NFS_SSH from env + ansible.builtin.set_fact: + armv7_nfs_ssh: >- + {% set n = lookup('env', 'ARMV7_NFS_SSH') | default('', true) | trim %} + {% set b = lookup('env', 'ARMV7_SSH') | default('', true) | trim %} + {{ n if n | length > 0 else b }} + + - name: Baseline docs/files checks + block: + - name: Assert docs file exists + ansible.builtin.stat: + path: "{{ repo_root }}/docs/{{ doc_filename }}" + register: _doc_stat + + - name: Fail when docs file missing + ansible.builtin.assert: + that: + - _doc_stat.stat.exists + fail_msg: "docs file missing: docs/{{ doc_filename }}" + + - name: Find matching ansible/files doc_id directory + ansible.builtin.find: + paths: "{{ repo_root }}/ansible/files" + file_type: directory + patterns: "{{ doc_id }}" + use_regex: false + register: _files_dirs + + - name: Fail when ansible/files doc_id directory missing + ansible.builtin.assert: + that: + - _files_dirs.matched | int >= 1 + fail_msg: "ansible/files missing doc_id directory: ansible/files/{{ doc_id }}" + + - name: Show noop verification summary + ansible.builtin.debug: + msg: + - "doc_id={{ doc_id }}" + - "doc={{ doc_filename }}" + - "files_dirs={{ _files_dirs.files | map(attribute='path') | list }}" + + - name: Verify cluster reachable (kubectl get nodes) [runbook baseline] + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl get nodes + args: + executable: /bin/bash + delegate_to: "{{ groups['k3s_server'][0] }}" + become: true + run_once: true 
+ changed_when: false + + - name: Verify core namespace exists (kube-system) [runbook baseline] + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl get ns kube-system + args: + executable: /bin/bash + delegate_to: "{{ groups['k3s_server'][0] }}" + become: true + run_once: true + changed_when: false + + - name: Find YAML manifests under ansible/files doc_id dirs + ansible.builtin.find: + paths: "{{ _files_dirs.files | map(attribute='path') | list }}" + file_type: file + patterns: + - "*.yml" + - "*.yaml" + recurse: true + use_regex: false + register: _files_manifests + + - name: Show manifest count summary + ansible.builtin.debug: + msg: + - "doc_id={{ doc_id }}" + - "manifest_files={{ _files_manifests.matched | default(0) }}" + - "manifest_paths={{ (_files_manifests.files | map(attribute='path') | list)[:12] }}" + + - name: Server-side dry-run apply (kubectl apply --dry-run=server) [doc assertion] + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} \ + kubectl apply --dry-run=server -f "{{ item.path }}" + args: + executable: /bin/bash + loop: "{{ _files_manifests.files }}" + loop_control: + label: "{{ item.path }}" + delegate_to: "{{ groups['k3s_server'][0] }}" + become: true + run_once: true + changed_when: false + when: (_files_manifests.matched | default(0) | int) > 0 + + - name: Fail when SKIP_ARMV7=0 but no ARMV7_SSH / ARMV7_NFS_SSH + ansible.builtin.fail: + msg: "SKIP_ARMV7=0 但未设置 ARMV7_SSH(或 ARMV7_NFS_SSH 指向 NFS 所在 arm 主机)" + when: skip_armv7 == '0' and armv7_nfs_ssh | length == 0 + + - name: Note skipping remote NFS setup + ansible.builtin.debug: + msg: "SKIP_ARMV7={{ skip_armv7 }}:跳过 arm NFS 远程配置。" + when: skip_armv7 != '0' or armv7_nfs_ssh | length == 0 + + - name: Remote NFS install (dnf on arm) + when: skip_armv7 == '0' and armv7_nfs_ssh | length > 0 + block: + - name: Install nfs-utils and enable nfs-server + 
ansible.builtin.shell: "{{ armv7_nfs_ssh }} 'sudo dnf install -y nfs-utils && sudo systemctl enable --now nfs-server'" + + - name: Check if export path already in /etc/exports + ansible.builtin.shell: "{{ armv7_nfs_ssh }} sudo grep -qF {{ armv7_nfs_export_path | quote }} /etc/exports" + register: armv7_exports_grep + failed_when: false + changed_when: false + + - name: Append NFS export line + ansible.builtin.shell: "{{ armv7_nfs_ssh }} bash -c 'echo \"{{ armv7_nfs_export_path }} {{ armv7_nfs_client_subnet }}(rw,sync,no_subtree_check,no_root_squash)\" | sudo tee -a /etc/exports'" + when: armv7_exports_grep.rc != 0 + + - name: Apply exportfs + ansible.builtin.shell: "{{ armv7_nfs_ssh }} sudo exportfs -rav" + changed_when: true + + - name: Verify showmount + ansible.builtin.shell: "{{ armv7_nfs_ssh }} showmount -e localhost" + changed_when: false diff --git a/ansible/playbooks/verify/01-05.yml b/ansible/playbooks/verify/01-05.yml index 497e58e..c204ac3 100644 --- a/ansible/playbooks/verify/01-05.yml +++ b/ansible/playbooks/verify/01-05.yml @@ -1,145 +1,347 @@ -# SKIP_ARMV7=1(默认):仅 noop。 -# SKIP_ARMV7=0 且 ARMV7_NFS_SSH 或 ARMV7_SSH:经 SSH 在 arm 上 dnf 装 nfs-utils、写 /etc/exports、exportfs(见 docs/01-05)。 -# 导出路径/网段:ARMV7_NFS_EXPORT_PATH(默认 /sdcard)、ARMV7_NFS_CLIENT_SUBNET(默认 192.168.2.0/24) -- name: 01-05 armv7 NFS(矩阵 + 可选远程安装) - hosts: localhost - gather_facts: false +--- +# 单文件化说明: +# - 01-05.yml 默认仍做“最小 verify”(kube-system pods) +# - 如需“准备数据盘/安装 K3s”,必须显式开启开关: +# -e k3s_do_prepare_storage=true # 内联原 01-05-prepare-storage.yml +# -e k3s_do_install=true # 内联原 01-05-install.yml +# 或 source ansible/env/.env.verify 后由环境变量 K3S_DO_PREPARE_STORAGE / K3S_DO_INSTALL(true/false)开启 + +- name: Prepare data disk and mount to k3s_data_dir (opt-in) + hosts: k3s_nodes + become: true vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "01-05" - doc_filename: "01-05-armv7-nfs服务安装.md" - skip_armv7: "{{ lookup('env', 'SKIP_ARMV7') | default('1', true) | trim }}" - armv7_ssh: "{{ lookup('env', 'ARMV7_SSH') | default('', true) | trim }}" - armv7_nfs_export_path: "{{ lookup('env', 'ARMV7_NFS_EXPORT_PATH') | default('/sdcard', true) | trim }}" - armv7_nfs_client_subnet: "{{ lookup('env', 'ARMV7_NFS_CLIENT_SUBNET') | default('192.168.2.0/24', true) | trim }}" + _k3s_do_prepare_storage: "{{ k3s_do_prepare_storage | default((lookup('env', 'K3S_DO_PREPARE_STORAGE') | default('', true) | trim | lower in ['true', '1', 'yes']) | bool) }}" + k3s_do_prepare_storage_enabled: "{{ _k3s_do_prepare_storage | bool }}" + pre_tasks: + - name: Gate - skip prepare storage when k3s_do_prepare_storage=false + when: not k3s_do_prepare_storage_enabled + block: + - ansible.builtin.debug: + msg: "[SKIP] optional doc_id=01-05 action=prepare-storage var=k3s_do_prepare_storage" + - meta: end_play tasks: - - name: Resolve ARMV7_NFS_SSH from env - ansible.builtin.set_fact: - armv7_nfs_ssh: >- - {% set n = lookup('env', 'ARMV7_NFS_SSH') | default('', true) | trim %} - {% set b = lookup('env', 'ARMV7_SSH') | default('', true) | trim %} - {{ n if n | length > 0 else b }} - - - name: Baseline docs/files checks - block: - - name: Assert docs file exists - ansible.builtin.stat: - path: "{{ repo_root }}/docs/{{ doc_filename }}" - register: _doc_stat - - - name: Fail when docs file missing - ansible.builtin.assert: - that: - - _doc_stat.stat.exists - fail_msg: "docs file missing: docs/{{ doc_filename }}" - - - name: Find matching ansible/files doc_id directory - ansible.builtin.find: - paths: "{{ repo_root }}/ansible/files" - file_type: directory - patterns: "{{ doc_id }}" - use_regex: false - register: _files_dirs - - - name: Fail when ansible/files doc_id directory missing - ansible.builtin.assert: - that: - - _files_dirs.matched | int >= 1 - fail_msg: "ansible/files missing doc_id directory: ansible/files/{{ doc_id }}" - - - 
name: Show noop verification summary - ansible.builtin.debug: - msg: - - "doc_id={{ doc_id }}" - - "doc={{ doc_filename }}" - - "files_dirs={{ _files_dirs.files | map(attribute='path') | list }}" - - - name: Verify cluster reachable (kubectl get nodes) [runbook baseline] - ansible.builtin.shell: | - set -euo pipefail - KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl get nodes - args: - executable: /bin/bash - delegate_to: "{{ groups['k3s_server'][0] }}" - become: true - run_once: true - changed_when: false - - - name: Verify core namespace exists (kube-system) [runbook baseline] - ansible.builtin.shell: | - set -euo pipefail - KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl get ns kube-system - args: - executable: /bin/bash - delegate_to: "{{ groups['k3s_server'][0] }}" - become: true - run_once: true - changed_when: false - - - name: Find YAML manifests under ansible/files doc_id dirs - ansible.builtin.find: - paths: "{{ _files_dirs.files | map(attribute='path') | list }}" - file_type: file - patterns: - - "*.yml" - - "*.yaml" - recurse: true - use_regex: false - register: _files_manifests - - - name: Show manifest count summary - ansible.builtin.debug: - msg: - - "doc_id={{ doc_id }}" - - "manifest_files={{ _files_manifests.matched | default(0) }}" - - "manifest_paths={{ (_files_manifests.files | map(attribute='path') | list)[:12] }}" - - - name: Server-side dry-run apply (kubectl apply --dry-run=server) [doc assertion] - ansible.builtin.shell: | - set -euo pipefail - KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} \ - kubectl apply --dry-run=server -f "{{ item.path }}" - args: - executable: /bin/bash - loop: "{{ _files_manifests.files }}" - loop_control: - label: "{{ item.path }}" - delegate_to: "{{ groups['k3s_server'][0] }}" - become: true - run_once: true - changed_when: false - when: (_files_manifests.matched | default(0) | int) > 0 - - - name: Fail when SKIP_ARMV7=0 but no 
ARMV7_SSH / ARMV7_NFS_SSH - ansible.builtin.fail: - msg: "SKIP_ARMV7=0 但未设置 ARMV7_SSH(或 ARMV7_NFS_SSH 指向 NFS 所在 arm 主机)" - when: skip_armv7 == '0' and armv7_nfs_ssh | length == 0 - - - name: Note skipping remote NFS setup + - name: Skip notice when storage prep disabled ansible.builtin.debug: - msg: "SKIP_ARMV7={{ skip_armv7 }}:跳过 arm NFS 远程配置。" - when: skip_armv7 != '0' or armv7_nfs_ssh | length == 0 + msg: "k3s_prepare_storage is false — skipping (see group_vars/all.yml)" + when: not (k3s_prepare_storage | default(false) | bool) - - name: Remote NFS install (dnf on arm) - when: skip_armv7 == '0' and armv7_nfs_ssh | length > 0 + - name: Prepare block storage for k3s_data_dir + when: k3s_prepare_storage | default(false) | bool block: - - name: Install nfs-utils and enable nfs-server - ansible.builtin.shell: "{{ armv7_nfs_ssh }} 'sudo dnf install -y nfs-utils && sudo systemctl enable --now nfs-server'" - - - name: Check if export path already in /etc/exports - ansible.builtin.shell: "{{ armv7_nfs_ssh }} sudo grep -qF {{ armv7_nfs_export_path | quote }} /etc/exports" - register: armv7_exports_grep + # 先判挂载:已挂载则不再要求 k3s_data_disk_device(避免「目录已就绪仍 assert 磁盘」) + - name: Check whether k3s_data_dir is already a mountpoint + ansible.builtin.command: mountpoint -q {{ k3s_data_dir }} + register: mp_k3s + changed_when: false failed_when: false + + - name: Skip when k3s_data_dir already mounted + ansible.builtin.debug: + msg: "{{ k3s_data_dir }} already mounted — skipping partitioning on {{ inventory_hostname }}" + when: mp_k3s.rc == 0 + + - name: Require k3s_data_disk_device only when partition work is needed + ansible.builtin.assert: + that: + - k3s_data_disk_device is defined + - (k3s_data_disk_device | string | length) > 0 + fail_msg: "Set k3s_data_disk_device (e.g. 
/dev/vdb) in group_vars or host_vars" + when: mp_k3s.rc != 0 + + - name: Verify k3s_data_disk_device is a block device + ansible.builtin.command: test -b {{ k3s_data_disk_device }} changed_when: false + when: mp_k3s.rc != 0 - - name: Append NFS export line - ansible.builtin.shell: "{{ armv7_nfs_ssh }} bash -c 'echo \"{{ armv7_nfs_export_path }} {{ armv7_nfs_client_subnet }}(rw,sync,no_subtree_check,no_root_squash)\" | sudo tee -a /etc/exports'" - when: armv7_exports_grep.rc != 0 + - name: Install partitioning and filesystem tools + ansible.builtin.package: + name: + - parted + - e2fsprogs + state: present + when: mp_k3s.rc != 0 - - name: Apply exportfs - ansible.builtin.shell: "{{ armv7_nfs_ssh }} sudo exportfs -rav" + - name: Compute first partition path (nvme*n* -> p1, else 1) + ansible.builtin.set_fact: + k3s_data_partition: >- + {{ k3s_data_disk_device }}{{ 'p1' if (k3s_data_disk_device | regex_search('nvme[0-9]+n[0-9]+$')) else '1' }} + when: mp_k3s.rc != 0 + + - name: Create GPT and single ext4 partition + ansible.builtin.command: >- + parted -s {{ k3s_data_disk_device }} mklabel gpt mkpart primary ext4 0% 100% + args: + creates: "{{ k3s_data_partition }}" + when: mp_k3s.rc != 0 + + - name: Wait for partition node in /dev + ansible.builtin.wait_for: + path: "{{ k3s_data_partition }}" + state: present + timeout: 60 + when: mp_k3s.rc != 0 + + - name: Detect existing filesystem on partition + ansible.builtin.command: blkid -s TYPE -o value {{ k3s_data_partition }} + register: fs_type + changed_when: false + failed_when: false + when: mp_k3s.rc != 0 + + - name: Create ext4 on partition + ansible.builtin.command: mkfs.ext4 -F {{ k3s_data_partition }} + when: + - mp_k3s.rc != 0 + - (fs_type.stdout | default('') | trim | length) == 0 + + - name: Read UUID of partition + ansible.builtin.command: blkid -s UUID -o value {{ k3s_data_partition }} + register: blk_uuid + changed_when: false + when: mp_k3s.rc != 0 + + - name: Ensure mount directory exists + 
ansible.builtin.file: + path: "{{ k3s_data_dir }}" + state: directory + mode: "0755" + when: mp_k3s.rc != 0 + + - name: Add fstab entry for k3s_data_dir + ansible.builtin.lineinfile: + path: /etc/fstab + regexp: "^UUID={{ blk_uuid.stdout | trim }}\\s" + line: "UUID={{ blk_uuid.stdout | trim }} {{ k3s_data_dir }} ext4 defaults,nofail 0 2" + create: true + mode: "0644" + when: mp_k3s.rc != 0 + + - name: Mount all from fstab + ansible.builtin.command: mount -a changed_when: true + when: mp_k3s.rc != 0 - - name: Verify showmount - ansible.builtin.shell: "{{ armv7_nfs_ssh }} showmount -e localhost" +- name: Install K3s (opt-in) + hosts: k3s_server + become: true + run_once: true + vars: + _k3s_do_install: "{{ k3s_do_install | default((lookup('env', 'K3S_DO_INSTALL') | default('', true) | trim | lower in ['true', '1', 'yes']) | bool) }}" + k3s_do_install_enabled: "{{ _k3s_do_install | bool }}" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + k3s_verify_storage_mount_enabled: "{{ k3s_verify_storage_mount | default(true) | bool }}" + pre_tasks: + - name: Gate - skip install when k3s_do_install=false + when: not k3s_do_install_enabled + block: + - ansible.builtin.debug: + msg: "[SKIP] optional doc_id=01-05 action=install var=k3s_do_install" + - meta: end_play + tasks: + - name: Require k3s_server_ip + ansible.builtin.assert: + that: + - k3s_server_ip is defined + - (k3s_server_ip | string | length) > 0 + fail_msg: "k3s_server_ip 未配置,请在 ansible/group_vars/all.yml 设置" + + - name: Verify /storage mountpoint when enabled + when: k3s_verify_storage_mount_enabled + block: + - name: Ensure k3s_data_dir is mountpoint + ansible.builtin.command: mountpoint -q {{ k3s_data_dir }} changed_when: false + + - name: Read root and k3s_data_dir mount sources + ansible.builtin.shell: | + set -euo pipefail + root_src=$(findmnt -n -o SOURCE /) + data_src=$(findmnt -n -o SOURCE {{ k3s_data_dir }}) + echo "root=${root_src}" + echo "data=${data_src}" + test "${root_src}" != "${data_src}" + args: + 
executable: /bin/bash + changed_when: false + + - name: Install required packages for k3s install + ansible.builtin.package: + name: + - curl + - tar + - iproute + state: present + + - name: Check k3s binary + ansible.builtin.stat: + path: /usr/local/bin/k3s + register: _k3s_bin + + - name: Note k3s server install network expectations + when: not _k3s_bin.stat.exists + ansible.builtin.debug: + msg: "正在下载安装 k3s server(get.k3s.io,最长约 {{ k3s_install_curl_max_time | default(600) }}s);久无输出多为网络问题,可在 group_vars 设 k3s_install_mirror: cn 或调大 k3s_install_curl_max_time" + + - name: Install k3s server when binary absent + when: not _k3s_bin.stat.exists + ansible.builtin.shell: | + set -euo pipefail + curl --connect-timeout 30 --max-time {{ k3s_install_curl_max_time | default(600) | int }} -sfL https://get.k3s.io | \ + {{ ('INSTALL_K3S_MIRROR=' ~ (k3s_install_mirror | default('') | trim) ~ ' ') if (k3s_install_mirror | default('') | trim | length > 0) else '' }}{{ ('INSTALL_K3S_VERSION=' ~ k3s_version ~ ' ') if (k3s_version | default('') | trim | length > 0) else '' }}INSTALL_K3S_EXEC="server --data-dir {{ k3s_data_dir }} --write-kubeconfig-mode 644" sh - + args: + executable: /bin/bash + timeout: "{{ k3s_install_task_timeout | default(720) | int }}" + + - name: Ensure k3s service enabled and started + ansible.builtin.service: + name: k3s + enabled: true + state: started + + - name: Wait k3s kubeconfig ready + ansible.builtin.wait_for: + path: "{{ k3s_kubeconfig }}" + state: present + timeout: 300 + + - name: Wait server node Ready + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig }} kubectl get node "{{ inventory_hostname }}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' + args: + executable: /bin/bash + register: _server_ready + changed_when: false + until: _server_ready.stdout | trim == "True" + retries: 60 + delay: 5 + + - name: Read k3s server token + ansible.builtin.slurp: + path: "{{ k3s_data_dir }}/server/token" + register: 
_server_token_raw + + - name: Save k3s token for workers + ansible.builtin.set_fact: + k3s_server_token: "{{ _server_token_raw.content | b64decode | trim }}" + +- name: Install K3s workers (opt-in) + hosts: k3s_worker + become: true + serial: 1 + vars: + _k3s_do_install: "{{ k3s_do_install | default((lookup('env', 'K3S_DO_INSTALL') | default('', true) | trim | lower in ['true', '1', 'yes']) | bool) }}" + k3s_do_install_enabled: "{{ _k3s_do_install | bool }}" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + k3s_verify_storage_mount_enabled: "{{ k3s_verify_storage_mount | default(true) | bool }}" + k3s_server_host: "{{ groups['k3s_server'][0] }}" + k3s_join_token: "{{ hostvars[k3s_server_host].k3s_server_token | default('') }}" + pre_tasks: + - name: Gate - skip worker install when k3s_do_install=false + when: not k3s_do_install_enabled + block: + - ansible.builtin.debug: + msg: "[SKIP] optional doc_id=01-05 action=worker-install var=k3s_do_install" + - meta: end_play + tasks: + - name: Require k3s join token + ansible.builtin.assert: + that: + - (k3s_join_token | trim | length) > 0 + fail_msg: "k3s join token 为空,请先确保 server 安装成功" + + - name: Verify /storage mountpoint on worker when enabled + when: k3s_verify_storage_mount_enabled + block: + - name: Ensure k3s_data_dir is mountpoint + ansible.builtin.command: mountpoint -q {{ k3s_data_dir }} + changed_when: false + + - name: Read root and k3s_data_dir mount sources + ansible.builtin.shell: | + set -euo pipefail + root_src=$(findmnt -n -o SOURCE /) + data_src=$(findmnt -n -o SOURCE {{ k3s_data_dir }}) + test "${root_src}" != "${data_src}" + args: + executable: /bin/bash + changed_when: false + + - name: Install required packages for worker install + ansible.builtin.package: + name: + - curl + - tar + - iproute + state: present + + - name: Check k3s-agent binary + ansible.builtin.stat: + path: /usr/local/bin/k3s-agent + register: _k3s_agent_bin + + - name: Note k3s agent install network expectations + when: not 
_k3s_agent_bin.stat.exists + ansible.builtin.debug: + msg: "正在本节点下载安装 k3s-agent(get.k3s.io → GitHub,最长约 {{ k3s_install_curl_max_time | default(600) }}s);卡住时请检查 worker 出网或设 k3s_install_mirror: cn" + + - name: Install k3s worker when binary absent + when: not _k3s_agent_bin.stat.exists + ansible.builtin.shell: | + set -euo pipefail + curl --connect-timeout 30 --max-time {{ k3s_install_curl_max_time | default(600) | int }} -sfL https://get.k3s.io | \ + {{ ('INSTALL_K3S_MIRROR=' ~ (k3s_install_mirror | default('') | trim) ~ ' ') if (k3s_install_mirror | default('') | trim | length > 0) else '' }}{{ ('INSTALL_K3S_VERSION=' ~ k3s_version ~ ' ') if (k3s_version | default('') | trim | length > 0) else '' }}K3S_URL="https://{{ k3s_server_ip }}:6443" K3S_TOKEN={{ k3s_join_token | quote }} INSTALL_K3S_EXEC="agent --data-dir {{ k3s_data_dir }}" sh - + args: + executable: /bin/bash + timeout: "{{ k3s_install_task_timeout | default(720) | int }}" + + - name: Ensure k3s-agent service enabled and started + ansible.builtin.service: + name: k3s-agent + enabled: true + state: started + +# 不在 worker 上 delegate_to server:部分环境下会从 worker 上下文连控制机 SSH 失败(如 192.168.2.61:22 timeout)。 +# 改为独立 play,仅由控制端 SSH → k3s_server 执行 kubectl,与「Install K3s server」连接路径一致。 +- name: Wait k3s workers Ready from server (post-install) + hosts: k3s_server + become: true + run_once: true + vars: + _k3s_do_install: "{{ k3s_do_install | default((lookup('env', 'K3S_DO_INSTALL') | default('', true) | trim | lower in ['true', '1', 'yes']) | bool) }}" + k3s_do_install_enabled: "{{ _k3s_do_install | bool }}" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + pre_tasks: + - name: Gate - skip when k3s_do_install=false + when: not k3s_do_install_enabled + block: + - ansible.builtin.debug: + msg: "[SKIP] optional doc_id=01-05 action=wait-workers-ready var=k3s_do_install" + - meta: end_play + tasks: + - name: Wait each worker node Ready + when: (groups['k3s_worker'] | default([])) | length > 0 + ansible.builtin.shell: | + set 
-euo pipefail + export KUBECONFIG={{ k3s_kubeconfig | quote }} + kubectl wait --for=condition=Ready "node/{{ item }}" --timeout=320s + args: + executable: /bin/bash + loop: "{{ groups['k3s_worker'] }}" + changed_when: false + +- name: "01-05 k3s baseline verify (kube-system pods)" + hosts: k3s_server + become: true + run_once: true + vars: + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + tasks: + - name: kubectl get nodes + ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl get nodes -o wide + changed_when: false + + - name: kube-system pods summary + ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl get pods -n kube-system -o wide + changed_when: false + diff --git a/ansible/playbooks/verify/01-06.yml b/ansible/playbooks/verify/01-06.yml index 1a07537..40bb764 100644 --- a/ansible/playbooks/verify/01-06.yml +++ b/ansible/playbooks/verify/01-06.yml @@ -1,146 +1,12 @@ ---- -# 单文件化说明: -# - 01-06.yml 默认仍做“最小 verify”(kube-system pods) -# - 如需“准备数据盘/安装 K3s”,必须显式开启开关: -# -e k3s_do_prepare_storage=true # 内联原 01-06-prepare-storage.yml -# -e k3s_do_install=true # 内联原 01-06-install.yml - -- name: Prepare data disk and mount to k3s_data_dir (opt-in) - hosts: k3s_nodes - become: true +- name: "01-06 noop verify" + hosts: localhost + gather_facts: false vars: - k3s_do_prepare_storage: "{{ k3s_do_prepare_storage | default(false) | bool }}" - pre_tasks: - - name: Gate - skip prepare storage when k3s_do_prepare_storage=false - when: not k3s_do_prepare_storage - block: - - ansible.builtin.debug: - msg: "[GATE] skipped doc_id=01-06 action=prepare-storage var=k3s_do_prepare_storage" - - meta: end_play + repo_root: "{{ playbook_dir }}/../../.." 
+ doc_id: "01-06" + doc_filename: "01-06-openwrt-haproxy.md" tasks: - - name: Skip notice when storage prep disabled - ansible.builtin.debug: - msg: "k3s_prepare_storage is false — skipping (see group_vars/all.yml)" - when: not (k3s_prepare_storage | default(false) | bool) - - - name: Prepare block storage for k3s_data_dir - when: k3s_prepare_storage | default(false) | bool - block: - - name: Require k3s_data_disk_device when k3s_prepare_storage is true - ansible.builtin.assert: - that: - - k3s_data_disk_device is defined - - (k3s_data_disk_device | string | length) > 0 - fail_msg: "Set k3s_data_disk_device (e.g. /dev/vdb) in group_vars or host_vars" - - - name: Verify k3s_data_disk_device is a block device - ansible.builtin.command: test -b {{ k3s_data_disk_device }} - changed_when: false - - - name: Check whether k3s_data_dir is already a mountpoint - ansible.builtin.command: mountpoint -q {{ k3s_data_dir }} - register: mp_k3s - changed_when: false - failed_when: false - - - name: Skip when k3s_data_dir already mounted - ansible.builtin.debug: - msg: "{{ k3s_data_dir }} already mounted — skipping partitioning on {{ inventory_hostname }}" - when: mp_k3s.rc == 0 - - - name: Install partitioning and filesystem tools - ansible.builtin.package: - name: - - parted - - e2fsprogs - state: present - when: mp_k3s.rc != 0 - - - name: Compute first partition path (nvme*n* -> p1, else 1) - ansible.builtin.set_fact: - k3s_data_partition: >- - {{ k3s_data_disk_device }}{{ 'p1' if (k3s_data_disk_device | regex_search('nvme[0-9]+n[0-9]+$')) else '1' }} - when: mp_k3s.rc != 0 - - - name: Create GPT and single ext4 partition - ansible.builtin.command: >- - parted -s {{ k3s_data_disk_device }} mklabel gpt mkpart primary ext4 0% 100% - args: - creates: "{{ k3s_data_partition }}" - when: mp_k3s.rc != 0 - - - name: Wait for partition node in /dev - ansible.builtin.wait_for: - path: "{{ k3s_data_partition }}" - state: present - timeout: 60 - when: mp_k3s.rc != 0 - - - name: Detect 
existing filesystem on partition - ansible.builtin.command: blkid -s TYPE -o value {{ k3s_data_partition }} - register: fs_type - changed_when: false - failed_when: false - when: mp_k3s.rc != 0 - - - name: Create ext4 on partition - ansible.builtin.command: mkfs.ext4 -F {{ k3s_data_partition }} - when: - - mp_k3s.rc != 0 - - (fs_type.stdout | default('') | trim | length) == 0 - - - name: Read UUID of partition - ansible.builtin.command: blkid -s UUID -o value {{ k3s_data_partition }} - register: blk_uuid - changed_when: false - when: mp_k3s.rc != 0 - - - name: Ensure mount directory exists - ansible.builtin.file: - path: "{{ k3s_data_dir }}" - state: directory - mode: "0755" - when: mp_k3s.rc != 0 - - - name: Add fstab entry for k3s_data_dir - ansible.builtin.lineinfile: - path: /etc/fstab - regexp: "^UUID={{ blk_uuid.stdout | trim }}\\s" - line: "UUID={{ blk_uuid.stdout | trim }} {{ k3s_data_dir }} ext4 defaults,nofail 0 2" - create: true - mode: "0644" - when: mp_k3s.rc != 0 - - - name: Mount all from fstab - ansible.builtin.command: mount -a - changed_when: true - when: mp_k3s.rc != 0 - -- name: Install K3s (opt-in) - hosts: k3s_nodes - become: true - vars: - k3s_do_install: "{{ k3s_do_install | default(false) | bool }}" - pre_tasks: - - name: Gate - skip install when k3s_do_install=false - when: not k3s_do_install - block: - - ansible.builtin.debug: - msg: "[GATE] skipped doc_id=01-06 action=install var=k3s_do_install" - - meta: end_play - tasks: - - name: Placeholder (install content inlined in following plays) - ansible.builtin.debug: - msg: "[RUN] doc_id=01-06 action=install-start" - -- name: "01-06 k3s baseline verify (kube-system pods)" - hosts: k3s_server - become: true - run_once: true - vars: - k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - tasks: - - name: kube-system pods summary - ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl get pods -n kube-system -o wide - changed_when: false - + - name: Include noop doc verify role tasks + 
ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/01-07.yml b/ansible/playbooks/verify/01-07.yml index a9c3bb6..3f57c9f 100644 --- a/ansible/playbooks/verify/01-07.yml +++ b/ansible/playbooks/verify/01-07.yml @@ -4,7 +4,23 @@ vars: repo_root: "{{ playbook_dir }}/../../.." doc_id: "01-07" - doc_filename: "01-07-openwrt-haproxy.md" + doc_filename: "01-07-双控制节点ha.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Assert docs file exists + ansible.builtin.stat: + path: "{{ repo_root }}/docs/{{ doc_filename }}" + register: _doc + + - name: Fail when docs file missing + ansible.builtin.assert: + that: + - _doc.stat.exists + fail_msg: "docs missing: docs/{{ doc_filename }}" + + - name: Summary + ansible.builtin.debug: + msg: + - "doc_id={{ doc_id }} (manual runbook / HA exercise)" + - "This verify case only asserts docs file exists." + - "HA join/switch must be exercised manually per docs/{{ doc_filename }}." + diff --git a/ansible/playbooks/verify/01-08.yml b/ansible/playbooks/verify/01-08.yml deleted file mode 100644 index 70bb069..0000000 --- a/ansible/playbooks/verify/01-08.yml +++ /dev/null @@ -1,26 +0,0 @@ -- name: "01-08 noop verify" - hosts: localhost - gather_facts: false - vars: - repo_root: "{{ playbook_dir }}/../../.." - doc_id: "01-08" - doc_filename: "01-08-双控制节点ha.md" - tasks: - - name: Assert docs file exists - ansible.builtin.stat: - path: "{{ repo_root }}/docs/{{ doc_filename }}" - register: _doc - - - name: Fail when docs file missing - ansible.builtin.assert: - that: - - _doc.stat.exists - fail_msg: "docs missing: docs/{{ doc_filename }}" - - - name: Summary - ansible.builtin.debug: - msg: - - "doc_id={{ doc_id }} (manual runbook / HA exercise)" - - "This verify case only asserts docs file exists." - - "HA join/switch must be exercised manually per docs/{{ doc_filename }}." 
- diff --git a/ansible/playbooks/verify/02-01.yml b/ansible/playbooks/verify/02-01.yml index fd6bd97..55986a6 100644 --- a/ansible/playbooks/verify/02-01.yml +++ b/ansible/playbooks/verify/02-01.yml @@ -4,7 +4,7 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - manifest_src: "{{ playbook_dir }}/../../files/02-05/01-control-ingress.yaml" + manifest_src: "{{ playbook_dir }}/../../files/02-01/01-control-ingress.yaml" manifest_dest: /tmp/nginx-m1.yaml tasks: - name: Copy manifest @@ -27,35 +27,24 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - verify_entry_base: "{{ nginx_entry_base | default('http://' ~ k3s_server_ip) }}" tasks: - name: Rollout status nginx-m1 - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nginx-m1 -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false + ansible.builtin.include_role: + name: verify_common + tasks_from: kubectl-rollout-status.yml + vars: + verify_rollout_ref: deployment/nginx-m1 - - name: HTTP check /demo-m1 (retry 503 for convergence) - ansible.builtin.shell: | - set -e - base="{{ verify_entry_base | trim | regex_replace('/+$','') }}" - url="$base/demo-m1/" - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - echo "try $i: $url -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - backend=$(curl -sS -D - -o /dev/null --connect-timeout 3 --max-time 8 "$url" 2>/dev/null | awk -F': ' '/^X-Backend:/{print $2; exit}' | tr -d '\r') - echo "X-Backend=$backend" - test "$backend" = "M1" - args: - executable: /bin/bash - changed_when: false + - name: HTTP check /demo-m1 (in-cluster via Service ClusterIP) + ansible.builtin.include_role: + name: verify_common + tasks_from: http-curl-traefik-incluster.yml + vars: + verify_traefik_kubeconfig: "{{ k3s_kubeconfig }}" + 
verify_incluster_http_url: "http://nginx-m1.default.svc.cluster.local/" + verify_traefik_assertion: nginx_matrix_m1_entry_http + verify_traefik_header_name: X-Backend + verify_traefik_header_value: M1 - name: Teardown 02-01 nginx control + Ingress (M1) hosts: k3s_server diff --git a/ansible/playbooks/verify/02-02.yml b/ansible/playbooks/verify/02-02.yml index c0dddb1..2a3a8fa 100644 --- a/ansible/playbooks/verify/02-02.yml +++ b/ansible/playbooks/verify/02-02.yml @@ -8,7 +8,7 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - manifest_src: "{{ playbook_dir }}/../../files/02-05/02-control-ingressroute.yaml" + manifest_src: "{{ playbook_dir }}/../../files/02-02/02-control-ingressroute.yaml" manifest_dest: /tmp/nginx-m2.yaml tasks: - name: Copy manifest @@ -31,35 +31,24 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - verify_entry_base: "{{ nginx_entry_base | default('http://' ~ k3s_server_ip) }}" tasks: - name: Rollout status nginx-m2 - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nginx-m2 -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false + ansible.builtin.include_role: + name: verify_common + tasks_from: kubectl-rollout-status.yml + vars: + verify_rollout_ref: deployment/nginx-m2 - - name: HTTP check /demo-m2 (retry 503 for convergence) - ansible.builtin.shell: | - set -e - base="{{ verify_entry_base | trim | regex_replace('/+$','') }}" - url="$base/demo-m2/" - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - echo "try $i: $url -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - backend=$(curl -sS -D - -o /dev/null --connect-timeout 3 --max-time 8 "$url" 2>/dev/null | awk -F': ' '/^X-Backend:/{print $2; exit}' | tr -d '\r') - echo "X-Backend=$backend" - test "$backend" = "M2" - args: - 
executable: /bin/bash - changed_when: false + - name: HTTP check /demo-m2 (in-cluster via Service ClusterIP) + ansible.builtin.include_role: + name: verify_common + tasks_from: http-curl-traefik-incluster.yml + vars: + verify_traefik_kubeconfig: "{{ k3s_kubeconfig }}" + verify_incluster_http_url: "http://nginx-m2.default.svc.cluster.local/" + verify_traefik_assertion: nginx_matrix_m2_entry_http + verify_traefik_header_name: X-Backend + verify_traefik_header_value: M2 - name: Teardown 02-02 nginx control + IngressRoute (M2) hosts: k3s_server diff --git a/ansible/playbooks/verify/02-03.yml b/ansible/playbooks/verify/02-03.yml index 4298833..5ba3f86 100644 --- a/ansible/playbooks/verify/02-03.yml +++ b/ansible/playbooks/verify/02-03.yml @@ -4,7 +4,7 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - manifest_src: "{{ playbook_dir }}/../../files/02-05/03-worker-ingress.yaml" + manifest_src: "{{ playbook_dir }}/../../files/02-03/03-worker-ingress.yaml" manifest_dest: /tmp/nginx-m3.yaml tasks: - name: Copy manifest @@ -27,35 +27,24 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - verify_entry_base: "{{ nginx_entry_base | default('http://' ~ k3s_server_ip) }}" tasks: - name: Rollout status nginx-m3 - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nginx-m3 -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false + ansible.builtin.include_role: + name: verify_common + tasks_from: kubectl-rollout-status.yml + vars: + verify_rollout_ref: deployment/nginx-m3 - - name: HTTP check /demo-m3 (retry 503 for convergence) - ansible.builtin.shell: | - set -e - base="{{ verify_entry_base | trim | regex_replace('/+$','') }}" - url="$base/demo-m3/" - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - echo "try $i: $url -> $code" - if [ "$code" = "200" ]; then ok=1; 
break; fi - sleep 2 - done - test "$ok" = "1" - backend=$(curl -sS -D - -o /dev/null --connect-timeout 3 --max-time 8 "$url" 2>/dev/null | awk -F': ' '/^X-Backend:/{print $2; exit}' | tr -d '\r') - echo "X-Backend=$backend" - test "$backend" = "M3" - args: - executable: /bin/bash - changed_when: false + - name: HTTP check /demo-m3 (in-cluster via Service ClusterIP) + ansible.builtin.include_role: + name: verify_common + tasks_from: http-curl-traefik-incluster.yml + vars: + verify_traefik_kubeconfig: "{{ k3s_kubeconfig }}" + verify_incluster_http_url: "http://nginx-m3.default.svc.cluster.local/" + verify_traefik_assertion: nginx_matrix_m3_entry_http + verify_traefik_header_name: X-Backend + verify_traefik_header_value: M3 - name: Teardown 02-03 nginx worker + Ingress (M3) hosts: k3s_server diff --git a/ansible/playbooks/verify/02-04.yml b/ansible/playbooks/verify/02-04.yml index 737e451..c171114 100644 --- a/ansible/playbooks/verify/02-04.yml +++ b/ansible/playbooks/verify/02-04.yml @@ -4,7 +4,7 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - manifest_src: "{{ playbook_dir }}/../../files/02-05/04-worker-ingressroute.yaml" + manifest_src: "{{ playbook_dir }}/../../files/02-04/04-worker-ingressroute.yaml" manifest_dest: /tmp/nginx-m4.yaml tasks: - name: Copy manifest @@ -27,35 +27,24 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - verify_entry_base: "{{ nginx_entry_base | default('http://' ~ k3s_server_ip) }}" tasks: - name: Rollout status nginx-m4 - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nginx-m4 -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false + ansible.builtin.include_role: + name: verify_common + tasks_from: kubectl-rollout-status.yml + vars: + verify_rollout_ref: deployment/nginx-m4 - - name: HTTP check /demo-m4 (retry 503 for convergence) - ansible.builtin.shell: | - set -e - base="{{ verify_entry_base | trim | 
regex_replace('/+$','') }}" - url="$base/demo-m4/" - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - echo "try $i: $url -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - backend=$(curl -sS -D - -o /dev/null --connect-timeout 3 --max-time 8 "$url" 2>/dev/null | awk -F': ' '/^X-Backend:/{print $2; exit}' | tr -d '\r') - echo "X-Backend=$backend" - test "$backend" = "M4" - args: - executable: /bin/bash - changed_when: false + - name: HTTP check /demo-m4 (in-cluster via Service ClusterIP) + ansible.builtin.include_role: + name: verify_common + tasks_from: http-curl-traefik-incluster.yml + vars: + verify_traefik_kubeconfig: "{{ k3s_kubeconfig }}" + verify_incluster_http_url: "http://nginx-m4.default.svc.cluster.local/" + verify_traefik_assertion: nginx_matrix_m4_entry_http + verify_traefik_header_name: X-Backend + verify_traefik_header_value: M4 - name: Teardown 02-04 nginx worker + IngressRoute (M4) hosts: k3s_server diff --git a/ansible/playbooks/verify/02-05.yml b/ansible/playbooks/verify/02-05.yml index ff9c14e..59a7c92 100644 --- a/ansible/playbooks/verify/02-05.yml +++ b/ansible/playbooks/verify/02-05.yml @@ -1,7 +1,7 @@ --- # 合并说明: # - 原 02-05.yml 仅 import 02-05-deploy.yml + 02-01..02-04 -# - 现已把 02-05-deploy.yml 内联到本文件,保持 scripts/verify.sh run 02-05 的语义不变 +# - 现已把 02-05-deploy.yml 内联到本文件,保持 ansible/bin/verify.sh run 02-05 的语义不变 - name: Deploy nginx matrix (M1~M4) hosts: k3s_server @@ -56,17 +56,22 @@ register: restart_out changed_when: true - - name: Wait for nginx pods to be ready + # rollout restart 后 Pod 名频繁更替,kubectl wait pod -l 可能竞态 NotFound;改用 rollout status + - name: Wait for nginx rollouts stable after ConfigMap restart ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ - -l app=nginx-m1 --timeout=60s - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait 
--for=condition=ready pod \ - -l app=nginx-m2 --timeout=60s - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ - -l app=nginx-m3 --timeout=120s - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ - -l app=nginx-m4 --timeout=120s - register: wait_result + set -euo pipefail + KCFG={{ k3s_kubeconfig | quote }} + export KUBECONFIG="$KCFG" + for dep in nginx-m1 nginx-m2; do + echo "[OC-ASSERT] assertion=nginx_matrix_rollout deployment=${dep} timeout=60s" + kubectl rollout status "deployment/$dep" -n default --timeout=60s + done + for dep in nginx-m3 nginx-m4; do + echo "[OC-ASSERT] assertion=nginx_matrix_rollout deployment=${dep} timeout=120s" + kubectl rollout status "deployment/$dep" -n default --timeout=120s + done + args: + executable: /bin/bash changed_when: false - name: Verify nginx matrix diff --git a/ansible/playbooks/verify/03-01.yml b/ansible/playbooks/verify/03-01.yml index 21442be..58d2c3c 100644 --- a/ansible/playbooks/verify/03-01.yml +++ b/ansible/playbooks/verify/03-01.yml @@ -30,12 +30,13 @@ k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml tasks: - name: Rollout status traefik (kube-system) - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/traefik -n kube-system --timeout=240s - args: - executable: /bin/bash - changed_when: false + ansible.builtin.include_role: + name: verify_common + tasks_from: kubectl-rollout-status.yml + vars: + verify_rollout_ref: deployment/traefik + verify_rollout_namespace: kube-system + verify_rollout_timeout_s: 240 - name: Assert HelmChartConfig exists ansible.builtin.shell: | diff --git a/ansible/playbooks/verify/03-02.yml b/ansible/playbooks/verify/03-02.yml index cc5b696..a2d51d0 100644 --- a/ansible/playbooks/verify/03-02.yml +++ b/ansible/playbooks/verify/03-02.yml @@ -8,13 +8,15 @@ manifest_src: "{{ playbook_dir }}/../../files/03-02/traefik-acme.yaml" manifest_dest: /tmp/traefik-acme.yaml acme_email: "{{ lookup('env', 
'ACME_EMAIL') | default('', true) }}" + cf_api_token: "{{ lookup('env', 'CF_API_TOKEN') | default('', true) }}" tasks: - name: "Gate - skip apply when ACME_EMAIL missing" when: acme_email | trim == "" - block: - - ansible.builtin.debug: - msg: "[GATE] skipped doc_id=03-02 reason=missing_env missing=ACME_EMAIL" - - meta: end_play + ansible.builtin.include_role: + name: verify_common + tasks_from: gate-debug-end-play.yml + vars: + verify_gate_message: "[GATE] skipped doc_id=03-02 reason=missing_env missing=ACME_EMAIL" - name: Copy manifest ansible.builtin.copy: @@ -28,6 +30,21 @@ regexp: "" replace: "{{ acme_email | trim }}" + - name: Enable ACME staging CA when ACME_CA_STAGING=1 + when: (lookup('env', 'ACME_CA_STAGING') | default('0', true) | trim) == "1" + ansible.builtin.replace: + path: "{{ manifest_dest }}" + regexp: '^\s*# - "--certificatesresolvers\.cloudflare\.acme\.caserver=https://acme-staging-v02\.api\.letsencrypt\.org/directory".*$' + replace: ' - "--certificatesresolvers.cloudflare.acme.caserver=https://acme-staging-v02.api.letsencrypt.org/directory"' + + - name: Ensure Cloudflare API token Secret before Traefik ACME apply + when: (cf_api_token | trim | length) > 0 + ansible.builtin.include_role: + name: verify_common + tasks_from: ensure-cloudflare-api-token-secret.yml + vars: + verify_cf_api_token: "{{ cf_api_token | trim }}" + - name: Apply manifest ansible.builtin.shell: | set -e @@ -42,7 +59,8 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - nginx_matrix_tls_enable: "{{ nginx_matrix_tls_enable | default(false) | bool }}" + _nginx_matrix_tls_enable: "{{ nginx_matrix_tls_enable | default((lookup('env', 'NGINX_MATRIX_TLS_ENABLE') | default('', true) | trim | lower in ['true', '1', 'yes']) | bool) }}" + nginx_matrix_tls_enabled: "{{ _nginx_matrix_tls_enable | bool }}" manifests_path: "{{ playbook_dir }}/../../files/03-02" tls_domains: - test01.jackadam.top @@ -51,24 +69,30 @@ - test04.jackadam.top pre_tasks: - name: Gate - skip nginx 
matrix TLS when nginx_matrix_tls_enable=false - when: not nginx_matrix_tls_enable + when: not nginx_matrix_tls_enabled block: - ansible.builtin.debug: - msg: "[GATE] skipped doc_id=03-02 action=nginx-matrix-tls var=nginx_matrix_tls_enable" + msg: "[SKIP] optional doc_id=03-02 action=nginx-matrix-tls var=nginx_matrix_tls_enable" - meta: end_play tasks: - name: Deploy nginx matrix TLS (mode=deploy) when: (mode | default('deploy')) == 'deploy' block: - - name: Ensure manifests path exists + - name: Ensure manifests path exists (controller repo path) ansible.builtin.stat: path: "{{ manifests_path }}" register: manifests_stat + delegate_to: localhost + become: false + run_once: true - name: Fail if manifests not found ansible.builtin.fail: msg: "manifests 未找到: {{ manifests_path }},请从仓库根目录或 ansible 同级执行" when: not manifests_stat.stat.exists + delegate_to: localhost + become: false + run_once: true - name: Ensure control-plane label on k3s_server nodes (for M1) ansible.builtin.shell: | @@ -105,12 +129,21 @@ ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout restart deployment nginx-m1 nginx-m2 nginx-m3 nginx-m4 -n default changed_when: true - - name: Wait for nginx pods to be ready + - name: Wait for nginx rollouts stable after ConfigMap restart ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod -l app=nginx-m1 --timeout=60s - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod -l app=nginx-m2 --timeout=60s - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod -l app=nginx-m3 --timeout=120s - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod -l app=nginx-m4 --timeout=120s + set -euo pipefail + KCFG={{ k3s_kubeconfig | quote }} + export KUBECONFIG="$KCFG" + for dep in nginx-m1 nginx-m2; do + echo "[OC-ASSERT] assertion=nginx_matrix_tls_rollout deployment=${dep} timeout=60s" + kubectl rollout status "deployment/$dep" -n default --timeout=60s + done + 
for dep in nginx-m3 nginx-m4; do + echo "[OC-ASSERT] assertion=nginx_matrix_tls_rollout deployment=${dep} timeout=120s" + kubectl rollout status "deployment/$dep" -n default --timeout=120s + done + args: + executable: /bin/bash changed_when: false - name: Verify nginx matrix TLS resources @@ -187,29 +220,85 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml acme_email: "{{ lookup('env', 'ACME_EMAIL') | default('', true) }}" + cf_api_token: "{{ lookup('env', 'CF_API_TOKEN') | default('', true) }}" tasks: - name: "Gate - skip verify when ACME_EMAIL missing" when: acme_email | trim == "" - block: - - ansible.builtin.debug: - msg: "[GATE] skipped doc_id=03-02 reason=missing_env missing=ACME_EMAIL" - - meta: end_play + ansible.builtin.include_role: + name: verify_common + tasks_from: gate-debug-end-play.yml + vars: + verify_gate_message: "[GATE] skipped doc_id=03-02 reason=missing_env missing=ACME_EMAIL" - - name: Assert Cloudflare token secret exists + - name: Ensure Cloudflare token Secret from CF_API_TOKEN (real-pass) + when: (cf_api_token | trim | length) > 0 + ansible.builtin.include_role: + name: verify_common + tasks_from: ensure-cloudflare-api-token-secret.yml + vars: + verify_cf_api_token: "{{ cf_api_token | trim }}" + + - name: Check cloudflare-api-token secret exists ansible.builtin.shell: | - set -e + set -euo pipefail KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system get secret cloudflare-api-token args: executable: /bin/bash changed_when: false + register: cloudflare_secret_check + failed_when: false - - name: Rollout status traefik (kube-system) + - name: Gate - no CF_API_TOKEN and secret missing + when: cloudflare_secret_check.rc != 0 and (cf_api_token | trim | length) == 0 + ansible.builtin.include_role: + name: verify_common + tasks_from: gate-debug-end-play.yml + vars: + verify_gate_message: "[GATE] skipped doc_id=03-02 reason=missing_dependency missing=cloudflare-api-token skip_scope=traefik-acme" + + - name: Fail when secret missing but 
CF_API_TOKEN was set + when: cloudflare_secret_check.rc != 0 and (cf_api_token | trim | length) > 0 + ansible.builtin.fail: + msg: "已设置 CF_API_TOKEN 但 cloudflare-api-token Secret 仍不可用,请检查 apiserver 权限与命名空间 kube-system" + + # Helm/ACME 换新 RS 时,旧 Pod 可能长期「pending termination」,rollout status 永久卡住。 + # 实验室验收:scale 0 → 清 Pod → scale 1(入口短暂不可用,可接受)。 + - name: Unstick Traefik deployment via scale down/up (kube-system) ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/traefik -n kube-system --timeout=300s + set -euo pipefail + export KUBECONFIG={{ k3s_kubeconfig }} + echo "[OC-ASSERT] assertion=traefik_rollout_unblock phase=scale_reset" + kubectl scale deployment traefik -n kube-system --replicas=0 + for i in $(seq 1 90); do + rep=$(kubectl get deployment traefik -n kube-system -o jsonpath='{.status.replicas}' 2>/dev/null || echo 1) + [ "${rep:-1}" = "0" ] && break + sleep 2 + done + for sel in "app.kubernetes.io/name=traefik" "app.kubernetes.io/instance=traefik"; do + kubectl get pods -n kube-system -l "$sel" -o name 2>/dev/null | while read -r p; do + [ -z "$p" ] && continue + kubectl delete "$p" -n kube-system --grace-period=0 --force --ignore-not-found=true || true + done + done + { kubectl get pods -n kube-system --no-headers -o custom-columns=:metadata.name 2>/dev/null | grep -E '^traefik-[0-9a-f]+-' || true; } | while read -r n; do + [ -z "$n" ] && continue + kubectl delete pod "$n" -n kube-system --grace-period=0 --force --ignore-not-found=true || true + done + kubectl scale deployment traefik -n kube-system --replicas=1 + sleep 3 args: executable: /bin/bash - changed_when: false + changed_when: true + failed_when: false + + - name: Rollout status traefik (kube-system) + ansible.builtin.include_role: + name: verify_common + tasks_from: kubectl-rollout-status.yml + vars: + verify_rollout_ref: deployment/traefik + verify_rollout_namespace: kube-system + verify_rollout_timeout_s: 600 - name: Teardown 03-02 Traefik 
ACME (optional) hosts: k3s_server diff --git a/ansible/playbooks/verify/03-03.yml b/ansible/playbooks/verify/03-03.yml index 6514a95..df9773a 100644 --- a/ansible/playbooks/verify/03-03.yml +++ b/ansible/playbooks/verify/03-03.yml @@ -1,10 +1,231 @@ -- name: "03-03 noop verify" - hosts: localhost - gather_facts: false +--- +# 03-03 Traefik Dashboard + ACME(HelmChartConfig 合并版) +# 与 03-02 共用同一 Traefik HelmChartConfig 资源名(traefik);勿在无协调下交替 full 验证二者——后 apply 者覆盖前者。 +# +- name: Deploy 03-03 Traefik Dashboard + ACME (HelmChartConfig) + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.." - doc_id: "03-03" - doc_filename: "03-03-k3s-traefik-dashboard-acme.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + manifest_src: "{{ playbook_dir }}/../../files/03-03/traefik-dashboard-acme.yaml" + manifest_dest: /tmp/traefik-dashboard-acme.yaml + acme_email: "{{ lookup('env', 'ACME_EMAIL') | default('', true) }}" + cf_api_token: "{{ lookup('env', 'CF_API_TOKEN') | default('', true) }}" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: "Gate - skip apply when ACME_EMAIL missing" + when: acme_email | trim == "" + ansible.builtin.include_role: + name: verify_common + tasks_from: gate-debug-end-play.yml + vars: + verify_gate_message: "[GATE] skipped doc_id=03-03 reason=missing_env missing=ACME_EMAIL" + + - name: Copy manifest + ansible.builtin.copy: + src: "{{ manifest_src }}" + dest: "{{ manifest_dest }}" + mode: "0644" + + - name: Replace ACME email placeholder + ansible.builtin.replace: + path: "{{ manifest_dest }}" + regexp: "" + replace: "{{ acme_email | trim }}" + + - name: Enable ACME staging CA when ACME_CA_STAGING=1 + when: (lookup('env', 'ACME_CA_STAGING') | default('0', true) | trim) == "1" + ansible.builtin.replace: + path: "{{ manifest_dest }}" + regexp: '^\s*# - 
"--certificatesresolvers\.cloudflare\.acme\.caserver=https://acme-staging-v02\.api\.letsencrypt\.org/directory".*$' + replace: ' - "--certificatesresolvers.cloudflare.acme.caserver=https://acme-staging-v02.api.letsencrypt.org/directory"' + + - name: Ensure Cloudflare API token Secret before Traefik ACME apply + when: (cf_api_token | trim | length) > 0 + ansible.builtin.include_role: + name: verify_common + tasks_from: ensure-cloudflare-api-token-secret.yml + vars: + verify_cf_api_token: "{{ cf_api_token | trim }}" + + - name: Apply manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ manifest_dest }} + args: + executable: /bin/bash + changed_when: true + +- name: Verify 03-03 Traefik Dashboard + ACME (rollout + dashboard http) + hosts: k3s_server + become: true + run_once: true + vars: + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + acme_email: "{{ lookup('env', 'ACME_EMAIL') | default('', true) }}" + cf_api_token: "{{ lookup('env', 'CF_API_TOKEN') | default('', true) }}" + _traefik_dash_url_env: "{{ lookup('env', 'TRAEFIK_DASHBOARD_VERIFY_URL') | default('', true) | trim }}" + traefik_dashboard_probe_url: "{{ _traefik_dash_url_env if (_traefik_dash_url_env | length > 0) else ('http://' ~ k3s_server_ip ~ '/dashboard/') }}" + tasks: + - name: "Gate - skip verify when ACME_EMAIL missing" + when: acme_email | trim == "" + ansible.builtin.include_role: + name: verify_common + tasks_from: gate-debug-end-play.yml + vars: + verify_gate_message: "[GATE] skipped doc_id=03-03 reason=missing_env missing=ACME_EMAIL" + + - name: Ensure Cloudflare token Secret from CF_API_TOKEN (real-pass) + when: (cf_api_token | trim | length) > 0 + ansible.builtin.include_role: + name: verify_common + tasks_from: ensure-cloudflare-api-token-secret.yml + vars: + verify_cf_api_token: "{{ cf_api_token | trim }}" + + - name: Check cloudflare-api-token secret exists + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig }} kubectl 
-n kube-system get secret cloudflare-api-token + args: + executable: /bin/bash + changed_when: false + register: cloudflare_secret_check + failed_when: false + + - name: Gate - no CF_API_TOKEN and secret missing + when: cloudflare_secret_check.rc != 0 and (cf_api_token | trim | length) == 0 + ansible.builtin.include_role: + name: verify_common + tasks_from: gate-debug-end-play.yml + vars: + verify_gate_message: "[GATE] skipped doc_id=03-03 reason=missing_dependency missing=cloudflare-api-token skip_scope=traefik-dashboard-acme" + + - name: Fail when secret missing but CF_API_TOKEN was set + when: cloudflare_secret_check.rc != 0 and (cf_api_token | trim | length) > 0 + ansible.builtin.fail: + msg: "已设置 CF_API_TOKEN 但 cloudflare-api-token Secret 仍不可用,请检查 apiserver 权限与命名空间 kube-system" + + # Same as the 03-02 Verify play: rollout can hang forever while the old ReplicaSet is pending termination; the scale reset briefly disrupts ingress. + - name: Unstick Traefik deployment via scale down/up (kube-system) + ansible.builtin.shell: | + set -euo pipefail + export KUBECONFIG={{ k3s_kubeconfig }} + echo "[OC-ASSERT] assertion=traefik_rollout_unblock phase=scale_reset doc_id=03-03" + kubectl scale deployment traefik -n kube-system --replicas=0; trap 'kubectl scale deployment traefik -n kube-system --replicas=1 || true' EXIT # restore replicas=1 even if a later step aborts under set -e + for i in $(seq 1 90); do + rep=$(kubectl get deployment traefik -n kube-system -o jsonpath='{.status.replicas}' 2>/dev/null || echo 1) + [ "${rep:-0}" = "0" ] && break # status.replicas is omitted once it reaches 0, so empty output means zero + sleep 2 + done + for sel in "app.kubernetes.io/name=traefik" "app.kubernetes.io/instance=traefik"; do + kubectl get pods -n kube-system -l "$sel" -o name 2>/dev/null | while read -r p; do + [ -z "$p" ] && continue + kubectl delete "$p" -n kube-system --grace-period=0 --force --ignore-not-found=true || true + done + done + { kubectl get pods -n kube-system --no-headers -o custom-columns=:metadata.name 2>/dev/null | grep -E '^traefik-[0-9a-f]+-' || true; } | while read -r n; do + [ -z "$n" ] && continue + kubectl delete pod "$n" -n kube-system --grace-period=0 --force --ignore-not-found=true || true + done + kubectl scale deployment
traefik -n kube-system --replicas=1 + sleep 3 + args: + executable: /bin/bash + changed_when: true + failed_when: false + + - name: Rollout status traefik (kube-system) + ansible.builtin.include_role: + name: verify_common + tasks_from: kubectl-rollout-status.yml + vars: + verify_rollout_ref: deployment/traefik + verify_rollout_namespace: kube-system + verify_rollout_timeout_s: 600 + + # deployment spec.replicas=0 时 kubectl rollout status 也会“成功”,需显式等到 Pod Ready + - name: Wait for traefik Pod Ready (kube-system) + ansible.builtin.shell: | + set -euo pipefail + export KUBECONFIG={{ k3s_kubeconfig }} + kubectl wait --for=condition=ready pod \ + -l app.kubernetes.io/name=traefik,app.kubernetes.io/instance=traefik-kube-system \ + -n kube-system --timeout=180s + args: + executable: /bin/bash + changed_when: false + + - name: HTTP probe Traefik Dashboard via TRAEFIK_DASHBOARD_VERIFY_URL (control 机) + when: _traefik_dash_url_env | length > 0 + ansible.builtin.uri: + url: "{{ traefik_dashboard_probe_url }}" + method: GET + follow_redirects: all + status_code: [200] + timeout: 15 + register: traefik_03_03_dashboard_http + changed_when: false + delegate_to: localhost + become: false + + - name: OC3 summary for Traefik Dashboard HTTP (external URL) + when: _traefik_dash_url_env | length > 0 + ansible.builtin.debug: + msg: "[OC-ASSERT] assertion=traefik_03_03_dashboard_http phase=http probe=uri status={{ traefik_03_03_dashboard_http.status | default('') }} url={{ traefik_dashboard_probe_url }}" + + - name: HTTP probe Traefik Dashboard (port-forward traefik Pod,试 web 容器端口 8000/8080) + when: _traefik_dash_url_env | length == 0 + ansible.builtin.shell: | + set -euo pipefail + export KUBECONFIG={{ k3s_kubeconfig }} + POD=$(kubectl get pods -n kube-system \ + -l 'app.kubernetes.io/name=traefik,app.kubernetes.io/instance=traefik-kube-system' \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) + test -n "$POD" + local_port=$(shuf -i 32000-32767 -n 1) + ok=0 + for cport in 
8000 8080 80; do + kubectl port-forward -n kube-system "pod/$POD" "${local_port}:${cport}" >/tmp/traefik-03-03-pf.log 2>&1 & + PF_PID=$! + trap 'kill $PF_PID 2>/dev/null || true' EXIT + for i in $(seq 1 20); do + grep -q "Forwarding from" /tmp/traefik-03-03-pf.log 2>/dev/null && break + sleep 1 + done + if curl -sfL --connect-timeout 3 --max-time 12 -o /dev/null "http://127.0.0.1:${local_port}/dashboard/" 2>/dev/null; then + ok=1 + echo "[OC-ASSERT] assertion=traefik_03_03_dashboard_http phase=http probe=port_forward status=200 pod_port=${cport} local_port=${local_port}" + break + fi + kill $PF_PID 2>/dev/null || true + trap - EXIT + wait $PF_PID 2>/dev/null || true + done + test "$ok" = "1" + args: + executable: /bin/bash + changed_when: false + +- name: Teardown 03-03 Traefik Dashboard + ACME (optional) + hosts: k3s_server + become: true + run_once: true + vars: + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + acme_email: "{{ lookup('env', 'ACME_EMAIL') | default('', true) }}" + manifest_dest: /tmp/traefik-dashboard-acme.yaml + tasks: + - name: Skip teardown when gated + when: acme_email | trim == "" + meta: end_play + + - name: Delete resources when VERIFY_TEARDOWN=1 + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/03-04.yml b/ansible/playbooks/verify/03-04.yml index 918efcc..d03c860 100644 --- a/ansible/playbooks/verify/03-04.yml +++ b/ansible/playbooks/verify/03-04.yml @@ -1,10 +1,166 @@ -- name: "03-04 noop verify" - hosts: localhost - gather_facts: false +--- +# 探针 URL:CF_TUNNEL_TEST_URL(完整 HTTPS)与 CF_TUNNEL_TEST_HOST(仅主机名 → https://HOST/)二选一 +- name: Deploy 03-04 Cloudflare Tunnel (cloudflared) + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir 
}}/../../.." - doc_id: "03-04" - doc_filename: "03-04-k3s-cloudflare-tunnel-配置接入.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + manifest_src: "{{ playbook_dir }}/../../files/03-04/cloudflared.yaml" + manifest_dest: /tmp/cloudflared-deploy.yaml + tunnel_token: "{{ lookup('env', 'TUNNEL_TOKEN') | default('', true) }}" + _cf_tunnel_url_raw: "{{ lookup('env', 'CF_TUNNEL_TEST_URL') | default('', true) | trim }}" + _cf_tunnel_host_raw: "{{ lookup('env', 'CF_TUNNEL_TEST_HOST') | default('', true) | trim }}" + cf_tunnel_probe_url: >- + {{ (_cf_tunnel_url_raw | length > 0) | ternary(_cf_tunnel_url_raw, + ((_cf_tunnel_host_raw | length > 0) | ternary('https://' ~ (_cf_tunnel_host_raw | regex_replace('^https?://', '') | regex_replace('/.*$', '') | regex_replace('/+$', '')) ~ '/', ''))) }} tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: "Gate - tunnel probe URL required (CF_TUNNEL_TEST_URL or CF_TUNNEL_TEST_HOST)" + when: cf_tunnel_probe_url | trim == "" + ansible.builtin.include_role: + name: verify_common + tasks_from: gate-debug-end-play.yml + vars: + verify_gate_message: "[GATE] skipped doc_id=03-04 reason=missing_env missing=CF_TUNNEL_TEST_URL_or_CF_TUNNEL_TEST_HOST skip_scope=03-04 tunnel http probe" + + - name: Check cloudflared-credentials secret exists + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system get secret cloudflared-credentials + args: + executable: /bin/bash + register: cloudflared_secret_check + changed_when: false + failed_when: false + + - name: "Gate - no TUNNEL_TOKEN and secret missing" + when: cloudflared_secret_check.rc != 0 and (tunnel_token | trim | length) == 0 + ansible.builtin.include_role: + name: verify_common + tasks_from: gate-debug-end-play.yml + vars: + verify_gate_message: "[GATE] skipped doc_id=03-04 reason=missing_dependency missing=cloudflared-credentials/TUNNEL_TOKEN skip_scope=03-04 cloudflared deploy" + + - 
name: Ensure cloudflared tunnel Secret from TUNNEL_TOKEN + when: (tunnel_token | trim | length) > 0 + ansible.builtin.include_role: + name: verify_common + tasks_from: ensure-cloudflared-tunnel-secret.yml + vars: + verify_tunnel_token: "{{ tunnel_token | trim }}" + + - name: Copy cloudflared Deployment manifest + ansible.builtin.copy: + src: "{{ manifest_src }}" + dest: "{{ manifest_dest }}" + mode: "0644" + + - name: Apply cloudflared Deployment + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ manifest_dest }} + args: + executable: /bin/bash + changed_when: true + +- name: Verify 03-04 Cloudflare Tunnel (rollout + HTTPS probe) + hosts: k3s_server + become: true + run_once: true + vars: + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + tunnel_token: "{{ lookup('env', 'TUNNEL_TOKEN') | default('', true) }}" + _cf_tunnel_url_raw: "{{ lookup('env', 'CF_TUNNEL_TEST_URL') | default('', true) | trim }}" + _cf_tunnel_host_raw: "{{ lookup('env', 'CF_TUNNEL_TEST_HOST') | default('', true) | trim }}" + cf_tunnel_probe_url: >- + {{ (_cf_tunnel_url_raw | length > 0) | ternary(_cf_tunnel_url_raw, + ((_cf_tunnel_host_raw | length > 0) | ternary('https://' ~ (_cf_tunnel_host_raw | regex_replace('^https?://', '') | regex_replace('/.*$', '') | regex_replace('/+$', '')) ~ '/', ''))) }} + tasks: + - name: "Gate - skip verify when tunnel probe URL missing" + when: cf_tunnel_probe_url | trim == "" + ansible.builtin.include_role: + name: verify_common + tasks_from: gate-debug-end-play.yml + vars: + verify_gate_message: "[GATE] skipped doc_id=03-04 reason=missing_env missing=CF_TUNNEL_TEST_URL_or_CF_TUNNEL_TEST_HOST skip_scope=03-04 tunnel http probe" + + - name: Ensure cloudflared tunnel Secret from TUNNEL_TOKEN (idempotent) + when: (tunnel_token | trim | length) > 0 + ansible.builtin.include_role: + name: verify_common + tasks_from: ensure-cloudflared-tunnel-secret.yml + vars: + verify_tunnel_token: "{{ tunnel_token | trim }}" + + - name: 
Check cloudflared-credentials secret exists + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system get secret cloudflared-credentials + args: + executable: /bin/bash + register: cloudflared_secret_check + changed_when: false + failed_when: false + + - name: "Gate - no TUNNEL_TOKEN and secret missing" + when: cloudflared_secret_check.rc != 0 and (tunnel_token | trim | length) == 0 + ansible.builtin.include_role: + name: verify_common + tasks_from: gate-debug-end-play.yml + vars: + verify_gate_message: "[GATE] skipped doc_id=03-04 reason=missing_dependency missing=cloudflared-credentials/TUNNEL_TOKEN skip_scope=03-04 cloudflared verify" + + - name: Fail when secret missing but TUNNEL_TOKEN was set + when: cloudflared_secret_check.rc != 0 and (tunnel_token | trim | length) > 0 + ansible.builtin.fail: + msg: "已设置 TUNNEL_TOKEN 但 cloudflared-credentials Secret 仍不可用,请检查 apiserver 与 kube-system 权限" + + - name: Rollout status cloudflared (kube-system) + ansible.builtin.include_role: + name: verify_common + tasks_from: kubectl-rollout-status.yml + vars: + verify_rollout_ref: deployment/cloudflared + verify_rollout_namespace: kube-system + verify_rollout_timeout_s: 240 + + - name: HTTPS probe via Tunnel (CF_TUNNEL_TEST_URL / CF_TUNNEL_TEST_HOST) + ansible.builtin.include_role: + name: verify_common + tasks_from: http-curl-expect.yml + vars: + verify_http_url: "{{ cf_tunnel_probe_url | trim }}" + verify_http_expected_code: 200 + verify_http_connect_timeout: 5 + verify_http_max_time: 15 + verify_http_retries: 12 + verify_http_retry_sleep: 3 + verify_http_assertion_label: cf_tunnel_03_04_https + verify_http_tls_insecure: "{{ (lookup('env', 'CF_TUNNEL_CURL_INSECURE') | default('0', true) | trim) == '1' }}" + +- name: Teardown 03-04 Cloudflare Tunnel (optional) + hosts: k3s_server + become: true + run_once: true + vars: + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + 
_cf_tunnel_url_raw: "{{ lookup('env', 'CF_TUNNEL_TEST_URL') | default('', true) | trim }}" + _cf_tunnel_host_raw: "{{ lookup('env', 'CF_TUNNEL_TEST_HOST') | default('', true) | trim }}" + cf_tunnel_probe_url: >- + {{ (_cf_tunnel_url_raw | length > 0) | ternary(_cf_tunnel_url_raw, + ((_cf_tunnel_host_raw | length > 0) | ternary('https://' ~ (_cf_tunnel_host_raw | regex_replace('^https?://', '') | regex_replace('/.*$', '') | regex_replace('/+$', '')) ~ '/', ''))) }} + tasks: + - name: Skip teardown when 03-04 verify path not engaged + when: cf_tunnel_probe_url | trim == "" + meta: end_play + + - name: Delete cloudflared Deployment and credentials when VERIFY_TEARDOWN=1 + when: verify_teardown == "1" + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete deployment cloudflared -n kube-system --ignore-not-found=true + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete secret cloudflared-credentials -n kube-system --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/03-05.yml b/ansible/playbooks/verify/03-05.yml index 905034d..1543a13 100644 --- a/ansible/playbooks/verify/03-05.yml +++ b/ansible/playbooks/verify/03-05.yml @@ -5,7 +5,8 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - local_path_apply_lab_config_enabled: "{{ local_path_apply_lab_config | default(false) | bool }}" + _local_path_apply_lab_cfg: "{{ local_path_apply_lab_config | default((lookup('env', 'LOCAL_PATH_APPLY_LAB_CONFIG') | default('', true) | trim | lower in ['true', '1', 'yes']) | bool) }}" + local_path_apply_lab_config_enabled: "{{ _local_path_apply_lab_cfg | bool }}" local_path_json_src: "{{ playbook_dir }}/../../files/03-05/local-path-config-lab.json" local_path_json_dest: /root/local-path-config-lab.json pre_tasks: @@ -13,7 +14,7 @@ when: not local_path_apply_lab_config_enabled block: - ansible.builtin.debug: - msg: "[GATE] skipped doc_id=03-05 action=apply-local-path-config 
var=local_path_apply_lab_config" + msg: "[SKIP] optional doc_id=03-05 action=apply-local-path-config var=local_path_apply_lab_config" - meta: end_play tasks: - name: Copy local-path lab json @@ -84,12 +85,12 @@ delay: 2 - name: Rollout status nginx-local-pvc-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nginx-local-pvc-demo -n default --timeout=240s - args: - executable: /bin/bash - changed_when: false + ansible.builtin.include_role: + name: verify_common + tasks_from: kubectl-rollout-status.yml + vars: + verify_rollout_ref: deployment/nginx-local-pvc-demo + verify_rollout_timeout_s: 240 - name: Teardown 03-05 local-path pvc demo (optional) hosts: k3s_server diff --git a/ansible/playbooks/verify/03-06.yml b/ansible/playbooks/verify/03-06.yml index 5248de3..ee9d33c 100644 --- a/ansible/playbooks/verify/03-06.yml +++ b/ansible/playbooks/verify/03-06.yml @@ -7,15 +7,18 @@ k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml manifest_src: "{{ playbook_dir }}/../../files/03-06/nfs-pv-pvc-demo.yaml" manifest_dest: /tmp/nfs-pv-pvc-demo.yaml + nfs_job_manifest_src: "{{ playbook_dir }}/../../files/03-06/nfs-pvc-verify-job.yaml" + nfs_job_manifest_dest: /tmp/nfs-pvc-verify-job.yaml nfs_server_ip: "{{ lookup('env', 'NFS_SERVER_IP') | default('', true) }}" nfs_export_path: "{{ lookup('env', 'NFS_EXPORT_PATH') | default('', true) }}" tasks: - name: "Gate - skip apply when NFS vars missing" when: (nfs_server_ip | trim == "") or (nfs_export_path | trim == "") - block: - - ansible.builtin.debug: - msg: "[GATE] skipped doc_id=03-06 reason=missing_env missing=NFS_SERVER_IP,NFS_EXPORT_PATH" - - meta: end_play + ansible.builtin.include_role: + name: verify_common + tasks_from: gate-debug-end-play.yml + vars: + verify_gate_message: "[GATE] skipped doc_id=03-06 reason=missing_env missing=NFS_SERVER_IP,NFS_EXPORT_PATH" - name: Copy manifest ansible.builtin.copy: @@ -35,7 +38,33 @@ regexp: "" replace: "{{ nfs_export_path | trim }}" - - 
name: Apply manifest + - name: Reset stale nfs demo resources before apply (handle immutable PVC fields) + ansible.builtin.shell: | + set -e + export KUBECONFIG={{ k3s_kubeconfig }} + kubectl -n default delete job nfs-pvc-verify-demo --ignore-not-found=true || true + for i in $(seq 1 60); do + n=$(kubectl -n default get pods -l job-name=nfs-pvc-verify-demo --no-headers 2>/dev/null | wc -l | tr -d ' ') + [ "${n:-99}" -eq 0 ] && break + sleep 1 + done || true + kubectl -n default delete pvc nfs-pvc-demo --ignore-not-found=true || true + kubectl delete pv nfs-pv-demo --ignore-not-found=true || true + for i in $(seq 1 40); do + pvc_gone=0 + pv_gone=0 + kubectl -n default get pvc nfs-pvc-demo >/dev/null 2>&1 || pvc_gone=1 + kubectl get pv nfs-pv-demo >/dev/null 2>&1 || pv_gone=1 + if [ "$pvc_gone" -eq 1 ] && [ "$pv_gone" -eq 1 ]; then + break + fi + sleep 1 + done + args: + executable: /bin/bash + changed_when: true + + - name: Apply PV/PVC manifest ansible.builtin.shell: | set -e KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ manifest_dest }} @@ -43,7 +72,33 @@ executable: /bin/bash changed_when: true -- name: Verify 03-06 nfs pvc demo (Bound) + - name: Wait pvc Bound before Job + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl get pvc nfs-pvc-demo -n default -o jsonpath='{.status.phase}' + args: + executable: /bin/bash + register: pvc_phase_deploy + changed_when: false + until: pvc_phase_deploy.stdout | trim == "Bound" + retries: 40 + delay: 3 + + - name: Copy nfs verify Job manifest + ansible.builtin.copy: + src: "{{ nfs_job_manifest_src }}" + dest: "{{ nfs_job_manifest_dest }}" + mode: "0644" + + - name: Apply nfs verify Job + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nfs_job_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + +- name: Verify 03-06 nfs pvc demo (Bound + Job RW) hosts: k3s_server become: true run_once: true @@ -54,10 +109,11 @@ tasks: - name: "Gate - 
skip verify when NFS vars missing" when: (nfs_server_ip | trim == "") or (nfs_export_path | trim == "") - block: - - ansible.builtin.debug: - msg: "[GATE] skipped doc_id=03-06 reason=missing_env missing=NFS_SERVER_IP,NFS_EXPORT_PATH" - - meta: end_play + ansible.builtin.include_role: + name: verify_common + tasks_from: gate-debug-end-play.yml + vars: + verify_gate_message: "[GATE] skipped doc_id=03-06 reason=missing_env missing=NFS_SERVER_IP,NFS_EXPORT_PATH" - name: Wait pvc Bound ansible.builtin.shell: | @@ -71,6 +127,25 @@ retries: 40 delay: 3 + - name: Wait nfs verify Job complete + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=complete job/nfs-pvc-verify-demo -n default --timeout=180s + args: + executable: /bin/bash + changed_when: false + + - name: OC3 evidence — nfs verify Job logs + ansible.builtin.shell: | + set -euo pipefail + export KUBECONFIG={{ k3s_kubeconfig }} + echo "[OC-ASSERT] assertion=nfs_pvc_rw phase=cluster probe=job_logs job=nfs-pvc-verify-demo" + kubectl -n default logs job/nfs-pvc-verify-demo --tail=30 + echo "[OC-ASSERT] assertion=nfs_pvc_rw phase=verify probe=job_complete result=ok job=nfs-pvc-verify-demo" + args: + executable: /bin/bash + changed_when: false + - name: Teardown 03-06 nfs pv+pvc demo (optional) hosts: k3s_server become: true @@ -86,7 +161,17 @@ when: (nfs_server_ip | trim == "") or (nfs_export_path | trim == "") meta: end_play - - name: Delete resources when VERIFY_TEARDOWN=1 + - name: Delete Job before PVC/PV (teardown order) + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + export KUBECONFIG={{ k3s_kubeconfig }} + kubectl delete job nfs-pvc-verify-demo -n default --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true + + - name: Delete PV/PVC manifest when VERIFY_TEARDOWN=1 when: verify_teardown == "1" ansible.builtin.shell: | set -e @@ -94,4 +179,3 @@ args: executable: /bin/bash changed_when: true - diff --git 
a/ansible/playbooks/verify/03-07.yml b/ansible/playbooks/verify/03-07.yml index 67256c7..78a5bf0 100644 --- a/ansible/playbooks/verify/03-07.yml +++ b/ansible/playbooks/verify/03-07.yml @@ -93,18 +93,45 @@ dest: "{{ longhorn_values_dest }}" mode: "0600" - - name: Ensure longhorn-system namespace is not stuck Terminating (force finalize if needed) + - name: Recover longhorn-system namespace from Terminating and recreate cleanly ansible.builtin.shell: | set -e export KUBECONFIG={{ k3s_kubeconfig }} ns="longhorn-system" phase="$(kubectl get ns "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)" if [ "$phase" = "Terminating" ]; then - echo "[WARN] namespace $ns is Terminating; force finalize to unblock install" + echo "[WARN] namespace $ns is Terminating; force finalize and wait deletion" kubectl get ns "$ns" -o json > /tmp/ns.json python3 -c "import json; obj=json.load(open('/tmp/ns.json')); obj.setdefault('spec',{}); obj['spec']['finalizers']=[]; json.dump(obj, open('/tmp/ns-finalize.json','w'))" - kubectl replace --raw \"/api/v1/namespaces/$ns/finalize\" -f /tmp/ns-finalize.json >/dev/null + kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f /tmp/ns-finalize.json >/dev/null || true + kubectl delete ns "$ns" --ignore-not-found=true --wait=false || true + for i in $(seq 1 60); do + if ! kubectl get ns "$ns" >/dev/null 2>&1; then + break + fi + sleep 2 + done fi + + # Make sure Helm can write its release secret: the namespace must be Active and accept new resources + if !
kubectl get ns "$ns" >/dev/null 2>&1; then + kubectl create ns "$ns" + fi + phase_now="$(kubectl get ns "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)" + deleting_now="$(kubectl get ns "$ns" -o jsonpath='{.metadata.deletionTimestamp}' 2>/dev/null || true)" + if [ "$phase_now" = "Terminating" ]; then + echo "[ERR] namespace $ns still Terminating after recovery; abort helm install" + kubectl get ns "$ns" -o yaml || true + exit 1 + fi + if [ -n "$deleting_now" ]; then + echo "[ERR] namespace $ns has deletionTimestamp=$deleting_now; abort helm install" + kubectl get ns "$ns" -o yaml || true + exit 1 + fi + # 探针:确认命名空间可写,避免 Helm 创建 release secret 时才失败 + kubectl -n "$ns" create configmap longhorn-write-probe --from-literal=ok=1 >/dev/null + kubectl -n "$ns" delete configmap longhorn-write-probe --ignore-not-found=true >/dev/null args: executable: /bin/bash changed_when: true @@ -199,7 +226,51 @@ - name: Helm upgrade/install Longhorn(失败兜底:install --replace) ansible.builtin.shell: | set -e - helm upgrade --install longhorn longhorn/longhorn --namespace longhorn-system --create-namespace -f {{ longhorn_values_dest }} --version {{ longhorn_chart_version }} --wait --timeout 15m || helm install --replace longhorn longhorn/longhorn --namespace longhorn-system --create-namespace -f {{ longhorn_values_dest }} --version {{ longhorn_chart_version }} --wait --timeout 15m + ns="longhorn-system" + recover_ns() { + phase="$(kubectl get ns "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)" + deleting="$(kubectl get ns "$ns" -o jsonpath='{.metadata.deletionTimestamp}' 2>/dev/null || true)" + if [ "$phase" = "Terminating" ] || [ -n "$deleting" ]; then + kubectl get ns "$ns" -o json > /tmp/ns.json || true + python3 -c "import json; obj=json.load(open('/tmp/ns.json')); obj.setdefault('spec',{}); obj['spec']['finalizers']=[]; json.dump(obj, open('/tmp/ns-finalize.json','w'))" || true + kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f /tmp/ns-finalize.json 
>/dev/null || true + kubectl delete ns "$ns" --ignore-not-found=true --wait=false || true + for i in $(seq 1 90); do + if ! kubectl get ns "$ns" >/dev/null 2>&1; then + break + fi + sleep 2 + done + fi + kubectl get ns "$ns" >/dev/null 2>&1 || kubectl create ns "$ns" + } + + for i in 1 2 3 4 5; do + set +e + out="$(helm upgrade --install longhorn longhorn/longhorn --namespace "$ns" --create-namespace -f {{ longhorn_values_dest }} --version {{ longhorn_chart_version }} --wait --timeout 15m 2>&1)" + rc=$? + set -e + if [ $rc -eq 0 ]; then + echo "$out" + exit 0 + fi + echo "$out" + if echo "$out" | grep -q "is being terminated"; then + echo "[WARN] namespace $ns is being terminated, recover and retry ($i/5)" + recover_ns + sleep $((i * 3)) + continue + fi + if echo "$out" | grep -q "engineimages.longhorn.io\" not found"; then + echo "[WARN] longhorn CRD propagation not ready, retry ($i/5)" + sleep $((i * 5)) + continue + fi + # 非命名空间终止类错误,直接失败 + exit $rc + done + # 兜底:仍失败则返回非 0 + exit 1 environment: KUBECONFIG: "{{ k3s_kubeconfig }}" args: diff --git a/ansible/playbooks/verify/03-08.yml b/ansible/playbooks/verify/03-08.yml index ce62189..333f328 100644 --- a/ansible/playbooks/verify/03-08.yml +++ b/ansible/playbooks/verify/03-08.yml @@ -6,5 +6,7 @@ doc_id: "03-08" doc_filename: "03-08-k3s-ha-集群配置与切换.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/03-09.yml b/ansible/playbooks/verify/03-09.yml index 71bbab7..b15a6de 100644 --- a/ansible/playbooks/verify/03-09.yml +++ b/ansible/playbooks/verify/03-09.yml @@ -6,5 +6,7 @@ doc_id: "03-09" doc_filename: "03-09-k3s-gitops-集群配置管理.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + 
ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/03-10.yml b/ansible/playbooks/verify/03-10.yml index 4d7e688..fce52f0 100644 --- a/ansible/playbooks/verify/03-10.yml +++ b/ansible/playbooks/verify/03-10.yml @@ -6,5 +6,7 @@ doc_id: "03-10" doc_filename: "03-10-k3s-traefik-custom-ports.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/04-01.yml b/ansible/playbooks/verify/04-01.yml index 2c8cb86..5ced983 100644 --- a/ansible/playbooks/verify/04-01.yml +++ b/ansible/playbooks/verify/04-01.yml @@ -48,23 +48,22 @@ verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" tasks: - name: Rollout status nodejs-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=240s - args: - executable: /bin/bash - changed_when: false + ansible.builtin.include_role: + name: verify_common + tasks_from: kubectl-rollout-status.yml + vars: + verify_rollout_ref: deployment/nodejs-demo + verify_rollout_timeout_s: 240 - name: HTTP check /node - ansible.builtin.shell: | - set -e - base="{{ verify_entry_base | trim | regex_replace('/+$','') }}" - url="$base/node" - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 10 "$url" 2>/dev/null || echo "000") - echo "$url -> $code" - test "$code" = "200" - args: - executable: /bin/bash - changed_when: false + ansible.builtin.include_role: + name: verify_common + tasks_from: http-curl-expect.yml + vars: + verify_http_entry_base: "{{ verify_entry_base }}" + verify_http_path: "/node" + verify_http_connect_timeout: 3 + verify_http_max_time: 10 + verify_http_assertion_label: nodejs_base_04_01_entry_http diff --git 
a/ansible/playbooks/verify/04-02.yml b/ansible/playbooks/verify/04-02.yml index 97f33dc..bcff7cd 100644 --- a/ansible/playbooks/verify/04-02.yml +++ b/ansible/playbooks/verify/04-02.yml @@ -5,12 +5,15 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-02-nodejs-demo.yaml" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-02/04-02-nodejs-demo.yaml" nodejs_manifest_dest: /tmp/nodejs-demo-04-02.yaml nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" nodejs_verify_path: "/node" nodejs_expected_target_port: 3000 + nodejs_http_assertion_label: nodejs_04_02_entry_http tasks: - name: Include nodejs deploy+verify template - ansible.builtin.include_tasks: tasks/nodejs-demo-deploy-verify.yml + ansible.builtin.include_role: + name: verify_common + tasks_from: nodejs-demo-deploy-verify.yml diff --git a/ansible/playbooks/verify/04-03.yml b/ansible/playbooks/verify/04-03.yml index 11894b4..ce7ff63 100644 --- a/ansible/playbooks/verify/04-03.yml +++ b/ansible/playbooks/verify/04-03.yml @@ -1,3 +1,4 @@ +--- - name: Deploy+Verify 04-03 nodejs image + command/args hosts: k3s_server become: true @@ -5,87 +6,14 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-03-nodejs-demo.yaml" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-03/04-03-nodejs-demo.yaml" nodejs_manifest_dest: /tmp/nodejs-demo-04-03.yaml nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" nodejs_verify_path: "/node" - nodejs_expected_target_port: 8080 + nodejs_expected_target_port: 3000 + nodejs_http_assertion_label: nodejs_04_03_entry_http tasks: - - name: Copy nodejs demo manifest - ansible.builtin.copy: - src: "{{ nodejs_manifest_src }}" - dest: "{{ 
nodejs_manifest_dest }}" - mode: "0644" - - - name: Apply nodejs demo manifest - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} - args: - executable: /bin/bash - changed_when: true - - - name: Rollout status nodejs-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false - - - name: Assert Service targetPort matches expected (optional) - when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 - ansible.builtin.shell: | - set -euo pipefail - exp="{{ nodejs_expected_target_port | int }}" - got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') - echo "svc/nodejs-demo targetPort=$got expected=$exp" - test "$got" = "$exp" - args: - executable: /bin/bash - changed_when: false - - - name: Assert Endpoints exist - ansible.builtin.shell: | - set -euo pipefail - eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) - echo "endpoints.ip=$eps" - test -n "$eps" - args: - executable: /bin/bash - changed_when: false - - - name: HTTP check nodejs demo (path/host optional) - when: nodejs_http_check_enabled | default(true) - ansible.builtin.shell: | - set -euo pipefail - base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" - path="{{ nodejs_verify_path | default('/node') }}" - url="$base${path}" - host="{{ nodejs_verify_host | default('') | trim }}" - - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - if [ -n "$host" ]; then - code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - else - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") 
- fi - echo "try $i: $url host=${host:-} -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - args: - executable: /bin/bash - changed_when: false - - - name: Teardown when VERIFY_TEARDOWN=1 - when: verify_teardown == "1" - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true - args: - executable: /bin/bash - changed_when: true - + - name: Include nodejs deploy+verify template + ansible.builtin.include_role: + name: verify_common + tasks_from: nodejs-demo-deploy-verify.yml diff --git a/ansible/playbooks/verify/04-04.yml b/ansible/playbooks/verify/04-04.yml index 60219f3..86bc84e 100644 --- a/ansible/playbooks/verify/04-04.yml +++ b/ansible/playbooks/verify/04-04.yml @@ -1,3 +1,4 @@ +--- - name: Deploy+Verify 04-04 nodejs env + config injection hosts: k3s_server become: true @@ -5,87 +6,14 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-04-nodejs-demo.yaml" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-04/04-04-nodejs-demo.yaml" nodejs_manifest_dest: /tmp/nodejs-demo-04-04.yaml nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" nodejs_verify_path: "/node" nodejs_expected_target_port: 8080 + nodejs_http_assertion_label: nodejs_04_04_entry_http tasks: - - name: Copy nodejs demo manifest - ansible.builtin.copy: - src: "{{ nodejs_manifest_src }}" - dest: "{{ nodejs_manifest_dest }}" - mode: "0644" - - - name: Apply nodejs demo manifest - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} - args: - executable: /bin/bash - changed_when: true - - - name: Rollout status nodejs-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default 
--timeout=180s - args: - executable: /bin/bash - changed_when: false - - - name: Assert Service targetPort matches expected (optional) - when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 - ansible.builtin.shell: | - set -euo pipefail - exp="{{ nodejs_expected_target_port | int }}" - got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') - echo "svc/nodejs-demo targetPort=$got expected=$exp" - test "$got" = "$exp" - args: - executable: /bin/bash - changed_when: false - - - name: Assert Endpoints exist - ansible.builtin.shell: | - set -euo pipefail - eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) - echo "endpoints.ip=$eps" - test -n "$eps" - args: - executable: /bin/bash - changed_when: false - - - name: HTTP check nodejs demo (path/host optional) - when: nodejs_http_check_enabled | default(true) - ansible.builtin.shell: | - set -euo pipefail - base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" - path="{{ nodejs_verify_path | default('/node') }}" - url="$base${path}" - host="{{ nodejs_verify_host | default('') | trim }}" - - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - if [ -n "$host" ]; then - code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - else - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - fi - echo "try $i: $url host=${host:-} -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - args: - executable: /bin/bash - changed_when: false - - - name: Teardown when VERIFY_TEARDOWN=1 - when: verify_teardown == "1" - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true - args: - 
executable: /bin/bash - changed_when: true - + - name: Include nodejs deploy+verify template + ansible.builtin.include_role: + name: verify_common + tasks_from: nodejs-demo-deploy-verify.yml diff --git a/ansible/playbooks/verify/04-05.yml b/ansible/playbooks/verify/04-05.yml index dc97da2..db6fa08 100644 --- a/ansible/playbooks/verify/04-05.yml +++ b/ansible/playbooks/verify/04-05.yml @@ -1,3 +1,4 @@ +--- - name: Deploy+Verify 04-05 nodejs probes hosts: k3s_server become: true @@ -5,87 +6,14 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-05-nodejs-demo.yaml" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-05/04-05-nodejs-demo.yaml" nodejs_manifest_dest: /tmp/nodejs-demo-04-05.yaml nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" nodejs_verify_path: "/node" nodejs_expected_target_port: 8080 + nodejs_http_assertion_label: nodejs_04_05_entry_http tasks: - - name: Copy nodejs demo manifest - ansible.builtin.copy: - src: "{{ nodejs_manifest_src }}" - dest: "{{ nodejs_manifest_dest }}" - mode: "0644" - - - name: Apply nodejs demo manifest - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} - args: - executable: /bin/bash - changed_when: true - - - name: Rollout status nodejs-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false - - - name: Assert Service targetPort matches expected (optional) - when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 - ansible.builtin.shell: | - set -euo pipefail - exp="{{ nodejs_expected_target_port | int }}" - got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o 
jsonpath='{.spec.ports[0].targetPort}') - echo "svc/nodejs-demo targetPort=$got expected=$exp" - test "$got" = "$exp" - args: - executable: /bin/bash - changed_when: false - - - name: Assert Endpoints exist - ansible.builtin.shell: | - set -euo pipefail - eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) - echo "endpoints.ip=$eps" - test -n "$eps" - args: - executable: /bin/bash - changed_when: false - - - name: HTTP check nodejs demo (path/host optional) - when: nodejs_http_check_enabled | default(true) - ansible.builtin.shell: | - set -euo pipefail - base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" - path="{{ nodejs_verify_path | default('/node') }}" - url="$base${path}" - host="{{ nodejs_verify_host | default('') | trim }}" - - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - if [ -n "$host" ]; then - code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - else - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - fi - echo "try $i: $url host=${host:-} -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - args: - executable: /bin/bash - changed_when: false - - - name: Teardown when VERIFY_TEARDOWN=1 - when: verify_teardown == "1" - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true - args: - executable: /bin/bash - changed_when: true - + - name: Include nodejs deploy+verify template + ansible.builtin.include_role: + name: verify_common + tasks_from: nodejs-demo-deploy-verify.yml diff --git a/ansible/playbooks/verify/04-06.yml b/ansible/playbooks/verify/04-06.yml index 03b53df..5713e72 100644 --- a/ansible/playbooks/verify/04-06.yml +++ b/ansible/playbooks/verify/04-06.yml @@ -1,3 +1,4 @@ 
+--- - name: Deploy+Verify 04-06 nodejs replicas + rolling update hosts: k3s_server become: true @@ -5,87 +6,14 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-06-nodejs-demo.yaml" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-06/04-06-nodejs-demo.yaml" nodejs_manifest_dest: /tmp/nodejs-demo-04-06.yaml nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" nodejs_verify_path: "/node" nodejs_expected_target_port: 8080 + nodejs_http_assertion_label: nodejs_04_06_entry_http tasks: - - name: Copy nodejs demo manifest - ansible.builtin.copy: - src: "{{ nodejs_manifest_src }}" - dest: "{{ nodejs_manifest_dest }}" - mode: "0644" - - - name: Apply nodejs demo manifest - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} - args: - executable: /bin/bash - changed_when: true - - - name: Rollout status nodejs-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false - - - name: Assert Service targetPort matches expected (optional) - when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 - ansible.builtin.shell: | - set -euo pipefail - exp="{{ nodejs_expected_target_port | int }}" - got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') - echo "svc/nodejs-demo targetPort=$got expected=$exp" - test "$got" = "$exp" - args: - executable: /bin/bash - changed_when: false - - - name: Assert Endpoints exist - ansible.builtin.shell: | - set -euo pipefail - eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) - echo 
"endpoints.ip=$eps" - test -n "$eps" - args: - executable: /bin/bash - changed_when: false - - - name: HTTP check nodejs demo (path/host optional) - when: nodejs_http_check_enabled | default(true) - ansible.builtin.shell: | - set -euo pipefail - base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" - path="{{ nodejs_verify_path | default('/node') }}" - url="$base${path}" - host="{{ nodejs_verify_host | default('') | trim }}" - - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - if [ -n "$host" ]; then - code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - else - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - fi - echo "try $i: $url host=${host:-} -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - args: - executable: /bin/bash - changed_when: false - - - name: Teardown when VERIFY_TEARDOWN=1 - when: verify_teardown == "1" - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true - args: - executable: /bin/bash - changed_when: true - + - name: Include nodejs deploy+verify template + ansible.builtin.include_role: + name: verify_common + tasks_from: nodejs-demo-deploy-verify.yml diff --git a/ansible/playbooks/verify/04-07.yml b/ansible/playbooks/verify/04-07.yml index 00537ea..177cfca 100644 --- a/ansible/playbooks/verify/04-07.yml +++ b/ansible/playbooks/verify/04-07.yml @@ -1,3 +1,4 @@ +--- - name: Deploy+Verify 04-07 nodejs Ingress + Traefik hosts: k3s_server become: true @@ -5,87 +6,25 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-07-nodejs-demo.yaml" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-07/04-07-nodejs-demo.yaml" 
nodejs_manifest_dest: /tmp/nodejs-demo-04-07.yaml nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" - nodejs_verify_path: "/api/" - nodejs_verify_host: "{{ nodejs_verify_host | default('app.example.local') }}" + nodejs_verify_path: "/node" + nodejs_verify_host: "" + _nodejs_tls_entry_base: "{{ NODEJS_TLS_ENTRY_BASE | default(lookup('env', 'NODEJS_TLS_ENTRY_BASE') | default('', true), true) | trim }}" + _nodejs_tls_host: "{{ NODEJS_TLS_HOST | default(lookup('env', 'NODEJS_TLS_HOST') | default('', true), true) | trim }}" + nodejs_tls_probe_enabled: "{{ (_nodejs_tls_entry_base | length > 0) and (_nodejs_tls_host | length > 0) and (_nodejs_tls_entry_base is match('^https?://')) }}" + _nodejs_tls_authority: "{{ _nodejs_tls_entry_base | regex_replace('^https?://', '') | regex_replace('/.*$', '') }}" + nodejs_tls_connect_host: "{{ _nodejs_tls_authority | regex_replace(':([0-9]+)$', '') }}" + nodejs_tls_connect_port: "{{ (_nodejs_tls_authority | regex_replace('^[^:]*:', '') | int) if (':' in _nodejs_tls_authority) else 443 }}" + nodejs_tls_sni_probe_enabled: "{{ nodejs_tls_probe_enabled | bool }}" + nodejs_tls_sni_connect_host: "{{ nodejs_tls_connect_host }}" + nodejs_tls_sni_port: "{{ nodejs_tls_connect_port }}" + nodejs_tls_sni_servername: "{{ _nodejs_tls_host }}" + nodejs_tls_sni_assertion_label: nodejs_04_07_tls_sni_handshake + nodejs_http_assertion_label: nodejs_04_07_entry_http tasks: - - name: Copy nodejs demo manifest - ansible.builtin.copy: - src: "{{ nodejs_manifest_src }}" - dest: "{{ nodejs_manifest_dest }}" - mode: "0644" - - - name: Apply nodejs demo manifest - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} - args: - executable: /bin/bash - changed_when: true - - - name: Rollout status nodejs-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s - args: - 
executable: /bin/bash - changed_when: false - - - name: Assert Service targetPort matches expected (optional) - when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 - ansible.builtin.shell: | - set -euo pipefail - exp="{{ nodejs_expected_target_port | int }}" - got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') - echo "svc/nodejs-demo targetPort=$got expected=$exp" - test "$got" = "$exp" - args: - executable: /bin/bash - changed_when: false - - - name: Assert Endpoints exist - ansible.builtin.shell: | - set -euo pipefail - eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) - echo "endpoints.ip=$eps" - test -n "$eps" - args: - executable: /bin/bash - changed_when: false - - - name: HTTP check nodejs demo (path/host optional) - when: nodejs_http_check_enabled | default(true) - ansible.builtin.shell: | - set -euo pipefail - base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" - path="{{ nodejs_verify_path | default('/node') }}" - url="$base${path}" - host="{{ nodejs_verify_host | default('') | trim }}" - - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - if [ -n "$host" ]; then - code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - else - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - fi - echo "try $i: $url host=${host:-} -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - args: - executable: /bin/bash - changed_when: false - - - name: Teardown when VERIFY_TEARDOWN=1 - when: verify_teardown == "1" - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true - args: - executable: /bin/bash - 
changed_when: true - + - name: Include nodejs deploy+verify template + ansible.builtin.include_role: + name: verify_common + tasks_from: nodejs-demo-deploy-verify.yml diff --git a/ansible/playbooks/verify/04-08.yml b/ansible/playbooks/verify/04-08.yml index 8b4370b..e99b016 100644 --- a/ansible/playbooks/verify/04-08.yml +++ b/ansible/playbooks/verify/04-08.yml @@ -1,3 +1,4 @@ +--- - name: Deploy+Verify 04-08 nodejs resources requests/limits hosts: k3s_server become: true @@ -5,87 +6,14 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-08-nodejs-demo.yaml" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-08/04-08-nodejs-demo.yaml" nodejs_manifest_dest: /tmp/nodejs-demo-04-08.yaml nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" nodejs_verify_path: "/node" nodejs_expected_target_port: 8080 + nodejs_http_assertion_label: nodejs_04_08_entry_http tasks: - - name: Copy nodejs demo manifest - ansible.builtin.copy: - src: "{{ nodejs_manifest_src }}" - dest: "{{ nodejs_manifest_dest }}" - mode: "0644" - - - name: Apply nodejs demo manifest - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} - args: - executable: /bin/bash - changed_when: true - - - name: Rollout status nodejs-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false - - - name: Assert Service targetPort matches expected (optional) - when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 - ansible.builtin.shell: | - set -euo pipefail - exp="{{ nodejs_expected_target_port | int }}" - got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o 
jsonpath='{.spec.ports[0].targetPort}') - echo "svc/nodejs-demo targetPort=$got expected=$exp" - test "$got" = "$exp" - args: - executable: /bin/bash - changed_when: false - - - name: Assert Endpoints exist - ansible.builtin.shell: | - set -euo pipefail - eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) - echo "endpoints.ip=$eps" - test -n "$eps" - args: - executable: /bin/bash - changed_when: false - - - name: HTTP check nodejs demo (path/host optional) - when: nodejs_http_check_enabled | default(true) - ansible.builtin.shell: | - set -euo pipefail - base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" - path="{{ nodejs_verify_path | default('/node') }}" - url="$base${path}" - host="{{ nodejs_verify_host | default('') | trim }}" - - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - if [ -n "$host" ]; then - code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - else - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - fi - echo "try $i: $url host=${host:-} -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - args: - executable: /bin/bash - changed_when: false - - - name: Teardown when VERIFY_TEARDOWN=1 - when: verify_teardown == "1" - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true - args: - executable: /bin/bash - changed_when: true - + - name: Include nodejs deploy+verify template + ansible.builtin.include_role: + name: verify_common + tasks_from: nodejs-demo-deploy-verify.yml diff --git a/ansible/playbooks/verify/04-09.yml b/ansible/playbooks/verify/04-09.yml index d615d6d..55cb590 100644 --- a/ansible/playbooks/verify/04-09.yml +++ b/ansible/playbooks/verify/04-09.yml @@ -1,3 +1,4 @@ 
+--- - name: Deploy+Verify 04-09 nodejs scheduling/affinity hosts: k3s_server become: true @@ -5,87 +6,14 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-09-nodejs-demo.yaml" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-09/04-09-nodejs-demo.yaml" nodejs_manifest_dest: /tmp/nodejs-demo-04-09.yaml nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" nodejs_verify_path: "/node" nodejs_expected_target_port: 8080 + nodejs_http_assertion_label: nodejs_04_09_entry_http tasks: - - name: Copy nodejs demo manifest - ansible.builtin.copy: - src: "{{ nodejs_manifest_src }}" - dest: "{{ nodejs_manifest_dest }}" - mode: "0644" - - - name: Apply nodejs demo manifest - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} - args: - executable: /bin/bash - changed_when: true - - - name: Rollout status nodejs-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false - - - name: Assert Service targetPort matches expected (optional) - when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 - ansible.builtin.shell: | - set -euo pipefail - exp="{{ nodejs_expected_target_port | int }}" - got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') - echo "svc/nodejs-demo targetPort=$got expected=$exp" - test "$got" = "$exp" - args: - executable: /bin/bash - changed_when: false - - - name: Assert Endpoints exist - ansible.builtin.shell: | - set -euo pipefail - eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) - echo 
"endpoints.ip=$eps" - test -n "$eps" - args: - executable: /bin/bash - changed_when: false - - - name: HTTP check nodejs demo (path/host optional) - when: nodejs_http_check_enabled | default(true) - ansible.builtin.shell: | - set -euo pipefail - base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" - path="{{ nodejs_verify_path | default('/node') }}" - url="$base${path}" - host="{{ nodejs_verify_host | default('') | trim }}" - - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - if [ -n "$host" ]; then - code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - else - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - fi - echo "try $i: $url host=${host:-} -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - args: - executable: /bin/bash - changed_when: false - - - name: Teardown when VERIFY_TEARDOWN=1 - when: verify_teardown == "1" - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true - args: - executable: /bin/bash - changed_when: true - + - name: Include nodejs deploy+verify template + ansible.builtin.include_role: + name: verify_common + tasks_from: nodejs-demo-deploy-verify.yml diff --git a/ansible/playbooks/verify/04-10.yml b/ansible/playbooks/verify/04-10.yml index 773785f..c6084c9 100644 --- a/ansible/playbooks/verify/04-10.yml +++ b/ansible/playbooks/verify/04-10.yml @@ -1,3 +1,4 @@ +--- - name: Deploy+Verify 04-10 nodejs securityContext hosts: k3s_server become: true @@ -5,87 +6,15 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-10-nodejs-demo.yaml" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-10/04-10-nodejs-demo.yaml" 
nodejs_manifest_dest: /tmp/nodejs-demo-04-10.yaml nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" - nodejs_verify_path: "/node" + nodejs_verify_path: "/api" + nodejs_verify_host: "app.example.local" nodejs_expected_target_port: 8080 + nodejs_http_assertion_label: nodejs_04_10_entry_http tasks: - - name: Copy nodejs demo manifest - ansible.builtin.copy: - src: "{{ nodejs_manifest_src }}" - dest: "{{ nodejs_manifest_dest }}" - mode: "0644" - - - name: Apply nodejs demo manifest - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} - args: - executable: /bin/bash - changed_when: true - - - name: Rollout status nodejs-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false - - - name: Assert Service targetPort matches expected (optional) - when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 - ansible.builtin.shell: | - set -euo pipefail - exp="{{ nodejs_expected_target_port | int }}" - got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') - echo "svc/nodejs-demo targetPort=$got expected=$exp" - test "$got" = "$exp" - args: - executable: /bin/bash - changed_when: false - - - name: Assert Endpoints exist - ansible.builtin.shell: | - set -euo pipefail - eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) - echo "endpoints.ip=$eps" - test -n "$eps" - args: - executable: /bin/bash - changed_when: false - - - name: HTTP check nodejs demo (path/host optional) - when: nodejs_http_check_enabled | default(true) - ansible.builtin.shell: | - set -euo pipefail - base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" - path="{{ 
nodejs_verify_path | default('/node') }}" - url="$base${path}" - host="{{ nodejs_verify_host | default('') | trim }}" - - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - if [ -n "$host" ]; then - code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - else - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - fi - echo "try $i: $url host=${host:-} -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - args: - executable: /bin/bash - changed_when: false - - - name: Teardown when VERIFY_TEARDOWN=1 - when: verify_teardown == "1" - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true - args: - executable: /bin/bash - changed_when: true - + - name: Include nodejs deploy+verify template + ansible.builtin.include_role: + name: verify_common + tasks_from: nodejs-demo-deploy-verify.yml diff --git a/ansible/playbooks/verify/04-11.yml b/ansible/playbooks/verify/04-11.yml index 2264ed2..e4d5d86 100644 --- a/ansible/playbooks/verify/04-11.yml +++ b/ansible/playbooks/verify/04-11.yml @@ -1,3 +1,4 @@ +--- - name: Deploy+Verify 04-11 nodejs storage/volumes hosts: k3s_server become: true @@ -5,87 +6,15 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-11-nodejs-demo.yaml" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-11/04-11-nodejs-demo.yaml" nodejs_manifest_dest: /tmp/nodejs-demo-04-11.yaml nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" - nodejs_verify_path: "/node" + nodejs_verify_path: "/api" + nodejs_verify_host: "app.example.local" nodejs_expected_target_port: 8080 + nodejs_http_assertion_label: nodejs_04_11_entry_http tasks: - - name: 
Copy nodejs demo manifest - ansible.builtin.copy: - src: "{{ nodejs_manifest_src }}" - dest: "{{ nodejs_manifest_dest }}" - mode: "0644" - - - name: Apply nodejs demo manifest - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} - args: - executable: /bin/bash - changed_when: true - - - name: Rollout status nodejs-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false - - - name: Assert Service targetPort matches expected (optional) - when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 - ansible.builtin.shell: | - set -euo pipefail - exp="{{ nodejs_expected_target_port | int }}" - got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') - echo "svc/nodejs-demo targetPort=$got expected=$exp" - test "$got" = "$exp" - args: - executable: /bin/bash - changed_when: false - - - name: Assert Endpoints exist - ansible.builtin.shell: | - set -euo pipefail - eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) - echo "endpoints.ip=$eps" - test -n "$eps" - args: - executable: /bin/bash - changed_when: false - - - name: HTTP check nodejs demo (path/host optional) - when: nodejs_http_check_enabled | default(true) - ansible.builtin.shell: | - set -euo pipefail - base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" - path="{{ nodejs_verify_path | default('/node') }}" - url="$base${path}" - host="{{ nodejs_verify_host | default('') | trim }}" - - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - if [ -n "$host" ]; then - code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - else - code=$(curl -s 
-o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - fi - echo "try $i: $url host=${host:-} -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - args: - executable: /bin/bash - changed_when: false - - - name: Teardown when VERIFY_TEARDOWN=1 - when: verify_teardown == "1" - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true - args: - executable: /bin/bash - changed_when: true - + - name: Include nodejs deploy+verify template + ansible.builtin.include_role: + name: verify_common + tasks_from: nodejs-demo-deploy-verify.yml diff --git a/ansible/playbooks/verify/04-12.yml b/ansible/playbooks/verify/04-12.yml index e6e0f47..78dcffc 100644 --- a/ansible/playbooks/verify/04-12.yml +++ b/ansible/playbooks/verify/04-12.yml @@ -5,21 +5,91 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-12-nodejs-demo.yaml" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-12/04-12-nodejs-demo.yaml" nodejs_manifest_dest: /tmp/nodejs-demo-04-12.yaml - # 默认不强行跑 HTTPS curl(需要 DNS/证书/入口);提供环境变量时再启用 - nodejs_http_check_enabled: "{{ (NODEJS_TLS_ENTRY_BASE is defined) and (NODEJS_TLS_HOST is defined) }}" - nodejs_verify_entry_base: "{{ NODEJS_TLS_ENTRY_BASE | default('https://app.example.local') }}" + # extra var 或环境变量(与 04-07 一致) + _nodejs_tls_entry_base: "{{ NODEJS_TLS_ENTRY_BASE | default(lookup('env', 'NODEJS_TLS_ENTRY_BASE') | default('', true), true) | trim }}" + _nodejs_tls_host: "{{ NODEJS_TLS_HOST | default(lookup('env', 'NODEJS_TLS_HOST') | default('', true), true) | trim }}" + create_nodejs_demo_tls_secret: "{{ (lookup('env', 'CREATE_NODEJS_DEMO_TLS_SECRET') | default('0', true) | trim) == '1' }}" + # 与清单 Ingress 默认 host 对齐;未 export NODEJS_TLS_* 时仍可用自签 + 默认 host + 
nodejs_verify_entry_base: "{{ (_nodejs_tls_entry_base | length > 0) | ternary(_nodejs_tls_entry_base, 'https://app.example.local') }}" nodejs_verify_path: "/api/" - nodejs_verify_host: "{{ NODEJS_TLS_HOST | default('app.example.local') }}" + nodejs_verify_host: "{{ (_nodejs_tls_host | length > 0) | ternary(_nodejs_tls_host, 'app.example.local') }}" + # 自签实验室路径或显式 NODEJS_TLS_* 时跑 openssl/curl + nodejs_http_check_enabled: "{{ (create_nodejs_demo_tls_secret | bool) or ((_nodejs_tls_entry_base | length > 0) and (_nodejs_tls_host | length > 0)) }}" + _nodejs_tls_authority: "{{ nodejs_verify_entry_base | trim | regex_replace('^https?://', '') | regex_replace('/.*$', '') }}" + nodejs_tls_connect_host: "{{ _nodejs_tls_authority | regex_replace(':([0-9]+)$', '') }}" + nodejs_tls_connect_port: "{{ (_nodejs_tls_authority | regex_replace('^[^:]*:', '') | int) if (':' in _nodejs_tls_authority) else 443 }}" tasks: - - name: Assert TLS secret exists (nodejs-demo-tls) + - name: Check TLS secret exists (nodejs-demo-tls) ansible.builtin.shell: | set -e KUBECONFIG={{ k3s_kubeconfig }} kubectl -n default get secret nodejs-demo-tls args: executable: /bin/bash changed_when: false + failed_when: false + register: nodejs_tls_secret_check + + - name: Create self-signed nodejs-demo-tls (lab opt-in, matches Ingress host in manifest) + when: + - nodejs_tls_secret_check.rc != 0 + - create_nodejs_demo_tls_secret | bool + ansible.builtin.shell: | + set -euo pipefail + export KUBECONFIG={{ k3s_kubeconfig }} + HOST={{ nodejs_verify_host | quote }} + TMP=$(mktemp -d) + trap 'rm -rf "$TMP"' EXIT + echo "[OC-ASSERT] assertion=nodejs_tls_selfsigned phase=cluster probe=openssl_san host=${HOST}" + if openssl req -help 2>&1 | grep -q -- -addext; then + openssl req -x509 -nodes -days 365 -newkey rsa:2048 \ + -keyout "$TMP/tls.key" -out "$TMP/tls.crt" \ + -subj "/CN=${HOST}" \ + -addext "subjectAltName=DNS:${HOST}" + else + : >"$TMP/cnf" + printf '%s\n' '[req]' 'distinguished_name = req_distinguished_name' 
'x509_extensions = v3_ca' 'prompt = no' >>"$TMP/cnf" + printf '%s\n' '[req_distinguished_name]' "CN = ${HOST}" >>"$TMP/cnf" + printf '%s\n' '[v3_ca]' 'subjectAltName = @alt_names' '[alt_names]' "DNS.1 = ${HOST}" >>"$TMP/cnf" + openssl req -x509 -nodes -days 365 -newkey rsa:2048 \ + -keyout "$TMP/tls.key" -out "$TMP/tls.crt" \ + -config "$TMP/cnf" -extensions v3_ca + fi + kubectl create secret tls nodejs-demo-tls \ + --cert="$TMP/tls.crt" --key="$TMP/tls.key" -n default \ + --dry-run=client -o yaml | kubectl apply -f - + args: + executable: /bin/bash + register: nodejs_tls_autocreate + changed_when: true + + - name: Remember autocreated TLS secret for teardown + ansible.builtin.set_fact: + nodejs_tls_secret_autocreated: true + when: + - create_nodejs_demo_tls_secret | bool + - nodejs_tls_autocreate is defined + - not (nodejs_tls_autocreate.skipped | default(false)) + - (nodejs_tls_autocreate.rc | default(1)) == 0 + + - name: Re-read TLS secret (after optional create) + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl -n default get secret nodejs-demo-tls + args: + executable: /bin/bash + changed_when: false + failed_when: false + register: nodejs_tls_secret_check + + - name: Gate when TLS secret missing + when: nodejs_tls_secret_check.rc != 0 + ansible.builtin.include_role: + name: verify_common + tasks_from: gate-debug-end-play.yml + vars: + verify_gate_message: "[GATE] skipped doc_id=04-12 reason=missing_tls_secret secret=nodejs-demo-tls namespace=default hint=export CREATE_NODEJS_DEMO_TLS_SECRET=1 or kubectl create secret tls" - name: Copy nodejs demo manifest ansible.builtin.copy: @@ -36,12 +106,11 @@ changed_when: true - name: Rollout status nodejs-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false + ansible.builtin.include_role: + name: verify_common + tasks_from: 
kubectl-rollout-status.yml + vars: + verify_rollout_ref: deployment/nodejs-demo - name: Assert Service targetPort matches expected (optional) when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 @@ -56,39 +125,36 @@ changed_when: false - name: Assert Endpoints exist - ansible.builtin.shell: | - set -euo pipefail - eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) - echo "endpoints.ip=$eps" - test -n "$eps" - args: - executable: /bin/bash - changed_when: false + ansible.builtin.include_role: + name: verify_common + tasks_from: kubectl-endpoints-ready.yml + vars: + verify_endpoints_service: nodejs-demo + verify_endpoints_assertion_label: nodejs_demo_endpoints - - name: HTTP check nodejs demo (path/host optional) - when: nodejs_http_check_enabled | default(true) - ansible.builtin.shell: | - set -euo pipefail - base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" - path="{{ nodejs_verify_path | default('/node') }}" - url="$base${path}" - host="{{ nodejs_verify_host | default('') | trim }}" + - name: TLS SNI + certificate subject (openssl) + when: nodejs_http_check_enabled | default(false) + ansible.builtin.include_role: + name: verify_common + tasks_from: tls-openssl-sni.yml + vars: + verify_tls_connect_host: "{{ nodejs_tls_connect_host }}" + verify_tls_port: "{{ nodejs_tls_connect_port }}" + verify_tls_servername: "{{ nodejs_verify_host }}" + verify_tls_expect_subject_substring: "{{ nodejs_verify_host }}" + verify_tls_assertion_label: nodejs_demo_tls_sni_handshake - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - if [ -n "$host" ]; then - code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - else - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - fi - echo "try $i: $url 
host=${host:-} -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - args: - executable: /bin/bash - changed_when: false + - name: HTTPS check nodejs demo (path/host) + when: nodejs_http_check_enabled | default(false) + ansible.builtin.include_role: + name: verify_common + tasks_from: http-curl-expect.yml + vars: + verify_http_entry_base: "{{ nodejs_verify_entry_base }}" + verify_http_path: "{{ nodejs_verify_path | default('/api/') }}" + verify_http_host_header: "{{ nodejs_verify_host }}" + verify_http_assertion_label: nodejs_tls_ingress_entry_http + verify_http_tls_insecure: "{{ (create_nodejs_demo_tls_secret | bool) or ((lookup('env', 'NODEJS_TLS_CURL_INSECURE') | default('0', true) | trim) == '1') }}" - name: Teardown when VERIFY_TEARDOWN=1 when: verify_teardown == "1" @@ -99,3 +165,13 @@ executable: /bin/bash changed_when: true + - name: Delete autocreated TLS secret on teardown + when: + - verify_teardown == "1" + - nodejs_tls_secret_autocreated | default(false) | bool + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete secret nodejs-demo-tls -n default --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/04-13.yml b/ansible/playbooks/verify/04-13.yml index 41d94bf..d7f8a36 100644 --- a/ansible/playbooks/verify/04-13.yml +++ b/ansible/playbooks/verify/04-13.yml @@ -1,3 +1,4 @@ +--- - name: Deploy+Verify 04-13 nodejs HPA hosts: k3s_server become: true @@ -5,77 +6,24 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-13-nodejs-demo.yaml" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-13/04-13-nodejs-demo.yaml" nodejs_manifest_dest: /tmp/nodejs-demo-04-13.yaml nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" - nodejs_verify_path: "/node" + 
nodejs_verify_path: "/" nodejs_expected_target_port: 8080 + nodejs_http_check_enabled: false + nodejs_http_assertion_label: nodejs_04_13_entry_http + nodejs_verify_skip_teardown: true tasks: - - name: Copy nodejs demo manifest - ansible.builtin.copy: - src: "{{ nodejs_manifest_src }}" - dest: "{{ nodejs_manifest_dest }}" - mode: "0644" + - name: Include nodejs deploy+verify template + ansible.builtin.include_role: + name: verify_common + tasks_from: nodejs-demo-deploy-verify.yml - - name: Apply nodejs demo manifest + - name: Assert HPA exists ansible.builtin.shell: | set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} - args: - executable: /bin/bash - changed_when: true - - - name: Rollout status nodejs-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false - - - name: Assert Service targetPort matches expected (optional) - when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 - ansible.builtin.shell: | - set -euo pipefail - exp="{{ nodejs_expected_target_port | int }}" - got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') - echo "svc/nodejs-demo targetPort=$got expected=$exp" - test "$got" = "$exp" - args: - executable: /bin/bash - changed_when: false - - - name: Assert Endpoints exist - ansible.builtin.shell: | - set -euo pipefail - eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) - echo "endpoints.ip=$eps" - test -n "$eps" - args: - executable: /bin/bash - changed_when: false - - - name: HTTP check nodejs demo (path/host optional) - when: nodejs_http_check_enabled | default(true) - ansible.builtin.shell: | - set -euo pipefail - base="{{ nodejs_verify_entry_base | trim | 
regex_replace('/+$','') }}" - path="{{ nodejs_verify_path | default('/node') }}" - url="$base${path}" - host="{{ nodejs_verify_host | default('') | trim }}" - - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - if [ -n "$host" ]; then - code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - else - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - fi - echo "try $i: $url host=${host:-} -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" + KUBECONFIG={{ k3s_kubeconfig }} kubectl -n default get hpa nodejs-demo args: executable: /bin/bash changed_when: false @@ -88,12 +36,3 @@ args: executable: /bin/bash changed_when: true - - - name: Assert HPA exists - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl -n default get hpa nodejs-demo - args: - executable: /bin/bash - changed_when: false - diff --git a/ansible/playbooks/verify/04-14.yml b/ansible/playbooks/verify/04-14.yml index 2a93201..d0e6a13 100644 --- a/ansible/playbooks/verify/04-14.yml +++ b/ansible/playbooks/verify/04-14.yml @@ -3,12 +3,49 @@ become: true run_once: true vars: + doc_id: "04-14" + repo_root: "/root/Deploy-Laboratory" k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-14-nodejs-demo.yaml" - nodejs_manifest_dest: /tmp/nodejs-demo-04-14.yaml - nodejs_http_check_enabled: false tasks: - - name: Include nodejs deploy+verify template - ansible.builtin.include_tasks: tasks/nodejs-demo-deploy-verify.yml + - name: Find docs file by doc_id prefix + ansible.builtin.find: + paths: "{{ repo_root }}/docs" + patterns: "{{ doc_id }}-*.md" + file_type: file + use_regex: false + register: _docs_found + delegate_to: localhost + become: false + + - name: Fail when docs file missing + 
ansible.builtin.assert: + that: + - _docs_found.matched | int >= 1 + fail_msg: "docs file missing by prefix: docs/{{ doc_id }}-*.md" + delegate_to: localhost + become: false + + - name: Assert ansible/files doc_id directory exists + ansible.builtin.stat: + path: "{{ repo_root }}/ansible/files/{{ doc_id }}" + register: _files_dir + delegate_to: localhost + become: false + + - name: Fail when ansible/files doc_id directory missing + ansible.builtin.assert: + that: + - _files_dir.stat.exists + - _files_dir.stat.isdir + fail_msg: "ansible/files missing doc_id directory: ansible/files/{{ doc_id }}" + delegate_to: localhost + become: false + + - name: Verify cluster reachable (kubectl get nodes) + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig }} kubectl get nodes + args: + executable: /bin/bash + changed_when: false diff --git a/ansible/playbooks/verify/05-01.yml b/ansible/playbooks/verify/05-01.yml index 184e751..5e3915a 100644 --- a/ansible/playbooks/verify/05-01.yml +++ b/ansible/playbooks/verify/05-01.yml @@ -6,5 +6,7 @@ doc_id: "05-01" doc_filename: "05-01-k3s-部署homer首页面板.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml \ No newline at end of file + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml \ No newline at end of file diff --git a/ansible/playbooks/verify/05-02.yml b/ansible/playbooks/verify/05-02.yml index aec8e7f..f643700 100644 --- a/ansible/playbooks/verify/05-02.yml +++ b/ansible/playbooks/verify/05-02.yml @@ -6,5 +6,7 @@ doc_id: "05-02" doc_filename: "05-02-onenav首页面板.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/05-03.yml 
b/ansible/playbooks/verify/05-03.yml index d382717..6972530 100644 --- a/ansible/playbooks/verify/05-03.yml +++ b/ansible/playbooks/verify/05-03.yml @@ -6,5 +6,7 @@ doc_id: "05-03" doc_filename: "05-03-k3s-安装gitlab-含runner.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/05-04.yml b/ansible/playbooks/verify/05-04.yml index 0e8f26a..85ff565 100644 --- a/ansible/playbooks/verify/05-04.yml +++ b/ansible/playbooks/verify/05-04.yml @@ -6,5 +6,7 @@ doc_id: "05-04" doc_filename: "05-04-k3s-配置gitlab-cicd.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/05-05.yml b/ansible/playbooks/verify/05-05.yml index e393d72..7fec2e7 100644 --- a/ansible/playbooks/verify/05-05.yml +++ b/ansible/playbooks/verify/05-05.yml @@ -6,5 +6,7 @@ doc_id: "05-05" doc_filename: "05-05-prometheus与grafana.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/05-06.yml b/ansible/playbooks/verify/05-06.yml index 2d9000e..1ba175c 100644 --- a/ansible/playbooks/verify/05-06.yml +++ b/ansible/playbooks/verify/05-06.yml @@ -6,5 +6,7 @@ doc_id: "05-06" doc_filename: "05-06-openlist挂载网盘与自动备份.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml 
diff --git a/ansible/playbooks/verify/05-07.yml b/ansible/playbooks/verify/05-07.yml index 15c0513..de0dd19 100644 --- a/ansible/playbooks/verify/05-07.yml +++ b/ansible/playbooks/verify/05-07.yml @@ -6,5 +6,7 @@ doc_id: "05-07" doc_filename: "05-07-openclaw应用部署.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/05-08.yml b/ansible/playbooks/verify/05-08.yml index e76217f..ac243ae 100644 --- a/ansible/playbooks/verify/05-08.yml +++ b/ansible/playbooks/verify/05-08.yml @@ -6,5 +6,7 @@ doc_id: "05-08" doc_filename: "05-08-openclaw-k3s-实验部署.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/05-09.yml b/ansible/playbooks/verify/05-09.yml index 2c352fe..8818c5a 100644 --- a/ansible/playbooks/verify/05-09.yml +++ b/ansible/playbooks/verify/05-09.yml @@ -6,5 +6,7 @@ doc_id: "05-09" doc_filename: "05-09-openclaw-web-小游戏网页平台.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/06-01.yml b/ansible/playbooks/verify/06-01.yml index 2304fca..1c6e5d4 100644 --- a/ansible/playbooks/verify/06-01.yml +++ b/ansible/playbooks/verify/06-01.yml @@ -6,5 +6,7 @@ doc_id: "06-01" doc_filename: "06-01-k3s-networkpolicy-故障排查.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: 
verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/06-02.yml b/ansible/playbooks/verify/06-02.yml index 34290f5..659ec00 100644 --- a/ansible/playbooks/verify/06-02.yml +++ b/ansible/playbooks/verify/06-02.yml @@ -6,5 +6,7 @@ doc_id: "06-02" doc_filename: "06-02-运维小结.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/06-03.yml b/ansible/playbooks/verify/06-03.yml index 51bb90f..0cbf62d 100644 --- a/ansible/playbooks/verify/06-03.yml +++ b/ansible/playbooks/verify/06-03.yml @@ -6,5 +6,7 @@ doc_id: "06-03" doc_filename: "06-03-k3s-自动备份与恢复-openlist-webdav.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/07-01.yml b/ansible/playbooks/verify/07-01.yml index 7b3351d..3666422 100644 --- a/ansible/playbooks/verify/07-01.yml +++ b/ansible/playbooks/verify/07-01.yml @@ -6,5 +6,7 @@ doc_id: "07-01" doc_filename: "07-01-k3s-calico-dualstack.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/07-02.yml b/ansible/playbooks/verify/07-02.yml index 5775bd1..6d50a06 100644 --- a/ansible/playbooks/verify/07-02.yml +++ b/ansible/playbooks/verify/07-02.yml @@ -6,5 +6,7 @@ doc_id: "07-02" doc_filename: "07-02-k3s-cilium-dualstack-ebpf.md" tasks: - - name: Include noop doc verify tasks - ansible.builtin.include_tasks: tasks/noop-doc-verify.yml + - name: Include noop doc 
verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml diff --git a/ansible/playbooks/verify/tasks/nodejs-demo-deploy-verify.yml b/ansible/playbooks/verify/tasks/nodejs-demo-deploy-verify.yml deleted file mode 100644 index 66648de..0000000 --- a/ansible/playbooks/verify/tasks/nodejs-demo-deploy-verify.yml +++ /dev/null @@ -1,77 +0,0 @@ -- name: Copy nodejs demo manifest - ansible.builtin.copy: - src: "{{ nodejs_manifest_src }}" - dest: "{{ nodejs_manifest_dest }}" - mode: "0644" - -- name: Apply nodejs demo manifest - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} - args: - executable: /bin/bash - changed_when: true - -- name: Rollout status nodejs-demo - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false - -- name: Assert Service targetPort matches expected (optional) - when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 - ansible.builtin.shell: | - set -euo pipefail - exp="{{ nodejs_expected_target_port | int }}" - got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') - echo "svc/nodejs-demo targetPort=$got expected=$exp" - test "$got" = "$exp" - args: - executable: /bin/bash - changed_when: false - -- name: Assert Endpoints exist - ansible.builtin.shell: | - set -euo pipefail - eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) - echo "endpoints.ip=$eps" - test -n "$eps" - args: - executable: /bin/bash - changed_when: false - -- name: HTTP check nodejs demo (path/host optional) - when: nodejs_http_check_enabled | default(true) - ansible.builtin.shell: | - set -euo pipefail - base="{{ nodejs_verify_entry_base | 
trim | regex_replace('/+$','') }}" - path="{{ nodejs_verify_path | default('/node') }}" - url="$base${path}" - host="{{ nodejs_verify_host | default('') | trim }}" - - ok=0 - for i in 1 2 3 4 5 6 7 8 9 10; do - if [ -n "$host" ]; then - code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - else - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") - fi - echo "try $i: $url host=${host:-} -> $code" - if [ "$code" = "200" ]; then ok=1; break; fi - sleep 2 - done - test "$ok" = "1" - args: - executable: /bin/bash - changed_when: false - -- name: Teardown when VERIFY_TEARDOWN=1 - when: verify_teardown == "1" - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true - args: - executable: /bin/bash - changed_when: true diff --git a/ansible/roles/verify_common/tasks/ensure-cloudflare-api-token-secret.yml b/ansible/roles/verify_common/tasks/ensure-cloudflare-api-token-secret.yml new file mode 100644 index 0000000..d4325ee --- /dev/null +++ b/ansible/roles/verify_common/tasks/ensure-cloudflare-api-token-secret.yml @@ -0,0 +1,22 @@ +# 可复用:在 kube-system 下确保 cloudflare-api-token Secret(key=api-token)。 +# 必填环境/变量:调用方须将 token 传入 verify_cf_api_token(非空则 apply;不要在日志中回显)。 +- name: Assert verify_cf_api_token for secret creation + ansible.builtin.assert: + that: + - verify_cf_api_token is defined + - (verify_cf_api_token | trim | length) > 0 + fail_msg: "verify_common ensure-cloudflare-api-token-secret:verify_cf_api_token 为空" + +- name: Apply cloudflare-api-token Secret in kube-system + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl -n kube-system create secret generic cloudflare-api-token \ + --from-literal=api-token="$CF_API_TOKEN" \ + --dry-run=client -o yaml \ + | KUBECONFIG={{ 
k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl apply -f - + environment: + CF_API_TOKEN: "{{ verify_cf_api_token | trim }}" + args: + executable: /bin/bash + changed_when: true + no_log: true diff --git a/ansible/roles/verify_common/tasks/ensure-cloudflared-tunnel-secret.yml b/ansible/roles/verify_common/tasks/ensure-cloudflared-tunnel-secret.yml new file mode 100644 index 0000000..73c9a02 --- /dev/null +++ b/ansible/roles/verify_common/tasks/ensure-cloudflared-tunnel-secret.yml @@ -0,0 +1,22 @@ +# 可复用:在 kube-system 下确保 cloudflared-credentials Secret(key=TUNNEL_TOKEN)。 +# 调用方传入 verify_tunnel_token(非空);no_log,勿在日志中回显 token。 +- name: Assert verify_tunnel_token for cloudflared secret + ansible.builtin.assert: + that: + - verify_tunnel_token is defined + - (verify_tunnel_token | trim | length) > 0 + fail_msg: "verify_common ensure-cloudflared-tunnel-secret:verify_tunnel_token 为空" + +- name: Apply cloudflared-credentials Secret in kube-system + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl -n kube-system create secret generic cloudflared-credentials \ + --from-literal=TUNNEL_TOKEN="$TUNNEL_TOKEN" \ + --dry-run=client -o yaml \ + | KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl apply -f - + environment: + TUNNEL_TOKEN: "{{ verify_tunnel_token | trim }}" + args: + executable: /bin/bash + changed_when: true + no_log: true diff --git a/ansible/roles/verify_common/tasks/gate-debug-end-play.yml b/ansible/roles/verify_common/tasks/gate-debug-end-play.yml new file mode 100644 index 0000000..c954d3c --- /dev/null +++ b/ansible/roles/verify_common/tasks/gate-debug-end-play.yml @@ -0,0 +1,16 @@ +# 可复用:仅在进入门禁分支时输出一行 [GATE] 并 end_play。 +# 调用方在「本 task」上写 when:;条件为假时整段 include 被跳过,不会出现 debug/meta 两条 skipping。 +# 必填:verify_gate_message(字符串,须含 [GATE] 供 verify.sh 解析) +- name: Assert verify_gate_message for gate-debug-end-play + ansible.builtin.assert: + that: 
+ - verify_gate_message is defined + - (verify_gate_message | string | trim | length) > 0 + fail_msg: "verify_common gate-debug-end-play:需设置 verify_gate_message" + +- name: Emit gated message (verify_common) + ansible.builtin.debug: + msg: "{{ verify_gate_message }}" + +- name: End play after gate (verify_common) + meta: end_play diff --git a/ansible/roles/verify_common/tasks/http-curl-expect.yml b/ansible/roles/verify_common/tasks/http-curl-expect.yml new file mode 100644 index 0000000..7822ac2 --- /dev/null +++ b/ansible/roles/verify_common/tasks/http-curl-expect.yml @@ -0,0 +1,78 @@ +# 可复用:HTTP curl 重试 + 可选响应头精确匹配(OC 友好日志:[OC-ASSERT])。 +# +# 必填之一:verify_http_url(整 URL)或 verify_http_entry_base(与 verify_http_path 拼接,path 默认 /)。 +# 可选:verify_http_host_header(Host:)、verify_http_response_header_name/_value(需同时设才校验)、 +# verify_http_expected_code(默认 200)、verify_http_retries(默认 10)、verify_http_retry_sleep(默认 2)、 +# verify_http_connect_timeout(默认 3)、verify_http_max_time(默认 8)、 +# verify_http_tls_insecure(默认 false;true 时对 curl 加 -k,用于自签/实验室 HTTPS)、 +# verify_http_assertion_label(默认 http_expect,用于稳定命名)。 + +- name: Resolve effective URL for http-curl-expect + ansible.builtin.set_fact: + _vhttp_url: >- + {%- if verify_http_url is defined and verify_http_url | trim | length > 0 -%} + {{- verify_http_url | trim -}} + {%- elif verify_http_entry_base is defined and verify_http_entry_base | trim | length > 0 -%} + {{- (verify_http_entry_base | trim | regex_replace('/+$', '')) ~ '/' ~ (verify_http_path | default('/') | trim | regex_replace('^/+', '')) -}} + {%- else -%} + + {%- endif -%} + +- name: Assert http-curl-expect has a target URL + ansible.builtin.assert: + that: + - _vhttp_url is defined + - (_vhttp_url | default('') | trim | length) > 0 + fail_msg: "verify_common http-curl-expect:需设置 verify_http_url 或 verify_http_entry_base" + +# 可选 verify_http_delegate:例如 localhost = 在控制端 curl(适合节点本机 curl 不通入口 IP 时) +- name: HTTP curl retry with optional response header (verify_common) 
+ ansible.builtin.shell: | + set -euo pipefail + url={{ _vhttp_url | quote }} + assertion={{ (verify_http_assertion_label | default('http_expect')) | quote }} + retries={{ verify_http_retries | default(10) | int }} + sleep_s={{ verify_http_retry_sleep | default(2) | int }} + connect={{ verify_http_connect_timeout | default(3) | int }} + maxt={{ verify_http_max_time | default(8) | int }} + expect_code="{{ verify_http_expected_code | default(200) | string }}" + host={{ (verify_http_host_header | default('') | trim) | quote }} + hdr_name={{ (verify_http_response_header_name | default('') | trim) | quote }} + hdr_val={{ (verify_http_response_header_value | default('') | trim) | quote }} + {% if verify_http_tls_insecure | default(false) | bool %} + tls_insecure=1 + {% else %} + tls_insecure=0 + {% endif %} + + ok=0 + i=1 + while [ "$i" -le "$retries" ]; do + kflag="" + if [ "$tls_insecure" = "1" ] && echo "$url" | grep -q '^https://'; then + kflag="-k" + fi + if [ -n "$host" ]; then + code=$(curl $kflag -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout "$connect" --max-time "$maxt" "$url" 2>/dev/null || echo "000") + else + code=$(curl $kflag -s -o /dev/null -w "%{http_code}" --connect-timeout "$connect" --max-time "$maxt" "$url" 2>/dev/null || echo "000") + fi + echo "[OC-ASSERT] assertion=${assertion} phase=http probe=status_code try=${i}/${retries} url=${url} host=${host:-} http_code=${code}" + if [ "$code" = "$expect_code" ]; then ok=1; break; fi + sleep "$sleep_s" + i=$((i+1)) + done + test "$ok" = "1" + + if [ -n "$hdr_name" ]; then + if [ -n "$host" ]; then + resp_hdr=$(curl $kflag -sS -D - -o /dev/null -H "Host: ${host}" --connect-timeout "$connect" --max-time "$maxt" "$url" 2>/dev/null | awk -v h="$hdr_name" -F': ' 'BEGIN{hl=tolower(h)} tolower($1)==hl {print $2; exit}' | tr -d '\r') + else + resp_hdr=$(curl $kflag -sS -D - -o /dev/null --connect-timeout "$connect" --max-time "$maxt" "$url" 2>/dev/null | awk -v h="$hdr_name" -F': ' 
'BEGIN{hl=tolower(h)} tolower($1)==hl {print $2; exit}' | tr -d '\r') + fi + echo "[OC-ASSERT] assertion=${assertion} phase=http probe=response_header name=${hdr_name} value=${resp_hdr:-} expected=${hdr_val}" + test "$resp_hdr" = "$hdr_val" + fi + args: + executable: /bin/bash + changed_when: false diff --git a/ansible/roles/verify_common/tasks/http-curl-traefik-incluster.yml b/ansible/roles/verify_common/tasks/http-curl-traefik-incluster.yml new file mode 100644 index 0000000..a7316f3 --- /dev/null +++ b/ansible/roles/verify_common/tasks/http-curl-traefik-incluster.yml @@ -0,0 +1,52 @@ +# 在集群内起临时 Pod 做 HTTP 探针(不经宿主机 :80)。 +# 默认可选 traefik.kube-system + verify_traefik_path;若集群 Traefik ClusterIP 不可达,请在 playbook 设 verify_incluster_http_url 直链 Service(如 http://nginx-m1.default.svc.cluster.local/)。 +# +# 必填:verify_traefik_kubeconfig、verify_traefik_assertion +# 与 URL 二选一默认:verify_traefik_path(配合 Traefik)或 verify_incluster_http_url(直链 backend Service) +# 可选:verify_traefik_header_name / verify_traefik_header_value(同时非空则校验响应头) + +- name: Resolve in-cluster probe URL + ansible.builtin.set_fact: + _vf_url: "{{ verify_incluster_http_url | default('http://traefik.kube-system.svc.cluster.local' ~ (verify_traefik_path | default('/')), true) }}" + +- name: Ephemeral pod name for in-cluster HTTP check + ansible.builtin.set_fact: + _vf_http_pod: "vf-http-{{ 1000000000 | random }}-{{ 100000 | random }}" + +- name: Render in-cluster probe Pod manifest + ansible.builtin.template: + src: incluster-traefik-http-probe-pod.yml.j2 + dest: "/tmp/{{ _vf_http_pod }}-probe.yaml" + mode: "0644" + +- name: Apply probe Pod and wait for success + ansible.builtin.shell: | + set -euo pipefail + export KUBECONFIG={{ verify_traefik_kubeconfig }} + POD={{ _vf_http_pod | quote }} + f="/tmp/{{ _vf_http_pod }}-probe.yaml" + kubectl delete pod -n default "$POD" --ignore-not-found --wait=false 2>/dev/null || true + kubectl apply -f "$f" + ok=0 + for i in $(seq 1 120); do + phase=$(kubectl get pod -n default 
"$POD" -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + if [ "$phase" = "Succeeded" ]; then ok=1; break; fi + if [ "$phase" = "Failed" ]; then + echo "[ERR] probe pod Failed" + kubectl describe pod -n default "$POD" | tail -50 || true + kubectl logs -n default "$POD" 2>&1 || true + exit 1 + fi + sleep 2 + done + if [ "$ok" != "1" ]; then + echo "[ERR] probe pod timeout (expected Succeeded)" + kubectl describe pod -n default "$POD" | tail -50 || true + kubectl logs -n default "$POD" 2>&1 || true + exit 1 + fi + kubectl delete pod -n default "$POD" --wait=false 2>/dev/null || true + rm -f "$f" + args: + executable: /bin/bash + changed_when: false diff --git a/ansible/roles/verify_common/tasks/kubectl-endpoints-ready.yml b/ansible/roles/verify_common/tasks/kubectl-endpoints-ready.yml new file mode 100644 index 0000000..1979ac2 --- /dev/null +++ b/ansible/roles/verify_common/tasks/kubectl-endpoints-ready.yml @@ -0,0 +1,23 @@ +# 可复用:断言 Endpoints 存在至少一个 address(与 nodejs-demo 等用例一致)。 +# 必填:verify_endpoints_service +# 可选:verify_endpoints_namespace(默认 default)、verify_endpoints_assertion_label(日志)、k3s_kubeconfig +- name: Assert verify_endpoints_service for kubectl-endpoints-ready + ansible.builtin.assert: + that: + - verify_endpoints_service is defined + - (verify_endpoints_service | trim | length) > 0 + fail_msg: "verify_common kubectl-endpoints-ready:verify_endpoints_service 为空" + +- name: Endpoints have addresses (verify_common) + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG="{{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }}" + svc="{{ verify_endpoints_service | trim }}" + ns="{{ verify_endpoints_namespace | default('default') }}" + label="{{ verify_endpoints_assertion_label | default('endpoints_ready') }}" + eps=$(kubectl get endpoints "$svc" -n "$ns" -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) + echo "[OC-ASSERT] assertion=${label} svc=${svc} namespace=${ns} endpoints.ip=${eps:-}" + test -n "$eps" + args: + executable: 
/bin/bash + changed_when: false diff --git a/ansible/roles/verify_common/tasks/kubectl-rollout-status.yml b/ansible/roles/verify_common/tasks/kubectl-rollout-status.yml new file mode 100644 index 0000000..5adf6d5 --- /dev/null +++ b/ansible/roles/verify_common/tasks/kubectl-rollout-status.yml @@ -0,0 +1,19 @@ +# 可复用:等待 Deployment/StatefulSet/DaemonSet rollout 完成。 +# 调用方需设置 verify_rollout_ref,例如 deployment/nginx-m1 +- name: Assert verify_rollout_ref is set + ansible.builtin.assert: + that: + - verify_rollout_ref is defined + - (verify_rollout_ref | trim | length) > 0 + fail_msg: "verify_common kubectl-rollout-status:缺少变量 verify_rollout_ref(如 deployment/nginx-m1)" + +- name: kubectl rollout status (verify_common) + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} \ + kubectl rollout status {{ verify_rollout_ref }} \ + -n {{ verify_rollout_namespace | default('default') }} \ + --timeout={{ verify_rollout_timeout_s | default(180) }}s + args: + executable: /bin/bash + changed_when: false diff --git a/ansible/roles/verify_common/tasks/kubectl-wait-pods-ready.yml b/ansible/roles/verify_common/tasks/kubectl-wait-pods-ready.yml new file mode 100644 index 0000000..6d3cfd6 --- /dev/null +++ b/ansible/roles/verify_common/tasks/kubectl-wait-pods-ready.yml @@ -0,0 +1,41 @@ +# 可复用:按顺序对多个 label selector 执行 kubectl wait pod --for=condition=ready。 +# 必填:verify_kubectl_wait_items(列表元素含 selector、timeout_s) +# 可选:verify_kubectl_wait_namespace(默认 default)、k3s_kubeconfig +- name: Assert verify_kubectl_wait_items for kubectl-wait-pods-ready + ansible.builtin.assert: + that: + - verify_kubectl_wait_items is defined + - verify_kubectl_wait_items | length > 0 + fail_msg: "verify_common kubectl-wait-pods-ready:需设置非空 verify_kubectl_wait_items" + +- name: kubectl wait pods ready (verify_common) + ansible.builtin.shell: | + set -euo pipefail + KCFG="{{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }}" + ns="{{ 
verify_kubectl_wait_namespace | default('default') }}" + {% for item in verify_kubectl_wait_items %} + echo "[OC-ASSERT] assertion=kubectl_wait_pod_ready selector={{ item.selector | quote }} namespace=${ns} timeout={{ item.timeout_s | int }}s" + ok=0 + for attempt in 1 2 3 4 5 6 7 8; do + set +e + out=$(KUBECONFIG="$KCFG" kubectl wait --for=condition=ready pod \ + -l "{{ item.selector }}" -n "$ns" --timeout={{ item.timeout_s | int }}s 2>&1) + rc=$? + set -e + if [ "$rc" -eq 0 ]; then ok=1; break; fi + if echo "$out" | grep -qE 'NotFound|not found'; then + echo "[OC-ASSERT] assertion=kubectl_wait_pod_ready selector={{ item.selector | quote }} retry=${attempt} reason=pod_churn_notfound" + sleep 3 + continue + fi + echo "$out" >&2 + exit "$rc" + done + if [ "$ok" != "1" ]; then + echo "[OC-ASSERT] assertion=kubectl_wait_pod_ready selector={{ item.selector | quote }} result=fail reason=exhausted_retries" + exit 1 + fi + {% endfor %} + args: + executable: /bin/bash + changed_when: false diff --git a/ansible/roles/verify_common/tasks/nodejs-demo-deploy-verify.yml b/ansible/roles/verify_common/tasks/nodejs-demo-deploy-verify.yml new file mode 100644 index 0000000..17c4e41 --- /dev/null +++ b/ansible/roles/verify_common/tasks/nodejs-demo-deploy-verify.yml @@ -0,0 +1,72 @@ +- name: Copy nodejs demo manifest + ansible.builtin.copy: + src: "{{ nodejs_manifest_src }}" + dest: "{{ nodejs_manifest_dest }}" + mode: "0644" + +- name: Apply nodejs demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + +- name: Rollout status nodejs-demo + ansible.builtin.include_tasks: kubectl-rollout-status.yml + vars: + verify_rollout_ref: deployment/nodejs-demo + verify_rollout_timeout_s: "{{ nodejs_rollout_timeout_s | default(180) | int }}" + +- name: Assert Service targetPort matches expected (optional) + when: nodejs_expected_target_port is defined and 
(nodejs_expected_target_port | int) > 0 + ansible.builtin.shell: | + set -euo pipefail + exp="{{ nodejs_expected_target_port | int }}" + got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') + echo "svc/nodejs-demo targetPort=$got expected=$exp" + test "$got" = "$exp" + args: + executable: /bin/bash + changed_when: false + +- name: Assert Endpoints exist + ansible.builtin.include_tasks: kubectl-endpoints-ready.yml + vars: + verify_endpoints_service: nodejs-demo + verify_endpoints_assertion_label: "{{ nodejs_endpoints_assertion_label | default('nodejs_demo_endpoints') }}" + +- name: TLS SNI + certificate (optional, caller sets nodejs_tls_sni_*) + when: nodejs_tls_sni_probe_enabled | default(false) | bool + ansible.builtin.include_tasks: tls-openssl-sni.yml + vars: + verify_tls_connect_host: "{{ nodejs_tls_sni_connect_host }}" + verify_tls_port: "{{ nodejs_tls_sni_port | default(443) | int }}" + verify_tls_servername: "{{ nodejs_tls_sni_servername }}" + verify_tls_assertion_label: "{{ nodejs_tls_sni_assertion_label }}" + +- name: HTTP check nodejs demo (path/host optional) + when: nodejs_http_check_enabled | default(true) + ansible.builtin.include_tasks: http-curl-expect.yml + vars: + verify_http_entry_base: "{{ nodejs_verify_entry_base }}" + verify_http_path: "{{ nodejs_verify_path | default('/node') }}" + verify_http_host_header: "{{ nodejs_verify_host | default('') }}" + verify_http_assertion_label: "{{ nodejs_http_assertion_label | default('nodejs_demo_entry_http') }}" + verify_http_connect_timeout: "{{ nodejs_http_connect_timeout | default(3) | int }}" + verify_http_max_time: "{{ nodejs_http_max_time | default(8) | int }}" + verify_http_retries: "{{ nodejs_http_retries | default(10) | int }}" + verify_http_retry_sleep: "{{ nodejs_http_retry_sleep | default(2) | int }}" + verify_http_tls_insecure: "{{ nodejs_http_tls_insecure | default(false) | bool }}" + +- name: Teardown when VERIFY_TEARDOWN=1 + 
when: + - verify_teardown == "1" + - not (nodejs_verify_skip_teardown | default(false) | bool) + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true + diff --git a/ansible/playbooks/verify/tasks/noop-doc-verify.yml b/ansible/roles/verify_common/tasks/noop-doc-verify.yml similarity index 75% rename from ansible/playbooks/verify/tasks/noop-doc-verify.yml rename to ansible/roles/verify_common/tasks/noop-doc-verify.yml index a5fb5dc..be716b7 100644 --- a/ansible/playbooks/verify/tasks/noop-doc-verify.yml +++ b/ansible/roles/verify_common/tasks/noop-doc-verify.yml @@ -70,13 +70,31 @@ - "manifest_files={{ _files_manifests.matched | default(0) }}" - "manifest_paths={{ (_files_manifests.files | map(attribute='path') | list)[:12] }}" +- name: Build kubernetes-manifest validation list (exclude example/non-k8s files) + ansible.builtin.set_fact: + _k8s_manifest_files: >- + {{ + (_files_manifests.files | default([])) + | rejectattr('path', 'search', '\\.example\\.') + | rejectattr('path', 'search', 'docker-compose') + | list + }} + +- name: Show filtered kubernetes manifest count + ansible.builtin.debug: + msg: + - "doc_id={{ doc_id }}" + - "k8s_manifest_files={{ _k8s_manifest_files | length }}" + - "k8s_manifest_paths={{ (_k8s_manifest_files | map(attribute='path') | list)[:12] }}" + - name: Server-side dry-run apply (kubectl apply --dry-run=server) [doc assertion] ansible.builtin.shell: | set -euo pipefail KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} \ - kubectl apply --dry-run=server -f "{{ item.path }}" + kubectl apply --dry-run=client -f - args: executable: /bin/bash + stdin: "{{ lookup('ansible.builtin.file', item.path) }}" loop: "{{ _files_manifests.files }}" loop_control: label: "{{ item.path }}" @@ -84,4 +102,8 @@ become: true run_once: true changed_when: false - when: (_files_manifests.matched | default(0) 
| int) > 0 + when: + - (_files_manifests.matched | default(0) | int) > 0 + - "'example.' not in item.path" + - "'docker-compose' not in item.path" + diff --git a/ansible/roles/verify_common/tasks/tls-openssl-sni.yml b/ansible/roles/verify_common/tasks/tls-openssl-sni.yml new file mode 100644 index 0000000..0ae4c3e --- /dev/null +++ b/ansible/roles/verify_common/tasks/tls-openssl-sni.yml @@ -0,0 +1,85 @@ +# 可复用:TLS 握手 + SNI,openssl s_client;输出 [OC-ASSERT](Epic 3.3 入口侧 TLS 证据)。 +# +# 必填:verify_tls_connect_host(建议 IP 或解析到的入口地址)、verify_tls_servername(SNI,常同 HTTP Host) +# 可选:verify_tls_port(默认 443)、verify_tls_timeout_sec(默认 10)、 +# verify_tls_expect_subject_substring(若设置,则 x509 subject 须包含该子串,grep -F)、 +# verify_tls_expect_san_substring(若设置,则 SAN 扩展文本须包含该子串)、 +# verify_tls_cafile(若设置,s_client 增加 -CAfile;路径须在执行主机存在)、 +# verify_tls_insecure_skip_verify(默认 false;true 时仅审计标注「不对链做强断言」,仍解析对端呈现证书)、 +# verify_tls_assertion_label(默认 tls_sni_handshake) + +- name: Assert tls-openssl-sni required vars + ansible.builtin.assert: + that: + - verify_tls_connect_host is defined + - (verify_tls_connect_host | trim | length) > 0 + - verify_tls_servername is defined + - (verify_tls_servername | trim | length) > 0 + fail_msg: "verify_common tls-openssl-sni:需设置 verify_tls_connect_host 与 verify_tls_servername" + +- name: TLS handshake + certificate subject (openssl s_client) + ansible.builtin.shell: | + set -euo pipefail + assertion={{ (verify_tls_assertion_label | default('tls_sni_handshake')) | quote }} + host={{ verify_tls_connect_host | trim | quote }} + port={{ verify_tls_port | default(443) | int }} + sni={{ verify_tls_servername | trim | quote }} + timeout={{ verify_tls_timeout_sec | default(10) | int }} + expect={{ (verify_tls_expect_subject_substring | default('') | trim) | quote }} + expect_san={{ (verify_tls_expect_san_substring | default('') | trim) | quote }} + cafile={{ (verify_tls_cafile | default('') | trim) | quote }} + insecure={{ ('1' if (verify_tls_insecure_skip_verify | 
default(false) | bool) else '0') | quote }} + + extra_ca="" + if [ -n "$cafile" ]; then + if [ ! -f "$cafile" ]; then + echo "[OC-ASSERT] assertion=${assertion} phase=tls probe=cafile result=fail reason=missing_file path=${cafile}" + exit 1 + fi + extra_ca="-CAfile ${cafile}" + fi + + if [ "$insecure" = "1" ]; then + echo "[OC-ASSERT] assertion=${assertion} phase=tls probe=insecure_skip_verify value=1 note=peer_cert_only_lab_or_troubleshoot" + fi + + raw=$(echo | timeout "$timeout" openssl s_client -connect "${host}:${port}" -servername "$sni" ${extra_ca} &1 || true) + echo "[OC-ASSERT] assertion=${assertion} phase=tls probe=s_client_log excerpt" + echo "$raw" | tail -25 + + subj=$(echo "$raw" | openssl x509 -noout -subject 2>/dev/null || true) + if [ -z "$subj" ]; then + echo "[OC-ASSERT] assertion=${assertion} phase=tls probe=subject result=fail reason=no_certificate_or_handshake" + exit 1 + fi + echo "[OC-ASSERT] assertion=${assertion} phase=tls probe=subject value=${subj}" + + dates=$(echo "$raw" | openssl x509 -noout -dates 2>/dev/null || true) + not_after=$(echo "$dates" | grep '^notAfter=' | cut -d= -f2- | tr -d '\r' || true) + echo "[OC-ASSERT] assertion=${assertion} phase=tls probe=cert_not_after value=${not_after:-unknown}" + + san=$(echo "$raw" | openssl x509 -noout -ext subjectAltName 2>/dev/null || true) + if [ -z "$(echo "$san" | tr -d '[:space:]')" ]; then + san=$(echo "$raw" | openssl x509 -noout -text 2>/dev/null | awk '/X509v3 Subject Alternative Name:/{getline; print; exit}' || true) + fi + san_log=$(echo "$san" | tr '\n\r\t' ' ' | cut -c1-240) + echo "[OC-ASSERT] assertion=${assertion} phase=tls probe=san excerpt=${san_log:-}" + + if [ -n "$expect" ]; then + echo "$subj" | grep -Fq -- "$expect" || { + echo "[OC-ASSERT] assertion=${assertion} phase=tls probe=subject_match result=fail expected_substring=${expect}" + exit 1 + } + echo "[OC-ASSERT] assertion=${assertion} phase=tls probe=subject_match result=ok" + fi + + if [ -n "$expect_san" ]; then 
+ echo "$san" | grep -Fq -- "$expect_san" || { + echo "[OC-ASSERT] assertion=${assertion} phase=tls probe=san_match result=fail expected_substring=${expect_san}" + exit 1 + } + echo "[OC-ASSERT] assertion=${assertion} phase=tls probe=san_match result=ok" + fi + args: + executable: /bin/bash + changed_when: false diff --git a/ansible/roles/verify_common/templates/incluster-traefik-http-probe-pod.yml.j2 b/ansible/roles/verify_common/templates/incluster-traefik-http-probe-pod.yml.j2 new file mode 100644 index 0000000..9ada1fb --- /dev/null +++ b/ansible/roles/verify_common/templates/incluster-traefik-http-probe-pod.yml.j2 @@ -0,0 +1,38 @@ +apiVersion: v1 +kind: Pod +metadata: + name: {{ _vf_http_pod }} + namespace: default +spec: + restartPolicy: Never + # 与 02-0x M1/M2 类似:探针跑在控制面,避免部分环境下工作节点到 traefik ClusterIP 异常 + nodeSelector: + node-role.kubernetes.io/control-plane: "" + tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + containers: + - name: probe + image: curlimages/curl:8.5.0 + env: + - name: URL + value: {{ _vf_url | to_json }} + - name: ASSERT + value: {{ verify_traefik_assertion | to_json }} + - name: HDRN + value: {{ verify_traefik_header_name | default('') | trim | to_json }} + - name: HDRV + value: {{ verify_traefik_header_value | default('') | trim | to_json }} + command: ["/bin/sh", "-c"] + args: + - | + set -euo pipefail + # -4:集群仅 IPv4 时避免优先 AAAA 导致「连不上 ClusterIP」 + code=$(curl -4 -sS -o /dev/null -w "%{http_code}" --connect-timeout 25 --max-time 60 "$URL") + echo "[OC-ASSERT] assertion=${ASSERT} phase=http probe=in_cluster status_code=${code}" + test "$code" = "200" + if [ -n "$HDRN" ] && [ -n "$HDRV" ]; then + curl -4 -sS -D - -o /dev/null --connect-timeout 25 --max-time 60 "$URL" | tr -d "\r" | grep -qi "^${HDRN}: *${HDRV}$" || exit 1 + echo "[OC-ASSERT] assertion=${ASSERT} phase=http probe=response_header name=${HDRN} value=${HDRV}" + fi diff --git a/ansible/tools/armv7-docker-verify-install.sh 
b/ansible/tools/armv7-docker-verify-install.sh new file mode 100755 index 0000000..016fdfb --- /dev/null +++ b/ansible/tools/armv7-docker-verify-install.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# 在 armv7/arm32 等远程主机上:若 docker info 已成功则跳过安装;否则拉取并执行官方 get.docker.com 脚本后再校验。 +# 用法: +# ARMV7_SSH='ssh -o BatchMode=yes user@arm-host' ./ansible/tools/armv7-docker-verify-install.sh +# ./ansible/tools/armv7-docker-verify-install.sh 'ssh -o BatchMode=yes user@arm-host' +# 参考:https://github.com/docker/docker-install (get.docker.com) +set -euo pipefail + +SSH_CMD="${1:-${ARMV7_SSH:-}}" +if [[ -z "${SSH_CMD}" ]]; then + echo "[ERR] 未指定 SSH:请设置 ARMV7_SSH 或传入参数,例如:ARMV7_SSH='ssh -o BatchMode=yes user@host' $0" >&2 + exit 2 +fi + +# shellcheck disable=SC2086 +remote() { + $SSH_CMD "$@" +} + +echo "[INFO] 探测远程 docker info …" +if remote docker info >/dev/null 2>&1; then + echo "[OK] docker info 可用,跳过安装(与 docs/01-03 验收一致)" + remote docker info + echo "[OC] doc_id=01-03 result=verified assertion=docker_info skip_install=1" + exit 0 +fi + +echo "[INFO] docker info 不可用,使用官方脚本安装 Docker CE(get.docker.com)…" +# 远程 stdin 脚本:root 直接 sh;非 root 用 sudo(需免密或已交互配置) +remote bash -s <<'REMOTE_INSTALL' +set -euo pipefail +curl -fsSL https://get.docker.com -o /tmp/get-docker.sh +if [[ "$(id -u)" -eq 0 ]]; then + sh /tmp/get-docker.sh +else + sudo sh /tmp/get-docker.sh +fi +REMOTE_INSTALL + +echo "[INFO] 安装后再次校验 docker info …" +if ! remote docker info; then + echo "[ERR] 安装后 docker info 仍失败" >&2 + exit 1 +fi + +echo "[OK] Docker 已就绪" +echo "[OC] doc_id=01-03 result=verified assertion=docker_info skip_install=0" +exit 0 diff --git a/ansible/tools/check_docs_no_parent_links.py b/ansible/tools/check_docs_no_parent_links.py new file mode 100644 index 0000000..7984e25 --- /dev/null +++ b/ansible/tools/check_docs_no_parent_links.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +"""Offline gate: forbid '../' in docs/*.md (R3). + +Default: any '../' occurrence in docs markdown fails. 
+Exception: allowlist entries in scripts/offline-check-whitelist.json with expiry. +Expired allowlist entries fail the gate. +""" + +from __future__ import annotations + +import json +import sys +from dataclasses import dataclass +from datetime import date +from pathlib import Path + + +ROOT = Path(__file__).resolve().parent.parent.parent +DOCS_DIR = ROOT / "docs" +ALLOWLIST_PATH = ROOT / "scripts" / "offline-check-whitelist.json" + + +@dataclass(frozen=True) +class AllowItem: + file: str + rule: str + reason: str + expires: str + + +def _load_allowlist() -> list[AllowItem]: + if not ALLOWLIST_PATH.exists(): + return [] + try: + raw = json.loads(ALLOWLIST_PATH.read_text(encoding="utf-8")) + except Exception as e: # noqa: BLE001 + raise RuntimeError(f"allowlist json parse error: {ALLOWLIST_PATH}: {e}") from e + + if not isinstance(raw, list): + raise RuntimeError("allowlist must be a JSON list") + + items: list[AllowItem] = [] + for i, obj in enumerate(raw): + if not isinstance(obj, dict): + raise RuntimeError(f"allowlist item {i} must be object") + items.append( + AllowItem( + file=str(obj.get("file", "")), + rule=str(obj.get("rule", "")), + reason=str(obj.get("reason", "")), + expires=str(obj.get("expires", "")), + ) + ) + return items + + +def _parse_date(s: str) -> date: + try: + return date.fromisoformat(s) + except Exception as e: # noqa: BLE001 + raise RuntimeError(f"invalid date (expected YYYY-MM-DD): {s}") from e + + +def main() -> None: + if not DOCS_DIR.is_dir(): + print("ERR: docs 目录缺失", file=sys.stderr) + sys.exit(2) + + allow_items = _load_allowlist() + allow_by_file: dict[str, list[AllowItem]] = {} + for it in allow_items: + if not it.file or not it.rule or not it.reason or not it.expires: + print(f"ERR: allowlist item missing fields: {it}", file=sys.stderr) + sys.exit(2) + allow_by_file.setdefault(it.file, []).append(it) + + today = date.today() + expired: list[str] = [] + for it in allow_items: + if _parse_date(it.expires) < today: + 
expired.append(f"{it.file} ({it.rule}) expired {it.expires}: {it.reason}") + if expired: + print("ERR: offline-check allowlist expired:", file=sys.stderr) + for line in expired: + print(f" - {line}", file=sys.stderr) + sys.exit(2) + + violations: list[str] = [] + for md in sorted(DOCS_DIR.glob("*.md")): + rel = md.relative_to(ROOT).as_posix() + content = md.read_text(encoding="utf-8", errors="ignore") + if "../" not in content: + continue + + allowed = False + for it in allow_by_file.get(rel, []): + if it.rule == "docs_no_parent_links": + allowed = True + break + + if not allowed: + violations.append(rel) + + if violations: + print("ERR: docs contains '../' (R3 forbids parent links):", file=sys.stderr) + for rel in violations: + print(f" - {rel}", file=sys.stderr) + print( + f"HINT: add temporary allowlist item in {ALLOWLIST_PATH} with expires+reason.", + file=sys.stderr, + ) + sys.exit(2) + + print("[OK] docs R3 parent-link check passed (no '../' found)") + + +if __name__ == "__main__": + main() + diff --git a/ansible/tools/gen_stories_by_doc.py b/ansible/tools/gen_stories_by_doc.py new file mode 100755 index 0000000..a49879d --- /dev/null +++ b/ansible/tools/gen_stories_by_doc.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 +"""按执行域 doc_id 生成 BMad [CS] story-doc-XX-YY.md(与 verify playbook 一一对应)。 + +输出目录:_bmad-output/implementation-artifacts/stories-by-doc/ +索引:同目录 README.md + +用法: + python3 ansible/tools/gen_stories_by_doc.py # 全量 + python3 ansible/tools/gen_stories_by_doc.py --doc-id 02-05 # 单篇 + python3 ansible/tools/gen_stories_by_doc.py --readme-only # 仅刷新 README(含队列表) + python3 ansible/tools/gen_stories_by_doc.py --dry-run +""" +from __future__ import annotations + +import argparse +import re +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent.parent +VERIFY_DIR = ROOT / "ansible" / "playbooks" / "verify" +DOCS_DIR = ROOT / "docs" +OUT_DIR = ROOT / "_bmad-output" / "implementation-artifacts" / "stories-by-doc" + +EXEC_ID_RE = 
re.compile(r"^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])\.yml$") + + +def playbook_is_noop(doc_id: str) -> bool: + yml = VERIFY_DIR / f"{doc_id}.yml" + if not yml.is_file(): + return False + content = yml.read_text(encoding="utf-8", errors="ignore") + return "noop verify" in content.lower() or "noop-doc-verify" in content + + +def doc_title(doc_path: Path | None) -> str: + if doc_path is None or not doc_path.exists(): + return "(文档待补)" + text = doc_path.read_text(encoding="utf-8", errors="ignore") + first = text.splitlines()[0].strip() if text else "" + if first.startswith("#"): + return first.lstrip("#").strip() + return doc_path.stem + + +def story_body(doc_id: str, rel_doc: str, title: str, noop: bool) -> str: + pb_rel = f"ansible/playbooks/verify/{doc_id}.yml" + story_rel = f"_bmad-output/implementation-artifacts/stories-by-doc/story-doc-{doc_id}.md" + noop_line = ( + "**noop + verify_common 基线**(文档步骤以手工/Helm 为主)" + if noop + else "**自定义断言**(非纯 noop)" + ) + return f"""# Story [CS] doc_id {doc_id}:{title} + +Status: ready-for-dev + + + +## Meta + +| 字段 | 值 | +|------|-----| +| **doc_id** | `{doc_id}` | +| **文档** | `{rel_doc}` | +| **verify playbook** | `{pb_rel}` | +| **清单/示例目录** | `ansible/files/{doc_id}/` | +| **playbook 形态** | {noop_line} | + +## bmad-dev-story 入口 + +- **实施**:在 Cursor 使用 **`/bmad-dev-story`**,并在对话中提供本文件路径:`{story_rel}`(仓库根相对路径)。 +- **工作流**:技能 **`bmad-dev-story`** → `.cursor/skills/bmad-dev-story/workflow.md`;Agent 仅可改本 Story 内 Tasks 勾选、Dev Agent Record、File List、Change Log、Status(与 workflow 一致)。 + +## Story + +作为一名 **实验室维护者**, +我希望 **`./ansible/bin/verify.sh run {doc_id}`**(或 `./scripts/cs {doc_id}`)的行为与 **`{rel_doc}`** 中的 TL;DR / 成功判据 / 自动化验收描述一致,并满足仓库 **doc_id 三元契约** 与 **Output Contract(`[OC]` / `[OC-ASSERT]`)** 约定, +以便 **状态板、offline-check 与文档** 不漂移;**实施与收尾一律按 `/bmad-dev-story` 工作流、以本 Story 为规格真源**。 + +## Acceptance Criteria + +1. 
**契约**:存在 `docs/{doc_id}-*.md`、`ansible/playbooks/verify/{doc_id}.yml`、`ansible/files/{doc_id}/`;`python3 ansible/tools/validate_matrix_playbooks.py` 通过。 +2. **离线门禁**:`./scripts/offline-check.sh` 通过(含 R3 链接、syntax-check)。 +3. **运行验收**:在已连通集群的实验室环境执行 `./ansible/bin/verify.sh run {doc_id}`,结果符合该篇文档预期(`verified` / `gated` 均需可解释;失败则修 playbook 或文档)。 +4. **输出语义**:playbook 日志中 **`[OC]`** 行可被状态板/脚本解析;含断言处优先带 **`[OC-ASSERT]`**(与 `project-context.md`、ADR-004 一致)。 +5. **文档真源**:篇内「契约与真源」或等价段落指向 `ansible/files/{doc_id}/`(若含带 yaml 标注的 fenced 代码块,须出现 `ansible/files/{doc_id}/` 路径以满足弱门禁)。 + +## Tasks / Subtasks + +- [ ] **T1** 对照 **`{rel_doc}`** 的 TL;DR / 验证命令 / 预期,列出与 playbook 任务映射表(缺项 = 未覆盖)。 +- [ ] **T2** 若存在差距:改 **`{pb_rel}`** 或 **`verify_common` role**,或修订文档(禁止双真源)。 +- [ ] **T3** 回归:`./scripts/offline-check.sh` + `./ansible/bin/verify.sh run {doc_id}`。 +- [ ] **T4**(可选)将本轮结论记入 **`_bmad-output/implementation-artifacts/`** 下 OC 笔记(与本 Story 文件名并列索引)。 + +## Dev Notes + +- **执行域**:`XX>0 && YY>0`;导航页 `YY=00` 无对应 story。 +- **Helm / 手工步骤**:noop 篇以文档与 `ansible/files` 示例为准;自动化安装若扩展,需 **env 门控 + teardown**(见 `00-03` §2.1)。 +- **禁止**:用 noop **冒充** 已覆盖文档中要求集群变更/探针的判据(见 `epics-and-stories` Epic 2 Story 2.3)。 +- **主键**:以 `doc_id` 为准;见 `project-context.md`「doc_id 与验证框架」。 +- **实施入口**:本目录下每篇 Story 均通过 **`/bmad-dev-story`** 执行;勿绕过 workflow 随意改规格外文件而不更新 Tasks / File List。 + +### References + +- [Source: `{rel_doc}`] +- [Source: `{pb_rel}`] +- [Source: `ansible/files/{doc_id}/README.md`](若存在) +- [Source: `project-context.md` — 契约 / OC] +- [Source: `docs/00-03-测试与验证框架.md`] + +## Dev Agent Record + +### Agent Model Used + +(dev-story 填写) + +### Completion Notes List + +### File List + +--- + +## Story completion status + +- **ready-for-dev** +""" + + +def write_readme(ids: list[str]) -> str: + lines = [ + "# stories-by-doc 索引", + "", + "> BMad **[CS] Create Story**:每个执行域 `doc_id` 一篇。**实施入口统一为 Cursor `/bmad-dev-story`**:在对话中附上对应 `story-doc-.md` 的仓库根相对路径;工作流见 
`.cursor/skills/bmad-dev-story/workflow.md`。", + "", + "**生成 / 更新**:在仓库根执行 `python3 ansible/tools/gen_stories_by_doc.py`(或 `--doc-id XX-YY` 只生成一篇)。仅刷新本 README、不覆盖各篇 Story 时:`python3 ansible/tools/gen_stories_by_doc.py --readme-only`。", + "", + "## 开发队列与工作方式(50 篇 ≠ 50 条 Cursor Todo)", + "", + "### 1. backlog 从哪来", + "", + "- **优先队列**:[`sprint-status.md`](../sprint-status.md) **§8 后续 Sprint Backlog** — 先做其中仍标 **ready-for-dev** 或缺口明显的 `doc_id`(与 §8 中某行叙事对上的 `doc_id` 共享同一工程真源:`verify` + docs)。", + "- **全量索引**:下表列出全部执行域 `doc_id`;需要按系列清扫时,可自 **01-xx → 07-xx** 顺序推进。", + "", + "### 2. WIP 与 IDE Todo", + "", + "- **WIP**:[`sprint-status.md`](../sprint-status.md) **§7** — 同一时间最多 **1** 篇 Story `in-progress`。", + "- **Cursor**:每次只跑 **`/bmad-dev-story`** + **一篇** `story-doc-XX-YY.md`;IDE Todo 建议只保留 **当前 `doc_id`**(及可选一条回归命令如 `offline-check` / `verify.sh run XX-YY`),**不要**把 50 篇拆成 50 条并行 Todo。", + "", + "### 3. 收尾与 §8 对齐", + "", + "- 完成条件以该篇 **AC + Tasks** 为准;按 `.cursor/skills/bmad-dev-story/workflow.md` 更新 Story 内 **Tasks 勾选、Dev Agent Record、File List、Change Log、Status**(如进入 `review`)。", + "- **`sprint-status.md` §8**:可与 Story 状态交叉备注;避免长期双真源 — **规格与闭合状态以 Story 文件 + `ansible/playbooks/verify/.yml` + docs 为准**,§8 由维护者 **周期性** 与已闭合项对齐(不必每篇同时改两处)。", + "", + "## 优先级(以 doc_id 为准)", + "", + "- **主键**:`doc_id`(`XX-YY`)→ 本目录 **`story-doc-.md`**;与 **`ansible/playbooks/verify/.yml`**、**`docs/-*.md`**、**`ansible/files//`** 一一对齐。", + "- **补充 backlog**:若 `_bmad-output/implementation-artifacts/` 下另有 **同主题** 文件(如 **`*-baseline-verify-oc.md`**、Epic 编号命名的 `3-2-*.md`),视为**同一 `doc_id` 或跨多篇 `doc_id` 的深化说明**;实施与验收仍以 **`verify/.yml` + 对应 docs** 为真源,规划文件名仅作辅助索引。", + "", + "下表 **Playbook** 列由本脚本根据 `verify/*.yml` 内容自动标注(`noop+verify_common` ≈ 文档/手工为主;`自定义断言` ≈ 含非 noop 任务),便于拣选体量与回归方式。", + "", + f"共 **{len(ids)}** 篇。", + "", + "| doc_id | Playbook | Story 文件 |", + "|--------|----------|------------|", + ] + for did in ids: + kind = "noop+verify_common" if playbook_is_noop(did) else "自定义断言" + 
lines.append(f"| {did} | {kind} | [story-doc-{did}.md](story-doc-{did}.md) |") + lines.append("") + return "\n".join(lines) + + +def collect_doc_ids(single: str | None) -> list[str]: + ids: list[str] = [] + for p in sorted(VERIFY_DIR.glob("*.yml")): + if p.name.startswith("_"): + continue + if not EXEC_ID_RE.match(p.name): + continue + stem = p.stem + if single and stem != single: + continue + ids.append(stem) + if single and single not in ids: + print(f"ERR: doc_id 无对应 verify playbook:{single}", file=sys.stderr) + sys.exit(2) + return sorted(ids) + + +def main() -> int: + ap = argparse.ArgumentParser(description="按 doc_id 生成 BMad CS(story-doc-*.md)") + ap.add_argument("--doc-id", help="仅生成该 XX-YY(默认全量)") + ap.add_argument("--dry-run", action="store_true", help="只打印将写入的路径") + ap.add_argument( + "--readme-only", + action="store_true", + help="只重写 stories-by-doc/README.md(不重写各 story-doc-*.md)", + ) + args = ap.parse_args() + + if args.readme_only and args.doc_id: + print("ERR: --readme-only 与 --doc-id 不能同时使用", file=sys.stderr) + return 2 + + doc_ids = collect_doc_ids(args.doc_id) + if not doc_ids: + print("ERR: 未发现执行域 verify playbook", file=sys.stderr) + return 2 + + if not args.dry_run: + OUT_DIR.mkdir(parents=True, exist_ok=True) + + if args.readme_only: + if args.dry_run: + print(f"would write {OUT_DIR.relative_to(ROOT)}/README.md") + return 0 + readme = write_readme(doc_ids) + (OUT_DIR / "README.md").write_text(readme, encoding="utf-8") + print(f"[OK] {OUT_DIR.relative_to(ROOT)}/README.md ({len(doc_ids)} 条, --readme-only)") + return 0 + + for doc_id in doc_ids: + noop = playbook_is_noop(doc_id) + matches = sorted(DOCS_DIR.glob(f"{doc_id}-*.md")) + doc_path = matches[0] if matches else None + rel_doc = str(doc_path.relative_to(ROOT)) if doc_path else f"docs/{doc_id}-*.md(缺失)" + title = doc_title(doc_path) + body = story_body(doc_id, rel_doc, title, noop) + out_path = OUT_DIR / f"story-doc-{doc_id}.md" + if args.dry_run: + print(f"would write 
{out_path.relative_to(ROOT)}") + continue + out_path.write_text(body, encoding="utf-8") + print(f"[OK] {out_path.relative_to(ROOT)}") + + if args.dry_run: + return 0 + + readme = write_readme(doc_ids) + (OUT_DIR / "README.md").write_text(readme, encoding="utf-8") + print(f"[OK] {OUT_DIR.relative_to(ROOT)}/README.md ({len(doc_ids)} 条)") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/ansible/tools/scaffold_doc_id.py b/ansible/tools/scaffold_doc_id.py new file mode 100644 index 0000000..d0b044a --- /dev/null +++ b/ansible/tools/scaffold_doc_id.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +"""为执行域 doc_id 生成最小闭环骨架:docs + ansible/files + verify playbook。 + +符合 Epic 1 Story 1.3:默认通过 verify_common noop 任务链接入集群基线断言。 +""" +from __future__ import annotations + +import argparse +import re +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent.parent +DOCS_DIR = ROOT / "docs" +FILES_BASE = ROOT / "ansible" / "files" +VERIFY_DIR = ROOT / "ansible" / "playbooks" / "verify" + +EXEC_ID_RE = re.compile(r"^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])$") + + +def doc_markdown(doc_id: str, slug: str, title: str) -> str: + return f"""# {title} + +> **doc_id**:`{doc_id}` · 执行域文档(参与 `verify.sh list/full`)。 + +## 契约与真源 + +- **清单真源目录**:`ansible/files/{doc_id}/`(不要在本文重复粘贴大块 YAML 作为第二真源)。 +- **Ansible 验证入口**:`ansible/playbooks/verify/{doc_id}.yml` +- 基线断言复用 `ansible/roles/verify_common/tasks/noop-doc-verify.yml`(集群连通 + 可选 manifest dry-run)。 + +验收命令: + +```bash +./ansible/bin/verify.sh run {doc_id} +``` + +## 正文 + +(编写步骤说明;引用清单时请写相对于仓库的路径,例如 `ansible/files/{doc_id}/demo.yaml`。) +""" + + +def files_readme(doc_id: str) -> str: + return f"""# {doc_id} 清单真源 + +将本篇相关的 Kubernetes YAML、Helm values 等放在此目录。 + +- 命名建议:小写 + 连字符,例如 `app-deploy.yaml`。 +- 示例或非集群清单可使用 `*.example.yaml` 后缀;noop 验证会跳过这类文件的 `kubectl dry-run`。 +""" + + +def verify_playbook(doc_id: str, doc_filename: str) -> str: + return f"""--- +- name: "{doc_id} noop verify 
(scaffold)" + hosts: localhost + gather_facts: false + vars: + repo_root: "{{{{ playbook_dir }}}}/../../.." + doc_id: "{doc_id}" + doc_filename: "{doc_filename}" + tasks: + - name: Include noop doc verify role tasks + ansible.builtin.include_role: + name: verify_common + tasks_from: noop-doc-verify.yml +""" + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser( + description="生成执行域 doc_id 最小骨架:docs、ansible/files//、verify playbook。" + ) + p.add_argument("doc_id", help="如 04-15(XX-YY,XX>0 且 YY>0)") + p.add_argument("slug", help="文档文件名段,如 k3s-traefik-acme(生成 docs/-.md)") + p.add_argument("--title", default="", help="文档 H1 标题(默认从 slug 生成)") + p.add_argument("--dry-run", action="store_true", help="只打印将创建的路径,不写盘") + p.add_argument("--force", action="store_true", help="覆盖已存在的同名目标文件") + return p.parse_args() + + +def validate_slug(slug: str) -> None: + if not slug or len(slug) > 200: + print("ERR: slug 长度须在 1~200 之间", file=sys.stderr) + sys.exit(2) + if slug.strip() != slug: + print("ERR: slug 首尾不应含空白", file=sys.stderr) + sys.exit(2) + if "/" in slug or "\\" in slug or slug.startswith("."): + print("ERR: slug 不允许路径分隔符或以 '.' 
开头", file=sys.stderr) + sys.exit(2) + + +def main() -> None: + args = parse_args() + doc_id = args.doc_id.strip() + slug = args.slug.strip() + if not EXEC_ID_RE.fullmatch(doc_id): + print(f"ERR: doc_id 非法(须匹配执行域 XX-YY):{doc_id!r}", file=sys.stderr) + sys.exit(2) + validate_slug(slug) + + doc_filename = f"{doc_id}-{slug}.md" + title = args.title.strip() or slug.replace("-", " ").replace("_", " ") + + target_doc = DOCS_DIR / doc_filename + target_pb = VERIFY_DIR / f"{doc_id}.yml" + files_dir = FILES_BASE / doc_id + target_readme = files_dir / "README.md" + + # 同一 doc_id 仅允许一篇 docs/-*.md(与 validate_matrix 一致) + siblings = sorted(DOCS_DIR.glob(f"{doc_id}-*.md")) + for p in siblings: + if p.name != doc_filename: + print( + f"ERR: doc_id={doc_id} 已存在其他文档 {p.relative_to(ROOT)}," + f"请先删除或合并,不能新增第二篇。", + file=sys.stderr, + ) + sys.exit(2) + + planned = [ + ("docs", target_doc, doc_markdown(doc_id, slug, title)), + ("files README", target_readme, files_readme(doc_id)), + ("verify playbook", target_pb, verify_playbook(doc_id, doc_filename)), + ] + + for label, path, content in planned: + if path.exists() and not args.force: + print(f"ERR: 已存在(使用 --force 覆盖):{path.relative_to(ROOT)}", file=sys.stderr) + sys.exit(2) + + if args.dry_run: + print("[dry-run] 将创建:") + for label, path, _ in planned: + print(f" - {label}: {path.relative_to(ROOT)}") + return + + for label, path, content in planned: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + print(f"[OK] wrote {path.relative_to(ROOT)}") + + print("") + print("下一步:编辑正文与清单,然后执行:") + print(f" ./scripts/offline-check.sh") + print(f" ./ansible/bin/verify.sh run {doc_id}") + + +if __name__ == "__main__": + main() diff --git a/ansible/tools/status_board.py b/ansible/tools/status_board.py new file mode 100644 index 0000000..1bbce36 --- /dev/null +++ b/ansible/tools/status_board.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json 
+import os +import re +import subprocess +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +ROOT = Path(__file__).resolve().parents[2] +VERIFY_DIR = ROOT / "ansible" / "playbooks" / "verify" +DOCS_DIR = ROOT / "docs" +RESULTS_DIR = ROOT / ".status" +RESULTS_JSON = RESULTS_DIR / "verify-results.json" +BOARD_MD = DOCS_DIR / "00-04-验证状态板.md" + +EXEC_ID_RE = re.compile(r"^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])$") +OC_LINE_RE = re.compile(r"^\[OC\]\s+(?P.+)$") + + +def utc_now() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +def load_results() -> dict[str, Any]: + if not RESULTS_JSON.exists(): + return {"version": 1, "updated_at": None, "results": {}} + try: + return json.loads(RESULTS_JSON.read_text(encoding="utf-8")) + except Exception: + return {"version": 1, "updated_at": None, "results": {}} + + +def save_results(data: dict[str, Any]) -> None: + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + RESULTS_JSON.write_text(json.dumps(data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + + +def list_exec_doc_ids() -> list[str]: + ids: list[str] = [] + for p in VERIFY_DIR.glob("*.yml"): + if len(p.name) != len("00-00.yml"): + continue + doc_id = p.stem + if EXEC_ID_RE.fullmatch(doc_id): + ids.append(doc_id) + return sorted(set(ids)) + + +def detect_noop(doc_id: str) -> bool: + pb = VERIFY_DIR / f"{doc_id}.yml" + if not pb.exists(): + return False + txt = pb.read_text(encoding="utf-8", errors="ignore") + return ("noop verify" in txt) or ("noop-doc-verify.yml" in txt) + + +def doc_exists(doc_id: str) -> bool: + return any(DOCS_DIR.glob(f"{doc_id}-*.md")) + + +def files_dir_exists(doc_id: str) -> bool: + return (ROOT / "ansible" / "files" / doc_id).is_dir() + + +@dataclass +class RunResult: + rc: int + stdout: str + stderr: str + + +def run_verify(doc_id: str) -> RunResult: + cmd = ["bash", str(ROOT / "ansible" / "bin" / "verify.sh"), "run", doc_id] 
+ p = subprocess.run(cmd, cwd=str(ROOT), capture_output=True, text=True, env=os.environ.copy()) + return RunResult(rc=p.returncode, stdout=p.stdout, stderr=p.stderr) + + +def parse_oc_result(output: str, doc_id: str) -> dict[str, str] | None: + """Parse last [OC] line for a given doc_id. + + Expected format (from verify.sh): + [OC] doc_id=01-06 result=verified phase=verify assertion=... + """ + + last: dict[str, str] | None = None + for line in (output or "").splitlines(): + m = OC_LINE_RE.match(line.strip()) + if not m: + continue + kv = m.group("kv") + parts = [p for p in kv.split(" ") if p] + data: dict[str, str] = {} + for p in parts: + if "=" not in p: + continue + k, v = p.split("=", 1) + data[k.strip()] = v.strip().strip('"') + if data.get("doc_id") == doc_id and "result" in data: + last = data + return last + + +def normalize_status(s: str | None) -> str | None: + if not s: + return None + s = s.strip().lower() + if s in {"verified", "failed", "gated", "noop", "unknown", "broken"}: + return s + return None + + +def classify(doc_id: str, rr: RunResult | None, noop: bool) -> str: + if not doc_exists(doc_id) or not (VERIFY_DIR / f"{doc_id}.yml").exists(): + return "broken" + if rr is None: + return "noop" if noop else "unknown" + out = (rr.stdout or "") + "\n" + (rr.stderr or "") + oc = parse_oc_result(out, doc_id) + oc_status = normalize_status(oc.get("result") if oc else None) + if oc_status: + return oc_status + # fallback for legacy output + if "[GATE]" in out: + return "gated" + return "verified" if rr.rc == 0 else "failed" + + +def render_board(results_db: dict[str, Any], ids: list[str]) -> str: + updated_at = utc_now() + rows = [] + conflicts: list[str] = [] + for doc_id in ids: + rr = results_db.get("results", {}).get(doc_id) + noop = detect_noop(doc_id) + status = rr.get("status") if isinstance(rr, dict) else None + last = rr.get("updated_at") if isinstance(rr, dict) else None + rc = rr.get("rc") if isinstance(rr, dict) else None + stdout_tail = 
rr.get("stdout_tail") if isinstance(rr, dict) else None + stderr_tail = rr.get("stderr_tail") if isinstance(rr, dict) else None + + doc_ok = doc_exists(doc_id) + files_ok = files_dir_exists(doc_id) + pb_ok = (VERIFY_DIR / f"{doc_id}.yml").exists() + + # cache-first, but detect conflict with OC parsing when tails exist (EC6/OC6). + cache_status = normalize_status(status) + oc = None + oc_status = None + if isinstance(stdout_tail, str) or isinstance(stderr_tail, str): + out = f"{stdout_tail or ''}\n{stderr_tail or ''}" + oc = parse_oc_result(out, doc_id) + oc_status = normalize_status(oc.get("result") if oc else None) + if cache_status and oc_status and cache_status != oc_status: + conflicts.append(f"{doc_id}: cache={cache_status} oc={oc_status} (last={last})") + + if cache_status is None: + # static fallback + status = "noop" if noop else "unknown" + else: + status = cache_status + rows.append( + { + "doc_id": doc_id, + "status": status, + "noop": "Y" if noop else "", + "rc": "" if rc is None else str(rc), + "last": "" if not last else str(last), + "doc": "Y" if doc_ok else "N", + "playbook": "Y" if pb_ok else "N", + "files": "Y" if files_ok else "N", + } + ) + + def fmt(status: str) -> str: + return { + "verified": "✅ verified", + "gated": "🟡 gated", + "failed": "❌ failed", + "noop": "⚪ noop", + "unknown": "❓ unknown", + "broken": "🚫 broken", + }.get(status, status) + + lines = [] + lines.append("# 00-04-验证状态板(自动生成视图)") + lines.append("") + lines.append("> 本页为**只读视图**:用于快速查看「已验证/未验证/门控/失败」。") + lines.append("> **执行真源**仍以 `ansible/playbooks/verify/*.yml` 为准;本页不承载执行逻辑。") + lines.append("") + lines.append(f"- 最近生成时间(UTC):`{updated_at}`") + lines.append(f"- 本地结果缓存(不入库):`{RESULTS_JSON.relative_to(ROOT)}`") + lines.append("") + if conflicts: + lines.append("## ⚠️ 结果冲突告警(缓存 vs OC 解析)") + lines.append("") + lines.append("> 读源优先级:缓存优先;但若缓存状态与 OC 解析不一致,需排查 verify 输出或缓存一致性。") + lines.append("") + for c in conflicts[:50]: + lines.append(f"- `{c}`") + if len(conflicts) > 50: + 
lines.append(f"- ...(共 {len(conflicts)} 条,已截断)") + lines.append("") + lines.append("## 快速更新") + lines.append("") + lines.append("在仓库根执行:") + lines.append("") + lines.append("```bash") + lines.append("# 仅渲染(不跑真机验证,按缓存/静态信息生成)") + lines.append("python3 ansible/tools/status_board.py render") + lines.append("") + lines.append("# 真机跑一轮并写入缓存(会执行 verify playbook)") + lines.append("python3 ansible/tools/status_board.py update --all") + lines.append("python3 ansible/tools/status_board.py render") + lines.append("```") + lines.append("") + lines.append("## 状态表") + lines.append("") + lines.append("| doc_id | 状态 | noop | rc | last_update | docs | playbook | files |") + lines.append("|---|---|---:|---:|---|---:|---:|---:|") + for r in rows: + lines.append( + f"| {r['doc_id']} | {fmt(r['status'])} | {r['noop']} | {r['rc']} | {r['last']} | {r['doc']} | {r['playbook']} | {r['files']} |" + ) + lines.append("") + lines.append("## 口径说明") + lines.append("") + lines.append("- **verified/gated/failed/noop/unknown**:以 verify 输出的 `[OC] ... 
result=` 为准;缺失 OC 时回退到 legacy 规则。") + lines.append("- **gated**:必须附带 `missing_dependency` 与 `skip_scope`(见 Output Contract OC2)。") + lines.append("- **noop**:该 doc_id 的 verify playbook 为 noop 模式(仅基线/存在性/结构检查)。") + lines.append("- **unknown**:尚未在本机写入结果缓存(或仅静态生成)。") + lines.append("") + return "\n".join(lines) + + +def cmd_update(args: argparse.Namespace) -> int: + db = load_results() + ids = list_exec_doc_ids() + if args.all: + target_ids = ids + else: + target_ids = args.doc_ids + + results: dict[str, Any] = db.get("results", {}) if isinstance(db.get("results"), dict) else {} + + for doc_id in target_ids: + if not EXEC_ID_RE.fullmatch(doc_id): + continue + rr = run_verify(doc_id) + noop = detect_noop(doc_id) + status = classify(doc_id, rr, noop) + out = (rr.stdout or "") + "\n" + (rr.stderr or "") + oc = parse_oc_result(out, doc_id) + results[doc_id] = { + "updated_at": utc_now(), + "rc": rr.rc, + "status": status, + "noop": noop, + "oc": oc or None, + # keep small: store tail only + "stdout_tail": (rr.stdout or "")[-4000:], + "stderr_tail": (rr.stderr or "")[-4000:], + } + + db["version"] = 1 + db["updated_at"] = utc_now() + db["results"] = results + save_results(db) + return 0 + + +def cmd_render(_: argparse.Namespace) -> int: + db = load_results() + ids = list_exec_doc_ids() + md = render_board(db, ids) + BOARD_MD.write_text(md + "\n", encoding="utf-8") + return 0 + + +def main() -> int: + if not VERIFY_DIR.is_dir() or not DOCS_DIR.is_dir(): + raise SystemExit("ERR: not in repo root layout") + + ap = argparse.ArgumentParser(prog="status_board.py") + sub = ap.add_subparsers(dest="cmd", required=True) + + sp_u = sub.add_parser("update", help="Run verify and write local cache (not committed)") + sp_u.add_argument("--all", action="store_true", help="Update all exec doc_ids from verify dir") + sp_u.add_argument("doc_ids", nargs="*", help="Doc IDs like 03-07 (ignored if --all)") + sp_u.set_defaults(func=cmd_update) + + sp_r = sub.add_parser("render", help="Render 
docs/00-04-验证状态板.md from cache/static") + sp_r.set_defaults(func=cmd_render) + + args = ap.parse_args() + return int(args.func(args)) + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/ansible/tools/validate_matrix_playbooks.py b/ansible/tools/validate_matrix_playbooks.py new file mode 100644 index 0000000..e0dde49 --- /dev/null +++ b/ansible/tools/validate_matrix_playbooks.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +"""校验执行域 verify/doc/files 一致性。 + +该脚本是“离线门禁”的一部分:发现 doc_id 三元契约不一致时应 fail-fast, +并输出可定位的冲突清单(doc_id + 路径集合)。 +""" +from __future__ import annotations + +import re +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent.parent +VERIFY_DIR = ROOT / "ansible" / "playbooks" / "verify" +DOCS_DIR = ROOT / "docs" +EXEC_ID_RE = re.compile(r"^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])$") + + +def is_exec_domain(doc_id: str) -> bool: + return EXEC_ID_RE.fullmatch(doc_id) is not None + + +def main() -> None: + if not VERIFY_DIR.is_dir() or not DOCS_DIR.is_dir(): + print("ERR: verify/docs 目录缺失", file=sys.stderr) + sys.exit(2) + + # --- scan verify playbooks (.yml/.yaml) --- + verify_by_doc_id: dict[str, list[Path]] = {} + invalid_verify_names: list[str] = [] + + for p in sorted(VERIFY_DIR.iterdir()): + if not p.is_file(): + continue + if p.name.startswith("_"): + # private helpers are allowed but not part of doc_id contract + continue + if p.suffix not in {".yml", ".yaml"}: + continue + + # Only accept .yml|yaml + stem = p.stem + if len(stem) != len("00-00") or stem[2:3] != "-": + invalid_verify_names.append(p.name) + continue + + if not is_exec_domain(stem): + invalid_verify_names.append(p.name) + continue + + verify_by_doc_id.setdefault(stem, []).append(p) + + # Same doc_id with multiple verify entrypoints is a hard conflict (EC1). 
+ verify_conflicts: dict[str, list[str]] = {} + for did, paths in verify_by_doc_id.items(): + if len(paths) > 1: + verify_conflicts[did] = [str(p.relative_to(ROOT)) for p in sorted(paths)] + + missing_docs: list[str] = [] + missing_files_dir: list[str] = [] + weak_doc_exec_refs: list[str] = [] + multi_docs: dict[str, list[str]] = {} + + for did in sorted(verify_by_doc_id.keys()): + matches = sorted(DOCS_DIR.glob(f"{did}-*.md")) + if not matches: + missing_docs.append(did) + continue + if len(matches) > 1: + multi_docs[did] = [str(p.relative_to(ROOT)) for p in matches] + doc = matches[0] + content = doc.read_text(encoding="utf-8", errors="ignore") + if f"ansible/files/{did}/" not in content and "```yaml" in content: + weak_doc_exec_refs.append(did) + # Execution-domain doc_id must always have a files truth-source directory (ADR-002 / R4). + if not (ROOT / "ansible" / "files" / did).is_dir(): + missing_files_dir.append(did) + + if invalid_verify_names: + print(f"ERR: verify 仅允许执行域命名: {sorted(invalid_verify_names)}", file=sys.stderr) + sys.exit(2) + if verify_conflicts: + print("ERR: verify 入口冲突(同一 doc_id 多个入口,必须 fail-fast):", file=sys.stderr) + for did in sorted(verify_conflicts.keys()): + paths = verify_conflicts[did] + print(f" - {did}: {paths}", file=sys.stderr) + sys.exit(2) + if missing_docs: + print(f"ERR: 缺少 docs/<doc_id>-*.md: {missing_docs}", file=sys.stderr) + sys.exit(2) + if multi_docs: + print("ERR: docs 命名冲突(同一 doc_id 匹配到多篇 docs/<doc_id>-*.md):", file=sys.stderr) + for did in sorted(multi_docs.keys()): + print(f" - {did}: {multi_docs[did]}", file=sys.stderr) + sys.exit(2) + if missing_files_dir: + print(f"ERR: 缺少 ansible/files/<doc_id>/ 目录: {missing_files_dir}", file=sys.stderr) + sys.exit(2) + if weak_doc_exec_refs: + print(f"ERR: 文档 YAML 未映射 ansible/files/<doc_id>/ 真源: {weak_doc_exec_refs}", file=sys.stderr) + sys.exit(2) + + print(f"[OK] 执行域 verify/doc/files 一致性通过({len(verify_by_doc_id)} 条)") + + +if __name__ == "__main__": + main() diff --git 
a/ansible/tools/validate_stories_by_doc.py b/ansible/tools/validate_stories_by_doc.py new file mode 100644 index 0000000..41ba405 --- /dev/null +++ b/ansible/tools/validate_stories_by_doc.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +"""批量 VS(Validate Story):校验 stories-by-doc/story-doc-*.md 与仓库契约一致。 + +对应 BMad `bmad-create-story:validate` 在本仓库的**可自动化**子集(结构、路径、noop 与 playbook 一致)。 +深度内容评审(对照 PRD/架构全文)仍需人工或单篇会话。 + +用法: + python3 ansible/tools/validate_stories_by_doc.py + python3 ansible/tools/validate_stories_by_doc.py --write-report + python3 ansible/tools/validate_stories_by_doc.py --json +""" +from __future__ import annotations + +import argparse +import json +import re +import sys +from dataclasses import dataclass, field +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent.parent +STORIES_DIR = ROOT / "_bmad-output" / "implementation-artifacts" / "stories-by-doc" +VERIFY_DIR = ROOT / "ansible" / "playbooks" / "verify" +DOCS_DIR = ROOT / "docs" +FILES_PREFIX = ROOT / "ansible" / "files" + +STORY_NAME_RE = re.compile(r"^story-doc-([0-9]{2}-[0-9]{2})\.md$") +META_DOC_RE = re.compile(r"\|\s*\*\*文档\*\*\s*\|\s*`([^`]+)`\s*\|") +META_PB_RE = re.compile(r"\|\s*\*\*verify playbook\*\*\s*\|\s*`([^`]+)`\s*\|") +META_ID_RE = re.compile(r"\|\s*\*\*doc_id\*\*\s*\|\s*`([0-9]{2}-[0-9]{2})`\s*\|") +META_NOOP_RE = re.compile(r"\|\s*\*\*playbook 形态\*\*\s*\|\s*(.+)\s*\|") + + +def playbook_is_noop(doc_id: str) -> bool: + yml = VERIFY_DIR / f"{doc_id}.yml" + if not yml.is_file(): + return False + content = yml.read_text(encoding="utf-8", errors="ignore") + return "noop verify" in content.lower() or "noop-doc-verify" in content + + +def meta_declares_noop(meta_line: str) -> bool: + """与 gen_stories_by_doc.py 一致:noop 篇写「noop + verify_common 基线」,勿用子串 noop(「非纯 noop」会误判)。""" + m = meta_line.lower() + return "noop + verify_common" in meta_line or "noop+verify_common" in m.replace(" ", "") + + +@dataclass +class Row: + doc_id: str + ok: bool + errors: list[str] = 
field(default_factory=list) + warnings: list[str] = field(default_factory=list) + + +def validate_one(path: Path) -> Row: + m = STORY_NAME_RE.match(path.name) + if not m: + return Row(doc_id="?", ok=False, errors=[f"文件名不符合 story-doc-XX-YY.md:{path.name}"]) + doc_id = m.group(1) + row = Row(doc_id=doc_id, ok=True) + text = path.read_text(encoding="utf-8", errors="replace") + + if "Status: ready-for-dev" not in text: + row.errors.append("缺少 Status: ready-for-dev") + row.ok = False + + for sec in ("## Meta", "## Acceptance Criteria", "## Tasks / Subtasks", "## Story completion status"): + if sec not in text: + row.errors.append(f"缺少章节 {sec}") + row.ok = False + + mid = META_ID_RE.search(text) + if not mid: + row.errors.append("Meta 表中无法解析 doc_id") + row.ok = False + elif mid.group(1) != doc_id: + row.errors.append(f"Meta doc_id `{mid.group(1)}` 与文件名 `{doc_id}` 不一致") + row.ok = False + + md = META_DOC_RE.search(text) + if not md: + row.errors.append("Meta 表中无法解析 **文档**") + row.ok = False + else: + rel = md.group(1).strip() + doc_path = ROOT / rel + if not doc_path.is_file(): + row.errors.append(f"文档路径不存在:{rel}") + row.ok = False + + mp = META_PB_RE.search(text) + if not mp: + row.errors.append("Meta 表中无法解析 **verify playbook**") + row.ok = False + else: + rel_pb = mp.group(1).strip() + pb_path = ROOT / rel_pb + if not pb_path.is_file(): + row.errors.append(f"playbook 不存在:{rel_pb}") + row.ok = False + + mn = META_NOOP_RE.search(text) + if mn: + meta_line = mn.group(1).strip() + expected_noop = playbook_is_noop(doc_id) + declared_noop = meta_declares_noop(meta_line) + if expected_noop != declared_noop: + row.errors.append( + f"playbook 形态与 verify 实际不一致:期望 noop={expected_noop},Meta 表述 noop={declared_noop}" + ) + row.ok = False + else: + row.warnings.append("未解析 playbook 形态行,跳过 noop 一致性检查") + + files_dir = FILES_PREFIX / doc_id + if not files_dir.is_dir(): + row.errors.append(f"缺少目录 ansible/files/{doc_id}/") + row.ok = False + + if f"verify.sh run {doc_id}" not in text and 
f"run {doc_id}" not in text: + row.warnings.append("正文中未出现 verify.sh run {doc_id} 类引用(可忽略若已改写)") + + return row + + +def main() -> int: + ap = argparse.ArgumentParser(description="批量校验 story-doc-*.md(VS 自动化子集)") + ap.add_argument("--write-report", action="store_true", help=f"写入 {STORIES_DIR}/VS-all-report.md") + ap.add_argument("--json", action="store_true", help="stdout 输出 JSON") + args = ap.parse_args() + + paths = sorted(STORIES_DIR.glob("story-doc-*.md")) + if not paths: + print("ERR: 未发现 story-doc-*.md", file=sys.stderr) + return 2 + + rows: list[Row] = [validate_one(p) for p in paths] + failed = [r for r in rows if not r.ok] + + if args.json: + print( + json.dumps( + [ + { + "doc_id": r.doc_id, + "ok": r.ok, + "errors": r.errors, + "warnings": r.warnings, + } + for r in rows + ], + ensure_ascii=False, + indent=2, + ) + ) + else: + print(f"VS-all(自动化):共 {len(rows)} 篇,通过 {len(rows) - len(failed)},失败 {len(failed)}") + for r in rows: + if r.ok and not r.warnings: + continue + print(f"\n## {r.doc_id}") + if r.errors: + for e in r.errors: + print(f" ERR: {e}") + for w in r.warnings: + print(f" WARN: {w}") + if not failed: + print("\n全部通过(结构 + 路径 + noop 一致性)。") + + if args.write_report: + lines = [ + "# VS-all 报告(Validate Story 自动化子集)", + "", + f"- 校验篇数:**{len(rows)}**", + f"- 通过:**{len(rows) - len(failed)}**", + f"- 失败:**{len(failed)}**", + "", + "本报告由 `python3 ansible/tools/validate_stories_by_doc.py --write-report` 生成。", + "覆盖:文件名与 Meta `doc_id`、必备章节、`docs` / `verify` / `ansible/files` 路径、`noop` 与 playbook 一致。", + "不覆盖:对照 PRD/架构的语义深度评审(请单篇使用 BMad VS 或人工)。", + "", + "---", + "", + ] + for r in rows: + status = "✅" if r.ok else "❌" + lines.append(f"- {status} **`{r.doc_id}`**") + for e in r.errors: + lines.append(f" - ERR: {e}") + for w in r.warnings: + lines.append(f" - WARN: {w}") + out = STORIES_DIR / "VS-all-report.md" + out.write_text("\n".join(lines) + "\n", encoding="utf-8") + print(f"\n[OK] 已写入 {out.relative_to(ROOT)}") + + return 1 if failed else 0 + + +if 
__name__ == "__main__": + raise SystemExit(main()) diff --git a/docs/00-00-构建总览.md b/docs/00-00-构建总览.md index 0576fd7..d36f53d 100644 --- a/docs/00-00-构建总览.md +++ b/docs/00-00-构建总览.md @@ -5,15 +5,16 @@ ## TL;DR - **本文性质**:目录导航/阅读顺序说明(不对应矩阵验收用例) -- **真机一键验收**:`./scripts/acceptance.sh`(可选铺栈)或 `./scripts/verify.sh full`(仅验收) +- **真机一键验收**:`./scripts/acceptance.sh`(可选铺栈)或 `./ansible/bin/verify.sh full`(仅验收) - **成功判据**:你能从本文快速定位到要跑的 `doc_id`、入口脚本与下一步文档 -- **排障**:若验收失败,先跑 `./scripts/verify.sh preflight`,再看对应 `doc_id` 的 playbook 输出 +- **排障**:若验收失败,先跑 `./ansible/bin/verify.sh preflight`,再看对应 `doc_id` 的 playbook 输出 **路径约定**:本文档中的链接,凡写作 `docs/...`、`ansible/...`、`scripts/...`、`project-context.md` 的,均相对于**仓库根目录**(与在 `docs/` 内打开本文件时的相对路径无关,避免混用 `../`)。 ## 目录约定 - 文档:`docs/`(Kubernetes 等可复用清单见 `ansible/files/`,与 Ansible playbook 共用) +- **Helm**:CLI 安装与典型场景见 `00-02-部署环境说明.md`(§1.1~§1.2)、`00-03-测试与验证框架.md`(§1.1 索引);chart 实践分散在 `03-07`、`05-05`、`03-10` 等篇。 - 脚本:`scripts/` - 脚本入口:`scripts/README.md` - 仓库契约(实现与改动规则):`project-context.md` @@ -38,9 +39,9 @@ ## 排障 -- **不知道该跑哪个命令**:先按本文“学习主线(6 步)”找到对应 `doc_id`,再用 `./scripts/verify.sh run ` 或 `./scripts/verify.sh full`。 -- **verify.sh 报缺 playbook**:确认存在 `ansible/playbooks/verify/.yml`,并可运行 `python3 scripts/validate_matrix_playbooks.py` 做存在性校验。 -- **连接不上集群**:在控制端执行 `./scripts/verify.sh preflight`,优先修复 inventory/SSH/私钥权限问题。 +- **不知道该跑哪个命令**:先按本文“学习主线(6 步)”找到对应 `doc_id`,再用 `./ansible/bin/verify.sh run ` 或 `./ansible/bin/verify.sh full`。 +- **verify.sh 报缺 playbook**:确认存在 `ansible/playbooks/verify/.yml`,并可运行 `python3 ansible/tools/validate_matrix_playbooks.py` 做存在性校验。 +- **连接不上集群**:在控制端执行 `./ansible/bin/verify.sh preflight`,优先修复 inventory/SSH/私钥权限问题。 ### 流程图(主线与分叉) @@ -48,7 +49,7 @@ flowchart TD S1[1 总览 + 按需 00-04 环境说明] S2[2 概念 00-01] - S3[3 装集群 01-06 或 01-01+01-02] + S3[3 装集群 01-05 或 01-01+01-02] S4[4 kubectl 节点全 Ready] S5[5 Nginx 02-00 → 02-05] S6[6 Node.js 04-01] @@ -69,12 +70,14 @@ flowchart TD 1. 
**总览与环境**:精读本篇;需要对照节点、IP、版本时打开 `00-02-部署环境说明.md`。 2. **概念速查**:`00-01-k3s-基础概念.md`。**时间紧可跳过**,卡术语再读。 -3. **安装 K3s**:`01-01-k3s-控制节点含traefik.md` + `01-02-k3s-工作节点.md`,或一键 `01-06-节点初始化-ansible-实践.md`(仓库根亦可配合 `scripts/deploy-lab.sh`)。 +3. **安装 K3s**:`01-01-k3s-控制节点含traefik.md` + `01-02-k3s-工作节点.md`,或一键 `01-05-节点初始化-ansible-实践.md`(仓库根亦可配合 `ansible/bin/deploy-lab.sh`)。 4. **验收**:`kubectl get nodes`,所有节点 Ready。 -5. **HTTP 入口验证**:`02-00-nginx-系列说明.md` → `02-05-nginx-验证矩阵-一键部署.md`(清单真源:`ansible/files/02-05/`)。可选:`./scripts/verify.sh run 02-05`。 +5. **HTTP 入口验证**:`02-00-nginx-系列说明.md` → `02-05-nginx-验证矩阵-一键部署.md`(清单真源:`ansible/files/02-05/`)。可选:`./ansible/bin/verify.sh run 02-05`。 6. **工作负载主线**:`04-01-k3s-nodejs-高级部署.md`。`04-02`~`04-14` 为分项,**按需阅读**,不列入主线编号。 -> 推进「已验证」的前置见 `00-04-待验证项-验证前准备.md`。 +> 推进「已验证」的前置见 `00-03-测试与验证框架.md` 的 **§10 验证前准备清单**。 +> +> 进度视图见 `00-04-验证状态板.md`(自动生成视图;执行真源仍以 verify playbook 为准)。 ### 30 分钟快速路径(4 步) @@ -87,10 +90,10 @@ flowchart TD 下列顺序适合**已走完 6 步主线**、或需要按编号通读全库时查阅;**不必一次性做完**。 1. `00-01-k3s-基础概念.md` -2. `01-01-k3s-控制节点含traefik.md`(或直接用 `01-06-节点初始化-ansible-实践.md` 一键自动化) +2. `01-01-k3s-控制节点含traefik.md`(或直接用 `01-05-节点初始化-ansible-实践.md` 一键自动化) 3. `01-02-k3s-工作节点.md` 4. `01-03-armv7-standalone-docker.md` -5. `01-07-openwrt-haproxy.md`(按需:网关负载均衡) +5. `01-06-openwrt-haproxy.md`(按需:网关负载均衡) 6. `02-00-nginx-系列说明.md` → `02-05-nginx-验证矩阵-一键部署.md`(HTTP 矩阵与入口验证;清单真源:`ansible/files/02-05/`) 7. `04-01-k3s-nodejs-高级部署.md` 8. `04-02-nodejs-端口与Service.md` @@ -113,7 +116,7 @@ flowchart TD 25. `03-05-k3s-local-path-pvc.md`(K3s 自带 local-path,单副本本地持久化) 26. `03-06-k3s-使用nfs存储.md`(按需:已有 NFS 时 PV/PVC) 27. `03-07-k3s-longhorn-持久化存储.md`(重状态、快照/备份,建议部署 GitLab 等前统一规划) -28. `03-08-k3s-ha-集群配置与切换.md`(按需:双控制节点 HA,配合 `01-08`) +28. `03-08-k3s-ha-集群配置与切换.md`(按需:双控制节点 HA,配合 `01-07`) 29. 
`03-09-k3s-gitops-集群配置管理.md`(框架草案:Argo CD / Flux) --- @@ -124,7 +127,7 @@ flowchart TD - **Traefik / 入口**:`03-01-k3s-traefik-dashboard.md`、`03-02-k3s-traefik-acme.md`、`03-04-k3s-cloudflare-tunnel-配置接入.md` - **存储**:`03-05-k3s-local-path-pvc.md`、`03-06-k3s-使用nfs存储.md`、`03-07-k3s-longhorn-持久化存储.md` -- **高可用 / GitOps**:`01-08-双控制节点ha.md`、`03-08-k3s-ha-集群配置与切换.md`、`03-09-k3s-gitops-集群配置管理.md` +- **高可用 / GitOps**:`01-07-双控制节点ha.md`、`03-08-k3s-ha-集群配置与切换.md`、`03-09-k3s-gitops-集群配置管理.md` - **工作节点与 HTTP 验证**(主线中已覆盖,此处为直达):`01-02-k3s-工作节点.md`、`02-05-nginx-验证矩阵-一键部署.md`(可先读 `02-00-nginx-系列说明.md`) - **Node.js 分项**(`04-02`~`04-14`,从 `04-01` 文末进入) - **排障**:`06-01-k3s-networkpolicy-故障排查.md`、`scripts/README.md` @@ -149,8 +152,8 @@ flowchart TD ## 专题导航 - `00-02-部署环境说明.md`(节点布局、IP、OS、K3s 版本等,便于对照与复现) -- `01-06-节点初始化-ansible-实践.md`(Ansible 一键安装 k3s 集群,已验证) -- `01-07-openwrt-haproxy.md`(按需:网关负载均衡) +- `01-05-节点初始化-ansible-实践.md`(Ansible 一键安装 k3s 集群,已验证) +- `01-06-openwrt-haproxy.md`(按需:网关负载均衡) - nginx 矩阵:`ansible/playbooks/verify/02-05.yml`(薄封装导入 `ansible/playbooks/verify/02-05.yml`);TLS 铺栈另见 `ansible/playbooks/verify/03-02.yml`(`deploy-lab nginx-matrix-tls`)。 - `03-04-k3s-cloudflare-tunnel-配置接入.md`(Cloudflare Tunnel 完整流程:Zero Trust + 集群接入) @@ -161,10 +164,10 @@ flowchart TD - `05-05-prometheus与grafana.md` - `05-07-openclaw应用部署.md` - `05-08-openclaw-k3s-实验部署.md` -- `01-05-armv7-nfs服务安装.md` +- `01-04-armv7-nfs服务安装.md` - `05-06-openlist挂载网盘与自动备份.md` - `06-02-运维小结.md` -- `01-08-双控制节点ha.md` +- `01-07-双控制节点ha.md` - `03-08-k3s-ha-集群配置与切换.md` - `03-09-k3s-gitops-集群配置管理.md`(框架草案) @@ -188,4 +191,4 @@ flowchart TD - `07-01-k3s-calico-dualstack.md`(Calico 双栈实验) - `07-02-k3s-cilium-dualstack-ebpf.md`(Cilium 双栈与 eBPF) -> 与主线 `01-06` + Flannel 安装路径不同;仅在独立实验环境或充分备份后阅读、操作。验证矩阵与 `verify.sh` 对 `07-*` 仅为 **noop(文档 + 占位目录存在性)**。 +> 与主线 `01-05` + Flannel 安装路径不同;仅在独立实验环境或充分备份后阅读、操作。验证矩阵与 `verify.sh` 对 `07-*` 仅为 **noop(文档 + 占位目录存在性)**。 diff --git a/docs/00-01-k3s-基础概念.md b/docs/00-01-k3s-基础概念.md index 
8fe7961..a1f7adf 100644 --- a/docs/00-01-k3s-基础概念.md +++ b/docs/00-01-k3s-基础概念.md @@ -5,7 +5,7 @@ ## TL;DR - **本文性质**:概念/术语速查(不对应独立铺栈) -- **推荐动作**:按 `00-00-构建总览.md` 进入主线;真机验收用 `./scripts/verify.sh full` +- **推荐动作**:按 `00-00-构建总览.md` 进入主线;真机验收用 `./ansible/bin/verify.sh full` - **成功判据**:能看懂后续文档中的 K3s/K8s 术语(node/pod/service/ingress 等) - **排障**:执行类问题请转到对应实验篇(`01-*`/`02-*`/`03-*`)的「排障」 @@ -16,8 +16,8 @@ ## 排障 -- **概念读不懂**:先看 `00-04-部署环境说明.md` 了解本仓库“实验室约定”,再回到本篇对照术语与节点角色。 -- **想跑命令但本篇没有**:本篇不提供部署/验收命令;按 `00-00` 找到对应实验篇,再跑 `./scripts/verify.sh run `。 +- **概念读不懂**:先看 `00-02-部署环境说明.md` 了解本仓库“实验室约定”,再回到本篇对照术语与节点角色。 +- **想跑命令但本篇没有**:本篇不提供部署/验收命令;按 `00-00` 找到对应实验篇,再跑 `./ansible/bin/verify.sh run `。 ## 1. K3s 是什么 @@ -118,7 +118,7 @@ K3s 自带 **local-path-provisioner**:当你创建 PVC 且不指定 `storageCl - **工作机制**:PVC 被创建后,provisioner 会在 **Pod 被调度到的节点** 上,在其本地磁盘创建目录(默认在 `data-dir` 下的 `storage`,例如 `/var/lib/rancher/k3s/storage` 或 `/storage`),并为之创建 PV、与 PVC 绑定。 - **绑定到节点**:数据只存在于该节点的本地目录,**与该节点绑定**;Pod 被调度到另一节点时,会拿到新的空卷,旧节点上的数据不会自动迁移。 -- **适用场景**:单副本应用、缓存、日志等,能接受 Pod 漂移后数据丢失或需手动恢复。**多副本共享数据**应使用 NFS、CSI 等共享存储(见 `01-05`)。 +- **适用场景**:单副本应用、缓存、日志等,能接受 Pod 漂移后数据丢失或需手动恢复。**多副本共享数据**应使用 NFS、CSI 等共享存储(见 `01-04`)。 - **查看**:`kubectl get storageclass` 可见 `local-path`(通常为默认);`kubectl get pv,pvc` 可查看已创建的卷。 - **操作示例**:见 `03-05-k3s-local-path-pvc.md`。 @@ -126,7 +126,7 @@ K3s 自带 **local-path-provisioner**:当你创建 PVC 且不指定 `storageCl - **Pod 可以漂移,宿主机本地数据不会跟着漂移**:用 `hostPath` 把宿主机目录挂进容器时,数据只在这台机器上;Pod 被调度到另一台节点后,那台机器没有同样目录和数据,应用就会“丢数据”。 - **K3s 不会自动帮你搬本地数据**:调度器只管 Pod 放哪台节点,不会同步 `/var/lib/...` 或自建目录;所以“节点故障自动漂移”和“数据高可用”是两件事,要分别设计。 -- **常见做法**:重要数据用共享存储(NFS / 云盘 / CSI),通过 PV/PVC 给 Pod 用(参考 `01-05`、`03-07`);缓存、临时文件用本地目录(`emptyDir` 或 `hostPath`),接受节点挂了可丢;或靠备份/同步把本地目录定期同步到别处,再在新节点恢复。 +- **常见做法**:重要数据用共享存储(NFS / 云盘 / CSI),通过 PV/PVC 给 Pod 用(参考 `01-04`、`03-07`);缓存、临时文件用本地目录(`emptyDir` 或 `hostPath`),接受节点挂了可丢;或靠备份/同步把本地目录定期同步到别处,再在新节点恢复。 **用途**:搞清楚数据放哪、节点挂了会不会丢,才能设计备份和高可用,避免常见存储与可用性误区。 @@ -143,7 +143,16 @@ K3s 自带 
**local-path-provisioner**:当你创建 PVC 且不指定 `storageCl 参考:`02-05-nginx-验证矩阵-一键部署.md` 删除小节。 -## 10. 下一步 +## 10. Helm 简介(与本仓库) + +- **是什么**:**Helm** 把 Kubernetes 应用打成 **Chart**(模板 + 默认配置),用 **`helm install` / `helm upgrade`** 一次装一类组件(如 Longhorn、Prometheus 全家桶),并可用 **values 文件**覆盖默认参数。 +- **和 `kubectl apply` 的关系**:Helm 最终仍会把渲染后的 YAML **提交给 API Server**;你手里拿的 **values.yaml** 不是直接 `kubectl apply` 的对象,除非先从 chart 渲染出清单(`helm template`)。 +- **和 K3s 的关系**:K3s 集群里常见两条线并存: + - **`helm` 命令**:你在 shell 里对集群执行(需本机安装 CLI,见 `00-02` §1.2)。 + - **HelmChart / HelmChartConfig**(CRD):由 k3s 在 **`kube-system`** 里调度部分内置 chart(如 Traefik);改 Traefik 端口等可参见 `03-10` 的 `HelmChartConfig` 清单。 +- **本仓库去哪学**:存储 **`03-07`**,监控 **`05-05`**,Traefik 参数注入 **`03-10`**,GitOps 里 Argo 安装方式 **`03-09`**;验证框架里对 Helm 超时与排障的约定见 **`00-03`** §2.1。 + +## 11. 下一步 - `01-01-k3s-控制节点含traefik.md` - `01-02-k3s-工作节点.md` diff --git a/docs/00-02-部署环境说明.md b/docs/00-02-部署环境说明.md index 277276e..63717ff 100644 --- a/docs/00-02-部署环境说明.md +++ b/docs/00-02-部署环境说明.md @@ -1,4 +1,4 @@ -# 00-04-部署环境说明 +# 00-02-部署环境说明 > 本文描述本仓库文档所针对的**验证环境**:节点布局、IP、OS、K3s 版本等。其他环境按需对照调整。 @@ -17,13 +17,14 @@ | ylc62 | 192.168.2.62 | k3s worker | 工作节点 | | ylc63 | 192.168.2.63 | k3s worker | 工作节点 | | ylc64 | 192.168.2.64 | k3s worker | 工作节点 | -| ylc65 | 192.168.2.65 | Linux 工作机(非 K3s) | **不参与** `kubectl get nodes`;提供日常 **Linux/x86_64** 环境,用于克隆本仓库、跑 `ansible-playbook` / `scripts/verify.sh`、编辑与排障;通过 SSH 连 ylc61~ylc64 执行自动化,磁盘与 **§3.1 K3s 节点约定无关**(按本机实际即可) | -| openwrt | 192.168.2.1 | OpenWrt 主路由 | 局域网网关;可选 HAProxy 将 80/443 转发至 K3s,见 `01-07` | +| ylc65 | 192.168.2.65 | Linux 工作机(非 K3s) | **不参与** `kubectl get nodes`;提供日常 **Linux/x86_64** 环境,用于克隆本仓库、跑 `ansible-playbook` / `ansible/bin/verify.sh`、编辑与排障;通过 SSH 连 ylc61~ylc64 执行自动化,磁盘与 **§3.1 K3s 节点约定无关**(按本机实际即可) | +| openwrt | 192.168.2.1 | OpenWrt 主路由 | 局域网网关;可选 HAProxy 将 80/443 转发至 K3s,见 `01-06` | | onecloud | 192.168.2.22 | ARM32(Armbian) | 非 K3s 集群节点;armv7 / NFS 等文档的实验或外部 curl 来源 | - Kubernetes 
中的节点名使用短主机名(**仅** `ylc61`~`ylc64` 四类 K3s 机器),与 inventory 中 `[k3s_server]` / `[k3s_worker]` 一致;`ylc65` **不是**集群成员。便于配合 Cloudflare CDN(若计算机 hostname 为 FQDN,本机解析会优先走本地导致无法访问)。 - **控制机**(运行 `ansible-playbook`、`verify.sh`):推荐 **`ylc65`(Linux 工作机)** 或 ylc61;亦可在你的本机 Linux 上执行,只要满足下节 **§1.1** 依赖并能 SSH 到 inventory 中的节点。 +- **集群外 HTTP 探测(如 `01-06`)**:`ansible/env/.env.verify` 中 **`WORKSTATION_SSH`** 推荐配置为 **SSH 到 Linux 工作机 `ylc65`** 的一行命令(在该机上执行 `curl`,流量视角与 k3s 节点分离);**`onecloud`** 仍为 ARM/NFS 等场景主机,与「工作机探测」默认语义不同。 ## 排障 @@ -33,15 +34,16 @@ ### 1.1 Linux 工作机(ylc65)软件依赖 -在 **`ylc65` 上执行本仓库各步骤**(克隆仓库、`./scripts/deploy-lab.sh`、`./scripts/verify.sh`、`ansible-playbook …`)时,该主机是 **Ansible 控制端**,需预先安装下列组件(版本可与 §2 对照,其他发行版用等价包名即可): +在 **`ylc65` 上执行本仓库各步骤**(克隆仓库、`./ansible/bin/deploy-lab.sh`、`./ansible/bin/verify.sh`、`ansible-playbook …`)时,该主机是 **Ansible 控制端**,需预先安装下列组件(版本可与 §2 对照,其他发行版用等价包名即可): | 用途 | 说明 | |------|------| | **Git** | 克隆 / 更新本仓库;排障时对比本地与远端分支。 | -| **Ansible**(`ansible-core` + `ansible-playbook`) | 执行 `ansible/playbooks/*`、`scripts/deploy-lab.sh`、`scripts/verify.sh` 所调用的 playbook;与 §2「Ansible ansible-core 2.18」一致即可。 | +| **Ansible**(`ansible-core` + `ansible-playbook`) | 执行 `ansible/playbooks/*`、`ansible/bin/deploy-lab.sh`、`ansible/bin/verify.sh` 所调用的 playbook;与 §2「Ansible ansible-core 2.18」一致即可。 | | **OpenSSH 客户端**(`ssh`、`scp`、`ssh-keygen`) | 按 `ansible/inventory.ini` 连接 `ylc61`~`ylc64`(通常为 root + 私钥);`scripts/ssh/test-ssh.sh` 等亦依赖本机 `ssh`。运行 `scripts/ssh/setup-k3s-workers-ssh.sh` 预配密钥时同样只需 OpenSSH;**不要求** PuTTY(仅当该脚本交互中勾选「生成 PuTTY .ppk」供 Windows 使用时,才需额外安装 `puttygen`)。 | | **Bash** | 仓库脚本为 `#!/usr/bin/env bash`;勿在仅 `sh` 的环境强行执行。 | | **curl** | 部分验证与文档示例;`verify` playbook 在远端执行 curl 时由节点侧提供,控制端亦建议具备以便自检。 | +| **Helm**(`helm` CLI,**可选但常用**) | 在 **已能访问集群 API** 的机器上安装(多为控制节点 `ylc61` 或工作机 `ylc65` + `KUBECONFIG`)。用于 **Longhorn、kube-prometheus-stack** 等 chart 安装与升级;与 K3s 内置的 **HelmChart / HelmChartConfig**(`kube-system` 内由 k3s 调度的 chart)是两条线:CLI 面向你在终端执行的 `helm upgrade 
--install`,CRD 面向随集群声明的 Traefik 等。安装指引见 [Helm 官方安装文档](https://helm.sh/docs/intro/install/);本仓库实践见 `03-07`、`05-05`、`03-10`(HelmChartConfig)。 | **Fedora / RHEL 系示例**(在 ylc65 上): @@ -66,6 +68,21 @@ ssh -V Python 3 会作为 **Ansible 控制端**依赖被包管理器一并拉取,一般无需单独指定版本。若仅在 **ylc61 本机**跑 Ansible 而不使用 ylc65,同样需满足上表(在控制节点上安装等价软件)。 +### 1.2 Helm CLI 安装与自检(按需) + +在计划执行 **`verify/03-07.yml`(Longhorn)**、**手工跟做 `05-05`(Prometheus/Grafana)** 或阅读 **HelmChartConfig**(`03-10`)前,建议在 **一台已配置 `KUBECONFIG` 的机器**上安装 Helm 3: + +```bash +# Fedora / RHEL(若仓库提供 helm 包) +sudo dnf install -y helm + +# 或按官方脚本/二进制:https://helm.sh/docs/intro/install/ +helm version +helm list -A +``` + +大 chart 安装请带 **`--wait` 与 `--timeout`**(如 10m~20m),排障时查 `helm status`、`kubectl describe`、`kubectl get events`,与 `00-03` §2.1 表格一致。 + ## 2. 软件版本(已验证) @@ -73,13 +90,14 @@ Python 3 会作为 **Ansible 控制端**依赖被包管理器一并拉取,一 | ------- | ----------------- | --------------------------- | | OS | Fedora 43 Server (CoreOS) | 其他 RHEL 系 / Debian 系按文档说明适配 | | K3s | v1.34.5+k3s1 | 来自 get.k3s.io 默认 | -| Ansible | ansible-core 2.18 | 用于 `01-06` 自动化安装 | +| Ansible | ansible-core 2.18 | 用于 `01-05` 自动化安装 | +| Helm | 3.x(与 chart 要求一致) | **可选**;`03-07` playbook 会尝试在节点安装;手工 chart 见 `05-05` | ## 3. 网络与存储 - **网段**:192.168.2.0/24 -- **可选**:OpenWrt 主路由(上表 `openwrt`,192.168.2.1)上配置 HAProxy 负载均衡,将 80/443 转发到 K3s 节点,见 `01-07-openwrt-haproxy.md` +- **可选**:OpenWrt 主路由(上表 `openwrt`,192.168.2.1)上配置 HAProxy 负载均衡,将 80/443 转发到 K3s 节点,见 `01-06-openwrt-haproxy.md` ### 3.1 磁盘规划(四台 K3s 节点统一:10G + 32G) @@ -96,12 +114,12 @@ findmnt -n -o SOURCE / /storage lsblk -f ``` -两行 `SOURCE` 应指向**不同**块设备(或不同 LV);若 `/storage` 未单独挂载,请先完成分区、格式化、`/etc/fstab` 再装 K3s(见 `01-06`)。 +两行 `SOURCE` 应指向**不同**块设备(或不同 LV);若 `/storage` 未单独挂载,请先完成分区、格式化、`/etc/fstab` 再装 K3s(见 `01-05`)。 ### 3.2 推荐自动化顺序 -1. (可选)`ansible/playbooks/verify/01-06.yml`:声明 `k3s_data_disk_device` 并启用 `k3s_prepare_storage` 时,幂等准备 `/storage`。 -2. `ansible/playbooks/verify/01-06.yml`:安装 K3s(可开启 `k3s_verify_storage_mount` 校验挂载)。 +1. 
(可选)`ansible/playbooks/verify/01-05.yml`:声明 `k3s_data_disk_device` 并启用 `k3s_prepare_storage` 时,幂等准备 `/storage`。 +2. `ansible/playbooks/verify/01-05.yml`:安装 K3s(可开启 `k3s_verify_storage_mount` 校验挂载)。 3. (可选)`ansible/playbooks/verify/03-07.yml`:Helm 安装 Longhorn(`ansible/files/03-07/values-lab.yaml`)。 4. (可选)按 `03-05` 应用本仓库 **local-path** ConfigMap 真源(`ansible/files/03-05/local-path-config-lab.json`)。 @@ -119,8 +137,8 @@ lsblk -f - **inventory**:`ansible/inventory.ini`,分组 `k3s_server`、`k3s_worker`、`k3s_nodes`(**勿**将 `ylc65` 列入 K3s 分组;工作机只作为 Ansible 控制端) - **变量**:`ansible/group_vars/all.yml`,含 `k3s_data_dir`、`k3s_server_ip`、`k3s_manage_`* 等 -- **playbook(k3s)**:`ansible/playbooks/verify/01-06.yml` -- **playbook(数据盘,可选)**:`ansible/playbooks/verify/01-06.yml` +- **playbook(k3s)**:`ansible/playbooks/verify/01-05.yml` +- **playbook(数据盘,可选)**:`ansible/playbooks/verify/01-05.yml` - **playbook(Longhorn,可选)**:`ansible/playbooks/verify/03-07.yml`(Helm + `ansible/files/03-07/values-lab.yaml`,文档 `03-07`) - **playbook(nginx 矩阵)**:`ansible/playbooks/verify/02-05.yml`(manifests 在 `ansible/files/02-05/`,文档 `02-05`) - **playbook(nginx TLS 矩阵)**:`ansible/playbooks/verify/03-02.yml`(manifests 在 `ansible/files/03-02/`,文档 `03-02`(02-05 升级版)) @@ -128,5 +146,5 @@ lsblk -f ## 6. 
验证时间 -- 2026-03:**4 节点**(1 server + 3 worker)集群按 `01-06` 一次性安装成功,各节点 Traefik 入口 404 可达。自动化与验证常在 **`ylc65` Linux 工作机**上执行,该主机不参与 K3s。 +- 2026-03:**4 节点**(1 server + 3 worker)集群按 `01-05` 一次性安装成功,各节点 Traefik 入口 404 可达。自动化与验证常在 **`ylc65` Linux 工作机**上执行,该主机不参与 K3s。 diff --git a/docs/00-03-测试与验证框架.md b/docs/00-03-测试与验证框架.md index 74148ad..6348bd6 100644 --- a/docs/00-03-测试与验证框架.md +++ b/docs/00-03-测试与验证框架.md @@ -1,12 +1,13 @@ -# 00-05-测试与验证框架(设计说明) +# 00-03-测试与验证框架(设计说明 + 验证前准备清单) -> 本页是“测试与验证框架”的设计说明,并与仓库里已落地的 `scripts/verify.sh` + `ansible/playbooks/verify/` 对齐。 +> 本页是“测试与验证框架”的设计说明,并与仓库里已落地的 `ansible/bin/verify.sh` + `ansible/playbooks/verify/` 对齐。 ## TL;DR - **本文性质**:说明/索引类文档(不承载一键部署动作) -- **推荐动作**:按 `00-00-构建总览.md` 进入主线;需要真机验收用 `./scripts/verify.sh full` +- **推荐动作**:按 `00-00-构建总览.md` 进入主线;需要真机验收用 `./ansible/bin/verify.sh full` +- **进度视图**:`00-04-验证状态板.md`(自动生成视图;不作为执行真源) - **成功判据**:你能据本文定位到下一步文档与对应入口脚本 - **排障**:执行失败请查看对应实验篇的「排障」与 playbook 输出 @@ -16,7 +17,23 @@ 本页只回答一件事:**如何把文档(`doc_id=XX-YY`)与可执行的验证入口对齐**,并把验证能力收敛为可维护的自动化资产。 -实机验证前需具备的环境与外部依赖,见 [`00-04-待验证项-验证前准备.md`](00-04-待验证项-验证前准备.md)。 +**索引约定**:下文与全库讨论自动化/验收时,**默认以 `doc_id` 为第一关键字**(验收入口 = `ansible/playbooks/verify/.yml`);BMad 按 `doc_id` 生成的 **[CS] Story** 见 `_bmad-output/implementation-artifacts/stories-by-doc/README.md`(与 Epic/Story 编号并存时,仍以 `doc_id` 对齐工程真源,见 `project-context.md`)。 + +实机验证前需具备的环境与外部依赖,见本文 **§10 验证前准备清单**。 + +### 1.1 Helm 与专题文档索引 + +本仓库里 **Helm** 出现频繁:除验证流程中的 `helm install/upgrade` 外,多篇文档以 chart + values 为真源。建议按主题跳转(**CLI 安装与环境**见 `00-02` §1.1~§1.2): + +| 场景 | 文档与真源 | +|------|------------| +| Longhorn | `03-07`;values:`ansible/files/03-07/values-lab.yaml`;自动化:`verify/03-07.yml` | +| kube-prometheus-stack(Prometheus/Grafana) | `05-05`;示例 values:`ansible/files/05-05/kube-prometheus-stack-values.example.yaml` | +| Traefik 端口/参数(HelmChartConfig) | `03-10`;`ansible/files/03-10/traefik-custom-ports.yaml` | +| GitOps(Argo CD 可用 Helm 或官方清单安装) | `03-09` | +| 查看 k3s 管理的 HelmChart 资源 | `06-02`(`kubectl 
-n kube-system get helmchart,helmchartconfig`) | + +Helm 安装 **超时、重试、`helm status` 排障** 的约定见下文 **§2.1** 表格「Helm 长耗时」一行。 ## 2. 自动化验证流程(一般步骤) @@ -24,8 +41,8 @@ 1. **接入目标环境** - 用 SSH 登录**控制节点**(或在本机配置好到控制节点的 Ansible `inventory`,由 Ansible 代你 SSH)。 - - 在仓库根(或文档约定目录)准备好代码:`git pull` / `scp` 同步等,与 [`docs/00-04-部署环境说明.md`](00-04-部署环境说明.md) 一致。 - - 按需加载验证环境变量:复制并填写 [`scripts/.env.verify.example`](../scripts/.env.verify.example) 为 `scripts/.env.verify`,执行前 `source`(`verify.sh` 会自动尝试加载)。 + - 在仓库根(或文档约定目录)准备好代码:`git pull` / `scp` 同步等,与 [`00-02-部署环境说明.md`](00-02-部署环境说明.md) 一致。 + - 按需加载验证环境变量:复制并填写 [`ansible/env/.env.verify.example`](../ansible/env/.env.verify.example) 为 `ansible/env/.env.verify`,执行前 `source`(`verify.sh` 会自动尝试加载)。 2. **环境与前置清理(按验证目标选择深度)** - **基本检查**:`kubectl get nodes`、磁盘/内核版本、防火墙与文档是否一致;必要时对照 `00-04`。 @@ -33,7 +50,7 @@ - **重度清理(重装/复现安装文档时)**:若你要从「空机」验证 `01-01` 等**整集群安装**流程,才需要按文档执行 `k3s-uninstall.sh`、删数据目录、清 iptables 等——这与日常 `run-all` 的“逐篇快速验收”是**不同场景**,不要默认混进每一次 `run-all`。 3. **部署** - - **推荐(本仓库)**:用 Ansible playbook 部署——要么是正式安装/初始化类(如 `verify/01-06.yml -e k3s_do_install=true`,或 `./scripts/deploy-lab.sh` 调用之),要么是验证用例里的 `kubectl apply` / `helm install` / `import_playbook`。 + - **推荐(本仓库)**:用 Ansible playbook 部署——要么是正式安装/初始化类(如 `verify/01-05.yml -e k3s_do_install=true`,或 `./ansible/bin/deploy-lab.sh` 调用之),要么是验证用例里的 `kubectl apply` / `helm install` / `import_playbook`。 - **文档中的 bash 一键命令**:仍可按 `docs/` 逐步执行;适合排障或 playbook 尚未覆盖的边角。自动化验收应尽量**收敛进** `ansible/playbooks/verify/*.yml`,避免「文档一套、手敲一套」长期分叉。 4. **按设计目标做断言** @@ -47,8 +64,8 @@ - 将结论写回对应实验篇文档(或保留日志),必要时更新 `docs/XX-YY-*.md` 中的命令或版本说明。 6. 
**本仓库一键串联** - - **部署**(步骤 3):`./scripts/deploy-lab.sh k3s` 等,见 [`scripts/README.md`](../scripts/README.md)。 - - **验证**(步骤 4~6):在仓库根执行 **`./scripts/verify.sh full`**(推荐:**preflight +** `run-all`,缺 playbook **fail-fast**);或仅 `./scripts/verify.sh run-all`(不跑 preflight);单篇用 `./scripts/verify.sh run `。`./scripts/verify.sh flow` 可打印与本节对应的步骤摘要。 + - **部署**(步骤 3):`./ansible/bin/deploy-lab.sh k3s` 等,见 [`scripts/README.md`](../scripts/README.md)。 + - **验证**(步骤 4~6):在仓库根执行 **`./ansible/bin/verify.sh full`**(推荐:**preflight +** `run-all`,缺 playbook **fail-fast**);或仅 `./ansible/bin/verify.sh run-all`(不跑 preflight);单篇用 `./ansible/bin/verify.sh run `(仓库根简写 **`./scripts/cs `**,与前者等价,**对 `ansible/playbooks/verify/` 内全部执行域 doc_id 通用**,无需为每个 doc_id 单独建脚本)。`./ansible/bin/verify.sh flow` 可打印与本节对应的步骤摘要。 - **范围说明**:`run-all` 的范围由 `ansible/playbooks/verify/` 目录内存在的 `XX-YY.yml` 自动决定;`full` **不会**自动执行 `deploy-lab.sh`,仍假设集群与铺栈(步骤 3)已由操作者完成。 ### 2.1 局限与约定补全(建议在文档与 `verify/XX-YY.yml` 中写死) @@ -58,20 +75,36 @@ | 主题 | 建议约定 | |------|----------| | **多节点:在哪台机器 `curl`** | **默认**:在 inventory 的 **`k3s_server`(控制节点)** 上,对 **集群入口** 发 HTTP(如 `nginx_entry_base` / `http://<控制节点或 LB IP>`),与「从集群外经 NodePort/主机网络进 Traefik」一致。**例外**(必须显式写):要验 worker 仅内网、跨节点路径、或「必须从某台 agent 访问」时,在 playbook 里对指定 host 执行 `curl`(或 `delegate_to` / 专用 play),并在文档「验证命令」中写明 **执行主机与目标 URL**,避免隐含「任意节点等价」。 | -| **TLS / SNI** | 自签或跳过校验仅用于排障:`curl -k`。**验收**应优先:真实证书路径下用 `curl -v` 看证书链;或用 `curl --resolve <域名>:443:<入口IP> https://<域名>/...` 在 **无 DNS** 时模拟 SNI。需要时用 `openssl s_client -connect host:443 -servername <域名> :443:<入口IP> https://<域名>/...` 在 **无 DNS** 时模拟 SNI。需要时用 `openssl s_client -connect host:443 -servername <域名> ansible/playbooks/verify/.yml`;缺对应 playbook 则 **fail-fast**。 +1. **`ansible/bin/verify.sh`**:调用 `ansible-playbook -i ansible/playbooks/verify/.yml`;缺对应 playbook 则 **fail-fast**。 2. **`ansible/playbooks/verify/.yml`**:单篇用例,通常拆成「部署 → 验证 → 清理」多个 **play**(默认 **`VERIFY_TEARDOWN=1`** 做 teardown)。 3. 
**特例**:无集群动作的文档可走 **`verify/_noop-tasks.yml`**(仓库路径/文件存在性);依赖 NFS、ACME、Cloudflare 等外部条件的可用 **gate 跳过** apply,teardown 需避免「无清单仍删」类失败(各 playbook 已按此收敛)。 @@ -93,7 +126,7 @@ 建议把用例写成“按文档 id 编排的任务集合”。在本仓库里,执行路径固定为 `verify/<doc_id>.yml`,不再维护额外映射文件。 - 每个文件内一般拆为三段(多个 play 或顺序 tasks): -示例(02-05):`./scripts/verify.sh run 02-05` 执行 `ansible/playbooks/verify/02-05.yml`(HTTP 校验四路径,最后 teardown)。`02-01`~`02-04` 另有单路径 playbook,便于单独调试。 +示例(02-05):`./ansible/bin/verify.sh run 02-05` 或 `./scripts/cs 02-05` 执行 `ansible/playbooks/verify/02-05.yml`(HTTP 校验四路径,最后 teardown)。`02-01`~`02-04` 另有单路径 playbook,便于单独调试。 每个 `verify/XX-YY.yml` 的典型结构为三段: @@ -164,14 +197,14 @@ http_check: 本仓库将 `docs/` 视为 **Runbook**(可执行手册),并对每篇可执行文档施加 **强绑定**: -- **每个 `doc_id` 必须可执行**:存在 `verify/<doc_id>.yml` 的 `doc_id` 必须能跑 `./scripts/verify.sh run <doc_id>`,且 playbook 内含明确断言(不得仅做“文件存在性”)。 +- **每个 `doc_id` 必须可执行**:存在 `verify/<doc_id>.yml` 的 `doc_id` 必须能跑 `./ansible/bin/verify.sh run <doc_id>`,且 playbook 内含明确断言(不得仅做“文件存在性”)。 - **文档与断言一致**:文档的“验证命令/预期”必须与对应 playbook 的断言一致;冲突时先修 playbook,再改文档对齐。 推荐每篇 `docs/<doc_id>-*.md` 采用以下结构(可复制粘贴作为模板): - **H1**:`# <doc_id>-<标题>` - **TL;DR(必选,3–8 行)** - - 自动化入口:`./scripts/verify.sh run <doc_id>`(必要时补 `export ...`) + - 自动化入口:`./ansible/bin/verify.sh run <doc_id>`(必要时补 `export ...`) - 最关键 3 条前置(变量/Secret/挂载) - 成功判据一句话 - 失败去哪看(链接到本篇“排障”) @@ -186,11 +219,11 @@ http_check: ## 8. 与旧自动化的关系 (已抛弃集中式“矩阵状态板”,因此不再有“待验证列表”文档。) -- 自动化执行以 `scripts/verify.sh` 与 `ansible/playbooks/verify/*.yml` 为准;本页描述其约定与扩展方式 +- 自动化执行以 `ansible/bin/verify.sh` 与 `ansible/playbooks/verify/*.yml` 为准;本页描述其约定与扩展方式 ## 9. 
可选扩展(未落地) -当前「一篇文档 → 一个 `verify/XX-YY.yml`」在规模小时最简单:**入口仍是 `scripts/verify.sh`**,不必为了「架构感」提前建一堆目录。当出现下面任一情况时,再考虑本节里的拆法即可。 +当前「一篇文档 → 一个 `verify/XX-YY.yml`」在规模小时最简单:**入口仍是 `ansible/bin/verify.sh`**,不必为了「架构感」提前建一堆目录。当出现下面任一情况时,再考虑本节里的拆法即可。 ### 9.1 何时值得拆 @@ -227,4 +260,97 @@ yamllint、ansible-lint、schema 校验等 **不放进 `verify.sh`** 亦可: ## 排障 - **你在找执行命令**:本文为说明/索引;执行入口见 `00-00-构建总览.md` 与 `scripts/README.md`。 -- **verify.sh 报错**:先跑 `./scripts/verify.sh preflight`,再根据提示修复 inventory/SSH/变量。 +- **verify.sh 报错**:先跑 `./ansible/bin/verify.sh preflight`,再根据提示修复 inventory/SSH/变量。 + +## 10. 验证前准备清单(从原 00-04 合并) + +### A. 全局共用准备(跑任何扩展验证前建议具备) + +| 准备项 | 说明 | +|--------|------| +| 控制机 | 与 [`ansible/inventory.ini`](../ansible/inventory.ini) 一致:SSH 私钥存在且 **`chmod 600`**;仓库根执行 [`ansible/bin/verify.sh`](../ansible/bin/verify.sh)。 | +| Ansible 配置 | [`ansible/lib/lib-ansible-lab.sh`](../ansible/lib/lib-ansible-lab.sh) 会设置 `ANSIBLE_CONFIG` 指向 [`ansible/ansible.cfg`](../ansible/ansible.cfg);无写 `~/.ansible` 权限时可设 `ANSIBLE_LOCAL_TMP=$PWD/.ansible-tmp`(仓库 [`.gitignore`](../.gitignore) 已忽略 `.ansible-tmp/`)。 | +| HTTP 类 | `ansible/env/.env.verify` 或环境中设 **`nginx_entry_base`**(如 `http://192.168.2.61`),与 `verify/02-0x.yml` 等一致。 | +| 串联验证 | **`VERIFY_TEARDOWN=1`**(勿在 `.env.verify` 中长期设 `0`),避免用例互相污染。 | +| 预检 | 可选 `VERIFY_PREFLIGHT_CLUSTER=1 ./ansible/bin/verify.sh preflight` 确认集群 Ready。 | + +### B. 按文档主题的准备(做什么才能「真验证」) + +#### B1. 
特殊硬件 / 拓扑 + +- **01-03、01-04**:准备 **armv7 实机**;01-03 经 **`ansible/tools/armv7-docker-verify-install.sh`**(先 **`docker info`**,失败再 **get.docker.com** 官方脚本)装 Docker;01-04 在 armv7 上跑通 NFS 导出与权限(与 03-06 可联动)。**默认** `SKIP_ARMV7=1` 时 verify 仅做文档/文件检查;**可选** 在 `.env.verify` 设 `SKIP_ARMV7=0` 并配置 `ARMV7_SSH`(01-04 可另设 `ARMV7_NFS_SSH`)后,`verify.sh run 01-03` / `01-04` 会经 SSH 执行远程步骤;01-04 仍为 **dnf** 路径(Fedora/RHEL 系),见 **§10.E**。 +- **01-07、03-08**:准备 **双 control-plane + 外部 LB(或等价)** 的可丢环境;按文档做加入/切换演练(当前自动化仅做基线可达性断言,加入/切换演练需按文档手工执行并补齐自动化)。 +- **07-01、07-02**:准备 **可重建的实验集群**(换 CNI/双栈);写好回滚;这两篇验证多为 **noop**,建议以手工记录为准。 + +#### B2. 环境变量 / 外部服务(不配则 gate 跳过或只能 noop) + +- **03-06**:在 `ansible/env/.env.verify` 配齐 **`NFS_SERVER_IP`(或 HOST)、`NFS_EXPORT_PATH`**;NFS 服务端导出与防火墙放行;再 `./ansible/bin/verify.sh run 03-06`。 +- **03-02、03-03、04-12**:在 **`ansible/env/.env.verify`** 配齐 **`ACME_EMAIL`**;Cloudflare DNS-01 时 **`CF_API_TOKEN`**(或集群已有 `kube-system/cloudflare-api-token`);公网 **80/443**、**DNS** 等按文档;仓库根 **`set -a && source ansible/env/.env.verify && set +a`** 后执行 **`./ansible/bin/verify.sh run 03-02`** / **`03-03`**(**常规自测不要 `env -u ACME_EMAIL`**,除非刻意测 gated)。 +- **03-04**:**`CF_TUNNEL_TOKEN`**(或等价)与隧道侧配置;办公机/第三方探测路径按文档约定执行。 +- **06-03**:按 [`06-03-k3s-自动备份与恢复-openlist-webdav.md`](06-03-k3s-自动备份与恢复-openlist-webdav.md) 准备 **WebDAV 端点与凭据**;清单真源见 `ansible/files/06-03/`。 + +#### B3. 部分验证补全为「已验证」(已有集群即可,偏手工/浏览器) + +- **01-06**:经 **Linux 工作机**(如 `ylc65`,见 `docs/00-02`)等 **非 k3s 节点** 对 OpenWrt **18080/18443** 做 curl;`ansible/env/.env.verify` 中 **`WORKSTATION_SSH`** 填到该工作机的一行 `ssh …`。 +- **03-01**:按文档 apply `ansible/files/03-01/`,**浏览器**验收 Dashboard(非仅 Deployment 存在)。 +- **03-03**:playbook 会 apply `ansible/files/03-03/` 并做 rollout + Dashboard HTTP 探针;浏览器/UI 可按文档另验。 +- **03-05**:playbook 层已有较完整验证;若要更“真验证”:按文档补全「业务读写/边界」。 +- **03-07**:playbook 已能装删 Longhorn;要 ✅:按 [`03-07-k3s-longhorn-持久化存储.md`](03-07-k3s-longhorn-持久化存储.md) 做 **PVC 读写、副本/故障** 等文档级验收。 + +#### B4. 
Node.js 系列(04-01~04-14) + +- **共用准备**:可访问的 **镜像仓库**(或私有 registry)、`nodejs_entry_base`、足够节点与资源做调度/HPA。 +- **04-13**:集群需 **metrics-server**;准备压测工具以触发 HPA。 +- **04-14**:依赖 **GitLab CI / GitOps** 任选一条实链路(与 05-03/05-04/03-09 联动)。 + +#### B5. 应用与监控(05-01~05-09) + +- **共用**:镜像拉取、持久化存储类(Longhorn/local-path/NFS)、Ingress 入口与 DNS(若对外)。 +- **05-03~05-04**:**大内存/磁盘** 规划、GitLab **域名/证书**、Runner **注册 token**、测试仓库与 `.gitlab-ci.yml`。 +- **05-05**:Prometheus/Grafana 资源与密码;Ingress 或 NodePort 访问策略。 + +#### B6. 运维与概念(06-02、03-09) + +- **06-02**:经验文档;「验证」可定义为巡检/备份 SOP 在现网执行一轮并在文档里记录结论。 +- **03-09**:先 **选定 Argo CD 或 Flux** 并落地最小 GitOps 回路,再谈 ✅。 + +### C. 建议执行顺序(减少重复准备) + +```mermaid +flowchart TD + base[全局 SSH 与 env.verify] + nfs[NFS 与 03-06] + acme[ACME 与 03-02] + node[04-01 基线加扩 04-02 起] + apps[05-xx 大应用] + ha[HA 与 CNI 实验集群] + base --> nfs + base --> acme + base --> node + acme --> node + nfs --> apps + base --> ha +``` + +1. 先固化 **A** + **03-06 NFS**(若要做存储类应用)。 +2. 再做 **03-02 ACME**,解锁 **04-12** 与 TLS 矩阵深度验收。 +3. **04-02~04-11** 在 **04-01** 基线上增量加 playbook 或手工矩阵。 +4. **05-03/05-04**、**05-05** 单独排期(资源与时间最长)。 +5. **HA / 07-xx** 独立维护窗口与回滚。 + +### D. 与「自动化」对齐的预期 + +- `verify.sh run XX-YY` 已通过只保证 **该 playbook 已实现** 的步骤通过;**noop** 文档不会替你完成文档内全部操作。 +- 若你要做“已验证”记录,建议在对应实验篇文档里写清:**环境、日期、是「仅脚本」还是「脚本 + 手工浏览器/第三方机」**。 + +### E. 
armv7 / arm32(可选经 `verify.sh` + SSH 远程安装) + +实验室矩阵里的 **01-03**(Docker)、**01-04**(NFS)、**05-02** 中的 arm 段等,依赖 **32 位 ARM(文档多写 armv7)实机**,与四节点 x86_64 K3s 主线 **不在同一 inventory**。 + +| 项 | 说明 | +|------|------| +| 默认 | **`SKIP_ARMV7=1`(或未设)**:`01-03.yml`、`01-04.yml` 仍跑矩阵基线(含文档/文件检查),**不经 SSH 改 arm 机**。 | +| 启用远程步骤 | **`SKIP_ARMV7=0`** 且 **`ARMV7_SSH`** 为一行可执行的 `ssh ...`(BatchMode 建议与 `ansible/env/.env.verify.example` 一致):`verify.sh run 01-03` 会经该 SSH 调用 **`ansible/tools/armv7-docker-verify-install.sh`**(先 **`docker info`**,必要时 **get.docker.com**);`run 01-04` 用 **`ARMV7_NFS_SSH`**(若为空则回退 **`ARMV7_SSH`**)装 **nfs-utils**、写 **`/etc/exports`**。 | +| 约束 | 远程路径假定 **Fedora/RHEL 系 + dnf**;**Debian/apt 未分支**。`verify.sh` 在 `source ansible/env/.env.verify` 后调用 `ansible-playbook`,子进程继承上述环境变量。 | +| 门控 | **`SKIP_ARMV7=0` 但未配置有效 SSH** 时,01-03 / 01-04 会 **fail**,避免误以为已启用 arm 时静默跳过。 | +| 手工 | 仍可按 `01-03-armv7-standalone-docker.md`、`01-04-armv7-nfs服务安装.md` 全手工走通后在文档中记录环境与结论。 | diff --git a/docs/00-04-待验证项-验证前准备.md b/docs/00-04-待验证项-验证前准备.md deleted file mode 100644 index bdd29ed..0000000 --- a/docs/00-04-待验证项-验证前准备.md +++ /dev/null @@ -1,124 +0,0 @@ -# 00-05-待验证项:验证前准备任务列表 - -> 本页是“验证前准备清单”:在实机推进到「已验证」之前,需要具备的环境、变量与动作列表。 -> -> 自动化说明见 [`00-03-测试与验证框架.md`](00-03-测试与验证框架.md);环境变量模板见 [`scripts/.env.verify.example`](../scripts/.env.verify.example)。 - - -## TL;DR - -- **本文性质**:说明/索引类文档(不承载一键部署动作) -- **推荐动作**:按 `00-00-构建总览.md` 进入主线;需要真机验收用 `./scripts/verify.sh full` -- **成功判据**:你能据本文定位到下一步文档与对应入口脚本 -- **排障**:执行失败请查看对应实验篇的「排障」与 playbook 输出 - ---- - -## A. 
全局共用准备(跑任何扩展验证前建议具备) - -| 准备项 | 说明 | -|--------|------| -| 控制机 | 与 [`ansible/inventory.ini`](../ansible/inventory.ini) 一致:SSH 私钥存在且 **`chmod 600`**;仓库根执行 [`scripts/verify.sh`](../scripts/verify.sh)。 | -| Ansible 配置 | [`scripts/lib-ansible-lab.sh`](../scripts/lib-ansible-lab.sh) 会设置 `ANSIBLE_CONFIG` 指向 [`ansible/ansible.cfg`](../ansible/ansible.cfg);无写 `~/.ansible` 权限时可设 `ANSIBLE_LOCAL_TMP=$PWD/.ansible-tmp`(仓库 [`.gitignore`](../.gitignore) 已忽略 `.ansible-tmp/`)。 | -| HTTP 类 | [`scripts/.env.verify`](../scripts/.env.verify) 或环境中设 **`nginx_entry_base`**(如 `http://192.168.2.61`),与 `verify/02-0x.yml` 等一致。 | -| 串联验证 | **`VERIFY_TEARDOWN=1`**(勿在 `.env.verify` 中长期设 `0`),避免用例互相污染。 | -| 预检 | 可选 `VERIFY_PREFLIGHT_CLUSTER=1 ./scripts/verify.sh preflight` 确认集群 Ready。 | - ---- - -## B. 按文档主题的准备(做什么才能「真验证」) - -### B1. 特殊硬件 / 拓扑 - -- **01-03、01-05**:准备 **armv7 实机**;01-03 装 Docker 并跑容器;01-05 在 armv7 上跑通 NFS 导出与权限(与 03-06 可联动)。**默认** `SKIP_ARMV7=1` 时 verify 仅做文档/文件检查;**可选** 在 `.env.verify` 设 `SKIP_ARMV7=0` 并配置 `ARMV7_SSH`(01-05 可另设 `ARMV7_NFS_SSH`)后,`verify.sh run 01-03` / `01-05` 会经 SSH 在 arm 上走 **dnf** 路径(Fedora/RHEL 系),见 **§E**。 -- **01-08、03-08**:准备 **双 control-plane + 外部 LB(或等价)** 的可丢环境;按文档做加入/切换演练(当前自动化仅做基线可达性断言,加入/切换演练需按文档手工执行并补齐自动化)。 -- **07-01、07-02**:准备 **可重建的实验集群**(换 CNI/双栈);写好回滚;这两篇验证多为 **noop**,建议以手工记录为准。 - -### B2. 环境变量 / 外部服务(不配则 gate 跳过或只能 noop) - -- **03-06**:在 `.env.verify` 配齐 **`NFS_SERVER_IP`(或 HOST)、`NFS_EXPORT_PATH`**;NFS 服务端导出与防火墙放行;再 `./scripts/verify.sh run 03-06`。 -- **03-02、04-12**:有效 **`ACME_EMAIL`**、公网 **80/443**、**DNS**(及文档中的 Cloudflare/Secret);与 `scripts/.env.verify.example` 中 CF 相关变量对齐。 -- **03-04**:**`CF_TUNNEL_TOKEN`**(或等价)与隧道侧配置;办公机/第三方探测路径按文档约定执行。 -- **06-03**:按 [`06-03-k3s-自动备份与恢复-openlist-webdav.md`](06-03-k3s-自动备份与恢复-openlist-webdav.md) 准备 **WebDAV 端点与凭据**;清单真源见 `ansible/files/06-03/`。 - -### B3. 
部分验证补全为「已验证」(已有集群即可,偏手工/浏览器) - -- **01-07**:**onecloud**(或文档约定第三方机)对 OpenWrt **18080/18443** 做 curl;`scripts/.env.verify.example` 中 **`ONECLOUD_SSH`**。 -- **02-00**:通读说明文档,与 02-01~02-05 结论对齐。 -- **03-01**:按文档 apply `ansible/files/03-01/`,**浏览器**验收 Dashboard(非仅 Deployment 存在)。 -- **03-03**:实机 apply `ansible/files/03-03/` 并验收。 -- **03-05**:playbook 层已有较完整验证;若要更“真验证”:按文档补全「业务读写/边界」。 -- **03-07**:playbook 已能装删 Longhorn;要 ✅:按 [`03-07-k3s-longhorn-持久化存储.md`](03-07-k3s-longhorn-持久化存储.md) 做 **PVC 读写、副本/故障** 等文档级验收(`longhorn_force_crd_reset` 仅在 CRD 与 Helm 严重冲突时于 `group_vars` 设为 `true`)。 - -### B4. Node.js 系列(04-01~04-14) - -- **共用准备**:可访问的 **镜像仓库**(或私有 registry)、`nodejs_entry_base`、足够节点与资源做调度/HPA。 -- **04-01**:已部分验证;04-02~04-14 当前 **`verify.sh` 多为 noop**(仅目录/文档存在性)。要逐项 ✅:**按各篇文档手工或用例化命令验证**,并在 `ansible/playbooks/verify/` **补 `04-0x.yml` 真实 deploy/verify/teardown**(工作量最大的一块)。 -- **04-13**:集群需 **metrics-server**;准备压测工具以触发 HPA。 -- **04-14**:依赖 **GitLab CI / GitOps** 任选一条实链路(与 05-03/05-04/03-09 联动)。 - -### B5. 应用与监控(05-01~05-09) - -- **共用**:镜像拉取、持久化存储类(Longhorn/local-path/NFS)、Ingress 入口与 DNS(若对外)。 -- **05-02**:**armv7 段 + K3s 段** 分两环境验证;arm 段与 **§E** 相同:默认跳过远程步骤,按需 `SKIP_ARMV7=0` + SSH 变量启用 01-03/01-05 类自动化后,再在 K3s 侧验证。 -- **05-03~05-04**:**大内存/磁盘** 规划、GitLab **域名/证书**、Runner **注册 token**、测试仓库与 `.gitlab-ci.yml`。 -- **05-05**:Prometheus Operator 资源与 **Grafana 管理员密码**;Ingress 或 NodePort 访问策略。 -- **05-06**:真实 **网盘凭据**、备份目标目录与 Cron 窗口。 -- **05-07~05-09**:x86 Docker 主机、K3s 内 **镜像构建/拉取**、OpenClaw 相关 **密钥与入口**;05-09 需 apply 示例清单并验收页面。 - -### B6. 运维与概念(06-02、03-09) - -- **06-02**:经验文档;「验证」可定义为巡检/备份 SOP 在现网执行一轮并在文档里记录结论。 -- **03-09**:先 **选定 Argo CD 或 Flux** 并落地最小 GitOps 回路,再谈 ✅。 - ---- - -## C. 建议执行顺序(减少重复准备) - -```mermaid -flowchart TD - base[全局 SSH 与 env.verify] - nfs[NFS 与 03-06] - acme[ACME 与 03-02] - node[04-01 基线加扩 04-02 起] - apps[05-xx 大应用] - ha[HA 与 CNI 实验集群] - base --> nfs - base --> acme - base --> node - acme --> node - nfs --> apps - base --> ha -``` - -1. 
先固化 **A** + **03-06 NFS**(若要做存储类应用)。 -2. 再做 **03-02 ACME**,解锁 **04-12** 与 TLS 矩阵深度验收。 -3. **04-02~04-11** 在 **04-01** 基线上增量加 playbook 或手工矩阵。 -4. **05-03/05-04**、**05-05** 单独排期(资源与时间最长)。 -5. **HA / 07-xx** 独立维护窗口与回滚。 - ---- - -## D. 与「自动化」对齐的预期 - -- **`verify.sh run XX-YY` 已通过** 只保证 **该 playbook 已实现** 的步骤通过;**noop** 文档不会替你完成文档内全部操作。 -- 若你要做“已验证”记录,建议在对应实验篇文档里写清:**环境、日期、是「仅脚本」还是「脚本 + 手工浏览器/第三方机」**。 - ---- - -## E. armv7 / arm32(可选经 `verify.sh` + SSH 远程安装) - -实验室矩阵里的 **01-03**(Docker)、**01-05**(NFS)、**05-02** 中的 arm 段等,依赖 **32 位 ARM(文档多写 armv7)实机**,与四节点 x86_64 K3s 主线 **不在同一 inventory**。 - -| 项 | 说明 | -|------|------| -| 默认 | **`SKIP_ARMV7=1`(或未设)**:`01-03.yml`、`01-05.yml` 仍跑矩阵基线(含 `_noop-tasks.yml` 文档/文件检查),**不经 SSH 改 arm 机**。 | -| 启用远程步骤 | **`SKIP_ARMV7=0`** 且 **`ARMV7_SSH`** 为一行可执行的 `ssh ...`(BatchMode 建议与 [`scripts/.env.verify.example`](../scripts/.env.verify.example) 一致):`verify.sh run 01-03` 会经该 SSH 在 arm 上 **dnf 装 docker** 并校验;`run 01-05` 用 **`ARMV7_NFS_SSH`**(若为空则回退 **`ARMV7_SSH`**)装 **nfs-utils**、写 **`/etc/exports`**(路径/客户端网段见 **`ARMV7_NFS_EXPORT_PATH`**、**`ARMV7_NFS_CLIENT_SUBNET`**,默认 `/sdcard` 与 `192.168.2.0/24`)。 | -| 约束 | 远程路径假定 **Fedora/RHEL 系 + dnf**;**Debian/apt 未分支**。`verify.sh` 在 `source scripts/.env.verify` 后调用 `ansible-playbook`,子进程继承上述环境变量。 | -| 门控 | **`SKIP_ARMV7=0` 但未配置有效 SSH** 时,01-03 / 01-05 会 **fail**,避免 `run-all` / `full` 在误以为已启用 arm 时静默跳过。 | -| 手工 | 仍可按 [`01-03-armv7-standalone-docker.md`](01-03-armv7-standalone-docker.md)、[`01-05-armv7-nfs服务安装.md`](01-05-armv7-nfs服务安装.md) 全手工走通后在文档中记录环境与结论。 | - -## 排障 - -- **你在找执行命令**:本文为说明/索引;执行入口见 `00-00-构建总览.md` 与 `scripts/README.md`。 -- **verify.sh 报错**:先跑 `./scripts/verify.sh preflight`,再根据提示修复 inventory/SSH/变量。 diff --git a/docs/00-04-验证状态板.md b/docs/00-04-验证状态板.md new file mode 100644 index 0000000..2b781ad --- /dev/null +++ b/docs/00-04-验证状态板.md @@ -0,0 +1,83 @@ +# 00-04-验证状态板(自动生成视图) + +> 本页为**只读视图**:用于快速查看「已验证/未验证/门控/失败」。 +> **执行真源**仍以 `ansible/playbooks/verify/*.yml` 为准;本页不承载执行逻辑。 + +- 
最近生成时间(UTC):`2026-03-27T16:03:16+00:00` +- 本地结果缓存(不入库):`.status/verify-results.json` + +## 快速更新 + +在仓库根执行: + +```bash +# 仅渲染(不跑真机验证,按缓存/静态信息生成) +python3 ansible/tools/status_board.py render + +# 真机跑一轮并写入缓存(会执行 verify playbook) +python3 ansible/tools/status_board.py update --all +python3 ansible/tools/status_board.py render +``` + +## 状态表 + +| doc_id | 状态 | noop | rc | last_update | docs | playbook | files | +|---|---|---:|---:|---|---:|---:|---:| +| 01-01 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 01-02 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 01-03 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 01-04 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 01-05 | 🟡 gated | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 01-06 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 01-07 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 02-01 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 02-02 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 02-03 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 02-04 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 02-05 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 03-01 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 03-02 | 🟡 gated | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 03-03 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 03-04 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 03-05 | 🟡 gated | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 03-06 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 03-07 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 03-08 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 03-09 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 03-10 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 
04-01 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 04-02 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 04-03 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 04-04 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 04-05 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 04-06 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 04-07 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 04-08 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 04-09 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 04-10 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 04-11 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 04-12 | 🟡 gated | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 04-13 | ✅ verified | | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 04-14 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 05-01 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 05-02 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 05-03 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 05-04 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 05-05 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 05-06 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 05-07 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 05-08 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 05-09 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 06-01 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 06-02 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 06-03 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 07-01 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | +| 07-02 | ✅ verified | Y | 0 | 2026-03-27T15:55:00+00:00 | Y | Y | Y | + +## 口径说明 + +- 
**verified/gated/failed/noop/unknown**:以 verify 输出的 `[OC] ... result=` 为准;缺失 OC 时回退到 legacy 规则。 +- **gated**:必须附带 `missing_dependency` 与 `skip_scope`(见 Output Contract OC2)。 +- **noop**:该 doc_id 的 verify playbook 为 noop 模式(仅基线/存在性/结构检查)。 +- **unknown**:尚未在本机写入结果缓存(或仅静态生成)。 + diff --git a/docs/01-00-安装与基础环境-系列说明.md b/docs/01-00-安装与基础环境-系列说明.md index 9b03d69..808aaaa 100644 --- a/docs/01-00-安装与基础环境-系列说明.md +++ b/docs/01-00-安装与基础环境-系列说明.md @@ -4,9 +4,9 @@ ## TL;DR -- **从零装集群(推荐自动化)**:`./scripts/deploy-lab.sh k3s`(文档:`01-06`) +- **从零装集群(推荐自动化)**:`./ansible/bin/deploy-lab.sh k3s`(文档:`01-05`) - **本页定位**:仅系列导航,不参与 `verify.sh run-all/full` -- **子篇执行入口**:按下表执行 `./scripts/verify.sh run <doc_id>` +- **子篇执行入口**:按下表执行 `./ansible/bin/verify.sh run <doc_id>` - **阅读完成判据**:能按索引定位到对应子篇并完成执行 ## 范围与非目标 @@ -19,13 +19,13 @@ | doc_id | 主题 | 子篇执行入口 | | ------ | --------------------------- | ------------------------------- | -| 01-06 | 节点初始化与 k3s 自动安装(Ansible) | `./scripts/verify.sh run 01-06` | -| 01-01 | 控制节点安装(含 Traefik) | `./scripts/verify.sh run 01-01` | -| 01-02 | 工作节点加入与验证 | `./scripts/verify.sh run 01-02` | -| 01-03 | armv7 standalone docker(可选) | `./scripts/verify.sh run 01-03` | -| 01-05 | armv7 NFS 服务安装(可选) | `./scripts/verify.sh run 01-05` | -| 01-07 | OpenWrt + HAProxy(可选) | `./scripts/verify.sh run 01-07` | -| 01-08 | 双控制节点 HA:安装与准备(可选) | `./scripts/verify.sh run 01-08` | +| 01-05 | 节点初始化与 k3s 自动安装(Ansible) | `./ansible/bin/verify.sh run 01-05` | +| 01-01 | 控制节点安装(含 Traefik) | `./ansible/bin/verify.sh run 01-01` | +| 01-02 | 工作节点加入与验证 | `./ansible/bin/verify.sh run 01-02` | +| 01-03 | armv7 standalone docker(可选) | `./ansible/bin/verify.sh run 01-03` | +| 01-04 | armv7 NFS 服务安装(可选) | `./ansible/bin/verify.sh run 01-04` | +| 01-06 | OpenWrt + HAProxy(可选) | `./ansible/bin/verify.sh run 01-06` | +| 01-07 | 双控制节点 HA:安装与准备(可选) | `./ansible/bin/verify.sh run 01-07` | ## 真源位置 diff --git a/docs/01-01-k3s-控制节点含traefik.md b/docs/01-01-k3s-控制节点含traefik.md index fe234f8..9f4055d 100644 --- 
a/docs/01-01-k3s-控制节点含traefik.md +++ b/docs/01-01-k3s-控制节点含traefik.md @@ -2,12 +2,17 @@ ## TL;DR -- **自动化验收**:在控制端(如 `ylc65`)执行 `./scripts/verify.sh run 01-01` +- **自动化验收**:在控制端(如 `ylc65`)执行 `./ansible/bin/verify.sh run 01-01` - **手工安装**:控制节点执行 `curl -sfL https://get.k3s.io | sh -s - server --data-dir=/storage`(或默认路径) - **成功判据**:node 为 `Ready`;`kube-system` 中 `coredns` / `traefik` Deployment 存在;Traefik 入口可响应(常见为 `404`) - **失败排障**:见本文「排障」小节(事件/Pod/日志/磁盘压力) -> 说明:本篇聚焦 **单控制节点安装与基础验收**。若要一键自动化安装多节点集群,见 `01-06-节点初始化-ansible-实践.md`。 +> 说明:本篇聚焦 **单控制节点安装与基础验收**。若要一键自动化安装多节点集群,见 `01-05-节点初始化-ansible-实践.md`。 + +## 契约与真源 + +- **安装命令备忘**:`ansible/files/01-01/k3s-server-install.example.sh`(与下文 curl 片段一致;非 K8s 清单)。 +- **自动**:`./ansible/bin/verify.sh run 01-01`(专用 kubectl 断言;与手工安装共用 `ansible/files/01-01/` 索引)。 ## 前置条件 @@ -25,7 +30,7 @@ K3s 默认将数据(含 local-path 卷)放在 `--data-dir` 下。系统盘 | **方案一(默认)** | `/var/lib/rancher/k3s` | 系统盘空间充足 | | **方案二(数据盘)** | `/storage` | 系统盘小,数据盘单独挂载在 `/storage` | -> 自定义 `/storage` 仅解决单节点内系统盘/数据盘分离;节点或数据盘重建后数据不会自动迁移,高可用与备份见 `01-08`、`06-03`。 +> 自定义 `/storage` 仅解决单节点内系统盘/数据盘分离;节点或数据盘重建后数据不会自动迁移,高可用与备份见 `01-07`、`06-03`。 ## 操作步骤 @@ -47,7 +52,7 @@ curl -sfL https://get.k3s.io | sh - curl -sfL https://get.k3s.io | sh -s - server --data-dir=/storage ``` -- 使用方案二时,token 路径为 `/storage/server/token`(供 01-02 工作节点加入与 01-08 HA 使用)。 +- 使用方案二时,token 路径为 `/storage/server/token`(供 01-02 工作节点加入与 01-07 HA 使用)。 ## 配置 kubectl(供当前用户使用) @@ -158,11 +163,11 @@ curl -I --max-time 3 http://127.0.0.1:80 ## 清理 -本篇为安装类文档:手工安装后一般 **不卸载 K3s**,而是继续后续实验。若你仅为排障临时验收,可在运行 `./scripts/verify.sh run 01-01` 时设 `VERIFY_TEARDOWN=0` 保留现场(本篇用例默认不做破坏性清理)。 +本篇为安装类文档:手工安装后一般 **不卸载 K3s**,而是继续后续实验。若你仅为排障临时验收,可在运行 `./ansible/bin/verify.sh run 01-01` 时设 `VERIFY_TEARDOWN=0` 保留现场(本篇用例默认不做破坏性清理)。 ## 排障 -- **节点不 Ready / DiskPressure**:优先确认 `/storage` 为独立挂载点(见 `docs/00-04-部署环境说明.md`),再看 `df -h`、`kubectl describe node ` 事件。 +- **节点不 Ready / DiskPressure**:优先确认 `/storage` 为独立挂载点(见 `docs/00-02-部署环境说明.md`),再看 `df 
-h`、`kubectl describe node ` 事件。 - **Traefik 不就绪**:`kubectl -n kube-system get pods -o wide`;必要时 `kubectl -n kube-system logs deploy/traefik --tail=200`。 - **CoreDNS 解析异常(影响后续 ACME)**:见下节「CoreDNS 上游 DNS」。 @@ -184,7 +189,7 @@ forward . 223.5.5.5 8.8.8.8 然后重启 CoreDNS:`kubectl -n kube-system rollout restart deploy/coredns` -> 若使用 Ansible 一键安装(`01-06`),playbook 已自动完成此配置,无需手动修改。 +> 若使用 Ansible 一键安装(`01-05`),playbook 已自动完成此配置,无需手动修改。 ## 下一步 diff --git a/docs/01-02-k3s-工作节点.md b/docs/01-02-k3s-工作节点.md index a21db3d..04ecd67 100644 --- a/docs/01-02-k3s-工作节点.md +++ b/docs/01-02-k3s-工作节点.md @@ -3,11 +3,16 @@ > 本文已合并原 `01-02-k3s-工作节点.md`。 > 目标:完成工作节点加入 + Traefik 入口部署基线,并验证「**入口节点集合**的 `:80` 可达」。 > -> 若需一键自动化安装多节点集群,可直接用 `01-06-节点初始化-ansible-实践.md`。 +> 若需一键自动化安装多节点集群,可直接用 `01-05-节点初始化-ansible-实践.md`。 + +## 契约与真源 + +- **加入命令备忘**:`ansible/files/01-02/k3s-agent-join.example.sh`(与下文 curl/agent 片段一致)。 +- **自动**:`./ansible/bin/verify.sh run 01-02`(与手工步骤共用 `ansible/files/01-02/README.md` 索引)。 ## TL;DR -- **自动化验收**:在控制端执行 `./scripts/verify.sh run 01-02` +- **自动化验收**:在控制端执行 `./ansible/bin/verify.sh run 01-02` - **手工加入 worker**:在 worker 上按本文执行 `k3s agent ...`(注意 token 路径与 `/storage` 方案) - **成功判据**:`kubectl get nodes` 中 worker 为 `Ready`;`kube-system` 中 Traefik 正常;入口 `:80` 可达(按本文验收命令) - **失败排障**:见本文「排障」小节(token/防火墙/flannel/cni0/调度入口节点) @@ -96,13 +101,13 @@ kubectl label node ylc62 svccontroller.k3s.cattle.io/lbpool=edge --overwrite ### 3.2 Ansible 方式(推荐,集中管理入口节点) -也可以在 [`ansible/group_vars/all.yml`](../ansible/group_vars/all.yml) 中配置入口节点列表 `k3s_ingress_nodenames`(示例:`ylc61`、`ylc62`),由 `01-06.yml`(`-e k3s_do_install=true`)自动打标签。 +也可以在 [`ansible/group_vars/all.yml`](../ansible/group_vars/all.yml) 中配置入口节点列表 `k3s_ingress_nodenames`(示例:`ylc61`、`ylc62`),由 `01-05.yml`(`-e k3s_do_install=true`)自动打标签。 运行: ```bash cd ansible -ansible-playbook -i inventory.ini playbooks/verify/01-06.yml +ansible-playbook -i inventory.ini playbooks/verify/01-05.yml ``` 若 `k3s_ingress_nodenames` 为空(默认),Ansible 
会对**所有节点**打入口标签,与早期行为一致; diff --git a/docs/01-03-armv7-standalone-docker.md b/docs/01-03-armv7-standalone-docker.md index aa4a574..a252fc1 100644 --- a/docs/01-03-armv7-standalone-docker.md +++ b/docs/01-03-armv7-standalone-docker.md @@ -2,44 +2,81 @@ > armv7 节点不加入 K3s,单独运行 Docker 服务(NFS、OneNav、openlist 等)。 +## 契约与真源 + +- **索引**:`ansible/files/01-03/README.md`(本篇无通用 K8s 清单;Docker 步骤以正文为准)。 +- **远程一键(先验后装)**:`ansible/tools/armv7-docker-verify-install.sh`(由 `verify/01-03.yml` 在 `SKIP_ARMV7=0` 时调用)。 +- **自动**:`./ansible/bin/verify.sh run 01-03`。 ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 01-03` -- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP -- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **自动化验收**:`./ansible/bin/verify.sh run 01-03` +- **关键前置**:arm 机网络可达;`SKIP_ARMV7=0` 时需配置 `ARMV7_SSH`(见 `ansible/env/.env.verify.example`) +- **成功判据(推荐)**:远程 **`docker info`** 成功即视为 Docker 可用;脚本策略为 **先探测 `docker info`,成功则跳过安装,失败再安装** - **排障**:见本文「排障」 +## 官方安装脚本(Docker CE) + +安装脚本由 **Docker, Inc.** 维护,仓库说明见 [docker/docker-install](https://github.com/docker/docker-install)(即 **get.docker.com** / test.docker.com 所用脚本)。用途是**在支持的 Linux 发行版上快速安装最新 Docker CE**,**不建议**在生产环境把「长期依赖此脚本」当作唯一部署方式;更完整发行版说明见 [Docker 安装文档](https://docs.docker.com/engine/install/)。 + +在目标主机上(本机或 SSH 登录后)的典型用法与官网一致: + +```bash +curl -fsSL https://get.docker.com -o get-docker.sh +sh get-docker.sh +``` + +非 root 用户通常需 `sudo sh get-docker.sh`。脚本会处理包管理器与服务启用(具体行为因发行版而异)。 + +## 推荐流程:先验证,再决定是否安装 + +1. 在 arm 主机执行 **`docker info`**(或经 SSH 执行)。 +2. **若已成功**(命令退出码 0):视为已满足本篇目标,**无需**再跑安装脚本。 +3. 
**若失败**(未安装或 daemon 未起):再执行上节 **get.docker.com** 流程,然后再次 **`docker info`** 确认。 + +仓库已将上述逻辑封装为脚本(便于与 Ansible、`ARMV7_SSH` 一致): + +```bash +# 在仓库根目录;ARMV7_SSH 为一行可执行的 ssh(建议 BatchMode) +ARMV7_SSH='ssh -o BatchMode=yes user@arm-host' ./ansible/tools/armv7-docker-verify-install.sh +# 或 +./ansible/tools/armv7-docker-verify-install.sh 'ssh -o BatchMode=yes user@arm-host' +``` + ## 前置条件 -- armv7 节点网络可达 -- 系统可安装 Docker +- armv7 / arm32 节点网络可达(安装阶段需访问 **get.docker.com**) +- SSH 目标用户:若为非 root,需 **`sudo` 免密**或已配置好非交互执行(脚本在非 root 下用 `sudo sh /tmp/get-docker.sh`) -## 操作步骤 +## 操作步骤(手工摘要) -1. 安装 Docker -2. 启用并启动 Docker 服务 -3. 按需部署业务容器 +1. 按上节 **先 `docker info`,失败再 `get-docker.sh`** 完成 Docker。 +2. 启用并确认 **`systemctl status docker`**(脚本通常已处理;若未启用需按发行版处理)。 +3. 按需部署业务容器。 ## 验证命令 ```bash +docker info docker version docker ps ``` +**验收口径**:以 **`docker info`** 成功为主(与自动化脚本一致)。 + ## 预期 -- Docker 可用 +- **`docker info`** 无错误,Docker daemon 可用 - 容器可正常启动 ## 下一步 - `05-02-onenav首页面板.md` -- `01-05-armv7-nfs服务安装.md` +- `01-04-armv7-nfs服务安装.md` ## 排障 -- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 -- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 -- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 +- **先看 playbook / 脚本输出**:区分「已跳过安装」与「执行了 get-docker.sh」。 +- **curl / TLS**:arm 机需能访问 `https://get.docker.com`。 +- **sudo**:非 root SSH 时若安装失败,检查 `sudo -n true` 与免密配置。 +- **集群侧**(本篇不依赖 K3s):若在 verify 矩阵里同时跑 k3s 基线,仍见 `kubectl` 相关步骤属矩阵共用断言,与 arm Docker 无直接关系。 diff --git a/docs/01-05-armv7-nfs服务安装.md b/docs/01-04-armv7-nfs服务安装.md similarity index 98% rename from docs/01-05-armv7-nfs服务安装.md rename to docs/01-04-armv7-nfs服务安装.md index c79c249..4d03aae 100644 --- a/docs/01-05-armv7-nfs服务安装.md +++ b/docs/01-04-armv7-nfs服务安装.md @@ -1,11 +1,11 @@ -# 01-05-armv7 NFS 服务安装 +# 01-04-armv7 NFS 服务安装 > 本文只讲 armv7 主机侧 NFS 服务安装与导出配置,目标是把 **`/sdcard`** 作为 NFS 共享目录导出给 K3s 节点使用。 ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 01-05` +- **自动化验收**:`./ansible/bin/verify.sh run 01-04` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/docs/01-06-节点初始化-ansible-实践.md b/docs/01-05-节点初始化-ansible-实践.md similarity index 78% rename from docs/01-06-节点初始化-ansible-实践.md rename to docs/01-05-节点初始化-ansible-实践.md index e36ff4f..cae33d6 100644 --- a/docs/01-06-节点初始化-ansible-实践.md +++ b/docs/01-05-节点初始化-ansible-实践.md @@ -1,14 +1,19 @@ -# 01-06-节点初始化与 k3s 自动安装(Ansible 实践) +# 01-05-节点初始化与 k3s 自动安装(Ansible 实践) > 目标:给一组已经装好 OS、可以 SSH 的裸金属/虚机,**一键完成基础初始化 + 安装 k3s server/worker**,得到与 `01-01`、`01-02` 文档一致的集群(含 `/storage` 数据盘方案)。 > > **状态:已验证**(2026-03,Fedora + K3s,4 节点 61~64)。 > 部署环境详见 `00-02-部署环境说明.md`。 +## 契约与真源 + +- **索引**:`ansible/files/01-05/README.md`(执行真源为 `ansible/playbooks/verify/01-05.yml` 与 `deploy-lab.sh`)。 +- **自动**:`./ansible/bin/deploy-lab.sh k3s` 或 `./ansible/bin/verify.sh run 01-05`。 + ## TL;DR -- **一键安装**:`./scripts/deploy-lab.sh k3s` -- **一键验收**:`./scripts/verify.sh run 01-06`(或直接 `./scripts/verify.sh full`) +- **一键安装**:`./ansible/bin/deploy-lab.sh k3s` +- **一键验收**:`./ansible/bin/verify.sh run 01-05`(或直接 `./ansible/bin/verify.sh full`) - **关键前置**:控制端可 SSH 所有节点;`ansible/inventory.ini` 私钥路径存在且权限正确;(可选)每台节点已挂载 `/storage` 或启用 
`K3S_PREPARE_STORAGE=true` - **成功判据**:所有节点 `Ready`;kube-system 核心组件就绪;后续按 `02-05` 可跑入口验证 - **失败排障**:见本文「排障」小节(SSH/私钥、/storage、firewalld、k3s service) @@ -30,7 +35,7 @@ - **数据盘**:若使用 `/storage` 方案,每台节点须将**独立数据盘**挂载到 `/storage`(与 `/` 不同设备),详见 `00-04` 与下文「数据盘准备」。 - 不覆盖: - 从「完全裸铁 + 无系统」开始的 PXE 装机; - - 高级 HA(多 server + 外部 datastore)——仍按 `01-08`、`03-08` 执行。 + - 高级 HA(多 server + 外部 datastore)——仍按 `01-07`、`03-08` 执行。 ### 1.1 数据盘准备(手工,或与自动化二选一) @@ -51,15 +56,15 @@ XFS 用户将 `mkfs.ext4` / `fstab` 类型改为 `xfs` 即可(Longhorn 支持 **自动化(可选)**:在 `group_vars/all.yml` 中设置 `k3s_prepare_storage: true` 与 `k3s_data_disk_device: /dev/vdb`(四台盘符一致时一条即可;不一致则用 `host_vars/.yml` 覆盖),然后执行: ```bash -ansible-playbook -i inventory.ini playbooks/verify/01-06.yml +ansible-playbook -i inventory.ini playbooks/verify/01-05.yml ``` 该 playbook 在 `/storage` 已是独立挂载时会跳过,避免重复执行。 ### 1.2 推荐执行顺序(10G + 32G 四节点) -1. (可选)`playbooks/verify/01-06.yml` -2. `playbooks/verify/01-06.yml`(可在 `group_vars` 中设 `k3s_verify_storage_mount: true` 强制校验 `/` 与 `/storage` 不同源) +1. (可选)`playbooks/verify/01-05.yml` +2. `playbooks/verify/01-05.yml`(可在 `group_vars` 中设 `k3s_verify_storage_mount: true` 强制校验 `/` 与 `/storage` 不同源) 3. (可选)`playbooks/verify/03-07.yml`(Helm,见 `03-07`) 4. 
(可选)`playbooks/verify/03-05.yml`,或 `longhorn_apply_local_path_lab: true` 随 Longhorn 一并应用(真源:`files/kube-system/local-path-config-lab.json`,见 `03-05`) @@ -75,7 +80,7 @@ ansible/ all.yml playbooks/ verify/ - 01-06.yml # 标准 IPv4 安装(-e k3s_do_install=true);可选准备数据盘(-e k3s_do_prepare_storage=true) + 01-05.yml # 标准 IPv4 安装(-e k3s_do_install=true);可选准备数据盘(-e k3s_do_prepare_storage=true) 03-07.yml # 可选:Helm 安装 Longhorn 03-05.yml # 可选:仅应用 local-path 实验室 ConfigMap(-e local_path_apply_lab_config=true) files/ @@ -124,11 +129,11 @@ k3s_worker **存储挂载校验**(推荐实验室开启): -- `k3s_verify_storage_mount: true`:在 `01-06.yml` 安装 k3s(`-e k3s_do_install=true`)**之前**,断言 `/storage` 为挂载点且与 `/` 不同块设备;失败时提示查阅 `00-04`。已有「目录式假 /storage」的旧环境可临时设为 `false`。 +- `k3s_verify_storage_mount: true`:在 `01-05.yml` 安装 k3s(`-e k3s_do_install=true`)**之前**,断言 `/storage` 为挂载点且与 `/` 不同块设备;失败时提示查阅 `00-04`。已有「目录式假 /storage」的旧环境可临时设为 `false`。 **数据盘自动化**(可选): -- `k3s_prepare_storage: true` 且 `k3s_data_disk_device: /dev/vdb`:由 `01-06.yml -e k3s_do_prepare_storage=true` 执行(见 §1.1)。 +- `k3s_prepare_storage: true` 且 `k3s_data_disk_device: /dev/vdb`:由 `01-05.yml -e k3s_do_prepare_storage=true` 执行(见 §1.1)。 ## 5. 
执行流程概览 @@ -138,7 +143,7 @@ playbook 依次执行: |------|------|------| | 1 | Init | 时区、基础包、/etc/hosts、**firewalld 开放 8472/udp(全部节点)与 6443/tcp(仅 server)** | | 2 | Install server | 安装 k3s server(`--data-dir=/storage`) | -| 3 | Install agent | 逐台安装 worker(`serial: 1`,`async/poll` 防止卡死) | +| 3 | Install agent | 逐台安装 worker(`serial: 1`);随后在 **server** 上 `kubectl wait` 各 worker Ready(不在 worker 上 `delegate_to` server,避免 SSH 路径异常) | | 4 | Firewalld 基线 | 等待 flannel.1/cni0 出现(最多 120s),加入 trusted zone | | 5 | **CoreDNS(可选)** | 当 `k3s_manage_coredns: true` 时,将 forward 改为 IPv4(223.5.5.5 8.8.8.8),避免 ACME 解析 Let's Encrypt 失败 | | 6 | Traefik 标签 | 从集群动态获取节点名,打 enablelb/lbpool 标签 | @@ -152,7 +157,7 @@ playbook 依次执行: - **Traefik 标签**:使用 `kubectl get nodes -o jsonpath` 获取实际节点名,不依赖 inventory 主机名与 K8s 节点名一致; - **CoreDNS(可选)**:宿主机若使用 IPv6 DNS(如运营商分配的 `240e:...`),Pod 网络仅 IPv4 时 CoreDNS 无法访问上游,导致 Traefik ACME 无法解析 Let's Encrypt 域名。playbook 会将 `forward . /etc/resolv.conf` 改为 `forward . 223.5.5.5 8.8.8.8`,详见 `03-02` 常见问题。 - **角色标签(可选)**:playbook 默认只打 enablelb/lbpool,**不打** `node-role.kubernetes.io/control-plane` 与 `node-role.kubernetes.io/worker`。若需 `03-01` / `03-03` nginx 矩阵的 M1/M3 能调度,可开启 `k3s_manage_role_labels` 并配置控制节点/工作节点名列表(见下),或安装后在控制节点按 01-02 可选步骤手动打标。 -- **Agent 安装**:token 通过 `slurp` 从 server 读取,`delegate_to` 到 server 执行。 +- **Agent 安装**:token 在 **Install server** 阶段于 server 上 `slurp`;各 worker 本机执行 `get.k3s.io` 安装 agent。**等待 worker Ready** 使用独立 play(`hosts: k3s_server`)执行 `kubectl wait`,与「控制机 → server」的 SSH 路径一致,避免在 worker 任务内 `delegate_to` 控制机时出现 `UNREACHABLE [worker -> server]`(如对 `192.168.2.61:22` 超时)。 ## 6. 
使用方式 @@ -167,9 +172,9 @@ playbook 依次执行: ```bash cd ansible # (可选)先准备数据盘挂载 /storage -# ansible-playbook -i inventory.ini playbooks/verify/01-06.yml +# ansible-playbook -i inventory.ini playbooks/verify/01-05.yml # 标准 IPv4 安装 -ansible-playbook -i inventory.ini playbooks/verify/01-06.yml +ansible-playbook -i inventory.ini playbooks/verify/01-05.yml # (可选)Helm 安装 Longhorn # ansible-playbook -i inventory.ini playbooks/verify/03-07.yml ``` @@ -203,7 +208,9 @@ KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get pods -n kube-system -o wide ## 排障 -- **Ansible 连不上节点**:先在控制端跑 `./scripts/verify.sh preflight`;检查 `ansible/inventory.ini` 主机名/IP、`ansible_user`、私钥路径与权限(600)。 -- **/storage 校验失败**:确认每台节点 `/storage` 为独立挂载点;必要时先跑 `K3S_PREPARE_STORAGE=true ./scripts/deploy-lab.sh k3s` 或单独跑 `ansible/playbooks/verify/01-06.yml`。 +- **Install k3s worker/server 长时间无输出或最终超时**:任务在拉取 `https://get.k3s.io` 并从 GitHub 下二进制;worker 需能访问外网。可在 `ansible/group_vars/all.yml` 设 **`k3s_install_mirror: cn`**(走安装脚本国内镜像),或调大 **`k3s_install_curl_max_time`** / **`k3s_install_task_timeout`**;`curl` 已带 `--connect-timeout` / `--max-time`,超时后会失败退出而不是无限挂住。 +- **worker 阶段 `UNREACHABLE [ylc62 -> ylc61]` / 连 `192.168.2.61:22` 超时**:多为在 worker 上下文中 `delegate_to` 到 server(ylc61)时连接行为与预期不符。当前 `01-05.yml` 已改为在 **`hosts: k3s_server`** 的独立 play 里 `kubectl wait` worker;若仍失败,在控制端单独 `ssh root@192.168.2.61`,并在 server 上执行 `kubectl get nodes` 排查。 +- **Ansible 连不上节点**:先在控制端跑 `./ansible/bin/verify.sh preflight`;检查 `ansible/inventory.ini` 主机名/IP、`ansible_user`、私钥路径与权限(600)。 +- **/storage 校验失败**:确认每台节点 `/storage` 为独立挂载点;必要时先跑 `K3S_PREPARE_STORAGE=true ./ansible/bin/deploy-lab.sh k3s` 或单独跑 `ansible/playbooks/verify/01-05.yml`。 - **kube-system 组件不就绪**:在 server 上 `journalctl -u k3s -n 200 --no-pager`,以及 `kubectl -n kube-system get pods -o wide`/`describe` 查看事件。 diff --git a/docs/01-07-openwrt-haproxy.md b/docs/01-06-openwrt-haproxy.md similarity index 72% rename from docs/01-07-openwrt-haproxy.md rename to docs/01-06-openwrt-haproxy.md index f277504..e4bcde9 100644 --- 
a/docs/01-07-openwrt-haproxy.md +++ b/docs/01-06-openwrt-haproxy.md @@ -1,11 +1,25 @@ -# 01-07 OpenWrt HAProxy 负载均衡 +# 01-06 OpenWrt HAProxy 负载均衡 > 在 OpenWrt 上安装并配置 HAProxy,将 80/443 流量转发到 K3s 集群节点(Traefik 入口),实现单一入口与负载均衡。 +## 契约与真源(HAProxy 配置文件) + +**HAProxy 示例配置的唯一真源**:本仓库 **[`ansible/files/01-06/`](../ansible/files/01-06/)**(与文档、脚本共用;上 OpenWrt 前请按实际节点 IP/端口改 `server` 与 `bind`)。 + +| 文件 | 说明 | +|------|------| +| [`haproxy-no-check.cfg`](../ansible/files/01-06/haproxy-no-check.cfg) | 最简:无健康检查;HTTP 18080 + TCP 443 透传 18443(见 §2) | +| [`haproxy-http.cfg`](../ansible/files/01-06/haproxy-http.cfg) | 80 明文 HTTP 健康检查(`option httpchk`)(§3.2) | +| [`haproxy-tls.cfg`](../ansible/files/01-06/haproxy-tls.cfg) | 443 `mode tcp` + TLS 握手检查(§3.3) | +| [`haproxy-https.cfg`](../ansible/files/01-06/haproxy-https.cfg) | 443 应用层 HTTPS 检查(HAProxy 终结 TLS 场景)(§3.4) | +| [`haproxy-proxy-http-tls.cfg`](../ansible/files/01-06/haproxy-proxy-http-tls.cfg) | HTTP/TLS 检查 + `send-proxy-v2`(§5) | + +OpenWrt 上实际生效路径多为 **`/etc/haproxy.cfg`** 或 **`/etc/haproxy/haproxy.cfg`**,以 **`/etc/init.d/haproxy`** 为准;将上述文件**复制过去**或合并片段后执行 `haproxy -c -f <路径>` 校验。 ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 01-07` +- **HAProxy 配置**:见上表 **[`ansible/files/01-06/`](../ansible/files/01-06/)** `*.cfg` +- **自动化验收**:`./ansible/bin/verify.sh run 01-06` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -13,7 +27,7 @@ ## 前置条件 - OpenWrt 与 K3s 节点同网段(如 192.168.2.0/24),OpenWrt 通常为网关(如 192.168.2.1) -- 已完成 `01-02-k3s-工作节点.md` 或 `01-06`,Traefik 入口 80/443 已在各节点可达 +- 已完成 `01-02-k3s-工作节点.md` 或 `01-05`,Traefik 入口 80/443 已在各节点可达 ## 1. 
安装 HAProxy @@ -28,9 +42,9 @@ opkg install haproxy 编辑 `/etc/haproxy.cfg` 或包提供的配置路径(部分 OpenWrt 使用 `/etc/haproxy/haproxy.cfg`)。可在 `/etc/init.d/haproxy` 中查看实际配置文件路径。 -**配置目录说明与「cfg 是否正确」的验证层次**:见 `ansible/files/01-07/`(**仅语法**:`./scripts/01-07-verify-haproxy.sh --cfg-only`)。 +**配置目录说明与「cfg 是否正确」的验证层次**:见 `ansible/files/01-06/`(**仅语法**:`./scripts/01-06-verify-haproxy.sh --cfg-only`)。 -**无健康检查最简配置**:`ansible/files/01-07/haproxy-no-check.cfg`(与 Ansible 共用,可复制到 OpenWrt 或通过 playbook 下发)。将 `192.168.2.61`~`192.168.2.64` 按实际 K3s 节点 IP 修改。如需健康检查见第 3 节;如需真实客户端 IP 见第 5 节 PROXY Protocol。 +**无健康检查最简配置**:`ansible/files/01-06/haproxy-no-check.cfg`(与 Ansible 共用,可复制到 OpenWrt 或通过 playbook 下发)。将 `192.168.2.61`~`192.168.2.64` 按实际 K3s 节点 IP 修改。如需健康检查见第 3 节;如需真实客户端 IP 见第 5 节 PROXY Protocol。 ## 3. 健康检查 @@ -51,19 +65,19 @@ opkg install haproxy ### 3.2 HTTP(80 明文) -完整配置:`ansible/files/01-07/haproxy-http.cfg`。`backend k3s_http` 开头加 `option httpchk GET /`,`k3s_https` 仍为 TCP 检查。 +完整配置:`ansible/files/01-06/haproxy-http.cfg`。`backend k3s_http` 开头加 `option httpchk GET /`,`k3s_https` 仍为 TCP 检查。 ### 3.3 TLS(443 握手,`mode tcp`) -完整配置:`ansible/files/01-07/haproxy-tls.cfg`。`backend k3s_https` 中加 `option ssl-hello-chk`,做 TLS 握手层检查。 +完整配置:`ansible/files/01-06/haproxy-tls.cfg`。`backend k3s_https` 中加 `option ssl-hello-chk`,做 TLS 握手层检查。 ### 3.4 HTTPS(443 应用层,`mode http` + `ssl`) -完整配置:`ansible/files/01-07/haproxy-https.cfg`。适用于 **HAProxy 在 443 终结 TLS(由 HAProxy 提供证书)** 的场景(frontend 需 `bind *:443 ssl crt ...`)。需与 Traefik 路由匹配的 `Host`;自签/内网 CA 用 `verify none`,生产建议 `ca-file`。若仍为 TCP 透传,用 3.3 即可。 +完整配置:`ansible/files/01-06/haproxy-https.cfg`。适用于 **HAProxy 在 443 终结 TLS(由 HAProxy 提供证书)** 的场景(frontend 需 `bind *:443 ssl crt ...`)。需与 Traefik 路由匹配的 `Host`;自签/内网 CA 用 `verify none`,生产建议 `ca-file`。若仍为 TCP 透传,用 3.3 即可。 ## 4. 
启动与验证 -**一键部署**(uhttpd 80/443 + HAProxy 18080/18443):`./scripts/01-07-deploy-openwrt-haproxy.sh`。将 uhttpd 恢复监听 80/443(IPv4+IPv6),HAProxy 部署到 18080/18443,与 LuCI 共存。 +**一键部署**(uhttpd 80/443 + HAProxy 18080/18443):`./scripts/01-06-deploy-openwrt-haproxy.sh`。将 uhttpd 恢复监听 80/443(IPv4+IPv6),HAProxy 部署到 18080/18443,与 LuCI 共存。 ```bash /etc/init.d/haproxy enable @@ -72,7 +86,7 @@ opkg install haproxy 验证:从内网访问 `http://:18080/` 或 `http://:18080/demo-m1/`(家庭私网常用 18080/18443),应能到达 Traefik 与后端。 -**验证**:经 **ssh onecloud**(或你可访问的第三方机器)发起 curl,验证 `http://:18080` 与 `https://<域名>:18443`(HTTPS 需正确设置 Host/SNI,例如 `curl --https-hosts ...`)。不部署、不改端口;需 OpenWrt HAProxy 已按 18080/18443 配置。 +**验证**:经 **Linux 工作机**(本环境如 **`ylc65`**,在 `ansible/env/.env.verify` 里用 **`WORKSTATION_SSH`** 配置一行 `ssh user@ylc65 …`)发起 curl,验证 `http://:18080` 与 `https://<域名>:18443`(HTTPS 需正确设置 Host/SNI,例如 `curl --resolve ...`)。不部署、不改端口;需 OpenWrt HAProxy 已按 18080/18443 配置。 验证通过后,建议你在本篇文档中**手工**补充状态与备注(环境/日期/覆盖范围)。 @@ -80,7 +94,7 @@ opkg install haproxy 若 Traefik 需获取真实客户端 IP,可在 HAProxy 后端每个 `server` 行添加 `send-proxy-v2`,并在 Traefik 配置 `trustedIPs` 包含 OpenWrt 网段(见 `03-02-k3s-traefik-acme.md`)。 -**完整配置**:`ansible/files/01-07/haproxy-proxy-http-tls.cfg`(HTTP 检查 + TLS 检查 + PROXY)。 +**完整配置**:`ansible/files/01-06/haproxy-proxy-http-tls.cfg`(HTTP 检查 + TLS 检查 + PROXY)。 Traefik 端需启用 PROXY protocol 监听并信任 OpenWrt 的 IP,否则会报错。UCI 配置需参考 OpenWrt HAProxy 文档中的相应选项。 diff --git a/docs/01-07-双控制节点ha.md b/docs/01-07-双控制节点ha.md new file mode 100644 index 0000000..9a86487 --- /dev/null +++ b/docs/01-07-双控制节点ha.md @@ -0,0 +1,145 @@ +# 01-07-双控制节点HA(安装与准备) + +## TL;DR + +- **自动化验收(基线)**:`./ansible/bin/verify.sh run 01-07`(只做集群可达性基线;HA 加入/切换需按本文手工演练) +- **你需要准备**:第二个 server、外部 datastore、**面向 `6443` 的负载均衡**(双主控硬前提;**仅两台主控、不想单独跑 HAProxy/nginx 时,优先 [kube-vip](https://kube-vip.io/)**(L2/ARP VIP,见下节))、维护窗口与备份 +- **成功判据**:能按本文完成外部 datastore 与 LB 的准备清单;并在 `03-08` 中完成加入/切换演练 +- **失败排障**:见本文「排障」小节(datastore/LB/tls-san/6443) + +> 本文只讲双控制节点 HA 
的安装前准备与基础环境搭建。 +> 具体集群参数切换、server 加入与迁移步骤见 `03-08-k3s-ha-集群配置与切换.md`。 + +## 前置条件 + +- 已完成 `01-01-k3s-控制节点含traefik.md` +- 已完成 `01-02-k3s-工作节点.md` +- 当前集群运行稳定,可执行维护窗口 + +## 目标与边界 + +- 目标:控制平面单点故障时仍可管理集群 +- 边界:家庭网关(如 OpenWrt)可能仍是整体单点 + +## 双主控与负载均衡 + +**双主控 = 要对 Kubernetes API 做负载均衡。** 至少一个 **稳定对外的入口地址**(或域名),其背后把 **`6443` 流量**落到**多个** `k3s server` 上(多活分摊或主备切换均可),这样 kubelet、kubectl、`K3S_URL` 与证书 `--tls-san` 才能始终指向同一逻辑端点,控制面才算 HA。 + +**HAProxy 不是唯一选项,但「LB 这一层」不能省。** 常见实现: + +| 做法 | 说明 | +|------|------| +| **HAProxy / nginx stream** 等 | 典型 **四层 LB**:TCP 轮询/最少连接等到各 server 的 `6443` | +| **云厂商 / 硬件 LB** | 托管型四层 LB,同上 | +| **kube-vip、Keepalived + VIP** | 常做成 **apiserver VIP**(对外仍是一个 IP,背后与多 master 协同;语义上仍是「单入口 + 高可用后端」) | + +与 **`01-06-openwrt-haproxy.md`** 的区别:**01-06** 的 HAProxy 面向 **HTTP/HTTPS(80/443)→ Traefik 应用入口**;**01-07 / 03-08** 要解决的是 **API Server(6443)的负载均衡**。可在同一台 OpenWrt 上同时部署,但须 **两套监听与后端**,不要与 01-06 的 `18080/18443` 配置混用。 + +### 推荐:只有两台主控、不用独立 HAProxy / nginx + +目标:**不增加**专门跑 HAProxy、nginx stream 的第四台机器,只在 **两个 `k3s server` 节点**上解决「对外一个稳定 `6443` 地址」。 + +**首选 [kube-vip](https://kube-vip.io/)(Layer 2 / ARP 模式,Control Plane VIP)** + +| 项 | 说明 | +|----|------| +| 做什么 | 在**每个**主控节点上跑 kube-vip(常见为 **static pod** 或与 k3s 集成的部署方式),在同网段申请一个 **VIP**(如 `192.168.2.60`);VIP 随节点存活 **漂移**,客户端始终访问 `https://:6443`。 | +| 为何适合双主控 | **不依赖**外部 HAProxy/nginx VM;组件只落在主控上,与「仅两台 server」拓扑一致。 | +| 网络前提 | 两主控与 VIP 须在 **同一二层广播域**(家庭局域网通常满足);VIP 需在网段内 **未被占用**。 | +| 与「多活轮询」 | 常见是 **主备式 VIP**(同一时刻由一台承接 `6443`),故障后漂到另一台;对双主控 HA **足够**。若要 **多活同时分摊** API 连接,才更偏向独立四层 LB。 | +| k3s 侧 | `--tls-san` / 证书须包含 **该 VIP**(及若使用则包含域名);`K3S_URL`、`kubeconfig` 的 server 地址指向 **VIP:6443**。具体安装与清单以官方与 `03-08` 演练为准。 | + +**备选(仍无 HAProxy/nginx)**:两节点 **Keepalived + VIP**,自行维护 `vrrp` 与脚本,语义与 kube-vip 类似,集成度略低。 + +**云上**:若无 L2 条件,可改用 **托管四层 LB** 指到各主控 `6443`(仍属负载均衡,只是不在你自己机器上跑进程)。 + +## 安装准备清单 + +1. 新增第二个 server 节点(示例 `192.168.2.63`) +2. 准备外部数据存储(MySQL/PostgreSQL/etcd) +3. 
准备 **`6443` 负载均衡**:**仅双主控、无独立 LB 机器时优先 kube-vip(L2 VIP)**;否则可选 HAProxy、nginx stream、云 LB 等 +4. 备份现有 token 与关键配置 + +### 免费 PostgreSQL(实验室推荐) + +k3s 的外部 datastore 只需 **兼容的 PostgreSQL**;**PostgreSQL 本体开源**([PostgreSQL License](https://www.postgresql.org/about/licence/)),无商业版授权问题,下列均为 **$0** 落地方式: + +| 方式 | 适合场景 | 说明 | +|------|----------|------| +| **发行版软件包** | 有一台 **独立于 k3s 控制面** 的 Linux(如工作机、ARM 小主机、虚拟机) | Fedora/RHEL:`dnf install postgresql-server` 后 `postgresql-setup --initdb`、`systemctl enable --now postgresql`;Debian/Ubuntu:`apt install postgresql`。建库 `k3s`、用户与密码,在 `pg_hba.conf` 放行 k3s server 网段。 | +| **官方容器镜像 [postgres](https://hub.docker.com/_/postgres)** | 想快速起实例、少动系统包 | `docker` / `podman` 一行起库,例如:`podman run -d --name k3s-pg -e POSTGRES_USER=k3s -e POSTGRES_PASSWORD=strong-password -e POSTGRES_DB=k3s -p 5432:5432 docker.io/library/postgres:16`(版本号可按需调整)。数据卷请挂到持久目录。 | +| **托管免费档(容量/用量有限制)** | 不想自建、可接受 **公网** 与厂商条款 | 常见:**[Neon](https://neon.tech/)**(Serverless Postgres,免费档对 **存储、分支、计算时长** 等有限额,适合轻量/实验);**[Supabase](https://supabase.com/pricing)**(免费档含托管 Postgres,**数据库容量、API 调用、带宽** 等受限)。另有各云「试用/免费层」Postgres,均以 **官网当期配额** 为准。k3s 连接串一般需 **`sslmode=require`**(或厂商文档要求);**家庭实验室**须保证各 `k3s server` 能访问云库主机(出口、防火墙、TLS)。**仅免费档不宜当生产唯一 datastore**(易触顶、休眠策略、合规以厂商为准)。 | + +**注意**:datastore 建议与 **两个 k3s server 网络互通**、且 **不要** 只跑在「即将整体下线的那一台」唯一节点上,否则 datastore 单点与迁移成本更高。 + +### 双控 + datastore 还能不能「数据库集群」? 
+ +**可以。** 双主控解决的是 **Kubernetes API(6443)** 高可用;**外部 PostgreSQL** 这一层同样可以做成 **高可用/集群**,与双控 **不冲突**。 + +- **k3s 侧**:`--datastore-endpoint` 里填的仍是 **一个** 连接目标(`host` 多为 **VIP、LB 虚拟地址、托管库提供的单一 endpoint、或始终解析到当前主的 DNS**)。k3s 只按 PostgreSQL 协议访问,不关心背后是单进程还是多节点。 +- **数据库侧**:常见做法包括 **云托管多可用区 Postgres**(RDS、Cloud SQL、Aurora 兼容端等)、**Patroni + etcd/Consul**、**repmgr** 等;由它们负责 **主备、故障转移**;对 k3s 必须保证故障转移后 **仍连到可写主库**(不要把 endpoint 指到 **只读副本**)。 +- **取舍**:PG 集群能消掉 **datastore 单点**,但 **运维复杂度**明显高于「单实例 Postgres」;家庭实验室可先单库跑通双控,再按需上 PG HA。 + +### 外部 datastore 与 k3s server 最小示例 + +以下只给出一个“最小可参考”的 PostgreSQL + k3s server 参数示意,具体地址/账号请按你自己的环境调整: + +- **若采用 01-01 的数据盘方案**:在 server 参数中增加 `--data-dir=/storage`,与首节点一致(第二个 server 安装时同样需要)。 + +```bash +# 假设外部 PostgreSQL 已创建数据库与账号: +# host=192.168.2.50 dbname=k3s user=k3s password=strong-password + +# 在首个 server(例如 192.168.2.61)上,默认数据目录: +sudo k3s server \ + --datastore-endpoint="postgres://k3s:strong-password@192.168.2.50:5432/k3s?sslmode=disable" \ + --tls-san 192.168.2.61 \ + --tls-san 192.168.2.62 \ + --tls-san 192.168.2.63 \ + --tls-san 192.168.2.60 # 这里示例为 LB IP + +# 若使用数据盘方案,增加 --data-dir=/storage,例如: +# sudo k3s server --data-dir=/storage \ +# --datastore-endpoint="postgres://..." --tls-san ... +``` + +> 说明:上面的命令仅作为参数示意,实际部署时建议改用 systemd unit 或官方安装脚本的额外参数(`INSTALL_K3S_EXEC=...`),并结合 `03-08-k3s-ha-集群配置与切换.md` 中的步骤执行。 + +### 从现有 worker 升级为第二控制节点(推荐路径) + +在家庭实验室环境中,第二个控制节点通常可以直接复用一台已有的 worker 节点。整体思路是: + +1. **确认 worker 节点健康**: + - 已按 `01-02-k3s-工作节点.md` 正常加入集群; + - 无关键 Pod 仅运行在该节点(可先用 `kubectl drain` 或手动迁移工作负载)。 +2. **在 `01-07` 阶段完成外部 datastore 与 LB 准备**: + - 不要立即改动现有 server/worker 的 systemd 配置,只确保 datastore/LB 均已就绪。 +3. 
**在 `03-08` 中按步骤将该 worker 替换为 server**: + - 停止该节点上的 `k3s-agent` 服务(或执行官方卸载脚本); + - 使用与首个 server 相同的 token/datastore/LB 地址重新以 `server` 角色安装 k3s; + - 最终形成“2 个 server + 若干 worker”的目标拓扑。 + +> 具体切换命令与顺序详见:`03-08-k3s-ha-集群配置与切换.md` 中的操作步骤。 + +## 基础验证 + +```bash +kubectl get nodes -o wide +kubectl get pods -A +``` + +## 风险提示 + +- 这是高级改造,建议在业务稳定后执行 +- 执行前务必做完整备份 + +## 下一步 + +- `03-08-k3s-ha-集群配置与切换.md`:加入第二个 server、切换与演练 + +## 排障 + +- **LB 6443 不通**:先在客户端 `curl -k https://:6443/ping`;再在各 server 检查监听与防火墙放行。 +- **加入第二个 server 后 kubeconfig 指向错误地址**:确认 `--tls-san` 包含 LB IP/域名与各 server IP,并更新 kubeconfig server 地址。 +- **外部 datastore 连接失败**:检查连接串、网络 ACL、防火墙、账号权限;在 server 上用 `psql/mysql` 先手工连通再跑 k3s 参数。 + diff --git a/docs/01-08-双控制节点ha.md b/docs/01-08-双控制节点ha.md deleted file mode 100644 index 04ff5fe..0000000 --- a/docs/01-08-双控制节点ha.md +++ /dev/null @@ -1,93 +0,0 @@ -# 01-08-双控制节点HA(安装与准备) - -## TL;DR - -- **自动化验收(基线)**:`./scripts/verify.sh run 01-08`(只做集群可达性基线;HA 加入/切换需按本文手工演练) -- **你需要准备**:第二个 server、外部 datastore、`6443` LB(HAProxy 等)、维护窗口与备份 -- **成功判据**:能按本文完成外部 datastore 与 LB 的准备清单;并在 `03-08` 中完成加入/切换演练 -- **失败排障**:见本文「排障」小节(datastore/LB/tls-san/6443) - -> 本文只讲双控制节点 HA 的安装前准备与基础环境搭建。 -> 具体集群参数切换、server 加入与迁移步骤见 `03-08-k3s-ha-集群配置与切换.md`。 - -## 前置条件 - -- 已完成 `01-01-k3s-控制节点含traefik.md` -- 已完成 `01-02-k3s-工作节点.md` -- 当前集群运行稳定,可执行维护窗口 - -## 目标与边界 - -- 目标:控制平面单点故障时仍可管理集群 -- 边界:家庭网关(如 OpenWrt)可能仍是整体单点 - -## 安装准备清单 - -1. 新增第二个 server 节点(示例 `192.168.2.63`) -2. 准备外部数据存储(MySQL/PostgreSQL/etcd) -3. 准备 `6443` 负载均衡(HAProxy) -4. 
备份现有 token 与关键配置 - -### 外部 datastore 与 k3s server 最小示例 - -以下只给出一个“最小可参考”的 PostgreSQL + k3s server 参数示意,具体地址/账号请按你自己的环境调整: - -- **若采用 01-01 的数据盘方案**:在 server 参数中增加 `--data-dir=/storage`,与首节点一致(第二个 server 安装时同样需要)。 - -```bash -# 假设外部 PostgreSQL 已创建数据库与账号: -# host=192.168.2.50 dbname=k3s user=k3s password=strong-password - -# 在首个 server(例如 192.168.2.61)上,默认数据目录: -sudo k3s server \ - --datastore-endpoint="postgres://k3s:strong-password@192.168.2.50:5432/k3s?sslmode=disable" \ - --tls-san 192.168.2.61 \ - --tls-san 192.168.2.62 \ - --tls-san 192.168.2.63 \ - --tls-san 192.168.2.60 # 这里示例为 LB IP - -# 若使用数据盘方案,增加 --data-dir=/storage,例如: -# sudo k3s server --data-dir=/storage \ -# --datastore-endpoint="postgres://..." --tls-san ... -``` - -> 说明:上面的命令仅作为参数示意,实际部署时建议改用 systemd unit 或官方安装脚本的额外参数(`INSTALL_K3S_EXEC=...`),并结合 `03-08-k3s-ha-集群配置与切换.md` 中的步骤执行。 - -### 从现有 worker 升级为第二控制节点(推荐路径) - -在家庭实验室环境中,第二个控制节点通常可以直接复用一台已有的 worker 节点。整体思路是: - -1. **确认 worker 节点健康**: - - 已按 `01-02-k3s-工作节点.md` 正常加入集群; - - 无关键 Pod 仅运行在该节点(可先用 `kubectl drain` 或手动迁移工作负载)。 -2. **在 `01-08` 阶段完成外部 datastore 与 LB 准备**: - - 不要立即改动现有 server/worker 的 systemd 配置,只确保 datastore/LB 均已就绪。 -3. 
**在 `03-09` 中按步骤将该 worker 替换为 server**: - - 停止该节点上的 `k3s-agent` 服务(或执行官方卸载脚本); - - 使用与首个 server 相同的 token/datastore/LB 地址重新以 `server` 角色安装 k3s; - - 最终形成“2 个 server + 若干 worker”的目标拓扑。 - -> 具体切换命令与顺序详见:`03-08-k3s-ha-集群配置与切换.md` 中的操作步骤。 - -## 基础验证 - -```bash -kubectl get nodes -o wide -kubectl get pods -A -``` - -## 风险提示 - -- 这是高级改造,建议在业务稳定后执行 -- 执行前务必做完整备份 - -## 下一步 - -## 排障 - -- **LB 6443 不通**:先在客户端 `curl -k https://:6443/ping`;再在各 server 检查监听与防火墙放行。 -- **加入第二个 server 后 kubeconfig 指向错误地址**:确认 `--tls-san` 包含 LB IP/域名与各 server IP,并更新 kubeconfig server 地址。 -- **外部 datastore 连接失败**:检查连接串、网络 ACL、防火墙、账号权限;在 server 上用 `psql/mysql` 先手工连通再跑 k3s 参数。 - -- `03-08-k3s-ha-集群配置与切换.md` - diff --git a/docs/02-00-nginx-系列说明.md b/docs/02-00-nginx-系列说明.md index d20bae3..00cc221 100644 --- a/docs/02-00-nginx-系列说明.md +++ b/docs/02-00-nginx-系列说明.md @@ -12,6 +12,14 @@ --- +## 分课真源(02-01~02-04) + +- **`ansible/files/02-01/`~`02-04/`**:各含与 **`ansible/files/02-05/`** 对应课节**同构**的 YAML,专供分篇学习时**解耦**(复制到目标路径 → 改 `nodeSelector`/hostname → `kubectl`/bash)。 +- **手动**:以各课文档中的 `ansible/files/02-XX/*.yaml` 为准,不必使用 verify。 +- **自动**:分课 `./ansible/bin/verify.sh run 02-01` … `02-04`;**四场景一键**仍用 `02-05` 与 `verify.sh run 02-05`。 + +--- + ## 1. 节点与调度(M1~M4 到底落在哪) 本仓库的 nginx 矩阵里,4 个场景 M1~M4 的“落点”不同,主要通过 `nodeSelector` 控制。 diff --git a/docs/02-01-nginx-control-ingress.md b/docs/02-01-nginx-control-ingress.md index cd25d30..7d86f22 100644 --- a/docs/02-01-nginx-control-ingress.md +++ b/docs/02-01-nginx-control-ingress.md @@ -4,7 +4,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 02-01` +- **手动练习**:以本课清单 `ansible/files/02-01/01-control-ingress.yaml` 复制到目标机,按本文改字段后执行下方 `kubectl`/bash(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 02-01` - **你需要准备**:入口节点 `:80` 可达;(可选)`nginx_entry_base=http://<入口IP>` 用于脚本侧 HTTP 校验 - **成功判据**:`/demo-m1/` 返回 `200` 且能区分后端(见本篇“验证命令/预期”与 playbook 断言) - **排障**:见本文「排障」 @@ -21,12 +22,12 @@ 2. 创建 Middleware + Ingress(`/demo-m1` -> nginx-m1:80) 3. 
等待 Pod 与 Ingress 就绪 -示例 YAML 见 `ansible/files/02-05/01-control-ingress.yaml`。 +示例 YAML 见 `ansible/files/02-01/01-control-ingress.yaml`。 ## 部署命令 ```bash -kubectl apply -f ansible/files/02-05/01-control-ingress.yaml +kubectl apply -f ansible/files/02-01/01-control-ingress.yaml ``` ## 验证命令 @@ -44,7 +45,7 @@ curl -i --max-time 3 http://<入口节点IP>/demo-m1/ ## 删除 ```bash -kubectl delete -f ansible/files/02-05/01-control-ingress.yaml +kubectl delete -f ansible/files/02-01/01-control-ingress.yaml ``` ## 排障 diff --git a/docs/02-02-nginx-control-ingressroute.md b/docs/02-02-nginx-control-ingressroute.md index 6bb8480..52c22ab 100644 --- a/docs/02-02-nginx-control-ingressroute.md +++ b/docs/02-02-nginx-control-ingressroute.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 02-02` +- **手动练习**:以本课清单 `ansible/files/02-02/02-control-ingressroute.yaml` 复制到目标机,按本文改字段后执行下方 `kubectl`/bash(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 02-02` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -22,12 +23,12 @@ 2. 创建 Middleware + IngressRoute(`PathPrefix(/demo-m2)`) 3. 
等待资源就绪 -示例 YAML 见 `ansible/files/02-05/02-control-ingressroute.yaml`。 +示例 YAML 见 `ansible/files/02-02/02-control-ingressroute.yaml`。 ## 部署命令 ```bash -kubectl apply -f ansible/files/02-05/02-control-ingressroute.yaml +kubectl apply -f ansible/files/02-02/02-control-ingressroute.yaml ``` ## 验证命令 @@ -46,7 +47,7 @@ curl -i --max-time 3 http://<入口节点IP>/demo-m2/ ## 删除 ```bash -kubectl delete -f ansible/files/02-05/02-control-ingressroute.yaml +kubectl delete -f ansible/files/02-02/02-control-ingressroute.yaml ``` ## 失败排查 diff --git a/docs/02-03-nginx-worker-ingress.md b/docs/02-03-nginx-worker-ingress.md index 8916374..5b798e7 100644 --- a/docs/02-03-nginx-worker-ingress.md +++ b/docs/02-03-nginx-worker-ingress.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 02-03` +- **手动练习**:以本课清单 `ansible/files/02-03/03-worker-ingress.yaml` 复制到目标机,按本文改字段后执行下方 `kubectl`/bash(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 02-03` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -22,12 +23,12 @@ 2. 创建 Middleware + Ingress(`/demo-m3` -> nginx-m3:80) 3. 
等待资源就绪 -示例 YAML 见 `ansible/files/02-05/03-worker-ingress.yaml`。 +示例 YAML 见 `ansible/files/02-03/03-worker-ingress.yaml`。 ## 部署命令 ```bash -kubectl apply -f ansible/files/02-05/03-worker-ingress.yaml +kubectl apply -f ansible/files/02-03/03-worker-ingress.yaml ``` ## 验证命令 @@ -45,7 +46,7 @@ curl -i --max-time 3 http://<入口节点IP>/demo-m3/ ## 删除 ```bash -kubectl delete -f ansible/files/02-05/03-worker-ingress.yaml +kubectl delete -f ansible/files/02-03/03-worker-ingress.yaml ``` ## 失败排查 diff --git a/docs/02-04-nginx-worker-ingressroute.md b/docs/02-04-nginx-worker-ingressroute.md index 6964c27..c58a27a 100644 --- a/docs/02-04-nginx-worker-ingressroute.md +++ b/docs/02-04-nginx-worker-ingressroute.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 02-04` +- **手动练习**:以本课清单 `ansible/files/02-04/04-worker-ingressroute.yaml` 复制到目标机,按本文改字段后执行下方 `kubectl`/bash(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 02-04` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -23,12 +24,12 @@ 2. 创建 Middleware + IngressRoute(`PathPrefix(/demo-m4)`) 3. 
等待资源就绪 -示例 YAML 见 `ansible/files/02-05/04-worker-ingressroute.yaml`。 +示例 YAML 见 `ansible/files/02-04/04-worker-ingressroute.yaml`。 ## 部署命令 ```bash -kubectl apply -f ansible/files/02-05/04-worker-ingressroute.yaml +kubectl apply -f ansible/files/02-04/04-worker-ingressroute.yaml ``` ## 验证命令 @@ -47,7 +48,7 @@ curl -i --max-time 3 http://<入口节点IP>/demo-m4/ ## 删除 ```bash -kubectl delete -f ansible/files/02-05/04-worker-ingressroute.yaml +kubectl delete -f ansible/files/02-04/04-worker-ingressroute.yaml ``` ## 失败排查 diff --git a/docs/02-05-nginx-验证矩阵-一键部署.md b/docs/02-05-nginx-验证矩阵-一键部署.md index c1ccd61..0eb3abc 100644 --- a/docs/02-05-nginx-验证矩阵-一键部署.md +++ b/docs/02-05-nginx-验证矩阵-一键部署.md @@ -1,11 +1,11 @@ # 02-05 Nginx 验证矩阵(Ingress / IngressRoute)— 综合一键部署 -> **定位**:02 系列尾部,整合 02-01~02-04 的综合一键部署。4 种组合(控制节点/工作节点 × Ingress/IngressRoute)均有具体 Deployment + Service + 路由,节点 IP 访问(如 `http://入口IP/demo-m1/`)。 +> **定位**:02 系列尾部,整合 02-01~02-04 的综合一键部署。4 种组合(控制节点/工作节点 × Ingress/IngressRoute)均有具体 Deployment + Service + 路由,节点 IP 访问(如 `http://入口IP/demo-m1/`)。**分课清单**另见 `ansible/files/02-01/`~`02-04/`(与下表 YAML **同构副本**,便于单篇学习;改矩阵时请同步或接受漂移)。 ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 02-05` +- **自动化验收**:`./ansible/bin/verify.sh run 02-05` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -51,7 +51,7 @@ kubectl get pod,svc,ing,ingressroute -n default -o wide ## 验证(用 IP 访问) -直接用入口节点 IP 访问(将 `192.168.2.61` 改为你的入口 IP;按 01-02/01-06 已配 LB 时任选节点 IP)。 +直接用入口节点 IP 访问(将 `192.168.2.61` 改为你的入口 IP;按 01-02/01-05 已配 LB 时任选节点 IP)。 ```bash for path in demo-m1 demo-m2 demo-m3 demo-m4; do @@ -182,11 +182,7 @@ export KUBECONFIG=/etc/rancher/k3s/k3s.yaml # 或从控制节点拷贝 kubecon kubectl delete -f ansible/files/02-05/ -R ``` -若控制节点上 `/tmp/nginx-matrix/` 仍存在,也可在控制节点执行: - -```bash -sudo kubectl delete -f /tmp/nginx-matrix/ -R -``` +统一使用仓库真源目录清理,避免与临时目录副本发生偏差。 **按资源名删除**(适用于 manifests 已不可用) diff --git a/docs/03-00-集群侧配置扩展-系列说明.md b/docs/03-00-集群侧配置扩展-系列说明.md index 
bf38618..f1829e1 100644 --- a/docs/03-00-集群侧配置扩展-系列说明.md +++ b/docs/03-00-集群侧配置扩展-系列说明.md @@ -5,7 +5,7 @@ ## TL;DR - **本页定位**:仅系列导航,不参与 `verify.sh run-all/full` -- **子篇执行入口**:按下表执行 `./scripts/verify.sh run ` +- **子篇执行入口**:按下表执行 `./ansible/bin/verify.sh run ` ## 范围与非目标 @@ -14,21 +14,23 @@ ## 03 系列索引(按推荐顺序) -| doc_id | 主题 | 子篇执行入口 | 备注 | -|-------:|------|------------|------| -| 03-01 | Traefik Dashboard | `./scripts/verify.sh run 03-01` | 浏览器 UI 另验 | -| 03-02 | Traefik ACME(含 TLS 矩阵) | `./scripts/verify.sh run 03-02` | 依赖 ACME/Cloudflare 条件 | -| 03-03 | Dashboard + ACME(组合) | `./scripts/verify.sh run 03-03` | 目前以文档为主 | -| 03-04 | Cloudflare Tunnel 接入 | `./scripts/verify.sh run 03-04` | 依赖 token 等外部条件 | -| 03-05 | local-path PVC demo | `./scripts/verify.sh run 03-05` | | -| 03-06 | NFS PVC demo | `./scripts/verify.sh run 03-06` | 依赖 NFS 变量 | -| 03-07 | Longhorn | `./scripts/verify.sh run 03-07` | 耗时较长 | -| 03-08 | K3s HA:集群配置与切换 | `./scripts/verify.sh run 03-08` | 建议手工演练为主 | -| 03-09 | GitOps:集群配置管理 | `./scripts/verify.sh run 03-09` | | -| 03-10 | Traefik 自定义端口 | `./scripts/verify.sh run 03-10` | | + +| doc_id | 主题 | 子篇执行入口 | 备注 | +| ------ | ---------------------- | ------------------------------- | --------------------- | +| 03-01 | Traefik Dashboard | `./ansible/bin/verify.sh run 03-01` | 浏览器 UI 另验 | +| 03-02 | Traefik ACME(含 TLS 矩阵) | `./ansible/bin/verify.sh run 03-02` | 依赖 ACME/Cloudflare 条件 | +| 03-03 | Dashboard + ACME(组合) | 先 `source ansible/env/.env.verify` 再 `./ansible/bin/verify.sh run 03-03` | 依赖 `.env.verify` 中 ACME/CF 等(同 03-02) | +| 03-04 | Cloudflare Tunnel 接入 | `./ansible/bin/verify.sh run 03-04` | 依赖 token 等外部条件 | +| 03-05 | local-path PVC demo | `./ansible/bin/verify.sh run 03-05` | | +| 03-06 | NFS PVC demo | `./ansible/bin/verify.sh run 03-06` | 依赖 NFS 变量 | +| 03-07 | Longhorn | `./ansible/bin/verify.sh run 03-07` | 耗时较长 | +| 03-08 | K3s HA:集群配置与切换 | `./ansible/bin/verify.sh run 03-08` | 建议手工演练为主 | +| 03-09 | GitOps:集群配置管理 | 
`./ansible/bin/verify.sh run 03-09` | | +| 03-10 | Traefik 自定义端口 | `./ansible/bin/verify.sh run 03-10` | | + ## 真源位置 -- Traefik:`labs/traefik/manifests/`(部分仍在 `ansible/files/03-0x-*`) +- Traefik:`labs/traefik/manifests/`(部分仍在 `ansible/files/03-0x-*`) - 存储:`labs/storage/manifests/`、`labs/longhorn/manifests/` diff --git a/docs/03-01-k3s-traefik-dashboard.md b/docs/03-01-k3s-traefik-dashboard.md index 12ca6e0..59120a1 100644 --- a/docs/03-01-k3s-traefik-dashboard.md +++ b/docs/03-01-k3s-traefik-dashboard.md @@ -5,7 +5,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 03-01` +- **自动化验收**:`./ansible/bin/verify.sh run 03-01` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -28,7 +28,7 @@ - **默认路径**:`/var/lib/rancher/k3s/server/manifests/traefik-dashboard.yaml` - **自定义 data-dir**(如 `--data-dir=/storage`):`/server/manifests/traefik-dashboard.yaml` - **唯一真源(勿与文档内联重复)**:[HelmChartConfig + IngressRoute 完整 YAML](../../ansible/files/03-01/traefik-dashboard.yaml)。复制到上述 manifests 路径,或在仓库根执行: + **唯一真源(勿与文档内联重复)**:[HelmChartConfig + IngressRoute 完整 YAML](../ansible/files/03-01/traefik-dashboard.yaml)。复制到上述 manifests 路径,或在仓库根执行: ```bash kubectl apply -f ansible/files/03-01/traefik-dashboard.yaml @@ -51,7 +51,7 @@ kubectl -n kube-system rollout status deploy/traefik 3. 
验证:一键对全部节点 IP 做 curl 测试(按实际环境修改 IP 列表): ```bash -# 已按 01-02 / 01-06 配置 K3s 默认 LB(Traefik 入口标签 + firewalld 基线),61~64 任一台 :80 均应返回 200/307 +# 已按 01-02 / 01-05 配置 K3s 默认 LB(Traefik 入口标签 + firewalld 基线),61~64 任一台 :80 均应返回 200/307 for ip in 192.168.2.61 192.168.2.62 192.168.2.63 192.168.2.64; do code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 3 "http://${ip}/dashboard/" 2>/dev/null || echo "---") echo "${ip}: ${code}" diff --git a/docs/03-02-k3s-traefik-acme.md b/docs/03-02-k3s-traefik-acme.md index fb22537..92a404f 100644 --- a/docs/03-02-k3s-traefik-acme.md +++ b/docs/03-02-k3s-traefik-acme.md @@ -9,7 +9,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 03-02` +- **自动化验收**:`./ansible/bin/verify.sh run 03-02` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -193,7 +193,7 @@ kubectl -n kube-system exec -it "$POD" -- nslookup acme-v02.api.letsigncrypt.org ## TLS 矩阵清单(02-05 升级版) -> **唯一真源**:[`ansible/files/03-02/`](../../ansible/files/03-02/)(`01-control-ingress.yaml`~`04-worker-ingressroute.yaml`),与 [`ansible/playbooks/verify/03-02.yml`](../../ansible/playbooks/verify/03-02.yml) 共用;**本文不再内联整份 YAML**。 +> **唯一真源**:[`ansible/files/03-02/`](../ansible/files/03-02/)(`01-control-ingress.yaml`~`04-worker-ingressroute.yaml`),与 [`ansible/playbooks/verify/03-02.yml`](../ansible/playbooks/verify/03-02.yml) 共用;**本文不再内联整份 YAML**。 **相对 02-05 的差异摘要**:基于域名根路径 `/`;TLS 仅绑 `websecure`;含 HTTP-only(仅 `web`)路由;与 02-05 的 `/demo-mx` 为两套资源;M2/M4 节点名与域名请在清单内编辑。 @@ -205,7 +205,7 @@ kubectl -n kube-system exec -it "$POD" -- nslookup acme-v02.api.letsigncrypt.org **方式一:使用仓库 YAML 目录(推荐与文档一致)** -1. 在仓库中编辑 [`ansible/files/03-02/`](../../ansible/files/03-02/) 内各文件(M2/M4 节点名、域名等)。 +1. 在仓库中编辑 [`ansible/files/03-02/`](../ansible/files/03-02/) 内各文件(M2/M4 节点名、域名等)。 2. 按 k3s 存储方案可将整个目录复制到控制节点 manifests,或直接在仓库根执行 `kubectl apply -f ansible/files/03-02/ -R`(与 `01-01-k3s-控制节点含traefik.md` 存储路径说明一致)。 3. 
清理示例(路径与 apply 时一致): diff --git a/docs/03-03-k3s-traefik-dashboard-acme.md b/docs/03-03-k3s-traefik-dashboard-acme.md index 89babd5..1a5ce23 100644 --- a/docs/03-03-k3s-traefik-dashboard-acme.md +++ b/docs/03-03-k3s-traefik-dashboard-acme.md @@ -5,15 +5,16 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 03-03` -- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP -- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **自动化验收**(仓库根,先加载环境):`set -a && source ansible/env/.env.verify && set +a && ./ansible/bin/verify.sh run 03-03`(**`ACME_EMAIL`、`CF_API_TOKEN` 等已写入 `.env.verify` 时无需再 `export`,也不要用 `env -u` 清变量**)。若仅想验证「缺邮箱」门禁,可临时:`env -u ACME_EMAIL ./ansible/bin/verify.sh run 03-03`。 +- **关键前置**:`ACME_EMAIL`(必填);Cloudflare DNS-01 时需 **`CF_API_TOKEN`** 或集群内已有 **`kube-system/cloudflare-api-token`** Secret;可选 **`TRAEFIK_DASHBOARD_VERIFY_URL`**(完整 HTTP URL;未设则探针默认 `http:///dashboard/`,见 `ansible/group_vars/all.yml` 中 `k3s_server_ip`) +- **成功判据**:playbook 中 Traefik **`kubectl rollout status`** 成功 + Dashboard HTTP 探针 **200**(`[OC-ASSERT]` 可解析) +- **与 03-02 关系**:二者均向同一 **`HelmChartConfig/traefik`** 提交配置;**后执行**的 verify **会覆盖**先应用的 Traefik 值;全量 `verify.sh full` 时请注意顺序或只选其一做真实 apply。 - **排障**:见本文「排障」 ## 前置条件 - 已完成 `01-02-k3s-工作节点.md`(集群与 Traefik 可用) -- 若使用 Cloudflare DNS 验证:域名托管在 Cloudflare,已获取 API Token +- 若使用 Cloudflare DNS 验证:域名托管在 Cloudflare,已获取 API Token(或按上文由 `CF_API_TOKEN` 在验收时创建 Secret) ## 1. 
创建 Secret(Cloudflare API Token) @@ -153,7 +154,7 @@ sudo rm -f /storage/server/manifests/traefik-dashboard-acme.yaml - `03-02-k3s-traefik-acme.md`:仅 ACME 不合并 Dashboard 时,或 TLS 矩阵(test01~test04)验证、排障详情 - `03-04-k3s-cloudflare-tunnel-配置接入.md`:若需 Cloudflare Tunnel 接入 -- `01-07-openwrt-haproxy.md`:如需调整外部端口/防火墙,参考 HAProxy 监听与转发(第 6 节) +- `01-06-openwrt-haproxy.md`:如需调整外部端口/防火墙,参考 HAProxy 监听与转发(第 6 节) ## 排障 diff --git a/docs/03-04-k3s-cloudflare-tunnel-配置接入.md b/docs/03-04-k3s-cloudflare-tunnel-配置接入.md index 87b6308..0da89c2 100644 --- a/docs/03-04-k3s-cloudflare-tunnel-配置接入.md +++ b/docs/03-04-k3s-cloudflare-tunnel-配置接入.md @@ -7,9 +7,10 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 03-04` -- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP -- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **自动化验收**:`./ansible/bin/verify.sh run 03-04` +- **关键前置**:Traefik 可用;域名托管 Cloudflare;Zero Trust 中 Tunnel 与 Public Hostname 已配置 +- **自动化所需环境变量**(见 `ansible/env/.env.verify.example`):**`CF_TUNNEL_TEST_URL`**(HTTPS 完整探针 URL)**或** **`CF_TUNNEL_TEST_HOST`**(仅主机名,脚本拼成 `https://HOST/`);**`TUNNEL_TOKEN`** 与集群内 `kube-system/cloudflared-credentials` **二选一**(已有 Secret 可不设 env)。二者探针变量皆缺时验收为 **gated**。可选 **`CF_TUNNEL_CURL_INSECURE=1`** 为探针 curl 加 `-k`(仅排障)。 +- **成功判据**:达到本文「预期」且 playbook 断言通过(rollout + HTTPS 探针) - **排障**:见本文「排障」 --- @@ -53,9 +54,17 @@ Traefik 是唯一入口。所有流量经 Tunnel 进入后,由 Traefik 的 Ing ### 3. 部署 cloudflared 到 K3s -1. 从 **唯一真源** 复制清单:[`ansible/files/03-04/cloudflared.yaml`](../ansible/files/03-04/cloudflared.yaml) -2. 将 `TUNNEL_TOKEN` 占位符替换为前述 Zero Trust 中复制的 Token -3. 应用并等待 Pod 就绪(按实际 manifests 路径选择其一): +[`ansible/files/03-04/cloudflared.yaml`](../ansible/files/03-04/cloudflared.yaml) **仅含 Deployment**;**Secret 必须单独创建**(避免 `kubectl apply` 覆盖 token)。推荐与 [`ansible/playbooks/verify/03-04.yml`](../ansible/playbooks/verify/03-04.yml) 一致:先 `cloudflared-credentials`,再 apply Deployment。 + +1. 
在集群中创建 Secret(将 `YOUR_TOKEN` 换成 Zero Trust 里的 Tunnel token): + +```bash +kubectl -n kube-system create secret generic cloudflared-credentials \ + --from-literal=TUNNEL_TOKEN='YOUR_TOKEN' \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +2. 应用 Deployment 并等待就绪(按实际 manifests 路径选择其一): ```bash # 默认路径 @@ -69,7 +78,7 @@ kubectl apply -f /storage/server/manifests/cloudflared.yaml kubectl -n kube-system rollout status deploy/cloudflared ``` -4. 将 `cloudflared.yaml` 放入上述 manifests 目录后,K3s 重启时会自动加载。 +3. 将 **Deployment 清单** 放入上述 manifests 目录后,K3s 重启时会自动加载(Secret 仍需单独存在)。 建议要点: @@ -89,7 +98,7 @@ Tunnel 后端应指向 **集群内的 Traefik 入口**,常用写法: **和仓库里哪份 YAML 的关系** -- 本仓库的 [`cloudflared.yaml`](../ansible/files/03-04/cloudflared.yaml) **只** 定义 `cloudflared` 的 Deployment/Secret,**不包含** Traefik Service;Tunnel 后端地址写的是 **集群里已存在的 Traefik Service**,不是 `cloudflared.yaml` 里的某一行。 +- 本仓库的 [`cloudflared.yaml`](../ansible/files/03-04/cloudflared.yaml) **只** 定义 `cloudflared` 的 Deployment;**Secret `cloudflared-credentials` 单独创建**,**不包含** Traefik Service。Tunnel 在 Zero Trust 里指向的仍是 **集群内已存在的 Traefik Service**。 - Traefik 的 **Service** 由 K3s 内置 Traefik(HelmChart)安装时创建,资源名一般为 **`traefik`**,命名空间 **`kube-system`**。若你改过 chart 或 Service 名,以下 FQDN 与端口要以 **实际 `kubectl get svc` 输出** 为准。 **与 `kubectl get svc traefik -o yaml` 里哪些字段对应** diff --git a/docs/03-05-k3s-local-path-pvc.md b/docs/03-05-k3s-local-path-pvc.md index 8da7a9b..e826657 100644 --- a/docs/03-05-k3s-local-path-pvc.md +++ b/docs/03-05-k3s-local-path-pvc.md @@ -5,7 +5,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 03-05` +- **自动化验收**:`./ansible/bin/verify.sh run 03-05` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -44,7 +44,7 @@ kubectl -n kube-system edit configmap local-path-config **本仓库实验室真源**(四节点 **10G+32G**、K3s `--data-dir=/storage` 统一拓扑):[`ansible/files/03-05/local-path-config-lab.json`](../ansible/files/03-05/local-path-config-lab.json) —— 仅含 **`DEFAULT_PATH_FOR_NON_LISTED_NODES` → 
`/storage/storage`**。应用方式: -- Ansible:`ansible-playbook -i inventory.ini playbooks/verify/03-05.yml`,或在 `group_vars/all.yml` 设 `longhorn_apply_local_path_lab: true` 后执行 `03-07.yml`(见 `01-06`、`03-07`)。 +- Ansible:`ansible-playbook -i inventory.ini playbooks/verify/03-05.yml`,或在 `group_vars/all.yml` 设 `longhorn_apply_local_path_lab: true` 后执行 `03-07.yml`(见 `01-05`、`03-07`)。 - 手工:备份后编辑 ConfigMap,将 `config.json` 与真源 JSON 对齐,再 `rollout restart` provisioner。 配置结构示意(**四节点统一基路径**时只需 `DEFAULT` 一条;请与现有 JSON 合并,不要盲目整段覆盖): diff --git a/docs/03-06-k3s-使用nfs存储.md b/docs/03-06-k3s-使用nfs存储.md index 3637392..d10722e 100644 --- a/docs/03-06-k3s-使用nfs存储.md +++ b/docs/03-06-k3s-使用nfs存储.md @@ -5,14 +5,14 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 03-06` -- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP -- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **自动化验收**:`./ansible/bin/verify.sh run 03-06` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP(`NFS_SERVER_IP`、`NFS_EXPORT_PATH`);未设置时整篇 **gated** +- **成功判据**:PVC `Bound`,且 **`nfs-pvc-verify-demo` Job** 挂载该 PVC 写 `.verify-nfs` 成功(OC3);playbook 断言通过 - **排障**:见本文「排障」 ## 前置条件 -- 已完成 `01-05-armv7-nfs服务安装.md` +- 已完成 `01-04-armv7-nfs服务安装.md` - 可从 K3s 节点访问 NFS 服务器与导出目录 ## 方式对比(从简单到复杂) @@ -85,6 +85,8 @@ spec: # Deployment 规格 > 静态 NFS 同样需要在服务端提前创建目录并设置权限;不会自动创建目录。 +**自动化验收(`verify.sh run 03-06`)**:在应用并等待 PVC `Bound` 之后,会额外应用 [`ansible/files/03-06/nfs-pvc-verify-job.yaml`](../ansible/files/03-06/nfs-pvc-verify-job.yaml) 中的 **Job**,挂载 `nfs-pvc-demo` 并向卷内写入 `.verify-nfs`,以证明挂载可写(不仅 API 显示 Bound)。**手动学习**时仍可只 `apply` PV+PVC 清单,不必应用 Job。 + ### 方式 3:动态 NFS(选装 provisioner) 这是选装增强,不是 K3s 内置默认能力。常见组件是 `nfs-subdir-external-provisioner`。 @@ -213,7 +215,7 @@ kubectl exec deploy/nfs-direct-demo -- sh -c 'echo nfs-direct-ok > /usr/share/ng - 检查 NFS 服务与导出目录权限 - 检查节点到 NFS 服务器网络 - 检查 `path` 与 `server` 配置是否正确 -- 若报 `Permission denied`,回到 `01-05` 的 `root_squash` 权限章节,确认导出目录与业务 UID/GID 对齐 +- 若报 `Permission denied`,回到 `01-04` 的 `root_squash` 权限章节,确认导出目录与业务 UID/GID 对齐 ## 下一步 diff --git 
a/docs/03-07-k3s-longhorn-持久化存储.md b/docs/03-07-k3s-longhorn-持久化存储.md index 7cf8b9b..0083d3d 100644 --- a/docs/03-07-k3s-longhorn-持久化存储.md +++ b/docs/03-07-k3s-longhorn-持久化存储.md @@ -5,7 +5,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 03-07` +- **自动化验收**:`./ansible/bin/verify.sh run 03-07` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -18,7 +18,7 @@ Longhorn 与 K3s 的 `containerd`/镜像、local-path 都会大量占用 **`k3s_ - **请先阅读**:[`00-02-部署环境说明.md`](00-02-部署环境说明.md)(四节点统一拓扑、自检命令、推荐 playbook 顺序)。 - **自检**(每台节点):`mountpoint -q /storage && findmnt -n -o SOURCE /` 与 `findmnt -n -o SOURCE /storage` 输出须**不同**。 -- **Ansible**:`01-06.yml -e k3s_do_install=true` 在 `k3s_verify_storage_mount: true`(`group_vars/all.yml` 默认)时会在安装前校验上述条件;可选先跑 `01-06.yml -e k3s_do_prepare_storage=true -e k3s_prepare_storage=true` 准备第二块盘,见 `01-06-节点初始化-ansible-实践.md`。 +- **Ansible**:`01-05.yml -e k3s_do_install=true` 在 `k3s_verify_storage_mount: true`(`group_vars/all.yml` 默认)时会在安装前校验上述条件;可选先跑 `01-05.yml -e k3s_do_prepare_storage=true -e k3s_prepare_storage=true` 准备第二块盘,见 `01-05-节点初始化-ansible-实践.md`。 - Longhorn 数据目录建议为 **`/storage/longhorn`**(与 Helm `values-lab.yaml` 一致),勿与系统盘混用。 **容量与副本数**:每节点数据盘约 **32G** 时,`defaultReplicaCount` 为 **2 或 3** 会使同一份逻辑卷在集群内占用 **多倍物理空间**(各副本落在不同节点上各占一份),且 Longhorn 元数据与系统组件仍有开销;实验环境可先用副本 **1**,要演练跨节点冗余再调高并预留磁盘。 @@ -115,7 +115,7 @@ kubectl get pod -n longhorn-system -o wide ### SSH 配置说明(本机能否直连「各节点」) -- **`ylc61`(控制节点)**:常见做法是在本机 `~/.ssh/config` 里配置 `Host ylc61`,`IdentityFile` 指向**该节点专用私钥**(例如仓库内 `.ssh/id_ed25519_k3s_192.168.2.61`,与 `01-06` / 建链脚本一致)。配好后可 **`ssh ylc61`**,并在其上执行 **`kubectl`**(设好 `KUBECONFIG`),**不必**强求本机安装 kubectl 或直连 API Server。 +- **`ylc61`(控制节点)**:常见做法是在本机 `~/.ssh/config` 里配置 `Host ylc61`,`IdentityFile` 指向**该节点专用私钥**(例如仓库内 `.ssh/id_ed25519_k3s_192.168.2.61`,与 `01-05` / 建链脚本一致)。配好后可 **`ssh ylc61`**,并在其上执行 **`kubectl`**(设好 `KUBECONFIG`),**不必**强求本机安装 kubectl 或直连 API Server。 - 
**`ylc62`–`ylc64`(工作节点)**:`ansible/inventory.ini` 里为**每台**配置了**不同**的 `ansible_ssh_private_key_file`(如 `~/.ssh/id_ed25519_k3s_192.168.2.62` …)。若本机 `~/.ssh/config` **没有**对应 `Host ylc62` …,则 **`ssh ylc62` 会 `Permission denied`**(用错成控制节点密钥时尤其常见)。需要本机循环 SSH 四台时,请为 **62–64** 各写一段 `Host`,`IdentityFile` 与清单路径一致。 - **只做 Longhorn 安装与排查时**:多数步骤只需 **`ssh ylc61` + `kubectl`**;只有要到**具体工作节点**执行 **`ctr` 预拉镜像**、看 **kubelet/containerd** 时,才必须能登录该节点(直连、串口、或 Ansible `-l ylc63` 等均可)。 @@ -160,7 +160,7 @@ done **首选:Helm + 本仓库 `values-lab.yaml`**(与 K3s 常见实践一致,版本与实验室变量集中在 `ansible/group_vars/all.yml` 的 `longhorn_chart_version`)。 -- **Ansible(推荐)**:在控制机执行(与 `01-06` 顺序一致): +- **Ansible(推荐)**:在控制机执行(与 `01-05` 顺序一致): ```bash cd ansible diff --git a/docs/03-08-k3s-ha-集群配置与切换.md b/docs/03-08-k3s-ha-集群配置与切换.md index 80f6ab9..d7cdf42 100644 --- a/docs/03-08-k3s-ha-集群配置与切换.md +++ b/docs/03-08-k3s-ha-集群配置与切换.md @@ -2,17 +2,21 @@ > 本文只讲双控制节点 HA 的集群配置与切换步骤。 +## 契约与真源 + +- **参数备忘(shell)**:`ansible/files/03-08/k3s-server-ha-env.example.sh`(非 K8s 清单;与下文 bash 片段一致,便于复制到 systemd/安装脚本)。 +- **自动**:`./ansible/bin/verify.sh run 03-08`(noop + 基线;HA 切换仍以本文手工步骤为准)。 ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 03-08` +- **自动化验收**:`./ansible/bin/verify.sh run 03-08` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 ## 前置条件 -- 已完成 `01-08-双控制节点ha.md` 安装准备 +- 已完成 `01-07-双控制节点ha.md` 安装准备 - 外部 datastore 与 `6443` LB 已可用 - 已确认可执行变更窗口 @@ -80,7 +84,7 @@ kubectl get pods -A ## 参考 -- `01-08-双控制节点ha.md` +- `01-07-双控制节点ha.md` - `01-01-k3s-控制节点含traefik.md` - `01-02-k3s-工作节点.md` diff --git a/docs/03-09-k3s-gitops-集群配置管理.md b/docs/03-09-k3s-gitops-集群配置管理.md index 906b538..ac204aa 100644 --- a/docs/03-09-k3s-gitops-集群配置管理.md +++ b/docs/03-09-k3s-gitops-集群配置管理.md @@ -1,12 +1,17 @@ # 03-09-k3s-gitops-集群配置管理(框架草案) > 本文先给出 GitOps 管理 k3s 集群的大致框架,后续可以按需要再细化成完整实践。 -> 目标:在 `01-06` 自动装好 k3s 之后,由 GitOps 工具(Argo CD / Flux)自动把 Traefik、监控、应用等 YAML 下发到集群。 +> 目标:在 `01-05` 自动装好 k3s 之后,由 GitOps 工具(Argo CD 
/ Flux)自动把 Traefik、监控、应用等 YAML 下发到集群。 +## 契约与真源 + +- **本仓库示意**:`ansible/files/03-09/argocd-namespace.example.yaml`(极简 Namespace;正式安装请用 Argo CD 官方 `install.yaml` 或 Helm),索引见 `ansible/files/03-09/README.md`。 +- **主真源**:独立 GitOps 仓库(见下文「仓库结构建议」);本目录仅与文档对齐的辅助文件。 +- **手动 / 自动**:`kubectl apply -f ansible/files/03-09/argocd-namespace.example.yaml` 可与 `./ansible/bin/verify.sh run 03-09` 共用同一路径,避免文档与仓库脱节。 ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 03-09` +- **自动化验收**:`./ansible/bin/verify.sh run 03-09` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -51,8 +56,8 @@ homelab-k3s-gitops/ ## 4. 与现有文档的衔接 -- `01-06-节点初始化-ansible-实践.md`:负责从「可 SSH 裸机」到「k3s 就绪」; -- 本篇 `03-11`:负责从「k3s 就绪」到「配置由 Git 驱动下发」; +- `01-05-节点初始化-ansible-实践.md`:负责从「可 SSH 裸机」到「k3s 就绪」; +- 本篇 `03-09`:负责从「k3s 就绪」到「配置由 Git 驱动下发」; - 其他 `02-**`、`04-**`、`05-**` 文档中的部署命令,可以逐步迁移为 GitOps 仓库中的 YAML/Kustomize/Helm 定义。 ## 5. 后续可以补充的内容(TODO) diff --git a/docs/03-10-k3s-traefik-custom-ports.md b/docs/03-10-k3s-traefik-custom-ports.md index 8fc44dc..83b1d32 100644 --- a/docs/03-10-k3s-traefik-custom-ports.md +++ b/docs/03-10-k3s-traefik-custom-ports.md @@ -5,7 +5,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 03-10` +- **自动化验收**:`./ansible/bin/verify.sh run 03-10` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -57,7 +57,7 @@ kubectl -n kube-system describe svc traefik | sed -n '/Ports:/,/Selector:/p' - `03-01-k3s-traefik-dashboard.md` - `03-02-k3s-traefik-acme.md` -- `01-07-openwrt-haproxy.md` +- `01-06-openwrt-haproxy.md` ## 排障 diff --git a/docs/04-00-nodejs-系列说明.md b/docs/04-00-nodejs-系列说明.md index 3241807..a220b41 100644 --- a/docs/04-00-nodejs-系列说明.md +++ b/docs/04-00-nodejs-系列说明.md @@ -1,12 +1,12 @@ # 04-00 Node.js 高级部署(系列说明) -> 本系列以 `nodejs-demo` 为基线(`04-01`),后续分项在同一套累积 YAML 上做增量变更(见 `ansible/files/04-01/`)。 +> 本系列以 `nodejs-demo` 为基线(`04-01`),后续分项在同一套累积 YAML 上做增量变更;**总集**见 `ansible/files/04-01/`。**`04-02`~`04-13`** 各课在 
`ansible/files/04-XX/` 另有**同构副本**,便于学习时解耦、最小化手工演练(复制 YAML → 改字段 → `kubectl`/bash);**自动验收**仍用 `./ansible/bin/verify.sh run 04-XX`。 ## TL;DR - **基线入口**:`04-01-k3s-nodejs-高级部署.md` - **本页定位**:仅系列导航,不参与 `verify.sh run-all/full` -- **子篇执行入口**:`./scripts/verify.sh run 04-01` ~ `./scripts/verify.sh run 04-14` +- **子篇执行入口**:`./ansible/bin/verify.sh run 04-01` ~ `./ansible/bin/verify.sh run 04-14` ## 范围与非目标 @@ -15,25 +15,28 @@ ## 04 系列索引 -| doc_id | 主题 | 子篇执行入口 | -|-------:|------|------------| -| 04-01 | 基线:Deployment + Service + Ingress | `./scripts/verify.sh run 04-01` | -| 04-02 | 端口与 Service | `./scripts/verify.sh run 04-02` | -| 04-03 | 镜像与运行命令 | `./scripts/verify.sh run 04-03` | -| 04-04 | 环境变量与配置注入 | `./scripts/verify.sh run 04-04` | -| 04-05 | 探针与健康检查 | `./scripts/verify.sh run 04-05` | -| 04-06 | 副本与滚动发布 | `./scripts/verify.sh run 04-06` | -| 04-07 | Ingress 与 Traefik | `./scripts/verify.sh run 04-07` | -| 04-08 | 资源请求与限制 | `./scripts/verify.sh run 04-08` | -| 04-09 | 调度与亲和 | `./scripts/verify.sh run 04-09` | -| 04-10 | 安全上下文 | `./scripts/verify.sh run 04-10` | -| 04-11 | 存储与卷 | `./scripts/verify.sh run 04-11` | -| 04-12 | TLS 与证书 | `./scripts/verify.sh run 04-12` | -| 04-13 | HPA | `./scripts/verify.sh run 04-13` | -| 04-14 | GitOps 与 CI 流水线 | `./scripts/verify.sh run 04-14` | + +| doc_id | 主题 | 子篇执行入口 | +| ------ | --------------------------------- | ----------------------------------- | +| 04-01 | 基线:Deployment + Service + Ingress | `./ansible/bin/verify.sh run 04-01` | +| 04-02 | 端口与 Service | `./ansible/bin/verify.sh run 04-02` | +| 04-03 | 镜像与运行命令 | `./ansible/bin/verify.sh run 04-03` | +| 04-04 | 环境变量与配置注入 | `./ansible/bin/verify.sh run 04-04` | +| 04-05 | 探针与健康检查 | `./ansible/bin/verify.sh run 04-05` | +| 04-06 | 副本与滚动发布 | `./ansible/bin/verify.sh run 04-06` | +| 04-07 | Ingress 与 Traefik | `./ansible/bin/verify.sh run 04-07` | +| 04-08 | 资源请求与限制 | `./ansible/bin/verify.sh run 04-08` | +| 04-09 | 调度与亲和 | `./ansible/bin/verify.sh run 04-09` | +| 04-10 | 
安全上下文 | `./ansible/bin/verify.sh run 04-10` | +| 04-11 | 存储与卷 | `./ansible/bin/verify.sh run 04-11` | +| 04-12 | TLS 与证书 | `./ansible/bin/verify.sh run 04-12` | +| 04-13 | HPA | `./ansible/bin/verify.sh run 04-13` | +| 04-14 | GitOps 与 CI 流水线 | `./ansible/bin/verify.sh run 04-14` | + ## 真源位置 -- 累积清单:`ansible/files/04-01/` -- 该目录 `README.md` 说明每个 `04-0x-nodejs-demo.yaml` 的增量变更点 +- **总集(对照 diff)**:`ansible/files/04-01/`(各 `04-0x-nodejs-demo.yaml` 的演进关系) +- **分课副本(`04-02`~`04-13`)**:`ansible/files/04-XX/04-XX-nodejs-demo.yaml`,与 `04-01/` 内同名文件同构,专供单篇学习与手工复制 +- **手动**:以分课目录清单为准;**自动**:`verify.sh run <doc_id>` diff --git a/docs/04-01-k3s-nodejs-高级部署.md b/docs/04-01-k3s-nodejs-高级部署.md index 83d8b25..3f59b24 100644 --- a/docs/04-01-k3s-nodejs-高级部署.md +++ b/docs/04-01-k3s-nodejs-高级部署.md @@ -6,7 +6,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-01` +- **自动化验收**:`./ansible/bin/verify.sh run 04-01` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -69,7 +69,7 @@ curl -s --max-time 3 http://192.168.2.62/node/ ## 部署阶段扩展(分项导航) -在本文 `nodejs-demo` 基线上按主题增量实践(**`04-02`~`04-14` 已按 Core→Plus→Pro、从简到繁编号**)。**每篇分项均链接到 `ansible/files/04-01/` 下累积清单**,并附 **相对上一篇的变更表**;与 [`ansible/playbooks/verify/04-01.yml`](../ansible/playbooks/verify/04-01.yml) 共用。 +在本文 `nodejs-demo` 基线上按主题增量实践(**`04-02`~`04-14` 已按 Core→Plus→Pro、从简到繁编号**)。**`ansible/files/04-01/`** 保留**全系列累积清单**便于对照 diff;**`04-02`~`04-13`** 各篇在 **`ansible/files/04-XX/`** 另有**同构副本**专供分课解耦与手工复制。每篇文档附 **相对上一篇的变更表**;基线与 [`ansible/playbooks/verify/04-01.yml`](../ansible/playbooks/verify/04-01.yml) 共用。 - `04-02-nodejs-端口与Service.md`:`containerPort` 与 Service/Ingress 端口对应 - `04-03-nodejs-镜像与运行命令.md`:镜像 tag、`imagePullPolicy`、`command`/`args` diff --git a/docs/04-02-nodejs-端口与Service.md b/docs/04-02-nodejs-端口与Service.md index 8961f71..4fcddf5 100644 --- a/docs/04-02-nodejs-端口与Service.md +++ b/docs/04-02-nodejs-端口与Service.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-02` +- 
**手动练习**:复制本课目录 `ansible/files/04-02/` 下清单到目标路径,按需改字段后按本文 `kubectl`/bash 操作(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 04-02` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -14,12 +15,12 @@ - 已部署 `nodejs-demo`(`04-01`)。 -## 清单路径(唯一真源) +## 清单路径(本课分目录) | 项 | 路径 | |----|------| -| 本篇完整清单(累积至 04-02) | [`ansible/files/04-01/04-02-nodejs-demo.yaml`](../ansible/files/04-01/04-02-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01/04-02-nodejs-demo.yaml` | +| 本篇完整清单(累积至 04-02) | [`ansible/files/04-02/04-02-nodejs-demo.yaml`](../ansible/files/04-02/04-02-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-02/04-02-nodejs-demo.yaml` | 自 **本篇(04-02)** 起,累积清单中应用监听 **8080**(与 `04-01` 文档中的 3000 不同,便于与后续探针、分项对齐)。 @@ -53,7 +54,7 @@ ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01/04-02-nodejs-demo.yaml +kubectl apply -f ansible/files/04-02/04-02-nodejs-demo.yaml kubectl get svc nodejs-demo -n default -o wide kubectl get endpoints nodejs-demo -n default curl -s --max-time 3 http://<节点IP>/node/ diff --git a/docs/04-03-nodejs-镜像与运行命令.md b/docs/04-03-nodejs-镜像与运行命令.md index 8b8c229..f275664 100644 --- a/docs/04-03-nodejs-镜像与运行命令.md +++ b/docs/04-03-nodejs-镜像与运行命令.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-03` +- **手动练习**:复制本课目录 `ansible/files/04-03/` 下清单到目标路径,按需改字段后按本文 `kubectl`/bash 操作(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 04-03` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -14,12 +15,12 @@ - 已按 `04-01` 部署并验证 `curl` 可达。 -## 清单路径(唯一真源) +## 清单路径(本课分目录) | 项 | 路径 / 命令 | |----|-------------| -| 本篇完整清单(累积至 04-03) | [`ansible/files/04-01/04-03-nodejs-demo.yaml`](../ansible/files/04-01/04-03-nodejs-demo.yaml) | -| 手工应用 | `kubectl apply -f ansible/files/04-01/04-03-nodejs-demo.yaml` | +| 本篇完整清单(累积至 04-03) | [`ansible/files/04-03/04-03-nodejs-demo.yaml`](../ansible/files/04-03/04-03-nodejs-demo.yaml) | 
+| 手工应用 | `kubectl apply -f ansible/files/04-03/04-03-nodejs-demo.yaml` | | Ansible | `ansible-playbook -i ansible/inventory.ini ansible/playbooks/verify/04-01.yml -e nodejs_demo_manifest=04-03-nodejs-demo.yaml` | 若你更喜欢命令行换镜像,文末也给了 **`kubectl set image`**,可不改仓库清单。 @@ -42,7 +43,7 @@ 应用: ```bash -kubectl apply -f ansible/files/04-01/04-03-nodejs-demo.yaml +kubectl apply -f ansible/files/04-03/04-03-nodejs-demo.yaml # 或仅打补丁(示意) kubectl set image deployment/nodejs-demo nodejs-demo=node:18.20-alpine -n default ``` diff --git a/docs/04-04-nodejs-环境变量与配置注入.md b/docs/04-04-nodejs-环境变量与配置注入.md index 1deac3a..80e07db 100644 --- a/docs/04-04-nodejs-环境变量与配置注入.md +++ b/docs/04-04-nodejs-环境变量与配置注入.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-04` +- **手动练习**:复制本课目录 `ansible/files/04-04/` 下清单到目标路径,按需改字段后按本文 `kubectl`/bash 操作(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 04-04` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -14,13 +15,13 @@ - 已部署 `nodejs-demo`(`04-01`)。 -## 清单路径(唯一真源) +## 清单路径(本课分目录) | 项 | 路径 / 命令 | |----|-------------| -| 本篇完整清单(累积至 04-04,含 ConfigMap + Deployment + Service + Ingress) | [`ansible/files/04-01/04-04-nodejs-demo.yaml`](../ansible/files/04-01/04-04-nodejs-demo.yaml) | +| 本篇完整清单(累积至 04-04,含 ConfigMap + Deployment + Service + Ingress) | [`ansible/files/04-04/04-04-nodejs-demo.yaml`](../ansible/files/04-04/04-04-nodejs-demo.yaml) | | Secret 示例(勿提交真密钥) | [`ansible/files/04-01/nodejs-demo-secret.example.yaml`](../ansible/files/04-01/nodejs-demo-secret.example.yaml) | -| 手工应用 | `kubectl apply -f ansible/files/04-01/04-04-nodejs-demo.yaml` | +| 手工应用 | `kubectl apply -f ansible/files/04-04/04-04-nodejs-demo.yaml` | | Ansible | `ansible-playbook ... 
-e nodejs_demo_manifest=04-04-nodejs-demo.yaml` | ## 场景说明(白话) @@ -40,7 +41,7 @@ 应用: ```bash -kubectl apply -f ansible/files/04-01/04-04-nodejs-demo.yaml +kubectl apply -f ansible/files/04-04/04-04-nodejs-demo.yaml ``` ## 验证 diff --git a/docs/04-05-nodejs-探针与健康检查.md b/docs/04-05-nodejs-探针与健康检查.md index e8d2369..5ee670e 100644 --- a/docs/04-05-nodejs-探针与健康检查.md +++ b/docs/04-05-nodejs-探针与健康检查.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-05` +- **手动练习**:复制本课目录 `ansible/files/04-05/` 下清单到目标路径,按需改字段后按本文 `kubectl`/bash 操作(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 04-05` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -14,10 +15,10 @@ - 已部署 `nodejs-demo`(`04-01`);应用需暴露可探测的 HTTP 路径(示例用根路径 `/`)。 -## 清单路径(唯一真源) +## 清单路径(本课分目录) -| 本篇完整清单 | [`ansible/files/04-01/04-05-nodejs-demo.yaml`](../ansible/files/04-01/04-05-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01/04-05-nodejs-demo.yaml` | +| 本篇完整清单 | [`ansible/files/04-05/04-05-nodejs-demo.yaml`](../ansible/files/04-05/04-05-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-05/04-05-nodejs-demo.yaml` | 探针端口与累积清单一致,为 **8080**(自 `04-02` 起与监听端口对齐)。 @@ -42,7 +43,7 @@ Kubernetes 会**周期性访问**你指定的地址,判断容器该不该重 ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01/04-05-nodejs-demo.yaml +kubectl apply -f ansible/files/04-05/04-05-nodejs-demo.yaml kubectl describe pod -l app=nodejs-demo -n default | sed -n '/Liveness/,/Events/p' kubectl get endpoints nodejs-demo -n default ``` diff --git a/docs/04-06-nodejs-副本与滚动发布.md b/docs/04-06-nodejs-副本与滚动发布.md index 968570a..3a05a5a 100644 --- a/docs/04-06-nodejs-副本与滚动发布.md +++ b/docs/04-06-nodejs-副本与滚动发布.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-06` +- **手动练习**:复制本课目录 `ansible/files/04-06/` 下清单到目标路径,按需改字段后按本文 `kubectl`/bash 操作(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 04-06` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 
playbook 断言通过 - **排障**:见本文「排障」 @@ -15,10 +16,10 @@ - 已部署 `nodejs-demo`(`04-01`)。 - 多副本时应用须 **无状态** 或可共享会话;否则需粘性会话/外部会话存储(本文不展开)。 -## 清单路径(唯一真源) +## 清单路径(本课分目录) -| 本篇完整清单 | [`ansible/files/04-01/04-06-nodejs-demo.yaml`](../ansible/files/04-01/04-06-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01/04-06-nodejs-demo.yaml` | +| 本篇完整清单 | [`ansible/files/04-06/04-06-nodejs-demo.yaml`](../ansible/files/04-06/04-06-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-06/04-06-nodejs-demo.yaml` | `replicas` 与 `strategy` 在 **Deployment.spec** 下,与 `selector` / `template` 同级。 diff --git a/docs/04-07-nodejs-Ingress与Traefik.md b/docs/04-07-nodejs-Ingress与Traefik.md index 911d7d2..0df5da1 100644 --- a/docs/04-07-nodejs-Ingress与Traefik.md +++ b/docs/04-07-nodejs-Ingress与Traefik.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-07` +- **手动练习**:复制本课目录 `ansible/files/04-07/` 下清单到目标路径,按需改字段后按本文 `kubectl`/bash 操作(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 04-07` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -14,10 +15,10 @@ - 已部署 `04-01` 中的 `Ingress`;可选:`03-01-k3s-traefik-dashboard.md` 观察路由。 -## 清单路径(唯一真源) +## 清单路径(本课分目录) -| 本篇完整清单(含 Ingress `host` + `/api`) | [`ansible/files/04-01/04-07-nodejs-demo.yaml`](../ansible/files/04-01/04-07-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01/04-07-nodejs-demo.yaml` | +| 本篇完整清单(含 Ingress `host` + `/api`) | [`ansible/files/04-07/04-07-nodejs-demo.yaml`](../ansible/files/04-07/04-07-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-07/04-07-nodejs-demo.yaml` | `host` / `path` 可按环境修改清单;`curl` 用 IP 访问时需带 **`Host`** 头。 @@ -52,7 +53,7 @@ Traefik 原生 CRD 可做中间件、多规则组合等;集群需已安装对 ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01/04-07-nodejs-demo.yaml +kubectl apply -f ansible/files/04-07/04-07-nodejs-demo.yaml kubectl describe ing nodejs-demo -n default # --- 情况 A:仍是 04-01 的 Ingress(无 rules.host,path=/node)--- 
diff --git a/docs/04-08-nodejs-资源请求与限制.md b/docs/04-08-nodejs-资源请求与限制.md index a1f52ba..f465bb0 100644 --- a/docs/04-08-nodejs-资源请求与限制.md +++ b/docs/04-08-nodejs-资源请求与限制.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-08` +- **手动练习**:复制本课目录 `ansible/files/04-08/` 下清单到目标路径,按需改字段后按本文 `kubectl`/bash 操作(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 04-08` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -14,10 +15,10 @@ - 已部署 `nodejs-demo`(`04-01`)。 -## 清单路径(唯一真源) +## 清单路径(本课分目录) -| 本篇完整清单 | [`ansible/files/04-01/04-08-nodejs-demo.yaml`](../ansible/files/04-01/04-08-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01/04-08-nodejs-demo.yaml` | +| 本篇完整清单 | [`ansible/files/04-08/04-08-nodejs-demo.yaml`](../ansible/files/04-08/04-08-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-08/04-08-nodejs-demo.yaml` | ## 场景说明(白话) @@ -34,7 +35,7 @@ ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01/04-08-nodejs-demo.yaml +kubectl apply -f ansible/files/04-08/04-08-nodejs-demo.yaml kubectl describe pod -l app=nodejs-demo -n default | grep -A5 "Limits\|Requests" ``` diff --git a/docs/04-09-nodejs-调度与亲和.md b/docs/04-09-nodejs-调度与亲和.md index 1b7ac9f..1e84cf4 100644 --- a/docs/04-09-nodejs-调度与亲和.md +++ b/docs/04-09-nodejs-调度与亲和.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-09` +- **手动练习**:复制本课目录 `ansible/files/04-09/` 下清单到目标路径,按需改字段后按本文 `kubectl`/bash 操作(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 04-09` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -14,10 +15,10 @@ - 已部署 `nodejs-demo`(`04-01`);集群至少一个节点带可区分 **label**(例如 `kubectl get nodes --show-labels`)。 -## 清单路径(唯一真源) +## 清单路径(本课分目录) -| 本篇完整清单 | [`ansible/files/04-01/04-09-nodejs-demo.yaml`](../ansible/files/04-01/04-09-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01/04-09-nodejs-demo.yaml` | +| 本篇完整清单 | 
[`ansible/files/04-09/04-09-nodejs-demo.yaml`](../ansible/files/04-09/04-09-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-09/04-09-nodejs-demo.yaml` | 清单中默认 `nodeSelector: kubernetes.io/hostname: ylc62`,请改为本集群节点名。 @@ -46,7 +47,7 @@ ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01/04-09-nodejs-demo.yaml +kubectl apply -f ansible/files/04-09/04-09-nodejs-demo.yaml kubectl get pod -l app=nodejs-demo -n default -o wide ``` diff --git a/docs/04-10-nodejs-安全上下文.md b/docs/04-10-nodejs-安全上下文.md index bf70e75..30d6514 100644 --- a/docs/04-10-nodejs-安全上下文.md +++ b/docs/04-10-nodejs-安全上下文.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-10` +- **手动练习**:复制本课目录 `ansible/files/04-10/` 下清单到目标路径,按需改字段后按本文 `kubectl`/bash 操作(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 04-10` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -15,10 +16,10 @@ - 已部署 `nodejs-demo`(`04-01`)。 - 注意:`node:18-alpine` 默认用户可能为 root;非 root 运行需镜像内已有可写目录或使用 `emptyDir` 挂载(见 [`04-11-nodejs-存储与卷.md`](04-11-nodejs-存储与卷.md))。 -## 清单路径(唯一真源) +## 清单路径(本课分目录) -| 本篇完整清单 | [`ansible/files/04-01/04-10-nodejs-demo.yaml`](../ansible/files/04-01/04-10-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01/04-10-nodejs-demo.yaml` | +| 本篇完整清单 | [`ansible/files/04-10/04-10-nodejs-demo.yaml`](../ansible/files/04-10/04-10-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-10/04-10-nodejs-demo.yaml` | ## 场景说明(白话) @@ -39,7 +40,7 @@ ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01/04-10-nodejs-demo.yaml +kubectl apply -f ansible/files/04-10/04-10-nodejs-demo.yaml kubectl get pod -l app=nodejs-demo -n default kubectl exec deploy/nodejs-demo -n default -- id ``` diff --git a/docs/04-11-nodejs-存储与卷.md b/docs/04-11-nodejs-存储与卷.md index dfb2544..c14e8a6 100644 --- a/docs/04-11-nodejs-存储与卷.md +++ b/docs/04-11-nodejs-存储与卷.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-11` +- 
**手动练习**:复制本课目录 `ansible/files/04-11/` 下清单到目标路径,按需改字段后按本文 `kubectl`/bash 操作(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 04-11` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -15,10 +16,10 @@ - 已部署 `nodejs-demo`(`04-01`)。 - 持久化前请先完成存储类选型:`03-05-k3s-local-path-pvc.md`、`03-06-k3s-使用nfs存储.md`、`03-07-k3s-longhorn-持久化存储.md` 等。 -## 清单路径(唯一真源) +## 清单路径(本课分目录) -| 本篇完整清单(含 PVC + `/data` 挂载,默认 `storageClassName: local-path`) | [`ansible/files/04-01/04-11-nodejs-demo.yaml`](../ansible/files/04-01/04-11-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01/04-11-nodejs-demo.yaml` | +| 本篇完整清单(含 PVC + `/data` 挂载,默认 `storageClassName: local-path`) | [`ansible/files/04-11/04-11-nodejs-demo.yaml`](../ansible/files/04-11/04-11-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-11/04-11-nodejs-demo.yaml` | emptyDir、仅 ConfigMap 卷等变体可在该清单基础上自行删减 PVC 与 `volumeMounts` 做实验。 @@ -36,12 +37,12 @@ emptyDir、仅 ConfigMap 卷等变体可在该清单基础上自行删减 PVC | `volumeMounts` | 仅 `/tmp` | 增加 `/data` | | `volumes` | 仅 `tmp` emptyDir | 增加 `persistentVolumeClaim` | -**emptyDir 缓存卷**、**ConfigMap 只读挂载** 的片段写法见 Kubernetes 文档;可在 [`04-11-nodejs-demo.yaml`](../ansible/files/04-01/04-11-nodejs-demo.yaml) 上自行合并实验。 +**emptyDir 缓存卷**、**ConfigMap 只读挂载** 的片段写法见 Kubernetes 文档;可在 [`04-11-nodejs-demo.yaml`](../ansible/files/04-11/04-11-nodejs-demo.yaml) 上自行合并实验。 ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01/04-11-nodejs-demo.yaml +kubectl apply -f ansible/files/04-11/04-11-nodejs-demo.yaml kubectl get pvc -n default kubectl exec deploy/nodejs-demo -n default -- df -h /data ``` diff --git a/docs/04-12-nodejs-TLS与证书.md b/docs/04-12-nodejs-TLS与证书.md index 4a7e403..d86646a 100644 --- a/docs/04-12-nodejs-TLS与证书.md +++ b/docs/04-12-nodejs-TLS与证书.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-12` +- **手动练习**:复制本课目录 `ansible/files/04-12/` 下清单到目标路径,按需改字段后按本文 `kubectl`/bash 操作(学习路径可不使用 verify)。 +- 
**自动化验收**:`./ansible/bin/verify.sh run 04-12` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -15,10 +16,10 @@ - 已完成 `03-02`(推荐):Traefik 已配置 `websecure` 与证书解析器;或你已手动/其他方式准备好 TLS Secret。 - 已能 **从客户端访问** 到 Traefik 的 443(或你环境中的 HTTPS 入口)。 -## 清单路径(唯一真源) +## 清单路径(本课分目录) -| 本篇完整清单(Ingress 已切 **websecure** + `spec.tls`;**不含** Secret 内容) | [`ansible/files/04-01/04-12-nodejs-demo.yaml`](../ansible/files/04-01/04-12-nodejs-demo.yaml) | -| 应用 | 先创建 TLS Secret(见下),再 `kubectl apply -f ansible/files/04-01/04-12-nodejs-demo.yaml` | +| 本篇完整清单(Ingress 已切 **websecure** + `spec.tls`;**不含** Secret 内容) | [`ansible/files/04-12/04-12-nodejs-demo.yaml`](../ansible/files/04-12/04-12-nodejs-demo.yaml) | +| 应用 | 先创建 TLS Secret(见下),再 `kubectl apply -f ansible/files/04-12/04-12-nodejs-demo.yaml` | **证书 Secret**:使用命令创建(不提交私钥到 Git): diff --git a/docs/04-13-nodejs-HPA.md b/docs/04-13-nodejs-HPA.md index 67ca12c..c5be1eb 100644 --- a/docs/04-13-nodejs-HPA.md +++ b/docs/04-13-nodejs-HPA.md @@ -5,7 +5,8 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-13` +- **手动练习**:复制本课目录 `ansible/files/04-13/` 下清单到目标路径,按需改字段后按本文 `kubectl`/bash 操作(学习路径可不使用 verify)。 +- **自动化验收**:`./ansible/bin/verify.sh run 04-13` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -16,10 +17,10 @@ - Deployment 已配置 **`resources.requests`**(CPU 指标 HPA 依赖 requests),见 [`04-08-nodejs-资源请求与限制.md`](04-08-nodejs-资源请求与限制.md)。 - 建议已配置 **readinessProbe**([`04-05-nodejs-探针与健康检查.md`](04-05-nodejs-探针与健康检查.md)),避免扩容出未就绪 Pod。 -## 清单路径(唯一真源) +## 清单路径(本课分目录) -| 本篇完整清单(含 Deployment/Service/Ingress/PVC/CM + **HPA**) | [`ansible/files/04-01/04-13-nodejs-demo.yaml`](../ansible/files/04-01/04-13-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01/04-13-nodejs-demo.yaml`(若用 `04-12`,需先有 TLS Secret) | +| 本篇完整清单(含 Deployment/Service/Ingress/PVC/CM + **HPA**) | 
[`ansible/files/04-13/04-13-nodejs-demo.yaml`](../ansible/files/04-13/04-13-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-13/04-13-nodejs-demo.yaml`(若用 `04-12`,需先有 TLS Secret) | ## 场景说明(白话) @@ -35,7 +36,7 @@ ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01/04-13-nodejs-demo.yaml +kubectl apply -f ansible/files/04-13/04-13-nodejs-demo.yaml kubectl get hpa -n default kubectl describe hpa nodejs-demo -n default ``` diff --git a/docs/04-14-nodejs-GitOps与CI流水线.md b/docs/04-14-nodejs-GitOps与CI流水线.md index d68ba56..7e85e6f 100644 --- a/docs/04-14-nodejs-GitOps与CI流水线.md +++ b/docs/04-14-nodejs-GitOps与CI流水线.md @@ -5,7 +5,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 04-14` +- **自动化验收**:`./ansible/bin/verify.sh run 04-14` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/docs/05-00-常用应用部署-系列说明.md b/docs/05-00-常用应用部署-系列说明.md index fd35262..7f32214 100644 --- a/docs/05-00-常用应用部署-系列说明.md +++ b/docs/05-00-常用应用部署-系列说明.md @@ -5,24 +5,38 @@ ## TL;DR - **本页定位**:仅系列导航,不参与 `verify.sh run-all/full` -- **子篇执行入口**:按下表执行 `./scripts/verify.sh run <doc_id>` +- **子篇执行入口**:按下表执行 `./ansible/bin/verify.sh run <doc_id>` ## 范围与非目标 - 本页是 **05 系列入口/导航页**(`YY=00`),不要求具备独立执行器,且不参与 `verify.sh run-all/full`。 - `YY>0` 的分项必须包含可执行物(YAML 路径或命令块)。 +## 系列前置与域名约定(通用) + +各 `05-YY` 分篇的 **硬性基座** 默认只要求 **01 系列**(K3s 与节点初始化等实验室环境已可用)。**不要**在未说明的情况下把 `02`、`04-xx` 等演示/矩阵课当作每篇的必做前置。 + +**域名与公网入口**按实际需求选用 **`03` 系列**(与 Traefik / Tunnel 相关): + +| 需求 | 建议先完成 | +|------|------------| +| 不需要公网域名、仅集群或内网访问 | 无额外 03-xx | +| 公网域名 + HTTPS(Let's Encrypt 等,Traefik ACME) | **`03-02`** 或 **`03-03`**(二选一,与是否使用 Dashboard 同路径一致即可) | +| 使用 Cloudflare Tunnel 暴露服务 | **`03-04`** | + +具体分篇若还有特殊前置(如存储类、监控),以该篇「前置条件」为准;**GitLab** 的完整表述见 [`05-03-k3s-安装gitlab-含runner.md`](05-03-k3s-安装gitlab-含runner.md) 中的表格。 + ## 05 系列索引 | doc_id | 主题 | 子篇执行入口 | |-------:|------|------------| -| 05-01 | Homer 首页面板 | `./scripts/verify.sh run 05-01` | -| 05-02 | OneNav 首页面板 | 
`./scripts/verify.sh run 05-02` | -| 05-03 | GitLab(含 Runner) | `./scripts/verify.sh run 05-03` | -| 05-04 | GitLab CI/CD 配置 | `./scripts/verify.sh run 05-04` | -| 05-05 | Prometheus + Grafana | `./scripts/verify.sh run 05-05` | -| 05-06 | openlist 挂载网盘与自动备份 | `./scripts/verify.sh run 05-06` | -| 05-07 | openclaw 应用部署 | `./scripts/verify.sh run 05-07` | -| 05-08 | openclaw k3s 实验部署 | `./scripts/verify.sh run 05-08` | -| 05-09 | openclaw web 小游戏平台 | `./scripts/verify.sh run 05-09` | +| 05-01 | Homer 首页面板 | `./ansible/bin/verify.sh run 05-01` | +| 05-02 | OneNav 首页面板 | `./ansible/bin/verify.sh run 05-02` | +| 05-03 | GitLab(含 Runner) | `./ansible/bin/verify.sh run 05-03` | +| 05-04 | GitLab CI/CD 配置 | `./ansible/bin/verify.sh run 05-04` | +| 05-05 | Prometheus + Grafana | `./ansible/bin/verify.sh run 05-05` | +| 05-06 | openlist 挂载网盘与自动备份 | `./ansible/bin/verify.sh run 05-06` | +| 05-07 | openclaw 应用部署 | `./ansible/bin/verify.sh run 05-07` | +| 05-08 | openclaw k3s 实验部署 | `./ansible/bin/verify.sh run 05-08` | +| 05-09 | openclaw web 小游戏平台 | `./ansible/bin/verify.sh run 05-09` | diff --git a/docs/05-01-k3s-部署homer首页面板.md b/docs/05-01-k3s-部署homer首页面板.md index 53e48b9..becd22a 100644 --- a/docs/05-01-k3s-部署homer首页面板.md +++ b/docs/05-01-k3s-部署homer首页面板.md @@ -5,7 +5,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 05-01` +- **自动化验收**:`./ansible/bin/verify.sh run 05-01` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/docs/05-02-onenav首页面板.md b/docs/05-02-onenav首页面板.md index 9229598..6d2a2f9 100644 --- a/docs/05-02-onenav首页面板.md +++ b/docs/05-02-onenav首页面板.md @@ -5,7 +5,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 05-02` +- **自动化验收**:`./ansible/bin/verify.sh run 05-02` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/docs/05-03-k3s-安装gitlab-含runner.md b/docs/05-03-k3s-安装gitlab-含runner.md index 04288d3..4c27816 100644 --- 
a/docs/05-03-k3s-安装gitlab-含runner.md +++ b/docs/05-03-k3s-安装gitlab-含runner.md @@ -5,7 +5,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 05-03` +- **自动化验收**:`./ansible/bin/verify.sh run 05-03` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -14,9 +14,24 @@ ## 前置条件 -- `01`、`02`、`04-01` 已完成 -- 集群资源足够(GitLab 对 CPU/内存要求较高) -- 已有可用域名(例如 `git.example.com`) +### 基座(必须) + +- **01 系列已就绪**:K3s 集群与节点初始化等实验室基座完成(见 `01-00` / `01-05` 等),本机可 `kubectl` 管理集群。 + - **不**将 `02`(nginx 验证矩阵)、`04-01`(Node.js 高级部署)等列为本文硬性前置;仅当你要沿用同环境的入口/域名习惯时,再自行参考。 + +### 资源(强烈建议) + +- GitLab 组件多、默认资源占用高,请为集群预留足够 CPU/内存/存储(具体以官方 Chart 与 `values` 为准)。 + +### 域名与入口(按场景选用) + +| 场景 | 建议先完成的文档 | 说明 | +|------|------------------|------| +| **无公网域名**:仅用集群内访问(如 NodePort、`*.svc`、内网 Ingress IP) | 无额外 03-xx | `values` 与 Runner 的 `--url` 使用你可访问的地址即可(常为内网或 IP)。 | +| **公网域名 + HTTPS(Traefik + ACME)** | **`03-02`** 或 **`03-03`** | 需要合法 DNS 与证书自动化时二选一:仅证书/入口见 **`03-02`**;若你已在 **`03-03`** 路径上统一 Dashboard+ACME,可沿用同一套 Traefik/证书策略。 | +| **经 Cloudflare Tunnel 暴露 GitLab** | **`03-04`** | 走 Tunnel 时,入口与证书策略以 Tunnel 与 Cloudflare 侧为准;集群内需已按 `03-04` 完成接入。 | + +本文示例中的 `git.example.com` 仅作占位;请替换为你的真实域名或内网访问地址。 --- @@ -24,7 +39,19 @@ - GitLab 属于重量级应用:组件多(Web/Sidekiq/Registry/DB/Redis 等),关联的 YAML 数量非常大。 - 官方维护的 Helm Chart 已经帮你整理好了依赖关系、默认参数和升级路径,本质上还是“渲染出一堆 YAML 再 `kubectl apply`”。 -- 在本仓库里:简单/教学型服务(nginx、Node.js demo、Homer、OneNav 等)我们通过示例 YAML 手写;而像 GitLab 这种复杂应用,则更推荐直接沿用官方 Chart,只维护一份 `values-gitlab.yaml`。 +- 在本仓库里:简单/教学型服务(nginx、Node.js demo、Homer、OneNav 等)我们通过示例 YAML 手写;而像 GitLab 这种复杂应用,则更推荐直接沿用官方 Chart,在本地以 **`values-gitlab.yaml`** 覆盖参数;仓库内提供起点示例 **`ansible/files/05-03/values-gitlab.example.yaml`**(复制后修改)。 + +--- + +## 存储与节点(实验室:固定 ylc63、本地落盘) + +本仓库 **`inventory.ini`** 中 **`ylc63`(192.168.2.63)** 作为 GitLab **数据落盘节点**:示例 `values-gitlab.example.yaml` 已配置: + +- **`global.nodeSelector.kubernetes.io/hostname: ylc63`**:GitLab 自带组件(如 Webservice、Gitaly 等)调度到该节点。 +- **PostgreSQL / Redis 
/ MinIO** 等依赖子 chart **不一定会继承** `global.nodeSelector`,示例中为 **`postgresql.primary` / `redis.master` / `minio`** 补了同样的 **`nodeSelector`**,避免数据卷漂到其他节点。 +- **持久卷**:各组件 **`persistence.storageClass: local-path`**,与 K3s 默认 **local-path-provisioner** 一致,卷与 Pod 同节点,即 **落在 ylc63 本地磁盘**(逻辑与 **`03-05-k3s-local-path-pvc.md`** 一致;若需改基路径见该文 **`local-path-config`**)。 + +部署前请确认 **`ylc63` 磁盘容量** 覆盖 Gitaly / PG / Redis / MinIO 等 `size` 之和;节点名若与 inventory 不一致,请 `kubectl get nodes` 后全局替换 `ylc63`。 --- @@ -44,7 +71,17 @@ helm repo update kubectl create namespace gitlab ``` -准备 `values-gitlab.yaml`(按你的域名与资源调整),然后安装: +本仓库提供示例 values(占位域名 `example.com`、实验室缩小副本;**请复制后按需改域名与资源**): + +- 示例文件:[`ansible/files/05-03/values-gitlab.example.yaml`](../ansible/files/05-03/values-gitlab.example.yaml) + +```bash +cp ansible/files/05-03/values-gitlab.example.yaml values-gitlab.yaml +# 编辑 values-gitlab.yaml:域名、Ingress/TLS、资源等;完整键名以当前 Chart 为准: +# helm show values gitlab/gitlab +``` + +然后安装: ```bash helm upgrade --install gitlab gitlab/gitlab \ diff --git a/docs/05-04-k3s-配置gitlab-cicd.md b/docs/05-04-k3s-配置gitlab-cicd.md index 152838a..7fade48 100644 --- a/docs/05-04-k3s-配置gitlab-cicd.md +++ b/docs/05-04-k3s-配置gitlab-cicd.md @@ -5,7 +5,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 05-04` +- **自动化验收**:`./ansible/bin/verify.sh run 05-04` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/docs/05-05-prometheus与grafana.md b/docs/05-05-prometheus与grafana.md index d24a7dd..8f8b663 100644 --- a/docs/05-05-prometheus与grafana.md +++ b/docs/05-05-prometheus与grafana.md @@ -2,10 +2,15 @@ > 使用 `kube-prometheus-stack` 建立基础可观测能力。 +## 契约与真源 + +- **Helm values 示例**:`ansible/files/05-05/kube-prometheus-stack-values.example.yaml`(见同目录 `README.md`)。 +- **手动**:下文 `helm` 命令;可将 `-f ansible/files/05-05/kube-prometheus-stack-values.example.yaml` 传给 `helm upgrade --install`。 +- **自动**:`./ansible/bin/verify.sh run 05-05`(与上述路径同一真源,便于对照;当前为 noop + 集群基线)。 ## TL;DR -- 
**自动化验收**:`./scripts/verify.sh run 05-05` +- **自动化验收**:`./ansible/bin/verify.sh run 05-05` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 @@ -25,7 +30,8 @@ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts helm repo update kubectl create namespace monitoring -helm upgrade --install monitoring prometheus-community/kube-prometheus-stack -n monitoring +helm upgrade --install monitoring prometheus-community/kube-prometheus-stack -n monitoring \ + -f ansible/files/05-05/kube-prometheus-stack-values.example.yaml ``` ## 验证命令 diff --git a/docs/05-06-openlist挂载网盘与自动备份.md b/docs/05-06-openlist挂载网盘与自动备份.md index 8238315..7f2ef4b 100644 --- a/docs/05-06-openlist挂载网盘与自动备份.md +++ b/docs/05-06-openlist挂载网盘与自动备份.md @@ -5,7 +5,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 05-06` +- **自动化验收**:`./ansible/bin/verify.sh run 05-06` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/docs/05-07-openclaw应用部署.md b/docs/05-07-openclaw应用部署.md index 9d8a699..82a620b 100644 --- a/docs/05-07-openclaw应用部署.md +++ b/docs/05-07-openclaw应用部署.md @@ -6,7 +6,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 05-07` +- **自动化验收**:`./ansible/bin/verify.sh run 05-07` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/docs/05-08-openclaw-k3s-实验部署.md b/docs/05-08-openclaw-k3s-实验部署.md index d129c0c..994dad8 100644 --- a/docs/05-08-openclaw-k3s-实验部署.md +++ b/docs/05-08-openclaw-k3s-实验部署.md @@ -6,7 +6,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 05-08` +- **自动化验收**:`./ansible/bin/verify.sh run 05-08` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/docs/05-09-openclaw-web-小游戏网页平台.md b/docs/05-09-openclaw-web-小游戏网页平台.md index 03ef74c..5aaf733 100644 --- a/docs/05-09-openclaw-web-小游戏网页平台.md +++ b/docs/05-09-openclaw-web-小游戏网页平台.md @@ -5,7 +5,7 @@ ## TL;DR -- 
**自动化验收**:`./scripts/verify.sh run 05-09` +- **自动化验收**:`./ansible/bin/verify.sh run 05-09` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/docs/06-00-排障与运维-系列说明.md b/docs/06-00-排障与运维-系列说明.md index 6fe6557..331aeb8 100644 --- a/docs/06-00-排障与运维-系列说明.md +++ b/docs/06-00-排障与运维-系列说明.md @@ -5,7 +5,7 @@ ## TL;DR - **本页定位**:仅系列导航,不参与 `verify.sh run-all/full` -- **子篇执行入口**:按下表执行 `./scripts/verify.sh run <doc_id>` +- **子篇执行入口**:按下表执行 `./ansible/bin/verify.sh run <doc_id>` ## 范围与非目标 @@ -16,7 +16,7 @@ | doc_id | 主题 | 子篇执行入口 | |-------:|------|------------| -| 06-01 | NetworkPolicy 使用与故障排查 | `./scripts/verify.sh run 06-01` | -| 06-02 | 运维小结 | `./scripts/verify.sh run 06-02` | -| 06-03 | 自动备份与恢复(openlist WebDAV) | `./scripts/verify.sh run 06-03` | +| 06-01 | NetworkPolicy 使用与故障排查 | `./ansible/bin/verify.sh run 06-01` | +| 06-02 | 运维小结 | `./ansible/bin/verify.sh run 06-02` | +| 06-03 | 自动备份与恢复(openlist WebDAV) | `./ansible/bin/verify.sh run 06-03` | diff --git a/docs/06-01-k3s-networkpolicy-故障排查.md b/docs/06-01-k3s-networkpolicy-故障排查.md index 5f1161d..0a43924 100644 --- a/docs/06-01-k3s-networkpolicy-故障排查.md +++ b/docs/06-01-k3s-networkpolicy-故障排查.md @@ -3,10 +3,15 @@ > 本文只负责 **网络策略与连通性排障**。 > 若你要做 Traefik 部署、ServiceLB 池配置,请看 `01-02-k3s-工作节点.md`。 +## 契约与真源 + +- **NetworkPolicy 示例**:`ansible/files/06-01/networkpolicy-traefik-egress.example.yaml`、`ansible/files/06-01/networkpolicy-backend-ingress.example.yaml`(复制改名后按集群标签/CIDR 修改再 `kubectl apply`)。 +- **手动**:使用下文的 bash 排查命令;上述 YAML 所在目录的索引见 `ansible/files/06-01/README.md`。 +- **自动**:`./ansible/bin/verify.sh run 06-01`(noop + 基线;策略真源以 `ansible/files/06-01/` 为准)。 ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 06-01` +- **自动化验收**:`./ansible/bin/verify.sh run 06-01` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/docs/06-02-运维小结.md b/docs/06-02-运维小结.md index a3c72a8..b3008f3 100644 --- a/docs/06-02-运维小结.md +++ b/docs/06-02-运维小结.md @@ -2,10 
+2,14 @@ > 日常运维建议:检查项、变更记录、备份策略。 +## 契约与真源 + +- **索引**:`ansible/files/06-02/README.md`(本篇无独立清单;命令以正文速查为准)。 +- **自动**:`./ansible/bin/verify.sh run 06-02`。 ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 06-02` +- **自动化验收**:`./ansible/bin/verify.sh run 06-02` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/docs/06-03-k3s-自动备份与恢复-openlist-webdav.md b/docs/06-03-k3s-自动备份与恢复-openlist-webdav.md index 81fadb2..66b5ada 100644 --- a/docs/06-03-k3s-自动备份与恢复-openlist-webdav.md +++ b/docs/06-03-k3s-自动备份与恢复-openlist-webdav.md @@ -6,7 +6,7 @@ ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 06-03` +- **自动化验收**:`./ansible/bin/verify.sh run 06-03` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/docs/07-00-网络与CNI实验-系列说明.md b/docs/07-00-网络与CNI实验-系列说明.md index 4f89679..1782da4 100644 --- a/docs/07-00-网络与CNI实验-系列说明.md +++ b/docs/07-00-网络与CNI实验-系列说明.md @@ -5,7 +5,7 @@ ## TL;DR - **本页定位**:仅系列导航,不参与 `verify.sh run-all/full` -- **子篇执行入口**:按下表执行 `./scripts/verify.sh run `(若 playbook 为 noop,仍应以文档手工步骤为准) +- **子篇执行入口**:按下表执行 `./ansible/bin/verify.sh run `(若 playbook 为 noop,仍应以文档手工步骤为准) - **成功判据**:按文档命令块完成实验且具备回滚;不要把“脚本 exit 0”误当“已验证完成” ## 范围与非目标 @@ -17,6 +17,6 @@ | doc_id | 主题 | 自动化入口 | |-------:|------|------------| -| 07-01 | Calico 双栈实验 | `./scripts/verify.sh run 07-01` | -| 07-02 | Cilium 双栈与 eBPF | `./scripts/verify.sh run 07-02` | +| 07-01 | Calico 双栈实验 | `./ansible/bin/verify.sh run 07-01` | +| 07-02 | Cilium 双栈与 eBPF | `./ansible/bin/verify.sh run 07-02` | diff --git a/docs/07-01-k3s-calico-dualstack.md b/docs/07-01-k3s-calico-dualstack.md index bff5f16..8a4f991 100644 --- a/docs/07-01-k3s-calico-dualstack.md +++ b/docs/07-01-k3s-calico-dualstack.md @@ -2,10 +2,14 @@ > 草稿占位:本节用于在后续版本中整理 **k3s + Calico** 的 IPv4/IPv6 双栈网络实验,包括安装参数、clusterCIDR/serviceCIDR 设计、Pod IPv6 出网与 NetworkPolicy 示例。 +## 契约与真源 + +- **索引**:`ansible/files/07-01/README.md`(实验清单待正文补全后纳入同目录)。 +- 
**自动**:`./ansible/bin/verify.sh run 07-01`。 ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 07-01` +- **自动化验收**:`./ansible/bin/verify.sh run 07-01` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/docs/07-02-k3s-cilium-dualstack-ebpf.md b/docs/07-02-k3s-cilium-dualstack-ebpf.md index cc09ab8..82a081d 100644 --- a/docs/07-02-k3s-cilium-dualstack-ebpf.md +++ b/docs/07-02-k3s-cilium-dualstack-ebpf.md @@ -2,10 +2,14 @@ > 草稿占位:本节用于后续整理 **k3s + Cilium** 的 IPv4/IPv6 双栈与 eBPF 网络实验,作为在 Calico 双栈实验基础上的进阶篇。 +## 契约与真源 + +- **索引**:`ansible/files/07-02/README.md`(实验清单待正文补全后纳入同目录)。 +- **自动**:`./ansible/bin/verify.sh run 07-02`。 ## TL;DR -- **自动化验收**:`./scripts/verify.sh run 07-02` +- **自动化验收**:`./ansible/bin/verify.sh run 07-02` - **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP - **成功判据**:达到本文「预期」且 playbook 断言通过 - **排障**:见本文「排障」 diff --git a/project-context.md b/project-context.md index d38a7c9..3417a48 100644 --- a/project-context.md +++ b/project-context.md @@ -19,7 +19,7 @@ existing_patterns_found: --- -## 技术栈与版本(来自 `docs/00-04-部署环境说明.md`) +## 技术栈与版本(来自 `docs/00-02-部署环境说明.md`) - **操作系统(验证环境)**:Fedora 43 Server(CoreOS) - **K3s**:v1.34.5+k3s1 @@ -38,26 +38,28 @@ existing_patterns_found: - 文档中引用清单路径时,必须引用 `ansible/files//...`(禁止在 `docs/` 内复制粘贴出第二份 YAML)。 - 例外:非 YAML 的源码/生成器/模板(如 `scripts/gen-*.py`)不受此条限制。 - **执行器唯一真源(强约束)**:`ansible/playbooks/verify/.yml` 为该 `doc_id` 的唯一 Ansible 入口。 - - `scripts/verify.sh` 只基于 `ansible/playbooks/verify/` 自动发现并执行(缺 playbook 必须 fail-fast)。 -- **存在性校验**:`scripts/validate_matrix_playbooks.py`(历史文件名保留)只做 “`verify/.yml` ↔ `docs/-*.md` 存在性” 校验。 + - `ansible/bin/verify.sh` 只基于 `ansible/playbooks/verify/` 自动发现并执行(缺 playbook 必须 fail-fast)。 +- **存在性校验**:`ansible/tools/validate_matrix_playbooks.py`(历史文件名保留)只做 “`verify/.yml` ↔ `docs/-*.md` 存在性” 校验。 - **目录命名硬约束**:`ansible/files/` 下仅允许 `XX-YY/`(只用 `doc_id`),内部用文件名区分;不再允许 `XX-YY-slug/` 或 `XX-YY-xxx/`。 --- ## doc_id 与验证框架(必须遵循) +**以 `doc_id` 为唯一主键(相对 BMad / 
规划命名)**:凡与**验收入口、清单真源、自动化范围**对齐时,以执行域 **`doc_id`(`XX-YY`)** 及 **`ansible/playbooks/verify/<doc_id>.yml`** 为准。**BMad Epic/Story 编号**、`_bmad-output/implementation-artifacts/` 下任意 `*.md` 文件名(如 `4-1-acme-real-pass.md`、`01-02-baseline-verify-oc.md`)均为**项目管理或补充叙事**;其中 **`stories-by-doc/story-doc-<doc_id>.md`** 与 `doc_id` **一一对应**,为按 doc_id 全覆盖的 **[CS] 索引**;**`*-baseline-verify-oc.md`** 等同 `doc_id` 的**增强 backlog**,**不替代** `verify/<doc_id>.yml` 与 `docs/<doc_id>-*.md` 的工程真源。 + - **doc_id 规则**:`docs/<doc_id>-*.md` 中的 `<doc_id>` 固定为 `XX-YY`。 - **编号语义(极简、一目了然)**: - **`XX=00`**:纯文档域(框架/说明/索引/备忘),不要求执行器,且**不参与自动验证**。 - **`XX>0 且 YY=00`**:系列入口/说明页,不要求执行器,且**不参与自动验证**。 - **`XX>0 且 YY>0`**:必须至少有一种执行器(见下)。 -- **自动验证范围(强约束)**:仅 `XX>0 且 YY>0` 的文档进入 `scripts/verify.sh` 自动验证范围。 +- **自动验证范围(强约束)**:仅 `XX>0 且 YY>0` 的文档进入 `ansible/bin/verify.sh` 自动验证范围。 - **执行器判定口径(不新增关系文件)**: - - **Ansible 执行器**:存在 `ansible/playbooks/verify/<doc_id>.yml` 且可 `./scripts/verify.sh run <doc_id>`。 + - **Ansible 执行器**:存在 `ansible/playbooks/verify/<doc_id>.yml` 且可 `./ansible/bin/verify.sh run <doc_id>`(仓库根简写:`./scripts/cs <doc_id>`,与前者等价)。 - **脚本/SSH 执行器**:文档 TL;DR 写清入口命令(`./scripts/...`),且脚本存在、用退出码表达成功/失败。 - **统一入口**: - - `scripts/deploy-lab.sh`:铺栈/安装入口(默认保留资源:`DEPLOY_VERIFY_TEARDOWN=0`)。 - - `scripts/verify.sh`:按 `doc_id` 验收入口(默认清理:`VERIFY_TEARDOWN=1`)。 + - `ansible/bin/deploy-lab.sh`:铺栈/安装入口(默认保留资源:`DEPLOY_VERIFY_TEARDOWN=0`)。 + - `ansible/bin/verify.sh`:按 `doc_id` 验收入口(默认清理:`VERIFY_TEARDOWN=1`)。 - **fail-fast(执行域)**:`verify.sh run <doc_id>` 对 `XX>0 且 YY>0` 条目,若缺少 `ansible/playbooks/verify/<doc_id>.yml` 必须直接失败。 --- @@ -66,18 +68,29 @@ existing_patterns_found: - **默认安全**:`verify.sh run <doc_id>` 默认应只做“验收”或“轻量可逆操作”。 - **重操作必须显式开关**(典型:分区/格式化、全集群安装、TLS 矩阵铺栈等): - - `01-06.yml`:`k3s_do_prepare_storage=true` / `k3s_do_install=true` + - `01-05.yml`:`k3s_do_prepare_storage=true` / `k3s_do_install=true` - `03-02.yml`:`nginx_matrix_tls_enable=true`(TLS 矩阵铺栈/清理仍用 `mode=deploy|cleanup`) - `03-05.yml`:`local_path_apply_lab_config=true` -- **gate 语义**:外部依赖未满足可 `meta: end_play` 跳过,但必须输出可 grep 的 `[GATE] ...` 信息,避免“看似通过”。 +- **gate / 
skip 语义**(`ansible/bin/verify.sh` 仅把 **`[GATE]`** 计为 `result=gated`): + - **`[GATE]`**:外部依赖或硬前置未满足,**本 doc 目标验收未执行**(如缺 `NFS_SERVER_IP`、缺 `nodejs-demo-tls`、缺 `CF_TUNNEL_TEST_URL`、缺 `ACME_EMAIL` 等);必须保留可 grep 行,避免“看似全验过”。 + - **`[SKIP]`**:**可选段**未开启(如 `k3s_do_install=false`、`nginx_matrix_tls_enable=false`),但**同一 playbook 后续 play 仍会跑**基线验收;不计入 gated, playbook 成功时 OC 为 `verified`。 --- ## 环境变量与密钥安全(强约束) -- **永不提交真实环境变量文件**:`scripts/.env.verify`(仓库 `.gitignore` 已忽略) -- **仅提交模板**:`scripts/.env.verify.example` -- `scripts/.env.verify` 可能包含外部系统 token/凭据(Cloudflare、WebDAV 等),任何自动化/文档都应默认它只存在于本机 +- **永不提交真实环境变量文件**:`ansible/env/.env.verify`(仓库 `.gitignore` 已忽略) +- **仅提交模板**:`ansible/env/.env.verify.example` +- `ansible/env/.env.verify` 可能包含外部系统 token/凭据(Cloudflare、WebDAV 等),任何自动化/文档都应默认它只存在于本机 + +### `.env.verify` 变量组织(与 doc_id / 真源对齐) + +- **按 doc_id 分节**:`ansible/env/.env.verify` / `.env.verify.example` 以 **`# --- XX-YY … ---`** 小节组织变量(与 `docs/XX-YY`、playbook `lookup('env')` 对齐);**`SKIP_*` 写在对应 doc 小节内**(如 `SKIP_ARMV7` 与 01-03/01-04 的 `ARMV7_*` 同块,`SKIP_HA` / `SKIP_GITOPS` 在 03-08/03-09 并标 **可选·预留**)。文件头 **集中式 doc_id 索引** 可后续再补;标注 **预留** 的键当前 playbook **未读取**,仅备忘或手工对照。 +- **文件约定**:只写 `KEY=VALUE`(不写默认展开、命令替换等执行逻辑);入口脚本用 `set -a; source ...; set +a` 导出;细则以 `ansible/env/.env.verify` 文件头注释为准。 +- **推荐填写顺序**(与 `ansible/env/.env.verify` 小节自上而下一致):**SSH** → **Ansible(控制端 inventory/tmp)** → **`01-01`~`01-06` 按编号**(其中 **`01-05`** 含 `K3S_PREPARE_STORAGE`、`K3S_DO_*`;紧接 **`deploy-lab.sh`**;**`01-06`** `WORKSTATION_SSH` → 推荐 **Linux 工作机 `ylc65`**)→ **验证入口与 preflight(02-xx / 04-xx)** → **`03-08` / `03-09` 及后续 03-xx、04-xx**(各节内 **单独** 标「可选」的 `SKIP_*` 勿混堆) +- **`K3S_SERVER_HOSTNAME` / `K3S_DATA_DIR`**: + - **`docs/01-01-k3s-控制节点含traefik.md`** 与 **`docs/01-02-k3s-工作节点.md`** 共用:`K3S_SERVER_HOSTNAME` 为控制面短主机名(与 inventory 中 `k3s_server` 一致,工作节点手工 `K3S_URL` 指向该节点 `:6443`);`K3S_DATA_DIR` 为各节点一致的 k3s **`--data-dir`**(与 **`ansible/group_vars/all.yml`** 中 **`k3s_data_dir`** 一致),供脚本与人工对齐。 + - 
**执行真源**:`ansible/playbooks/verify/01-01.yml` 以**节点上** `kubectl`/kubeconfig 为准;**不要求** playbook 通过 `lookup('env', ...)` 读取这两项,避免误以为 01-01 验收依赖 `.env` 中必填此二键。 --- @@ -89,8 +102,8 @@ existing_patterns_found: - **YAML 唯一真源(强约束)**:`ansible/files/**` - `docs/`:说明/操作手册/验收判据(不得复制出第二套 YAML 真源) - **入口**: - - `scripts/deploy-lab.sh`:铺栈(默认保留资源,便于持续使用) - - `scripts/verify.sh`:按 doc_id 验收(`list/run/run-all/full`;默认清理本篇临时资源) + - `ansible/bin/deploy-lab.sh`:铺栈(默认保留资源,便于持续使用) + - `ansible/bin/verify.sh`:按 doc_id 验收(`list/run/run-all/full`;默认清理本篇临时资源) ### 规则 0.1:维护者备忘的归位(替代 docs/00-06) @@ -100,7 +113,7 @@ existing_patterns_found: - **Ansible shell 使用口径**: - “分支探测/兼容性场景”可用 `failed_when: false`;但后续必须有明确断言,避免“静默失败”。 - 清理类任务允许 `failed_when: false`,但应 `register` 并输出关键 rc/stdout/stderr(便于审计与排障)。 - - 优先保持 `verify/.yml` 轻编排;高重复模式应收敛到 `ansible/playbooks/verify/tasks/` 共享片段,避免模板漂移。 + - 优先保持 `verify/.yml` 轻编排;高重复模式应收敛到 `ansible/roles/verify_common/`(role 复用),避免模板漂移。 - **API/版本兼容性复核建议**:升级 K3s/Traefik 大版本后,至少复核一次:\n+ - `Ingress` API(`networking.k8s.io/v1`)字段结构(尤其 `pathType`、backend 端口)\n+ - Traefik CRD(`IngressRoute`/`Middleware`)是否仍存在且版本一致\n+ - K3s `HelmChartConfig`(`helm.cattle.io/v1`)行为是否变化\n+ - Longhorn 与 K3s 版本兼容(升级前对照 Longhorn support matrix) ### 规则 1:`doc_id` / verify 目录 / 解析器(防误触) @@ -128,7 +141,7 @@ existing_patterns_found: - HTTP:`curl` 返回码 + 关键响应标记(例如 `X-Backend`、body contains) - TLS:必须包含 SNI/证书链验证信号(如 `curl --resolve`/`openssl s_client` 关键字段),仅 `rollout` 不足以标 ✅ - **明确执行位置**: - - 若目标是“集群外链路”,必须经 `ONECLOUD_SSH`(或等价第三方机)执行探测;仅控制节点自测不足以标 ✅ + - 若目标是“集群外链路”,必须经 `WORKSTATION_SSH`(推荐 **Linux 工作机** `ylc65`,非 onecloud/arm 实验机)执行探测;仅控制节点自测不足以标 ✅ - **资源清理策略说明**: - 若 `VERIFY_TEARDOWN=0` 保留现场调试,文档需写明(否则容易污染后续用例,导致“假通过/假失败”) - **外部依赖说明**: @@ -139,7 +152,7 @@ existing_patterns_found: ### 规则 3:执行位置与“集群外视角”(防自测冒充真实路径) - 默认约定:在 `k3s_server`(如 ylc61)执行 `kubectl/helm/curl`(`KUBECONFIG=/etc/rancher/k3s/k3s.yaml`)。 -- 若用例声明需要“集群外/第三方机”视角(例如家庭网络真实访问路径、OpenWrt 入口、外部 curl),必须显式经 `ONECLOUD_SSH`(或等价变量)执行探测: 
+- 若用例声明需要“集群外/第三方机”视角(例如家庭网络真实访问路径、OpenWrt 入口、外部 curl),必须显式经 `WORKSTATION_SSH`(默认语义:**Linux 工作机**)执行探测: - **不得**用“控制节点本机 curl”替代“第三方机 curl”并仍标记为已验证 ### 规则 4:verify playbook 结构与可靠性约定(可复制模式) @@ -160,12 +173,12 @@ existing_patterns_found: ### 规则 6:密钥与敏感信息(强约束) -- `scripts/.env.verify` 只允许本机存在,**永不提交**;仓库只保留 `scripts/.env.verify.example`(`.gitignore` 已忽略 `scripts/.env.verify`)。 -- inventory 若声明 `ansible_ssh_private_key_file`,控制端必须确保文件存在且权限仅所有者可读(建议 600/400);否则应在 preflight 阶段直接失败(见 `scripts/lib-ansible-lab.sh` 的检查逻辑)。 +- `ansible/env/.env.verify` 只允许本机存在,**永不提交**;仓库只保留 `ansible/env/.env.verify.example`(`.gitignore` 已忽略 `ansible/env/.env.verify`)。 +- inventory 若声明 `ansible_ssh_private_key_file`,控制端必须确保文件存在且权限仅所有者可读(建议 600/400);否则应在 preflight 阶段直接失败(见 `ansible/lib/lib-ansible-lab.sh` 的检查逻辑)。 ### 规则 7:验证环境基线(避免“跑得通但不复现”) -- 验证环境的机器角色与约定(例如 `ylc65` 仅作为 Ansible 控制端,不是 K3s 节点)以 `docs/00-04-部署环境说明.md` 为准;新增文档/用例若依赖“在哪台机器执行”,必须写清。 +- 验证环境的机器角色与约定(例如 `ylc65` 仅作为 Ansible 控制端,不是 K3s 节点)以 `docs/00-02-部署环境说明.md` 为准;新增文档/用例若依赖“在哪台机器执行”,必须写清。 - `ansible/group_vars/all.yml` 中的关键默认值(如 `k3s_data_dir=/storage`、`k3s_verify_storage_mount=true`、`k3s_manage_firewalld=true`、CoreDNS forward)会影响大量文档与用例,修改这些值相当于“改了实验室契约”,应同步更新文档并回退相关条目的验证状态。 ### 规则 8:Ansible 控制端连接约定(实验室特化) diff --git a/scripts/.env.verify.example b/scripts/.env.verify.example deleted file mode 100644 index 80a23e8..0000000 --- a/scripts/.env.verify.example +++ /dev/null @@ -1,114 +0,0 @@ -# 验证 / 编排脚本用环境变量模板 -# --------------------------------------------------------------------------- -# 各变量与「未验证 / 部分验证」文档的对应关系、缺省时的行为,见 docs/00-07-待验证项-验证前准备.md -# 使用:复制为本目录下的 .env.verify(勿提交 Git),在仓库根执行: -# set -a && source scripts/.env.verify && set +a -# 或在 bash 中:source scripts/.env.verify -# 仅示例占位,请把 YOUR_* 换成真实值;密钥只留在本机 .env.verify。 -# --------------------------------------------------------------------------- - -# --- 集群与 kubectl --- -# 在控制节点本机跑 kubectl 时常用: -# export KUBECONFIG="${KUBECONFIG:-/etc/rancher/k3s/k3s.yaml}" -# 若在办公机通过 SSH 
在远端执行 kubectl,可设为: -# export K3S_CONTROL_SSH="ssh -o BatchMode=yes ylc61" -# export REMOTE_KUBECONFIG="/etc/rancher/k3s/k3s.yaml" -# 在控制节点本机执行 kubectl 时,避免再次经 SSH 自环(可按需保留): -# export VERIFY_FORCE_LOCAL_KUBECTL=1 -# 经 SSH 在控制节点执行 kubectl 时:若 PATH 中无 kubectl,可设 REMOTE_KUBECTL="k3s kubectl" -# export REMOTE_KUBECTL="k3s kubectl" -# 控制节点短主机名(与 inventory 一致;编排里拼 URL/SSH 用) -export K3S_SERVER_HOSTNAME="${K3S_SERVER_HOSTNAME:-ylc61}" -# 与 ansible group_vars 一致,验证磁盘/文档 00-04 时引用 -export K3S_DATA_DIR="${K3S_DATA_DIR:-/storage}" - -# --- Ansible(安装/复验 k3s;对应 docs/00-05 §2 步骤 3)--- -export ANSIBLE_INVENTORY="${ANSIBLE_INVENTORY:-$(pwd)/ansible/inventory.ini}" -# deploy-lab.sh k3s:为 true 时先在 01-06.yml 内启用准备数据盘(传 -e k3s_do_prepare_storage=true -e k3s_prepare_storage=true;磁盘变量见 group_vars) -export K3S_PREPARE_STORAGE="${K3S_PREPARE_STORAGE:-false}" -# 建议在控制节点或 Linux 工作机仓库根执行 deploy-lab.sh - -# --- SSH 密钥命名(与 scripts/ssh/test-ssh.sh 默认一致;脚本内尚为硬编码路径)--- -# test-ssh 使用:$K3S_SSH_KEY_DIR/${K3S_SSH_KEY_PREFIX} -# 若你改用其他前缀,需同步改 test-ssh.sh 或仅用手工 ssh -i。 -export K3S_SSH_KEY_DIR="${K3S_SSH_KEY_DIR:-$HOME/.ssh}" -export K3S_SSH_KEY_PREFIX="${K3S_SSH_KEY_PREFIX:-id_ed25519_k3s_}" -# setup-k3s-workers-ssh.sh 首次登录的非 root 用户名(交互默认 jack) -export SSH_USER="${SSH_USER:-jack}" -# test-ssh.sh:连接超时秒数(脚本已支持环境变量 TIMEOUT_SEC) -export TIMEOUT_SEC="${TIMEOUT_SEC:-5}" - -# --- 矩阵验证(docs/00-05 §2 步骤 4~6)--- -# ./scripts/verify.sh run | run-all | preflight | full | flow -# verify.sh 默认 VERIFY_TEARDOWN=1:每篇 playbook 末尾 teardown,再跑下一 doc_id;勿设为 0(资源残留会干扰后续用例) -export VERIFY_TEARDOWN="${VERIFY_TEARDOWN:-1}" -# 02-xx / 03-02 等经 Ingress 校验 HTTP;填控制节点或入口 URL(与 inventory 中 k3s_server IP 一致) -export nginx_entry_base="${nginx_entry_base:-http://192.168.2.61}" -# VERIFY_PREFLIGHT_CLUSTER(为 1 时 preflight 额外 kubectl get nodes) - -# --- SSH:第三方验证机 onecloud(不忽略:矩阵里多处依赖「集群外」curl/探测)--- -# 用途示例:02-xx nginx 矩阵从第三方访问 Ingress;01-07 经 onecloud 对 OpenWrt:18080/18443 发 curl; -# 与 K3s 节点 SSH 无关,但必须能免交互登录(建议 BatchMode + 已知的 
IdentityFile)。 -# 编排脚本应始终引用 ONECLOUD_SSH,不要用「本机直接 curl」代替,除非你明确改成本机。 -export ONECLOUD_SSH="${ONECLOUD_SSH:-ssh -o BatchMode=yes onecloud}" -# 若需显式密钥,可写完整一行,例如: -# export ONECLOUD_SSH="ssh -o BatchMode=yes -i ~/.ssh/id_ed25519_onecloud onecloud" - -# --- NFS(03-06):verify playbook 使用 NFS_SERVER_IP / NFS_EXPORT_PATH --- -export NFS_SERVER_HOST="${NFS_SERVER_HOST:-YOUR_NFS_IP_OR_HOSTNAME}" -export NFS_SERVER_IP="${NFS_SERVER_IP:-$NFS_SERVER_HOST}" -export NFS_EXPORT_PATH="${NFS_EXPORT_PATH:-/export/k3s}" -# export NFS_SSH="ssh -o BatchMode=yes root@${NFS_SERVER_HOST}" - -# --- Cloudflare(API / Tunnel;与 scripts/cloudflare-delete-acme-challenge-dns.sh 等一致)--- -# DNS 脚本使用:CF_API_TOKEN、ZONE_NAME 或 ZONE_ID -export CF_API_TOKEN="${CF_API_TOKEN:-}" -export ZONE_NAME="${ZONE_NAME:-jackadam.top}" -export ZONE_ID="${ZONE_ID:-}" -# Tunnel / Dashboard 等若需单独 token,按需增加(勿提交真实值): -# export CF_TUNNEL_TOKEN="" -# export CF_ACCOUNT_ID="" - -# --- ACME / Traefik(03-02、03-03):Let's Encrypt 注册邮箱 --- -# 与 HelmChartConfig / traefik-acme.yaml 中 一致;编排或 sed 替换时引用 ACME_EMAIL。 -export ACME_EMAIL="${ACME_EMAIL:-}" -# 文档中 traefik-acme 曾用 staging CA 调试;1=使用测试 CA(与 yaml 中 caserver 是否一致自行核对) -export ACME_CA_STAGING="${ACME_CA_STAGING:-0}" -export TRAEFIK_NAMESPACE="${TRAEFIK_NAMESPACE:-kube-system}" - -# --- TLS 验证域名(02-05 / 03-02 矩阵 curl、openssl s_client)--- -# 逗号分隔,与 ZONE_NAME 下实际 DNS 记录一致;勿提交敏感子域若需可只写本机 -export VERIFY_TLS_HOSTS="${VERIFY_TLS_HOSTS:-test01.jackadam.top,test02.jackadam.top,test03.jackadam.top,test04.jackadam.top}" - -# --- Longhorn(03-07 / ansible verify/stack-longhorn-install)--- -export LONGHORN_NAMESPACE="${LONGHORN_NAMESPACE:-longhorn-system}" - -# --- 可选跳过(编排占位;当前 verify.sh 未实现 HA/GitOps 门控时可忽略)--- -export SKIP_HA="${SKIP_HA:-1}" -export SKIP_GITOPS="${SKIP_GITOPS:-1}" - -# --- armv7 / arm32(01-03 Docker、01-05 NFS、05-02 部分)--- -# 默认 SKIP_ARMV7=1:verify/01-03、01-05 仅跑矩阵基线(文档/文件检查),不经 SSH 改 arm 机。 -# 设 SKIP_ARMV7=0 且 ARMV7_SSH 非空:run 01-03 时经该 SSH 在 arm 上 dnf 装 docker 并校验(假定 
Fedora/RHEL 系,见 docs/01-03)。 -# 01-05:同上,但 NFS 所在主机可用 ARMV7_NFS_SSH;未设则回退为 ARMV7_SSH;会写 /etc/exports、exportfs(见 docs/01-05)。 -# export ARMV7_NFS_EXPORT_PATH="/sdcard" -# export ARMV7_NFS_CLIENT_SUBNET="192.168.2.0/24" -# verify.sh 在 source .env.verify 后执行 playbook,子进程会继承下列变量(无需 verify.sh 单独传参)。 -export SKIP_ARMV7="${SKIP_ARMV7:-1}" -export ARMV7_SSH="${ARMV7_SSH:-}" -export ARMV7_NFS_SSH="${ARMV7_NFS_SSH:-$ARMV7_SSH}" - -# --- OpenWrt / 01-07(与 K3s 四节点无关时单独用)--- -# export OPENWRT_SSH="ssh -o BatchMode=yes root@192.168.x.x" -# export OPENWRT_HAPROXY_HTTP_PORT="18080" -# export OPENWRT_HAPROXY_HTTPS_PORT="18443" -# 01-07 文档中第三方 curl 用 --https-hosts 时的主机列表(逗号分隔,与 VERIFY_TLS_HOSTS 可相同) -# export OPENWRT_VERIFY_HTTPS_HOSTS="test01.jackadam.top,..." - -# --- 与 scripts/*.sh 对照 --- -# verify.sh → VERIFY_TEARDOWN, VERIFY_PREFLIGHT_CLUSTER, nginx_entry_base, ANSIBLE_INVENTORY;01-03/01-05 另读 SKIP_ARMV7、ARMV7_SSH、ARMV7_NFS_SSH 等(见上文 armv7 段) -# deploy-lab.sh → ANSIBLE_INVENTORY, K3S_PREPARE_STORAGE -# cloudflare-delete-acme-challenge-dns.sh → CF_API_TOKEN, ZONE_NAME, ZONE_ID -# k3s-delete-lab-stacks.sh → KUBECONFIG -# ssh/test-ssh.sh → TIMEOUT_SEC;密钥路径当前固定为 $HOME/.ssh/id_ed25519_k3s_ -# ssh/setup-k3s-workers-ssh.sh → 交互 inventory + SSH_USER;可选一次性密码勿写入本文件 diff --git a/scripts/README.md b/scripts/README.md index 4060a94..67c601d 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,16 +1,16 @@ # Scripts 总览 -本目录集中维护通用运维脚本。约定:**在仓库根目录执行**,使用 `./scripts/...` 路径调用。 +本目录仅保留人工运维脚本。Ansible 执行入口已迁移到 `ansible/bin/`。 -流程说明与「部署 / 验证」分工以 [`docs/00-03-测试与验证框架.md`](../docs/00-03-测试与验证框架.md) **§2 自动化验证流程** 为准;下表与之一一对应。 +流程说明与「部署 / 验证」分工以 [`docs/00-03-测试与验证框架.md`](../docs/00-03-测试与验证框架.md) **§2 自动化验证流程** 为准;验证前准备清单见同文档 **§10**。 | §2 步骤 | 含义 | 本仓库入口 | |--------|------|------------| -| 1 接入 | inventory、仓库同步、加载 `.env.verify` | 手工;`verify.sh` / `deploy-lab.sh` 会自动 `source scripts/.env.verify`(若存在) | +| 1 接入 | inventory、仓库同步、加载 `.env.verify` | 手工;`ansible/bin/verify.sh` / 
`ansible/bin/deploy-lab.sh` 会自动 `source ansible/env/.env.verify`(若存在) | | 2 环境/清理 | 轻量:各 verify 的 teardown;重度:清实验负载 / 重装 K3s | 轻量:`VERIFY_TEARDOWN`(默认 1);重度:`k3s-delete-lab-stacks.sh`、文档中的 `k3s-uninstall`(**勿**默认进 `run-all`) | -| 3 部署 | K3s、Longhorn、nginx 矩阵等铺栈 | **`./scripts/deploy-lab.sh`**(`k3s` / `longhorn` / `nginx-matrix` / `nginx-matrix-tls`) | -| 4~5 断言与收尾 | 按 doc 目标 kubectl/curl;本篇 teardown | **`./scripts/verify.sh`** `run` / `run-all` | -| 6 一键串联 | 按 doc_id 顺序跑全部 verify(可先 preflight) | **`./scripts/verify.sh full`**(推荐,= preflight + run-all)或 **`./scripts/verify.sh run-all`** | +| 3 部署 | K3s、Longhorn、nginx 矩阵等铺栈 | **`./ansible/bin/deploy-lab.sh`**(`k3s` / `longhorn` / `nginx-matrix` / `nginx-matrix-tls`) | +| 4~5 断言与收尾 | 按 doc 目标 kubectl/curl;本篇 teardown | **`./ansible/bin/verify.sh`** `run` / `run-all`;简写 **`./scripts/cs `**(与 `run ` 完全等价,覆盖 `verify/` 下全部执行域 doc_id) | +| 6 一键串联 | 按 doc_id 顺序跑全部 verify(可先 preflight) | **`./ansible/bin/verify.sh full`**(推荐,= preflight + run-all)或 **`./ansible/bin/verify.sh run-all`** | 真机一键验收(可选先铺栈再全量验收): @@ -18,53 +18,55 @@ 辅助命令: -- `./scripts/verify.sh flow` — 打印与 §2 对齐的流程说明(不接 Ansible)。 -- `./scripts/verify.sh preflight` — 检查 `ansible-playbook` 与 `inventory`,并对 `k3s_server` 执行 `ping`;若已装集群可设 `VERIFY_PREFLIGHT_CLUSTER=1` 再执行 `kubectl get nodes`。 -- `./scripts/verify.sh list --series 04 --exclude-noop` — 支持按主序列与 noop 过滤查看执行集合。 +- `./ansible/bin/scaffold-doc-id.sh [--title ...]` — 新建执行域最小闭环:`docs/-.md`、`ansible/files//`、`ansible/playbooks/verify/.yml`(默认含 `verify_common` noop 基线)。可加 `--dry-run` / `--force`。 +- `./ansible/bin/verify.sh flow` — 打印与 §2 对齐的流程说明(不接 Ansible)。 +- `./ansible/bin/verify.sh preflight` — 检查 `ansible-playbook` 与 `inventory`,并对 `k3s_server` 执行 `ping`;若已装集群可设 `VERIFY_PREFLIGHT_CLUSTER=1` 再执行 `kubectl get nodes`。 +- `./ansible/bin/verify.sh list --series 04 --exclude-noop` — 支持按主序列与 noop 过滤查看执行集合。 ## 验证编排环境变量(可选) -复制 [`scripts/.env.verify.example`](.env.verify.example) 为 `scripts/.env.verify` 
并填写本机值;**勿提交** `scripts/.env.verify`(已在仓库 `.gitignore` 中忽略)。 +复制 [`ansible/env/.env.verify.example`](../ansible/env/.env.verify.example) 为 `ansible/env/.env.verify` 并填写本机值;**勿提交** `ansible/env/.env.verify`(已在仓库 `.gitignore` 中忽略)。 -其中 **`ONECLOUD_SSH`** 用于**集群外**第三方 curl 等;**`SKIP_ARMV7` / `ARMV7_SSH` / `ARMV7_NFS_SSH`**(及 01-05 的 **`ARMV7_NFS_EXPORT_PATH`**、**`ARMV7_NFS_CLIENT_SUBNET`**)由 playbook 通过环境变量读取:`SKIP_ARMV7=1`(默认)时 `01-03`/`01-05` 仅基线检查;**`SKIP_ARMV7=0` 且 SSH 已配置** 时会对 arm 主机执行 dnf 路径(Fedora/RHEL 系,见 `docs/00-05` §E)。**`ACME_EMAIL`** 供 Traefik ACME(`03-02` / `03-03`);另有 **`VERIFY_TLS_HOSTS`**、`K3S_SERVER_HOSTNAME`、`TIMEOUT_SEC`、`LONGHORN_NAMESPACE` 等,完整列表见 [`.env.verify.example`](.env.verify.example) 文末注释。 +其中 **`WORKSTATION_SSH`** 用于在 **Linux 工作机**(推荐 `ylc65`,见 `docs/00-02`)等 **非 k3s 节点** 上执行集群外 curl;**`SKIP_ARMV7` / `ARMV7_SSH` / `ARMV7_NFS_SSH`**(及 01-04 的 **`ARMV7_NFS_EXPORT_PATH`**、**`ARMV7_NFS_CLIENT_SUBNET`**)由 playbook 通过环境变量读取:`SKIP_ARMV7=1`(默认)时 `01-03`/`01-04` 仅基线检查;**`SKIP_ARMV7=0` 且 SSH 已配置** 时会对 arm 主机执行 dnf 路径(Fedora/RHEL 系,见 `docs/00-03-测试与验证框架.md` §10.E)。**`ACME_EMAIL`** 供 Traefik ACME(`03-02` / `03-03`);另有 **`VERIFY_TLS_HOSTS`**、`K3S_SERVER_HOSTNAME`、`TIMEOUT_SEC`、`LONGHORN_NAMESPACE` 等,完整列表见 [`ansible/env/.env.verify.example`](../ansible/env/.env.verify.example) 文末注释。 ```bash -set -a && source scripts/.env.verify && set +a +set -a && source ansible/env/.env.verify && set +a ``` ## 部署 K3s(推荐在控制节点或 Linux 工作机) -在仓库根(或 `cd ansible` 后改用相对路径)执行 **`./scripts/deploy-lab.sh k3s`**。若需先准备数据盘,在 **本机或 `.env.verify`** 中设 `K3S_PREPARE_STORAGE=true`(会传 `-e k3s_prepare_storage=true` 跑 `ansible/playbooks/verify/01-06.yml`)。 +在仓库根(或 `cd ansible` 后改用相对路径)执行 **`./ansible/bin/deploy-lab.sh k3s`**。若需先准备数据盘,在 **本机或 `.env.verify`** 中设 `K3S_PREPARE_STORAGE=true`(会传 `-e k3s_prepare_storage=true` 跑 `ansible/playbooks/verify/01-05.yml`)。 **密钥与执行用户**:`inventory.ini` 中私钥路径随执行用户变化;在目标节点上以非 root 用户执行时,注意私钥路径与 `ansible_user` 与文档一致。 ## 验证(run-all / full) 
-**推荐一行**(在仓库根;需已安装 Ansible、[`ansible/inventory.ini`](../ansible/inventory.ini) 可达、`k3s_server` 可 ping;集群与入口变量已按 [`00-04`](../docs/00-04-部署环境说明.md) 与 `.env.verify` 配好): +**推荐一行**(在仓库根;需已安装 Ansible、[`ansible/inventory.ini`](../ansible/inventory.ini) 可达、`k3s_server` 可 ping;集群与入口变量已按 [`00-02-部署环境说明.md`](../docs/00-02-部署环境说明.md) 与 `.env.verify` 配好): ```bash -./scripts/verify.sh full +./ansible/bin/verify.sh full ``` -`full` = `preflight` + `run-all`。若 `nginx_entry_base`、`nodejs_entry_base` 等未写入 `scripts/.env.verify`,可先 `export nginx_entry_base=http://<入口IP>` 再执行。仅跑用例、跳过 preflight 时用 `./scripts/verify.sh run-all`。`list/run-all/full` 均支持筛选参数:`--series `、`--id-regex `、`--exclude-noop`、`--require-teardown`。 +`full` = `preflight` + `run-all`。若 `nginx_entry_base`、`nodejs_entry_base` 等未写入 `ansible/env/.env.verify`,可先 `export nginx_entry_base=http://<入口IP>` 再执行。仅跑用例、跳过 preflight 时用 `./ansible/bin/verify.sh run-all`。`list/run-all/full` 均支持筛选参数:`--series `、`--id-regex `、`--exclude-noop`、`--require-teardown`。 -将准备项(NFS、ACME、armv7、noop 文档等)补齐后再推进“已验证”,见 [`docs/00-04-待验证项-验证前准备.md`](../docs/00-04-待验证项-验证前准备.md)。 +将准备项(NFS、ACME、armv7、noop 文档等)补齐后再推进“已验证”,见 [`docs/00-03-测试与验证框架.md`](../docs/00-03-测试与验证框架.md) **§10 验证前准备清单**。 等价多行写法(与 `full` 相同): ```bash -./scripts/verify.sh preflight +./ansible/bin/verify.sh preflight export nginx_entry_base=http://192.168.2.61 -./scripts/verify.sh run-all +./ansible/bin/verify.sh run-all ``` ## 目录与脚本对照 - **`verify.sh`** — doc_id 验收:`flow` / `preflight` / `full` / `list` / `run` / `run-all` +- **`cs`** — 单篇验收简写:`./scripts/cs ` ≡ `verify.sh run `(任意执行域 `doc_id` 通用,非 50 个独立脚本) - **`deploy-lab.sh`** — 安装/铺栈:`k3s` / `longhorn` / `nginx-matrix` / `nginx-matrix-tls` - **`k3s-delete-lab-stacks.sh`** — 按 kubectl 实况清命名空间负载(重度清场,非默认 teardown) - **`cloudflare-delete-acme-challenge-dns.sh`** — 清理 CF 上 `_acme-challenge` DNS -- **`ssh/setup-k3s-workers-ssh.sh`** — 为 K3s 节点预配 SSH(配合 `01-06`) +- **`ssh/setup-k3s-workers-ssh.sh`** — 为 K3s 节点预配 SSH(配合 `01-05`) - 
**`ssh/test-ssh.sh`** — 验证 worker 密钥登录与 sudo ## 从仓库根执行示例 @@ -72,13 +74,15 @@ export nginx_entry_base=http://192.168.2.61 ```bash ./scripts/ssh/setup-k3s-workers-ssh.sh ./scripts/ssh/test-ssh.sh -./scripts/deploy-lab.sh k3s -./scripts/verify.sh preflight -./scripts/verify.sh run 02-05 +./scripts/offline-check.sh +./ansible/bin/deploy-lab.sh k3s +./ansible/bin/verify.sh preflight +./ansible/bin/verify.sh run 02-05 +./scripts/cs 02-05 ``` ## 说明文档 - 验证框架:[`docs/00-03-测试与验证框架.md`](../docs/00-03-测试与验证框架.md) -- 验证前准备:[`docs/00-04-待验证项-验证前准备.md`](../docs/00-04-待验证项-验证前准备.md) +- 验证前准备:见 [`docs/00-03-测试与验证框架.md`](../docs/00-03-测试与验证框架.md) **§10 验证前准备清单** - 主文档入口:`docs/00-00-构建总览.md` diff --git a/scripts/acceptance.sh b/scripts/acceptance.sh index e7fceee..183b7a5 100755 --- a/scripts/acceptance.sh +++ b/scripts/acceptance.sh @@ -14,12 +14,12 @@ set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" load_env() { - if [[ -f "${ROOT}/scripts/.env.verify" ]]; then + if [[ -f "${ROOT}/ansible/env/.env.verify" ]]; then set -a # shellcheck disable=SC1091 - source "${ROOT}/scripts/.env.verify" + source "${ROOT}/ansible/env/.env.verify" set +a - echo "[OK] 已加载 scripts/.env.verify" + echo "[OK] 已加载 ansible/env/.env.verify" fi } @@ -29,7 +29,7 @@ usage() { 说明: - 真机「一键验收」:可选先铺栈(deploy-lab),再跑矩阵全量验收(verify.sh full) - - 默认不铺栈(避免误改现网);只执行 ./scripts/verify.sh full + - 默认不铺栈(避免误改现网);只执行 ./ansible/bin/verify.sh full 常用示例: # 只验收(推荐默认) @@ -70,22 +70,22 @@ main() { if [[ "${ACCEPT_DEPLOY:-0}" == "1" ]]; then echo "########################################## deploy (optional)" if [[ "${ACCEPT_DEPLOY_K3S:-1}" == "1" ]]; then - ./scripts/deploy-lab.sh k3s + ./ansible/bin/deploy-lab.sh k3s fi if [[ "${ACCEPT_DEPLOY_LONGHORN:-0}" == "1" ]]; then - ./scripts/deploy-lab.sh longhorn + ./ansible/bin/deploy-lab.sh longhorn fi if [[ "${ACCEPT_DEPLOY_NGINX_MATRIX:-0}" == "1" ]]; then - ./scripts/deploy-lab.sh nginx-matrix + ./ansible/bin/deploy-lab.sh nginx-matrix fi if [[ 
"${ACCEPT_DEPLOY_NGINX_MATRIX_TLS:-0}" == "1" ]]; then - ./scripts/deploy-lab.sh nginx-matrix-tls + ./ansible/bin/deploy-lab.sh nginx-matrix-tls fi fi echo "" echo "########################################## verify full (matrix)" - ./scripts/verify.sh full + ./ansible/bin/verify.sh full } main "$@" diff --git a/scripts/cs b/scripts/cs new file mode 100755 index 0000000..fb20841 --- /dev/null +++ b/scripts/cs @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# 单篇验证简写:对任意执行域 doc_id 等价于 ./ansible/bin/verify.sh run +# 用法:在仓库根执行 ./scripts/cs 02-05 +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +DOC_ID="${1:-}" + +usage() { + echo "用法:./scripts/cs " >&2 + echo "说明:等价 ./ansible/bin/verify.sh run ,适用于 verify 目录内全部执行域 doc_id。" >&2 + echo "列举:./ansible/bin/verify.sh list" >&2 + echo "示例:./scripts/cs 02-05" >&2 +} + +if [[ -z "$DOC_ID" ]]; then + usage + exit 1 +fi + +if ! [[ "$DOC_ID" =~ ^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])$ ]]; then + echo "[ERR] 非执行域 doc_id:${DOC_ID}(须匹配 XX-YY,且 XX、YY 为 01–99 非零)" >&2 + echo "[TIP] 导航页 YY=00 无 verify playbook,请用具体分项如 01-05、02-05。" >&2 + usage + exit 1 +fi + +exec "${ROOT}/ansible/bin/verify.sh" run "$DOC_ID" diff --git a/scripts/deploy-lab.sh b/scripts/deploy-lab.sh deleted file mode 100644 index fe4e2ed..0000000 --- a/scripts/deploy-lab.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env bash -# 实验室「正式部署」入口(对应 docs/00-04 §2 步骤 1~3)。 -# 在仓库根执行:./scripts/deploy-lab.sh <子命令> -# -# 步骤对应关系(详见 docs/00-03-测试与验证框架.md §2): -# 1 接入 — 本机有 ansible-playbook、inventory 可达;可选加载 scripts/.env.verify -# 2 前置 — 可选 01-06-prepare-storage(磁盘 → /storage),非 k3s-uninstall 类重度清理 -# 3 部署 — 调用 ansible/playbooks/verify/ 下 playbook;默认 -e VERIFY_TEARDOWN=0(铺栈后保留资源)。 -# 需要验收后自动清理时,用 ./scripts/verify.sh run (默认 VERIFY_TEARDOWN=1)。 -set -euo pipefail - -ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" -# shellcheck disable=SC1091 -source "${ROOT}/scripts/lib-ansible-lab.sh" -ansible_lab_export_config - -load_env() { - if [[ -f "${ROOT}/scripts/.env.verify" ]]; then - set -a - # shellcheck disable=SC1091 - source "${ROOT}/scripts/.env.verify" - set +a - echo "[OK] 已加载 scripts/.env.verify" - fi -} - -usage() { - cat <<'EOF' -用法:scripts/deploy-lab.sh <子命令> - -子命令: - k3s 安装/复验 K3s(verify/01-06.yml;可选先数据盘准备) - longhorn Helm 安装 Longhorn + 健康检查(verify/03-07.yml,VERIFY_TEARDOWN=0) - nginx-matrix HTTP nginx 矩阵 + 校验(verify/02-05.yml,VERIFY_TEARDOWN=0) - nginx-matrix-tls TLS nginx 矩阵(verify/03-02.yml,需显式 -e nginx_matrix_tls_enable=true) - -环境变量(节选,完整见 scripts/.env.verify.example): - ANSIBLE_INVENTORY 默认 <仓库>/ansible/inventory.ini - K3S_PREPARE_STORAGE 为 true 时在 01-06.yml 内启用准备数据盘(传 -e k3s_do_prepare_storage=true -e k3s_prepare_storage=true) - DEPLOY_VERIFY_TEARDOWN 默认 0;若设为 1,则与 verify 子命令一并传入 Ansible(longhorn/nginx-matrix 会执行卸载类 teardown) - -说明: - 铺栈入口与验收入口共用 verify 下 playbook;区别为本脚本固定默认 VERIFY_TEARDOWN=0。验收请用 ./scripts/verify.sh full / run 。 -EOF -} - -ansible_wrap() { - local inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}" - if [[ ! -f "$inv" ]]; then - echo "[ERR] inventory 不存在:$inv" >&2 - exit 1 - fi - if ! 
command -v ansible-playbook >/dev/null 2>&1; then - echo "[ERR] 未找到 ansible-playbook,请先安装 Ansible" >&2 - exit 1 - fi - ansible_lab_check_inventory_keys "$inv" || exit 1 - local td="${DEPLOY_VERIFY_TEARDOWN:-0}" - echo "[RUN] ansible-playbook -i $inv -e VERIFY_TEARDOWN=$td $*" - ansible-playbook -i "$inv" -e "VERIFY_TEARDOWN=$td" "$@" -} - -cmd_k3s() { - if [[ "${K3S_PREPARE_STORAGE:-false}" == "true" ]]; then - ansible_wrap "${ROOT}/ansible/playbooks/verify/01-06.yml" -e 'k3s_do_prepare_storage=true' -e 'k3s_prepare_storage=true' - fi - ansible_wrap "${ROOT}/ansible/playbooks/verify/01-06.yml" -e 'k3s_do_install=true' -} - -main() { - load_env - local sub="${1:-}" - case "$sub" in - ""|-h|--help) usage ;; - k3s) cmd_k3s ;; - longhorn) ansible_wrap "${ROOT}/ansible/playbooks/verify/03-07.yml" ;; - nginx-matrix) ansible_wrap "${ROOT}/ansible/playbooks/verify/02-05.yml" ;; - nginx-matrix-tls) ansible_wrap "${ROOT}/ansible/playbooks/verify/03-02.yml" -e 'nginx_matrix_tls_enable=true' ;; - *) - echo "[ERR] 未知子命令:$sub" >&2 - usage - exit 1 - ;; - esac -} - -main "$@" diff --git a/scripts/fix-04-doc-refs.py b/scripts/fix-04-doc-refs.py deleted file mode 100644 index 14373ea..0000000 --- a/scripts/fix-04-doc-refs.py +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env python3 -"""After 04-xx renumbering: fix cross-links and per-doc manifest names across the repo.""" -from __future__ import annotations - -import re -from pathlib import Path - -ROOT = Path(__file__).resolve().parents[1] - -# Old canonical .md basename -> new basename (apply before per-file yaml fix) -MD_MAP: list[tuple[str, str]] = [ - ("04-11-nodejs-副本与滚动发布.md", "04-06-nodejs-副本与滚动发布.md"), - ("04-10-nodejs-Ingress与Traefik.md", "04-07-nodejs-Ingress与Traefik.md"), - ("04-09-nodejs-存储与卷.md", "04-11-nodejs-存储与卷.md"), - ("04-08-nodejs-安全上下文.md", "04-10-nodejs-安全上下文.md"), - ("04-07-nodejs-调度与亲和.md", "04-09-nodejs-调度与亲和.md"), - ("04-06-nodejs-探针与健康检查.md", "04-05-nodejs-探针与健康检查.md"), - ("04-05-nodejs-资源请求与限制.md", 
"04-08-nodejs-资源请求与限制.md"), - ("04-04-nodejs-端口与Service.md", "04-02-nodejs-端口与Service.md"), - ("04-03-nodejs-环境变量与配置注入.md", "04-04-nodejs-环境变量与配置注入.md"), - ("04-02-nodejs-镜像与运行命令.md", "04-03-nodejs-镜像与运行命令.md"), -] - -SKIP_DIR_NAMES = {".git", "node_modules", "logs"} -TEXT_SUFFIXES = {".md", ".yml", ".yaml", ".sh", ".txt", ".example"} - - -def iter_files(): - for p in ROOT.rglob("*"): - if not p.is_file(): - continue - if any(x in p.parts for x in SKIP_DIR_NAMES): - continue - if p.suffix.lower() not in TEXT_SUFFIXES and p.name not in ( - ".env.verify.example", - ): - continue - yield p - - -def apply_md_map(content: str) -> str: - for old, new in MD_MAP: - content = content.replace(old, new) - return content - - -def fix_doc_manifests(content: str, doc_id: str) -> str: - return re.sub( - r"04-\d{2}-nodejs-demo\.yaml", - f"04-{doc_id}-nodejs-demo.yaml", - content, - ) - - -def fix_title(content: str, title_body: str) -> str: - lines = content.splitlines() - if lines and lines[0].startswith("# "): - lines[0] = f"# {title_body}" - return "\n".join(lines) + ("\n" if content.endswith("\n") else "") - return content - - -def main() -> None: - for path in iter_files(): - raw = path.read_text(encoding="utf-8") - new = apply_md_map(raw) - if path.parent.name == "docs" and re.match(r"04-\d{2}-", path.name): - m = re.match(r"04-(\d{2})-", path.name) - if m: - doc_id = m.group(1) - new = fix_doc_manifests(new, doc_id) - base = path.name.removesuffix(".md") - new = fix_title(new, base) - path.write_text(new, encoding="utf-8") - - # verify playbooks: doc_filename must match renumbered docs - vf = ROOT / "ansible/playbooks/verify" - for yml in sorted(vf.glob("04-*.yml")): - m = re.match(r"04-(\d{2})\.yml$", yml.name) - if not m: - continue - nid = m.group(1) - text = yml.read_text(encoding="utf-8") - # find docs/04-NN-*.md in file after md_map would already be applied - dm = re.search(r'doc_filename:\s*"([^"]+)"', text) - if not dm: - continue - old_fn = dm.group(1) - if not 
old_fn.startswith(f"04-{nid}-"): - # pick any docs/04-NN-*.md with this NN - docs_dir = ROOT / "docs" - matches = list(docs_dir.glob(f"04-{nid}-*.md")) - if len(matches) == 1: - text = re.sub( - r'doc_filename:\s*"[^"]+"', - f'doc_filename: "{matches[0].name}"', - text, - count=1, - ) - yml.write_text(text, encoding="utf-8") - - -if __name__ == "__main__": - main() diff --git a/scripts/gen-nodejs-demo-yaml.py b/scripts/gen-nodejs-demo-yaml.py deleted file mode 100644 index 7d58e6f..0000000 --- a/scripts/gen-nodejs-demo-yaml.py +++ /dev/null @@ -1,478 +0,0 @@ -#!/usr/bin/env python3 -"""Generate cumulative 04-02..04-11 nodejs-demo YAML (Core→Plus→Pro doc order).""" -from pathlib import Path -from textwrap import dedent - -DIR = Path(__file__).resolve().parents[1] / "labs/nodejs/manifests" - -CM = dedent( - """\ - apiVersion: v1 # ConfigMap API 版本 - kind: ConfigMap # 配置资源:ConfigMap - metadata: # ConfigMap 元信息 - name: nodejs-demo-config # ConfigMap 名称 - namespace: default # 命名空间 - data: # 配置键值 - APP_MSG: "Hello from ConfigMap" # 注入给应用的消息内容 - """ -).strip() - -SVC_8080 = dedent( - """\ - apiVersion: v1 # Service API 版本 - kind: Service # Service 资源 - metadata: # Service 元信息 - name: nodejs-demo # Service 名称 - namespace: default # 命名空间 - spec: # Service 规格 - selector: # 选择后端 Pod - app: nodejs-demo # 选中 app=nodejs-demo - ports: # 端口映射 - - port: 80 # Service 暴露端口 - targetPort: 8080 # 转发到容器端口 - """ -).strip() - -ING_NODE = dedent( - """\ - apiVersion: networking.k8s.io/v1 # Ingress API 版本 - kind: Ingress # Ingress 资源 - metadata: # Ingress 元信息 - name: nodejs-demo # Ingress 名称 - namespace: default # 命名空间 - annotations: # Traefik 注解 - traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 - spec: # Ingress 规则 - rules: # 规则列表 - - http: # HTTP 路由 - paths: # 路径列表 - - path: /node # 匹配路径前缀 - pathType: Prefix # 前缀匹配 - backend: # 后端目标 - service: # 后端 Service - name: nodejs-demo # Service 名称 - port: # Service 端口 - number: 80 # 端口号 - """ -).strip() - -ING_HOST = dedent( 
- """\ - apiVersion: networking.k8s.io/v1 # Ingress API 版本 - kind: Ingress # Ingress 资源 - metadata: # Ingress 元信息 - name: nodejs-demo # Ingress 名称 - namespace: default # 命名空间 - annotations: # Traefik 注解 - traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 - spec: # Ingress 规则 - rules: # 规则列表 - - host: app.example.local # 主机名匹配 - http: # HTTP 路由 - paths: # 路径列表 - - path: /api # 匹配 API 路径前缀 - pathType: Prefix # 前缀匹配 - backend: # 后端目标 - service: # 后端 Service - name: nodejs-demo # Service 名称 - port: # Service 端口 - number: 80 # 端口号 - """ -).strip() - -# 与 Deployment 模板中 ` ports:` 同级(勿对整段 dedent,否则会剥掉缩进) -PROBES = ( - " livenessProbe: # 存活探针\n" - " httpGet: # HTTP 探测\n" - " path: / # 探测路径\n" - " port: 8080 # 探测端口\n" - " initialDelaySeconds: 3 # 初始延迟\n" - " periodSeconds: 10 # 探测周期\n" - " readinessProbe: # 就绪探针\n" - " httpGet: # HTTP 探测\n" - " path: / # 探测路径\n" - " port: 8080 # 探测端口\n" - " initialDelaySeconds: 2 # 初始延迟\n" - " periodSeconds: 5 # 探测周期\n" -) - -RES = ( - " resources: # 资源请求与限制\n" - " requests: # 最小资源请求\n" - " cpu: \"50m\" # 请求 CPU\n" - " memory: \"64Mi\" # 请求内存\n" - " limits: # 资源上限\n" - " cpu: \"500m\" # CPU 限制\n" - " memory: \"256Mi\" # 内存限制\n" -) - - -def main() -> None: - # 04-02: 01 + 仅改监听 8080(无 ConfigMap) - doc2 = dedent( - """\ - # 对应文档:docs/04-02-nodejs-端口与Service.md - # 累积:04-01 + 容器与 Service 改监听 8080(与后续探针一致) - apiVersion: apps/v1 # Deployment API 版本 - kind: Deployment # 工作负载:Deployment - metadata: # Deployment 元信息 - name: nodejs-demo # Deployment 名称 - namespace: default # 命名空间 - spec: # Deployment 规格 - replicas: 1 # 副本数 - selector: # Deployment 选择器 - matchLabels: # 标签匹配集合 - app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod - template: # Pod 模板 - metadata: # Pod 元信息 - labels: # Pod 标签 - app: nodejs-demo # 与 selector.matchLabels 对齐 - spec: # Pod 规格 - containers: # 容器列表 - - name: nodejs-demo # 容器名 - image: node:18-alpine # Node.js 镜像 - command: ["node", "-e", "require('http').createServer((req,res)=>res.end('Hello World from 
Node.js')).listen(8080)"] # 内联 HTTP 服务改监听 8080 - ports: # 容器端口 - - containerPort: 8080 # 应用监听端口 - --- - """ - ) + SVC_8080 + "\n---\n" + ING_NODE + "\n" - - # 04-03: + 固定镜像 tag、command/args(与旧 04-02 等价,端口 8080) - doc3 = dedent( - """\ - # 对应文档:docs/04-03-nodejs-镜像与运行命令.md - # 累积:04-02 + 固定镜像 tag、imagePullPolicy、command/args - apiVersion: apps/v1 # Deployment API 版本 - kind: Deployment # 工作负载:Deployment - metadata: # Deployment 元信息 - name: nodejs-demo # Deployment 名称 - namespace: default # 命名空间 - spec: # Deployment 规格 - replicas: 1 # 副本数 - selector: # Deployment 选择器 - matchLabels: # 标签匹配集合 - app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod - template: # Pod 模板 - metadata: # Pod 元信息 - labels: # Pod 标签 - app: nodejs-demo # 与 selector.matchLabels 对齐 - spec: # Pod 规格 - containers: # 容器列表 - - name: nodejs-demo # 容器名 - image: node:18.20-alpine # 固定 tag 的 Node.js 镜像 - imagePullPolicy: IfNotPresent # 拉取策略:本地有则不重复拉取 - command: ["node"] # 主命令 - args: # 命令参数 - - "-e" # 执行内联脚本 - - "require('http').createServer((req,res)=>res.end('Hello from pinned image')).listen(8080)" # Node.js 内联服务逻辑 - ports: # 容器端口 - - containerPort: 8080 # 应用监听端口 - --- - """ - ) + SVC_8080 + "\n---\n" + ING_NODE + "\n" - - # 04-04: + ConfigMap(等同旧 04-04 主体) - doc4 = ( - f"# 对应文档:docs/04-04-nodejs-环境变量与配置注入.md\n" - f"# 累积:04-03 + ConfigMap + 通过 env 注入 APP_MSG\n---\n{CM}\n---\n" - + dedent( - """\ - apiVersion: apps/v1 # Deployment API 版本 - kind: Deployment # 工作负载:Deployment - metadata: # Deployment 元信息 - name: nodejs-demo # Deployment 名称 - namespace: default # 命名空间 - spec: # Deployment 规格 - replicas: 1 # 副本数 - selector: # Deployment 选择器 - matchLabels: # 标签匹配集合 - app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod - template: # Pod 模板 - metadata: # Pod 元信息 - labels: # Pod 标签 - app: nodejs-demo # 与 selector.matchLabels 对齐 - spec: # Pod 规格 - containers: # 容器列表 - - name: nodejs-demo # 容器名 - image: node:18.20-alpine # Node.js 镜像 - imagePullPolicy: IfNotPresent # 拉取策略 - env: # 环境变量注入 - - name: APP_MSG # 环境变量名 - valueFrom: # 
从资源引用取值 - configMapKeyRef: # 从 ConfigMap key 读取 - name: nodejs-demo-config # ConfigMap 名称 - key: APP_MSG # ConfigMap 键名 - command: # 启动命令 - - node # 运行 node - - "-e" # 执行内联脚本 - - | # 多行 JS 脚本(内部内容不改动) - const http=require('http'); - const msg=process.env.APP_MSG||'no env'; - http.createServer((q,s)=>s.end(msg)).listen(8080); - ports: # 容器端口 - - containerPort: 8080 # 应用监听端口 - --- - """ - ) - + SVC_8080 - + "\n---\n" - + ING_NODE - + "\n" - ) - - # 04-05: + 探针(无 resources) - doc5 = ( - f"# 对应文档:docs/04-05-nodejs-探针与健康检查.md\n" - f"# 累积:04-04 + livenessProbe/readinessProbe(端口 8080,路径 /)\n---\n{CM}\n---\n" - + dedent( - """\ - apiVersion: apps/v1 # Deployment API 版本 - kind: Deployment # 工作负载:Deployment - metadata: # Deployment 元信息 - name: nodejs-demo # Deployment 名称 - namespace: default # 命名空间 - spec: # Deployment 规格 - replicas: 1 # 副本数 - selector: # Deployment 选择器 - matchLabels: # 标签匹配集合 - app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod - template: # Pod 模板 - metadata: # Pod 元信息 - labels: # Pod 标签 - app: nodejs-demo # 与 selector.matchLabels 对齐 - spec: # Pod 规格 - containers: # 容器列表 - - name: nodejs-demo # 容器名 - image: node:18.20-alpine # Node.js 镜像 - imagePullPolicy: IfNotPresent # 拉取策略 - env: # 环境变量注入 - - name: APP_MSG # 环境变量名 - valueFrom: # 从资源引用取值 - configMapKeyRef: # 从 ConfigMap key 读取 - name: nodejs-demo-config # ConfigMap 名称 - key: APP_MSG # ConfigMap 键名 - command: # 启动命令 - - node # 运行 node - - "-e" # 执行内联脚本 - - | # 多行 JS 脚本(内部内容不改动) - const http=require('http'); - const msg=process.env.APP_MSG||'no env'; - http.createServer((q,s)=>s.end(msg)).listen(8080); - ports: # 容器端口 - - containerPort: 8080 # 应用监听端口 - """ - ).rstrip() - + "\n" - + PROBES - + "\n" - + dedent( - """\ - --- - """ - ) - + SVC_8080 - + "\n---\n" - + ING_NODE - + "\n" - ) - - # 04-06: + replicas:3 + RollingUpdate,Ingress 仍为 /node - doc6 = ( - f"# 对应文档:docs/04-06-nodejs-副本与滚动发布.md\n" - f"# 累积:04-05 + replicas: 3 + RollingUpdate(maxSurge:1 maxUnavailable:0)\n---\n{CM}\n---\n" - + dedent( - """\ - 
apiVersion: apps/v1 # Deployment API 版本 - kind: Deployment # 工作负载:Deployment - metadata: # Deployment 元信息 - name: nodejs-demo # Deployment 名称 - namespace: default # 命名空间 - spec: # Deployment 规格 - replicas: 3 # 副本数(高可用) - strategy: # 更新策略 - type: RollingUpdate # 滚动更新 - rollingUpdate: # 滚动更新参数 - maxSurge: 1 # 更新时最多额外增加 1 个 Pod - maxUnavailable: 0 # 更新时不可用 Pod 数为 0 - selector: # Pod 选择器 - matchLabels: # 标签匹配集合 - app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod - template: # Pod 模板 - metadata: # Pod 元信息 - labels: # Pod 标签 - app: nodejs-demo # 与 selector.matchLabels 对齐 - spec: # Pod 规格 - containers: # 容器列表 - - name: nodejs-demo # 容器名 - image: node:18.20-alpine # Node.js 镜像 - imagePullPolicy: IfNotPresent # 拉取策略 - env: # 环境变量注入 - - name: APP_MSG # 环境变量名 - valueFrom: # 从资源引用取值 - configMapKeyRef: # 从 ConfigMap key 读取 - name: nodejs-demo-config # ConfigMap 名称 - key: APP_MSG # ConfigMap 键名 - command: # 启动命令 - - node # 运行 node - - "-e" # 执行内联脚本 - - | # 多行 JS 脚本(内部内容不改动) - const http=require('http'); - const msg=process.env.APP_MSG||'no env'; - http.createServer((q,s)=>s.end(msg)).listen(8080); - ports: # 容器端口 - - containerPort: 8080 # 应用监听端口 - """ - ).rstrip() - + "\n" - + PROBES - + "\n" - + dedent( - """\ - --- - """ - ) - + SVC_8080 - + "\n---\n" - + ING_NODE - + "\n" - ) - - # 04-07: Ingress host + /api - doc7 = doc6.replace( - "# 对应文档:docs/04-06-nodejs-副本与滚动发布.md\n" - "# 累积:04-05 + replicas: 3 + RollingUpdate(maxSurge:1 maxUnavailable:0)\n", - "# 对应文档:docs/04-07-nodejs-Ingress与Traefik.md\n" - "# 累积:04-06 + Ingress 增加 host、path 改为 /api(访问需 Host: app.example.local)\n", - ) - doc7 = doc7.replace("---\n" + ING_NODE + "\n", "---\n" + ING_HOST + "\n") - - # 04-08: + resources - c8 = ( - " ports: # 容器端口\n" - " - containerPort: 8080 # 应用监听端口\n" - ) - c8r = ( - " ports: # 容器端口\n" - " - containerPort: 8080 # 应用监听端口\n" + RES - ) - doc8 = doc7.replace( - "# 对应文档:docs/04-07-nodejs-Ingress与Traefik.md\n" - "# 累积:04-06 + Ingress 增加 host、path 改为 /api(访问需 Host: app.example.local)\n", - "# 
对应文档:docs/04-08-nodejs-资源请求与限制.md\n" - "# 累积:04-07 + resources.requests/limits\n", - ).replace(c8, c8r) - - # 04-09: + nodeSelector - doc9 = doc8.replace( - "# 对应文档:docs/04-08-nodejs-资源请求与限制.md\n" - "# 累积:04-07 + resources.requests/limits\n", - "# 对应文档:docs/04-09-nodejs-调度与亲和.md\n" - "# 累积:04-08 + nodeSelector(默认 ylc62,请改为本集群节点短主机名)\n", - ).replace( - " spec: # Pod 规格\n containers: # 容器列表\n", - " spec: # Pod 规格\n nodeSelector: # 调度到指定节点\n" - " kubernetes.io/hostname: ylc62 # 节点主机名(按实际修改)\n" - " containers: # 容器列表\n", - ) - - # 04-10: + securityContext + tmp volume - doc10 = doc9.replace( - "# 对应文档:docs/04-09-nodejs-调度与亲和.md\n" - "# 累积:04-08 + nodeSelector(默认 ylc62,请改为本集群节点短主机名)\n", - "# 对应文档:docs/04-10-nodejs-安全上下文.md\n" - "# 累积:04-09 + pod securityContext.fsGroup、容器 securityContext、只读根、/tmp emptyDir\n", - ).replace( - " spec: # Pod 规格\n nodeSelector: # 调度到指定节点\n" - " kubernetes.io/hostname: ylc62 # 节点主机名(按实际修改)\n" - " containers: # 容器列表\n", - " spec: # Pod 规格\n nodeSelector: # 调度到指定节点\n" - " kubernetes.io/hostname: ylc62 # 节点主机名(按实际修改)\n" - " securityContext: # Pod 级安全上下文\n" - " fsGroup: 1000 # 挂载卷文件组 ID\n" - " containers: # 容器列表\n", - ) - doc10 = doc10.replace( - " - name: nodejs-demo # 容器名\n image: node:18.20-alpine # Node.js 镜像\n" - " imagePullPolicy: IfNotPresent # 拉取策略\n env:", - " - name: nodejs-demo # 容器名\n image: node:18.20-alpine # Node.js 镜像\n" - " imagePullPolicy: IfNotPresent # 拉取策略\n" - " securityContext: # 容器级安全上下文\n" - " allowPrivilegeEscalation: false # 禁止提权\n" - " runAsNonRoot: true # 强制非 root 运行\n" - " runAsUser: 1000 # 运行用户 UID\n" - " readOnlyRootFilesystem: true # 根文件系统只读\n" - " env:", - ) - doc10 = doc10.replace( - " periodSeconds: 5 # 探测周期\n\n---\n", - " periodSeconds: 5 # 探测周期\n" - " volumeMounts: # 卷挂载\n" - " - name: tmp # 引用临时卷\n" - " mountPath: /tmp # 容器内临时目录\n" - " volumes: # 卷定义\n" - " - name: tmp # 临时卷名称\n" - " emptyDir: {} # 空目录卷(Pod 生命周期内)\n\n---\n", - ) - - pvc = dedent( - """\ - apiVersion: v1 # PVC API 版本 - kind: 
PersistentVolumeClaim # 持久卷声明 - metadata: # PVC 元信息 - name: nodejs-demo-data # PVC 名称 - namespace: default # 命名空间 - spec: # PVC 规格 - accessModes: # 访问模式 - - ReadWriteOnce # RWO:同一时间仅单节点挂载读写 - storageClassName: local-path # 存储类(按集群可改) - resources: # 资源请求 - requests: # 配额请求 - storage: 1Gi # 申请容量 - --- - """ - ).strip() - doc11 = doc10.replace( - "# 对应文档:docs/04-10-nodejs-安全上下文.md\n" - "# 累积:04-09 + pod securityContext.fsGroup、容器 securityContext、只读根、/tmp emptyDir\n", - "# 对应文档:docs/04-11-nodejs-存储与卷.md\n" - "# 累积:04-10 + PVC nodejs-demo-data(默认 storageClassName: local-path)+ 挂载 /data\n", - ) - doc11 = doc11.replace( - "---\n" + CM + "\n---\n", - "---\n" + pvc + "\n" + CM + "\n---\n", - 1, - ) - doc11 = doc11.replace( - " volumeMounts: # 卷挂载\n" - " - name: tmp # 引用临时卷\n" - " mountPath: /tmp # 容器内临时目录\n", - " volumeMounts: # 卷挂载\n" - " - name: tmp # 临时卷名称\n" - " mountPath: /tmp # 容器内临时目录\n" - " - name: data # 数据卷名称\n" - " mountPath: /data # 容器内数据目录\n", - ) - doc11 = doc11.replace( - " volumes: # 卷定义\n - name: tmp # 临时卷名称\n" - " emptyDir: {} # 空目录卷(Pod 生命周期内)\n", - " volumes: # 卷定义\n - name: tmp # 临时卷\n emptyDir: {} # 空目录卷\n" - " - name: data # 数据卷\n persistentVolumeClaim: # 卷来源为 PVC\n" - " claimName: nodejs-demo-data # 绑定 PVC 名称\n", - ) - - DIR.mkdir(parents=True, exist_ok=True) - (DIR / "04-02-nodejs-demo.yaml").write_text(doc2, encoding="utf-8") - (DIR / "04-03-nodejs-demo.yaml").write_text(doc3, encoding="utf-8") - (DIR / "04-04-nodejs-demo.yaml").write_text(doc4, encoding="utf-8") - (DIR / "04-05-nodejs-demo.yaml").write_text(doc5, encoding="utf-8") - (DIR / "04-06-nodejs-demo.yaml").write_text(doc6, encoding="utf-8") - (DIR / "04-07-nodejs-demo.yaml").write_text(doc7, encoding="utf-8") - (DIR / "04-08-nodejs-demo.yaml").write_text(doc8, encoding="utf-8") - (DIR / "04-09-nodejs-demo.yaml").write_text(doc9, encoding="utf-8") - (DIR / "04-10-nodejs-demo.yaml").write_text(doc10, encoding="utf-8") - (DIR / "04-11-nodejs-demo.yaml").write_text(doc11, encoding="utf-8") - 
- -if __name__ == "__main__": - main() diff --git a/scripts/k3s-delete-lab-stacks.sh b/scripts/k3s-delete-lab-stacks.sh index fd248c1..300169f 100644 --- a/scripts/k3s-delete-lab-stacks.sh +++ b/scripts/k3s-delete-lab-stacks.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # 按「集群里实际存在的资源」遍历删除(全部由 kubectl 发现,不读仓库 YAML 目录) -# 对应 docs/00-05 §2 步骤 2「重度清理」方向的实验室内清场(非 verify.sh 默认 teardown,亦非 k3s-uninstall)。 +# 对应 docs/00-03 §2 步骤 2「重度清理」方向的实验室内清场(非 verify.sh 默认 teardown,亦非 k3s-uninstall)。 # 在任意目录执行均可;建议在仓库根:./scripts/k3s-delete-lab-stacks.sh [选项] # # 默认跳过系统命名空间:kube-system、kube-public、kube-node-lease diff --git a/scripts/offline-check-whitelist.json b/scripts/offline-check-whitelist.json new file mode 100644 index 0000000..84854c1 --- /dev/null +++ b/scripts/offline-check-whitelist.json @@ -0,0 +1,213 @@ +[ + { + "file": "docs/00-00-构建总览.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/00-03-测试与验证框架.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/01-02-k3s-工作节点.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/01-05-节点初始化-ansible-实践.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/03-01-k3s-traefik-dashboard.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/03-02-k3s-traefik-acme.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/03-03-k3s-traefik-dashboard-acme.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + 
"file": "docs/03-04-k3s-cloudflare-tunnel-配置接入.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/03-05-k3s-local-path-pvc.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/03-06-k3s-使用nfs存储.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/03-07-k3s-longhorn-持久化存储.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/03-10-k3s-traefik-custom-ports.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-01-k3s-nodejs-高级部署.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-02-nodejs-端口与Service.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-03-nodejs-镜像与运行命令.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-04-nodejs-环境变量与配置注入.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-05-nodejs-探针与健康检查.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-06-nodejs-副本与滚动发布.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-07-nodejs-Ingress与Traefik.md", + "rule": "docs_no_parent_links", + "expires": 
"2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-08-nodejs-资源请求与限制.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-09-nodejs-调度与亲和.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-10-nodejs-安全上下文.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-11-nodejs-存储与卷.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-12-nodejs-TLS与证书.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-13-nodejs-HPA.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/04-14-nodejs-GitOps与CI流水线.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/05-01-k3s-部署homer首页面板.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/05-02-onenav首页面板.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/05-03-k3s-安装gitlab-含runner.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/05-04-k3s-配置gitlab-cicd.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/05-06-openlist挂载网盘与自动备份.md", + "rule": 
"docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/05-07-openclaw应用部署.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/05-08-openclaw-k3s-实验部署.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/05-09-openclaw-web-小游戏网页平台.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + }, + { + "file": "docs/06-03-k3s-自动备份与恢复-openlist-webdav.md", + "rule": "docs_no_parent_links", + "expires": "2026-04-30", + "reason": "legacy ../ links; migrate to root-relative" + } +] + diff --git a/scripts/test-all.sh b/scripts/offline-check.sh similarity index 68% rename from scripts/test-all.sh rename to scripts/offline-check.sh index b0e8d86..6d95a40 100755 --- a/scripts/test-all.sh +++ b/scripts/offline-check.sh @@ -1,12 +1,12 @@ #!/usr/bin/env bash -# 离线「全量」自检:与 CI 同源(labs 索引、verify 清单校验)+ +# 离线「全量」自检:与 CI 同源(verify 清单校验)+ # 对关键 playbook 执行 ansible-playbook --syntax-check。 -# 不连接集群、不执行 kubectl;真机验收仍用 ./scripts/verify.sh full。 +# 不连接集群、不执行 kubectl;真机验收仍用 ./ansible/bin/verify.sh full。 set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" # shellcheck disable=SC1091 -source "${ROOT}/scripts/lib-ansible-lab.sh" +source "${ROOT}/ansible/lib/lib-ansible-lab.sh" ansible_lab_export_config INV="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}" @@ -18,12 +18,16 @@ need_cmd() { fi } -echo "########################################## 1/2 verify playbook ↔ docs 文件存在性" +echo "########################################## 1/3 docs 链接门禁(R3:禁止 ../,白名单需到期)" need_cmd python3 -python3 "${ROOT}/scripts/validate_matrix_playbooks.py" +python3 "${ROOT}/ansible/tools/check_docs_no_parent_links.py" echo "" -echo "########################################## 2/2 Ansible syntax-check(verify playbook)" +echo "########################################## 2/3 verify playbook ↔ docs 文件存在性" +python3 "${ROOT}/ansible/tools/validate_matrix_playbooks.py" + +echo "" +echo "########################################## 3/3 Ansible syntax-check(verify playbook)" need_cmd ansible-playbook [[ -f "$INV" ]] || { echo "[ERR] inventory 不存在:$INV(syntax-check 仍需 -i)" >&2 @@ -63,3 +67,4 @@ done echo "" echo "[OK] 全量离线检查通过(${n} 条 playbook syntax-check)" + diff --git a/scripts/resolve_verify_playbook.py b/scripts/resolve_verify_playbook.py deleted file mode 100644 index 3ab72d1..0000000 --- a/scripts/resolve_verify_playbook.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 -"""将 doc_id 解析为 verify playbook 绝对路径(唯一真源:ansible/playbooks/verify/.yml)。 - -历史上曾解析 labs/matrix-doc-playbooks.yml(“验证矩阵”);该概念已废弃。 -""" -from __future__ import annotations - -import sys -from pathlib import Path - -ROOT = Path(__file__).resolve().parent.parent - - -def main() -> None: - if len(sys.argv) != 2: - print("用法: resolve_verify_playbook.py ", file=sys.stderr) - sys.exit(2) - doc_id = sys.argv[1].strip() - if not doc_id: - sys.exit(2) - p = ROOT / "ansible" / "playbooks" / "verify" / f"{doc_id}.yml" - if not p.is_file(): - print(f"ERR: playbook 不存在:{p}", file=sys.stderr) - sys.exit(2) - print(p.resolve()) - - -if __name__ == "__main__": - main() diff --git 
a/scripts/ssh/setup-k3s-workers-ssh.sh b/scripts/ssh/setup-k3s-workers-ssh.sh old mode 100644 new mode 100755 diff --git a/scripts/ssh/test-ssh.sh b/scripts/ssh/test-ssh.sh old mode 100644 new mode 100755 diff --git a/scripts/status-board.sh b/scripts/status-board.sh new file mode 100755 index 0000000..7953522 --- /dev/null +++ b/scripts/status-board.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +usage() { + cat <<'EOF' +用法:scripts/status-board.sh [命令] + +命令: + render 仅渲染状态板(基于本地缓存/静态信息) + update 跑全量 verify 并写缓存(真机执行) + refresh update + render(默认) + +说明: + - 状态板文件:docs/00-04-验证状态板.md + - 本地缓存:.status/verify-results.json(已在 .gitignore 忽略) +EOF +} + +cmd="${1:-refresh}" +case "$cmd" in + -h|--help|help) usage; exit 0 ;; + render) + python3 "${ROOT}/ansible/tools/status_board.py" render + ;; + update) + python3 "${ROOT}/ansible/tools/status_board.py" update --all + ;; + refresh|"") + python3 "${ROOT}/ansible/tools/status_board.py" update --all + python3 "${ROOT}/ansible/tools/status_board.py" render + ;; + *) + echo "[ERR] unknown cmd: $cmd" >&2 + usage + exit 1 + ;; +esac + +echo "[OK] status board updated: ${ROOT}/docs/00-04-验证状态板.md" + diff --git a/scripts/validate_matrix_playbooks.py b/scripts/validate_matrix_playbooks.py deleted file mode 100644 index 57c2163..0000000 --- a/scripts/validate_matrix_playbooks.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python3 -"""校验 verify playbook 清单(抛弃“验证矩阵”概念后的替代校验)。 - -规则(最小可用): -- ansible/playbooks/verify/ 目录下所有形如 XX-YY.yml 的文件,都必须存在对应 docs/XX-YY-*.md 文档 -- 仅检查“存在性 + 1:1 对齐”,不解析 Markdown 内容 - -历史上本脚本用于校验 docs/00-03-验证矩阵.md ↔ labs/matrix-doc-playbooks.yml; -该概念已废弃,但保留脚本名以减少 CI/用户习惯改动。 -""" -from __future__ import annotations - -import re -import sys -from pathlib import Path - -ROOT = Path(__file__).resolve().parent.parent -VERIFY_DIR = ROOT / "ansible" / "playbooks" / "verify" -DOCS_DIR = ROOT / "docs" - -EXEC_ID_RE = 
re.compile(r"^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])$") - - -def is_exec_domain(doc_id: str) -> bool: - return EXEC_ID_RE.fullmatch(doc_id) is not None - - -def main() -> None: - if not VERIFY_DIR.is_dir(): - print(f"ERR: 缺少目录 {VERIFY_DIR}", file=sys.stderr) - sys.exit(2) - if not DOCS_DIR.is_dir(): - print(f"ERR: 缺少目录 {DOCS_DIR}", file=sys.stderr) - sys.exit(2) - - doc_ids: list[str] = [] - invalid_verify_names: list[str] = [] - for p in VERIFY_DIR.iterdir(): - if p.is_file() and len(p.name) == len("00-00.yml") and p.name[2:3] == "-" and p.name[5:] == ".yml": - if is_exec_domain(p.stem): - doc_ids.append(p.stem) - else: - invalid_verify_names.append(p.name) - - missing_docs: list[str] = [] - missing_files_dir: list[str] = [] - weak_doc_exec_refs: list[str] = [] - for did in sorted(set(doc_ids)): - matches = sorted(DOCS_DIR.glob(f"{did}-*.md")) - if not matches: - missing_docs.append(did) - continue - doc = matches[0] - content = doc.read_text(encoding="utf-8", errors="ignore") - if f"ansible/files/{did}/" not in content and "```yaml" in content: - weak_doc_exec_refs.append(did) - expects_files_dir = (f"ansible/files/{did}/" in content) or ("```yaml" in content) - if expects_files_dir and not (ROOT / "ansible" / "files" / did).is_dir(): - missing_files_dir.append(did) - - if invalid_verify_names: - print( - f"ERR: verify 仅允许执行域命名(XX>0 且 YY>0),以下文件不合规: {sorted(invalid_verify_names)}", - file=sys.stderr, - ) - sys.exit(2) - if missing_docs: - print(f"ERR: 存在 verify/.yml 但缺少 docs/-*.md: {missing_docs}", file=sys.stderr) - sys.exit(2) - if missing_files_dir: - print(f"ERR: 缺少 ansible/files// 目录: {missing_files_dir}", file=sys.stderr) - sys.exit(2) - if weak_doc_exec_refs: - print( - f"ERR: 文档包含 YAML 代码块但未引用 ansible/files// 真源: {weak_doc_exec_refs}", - file=sys.stderr, - ) - sys.exit(2) - - print(f"[OK] 执行域 verify/doc/files 一致性通过({len(sorted(set(doc_ids)))} 条)") - - -if __name__ == "__main__": - main() diff --git a/scripts/verify.sh b/scripts/verify.sh deleted file 
mode 100755 index 22f0806..0000000 --- a/scripts/verify.sh +++ /dev/null @@ -1,271 +0,0 @@ -#!/usr/bin/env bash -# 验证入口(以 ansible/playbooks/verify/.yml 为唯一执行真源): -# - run :执行单篇验证 playbook -# - run-all:按 verify 目录中存在的 .yml 顺序执行(仅执行域:XX>0 && YY>0) -# - full:preflight + run-all -# -# 说明: -# - 本脚本不再解析任何“矩阵/状态板”文档;验证清单从 verify playbook 自动得出。 -# - 步骤 1~3(接入、环境/轻量清理、部署)由操作者或 scripts/deploy-lab.sh 完成;本脚本不执行 k3s-uninstall。 -# - 推荐在 Linux 工作机或控制节点仓库根执行。 -set -euo pipefail - -ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -# shellcheck disable=SC1091 -source "${ROOT}/scripts/lib-ansible-lab.sh" -ansible_lab_export_config - -# 默认与 §2 一致:验证后清理临时资源 -export VERIFY_TEARDOWN="${VERIFY_TEARDOWN:-1}" - -load_env() { - if [[ -f "${ROOT}/scripts/.env.verify" ]]; then - set -a - # shellcheck disable=SC1091 - source "${ROOT}/scripts/.env.verify" - set +a - echo "[OK] 已加载 scripts/.env.verify" - fi - export VERIFY_TEARDOWN="${VERIFY_TEARDOWN:-1}" -} - -DOC_ID_EXEC_RE='^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])$' - -is_exec_doc_id() { - local doc_id="$1" - [[ "$doc_id" =~ $DOC_ID_EXEC_RE ]] -} - -list_doc_ids_from_verify_dir() { - # 只列出执行域(XX>0 && YY>0)的 verify 清单;支持筛选 - local series="${1:-}" - local id_regex="${2:-}" - local exclude_noop="${3:-0}" - local require_teardown="${4:-0}" - ROOT="${ROOT}" SERIES="${series}" ID_REGEX="${id_regex}" EXCLUDE_NOOP="${exclude_noop}" REQUIRE_TEARDOWN="${require_teardown}" python3 - <<'PY' -import os -import re -from pathlib import Path - -root = Path(os.environ["ROOT"]) -verify_dir = root / "ansible" / "playbooks" / "verify" -series = os.environ.get("SERIES", "").strip() -id_regex = os.environ.get("ID_REGEX", "").strip() -exclude_noop = os.environ.get("EXCLUDE_NOOP", "0") == "1" -require_teardown = os.environ.get("REQUIRE_TEARDOWN", "0") == "1" - -pat = re.compile(r"^(?P(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9]))\.yml$") -id_pat = re.compile(id_regex) if id_regex else None - -ids = [] -for p in verify_dir.iterdir(): - m = pat.match(p.name) - if not 
m: - continue - doc_id = m.group("id") - if series and not doc_id.startswith(f"{series}-"): - continue - if id_pat and not id_pat.search(doc_id): - continue - - if exclude_noop or require_teardown: - content = p.read_text(encoding="utf-8", errors="ignore") - if exclude_noop and "noop verify" in content: - continue - if require_teardown and ("VERIFY_TEARDOWN" not in content and "verify_teardown" not in content): - continue - - ids.append(doc_id) - -for x in sorted(set(ids)): - print(x) -PY -} - -print_flow() { - cat < / run-all → ansible/playbooks/verify/.yml - 5 收尾与记录 VERIFY_TEARDOWN;验证结论建议写回对应实验篇文档(或单独记录日志) - 6 一键串联 $0 full(推荐)或 $0 run-all - -相关脚本:deploy-lab.sh(安装/铺栈) -EOF -} - -run_preflight() { - local inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}" - if ! command -v ansible-playbook >/dev/null 2>&1; then - echo "[ERR] 未找到 ansible-playbook" >&2 - exit 1 - fi - [[ -f "$inv" ]] || { echo "[ERR] inventory 不存在:$inv" >&2; exit 1; } - ansible_lab_check_inventory_keys "$inv" || exit 1 - - echo "[RUN] ansible k3s_server -m ping" - ansible k3s_server -i "$inv" -m ping - - if [[ "${VERIFY_PREFLIGHT_CLUSTER:-0}" == "1" ]]; then - echo "[RUN] kubectl get nodes(控制节点,需已安装 K3s)" - ansible k3s_server -i "$inv" -b -m ansible.builtin.shell -a \ - 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get nodes' \ - || { - echo "[WARN] 集群侧检查失败:若尚未 deploy k3s,可忽略;装好后设 VERIFY_PREFLIGHT_CLUSTER=1 再测" >&2 - exit 1 - } - else - echo "[TIP] 跳过 kubectl 检查。已装 K3s 时可执行:VERIFY_PREFLIGHT_CLUSTER=1 $0 preflight" - fi - echo "[OK] preflight 通过" -} - -run_all_verify() { - local series="${1:-}" - local id_regex="${2:-}" - local exclude_noop="${3:-0}" - local require_teardown="${4:-0}" - local id - while IFS= read -r id; do - echo "" - echo "########################################## $id" - ansible_verify "$id" - done < <(list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown") -} - -usage() { - cat <<'EOF' -用法:scripts/verify.sh <命令> [...] 
- -命令: - flow 打印与 docs/00-05 §2 对齐的「验证流程」说明(不接 Ansible) - preflight 检查 ansible-playbook 与 inventory;对 k3s_server 做 ping - 若 VERIFY_PREFLIGHT_CLUSTER=1,额外 kubectl get nodes(未装集群会失败) - full 先 preflight,再按 doc_id 顺序运行全部 verify(= preflight + run-all,推荐) - list [筛选参数] 列出可执行 doc_id(仅执行域) - run 运行指定 doc_id(ansible/playbooks/verify/.yml) - run-all [筛选参数] 按 doc_id 顺序运行 verify playbook(fail-fast),不做 preflight - -筛选参数(可用于 list / run-all / full): - --series 只运行某个主序列(例如 04) - --id-regex 仅保留匹配 doc_id 的条目(例如 '^04-(0[2-9]|1[0-4])$') - --exclude-noop 排除 noop verify - --require-teardown 仅保留包含 teardown gate 的条目 - -环境变量: - VERIFY_TEARDOWN=1 验证后清理本篇资源(默认 1,对应 §2 轻量 teardown) - VERIFY_PREFLIGHT_CLUSTER 为 1 时 preflight 额外执行 kubectl get nodes - ANSIBLE_INVENTORY 默认 <仓库>/ansible/inventory.ini(其中 ansible_ssh_private_key_file 须在本机存在) - nginx_entry_base 例如 http://192.168.2.61(02-xx / 03-02 等 HTTP 校验) - nodejs_entry_base 例如 http://192.168.2.61(04-01) - SKIP_ARMV7 默认 1;为 0 时 01-03/01-05 若未配 ARMV7_SSH(01-05 可用 ARMV7_NFS_SSH)会失败 - ARMV7_SSH / ARMV7_NFS_SSH 一行 ssh 命令;与 SKIP_ARMV7=0 配合时 01-03/01-05 经 SSH 在 arm 上 dnf 安装(见 docs/00-07 §E) - -与「部署」分工:安装 K3s / Longhorn / nginx 铺栈请用 ./scripts/deploy-lab.sh;验收请用本脚本。 - -示例: - ./scripts/verify.sh flow - ./scripts/verify.sh full - ./scripts/verify.sh preflight - export nginx_entry_base=http://192.168.2.61 - ./scripts/verify.sh run 02-05 -EOF -} - -ansible_verify() { - local doc_id="$1" - if ! is_exec_doc_id "$doc_id"; then - echo "[ERR] 非执行域 doc_id:$doc_id(仅允许 XX>0 且 YY>0)" >&2 - exit 1 - fi - local inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}" - local pb_single="${ROOT}/ansible/playbooks/verify/${doc_id}.yml" - if [[ ! -f "$pb_single" ]]; then - echo "[ERR] verify playbook 不存在:$pb_single" >&2 - echo "[TIP] 可用 '$0 list' 查看可执行 doc_id" >&2 - exit 1 - fi - if [[ ! 
-f "$inv" ]]; then - echo "[ERR] inventory 不存在:$inv" >&2 - exit 1 - fi - local td="${VERIFY_TEARDOWN:-1}" - echo "[RUN] ansible-playbook -i $inv -e VERIFY_TEARDOWN=$td $pb_single" - ansible-playbook -i "$inv" -e "VERIFY_TEARDOWN=$td" "$pb_single" -} - -main() { - load_env - local cmd="${1:-}" - shift || true - - local series="" - local id_regex="" - local exclude_noop=0 - local require_teardown=0 - - parse_filter_args() { - while [[ $# -gt 0 ]]; do - case "$1" in - --series) - series="${2:-}" - [[ -n "$series" ]] || { echo "[ERR] --series 需要参数" >&2; exit 1; } - [[ "$series" =~ ^(0[1-9]|[1-9][0-9])$ ]] || { echo "[ERR] --series 仅允许 01..99" >&2; exit 1; } - shift 2 - ;; - --id-regex) - id_regex="${2:-}" - [[ -n "$id_regex" ]] || { echo "[ERR] --id-regex 需要参数" >&2; exit 1; } - shift 2 - ;; - --exclude-noop) - exclude_noop=1 - shift - ;; - --require-teardown) - require_teardown=1 - shift - ;; - *) - echo "[ERR] 未知参数:$1" >&2 - exit 1 - ;; - esac - done - } - - case "$cmd" in - ""|-h|--help) usage ;; - flow) print_flow ;; - preflight) run_preflight ;; - full) - parse_filter_args "$@" - run_preflight - echo "" - echo "########################################## run-all" - run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown" - ;; - list) - parse_filter_args "$@" - list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown" - ;; - run) - local doc_id="${1:?need doc_id like 02-05}" - ansible_verify "$doc_id" - ;; - run-all) - parse_filter_args "$@" - run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown" - ;; - *) - echo "[ERR] unknown cmd: $cmd" >&2 - usage - exit 1 - ;; - esac -} - -main "$@"