From 31709425e27e8a6f23315b084bc28d46ef6be126 Mon Sep 17 00:00:00 2001 From: Jack Date: Fri, 27 Mar 2026 16:58:41 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AF=B9=E9=BD=90=E6=96=87=E4=BB=B6=E8=A7=84?= =?UTF-8?q?=E8=8C=83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 + README.md | 74 +-- ansible/files/00-01-k3s-基础概念/README.md | 9 - ansible/files/00-04-部署环境说明/README.md | 9 - .../01-01-k3s-控制节点含traefik/README.md | 13 - ansible/files/01-02-k3s-工作节点/README.md | 13 - .../01-03-armv7-standalone-docker/README.md | 9 - ansible/files/01-04-双控制节点ha/README.md | 9 - .../files/01-05-armv7-nfs服务安装/README.md | 9 - .../01-06-节点初始化-ansible-实践/README.md | 13 - ansible/files/01-07-haproxy/README.md | 38 -- .../{01-07-haproxy => 01-07}/haproxy-http.cfg | 0 .../haproxy-https.cfg | 0 .../haproxy-no-check.cfg | 0 .../haproxy-proxy-http-tls.cfg | 0 .../{01-07-haproxy => 01-07}/haproxy-tls.cfg | 0 ansible/files/02-00-nginx-系列说明/README.md | 12 - .../02-01-nginx-control-ingress/README.md | 15 - .../README.md | 15 - .../02-03-nginx-worker-ingress/README.md | 15 - .../02-04-nginx-worker-ingressroute/README.md | 15 - ansible/files/02-05-nginx-matrix/README.md | 13 - .../01-control-ingress.yaml | 0 .../02-control-ingressroute.yaml | 0 .../03-worker-ingress.yaml | 0 .../04-worker-ingressroute.yaml | 0 .../traefik-dashboard.yaml | 0 .../01-control-ingress.yaml | 0 .../02-control-ingressroute.yaml | 0 .../03-worker-ingress.yaml | 0 .../04-worker-ingressroute.yaml | 0 .../traefik-acme.yaml | 0 .../tomcat-acme.yaml | 0 .../traefik-dashboard-acme.yaml | 0 .../cloudflared.yaml | 0 .../local-path-config-lab.json | 0 .../local-path-pvc-demo.yaml | 0 ansible/files/03-05/nginx-hostpath-demo.yaml | 43 ++ ansible/files/03-06/nfs-direct-demo.yaml | 26 + ansible/files/03-06/nfs-dynamic-pvc-demo.yaml | 12 + .../nfs-pv-pvc-demo.yaml | 0 .../{03-07-longhorn => 03-07}/values-lab.yaml | 0 .../03-08-k3s-ha-集群配置与切换/README.md | 9 - 
.../03-09-k3s-gitops-集群配置管理/README.md | 9 - .../traefik-custom-ports.yaml | 0 ansible/files/04-01-nodejs-demo/README.md | 43 -- .../04-01-nodejs-demo.yaml | 0 .../04-02-nodejs-demo.yaml | 0 .../04-03-nodejs-demo.yaml | 0 .../04-04-nodejs-demo.yaml | 0 .../04-05-nodejs-demo.yaml | 0 .../04-06-nodejs-demo.yaml | 0 .../04-07-nodejs-demo.yaml | 0 .../04-08-nodejs-demo.yaml | 0 .../04-09-nodejs-demo.yaml | 0 .../04-10-nodejs-demo.yaml | 0 .../04-11-nodejs-demo.yaml | 0 .../04-12-nodejs-demo.yaml | 0 .../04-13-nodejs-demo.yaml | 0 .../nodejs-demo-secret.example.yaml | 0 .../04-02-nodejs-镜像与运行命令/README.md | 13 - .../04-03-nodejs-环境变量与配置注入/README.md | 13 - .../04-04-nodejs-端口与Service/README.md | 13 - .../04-05-nodejs-资源请求与限制/README.md | 13 - .../04-06-nodejs-探针与健康检查/README.md | 13 - .../files/04-07-nodejs-调度与亲和/README.md | 13 - .../files/04-08-nodejs-安全上下文/README.md | 13 - ansible/files/04-09-nodejs-存储与卷/README.md | 13 - .../04-10-nodejs-Ingress与Traefik/README.md | 13 - .../04-11-nodejs-副本与滚动发布/README.md | 13 - .../files/04-12-nodejs-TLS与证书/README.md | 13 - ansible/files/04-13-nodejs-HPA/README.md | 13 - .../04-14-nodejs-GitOps与CI流水线/README.md | 9 - .../05-01/glances-docker-compose.example.yaml | 10 + .../05-01/homer-glances-item.example.yaml | 6 + .../files/{05-01-homer => 05-01}/homer.yaml | 0 .../{05-02-onenav => 05-02}/onenav-proxy.yaml | 0 .../gitlab-ci-runner-tags.example.yml | 0 ansible/files/05-04-gitlab-cicd/README.md | 10 - .../gitlab-ci-minimal.example.yml | 0 .../gitlab-ci-multi-arch-deploy.example.yml | 0 .../files/05-05-prometheus与grafana/README.md | 9 - .../openlist-backup-cronjob.yaml | 0 .../openclaw-proxy.yaml | 0 .../openclaw-server.yml | 0 .../openclaw-k3s-experimental.yaml | 0 .../openclaw-web.yml | 0 .../README.md | 9 - ansible/files/06-02-运维小结/README.md | 9 - .../README.md | 12 - .../app-data-backup-cronjob.yaml | 0 .../app-data-restore-job.yaml | 0 ansible/group_vars/all.yml | 6 +- .../playbooks/apply-local-path-config-lab.yml | 38 -- 
ansible/playbooks/k3s-init-and-install.yml | 270 ---------- ansible/playbooks/k3s-prepare-storage.yml | 108 ---- ansible/playbooks/longhorn-install.yml | 252 --------- ansible/playbooks/nginx-matrix-deploy.yml | 168 ------ ansible/playbooks/nginx-matrix-tls-deploy.yml | 189 ------- ansible/playbooks/nodejs-demo-apply.yml | 48 -- ansible/playbooks/verify/00-01.yml | 10 - ansible/playbooks/verify/00-04.yml | 10 - ansible/playbooks/verify/01-03.yml | 122 ++++- ansible/playbooks/verify/01-04.yml | 10 - ansible/playbooks/verify/01-05.yml | 139 ++++- ansible/playbooks/verify/01-06.yml | 135 +++++ ansible/playbooks/verify/01-07.yml | 4 +- ansible/playbooks/verify/01-08.yml | 26 + ansible/playbooks/verify/02-00.yml | 10 - ansible/playbooks/verify/02-01.yml | 2 +- ansible/playbooks/verify/02-02.yml | 2 +- ansible/playbooks/verify/02-03.yml | 2 +- ansible/playbooks/verify/02-04.yml | 2 +- ansible/playbooks/verify/02-05.yml | 206 ++++++-- ansible/playbooks/verify/03-01.yml | 26 +- ansible/playbooks/verify/03-02.yml | 216 ++++++-- ansible/playbooks/verify/03-03.yml | 4 +- ansible/playbooks/verify/03-04.yml | 4 +- ansible/playbooks/verify/03-05.yml | 81 ++- ansible/playbooks/verify/03-06.yml | 91 ++-- ansible/playbooks/verify/03-07.yml | 246 ++++++++- ansible/playbooks/verify/03-08.yml | 4 +- ansible/playbooks/verify/03-09.yml | 4 +- ansible/playbooks/verify/03-10.yml | 4 +- ansible/playbooks/verify/04-01.yml | 67 ++- ansible/playbooks/verify/04-02.yml | 20 +- ansible/playbooks/verify/04-03.yml | 95 +++- ansible/playbooks/verify/04-04.yml | 95 +++- ansible/playbooks/verify/04-05.yml | 95 +++- ansible/playbooks/verify/04-06.yml | 95 +++- ansible/playbooks/verify/04-07.yml | 95 +++- ansible/playbooks/verify/04-08.yml | 95 +++- ansible/playbooks/verify/04-09.yml | 95 +++- ansible/playbooks/verify/04-10.yml | 95 +++- ansible/playbooks/verify/04-11.yml | 95 +++- ansible/playbooks/verify/04-12.yml | 105 +++- ansible/playbooks/verify/04-13.yml | 103 +++- 
ansible/playbooks/verify/04-14.yml | 16 +- ansible/playbooks/verify/05-01.yml | 4 +- ansible/playbooks/verify/05-02.yml | 4 +- ansible/playbooks/verify/05-03.yml | 4 +- ansible/playbooks/verify/05-04.yml | 4 +- ansible/playbooks/verify/05-05.yml | 4 +- ansible/playbooks/verify/05-06.yml | 4 +- ansible/playbooks/verify/05-07.yml | 4 +- ansible/playbooks/verify/05-08.yml | 4 +- ansible/playbooks/verify/05-09.yml | 4 +- ansible/playbooks/verify/06-01.yml | 4 +- ansible/playbooks/verify/06-02.yml | 4 +- ansible/playbooks/verify/06-03.yml | 4 +- ansible/playbooks/verify/07-01.yml | 10 + ansible/playbooks/verify/07-02.yml | 10 + ansible/playbooks/verify/_noop-tasks.yml | 32 -- .../tasks/nodejs-demo-deploy-verify.yml | 77 +++ .../verify/tasks/noop-doc-verify.yml | 87 ++++ bmad.list.md | 99 ++++ docs/00-00-构建总览.md | 166 ++++-- docs/00-01-k3s-基础概念.md | 18 +- ...-部署环境说明.md => 00-02-部署环境说明.md} | 70 ++- docs/00-02-验证矩阵.md | 222 -------- docs/00-03-未来规划与待补功能.md | 111 ---- ...试与验证框架.md => 00-03-测试与验证框架.md} | 85 +++- docs/00-04-待验证项-验证前准备.md | 124 +++++ docs/01-00-安装与基础环境-系列说明.md | 36 ++ docs/01-01-k3s-控制节点含traefik.md | 25 +- docs/01-02-k3s-工作节点.md | 15 +- docs/01-03-armv7-standalone-docker.md | 14 + docs/01-05-armv7-nfs服务安装.md | 13 + docs/01-06-节点初始化-ansible-实践.md | 46 +- docs/01-07-openwrt-haproxy.md | 28 +- ...-双控制节点ha.md => 01-08-双控制节点ha.md} | 17 +- docs/02-00-nginx-系列说明.md | 17 +- docs/02-01-nginx-control-ingress.md | 15 +- docs/02-02-nginx-control-ingressroute.md | 20 +- docs/02-03-nginx-worker-ingress.md | 20 +- docs/02-04-nginx-worker-ingressroute.md | 20 +- docs/02-05-nginx-验证矩阵-一键部署.md | 34 +- docs/03-00-集群侧配置扩展-系列说明.md | 34 ++ docs/03-01-k3s-traefik-dashboard.md | 20 +- docs/03-02-k3s-traefik-acme.md | 47 +- docs/03-03-k3s-traefik-dashboard-acme.md | 19 +- docs/03-04-k3s-cloudflare-tunnel-配置接入.md | 20 +- docs/03-05-k3s-local-path-pvc.md | 28 +- docs/03-06-k3s-使用nfs存储.md | 27 +- docs/03-07-k3s-longhorn-持久化存储.md | 28 +- docs/03-08-k3s-ha-集群配置与切换.md | 20 +- 
docs/03-09-k3s-gitops-集群配置管理.md | 13 + docs/03-10-k3s-traefik-custom-ports.md | 21 +- docs/04-00-nodejs-系列说明.md | 39 ++ docs/04-01-k3s-nodejs-高级部署.md | 48 +- ...Service.md => 04-02-nodejs-端口与Service.md} | 30 +- ...行命令.md => 04-03-nodejs-镜像与运行命令.md} | 36 +- ...置注入.md => 04-04-nodejs-环境变量与配置注入.md} | 38 +- ...康检查.md => 04-05-nodejs-探针与健康检查.md} | 32 +- ...动发布.md => 04-06-nodejs-副本与滚动发布.md} | 26 +- ...fik.md => 04-07-nodejs-Ingress与Traefik.md} | 30 +- ...与限制.md => 04-08-nodejs-资源请求与限制.md} | 28 +- ...度与亲和.md => 04-09-nodejs-调度与亲和.md} | 26 +- ...全上下文.md => 04-10-nodejs-安全上下文.md} | 32 +- ...s-存储与卷.md => 04-11-nodejs-存储与卷.md} | 32 +- docs/04-12-nodejs-TLS与证书.md | 20 +- docs/04-13-nodejs-HPA.md | 26 +- docs/04-14-nodejs-GitOps与CI流水线.md | 18 +- docs/05-00-常用应用部署-系列说明.md | 28 + docs/05-01-k3s-部署homer首页面板.md | 41 +- docs/05-02-onenav首页面板.md | 17 +- docs/05-03-k3s-安装gitlab-含runner.md | 15 +- docs/05-04-k3s-配置gitlab-cicd.md | 18 +- docs/05-05-prometheus与grafana.md | 14 + docs/05-06-openlist挂载网盘与自动备份.md | 18 +- docs/05-07-openclaw应用部署.md | 16 +- docs/05-08-openclaw-k3s-实验部署.md | 19 +- docs/05-09-openclaw-web-小游戏网页平台.md | 21 +- docs/06-00-排障与运维-系列说明.md | 22 + docs/06-01-k3s-networkpolicy-故障排查.md | 16 +- docs/06-02-运维小结.md | 15 +- ...6-03-k3s-自动备份与恢复-openlist-webdav.md | 20 +- docs/07-00-网络与CNI实验-系列说明.md | 22 + docs/07-01-k3s-calico-dualstack.md | 13 + docs/07-02-k3s-cilium-dualstack-ebpf.md | 13 + project-context.md | 174 +++++++ scripts/.env.verify.example | 37 +- scripts/README.md | 49 +- scripts/acceptance.sh | 92 ++++ scripts/deploy-lab.sh | 43 +- scripts/fix-04-doc-refs.py | 104 ++++ scripts/gen-nodejs-demo-yaml.py | 478 ++++++++++++++++++ scripts/lib-ansible-lab.sh | 37 ++ scripts/resolve_verify_playbook.py | 29 ++ .../ssh/run-phase2-k3s-on-ylc61-as-jack.sh | 25 - scripts/ssh/setup-k3s-workers-ssh.sh | 18 +- scripts/ssh/smoke-verify-matrix-on-ylc61.sh | 35 -- scripts/test-all.sh | 65 +++ scripts/validate_matrix_playbooks.py | 84 +++ scripts/verify.sh | 208 ++++++-- 235 files 
changed, 5433 insertions(+), 2850 deletions(-) delete mode 100644 ansible/files/00-01-k3s-基础概念/README.md delete mode 100644 ansible/files/00-04-部署环境说明/README.md delete mode 100644 ansible/files/01-01-k3s-控制节点含traefik/README.md delete mode 100644 ansible/files/01-02-k3s-工作节点/README.md delete mode 100644 ansible/files/01-03-armv7-standalone-docker/README.md delete mode 100644 ansible/files/01-04-双控制节点ha/README.md delete mode 100644 ansible/files/01-05-armv7-nfs服务安装/README.md delete mode 100644 ansible/files/01-06-节点初始化-ansible-实践/README.md delete mode 100644 ansible/files/01-07-haproxy/README.md rename ansible/files/{01-07-haproxy => 01-07}/haproxy-http.cfg (100%) rename ansible/files/{01-07-haproxy => 01-07}/haproxy-https.cfg (100%) rename ansible/files/{01-07-haproxy => 01-07}/haproxy-no-check.cfg (100%) rename ansible/files/{01-07-haproxy => 01-07}/haproxy-proxy-http-tls.cfg (100%) rename ansible/files/{01-07-haproxy => 01-07}/haproxy-tls.cfg (100%) delete mode 100644 ansible/files/02-00-nginx-系列说明/README.md delete mode 100644 ansible/files/02-01-nginx-control-ingress/README.md delete mode 100644 ansible/files/02-02-nginx-control-ingressroute/README.md delete mode 100644 ansible/files/02-03-nginx-worker-ingress/README.md delete mode 100644 ansible/files/02-04-nginx-worker-ingressroute/README.md delete mode 100644 ansible/files/02-05-nginx-matrix/README.md rename ansible/files/{02-05-nginx-matrix => 02-05}/01-control-ingress.yaml (100%) rename ansible/files/{02-05-nginx-matrix => 02-05}/02-control-ingressroute.yaml (100%) rename ansible/files/{02-05-nginx-matrix => 02-05}/03-worker-ingress.yaml (100%) rename ansible/files/{02-05-nginx-matrix => 02-05}/04-worker-ingressroute.yaml (100%) rename ansible/files/{03-01-traefik-dashboard => 03-01}/traefik-dashboard.yaml (100%) rename ansible/files/{03-02-nginx-matrix-tls => 03-02}/01-control-ingress.yaml (100%) rename ansible/files/{03-02-nginx-matrix-tls => 03-02}/02-control-ingressroute.yaml (100%) rename 
ansible/files/{03-02-nginx-matrix-tls => 03-02}/03-worker-ingress.yaml (100%) rename ansible/files/{03-02-nginx-matrix-tls => 03-02}/04-worker-ingressroute.yaml (100%) rename ansible/files/{03-02-traefik-acme => 03-02}/traefik-acme.yaml (100%) rename ansible/files/{03-03-traefik-dashboard-acme => 03-03}/tomcat-acme.yaml (100%) rename ansible/files/{03-03-traefik-dashboard-acme => 03-03}/traefik-dashboard-acme.yaml (100%) rename ansible/files/{03-04-cloudflare-tunnel => 03-04}/cloudflared.yaml (100%) rename ansible/files/{03-05-local-path-config => 03-05}/local-path-config-lab.json (100%) rename ansible/files/{03-05-local-path-demo => 03-05}/local-path-pvc-demo.yaml (100%) create mode 100644 ansible/files/03-05/nginx-hostpath-demo.yaml create mode 100644 ansible/files/03-06/nfs-direct-demo.yaml create mode 100644 ansible/files/03-06/nfs-dynamic-pvc-demo.yaml rename ansible/files/{03-06-nfs-demo => 03-06}/nfs-pv-pvc-demo.yaml (100%) rename ansible/files/{03-07-longhorn => 03-07}/values-lab.yaml (100%) delete mode 100644 ansible/files/03-08-k3s-ha-集群配置与切换/README.md delete mode 100644 ansible/files/03-09-k3s-gitops-集群配置管理/README.md rename ansible/files/{03-10-traefik-custom-ports => 03-10}/traefik-custom-ports.yaml (100%) delete mode 100644 ansible/files/04-01-nodejs-demo/README.md rename ansible/files/{04-01-nodejs-demo => 04-01}/04-01-nodejs-demo.yaml (100%) rename ansible/files/{04-01-nodejs-demo => 04-01}/04-02-nodejs-demo.yaml (100%) rename ansible/files/{04-01-nodejs-demo => 04-01}/04-03-nodejs-demo.yaml (100%) rename ansible/files/{04-01-nodejs-demo => 04-01}/04-04-nodejs-demo.yaml (100%) rename ansible/files/{04-01-nodejs-demo => 04-01}/04-05-nodejs-demo.yaml (100%) rename ansible/files/{04-01-nodejs-demo => 04-01}/04-06-nodejs-demo.yaml (100%) rename ansible/files/{04-01-nodejs-demo => 04-01}/04-07-nodejs-demo.yaml (100%) rename ansible/files/{04-01-nodejs-demo => 04-01}/04-08-nodejs-demo.yaml (100%) rename ansible/files/{04-01-nodejs-demo => 
04-01}/04-09-nodejs-demo.yaml (100%) rename ansible/files/{04-01-nodejs-demo => 04-01}/04-10-nodejs-demo.yaml (100%) rename ansible/files/{04-01-nodejs-demo => 04-01}/04-11-nodejs-demo.yaml (100%) rename ansible/files/{04-01-nodejs-demo => 04-01}/04-12-nodejs-demo.yaml (100%) rename ansible/files/{04-01-nodejs-demo => 04-01}/04-13-nodejs-demo.yaml (100%) rename ansible/files/{04-01-nodejs-demo => 04-01}/nodejs-demo-secret.example.yaml (100%) delete mode 100644 ansible/files/04-02-nodejs-镜像与运行命令/README.md delete mode 100644 ansible/files/04-03-nodejs-环境变量与配置注入/README.md delete mode 100644 ansible/files/04-04-nodejs-端口与Service/README.md delete mode 100644 ansible/files/04-05-nodejs-资源请求与限制/README.md delete mode 100644 ansible/files/04-06-nodejs-探针与健康检查/README.md delete mode 100644 ansible/files/04-07-nodejs-调度与亲和/README.md delete mode 100644 ansible/files/04-08-nodejs-安全上下文/README.md delete mode 100644 ansible/files/04-09-nodejs-存储与卷/README.md delete mode 100644 ansible/files/04-10-nodejs-Ingress与Traefik/README.md delete mode 100644 ansible/files/04-11-nodejs-副本与滚动发布/README.md delete mode 100644 ansible/files/04-12-nodejs-TLS与证书/README.md delete mode 100644 ansible/files/04-13-nodejs-HPA/README.md delete mode 100644 ansible/files/04-14-nodejs-GitOps与CI流水线/README.md create mode 100644 ansible/files/05-01/glances-docker-compose.example.yaml create mode 100644 ansible/files/05-01/homer-glances-item.example.yaml rename ansible/files/{05-01-homer => 05-01}/homer.yaml (100%) rename ansible/files/{05-02-onenav => 05-02}/onenav-proxy.yaml (100%) rename ansible/files/{05-03-gitlab-runner => 05-03}/gitlab-ci-runner-tags.example.yml (100%) delete mode 100644 ansible/files/05-04-gitlab-cicd/README.md rename ansible/files/{05-04-gitlab-cicd => 05-04}/gitlab-ci-minimal.example.yml (100%) rename ansible/files/{05-04-gitlab-cicd => 05-04}/gitlab-ci-multi-arch-deploy.example.yml (100%) delete mode 100644 ansible/files/05-05-prometheus与grafana/README.md rename 
ansible/files/{05-06-openlist => 05-06}/openlist-backup-cronjob.yaml (100%) rename ansible/files/{05-07-openclaw => 05-07}/openclaw-proxy.yaml (100%) rename ansible/files/{05-07-openclaw => 05-07}/openclaw-server.yml (100%) rename ansible/files/{05-08-openclaw => 05-08}/openclaw-k3s-experimental.yaml (100%) rename ansible/files/{05-09-openclaw-web-小游戏网页平台 => 05-09}/openclaw-web.yml (100%) delete mode 100644 ansible/files/06-01-k3s-networkpolicy-故障排查/README.md delete mode 100644 ansible/files/06-02-运维小结/README.md delete mode 100644 ansible/files/06-03-k3s-自动备份与恢复-openlist-webdav/README.md rename ansible/files/{06-03-openlist-webdav => 06-03}/app-data-backup-cronjob.yaml (100%) rename ansible/files/{06-03-openlist-webdav => 06-03}/app-data-restore-job.yaml (100%) delete mode 100644 ansible/playbooks/apply-local-path-config-lab.yml delete mode 100644 ansible/playbooks/k3s-init-and-install.yml delete mode 100644 ansible/playbooks/k3s-prepare-storage.yml delete mode 100644 ansible/playbooks/longhorn-install.yml delete mode 100644 ansible/playbooks/nginx-matrix-deploy.yml delete mode 100644 ansible/playbooks/nginx-matrix-tls-deploy.yml delete mode 100644 ansible/playbooks/nodejs-demo-apply.yml delete mode 100644 ansible/playbooks/verify/00-01.yml delete mode 100644 ansible/playbooks/verify/00-04.yml delete mode 100644 ansible/playbooks/verify/01-04.yml create mode 100644 ansible/playbooks/verify/01-08.yml delete mode 100644 ansible/playbooks/verify/02-00.yml create mode 100644 ansible/playbooks/verify/07-01.yml create mode 100644 ansible/playbooks/verify/07-02.yml delete mode 100644 ansible/playbooks/verify/_noop-tasks.yml create mode 100644 ansible/playbooks/verify/tasks/nodejs-demo-deploy-verify.yml create mode 100644 ansible/playbooks/verify/tasks/noop-doc-verify.yml create mode 100644 bmad.list.md rename docs/{00-04-部署环境说明.md => 00-02-部署环境说明.md} (51%) delete mode 100644 docs/00-02-验证矩阵.md delete mode 100644 docs/00-03-未来规划与待补功能.md rename docs/{00-05-测试与验证框架.md => 
00-03-测试与验证框架.md} (69%) create mode 100644 docs/00-04-待验证项-验证前准备.md create mode 100644 docs/01-00-安装与基础环境-系列说明.md rename docs/{01-04-双控制节点ha.md => 01-08-双控制节点ha.md} (74%) create mode 100644 docs/03-00-集群侧配置扩展-系列说明.md create mode 100644 docs/04-00-nodejs-系列说明.md rename docs/{04-04-nodejs-端口与Service.md => 04-02-nodejs-端口与Service.md} (65%) rename docs/{04-02-nodejs-镜像与运行命令.md => 04-03-nodejs-镜像与运行命令.md} (60%) rename docs/{04-03-nodejs-环境变量与配置注入.md => 04-04-nodejs-环境变量与配置注入.md} (52%) rename docs/{04-06-nodejs-探针与健康检查.md => 04-05-nodejs-探针与健康检查.md} (62%) rename docs/{04-11-nodejs-副本与滚动发布.md => 04-06-nodejs-副本与滚动发布.md} (68%) rename docs/{04-10-nodejs-Ingress与Traefik.md => 04-07-nodejs-Ingress与Traefik.md} (70%) rename docs/{04-05-nodejs-资源请求与限制.md => 04-08-nodejs-资源请求与限制.md} (61%) rename docs/{04-07-nodejs-调度与亲和.md => 04-09-nodejs-调度与亲和.md} (67%) rename docs/{04-08-nodejs-安全上下文.md => 04-10-nodejs-安全上下文.md} (61%) rename docs/{04-09-nodejs-存储与卷.md => 04-11-nodejs-存储与卷.md} (59%) create mode 100644 docs/05-00-常用应用部署-系列说明.md create mode 100644 docs/06-00-排障与运维-系列说明.md create mode 100644 docs/07-00-网络与CNI实验-系列说明.md create mode 100644 project-context.md create mode 100755 scripts/acceptance.sh create mode 100644 scripts/fix-04-doc-refs.py create mode 100644 scripts/gen-nodejs-demo-yaml.py create mode 100644 scripts/lib-ansible-lab.sh create mode 100644 scripts/resolve_verify_playbook.py delete mode 100644 scripts/ssh/run-phase2-k3s-on-ylc61-as-jack.sh delete mode 100644 scripts/ssh/smoke-verify-matrix-on-ylc61.sh create mode 100755 scripts/test-all.sh create mode 100644 scripts/validate_matrix_playbooks.py mode change 100644 => 100755 scripts/verify.sh diff --git a/.gitignore b/.gitignore index d95d6bd..0d28df6 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ .ssh # 本地填写的验证编排环境变量(从 scripts/.env.verify.example 复制) scripts/.env.verify +# 可选:export ANSIBLE_LOCAL_TMP=$PWD/.ansible-tmp(无写权限 ~/.ansible 时) +.ansible-tmp/ _bmad _bmad-output design-artifacts diff --git 
a/README.md b/README.md index ec3b6f8..0ea6532 100644 --- a/README.md +++ b/README.md @@ -5,60 +5,64 @@ 如果你是第一次看,不用担心,按下面顺序一步一步来就行。 +**路径约定**:下文与总览中的文件路径均相对于**仓库根目录**(在仓库根执行脚本,例如 `./scripts/...`)。 + ## 先知道这仓库怎么逛 - 文档主入口:`docs/00-00-构建总览.md` -- 部署环境说明:`docs/00-04-部署环境说明.md`(节点布局、IP、版本等) +- 部署环境说明:`docs/00-02-部署环境说明.md`(节点布局、IP、版本等) - 脚本主入口:`scripts/README.md` -- 验证状态一览:`docs/00-02-验证矩阵.md` -- 测试与验证框架设计:`docs/00-05-测试与验证框架.md` +- 仓库契约(AI/贡献者必读):`project-context.md`(真源、验证框架、noop/gate、敏感信息约束) +- 测试与验证框架设计:`docs/00-03-测试与验证框架.md` +- **验证入口**:`./scripts/verify.sh`(`full/run-all/run`) -简单理解这三份入口的分工: +编号语义(用于快速判断“是否必须可执行”): -- `README.md`:新手入口,看“要做什么、按什么顺序做”; -- `00-00-构建总览.md`:文档导航,看“下一步该看哪一篇”; -- `00-01-k3s-基础概念.md`:概念速查,看“不懂的 K3s/Traefik/NetworkPolicy 术语”; -- `00-02-验证矩阵.md`:状态面板,看「哪些文档已在真实环境跑通过」;自动化复验用 `./scripts/verify.sh run-all`(见 `00-05`),矩阵里的状态/备注建议仍手工维护。 +- `00-**`:纯文档域(索引/说明/状态板等) +- `XX-00`(`XX>0`):系列入口/导航页 +- `XX-YY`(`XX>0 && YY>0`):分项实践页,必须包含可执行物(YAML 路径或命令块) -目录约定很简单: +简单理解这几份入口的分工: -- 主文档都在 `docs/` -- 脚本都在 `scripts/` -- 脚本默认从仓库根目录执行(例如 `./scripts/...`) +- `README.md`:新手入口,看「要做什么、按什么顺序做」; +- `docs/00-00-构建总览.md`:文档导航 + **学习主线(6 步)**与**附录长单**; +- `docs/00-01-k3s-基础概念.md`:概念速查,看「不懂的 K3s/Traefik/NetworkPolicy 术语」; +- `./scripts/verify.sh`:按 `doc_id` 的自动化验证入口(`full/run-all/run`;清单由 `ansible/playbooks/verify/` 自动生成,且仅包含执行域 `XX>0 && YY>0`)。 -## 新手推荐安装顺序(口语版) +目录约定: -1. **先看总览,别急着装** - 打开 `docs/00-00-构建总览.md`,先把整体拓扑和机器分工看明白。 +- 主文档:`docs/` +- 脚本:`scripts/` -2. **装 K3s 集群(两种方式二选一)** - - **自动化**:按 `docs/01-06-节点初始化-ansible-实践.md`,或在仓库根执行 `./scripts/deploy-lab.sh k3s`(可选 `K3S_PREPARE_STORAGE=true`),完成 61~64 初始化 + server/worker(详见 `scripts/README.md`)。 - - **手动**:先按 `docs/01-01-k3s-控制节点含traefik.md` 装控制节点 61,再按 `docs/01-02-k3s-工作节点.md` 加工作节点 62~64。 +## 学习主线(6 步,推荐) -3. **确认节点 Ready** - 执行 `kubectl get nodes`,确认所有节点 Ready。 +与 `docs/00-00-构建总览.md` 中主线一致;更细的**流程图与分叉说明**也在该篇。 -4. **先用 nginx 做最小验证** - 按 `docs/04-03-k3s-nginx-demo.md`,先打通“能访问”这件事,再上 nodejs。 +1. 
**总览与环境**:读 `docs/00-00-构建总览.md`;需要对照机器与版本时打开 `docs/00-02-部署环境说明.md`。 +2. **概念速查(可跳过)**:读 `docs/00-01-k3s-基础概念.md`;时间紧可跳过,**碰壁再回来看**。 +3. **安装 K3s(二选一)**:**自动化** — `docs/01-06-节点初始化-ansible-实践.md`,或仓库根执行 `./scripts/deploy-lab.sh k3s`(可选 `K3S_PREPARE_STORAGE=true`,详见 `scripts/README.md`);**手动** — `docs/01-01-k3s-控制节点含traefik.md` 再 `docs/01-02-k3s-工作节点.md`。 +4. **确认节点 Ready**:`kubectl get nodes`,全部 Ready。 +5. **Nginx 最小验证**:`docs/02-00-nginx-系列说明.md` → `docs/02-05-nginx-验证矩阵-一键部署.md`,先打通「能访问」;也可在装好集群并配置 `.env.verify` 后直接 `./scripts/verify.sh run 02-05`。 +6. **Node.js 主线入口**:`docs/04-01-k3s-nodejs-高级部署.md`;`docs/04-02`~`04-14` 为分项,**按需展开**,不挤进主线编号。 -5. **再做 nodejs、dashboard、acme** - 对应看 `docs/04-01-k3s-nodejs-高级部署.md`、`docs/03-01-k3s-traefik-dashboard.md`、`docs/03-02-k3s-traefik-acme.md`。 +**主线之后(按需,不占主线序号)**:Traefik 面板与证书(如 `docs/03-01-k3s-traefik-dashboard.md`、`docs/03-02-k3s-traefik-acme.md`)、存储与应用(`03-05` 起、`05-**`)等 — 见总览中的「主线之后的分叉」与专题导航。 -6. **遇到 502/不通,直接用脚本排障** - 去 `scripts/README.md` 抄命令,优先跑入口链路诊断和 firewalld 基线脚本。 +**任意一步卡住**:排障见 `scripts/README.md`(如 firewalld 基线、入口链路诊断);NetworkPolicy 见 `docs/06-01-k3s-networkpolicy-故障排查.md`。 -## 30 分钟快速通关(最小必做) +## 30 分钟快速通关(4 步) -如果你时间有限,先只做这 4 步,跑通再扩展: +相当于**跳过主线第 2 步(概念)**并**压缩第 1 步(只抓总览要点)**;跑通再按 6 步补全。 -1. **装集群**:用 Ansible 按 `docs/01-06-节点初始化-ansible-实践.md` 一键安装(推荐);或按 `docs/01-01` + `docs/01-02` 手动装控制节点(61)与工作节点(62) -2. 执行 `kubectl get nodes`,确认节点 Ready -3. 按 `docs/04-03-k3s-nginx-demo.md` 部署 nginx 示例并访问一次 -4. 若访问不通,按 `scripts/README.md` 先跑 firewalld 基线与入口链路诊断脚本 +1. **装集群**:Ansible 按 `docs/01-06-节点初始化-ansible-实践.md`(推荐);或 `docs/01-01` + `docs/01-02` 手动装控制节点(61)与工作节点(62)。 +2. `kubectl get nodes`,确认节点 Ready。 +3. 按 `docs/02-05-nginx-验证矩阵-一键部署.md` 部署 nginx 矩阵并访问一次(可先读 `docs/02-00-nginx-系列说明.md`)。 +4. 
若访问不通,按 `scripts/README.md` 先跑 firewalld 基线与入口链路诊断脚本。 -跑到这里就算「基础链路通关」。后面再继续 nodejs、dashboard、acme 会轻松很多。 -如果你愿意,也可以顺手在 `docs/00-02-验证矩阵.md` 里,把对应文档的状态改成“已验证”,方便以后回顾。 +跑到这里算「基础链路通关」。后续再补 `04-01`、Traefik、存储等会轻松很多。 +若愿意,可在对应实验篇文档里补充你自己的“已验证环境/日期/版本”记录(不再维护统一矩阵页)。 ## 一句话建议 -先把基础链路(61/62:80)跑通,再叠加业务;每做完一步都做一次 `curl` 验证,排障会轻松很多。 +先把基础链路(如 61/62:80)跑通,再叠加业务;每做完一步做一次 `curl` 验证,排障会轻松很多。 + +补充:`verify.sh` 支持按范围筛选执行(`--series`、`--id-regex`、`--exclude-noop`、`--require-teardown`),适合做分批回归与 CI 分层跑批。 diff --git a/ansible/files/00-01-k3s-基础概念/README.md b/ansible/files/00-01-k3s-基础概念/README.md deleted file mode 100644 index 74e8766..0000000 --- a/ansible/files/00-01-k3s-基础概念/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# 00-01-k3s-基础概念(占位) - -对应文档:[`docs/00-01-k3s-基础概念.md`](../../docs/00-01-k3s-基础概念.md) - -## 说明 - -- 本篇为概念性文档,**不提供可部署的 Kubernetes 清单**。 -- 验证方式:按文档理解与对照集群实际输出即可(无 `kubectl apply -f` 目标)。 - diff --git a/ansible/files/00-04-部署环境说明/README.md b/ansible/files/00-04-部署环境说明/README.md deleted file mode 100644 index 010e24c..0000000 --- a/ansible/files/00-04-部署环境说明/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# 00-04-部署环境说明(占位) - -对应文档:[`docs/00-04-部署环境说明.md`](../../docs/00-04-部署环境说明.md) - -## 说明 - -- 本篇为环境说明文档,**不提供可部署的 Kubernetes 清单**。 -- 验证方式:按文档逐项核对你的实际环境信息(节点、磁盘挂载、版本等)。 - diff --git a/ansible/files/01-01-k3s-控制节点含traefik/README.md b/ansible/files/01-01-k3s-控制节点含traefik/README.md deleted file mode 100644 index 6bb640b..0000000 --- a/ansible/files/01-01-k3s-控制节点含traefik/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 01-01-k3s-控制节点含traefik(占位) - -对应文档:[`docs/01-01-k3s-控制节点含traefik.md`](../../docs/01-01-k3s-控制节点含traefik.md) - -## 说明 - -- 本篇主要是 **K3s 安装与集群初始化**,核心部署逻辑在 Ansible playbook 中。 -- 本目录仅作为 doc_id 对齐占位;不单独维护 K8s manifests。 - -## 关联(参考) - -- Ansible:`ansible/playbooks/k3s-init-and-install.yml` - diff --git a/ansible/files/01-02-k3s-工作节点/README.md b/ansible/files/01-02-k3s-工作节点/README.md deleted file mode 100644 index 666c3ad..0000000 --- a/ansible/files/01-02-k3s-工作节点/README.md 
+++ /dev/null @@ -1,13 +0,0 @@ -# 01-02-k3s-工作节点(占位) - -对应文档:[`docs/01-02-k3s-工作节点.md`](../../docs/01-02-k3s-工作节点.md) - -## 说明 - -- 本篇主要是 **工作节点加入 K3s 集群** 与节点侧配置。 -- 本目录仅作为 doc_id 对齐占位;不单独维护 K8s manifests。 - -## 关联(参考) - -- Ansible:`ansible/playbooks/k3s-init-and-install.yml` - diff --git a/ansible/files/01-03-armv7-standalone-docker/README.md b/ansible/files/01-03-armv7-standalone-docker/README.md deleted file mode 100644 index 0c4cbb7..0000000 --- a/ansible/files/01-03-armv7-standalone-docker/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# 01-03-armv7-standalone-docker(占位) - -对应文档:[`docs/01-03-armv7-standalone-docker.md`](../../docs/01-03-armv7-standalone-docker.md) - -## 说明 - -- 本篇为 armv7 设备的 Docker 独立部署说明,**不提供 K3s/Kubernetes 清单**。 -- 本目录仅用于 doc_id 对齐占位。 - diff --git a/ansible/files/01-04-双控制节点ha/README.md b/ansible/files/01-04-双控制节点ha/README.md deleted file mode 100644 index 10987ef..0000000 --- a/ansible/files/01-04-双控制节点ha/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# 01-04-双控制节点ha(占位) - -对应文档:[`docs/01-04-双控制节点ha.md`](../../docs/01-04-双控制节点ha.md) - -## 说明 - -- 本篇为 HA/双控制节点方案说明,部署更多依赖集群架构与外部 LB 配置。 -- 本目录仅用于 doc_id 对齐占位;不提供独立 K8s manifests。 - diff --git a/ansible/files/01-05-armv7-nfs服务安装/README.md b/ansible/files/01-05-armv7-nfs服务安装/README.md deleted file mode 100644 index f6eac80..0000000 --- a/ansible/files/01-05-armv7-nfs服务安装/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# 01-05-armv7-nfs服务安装(占位) - -对应文档:[`docs/01-05-armv7-nfs服务安装.md`](../../docs/01-05-armv7-nfs服务安装.md) - -## 说明 - -- 本篇为 armv7 设备上 NFS 服务安装说明,**不提供 K3s/Kubernetes 清单**。 -- 本目录仅用于 doc_id 对齐占位。 - diff --git a/ansible/files/01-06-节点初始化-ansible-实践/README.md b/ansible/files/01-06-节点初始化-ansible-实践/README.md deleted file mode 100644 index dac2a09..0000000 --- a/ansible/files/01-06-节点初始化-ansible-实践/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 01-06-节点初始化-ansible-实践(占位) - -对应文档:[`docs/01-06-节点初始化-ansible-实践.md`](../../docs/01-06-节点初始化-ansible-实践.md) - -## 说明 - -- 本篇的“真源”是 Ansible playbooks(初始化、安装、验证)。 -- 本目录仅用于 
doc_id 对齐占位;不单独维护 K8s manifests。 - -## 关联(参考) - -- Ansible:`ansible/playbooks/k3s-init-and-install.yml` - diff --git a/ansible/files/01-07-haproxy/README.md b/ansible/files/01-07-haproxy/README.md deleted file mode 100644 index 4d7a5ac..0000000 --- a/ansible/files/01-07-haproxy/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# 01-07 HAProxy 配置 - -## 核心目标 - -本目录下的 **所有 `*.cfg` 必须可被 HAProxy 正确解析并符合文档意图**。验证分两层: - -| 层次 | 含义 | 如何验证 | -|------|------|----------| -| **① 语法正确** | `haproxy -c -f ` 无致命错误 | 见下文「仅校验 cfg」或主验证脚本第 2 步 | -| **② 运行与后端** | 在 OpenWrt 上实际监听 18080/18443 时,经第三方主机 curl 可达 K3s/Traefik 后端 | `./scripts/01-07-verify-haproxy.sh`(完整流程,含 curl) | - -仓库内 **frontend 已统一为 `18080` / `18443`**(与 LuCI 的 80/443 分离);backend 仍指向各节点 **80/443**(Traefik 入口)。按环境修改 `192.168.2.61`~`192.168.2.64`。 - -## 仅校验本目录 cfg(不跑 curl) - -仅需确认 **① 语法**,在仓库根目录执行: - -```bash -./scripts/01-07-verify-haproxy.sh --cfg-only -``` - -会将本目录全部 `*.cfg` 拷到 OpenWrt 的 `/tmp/haproxy-verify/`,对每台文件执行 `haproxy -c`(与 OpenWrt 上安装的 HAProxy 版本一致)。 - -**说明**:`haproxy-https.cfg` 含 `ssl crt /etc/ssl/haproxy.pem`;若路由器上**没有**该 pem,语法检查可能失败,脚本会标为 `[SKIP]`。在 OpenWrt 放置有效 pem 后应能通过 `haproxy -c`。 - -## 文件一览 - -| 文件 | 说明(对应 `docs/01-07-openwrt-haproxy.md`) | -|------|-----------------------------------------------| -| `haproxy-no-check.cfg` | §2 最简;§3.1 在其 `server` 行加 `check` | -| `haproxy-http.cfg` | §3.2 HTTP 健康检查(明文 80 后端) | -| `haproxy-tls.cfg` | §3.3 TLS 握手检查(443 后端,`mode tcp`) | -| `haproxy-https.cfg` | §3.4 HTTPS 应用层检查(需 HAProxy 终结 TLS,由 HAProxy 提供证书) | -| `haproxy-proxy-http-tls.cfg` | §5 PROXY + HTTP/TLS 检查 | - -## 与 Ansible / OpenWrt - -可与 Ansible 共用(复制到 OpenWrt 或通过 playbook 下发)。一键把 **uhttpd 80/443 + HAProxy 18080/18443** 落到路由器见 `scripts/01-07-deploy-openwrt-haproxy.sh`。 diff --git a/ansible/files/01-07-haproxy/haproxy-http.cfg b/ansible/files/01-07/haproxy-http.cfg similarity index 100% rename from ansible/files/01-07-haproxy/haproxy-http.cfg rename to ansible/files/01-07/haproxy-http.cfg diff --git 
a/ansible/files/01-07-haproxy/haproxy-https.cfg b/ansible/files/01-07/haproxy-https.cfg similarity index 100% rename from ansible/files/01-07-haproxy/haproxy-https.cfg rename to ansible/files/01-07/haproxy-https.cfg diff --git a/ansible/files/01-07-haproxy/haproxy-no-check.cfg b/ansible/files/01-07/haproxy-no-check.cfg similarity index 100% rename from ansible/files/01-07-haproxy/haproxy-no-check.cfg rename to ansible/files/01-07/haproxy-no-check.cfg diff --git a/ansible/files/01-07-haproxy/haproxy-proxy-http-tls.cfg b/ansible/files/01-07/haproxy-proxy-http-tls.cfg similarity index 100% rename from ansible/files/01-07-haproxy/haproxy-proxy-http-tls.cfg rename to ansible/files/01-07/haproxy-proxy-http-tls.cfg diff --git a/ansible/files/01-07-haproxy/haproxy-tls.cfg b/ansible/files/01-07/haproxy-tls.cfg similarity index 100% rename from ansible/files/01-07-haproxy/haproxy-tls.cfg rename to ansible/files/01-07/haproxy-tls.cfg diff --git a/ansible/files/02-00-nginx-系列说明/README.md b/ansible/files/02-00-nginx-系列说明/README.md deleted file mode 100644 index 5656ee2..0000000 --- a/ansible/files/02-00-nginx-系列说明/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# 02-00-nginx-系列说明(占位) - -对应文档:[`docs/02-00-nginx-系列说明.md`](../../docs/02-00-nginx-系列说明.md) - -## 清单复用说明 - -本系列(02-01~02-04)的可部署清单统一收敛在: - -- `ansible/files/02-05-nginx-matrix/` - -本目录仅用于 doc_id 对齐占位。 - diff --git a/ansible/files/02-01-nginx-control-ingress/README.md b/ansible/files/02-01-nginx-control-ingress/README.md deleted file mode 100644 index 98603ff..0000000 --- a/ansible/files/02-01-nginx-control-ingress/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# 02-01-nginx-control-ingress(占位) - -对应文档:[`docs/02-01-nginx-control-ingress.md`](../../docs/02-01-nginx-control-ingress.md) - -## 真源清单 - -- 复用清单目录:`ansible/files/02-05-nginx-matrix/` -- 对应文件:`01-control-ingress.yaml` - -应用示例: - -```bash -kubectl apply -f ansible/files/02-05-nginx-matrix/01-control-ingress.yaml -``` - diff --git 
a/ansible/files/02-02-nginx-control-ingressroute/README.md b/ansible/files/02-02-nginx-control-ingressroute/README.md deleted file mode 100644 index c38fc43..0000000 --- a/ansible/files/02-02-nginx-control-ingressroute/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# 02-02-nginx-control-ingressroute(占位) - -对应文档:[`docs/02-02-nginx-control-ingressroute.md`](../../docs/02-02-nginx-control-ingressroute.md) - -## 真源清单 - -- 复用清单目录:`ansible/files/02-05-nginx-matrix/` -- 对应文件:`02-control-ingressroute.yaml` - -应用示例: - -```bash -kubectl apply -f ansible/files/02-05-nginx-matrix/02-control-ingressroute.yaml -``` - diff --git a/ansible/files/02-03-nginx-worker-ingress/README.md b/ansible/files/02-03-nginx-worker-ingress/README.md deleted file mode 100644 index 163230e..0000000 --- a/ansible/files/02-03-nginx-worker-ingress/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# 02-03-nginx-worker-ingress(占位) - -对应文档:[`docs/02-03-nginx-worker-ingress.md`](../../docs/02-03-nginx-worker-ingress.md) - -## 真源清单 - -- 复用清单目录:`ansible/files/02-05-nginx-matrix/` -- 对应文件:`03-worker-ingress.yaml` - -应用示例: - -```bash -kubectl apply -f ansible/files/02-05-nginx-matrix/03-worker-ingress.yaml -``` - diff --git a/ansible/files/02-04-nginx-worker-ingressroute/README.md b/ansible/files/02-04-nginx-worker-ingressroute/README.md deleted file mode 100644 index 99f0ce2..0000000 --- a/ansible/files/02-04-nginx-worker-ingressroute/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# 02-04-nginx-worker-ingressroute(占位) - -对应文档:[`docs/02-04-nginx-worker-ingressroute.md`](../../docs/02-04-nginx-worker-ingressroute.md) - -## 真源清单 - -- 复用清单目录:`ansible/files/02-05-nginx-matrix/` -- 对应文件:`04-worker-ingressroute.yaml` - -应用示例: - -```bash -kubectl apply -f ansible/files/02-05-nginx-matrix/04-worker-ingressroute.yaml -``` - diff --git a/ansible/files/02-05-nginx-matrix/README.md b/ansible/files/02-05-nginx-matrix/README.md deleted file mode 100644 index f36986c..0000000 --- a/ansible/files/02-05-nginx-matrix/README.md +++ /dev/null @@ 
-1,13 +0,0 @@ -# Nginx 矩阵 manifests - -用于 `ansible/playbooks/nginx-matrix-deploy.yml` 一键部署。 - -| 文件 | 场景 | 路径 | 节点 | -|------|------|------|------| -| 01-control-ingress.yaml | M1 控制+Ingress | /demo-m1 | 无 nodeSelector | -| 02-control-ingressroute.yaml | M2 控制+IngressRoute | /demo-m2 | 无 nodeSelector | -| 03-worker-ingress.yaml | M3 工作+Ingress | /demo-m3 | nodeSelector=worker(随机) | -| 04-worker-ingressroute.yaml | M4 工作+IngressRoute | /demo-m4 | nodeSelector=ylc64 | - -M4 默认指定 ylc64,M3 随机工作节点;按实际修改。 - diff --git a/ansible/files/02-05-nginx-matrix/01-control-ingress.yaml b/ansible/files/02-05/01-control-ingress.yaml similarity index 100% rename from ansible/files/02-05-nginx-matrix/01-control-ingress.yaml rename to ansible/files/02-05/01-control-ingress.yaml diff --git a/ansible/files/02-05-nginx-matrix/02-control-ingressroute.yaml b/ansible/files/02-05/02-control-ingressroute.yaml similarity index 100% rename from ansible/files/02-05-nginx-matrix/02-control-ingressroute.yaml rename to ansible/files/02-05/02-control-ingressroute.yaml diff --git a/ansible/files/02-05-nginx-matrix/03-worker-ingress.yaml b/ansible/files/02-05/03-worker-ingress.yaml similarity index 100% rename from ansible/files/02-05-nginx-matrix/03-worker-ingress.yaml rename to ansible/files/02-05/03-worker-ingress.yaml diff --git a/ansible/files/02-05-nginx-matrix/04-worker-ingressroute.yaml b/ansible/files/02-05/04-worker-ingressroute.yaml similarity index 100% rename from ansible/files/02-05-nginx-matrix/04-worker-ingressroute.yaml rename to ansible/files/02-05/04-worker-ingressroute.yaml diff --git a/ansible/files/03-01-traefik-dashboard/traefik-dashboard.yaml b/ansible/files/03-01/traefik-dashboard.yaml similarity index 100% rename from ansible/files/03-01-traefik-dashboard/traefik-dashboard.yaml rename to ansible/files/03-01/traefik-dashboard.yaml diff --git a/ansible/files/03-02-nginx-matrix-tls/01-control-ingress.yaml b/ansible/files/03-02/01-control-ingress.yaml similarity index 100% rename 
from ansible/files/03-02-nginx-matrix-tls/01-control-ingress.yaml rename to ansible/files/03-02/01-control-ingress.yaml diff --git a/ansible/files/03-02-nginx-matrix-tls/02-control-ingressroute.yaml b/ansible/files/03-02/02-control-ingressroute.yaml similarity index 100% rename from ansible/files/03-02-nginx-matrix-tls/02-control-ingressroute.yaml rename to ansible/files/03-02/02-control-ingressroute.yaml diff --git a/ansible/files/03-02-nginx-matrix-tls/03-worker-ingress.yaml b/ansible/files/03-02/03-worker-ingress.yaml similarity index 100% rename from ansible/files/03-02-nginx-matrix-tls/03-worker-ingress.yaml rename to ansible/files/03-02/03-worker-ingress.yaml diff --git a/ansible/files/03-02-nginx-matrix-tls/04-worker-ingressroute.yaml b/ansible/files/03-02/04-worker-ingressroute.yaml similarity index 100% rename from ansible/files/03-02-nginx-matrix-tls/04-worker-ingressroute.yaml rename to ansible/files/03-02/04-worker-ingressroute.yaml diff --git a/ansible/files/03-02-traefik-acme/traefik-acme.yaml b/ansible/files/03-02/traefik-acme.yaml similarity index 100% rename from ansible/files/03-02-traefik-acme/traefik-acme.yaml rename to ansible/files/03-02/traefik-acme.yaml diff --git a/ansible/files/03-03-traefik-dashboard-acme/tomcat-acme.yaml b/ansible/files/03-03/tomcat-acme.yaml similarity index 100% rename from ansible/files/03-03-traefik-dashboard-acme/tomcat-acme.yaml rename to ansible/files/03-03/tomcat-acme.yaml diff --git a/ansible/files/03-03-traefik-dashboard-acme/traefik-dashboard-acme.yaml b/ansible/files/03-03/traefik-dashboard-acme.yaml similarity index 100% rename from ansible/files/03-03-traefik-dashboard-acme/traefik-dashboard-acme.yaml rename to ansible/files/03-03/traefik-dashboard-acme.yaml diff --git a/ansible/files/03-04-cloudflare-tunnel/cloudflared.yaml b/ansible/files/03-04/cloudflared.yaml similarity index 100% rename from ansible/files/03-04-cloudflare-tunnel/cloudflared.yaml rename to ansible/files/03-04/cloudflared.yaml diff --git 
a/ansible/files/03-05-local-path-config/local-path-config-lab.json b/ansible/files/03-05/local-path-config-lab.json similarity index 100% rename from ansible/files/03-05-local-path-config/local-path-config-lab.json rename to ansible/files/03-05/local-path-config-lab.json diff --git a/ansible/files/03-05-local-path-demo/local-path-pvc-demo.yaml b/ansible/files/03-05/local-path-pvc-demo.yaml similarity index 100% rename from ansible/files/03-05-local-path-demo/local-path-pvc-demo.yaml rename to ansible/files/03-05/local-path-pvc-demo.yaml diff --git a/ansible/files/03-05/nginx-hostpath-demo.yaml b/ansible/files/03-05/nginx-hostpath-demo.yaml new file mode 100644 index 0000000..54b2631 --- /dev/null +++ b/ansible/files/03-05/nginx-hostpath-demo.yaml @@ -0,0 +1,43 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nginx-hostpath-demo + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: nginx-hostpath-demo + template: + metadata: + labels: + app: nginx-hostpath-demo + spec: + nodeSelector: + kubernetes.io/hostname: ylc61 + containers: + - name: nginx + image: nginx:1.27-alpine + ports: + - containerPort: 80 + volumeMounts: + - name: app-data + mountPath: /usr/share/nginx/html + volumes: + - name: app-data + hostPath: + path: /data/nginx-hostpath-demo + type: DirectoryOrCreate +--- +apiVersion: v1 +kind: Service +metadata: + name: nginx-hostpath-demo + namespace: default +spec: + selector: + app: nginx-hostpath-demo + ports: + - port: 80 + targetPort: 80 + type: ClusterIP diff --git a/ansible/files/03-06/nfs-direct-demo.yaml b/ansible/files/03-06/nfs-direct-demo.yaml new file mode 100644 index 0000000..e28974d --- /dev/null +++ b/ansible/files/03-06/nfs-direct-demo.yaml @@ -0,0 +1,26 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nfs-direct-demo + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: nfs-direct-demo + template: + metadata: + labels: + app: nfs-direct-demo + spec: + containers: + - name: 
app + image: nginx:alpine + volumeMounts: + - name: nfs-data + mountPath: /usr/share/nginx/html + volumes: + - name: nfs-data + nfs: + server: + path: diff --git a/ansible/files/03-06/nfs-dynamic-pvc-demo.yaml b/ansible/files/03-06/nfs-dynamic-pvc-demo.yaml new file mode 100644 index 0000000..1f67afe --- /dev/null +++ b/ansible/files/03-06/nfs-dynamic-pvc-demo.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: nfs-dynamic-pvc-demo + namespace: default +spec: + accessModes: + - ReadWriteMany + storageClassName: nfs-client + resources: + requests: + storage: 5Gi diff --git a/ansible/files/03-06-nfs-demo/nfs-pv-pvc-demo.yaml b/ansible/files/03-06/nfs-pv-pvc-demo.yaml similarity index 100% rename from ansible/files/03-06-nfs-demo/nfs-pv-pvc-demo.yaml rename to ansible/files/03-06/nfs-pv-pvc-demo.yaml diff --git a/ansible/files/03-07-longhorn/values-lab.yaml b/ansible/files/03-07/values-lab.yaml similarity index 100% rename from ansible/files/03-07-longhorn/values-lab.yaml rename to ansible/files/03-07/values-lab.yaml diff --git a/ansible/files/03-08-k3s-ha-集群配置与切换/README.md b/ansible/files/03-08-k3s-ha-集群配置与切换/README.md deleted file mode 100644 index 2187f75..0000000 --- a/ansible/files/03-08-k3s-ha-集群配置与切换/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# 03-08-k3s-ha-集群配置与切换(占位) - -对应文档:[`docs/03-08-k3s-ha-集群配置与切换.md`](../../docs/03-08-k3s-ha-集群配置与切换.md) - -## 说明 - -- 本篇偏架构/流程与配置项梳理,具体落地会涉及多节点与外部组件(如 LB/DNS/证书)。 -- 本目录仅用于 doc_id 对齐占位;暂无独立可复用 manifests。 - diff --git a/ansible/files/03-09-k3s-gitops-集群配置管理/README.md b/ansible/files/03-09-k3s-gitops-集群配置管理/README.md deleted file mode 100644 index dc5e4f3..0000000 --- a/ansible/files/03-09-k3s-gitops-集群配置管理/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# 03-09-k3s-gitops-集群配置管理(占位) - -对应文档:[`docs/03-09-k3s-gitops-集群配置管理.md`](../../docs/03-09-k3s-gitops-集群配置管理.md) - -## 说明 - -- 本篇为 GitOps 框架草案(Argo CD / Flux 等),最终 manifests 取决于选型与版本。 -- 本目录仅用于 doc_id 对齐占位;暂无固定清单。 - diff --git 
a/ansible/files/03-10-traefik-custom-ports/traefik-custom-ports.yaml b/ansible/files/03-10/traefik-custom-ports.yaml similarity index 100% rename from ansible/files/03-10-traefik-custom-ports/traefik-custom-ports.yaml rename to ansible/files/03-10/traefik-custom-ports.yaml diff --git a/ansible/files/04-01-nodejs-demo/README.md b/ansible/files/04-01-nodejs-demo/README.md deleted file mode 100644 index 4a81a9c..0000000 --- a/ansible/files/04-01-nodejs-demo/README.md +++ /dev/null @@ -1,43 +0,0 @@ -# Node.js demo 清单(与 docs/04-01~04-14 对齐) - -**唯一真源**:本目录下 YAML 与 `docs/` 中说明一致;文档内不重复贴全文,避免漂移。 - -## 累积规则 - -- `04-0N-nodejs-demo.yaml` 表示:从 `04-01` 起顺序做完 **04-01~04-0N** 各篇能力后的 **一份** 可 `kubectl apply -f` 的完整状态(多资源用 `---` 分隔)。 -- **可直接跳到最后一份** 做实验,不必逐文件 apply;若要理解每步增量,可按编号顺序阅读文档并对照相邻两个 YAML 的差异。 -- **04-14**(GitOps/CI)无独立清单,见 `docs/04-14-nodejs-GitOps与CI流水线.md` 与 `docs/05-04-k3s-配置gitlab-cicd.md`、`docs/03-09-k3s-gitops-集群配置管理.md`。 - -## 文件与文档对照 - -| 文件 | 文档 | 备注 | -|------|------|------| -| `04-01-nodejs-demo.yaml` | `docs/04-01-k3s-nodejs-高级部署.md` | 基线:3000、`/node`、无 host | -| `04-02-nodejs-demo.yaml` | `docs/04-02-nodejs-镜像与运行命令.md` | 固定镜像 tag、`imagePullPolicy` | -| `04-03-nodejs-demo.yaml` | `docs/04-03-nodejs-环境变量与配置注入.md` | + ConfigMap;Secret 示例见文末 `nodejs-demo-secret.example.yaml` | -| `04-04-nodejs-demo.yaml` | `docs/04-04-nodejs-端口与Service.md` | 监听改 **8080**(自 04-04 起探针与后续均用 8080) | -| `04-05-nodejs-demo.yaml` | `docs/04-05-nodejs-资源请求与限制.md` | + resources | -| `04-06-nodejs-demo.yaml` | `docs/04-06-nodejs-探针与健康检查.md` | + 探针 | -| `04-07-nodejs-demo.yaml` | `docs/04-07-nodejs-调度与亲和.md` | + `nodeSelector`(默认 **ylc62**,请改为本机节点名) | -| `04-08-nodejs-demo.yaml` | `docs/04-08-nodejs-安全上下文.md` | + 非 root、只读根、`/tmp` emptyDir | -| `04-09-nodejs-demo.yaml` | `docs/04-09-nodejs-存储与卷.md` | + PVC `nodejs-demo-data`(默认 **local-path**) | -| `04-10-nodejs-demo.yaml` | `docs/04-10-nodejs-Ingress与Traefik.md` | Ingress:`host` + `/api`,curl 需 **Host** | -| `04-11-nodejs-demo.yaml` | 
`docs/04-11-nodejs-副本与滚动发布.md` | replicas=3 + RollingUpdate | -| `04-12-nodejs-demo.yaml` | `docs/04-12-nodejs-TLS与证书.md` | **websecure** + TLS;须先创建 `nodejs-demo-tls` Secret | -| `04-13-nodejs-demo.yaml` | `docs/04-13-nodejs-HPA.md` | + HPA(需 metrics-server) | - -## 应用方式 - -```bash -# 仓库根目录 -kubectl apply -f ansible/files/04-01-nodejs-demo/04-01-nodejs-demo.yaml -``` - -或使用 Ansible:`ansible/playbooks/nodejs-demo-apply.yml`,变量 `nodejs_demo_manifest` 指定文件名。 - -## dry-run - -```bash -kubectl apply --dry-run=client -f ansible/files/04-01-nodejs-demo/04-01-nodejs-demo.yaml -``` - diff --git a/ansible/files/04-01-nodejs-demo/04-01-nodejs-demo.yaml b/ansible/files/04-01/04-01-nodejs-demo.yaml similarity index 100% rename from ansible/files/04-01-nodejs-demo/04-01-nodejs-demo.yaml rename to ansible/files/04-01/04-01-nodejs-demo.yaml diff --git a/ansible/files/04-01-nodejs-demo/04-02-nodejs-demo.yaml b/ansible/files/04-01/04-02-nodejs-demo.yaml similarity index 100% rename from ansible/files/04-01-nodejs-demo/04-02-nodejs-demo.yaml rename to ansible/files/04-01/04-02-nodejs-demo.yaml diff --git a/ansible/files/04-01-nodejs-demo/04-03-nodejs-demo.yaml b/ansible/files/04-01/04-03-nodejs-demo.yaml similarity index 100% rename from ansible/files/04-01-nodejs-demo/04-03-nodejs-demo.yaml rename to ansible/files/04-01/04-03-nodejs-demo.yaml diff --git a/ansible/files/04-01-nodejs-demo/04-04-nodejs-demo.yaml b/ansible/files/04-01/04-04-nodejs-demo.yaml similarity index 100% rename from ansible/files/04-01-nodejs-demo/04-04-nodejs-demo.yaml rename to ansible/files/04-01/04-04-nodejs-demo.yaml diff --git a/ansible/files/04-01-nodejs-demo/04-05-nodejs-demo.yaml b/ansible/files/04-01/04-05-nodejs-demo.yaml similarity index 100% rename from ansible/files/04-01-nodejs-demo/04-05-nodejs-demo.yaml rename to ansible/files/04-01/04-05-nodejs-demo.yaml diff --git a/ansible/files/04-01-nodejs-demo/04-06-nodejs-demo.yaml b/ansible/files/04-01/04-06-nodejs-demo.yaml similarity index 100% rename 
from ansible/files/04-01-nodejs-demo/04-06-nodejs-demo.yaml rename to ansible/files/04-01/04-06-nodejs-demo.yaml diff --git a/ansible/files/04-01-nodejs-demo/04-07-nodejs-demo.yaml b/ansible/files/04-01/04-07-nodejs-demo.yaml similarity index 100% rename from ansible/files/04-01-nodejs-demo/04-07-nodejs-demo.yaml rename to ansible/files/04-01/04-07-nodejs-demo.yaml diff --git a/ansible/files/04-01-nodejs-demo/04-08-nodejs-demo.yaml b/ansible/files/04-01/04-08-nodejs-demo.yaml similarity index 100% rename from ansible/files/04-01-nodejs-demo/04-08-nodejs-demo.yaml rename to ansible/files/04-01/04-08-nodejs-demo.yaml diff --git a/ansible/files/04-01-nodejs-demo/04-09-nodejs-demo.yaml b/ansible/files/04-01/04-09-nodejs-demo.yaml similarity index 100% rename from ansible/files/04-01-nodejs-demo/04-09-nodejs-demo.yaml rename to ansible/files/04-01/04-09-nodejs-demo.yaml diff --git a/ansible/files/04-01-nodejs-demo/04-10-nodejs-demo.yaml b/ansible/files/04-01/04-10-nodejs-demo.yaml similarity index 100% rename from ansible/files/04-01-nodejs-demo/04-10-nodejs-demo.yaml rename to ansible/files/04-01/04-10-nodejs-demo.yaml diff --git a/ansible/files/04-01-nodejs-demo/04-11-nodejs-demo.yaml b/ansible/files/04-01/04-11-nodejs-demo.yaml similarity index 100% rename from ansible/files/04-01-nodejs-demo/04-11-nodejs-demo.yaml rename to ansible/files/04-01/04-11-nodejs-demo.yaml diff --git a/ansible/files/04-01-nodejs-demo/04-12-nodejs-demo.yaml b/ansible/files/04-01/04-12-nodejs-demo.yaml similarity index 100% rename from ansible/files/04-01-nodejs-demo/04-12-nodejs-demo.yaml rename to ansible/files/04-01/04-12-nodejs-demo.yaml diff --git a/ansible/files/04-01-nodejs-demo/04-13-nodejs-demo.yaml b/ansible/files/04-01/04-13-nodejs-demo.yaml similarity index 100% rename from ansible/files/04-01-nodejs-demo/04-13-nodejs-demo.yaml rename to ansible/files/04-01/04-13-nodejs-demo.yaml diff --git a/ansible/files/04-01-nodejs-demo/nodejs-demo-secret.example.yaml 
b/ansible/files/04-01/nodejs-demo-secret.example.yaml similarity index 100% rename from ansible/files/04-01-nodejs-demo/nodejs-demo-secret.example.yaml rename to ansible/files/04-01/nodejs-demo-secret.example.yaml diff --git a/ansible/files/04-02-nodejs-镜像与运行命令/README.md b/ansible/files/04-02-nodejs-镜像与运行命令/README.md deleted file mode 100644 index 9cdba40..0000000 --- a/ansible/files/04-02-nodejs-镜像与运行命令/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 04-02-nodejs-镜像与运行命令(占位) - -对应文档:[`docs/04-02-nodejs-镜像与运行命令.md`](../../docs/04-02-nodejs-镜像与运行命令.md) - -## 真源清单(复用 04-01 累积目录) - -- 真源目录:`ansible/files/04-01-nodejs-demo/` -- 对应累积清单:`04-02-nodejs-demo.yaml` - -```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-02-nodejs-demo.yaml -``` - diff --git a/ansible/files/04-03-nodejs-环境变量与配置注入/README.md b/ansible/files/04-03-nodejs-环境变量与配置注入/README.md deleted file mode 100644 index df44f17..0000000 --- a/ansible/files/04-03-nodejs-环境变量与配置注入/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 04-03-nodejs-环境变量与配置注入(占位) - -对应文档:[`docs/04-03-nodejs-环境变量与配置注入.md`](../../docs/04-03-nodejs-环境变量与配置注入.md) - -## 真源清单(复用 04-01 累积目录) - -- 真源目录:`ansible/files/04-01-nodejs-demo/` -- 对应累积清单:`04-03-nodejs-demo.yaml` - -```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-03-nodejs-demo.yaml -``` - diff --git a/ansible/files/04-04-nodejs-端口与Service/README.md b/ansible/files/04-04-nodejs-端口与Service/README.md deleted file mode 100644 index 6359c89..0000000 --- a/ansible/files/04-04-nodejs-端口与Service/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 04-04-nodejs-端口与Service(占位) - -对应文档:[`docs/04-04-nodejs-端口与Service.md`](../../docs/04-04-nodejs-端口与Service.md) - -## 真源清单(复用 04-01 累积目录) - -- 真源目录:`ansible/files/04-01-nodejs-demo/` -- 对应累积清单:`04-04-nodejs-demo.yaml` - -```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-04-nodejs-demo.yaml -``` - diff --git a/ansible/files/04-05-nodejs-资源请求与限制/README.md b/ansible/files/04-05-nodejs-资源请求与限制/README.md deleted file mode 100644 index 
9bed620..0000000 --- a/ansible/files/04-05-nodejs-资源请求与限制/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 04-05-nodejs-资源请求与限制(占位) - -对应文档:[`docs/04-05-nodejs-资源请求与限制.md`](../../docs/04-05-nodejs-资源请求与限制.md) - -## 真源清单(复用 04-01 累积目录) - -- 真源目录:`ansible/files/04-01-nodejs-demo/` -- 对应累积清单:`04-05-nodejs-demo.yaml` - -```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-05-nodejs-demo.yaml -``` - diff --git a/ansible/files/04-06-nodejs-探针与健康检查/README.md b/ansible/files/04-06-nodejs-探针与健康检查/README.md deleted file mode 100644 index f0ab96b..0000000 --- a/ansible/files/04-06-nodejs-探针与健康检查/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 04-06-nodejs-探针与健康检查(占位) - -对应文档:[`docs/04-06-nodejs-探针与健康检查.md`](../../docs/04-06-nodejs-探针与健康检查.md) - -## 真源清单(复用 04-01 累积目录) - -- 真源目录:`ansible/files/04-01-nodejs-demo/` -- 对应累积清单:`04-06-nodejs-demo.yaml` - -```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-06-nodejs-demo.yaml -``` - diff --git a/ansible/files/04-07-nodejs-调度与亲和/README.md b/ansible/files/04-07-nodejs-调度与亲和/README.md deleted file mode 100644 index f69696b..0000000 --- a/ansible/files/04-07-nodejs-调度与亲和/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 04-07-nodejs-调度与亲和(占位) - -对应文档:[`docs/04-07-nodejs-调度与亲和.md`](../../docs/04-07-nodejs-调度与亲和.md) - -## 真源清单(复用 04-01 累积目录) - -- 真源目录:`ansible/files/04-01-nodejs-demo/` -- 对应累积清单:`04-07-nodejs-demo.yaml` - -```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-07-nodejs-demo.yaml -``` - diff --git a/ansible/files/04-08-nodejs-安全上下文/README.md b/ansible/files/04-08-nodejs-安全上下文/README.md deleted file mode 100644 index 94a09c0..0000000 --- a/ansible/files/04-08-nodejs-安全上下文/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 04-08-nodejs-安全上下文(占位) - -对应文档:[`docs/04-08-nodejs-安全上下文.md`](../../docs/04-08-nodejs-安全上下文.md) - -## 真源清单(复用 04-01 累积目录) - -- 真源目录:`ansible/files/04-01-nodejs-demo/` -- 对应累积清单:`04-08-nodejs-demo.yaml` - -```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-08-nodejs-demo.yaml -``` - diff --git 
a/ansible/files/04-09-nodejs-存储与卷/README.md b/ansible/files/04-09-nodejs-存储与卷/README.md deleted file mode 100644 index 88e82ca..0000000 --- a/ansible/files/04-09-nodejs-存储与卷/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 04-09-nodejs-存储与卷(占位) - -对应文档:[`docs/04-09-nodejs-存储与卷.md`](../../docs/04-09-nodejs-存储与卷.md) - -## 真源清单(复用 04-01 累积目录) - -- 真源目录:`ansible/files/04-01-nodejs-demo/` -- 对应累积清单:`04-09-nodejs-demo.yaml` - -```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-09-nodejs-demo.yaml -``` - diff --git a/ansible/files/04-10-nodejs-Ingress与Traefik/README.md b/ansible/files/04-10-nodejs-Ingress与Traefik/README.md deleted file mode 100644 index 4894b86..0000000 --- a/ansible/files/04-10-nodejs-Ingress与Traefik/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 04-10-nodejs-Ingress与Traefik(占位) - -对应文档:[`docs/04-10-nodejs-Ingress与Traefik.md`](../../docs/04-10-nodejs-Ingress与Traefik.md) - -## 真源清单(复用 04-01 累积目录) - -- 真源目录:`ansible/files/04-01-nodejs-demo/` -- 对应累积清单:`04-10-nodejs-demo.yaml` - -```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-10-nodejs-demo.yaml -``` - diff --git a/ansible/files/04-11-nodejs-副本与滚动发布/README.md b/ansible/files/04-11-nodejs-副本与滚动发布/README.md deleted file mode 100644 index 1239624..0000000 --- a/ansible/files/04-11-nodejs-副本与滚动发布/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 04-11-nodejs-副本与滚动发布(占位) - -对应文档:[`docs/04-11-nodejs-副本与滚动发布.md`](../../docs/04-11-nodejs-副本与滚动发布.md) - -## 真源清单(复用 04-01 累积目录) - -- 真源目录:`ansible/files/04-01-nodejs-demo/` -- 对应累积清单:`04-11-nodejs-demo.yaml` - -```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-11-nodejs-demo.yaml -``` - diff --git a/ansible/files/04-12-nodejs-TLS与证书/README.md b/ansible/files/04-12-nodejs-TLS与证书/README.md deleted file mode 100644 index b8a7c35..0000000 --- a/ansible/files/04-12-nodejs-TLS与证书/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 04-12-nodejs-TLS与证书(占位) - -对应文档:[`docs/04-12-nodejs-TLS与证书.md`](../../docs/04-12-nodejs-TLS与证书.md) - -## 真源清单(复用 04-01 累积目录) - -- 
真源目录:`ansible/files/04-01-nodejs-demo/` -- 对应累积清单:`04-12-nodejs-demo.yaml` - -```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-12-nodejs-demo.yaml -``` - diff --git a/ansible/files/04-13-nodejs-HPA/README.md b/ansible/files/04-13-nodejs-HPA/README.md deleted file mode 100644 index ec89a09..0000000 --- a/ansible/files/04-13-nodejs-HPA/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# 04-13-nodejs-HPA(占位) - -对应文档:[`docs/04-13-nodejs-HPA.md`](../../docs/04-13-nodejs-HPA.md) - -## 真源清单(复用 04-01 累积目录) - -- 真源目录:`ansible/files/04-01-nodejs-demo/` -- 对应累积清单:`04-13-nodejs-demo.yaml` - -```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-13-nodejs-demo.yaml -``` - diff --git a/ansible/files/04-14-nodejs-GitOps与CI流水线/README.md b/ansible/files/04-14-nodejs-GitOps与CI流水线/README.md deleted file mode 100644 index 69c663b..0000000 --- a/ansible/files/04-14-nodejs-GitOps与CI流水线/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# 04-14-nodejs-GitOps与CI流水线(占位) - -对应文档:[`docs/04-14-nodejs-GitOps与CI流水线.md`](../../docs/04-14-nodejs-GitOps与CI流水线.md) - -## 说明 - -- 本篇为流程/方法论文档,通常不会提供一份固定可复用的 K8s 清单。 -- 如需参考示例清单,可从 `ansible/files/04-01-nodejs-demo/` 选择对应阶段的累积 YAML。 - diff --git a/ansible/files/05-01/glances-docker-compose.example.yaml b/ansible/files/05-01/glances-docker-compose.example.yaml new file mode 100644 index 0000000..6aac414 --- /dev/null +++ b/ansible/files/05-01/glances-docker-compose.example.yaml @@ -0,0 +1,10 @@ +services: + glances: + image: nicolargo/glances:latest + container_name: glances + environment: + - TZ=Asia/Shanghai + - GLANCES_OPT=-w + ports: + - "61208:61208" + restart: unless-stopped diff --git a/ansible/files/05-01/homer-glances-item.example.yaml b/ansible/files/05-01/homer-glances-item.example.yaml new file mode 100644 index 0000000..b5482ab --- /dev/null +++ b/ansible/files/05-01/homer-glances-item.example.yaml @@ -0,0 +1,6 @@ +# Homer config.yml fragment example +- name: "System Metrics" + type: "Glances" + icon: "fa-solid fa-heart-pulse" + url: 
"https://glances.example.com" + stats: [cpu, mem] diff --git a/ansible/files/05-01-homer/homer.yaml b/ansible/files/05-01/homer.yaml similarity index 100% rename from ansible/files/05-01-homer/homer.yaml rename to ansible/files/05-01/homer.yaml diff --git a/ansible/files/05-02-onenav/onenav-proxy.yaml b/ansible/files/05-02/onenav-proxy.yaml similarity index 100% rename from ansible/files/05-02-onenav/onenav-proxy.yaml rename to ansible/files/05-02/onenav-proxy.yaml diff --git a/ansible/files/05-03-gitlab-runner/gitlab-ci-runner-tags.example.yml b/ansible/files/05-03/gitlab-ci-runner-tags.example.yml similarity index 100% rename from ansible/files/05-03-gitlab-runner/gitlab-ci-runner-tags.example.yml rename to ansible/files/05-03/gitlab-ci-runner-tags.example.yml diff --git a/ansible/files/05-04-gitlab-cicd/README.md b/ansible/files/05-04-gitlab-cicd/README.md deleted file mode 100644 index 27531e3..0000000 --- a/ansible/files/05-04-gitlab-cicd/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# GitLab CI 示例(与 docs 对照) - -| 文件 | 文档 | -|------|------| -| `gitlab-ci-minimal.example.yml` | `docs/05-04-k3s-配置gitlab-cicd.md` | -| `gitlab-ci-multi-arch-deploy.example.yml` | `docs/05-04-k3s-配置gitlab-cicd.md` | -| `../05-03-gitlab-runner/gitlab-ci-runner-tags.example.yml` | `docs/05-03-k3s-安装gitlab-含runner.md` | - -复制为 `.gitlab-ci.yml` 或 `include` 引用;变量与 Runner 以文档为准。 - diff --git a/ansible/files/05-04-gitlab-cicd/gitlab-ci-minimal.example.yml b/ansible/files/05-04/gitlab-ci-minimal.example.yml similarity index 100% rename from ansible/files/05-04-gitlab-cicd/gitlab-ci-minimal.example.yml rename to ansible/files/05-04/gitlab-ci-minimal.example.yml diff --git a/ansible/files/05-04-gitlab-cicd/gitlab-ci-multi-arch-deploy.example.yml b/ansible/files/05-04/gitlab-ci-multi-arch-deploy.example.yml similarity index 100% rename from ansible/files/05-04-gitlab-cicd/gitlab-ci-multi-arch-deploy.example.yml rename to ansible/files/05-04/gitlab-ci-multi-arch-deploy.example.yml diff --git 
a/ansible/files/05-05-prometheus与grafana/README.md b/ansible/files/05-05-prometheus与grafana/README.md deleted file mode 100644 index e366f88..0000000 --- a/ansible/files/05-05-prometheus与grafana/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# 05-05-prometheus与grafana(占位) - -对应文档:[`docs/05-05-prometheus与grafana.md`](../../docs/05-05-prometheus与grafana.md) - -## 说明 - -- 监控栈通常通过 Helm Chart(如 kube-prometheus-stack)安装,清单会随版本变化。 -- 本目录仅用于 doc_id 对齐占位;后续若固化 values/Chart 版本,可在此补齐 manifests/values。 - diff --git a/ansible/files/05-06-openlist/openlist-backup-cronjob.yaml b/ansible/files/05-06/openlist-backup-cronjob.yaml similarity index 100% rename from ansible/files/05-06-openlist/openlist-backup-cronjob.yaml rename to ansible/files/05-06/openlist-backup-cronjob.yaml diff --git a/ansible/files/05-07-openclaw/openclaw-proxy.yaml b/ansible/files/05-07/openclaw-proxy.yaml similarity index 100% rename from ansible/files/05-07-openclaw/openclaw-proxy.yaml rename to ansible/files/05-07/openclaw-proxy.yaml diff --git a/ansible/files/05-07-openclaw/openclaw-server.yml b/ansible/files/05-07/openclaw-server.yml similarity index 100% rename from ansible/files/05-07-openclaw/openclaw-server.yml rename to ansible/files/05-07/openclaw-server.yml diff --git a/ansible/files/05-08-openclaw/openclaw-k3s-experimental.yaml b/ansible/files/05-08/openclaw-k3s-experimental.yaml similarity index 100% rename from ansible/files/05-08-openclaw/openclaw-k3s-experimental.yaml rename to ansible/files/05-08/openclaw-k3s-experimental.yaml diff --git a/ansible/files/05-09-openclaw-web-小游戏网页平台/openclaw-web.yml b/ansible/files/05-09/openclaw-web.yml similarity index 100% rename from ansible/files/05-09-openclaw-web-小游戏网页平台/openclaw-web.yml rename to ansible/files/05-09/openclaw-web.yml diff --git a/ansible/files/06-01-k3s-networkpolicy-故障排查/README.md b/ansible/files/06-01-k3s-networkpolicy-故障排查/README.md deleted file mode 100644 index 3240e73..0000000 --- a/ansible/files/06-01-k3s-networkpolicy-故障排查/README.md 
+++ /dev/null @@ -1,9 +0,0 @@ -# 06-01-k3s-networkpolicy-故障排查(占位) - -对应文档:[`docs/06-01-k3s-networkpolicy-故障排查.md`](../../docs/06-01-k3s-networkpolicy-故障排查.md) - -## 说明 - -- 本篇为排障手册/命令集合,**不提供固定可部署清单**。 -- 本目录仅用于 doc_id 对齐占位。 - diff --git a/ansible/files/06-02-运维小结/README.md b/ansible/files/06-02-运维小结/README.md deleted file mode 100644 index 22d4a72..0000000 --- a/ansible/files/06-02-运维小结/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# 06-02-运维小结(占位) - -对应文档:[`docs/06-02-运维小结.md`](../../docs/06-02-运维小结.md) - -## 说明 - -- 本篇为运维建议/巡检要点总结,通常不对应单一可部署清单。 -- 本目录仅用于 doc_id 对齐占位。 - diff --git a/ansible/files/06-03-k3s-自动备份与恢复-openlist-webdav/README.md b/ansible/files/06-03-k3s-自动备份与恢复-openlist-webdav/README.md deleted file mode 100644 index 881d2b9..0000000 --- a/ansible/files/06-03-k3s-自动备份与恢复-openlist-webdav/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# 06-03-k3s-自动备份与恢复-openlist-webdav(对齐 README) - -对应文档:[`docs/06-03-k3s-自动备份与恢复-openlist-webdav.md`](../../docs/06-03-k3s-自动备份与恢复-openlist-webdav.md) - -## 真源清单目录 - -本篇可部署清单当前收敛在: - -- `ansible/files/06-03-openlist-webdav/` - -说明:该目录名未镜像 docs 文件名;为满足“doc_id 目录对齐”口径,本目录仅作为桥接与入口。 - diff --git a/ansible/files/06-03-openlist-webdav/app-data-backup-cronjob.yaml b/ansible/files/06-03/app-data-backup-cronjob.yaml similarity index 100% rename from ansible/files/06-03-openlist-webdav/app-data-backup-cronjob.yaml rename to ansible/files/06-03/app-data-backup-cronjob.yaml diff --git a/ansible/files/06-03-openlist-webdav/app-data-restore-job.yaml b/ansible/files/06-03/app-data-restore-job.yaml similarity index 100% rename from ansible/files/06-03-openlist-webdav/app-data-restore-job.yaml rename to ansible/files/06-03/app-data-restore-job.yaml diff --git a/ansible/group_vars/all.yml b/ansible/group_vars/all.yml index 19613b5..f15eb1f 100644 --- a/ansible/group_vars/all.yml +++ b/ansible/group_vars/all.yml @@ -12,14 +12,16 @@ k3s_server_ip: "192.168.2.61" # 安装 k3s 前校验:/storage 为挂载点且与 / 不同设备(实验室 10G+32G 建议 true;「目录式假 /storage」旧环境可 false) 
k3s_verify_storage_mount: true -# 可选:由 playbooks/k3s-prepare-storage.yml 对第二块整盘分区、格式化并挂载到 k3s_data_dir(会清空该盘,见 01-06) +# 可选:由 playbooks/verify/01-06.yml(-e k3s_do_prepare_storage=true)对第二块整盘分区、格式化并挂载到 k3s_data_dir(会清空该盘,见 01-06) k3s_prepare_storage: false # k3s_data_disk_device: "/dev/vdb" # NVMe 整盘一般为 /dev/nvme0n1,首分区为 /dev/nvme0n1p1,playbook 会按设备名自动加 1 或 p1 -# Longhorn Helm(playbooks/longhorn-install.yml) +# Longhorn Helm(playbooks/verify/03-07.yml) longhorn_chart_version: "1.7.2" longhorn_install_node_packages: true +# 仅在 Helm 与残留 CRD 严重冲突时设 true;默认 false。装前删光 CRD 可能导致 helm install 报 CRD not found +longhorn_force_crd_reset: false # 是否在 longhorn-install 末尾应用本仓库 local-path 实验室 ConfigMap longhorn_apply_local_path_lab: false diff --git a/ansible/playbooks/apply-local-path-config-lab.yml b/ansible/playbooks/apply-local-path-config-lab.yml deleted file mode 100644 index 978e2e5..0000000 --- a/ansible/playbooks/apply-local-path-config-lab.yml +++ /dev/null @@ -1,38 +0,0 @@ ---- -# 部署:docs/00-05 §2 步骤 3——local-path ConfigMap;PVC 演示验收见 scripts/verify.sh run 03-05。 -# 仅应用本仓库 local-path 实验室 ConfigMap(不安装 Longhorn)。在 k3s_server 上执行。 -# 与 docs/03-05 中「方法一」一致,真源:ansible/files/03-05-local-path-config/local-path-config-lab.json - -- name: Apply local-path-config lab JSON - hosts: k3s_server - become: true - run_once: true - vars: - k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - local_path_json_src: "{{ playbook_dir }}/../files/03-05-local-path-config/local-path-config-lab.json" - local_path_json_dest: /root/local-path-config-lab.json - tasks: - - name: Copy local-path lab json - ansible.builtin.copy: - src: "{{ local_path_json_src }}" - dest: "{{ local_path_json_dest }}" - mode: "0644" - - - name: Apply local-path-config ConfigMap - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system create configmap local-path-config \ - --from-file=config.json={{ local_path_json_dest }} \ - --dry-run=client -o yaml | KUBECONFIG={{ k3s_kubeconfig }} kubectl 
apply -f - - args: - executable: /bin/bash - changed_when: true - - - name: Restart local-path-provisioner if present - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system rollout restart deploy/local-path-provisioner - args: - executable: /bin/bash - register: lp_restart - failed_when: false - changed_when: lp_restart.rc == 0 diff --git a/ansible/playbooks/k3s-init-and-install.yml b/ansible/playbooks/k3s-init-and-install.yml deleted file mode 100644 index 8d791c7..0000000 --- a/ansible/playbooks/k3s-init-and-install.yml +++ /dev/null @@ -1,270 +0,0 @@ ---- -# 部署:docs/00-05 §2 步骤 3「正式安装类」——全集群 K3s + 节点准备(非 verify.sh 单条 teardown)。 -# 前置:§2 步骤 1 接入(inventory/SSH);步骤 2 可选 scripts/deploy-lab.sh 在 K3S_PREPARE_STORAGE=true 时先跑 k3s-prepare-storage.yml。 -# 入口:仓库根 ./scripts/deploy-lab.sh k3s,或 ansible-playbook -i ansible/inventory.ini ansible/playbooks/k3s-init-and-install.yml - -- name: Verify /storage is a separate mount (optional) - hosts: k3s_nodes - become: true - tasks: - - name: Check / and /storage mount sources - when: k3s_verify_storage_mount | default(false) | bool - block: - - name: Get mount source for / - ansible.builtin.command: findmnt -n -o SOURCE / - register: mnt_root - changed_when: false - - - name: Get mount source for /storage - ansible.builtin.command: findmnt -n -o SOURCE /storage - register: mnt_storage - changed_when: false - failed_when: false - - - name: Assert /storage is mounted on a different device than / - ansible.builtin.assert: - that: - - mnt_storage.rc == 0 - - (mnt_root.stdout | trim | length) > 0 - - (mnt_storage.stdout | trim | length) > 0 - - (mnt_root.stdout | trim) != (mnt_storage.stdout | trim) - fail_msg: >- - /storage must be a mount point on a block device different from /. 
- See docs/00-04-部署环境说明.md and docs/01-06-节点初始化-ansible-实践.md - -- name: Init base system - hosts: k3s_nodes - become: true - tasks: - # 检查当前节点上 firewalld 的运行状态,供后续条件判断使用 - - name: Check if firewalld is running - ansible.builtin.command: firewall-cmd --state - register: firewalld_state - changed_when: false - failed_when: false - - # 根据全局 timezone 变量设置系统时区(可选) - - name: Set timezone - ansible.builtin.command: timedatectl set-timezone {{ timezone }} - when: timezone is defined and timezone != "" - - # 安装 k3s 所需的基础工具包(curl、git 等) - - name: Install basic packages - ansible.builtin.package: - name: - - curl - - git - state: present - - # 确保 /etc/hosts 中包含所有 k3s 节点的主机名解析(可选) - - name: Ensure /etc/hosts has entries for all k3s nodes - ansible.builtin.lineinfile: - path: /etc/hosts - regexp: '^\S+\s+{{ item }}\s*$' - line: "{{ hostvars[item]['ansible_host'] }} {{ item }}" - state: present - loop: "{{ groups['k3s_nodes'] }}" - when: - - k3s_manage_hosts | default(true) | bool - - hostvars[item]['ansible_host'] is defined - - # k3s 所需端口:8472/udp(flannel VXLAN)全部节点;6443/tcp(API)仅 server - # 必须在安装 k3s 前开放,否则 worker 无法连接、flannel 无法建立 overlay - # 在所有 k3s 节点上开放 flannel VXLAN 所需的 8472/udp 端口 - - name: Open flannel VXLAN port (8472/udp) on all k3s nodes - ansible.builtin.command: firewall-cmd --permanent --add-port=8472/udp - when: - - k3s_manage_firewalld | default(true) | bool - - firewalld_state.stdout | default('') == 'running' - - # 在 server 节点上开放 k3s API 端口 6443/tcp - - name: Open k3s API port (6443/tcp) on server - ansible.builtin.command: firewall-cmd --permanent --add-port=6443/tcp - when: - - k3s_manage_firewalld | default(true) | bool - - inventory_hostname in groups['k3s_server'] - - firewalld_state.stdout | default('') == 'running' - - # 在完成端口放行后重新加载 firewalld 规则 - - name: Reload firewalld after opening k3s ports - ansible.builtin.command: firewall-cmd --reload - when: - - k3s_manage_firewalld | default(true) | bool - - firewalld_state.stdout | default('') == 
'running' - -- name: Install k3s server - hosts: k3s_server - become: true - tasks: - # 在 server 节点上下载安装并启动 k3s server 进程 - - name: Download and install k3s server - ansible.builtin.shell: | - curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --data-dir={{ k3s_data_dir }}" sh - - args: - creates: "{{ k3s_data_dir }}/server" - -- name: Install k3s agent (workers) - hosts: k3s_worker - become: true - serial: 1 # 逐台安装,减轻并行下载对网络的压力 - tasks: - # 从首个 server 节点读取集群 token(仅执行一次) - - name: Read k3s token from first server - ansible.builtin.slurp: - src: "{{ k3s_data_dir }}/server/token" - delegate_to: "{{ groups['k3s_server'][0] }}" - run_once: true - register: k3s_token_from_server - - # 在各 worker 节点上保存解码后的 token 供后续安装使用 - - name: Set fact for k3s token on workers - ansible.builtin.set_fact: - k3s_token: "{{ k3s_token_from_server.content | b64decode | trim }}" - - # 在每个 worker 节点上下载安装并启动 k3s agent 进程 - - name: Install k3s agent - ansible.builtin.shell: | - curl -sfL https://get.k3s.io | K3S_URL=https://{{ k3s_server_ip }}:6443 K3S_TOKEN={{ k3s_token }} INSTALL_K3S_EXEC="agent --data-dir={{ k3s_data_dir }}" sh - - args: - creates: "{{ k3s_data_dir }}/agent" - async: 600 - poll: 15 - -- name: Configure firewalld baseline for k3s (flannel.1 / cni0 -> trusted) - hosts: k3s_nodes - become: true - tasks: - # 为 k3s 配置 firewalld 基线:将 flannel.1 / cni0 加入 trusted 区域 - - block: - # 检查节点上 firewalld 是否可用 - - name: Check if firewalld is available - ansible.builtin.command: firewall-cmd --state - register: firewalld_check - changed_when: false - failed_when: false - - # 等待 CNI 接口 flannel.1 和 cni0 出现(k3s 启动并创建完成) - - name: Wait for CNI interfaces (flannel.1, cni0) to appear - ansible.builtin.shell: | - for i in $(seq 1 120); do - ip link show flannel.1 >/dev/null 2>&1 && ip link show cni0 >/dev/null 2>&1 && exit 0 - sleep 1 - done - exit 1 - when: firewalld_check.stdout == 'running' - - # 将 flannel.1 / cni0 接口加入 firewalld trusted 区域(运行时和永久) - - name: Add flannel.1 and cni0 to 
firewalld trusted zone (runtime + permanent) - ansible.builtin.shell: | - firewall-cmd --zone=trusted --add-interface={{ item }} - firewall-cmd --permanent --zone=trusted --add-interface={{ item }} - loop: - - flannel.1 - - cni0 - when: firewalld_check.stdout == 'running' - - # 更新 firewalld 配置使新接口规则立即生效 - - name: Reload firewalld - ansible.builtin.command: firewall-cmd --reload - when: firewalld_check.stdout == 'running' - when: k3s_manage_firewalld | default(true) | bool - -- name: Configure CoreDNS (IPv4 upstream for ACME) - hosts: k3s_server - become: true - run_once: true - vars: - k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - tasks: - - name: Wait for CoreDNS deployment to be ready - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/coredns -n kube-system --timeout=120s - when: k3s_manage_coredns | default(true) | bool - - - name: Extract CoreDNS Corefile from ConfigMap - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl get configmap coredns -n kube-system -o jsonpath='{.data.Corefile}' > /tmp/coredns-corefile.txt - when: k3s_manage_coredns | default(true) | bool - - - name: Patch Corefile forward to IPv4 (avoid IPv6 upstream in Pod network) - ansible.builtin.replace: - path: /tmp/coredns-corefile.txt - regexp: 'forward \. /etc/resolv\.conf' - replace: 'forward . 
{{ coredns_forward_servers }}' - register: coredns_patched - when: k3s_manage_coredns | default(true) | bool - - - name: Apply patched CoreDNS ConfigMap and restart - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl create configmap coredns --from-file=Corefile=/tmp/coredns-corefile.txt -n kube-system --dry-run=client -o yaml | KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f - - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout restart deployment/coredns -n kube-system - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/coredns -n kube-system --timeout=60s - when: - - k3s_manage_coredns | default(true) | bool - - coredns_patched is changed - - - name: Remove temp Corefile - ansible.builtin.file: - path: /tmp/coredns-corefile.txt - state: absent - when: k3s_manage_coredns | default(true) | bool - -- name: 安装后验证 - traefik / nodes / curl - hosts: k3s_server - become: true - run_once: true - vars: - k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - tasks: - # 安装后为控制节点打 control-plane 标签(02-05 矩阵 M1 需此标签才能调度),节点名与 inventory 短主机名一致(ylc61~ylc64) - - name: Label control-plane nodes (k3s 不默认打标,M1 需此标签) - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl label node {{ item }} node-role.kubernetes.io/control-plane= --overwrite - loop: "{{ groups['k3s_server'] | default([]) }}" - - # 可选:为工作节点打 worker 标签(02-05 矩阵 M3 需要) - - name: 可选 - 为工作节点打 worker 标签(02-05 矩阵 M3 需要) - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl label node {{ item }} node-role.kubernetes.io/worker= --overwrite - loop: "{{ groups['k3s_worker'] | default([]) }}" - when: k3s_manage_role_labels | default(true) | bool - - # 查看 kube-system 命名空间中与 Traefik / svclb 相关的 Pod 列表 - - name: kubectl get pods -n kube-system(traefik / svclb) - ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl get pods -n kube-system -o wide | grep -E 'NAME|traefik|svclb' - register: verify_traefik - changed_when: false - - # 打印上一步查询到的 Traefik 相关 Pod 信息 - - name: ">>> 
Traefik 相关 Pods" - ansible.builtin.debug: - msg: "{{ item }}" - loop: "{{ verify_traefik.stdout_lines }}" - - # 查询当前集群中的节点列表 - - name: kubectl get nodes - ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl get nodes - register: verify_nodes - changed_when: false - - # 打印节点列表结果,方便确认节点状态与角色 - - name: ">>> kubectl get nodes" - ansible.builtin.debug: - msg: "{{ item }}" - loop: "{{ verify_nodes.stdout_lines }}" - - # 通过 curl 测试每个节点 80 与 443 入口连通性 - - name: curl 测试各节点 80/443 可达性 - ansible.builtin.shell: | - for ip in {{ groups['k3s_nodes'] | map('extract', hostvars) | map(attribute='ansible_host') | join(' ') }}; do - c80=$(curl -sk -o /dev/null -w "%{http_code}" --connect-timeout 2 http://$ip 2>/dev/null) || c80="fail" - c443=$(curl -sk -o /dev/null -w "%{http_code}" --connect-timeout 2 https://$ip 2>/dev/null) || c443="fail" - echo "$ip: 80=$c80 443=$c443" - done - register: verify_curl - changed_when: false - - - name: ">>> curl 结果" - ansible.builtin.debug: - msg: "{{ item }}" - loop: "{{ verify_curl.stdout_lines }}" diff --git a/ansible/playbooks/k3s-prepare-storage.yml b/ansible/playbooks/k3s-prepare-storage.yml deleted file mode 100644 index 76fed21..0000000 --- a/ansible/playbooks/k3s-prepare-storage.yml +++ /dev/null @@ -1,108 +0,0 @@ ---- -# 部署:docs/00-05 §2 步骤 2~3 可选前置——数据盘 → /storage(非矩阵验收)。 -# 推荐经 scripts/deploy-lab.sh k3s 在 K3S_PREPARE_STORAGE=true 时自动串行;勿与 verify.sh run-all 混为同一含义。 -# 可选:在空白数据盘上创建单分区、ext4、fstab 并挂载到 k3s_data_dir(默认 /storage)。 -# 启用前在 group_vars/all.yml 设置 k3s_prepare_storage: true 与 k3s_data_disk_device(如 /dev/vdb)。 -# 会清空该磁盘上的数据。若 /storage 已是挂载点则跳过。 - -- name: Prepare data disk and mount to k3s_data_dir - hosts: k3s_nodes - become: true - tasks: - - name: Skip notice when storage prep disabled - ansible.builtin.debug: - msg: "k3s_prepare_storage is false — skipping (see group_vars/all.yml)" - when: not (k3s_prepare_storage | default(false) | bool) - - - name: Prepare block storage for k3s_data_dir - when: k3s_prepare_storage | 
default(false) | bool - block: - - name: Require k3s_data_disk_device when k3s_prepare_storage is true - ansible.builtin.assert: - that: - - k3s_data_disk_device is defined - - (k3s_data_disk_device | string | length) > 0 - fail_msg: "Set k3s_data_disk_device (e.g. /dev/vdb) in group_vars or host_vars" - - - name: Verify k3s_data_disk_device is a block device - ansible.builtin.command: test -b {{ k3s_data_disk_device }} - changed_when: false - - - name: Check whether k3s_data_dir is already a mountpoint - ansible.builtin.command: mountpoint -q {{ k3s_data_dir }} - register: mp_k3s - changed_when: false - failed_when: false - - - name: Skip when k3s_data_dir already mounted - ansible.builtin.debug: - msg: "{{ k3s_data_dir }} already mounted — skipping partitioning on {{ inventory_hostname }}" - when: mp_k3s.rc == 0 - - - name: Install partitioning and filesystem tools - ansible.builtin.package: - name: - - parted - - e2fsprogs - state: present - when: mp_k3s.rc != 0 - - - name: Compute first partition path (nvme*n* -> p1, else 1) - ansible.builtin.set_fact: - k3s_data_partition: >- - {{ k3s_data_disk_device }}{{ 'p1' if (k3s_data_disk_device | regex_search('nvme[0-9]+n[0-9]+$')) else '1' }} - when: mp_k3s.rc != 0 - - - name: Create GPT and single ext4 partition - ansible.builtin.command: >- - parted -s {{ k3s_data_disk_device }} mklabel gpt mkpart primary ext4 0% 100% - args: - creates: "{{ k3s_data_partition }}" - when: mp_k3s.rc != 0 - - - name: Wait for partition node in /dev - ansible.builtin.wait_for: - path: "{{ k3s_data_partition }}" - state: present - timeout: 60 - when: mp_k3s.rc != 0 - - - name: Detect existing filesystem on partition - ansible.builtin.command: blkid -s TYPE -o value {{ k3s_data_partition }} - register: fs_type - changed_when: false - failed_when: false - when: mp_k3s.rc != 0 - - - name: Create ext4 on partition - ansible.builtin.command: mkfs.ext4 -F {{ k3s_data_partition }} - when: - - mp_k3s.rc != 0 - - (fs_type.stdout | default('') | 
trim | length) == 0 - - - name: Read UUID of partition - ansible.builtin.command: blkid -s UUID -o value {{ k3s_data_partition }} - register: blk_uuid - changed_when: false - when: mp_k3s.rc != 0 - - - name: Ensure mount directory exists - ansible.builtin.file: - path: "{{ k3s_data_dir }}" - state: directory - mode: "0755" - when: mp_k3s.rc != 0 - - - name: Add fstab entry for k3s_data_dir - ansible.builtin.lineinfile: - path: /etc/fstab - regexp: "^UUID={{ blk_uuid.stdout | trim }}\\s" - line: "UUID={{ blk_uuid.stdout | trim }} {{ k3s_data_dir }} ext4 defaults,nofail 0 2" - create: true - mode: "0644" - when: mp_k3s.rc != 0 - - - name: Mount all from fstab - ansible.builtin.command: mount -a - changed_when: true - when: mp_k3s.rc != 0 diff --git a/ansible/playbooks/longhorn-install.yml b/ansible/playbooks/longhorn-install.yml deleted file mode 100644 index a62524e..0000000 --- a/ansible/playbooks/longhorn-install.yml +++ /dev/null @@ -1,252 +0,0 @@ ---- -# 部署:docs/00-05 §2 步骤 3——Helm 铺栈;验收见 scripts/verify.sh run 03-07。 -# Helm 安装 Longhorn(与 docs/03-07 一致)。在控制节点执行,依赖 KUBECONFIG=/etc/rancher/k3s/k3s.yaml -# 变量:group_vars/all.yml 中 longhorn_chart_version、longhorn_install_node_packages、longhorn_apply_local_path_lab - -- name: Longhorn node packages (iSCSI, NFS client) - hosts: k3s_nodes - become: true - tasks: - - name: Install Longhorn OS dependencies - when: longhorn_install_node_packages | default(true) | bool - block: - - name: Install iscsi + nfs (dnf/yum) - ansible.builtin.package: - name: - - iscsi-initiator-utils - - nfs-utils - state: present - - - name: Enable iscsid - ansible.builtin.systemd: - name: iscsid - enabled: true - state: started - - - name: Ensure Longhorn data subdirectory exists on all nodes - ansible.builtin.file: - path: "{{ k3s_data_dir }}/longhorn" - state: directory - mode: "0700" - - - name: Pre-pull Longhorn images on all nodes (optional, avoid DockerHub EOF/ImagePullBackOff) - when: longhorn_prepull_images | default(true) | bool - 
ansible.builtin.shell: | - set -e - CTR="ctr --address /run/k3s/containerd/containerd.sock -n k8s.io" - - imgs=( - "docker.io/longhornio/longhorn-manager:v{{ longhorn_chart_version }}" - "docker.io/longhornio/longhorn-ui:v{{ longhorn_chart_version }}" - "docker.io/longhornio/longhorn-share-manager:v{{ longhorn_chart_version }}" - "docker.io/longhornio/longhorn-engine:v{{ longhorn_chart_version }}" - "docker.io/longhornio/longhorn-instance-manager:v{{ longhorn_chart_version }}" - "docker.io/longhornio/backing-image-manager:v{{ longhorn_chart_version }}" - "docker.io/longhornio/support-bundle-kit:v0.0.45" - ) - - for img in "${imgs[@]}"; do - ok=0 - for i in 1 2 3 4 5; do - echo "[pull] $img (try $i/5)" - if $CTR images pull "$img"; then - ok=1 - break - fi - sleep $((i * 3)) - done - if [ "$ok" -ne 1 ]; then - echo "[ERR] failed pulling $img after retries" - exit 1 - fi - done - args: - executable: /bin/bash - changed_when: true - -- name: Install Longhorn with Helm on first server - hosts: k3s_server - become: true - run_once: true - vars: - longhorn_values_src: "{{ playbook_dir }}/../files/03-07-longhorn/values-lab.yaml" - longhorn_values_dest: /root/longhorn-values-lab.yaml - k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - tasks: - - name: Install helm package (Fedora/RHEL family) - ansible.builtin.package: - name: helm - state: present - ignore_errors: true - register: helm_pkg - - - name: Hint if helm package install failed (install Helm 3 manually if needed) - ansible.builtin.debug: - msg: "dnf/yum 未装上 helm 时,请见 https://helm.sh/docs/intro/install/" - when: helm_pkg.failed | default(false) - - - name: Fail if helm binary still unavailable - ansible.builtin.command: which helm - register: helm_which - changed_when: false - failed_when: helm_which.rc != 0 - - - name: Copy lab values to server - ansible.builtin.copy: - src: "{{ longhorn_values_src }}" - dest: "{{ longhorn_values_dest }}" - mode: "0600" - - - name: Ensure longhorn-system namespace is not stuck 
Terminating (force finalize if needed) - ansible.builtin.shell: | - set -e - export KUBECONFIG={{ k3s_kubeconfig }} - ns="longhorn-system" - phase="$(kubectl get ns "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)" - if [ "$phase" = "Terminating" ]; then - echo "[WARN] namespace $ns is Terminating; force finalize to unblock install" - kubectl get ns "$ns" -o json > /tmp/ns.json - python3 -c "import json; obj=json.load(open('/tmp/ns.json')); obj.setdefault('spec',{}); obj['spec']['finalizers']=[]; json.dump(obj, open('/tmp/ns-finalize.json','w'))" - kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f /tmp/ns-finalize.json >/dev/null - fi - args: - executable: /bin/bash - changed_when: true - failed_when: false - - - name: Ensure longhorn Helm repo - ansible.builtin.shell: | - set -e - if ! helm repo list 2>/dev/null | grep -q '^longhorn'; then - helm repo add longhorn https://charts.longhorn.io - fi - helm repo update - environment: - KUBECONFIG: "{{ k3s_kubeconfig }}" - args: - executable: /bin/bash - changed_when: true - - - name: Delete leftover longhorn PriorityClass (cluster-scoped) to avoid Helm ownership conflicts - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete priorityclass longhorn-critical --ignore-not-found=true - args: - executable: /bin/bash - changed_when: true - failed_when: false - - - name: Delete leftover Longhorn CRDs (cluster-scoped) to avoid Helm ownership conflicts - ansible.builtin.shell: | - set -e - export KUBECONFIG={{ k3s_kubeconfig }} - crd_list="$(kubectl get crd -o name 2>/dev/null | grep 'longhorn.io' || true)" - if [ -n "$crd_list" ]; then - echo "$crd_list" | while read -r crd; do - [ -z "$crd" ] && continue - timeout 20s kubectl delete "$crd" --ignore-not-found=true || true - done - fi - args: - executable: /bin/bash - changed_when: true - failed_when: false - - - name: Delete leftover Longhorn ClusterRole/ClusterRoleBinding (cluster-scoped) - ansible.builtin.shell: | - set -e - 
export KUBECONFIG={{ k3s_kubeconfig }} - - role_list="$(kubectl get clusterrole -o name 2>/dev/null | grep 'longhorn' || true)" - if [ -n "$role_list" ]; then - echo "$role_list" | while read -r role; do - [ -z "$role" ] && continue - timeout 20s kubectl delete "$role" --ignore-not-found=true || true - done - fi - - binding_list="$(kubectl get clusterrolebinding -o name 2>/dev/null | grep 'longhorn' || true)" - if [ -n "$binding_list" ]; then - echo "$binding_list" | while read -r binding; do - [ -z "$binding" ] && continue - timeout 20s kubectl delete "$binding" --ignore-not-found=true || true - done - fi - args: - executable: /bin/bash - changed_when: true - failed_when: false - - - name: Cleanup leftover Helm release records for Longhorn (default + longhorn-system) - ansible.builtin.shell: | - set -e - export KUBECONFIG={{ k3s_kubeconfig }} - - # 有些失败/中断的安装会把 release secret 留在 default 或 longhorn-system,导致后续: - # - "cannot re-use a name that is still in use" - # - cluster-scoped 资源的 meta.helm.sh/release-namespace 注解冲突 - for ns in longhorn-system default; do - if helm -n "$ns" list --all 2>/dev/null | grep -q '^longhorn'; then - # uninstall 可能卡住(例如 uninstall job / hook),避免阻塞整个自动化流程 - timeout 120s helm -n "$ns" uninstall longhorn --no-hooks || true - fi - - sec_list="$(kubectl -n "$ns" get secret -o name 2>/dev/null | grep '^secret/sh\\.helm\\.release\\.v1\\.longhorn\\.' 
|| true)" - if [ -n "$sec_list" ]; then - echo "$sec_list" | xargs -n1 kubectl -n "$ns" delete --ignore-not-found=true - fi - done - environment: - KUBECONFIG: "{{ k3s_kubeconfig }}" - args: - executable: /bin/bash - changed_when: true - failed_when: false - - - name: Helm upgrade/install Longhorn(失败兜底:install --replace) - ansible.builtin.shell: | - set -e - helm upgrade --install longhorn longhorn/longhorn --namespace longhorn-system --create-namespace -f {{ longhorn_values_dest }} --version {{ longhorn_chart_version }} --wait --timeout 15m || helm install --replace longhorn longhorn/longhorn --namespace longhorn-system --create-namespace -f {{ longhorn_values_dest }} --version {{ longhorn_chart_version }} --wait --timeout 15m - environment: - KUBECONFIG: "{{ k3s_kubeconfig }}" - args: - executable: /bin/bash - register: helm_longhorn - changed_when: true - -- name: Apply local-path-config lab defaults (optional) - hosts: k3s_server - become: true - run_once: true - vars: - k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - local_path_json_src: "{{ playbook_dir }}/../files/03-05-local-path-config/local-path-config-lab.json" - local_path_json_dest: /root/local-path-config-lab.json - tasks: - - name: Apply local-path-config lab defaults (optional) - when: longhorn_apply_local_path_lab | default(false) | bool - block: - - name: Copy local-path lab json - ansible.builtin.copy: - src: "{{ local_path_json_src }}" - dest: "{{ local_path_json_dest }}" - mode: "0644" - - - name: Apply local-path-config ConfigMap - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system create configmap local-path-config \ - --from-file=config.json={{ local_path_json_dest }} \ - --dry-run=client -o yaml | KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f - - args: - executable: /bin/bash - changed_when: true - - - name: Restart local-path-provisioner if present - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system rollout restart 
deploy/local-path-provisioner - args: - executable: /bin/bash - register: lp_restart - failed_when: false - changed_when: lp_restart.rc == 0 diff --git a/ansible/playbooks/nginx-matrix-deploy.yml b/ansible/playbooks/nginx-matrix-deploy.yml deleted file mode 100644 index 9b1c2ba..0000000 --- a/ansible/playbooks/nginx-matrix-deploy.yml +++ /dev/null @@ -1,168 +0,0 @@ ---- -# 部署:docs/00-05 §2 步骤 3——铺栈(无按 doc_id 的断言/teardown)。 -# 矩阵级验收请用 scripts/verify.sh run 02-01…02-05 或 run-all。 -# Ansible 一键部署 nginx 矩阵(M1~M4) -# 对应文档:docs/02-05-nginx-验证矩阵-一键部署.md(02-01~02-04 分篇已整合) -# -# 说明:复制 manifests → kubectl apply → 等待 Pod 就绪 → 验证 Pod 节点分布 → curl 16 目标 -# manifests:ansible/files/02-05-nginx-matrix/,M1 control-plane / M2 ylc61 / M3 worker / M4 ylc64,按实际修改 02/04 hostname -# -# 执行(在 ansible/ 目录下): -# ansible-playbook -i inventory.ini playbooks/nginx-matrix-deploy.yml -# 或在仓库根目录: -# ansible-playbook -i ansible/inventory.ini ansible/playbooks/nginx-matrix-deploy.yml -- name: Deploy nginx matrix (M1~M4) - hosts: k3s_server - become: true - run_once: true - vars: - k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - # manifests 在 ansible/files/02-05-nginx-matrix/,与 playbook 同项目 - manifests_path: "{{ playbook_dir }}/../files/02-05-nginx-matrix" - tasks: - - name: Ensure manifests path exists - ansible.builtin.stat: - path: "{{ manifests_path }}" - register: manifests_stat - - - name: Fail if manifests not found - ansible.builtin.fail: - msg: "manifests 未找到: {{ manifests_path }},请从仓库根目录或 ansible 同级执行" - when: not manifests_stat.stat.exists - - # 部署前确保 control-plane/worker 标签存在(M1/M3 需此才能调度),节点名为短主机名(ylc61~ylc64) - - name: Ensure control-plane label on k3s_server nodes (for M1) - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl label node {{ item }} node-role.kubernetes.io/control-plane= --overwrite - loop: "{{ groups['k3s_server'] | default([]) }}" - - - name: Ensure worker label on k3s_worker nodes (for M3) - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl 
label node {{ item }} node-role.kubernetes.io/worker= --overwrite - loop: "{{ groups['k3s_worker'] | default([]) }}" - - - name: Copy nginx matrix manifests to server - ansible.builtin.copy: - src: "{{ manifests_path }}/" - dest: /tmp/nginx-matrix/ - mode: '0644' - - # 先删全部 nginx 矩阵 Deployment 再 apply,避免旧 ReplicaSet 导致任一 Mx 仍显示默认页 - - name: Delete all nginx matrix deployments before apply - ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl delete deployment nginx-m1 nginx-m2 nginx-m3 nginx-m4 -n default --ignore-not-found=true - register: del_nginx - changed_when: "'deleted' in del_nginx.stdout" - - - name: kubectl apply nginx matrix - ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f /tmp/nginx-matrix/ -R - register: k8s_apply - changed_when: "'configured' in k8s_apply.stdout or 'created' in k8s_apply.stdout" - - - name: Restart nginx deployments so pods pick up ConfigMap (M1~M4 标识) - ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout restart deployment nginx-m1 nginx-m2 nginx-m3 nginx-m4 -n default - register: restart_out - changed_when: true - - - name: Wait for nginx pods to be ready - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ - -l app=nginx-m1 --timeout=60s - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ - -l app=nginx-m2 --timeout=60s - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ - -l app=nginx-m3 --timeout=120s - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ - -l app=nginx-m4 --timeout=120s - register: wait_result - changed_when: false - - - name: Verify nginx matrix - ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl get pod,svc,ing,ingressroute -n default -o wide - register: verify - changed_when: false - - - name: ">>> nginx matrix 资源" - ansible.builtin.debug: - msg: "{{ item }}" - loop: "{{ verify.stdout_lines }}" - - - name: 验证 Pod 节点分布(M1/M2 
应在控制节点,M3/M4 应在工作节点) - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl get pod -n default -o custom-columns='NAME:.metadata.name,APP:.metadata.labels.app,NODE:.spec.nodeName' | grep -E '^(NAME|nginx-m)' - register: pod_placement - changed_when: false - - - name: ">>> Pod 节点分布" - ansible.builtin.debug: - msg: "{{ item }}" - loop: "{{ pod_placement.stdout_lines }}" - - - name: M1 容器内诊断(排查为何仍为 nginx 欢迎页) - ansible.builtin.shell: | - echo "========== 1. M1 容器内 /usr/share/nginx/html/ 目录 ==========" - KUBECONFIG={{ k3s_kubeconfig }} kubectl exec -n default deployment/nginx-m1 -- ls -la /usr/share/nginx/html/ 2>/dev/null || echo "(exec 失败)" - echo "" - echo "========== 2. M1 容器内 index.html 内容(前 5 行)==========" - KUBECONFIG={{ k3s_kubeconfig }} kubectl exec -n default deployment/nginx-m1 -- cat /usr/share/nginx/html/index.html 2>/dev/null | head -5 || echo "(exec 失败)" - echo "" - echo "========== 3. M1 容器内 /etc/nginx/conf.d/ 目录 ==========" - KUBECONFIG={{ k3s_kubeconfig }} kubectl exec -n default deployment/nginx-m1 -- ls -la /etc/nginx/conf.d/ 2>/dev/null || echo "(exec 失败)" - echo "" - echo "========== 4. M1 容器内 default.conf 内容 ==========" - KUBECONFIG={{ k3s_kubeconfig }} kubectl exec -n default deployment/nginx-m1 -- cat /etc/nginx/conf.d/default.conf 2>/dev/null || echo "(exec 失败)" - echo "" - echo "========== 5. 
M1 容器内 nginx 生效配置中的 server 块(前 40 行)==========" - KUBECONFIG={{ k3s_kubeconfig }} kubectl exec -n default deployment/nginx-m1 -- nginx -T 2>/dev/null | grep -A 200 "server {" | head -40 || echo "(exec 失败)" - register: m1_diag - changed_when: false - failed_when: false - - - name: ">>> M1 容器内诊断结果(若 M1 仍为欢迎页,请根据此处输出排查)" - ansible.builtin.debug: - msg: "{{ item }}" - loop: "{{ m1_diag.stdout_lines }}" - - - name: 验证 M1~M4 标识(Pod 内 index.html 含 Mx、响应头 X-Backend) - ansible.builtin.shell: | - base="{{ groups['k3s_nodes'] | map('extract', hostvars) | map(attribute='ansible_host') | first }}" - for id in 1 2 3 4; do - echo "=== M$id Pod 内 index.html 前 2 行 ===" - KUBECONFIG={{ k3s_kubeconfig }} kubectl exec -n default deployment/nginx-m$id -- cat /usr/share/nginx/html/index.html 2>/dev/null | head -2 || echo "(exec 失败)" - echo "=== M$id 响应头 X-Backend ===" - curl -sI "http://$base/demo-m$id/" 2>/dev/null | grep -i x-backend || echo "(未看到 X-Backend)" - echo "" - done - register: m_check - changed_when: false - failed_when: false - - - name: ">>> M1~M4 验证" - ansible.builtin.debug: - msg: "{{ item }}" - loop: "{{ m_check.stdout_lines }}" - - - name: curl 验证(16 个目标:4 节点 × 4 路径) - ansible.builtin.shell: | - bases="{{ groups['k3s_nodes'] | map('extract', hostvars) | map(attribute='ansible_host') | join(' ') }}" - paths="/demo-m1 /demo-m2 /demo-m3 /demo-m4" - count=0 - ok=0 - echo "=== 16 个目标 (4 节点 × 4 路径) ===" - echo "节点 M1(控制+Ingress) M2(控制+IR) M3(工作+Ingress) M4(工作+IR)" - for base in $bases; do - m1=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 2 http://$base/demo-m1 2>/dev/null) || m1="fail" - m2=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 2 http://$base/demo-m2 2>/dev/null) || m2="fail" - m3=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 2 http://$base/demo-m3 2>/dev/null) || m3="fail" - m4=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 2 http://$base/demo-m4 2>/dev/null) || m4="fail" - printf "%-12s %-16s %-11s %-16s %s\n" 
"$base" "$m1" "$m2" "$m3" "$m4" - for c in $m1 $m2 $m3 $m4; do count=$((count+1)); [ "$c" = "200" ] && ok=$((ok+1)); done - done - echo "---" - echo "共验证 $count 个目标,$ok 个返回 200" - register: curl_result - changed_when: false - - - name: ">>> curl 矩阵" - ansible.builtin.debug: - msg: "{{ item }}" - loop: "{{ curl_result.stdout_lines }}" diff --git a/ansible/playbooks/nginx-matrix-tls-deploy.yml b/ansible/playbooks/nginx-matrix-tls-deploy.yml deleted file mode 100644 index b073063..0000000 --- a/ansible/playbooks/nginx-matrix-tls-deploy.yml +++ /dev/null @@ -1,189 +0,0 @@ ---- -# 部署:docs/00-05 §2 步骤 3——TLS 铺栈;验收见 scripts/verify.sh run 03-02 等。 -# Ansible 一键部署 nginx 矩阵 TLS 版(M1~M4,HTTPS) -# 对应文档:docs/03-02-k3s-traefik-acme.md -# -# 说明:复制 TLS + HTTP-only manifests → 自动删除已存在的不含 TLS 的 nginx 矩阵(02-05)→ kubectl apply(含 TLS 与 HTTP-only 共 8 个路由)→ 等待 Pod 就绪 → HTTP-only / HTTPS curl 矩阵验证(test01~test04.jackadam.top) -# manifests:ansible/files/03-02-nginx-matrix-tls/,域名为 test01~test04.jackadam.top,M2/M4 hostname 按实际修改;Ingress/IngressRoute 中 TLS 路由仅绑定 websecure,HTTP-only 路由仅绑定 web -# 前置:已按 03-02 配置 ACME(Secret + traefik-acme.yaml),且 test01~test04.jackadam.top 已解析到入口 IP -# -# 执行(在 ansible/ 目录下): -# ansible-playbook -i inventory.ini playbooks/nginx-matrix-tls-deploy.yml -# 或在仓库根目录: -# ansible-playbook -i ansible/inventory.ini ansible/playbooks/nginx-matrix-tls-deploy.yml -# 验证时对所有 k3s_nodes 做 HTTPS 请求(所有节点均为入口点,与 02-05 HTTP 矩阵一致) -- name: Deploy or cleanup nginx matrix TLS (M1~M4, HTTPS) - hosts: k3s_server - become: true - run_once: true - vars: - # mode 由 -e mode=cleanup 传入,未传时默认为 deploy(勿在 vars 中写 mode: "{{ mode | default('deploy') }}" 会递归) - k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - manifests_path: "{{ playbook_dir }}/../files/03-02-nginx-matrix-tls" - tls_domains: - - test01.jackadam.top - - test02.jackadam.top - - test03.jackadam.top - - test04.jackadam.top - tasks: - - name: Deploy nginx matrix TLS (mode=deploy) - when: (mode | default('deploy')) == 'deploy' - block: - - 
name: Ensure manifests path exists - ansible.builtin.stat: - path: "{{ manifests_path }}" - register: manifests_stat - - - name: Fail if manifests not found - ansible.builtin.fail: - msg: "manifests 未找到: {{ manifests_path }},请从仓库根目录或 ansible 同级执行" - when: not manifests_stat.stat.exists - - # 部署前确保 control-plane/worker 标签存在(M1/M3 需此才能调度),节点名为短主机名(ylc61~ylc64) - - name: Ensure control-plane label on k3s_server nodes (for M1) - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl label node {{ item }} node-role.kubernetes.io/control-plane= --overwrite - loop: "{{ groups['k3s_server'] | default([]) }}" - - - name: Ensure worker label on k3s_worker nodes (for M3) - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl label node {{ item }} node-role.kubernetes.io/worker= --overwrite - loop: "{{ groups['k3s_worker'] | default([]) }}" - - - name: Copy nginx matrix TLS manifests to server - ansible.builtin.copy: - src: "{{ manifests_path }}/" - dest: /tmp/nginx-matrix-tls/ - mode: '0644' - - # 若存在不含 TLS 的 nginx 矩阵(02-05),先删掉,避免与 TLS 版 Ingress 冲突或残留 - - name: Delete non-TLS nginx matrix if present (deployments, ingress, ingressroute, middleware, configmaps) - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete deployment,svc -n default nginx-m1 nginx-m2 nginx-m3 nginx-m4 --ignore-not-found=true - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete ingress -n default nginx-m1 nginx-m3 --ignore-not-found=true - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete ingressroute -n default nginx-m2 nginx-m4 --ignore-not-found=true - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete middleware -n default stripprefix-m1 stripprefix-m2 stripprefix-m3 stripprefix-m4 --ignore-not-found=true - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete configmap -n default nginx-m1-html nginx-m2-html nginx-m3-html nginx-m4-html --ignore-not-found=true - register: del_non_tls - changed_when: "'deleted' in del_non_tls.stdout" - - - name: kubectl apply nginx matrix 
TLS + HTTP-only - ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f /tmp/nginx-matrix-tls/ -R - register: k8s_apply - changed_when: "'configured' in k8s_apply.stdout or 'created' in k8s_apply.stdout" - - - name: Restart nginx deployments so pods pick up ConfigMap (M1~M4 标识) - ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout restart deployment nginx-m1 nginx-m2 nginx-m3 nginx-m4 -n default - register: restart_out - changed_when: true - - - name: Wait for nginx pods to be ready - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ - -l app=nginx-m1 --timeout=60s - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ - -l app=nginx-m2 --timeout=60s - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ - -l app=nginx-m3 --timeout=120s - KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ - -l app=nginx-m4 --timeout=120s - register: wait_result - changed_when: false - - - name: Verify nginx matrix TLS resources - ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl get pod,svc,ing,ingressroute -n default -o wide - register: verify - changed_when: false - - - name: ">>> nginx matrix TLS 资源" - ansible.builtin.debug: - msg: "{{ item }}" - loop: "{{ verify.stdout_lines }}" - - - name: 验证 M1~M4 标识(Pod 内 index.html 含 Mx、响应头 X-Backend,取首个入口节点) - ansible.builtin.shell: | - first_ip="{{ groups['k3s_nodes'] | map('extract', hostvars) | map(attribute='ansible_host') | first }}" - for id in 1 2 3 4; do - echo "=== M$id Pod 内 index.html 前 2 行 ===" - KUBECONFIG={{ k3s_kubeconfig }} kubectl exec -n default deployment/nginx-m$id -- cat /usr/share/nginx/html/index.html 2>/dev/null | head -2 || echo "(exec 失败)" - echo "=== M$id 响应头 X-Backend (入口 $first_ip) ===" - curl -sI "https://test0$id.jackadam.top/" --resolve "test0$id.jackadam.top:443:$first_ip" -k 2>/dev/null | grep -i x-backend || echo "(未看到 X-Backend)" - echo "" - 
done - register: m_check - changed_when: false - failed_when: false - - - name: ">>> M1~M4 验证" - ansible.builtin.debug: - msg: "{{ item }}" - loop: "{{ m_check.stdout_lines }}" - - - name: HTTP curl 验证(HTTP-only:16 个目标,所有节点 × 4 域名) - ansible.builtin.shell: | - bases="{{ groups['k3s_nodes'] | map('extract', hostvars) | map(attribute='ansible_host') | join(' ') }}" - count=0 - ok=0 - echo "=== 16 个目标 (4 节点 × 4 域名) HTTP ===" - echo "节点 M1(test01) M2(test02) M3(test03) M4(test04)" - for base in $bases; do - m1=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://test01.jackadam.top/ --resolve "test01.jackadam.top:80:$base" 2>/dev/null) || m1="fail" - m2=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://test02.jackadam.top/ --resolve "test02.jackadam.top:80:$base" 2>/dev/null) || m2="fail" - m3=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://test03.jackadam.top/ --resolve "test03.jackadam.top:80:$base" 2>/dev/null) || m3="fail" - m4=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://test04.jackadam.top/ --resolve "test04.jackadam.top:80:$base" 2>/dev/null) || m4="fail" - printf "%-12s %-14s %-14s %-14s %s\n" "$base" "$m1" "$m2" "$m3" "$m4" - for c in $m1 $m2 $m3 $m4; do count=$((count+1)); [ "$c" = "200" ] && ok=$((ok+1)); done - done - echo "---" - echo "共验证 $count 个目标,$ok 个返回 200" - register: curl_http_result - changed_when: false - failed_when: false - - - name: ">>> HTTP curl 矩阵(HTTP-only)" - ansible.builtin.debug: - msg: "{{ item }}" - loop: "{{ curl_http_result.stdout_lines }}" - - - name: HTTPS curl 验证(16 个目标:所有节点 × 4 域名,所有节点均为入口点) - ansible.builtin.shell: | - bases="{{ groups['k3s_nodes'] | map('extract', hostvars) | map(attribute='ansible_host') | join(' ') }}" - count=0 - ok=0 - echo "=== 16 个目标 (4 节点 × 4 域名) HTTPS ===" - echo "节点 M1(test01) M2(test02) M3(test03) M4(test04)" - for base in $bases; do - m1=$(curl -sk -o /dev/null -w "%{http_code}" --connect-timeout 5 
https://test01.jackadam.top/ --resolve "test01.jackadam.top:443:$base" 2>/dev/null) || m1="fail" - m2=$(curl -sk -o /dev/null -w "%{http_code}" --connect-timeout 5 https://test02.jackadam.top/ --resolve "test02.jackadam.top:443:$base" 2>/dev/null) || m2="fail" - m3=$(curl -sk -o /dev/null -w "%{http_code}" --connect-timeout 5 https://test03.jackadam.top/ --resolve "test03.jackadam.top:443:$base" 2>/dev/null) || m3="fail" - m4=$(curl -sk -o /dev/null -w "%{http_code}" --connect-timeout 5 https://test04.jackadam.top/ --resolve "test04.jackadam.top:443:$base" 2>/dev/null) || m4="fail" - printf "%-12s %-14s %-14s %-14s %s\n" "$base" "$m1" "$m2" "$m3" "$m4" - for c in $m1 $m2 $m3 $m4; do count=$((count+1)); [ "$c" = "200" ] && ok=$((ok+1)); done - done - echo "---" - echo "共验证 $count 个目标,$ok 个返回 200" - register: curl_result - changed_when: false - failed_when: false - - - name: ">>> HTTPS curl 矩阵" - ansible.builtin.debug: - msg: "{{ item }}" - loop: "{{ curl_result.stdout_lines }}" - - - name: Cleanup nginx matrix TLS (mode=cleanup) - when: (mode | default('deploy')) == 'cleanup' - block: - - name: Delete nginx matrix TLS + HTTP-only resources (deployments, ingress, ingressroute, configmaps) - ansible.builtin.shell: | - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete deployment,svc -n default nginx-m1 nginx-m2 nginx-m3 nginx-m4 --ignore-not-found=true - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete ingress -n default nginx-m1 nginx-m3 nginx-m1-http nginx-m3-http --ignore-not-found=true - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete ingressroute -n default nginx-m2 nginx-m4 nginx-m2-http nginx-m4-http --ignore-not-found=true - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete configmap -n default nginx-m1-html nginx-m2-html nginx-m3-html nginx-m4-html --ignore-not-found=true - register: del_tls - changed_when: "'deleted' in del_tls.stdout" - - - name: Remove copied nginx matrix TLS manifests directory - ansible.builtin.file: - path: /tmp/nginx-matrix-tls - state: absent 
diff --git a/ansible/playbooks/nodejs-demo-apply.yml b/ansible/playbooks/nodejs-demo-apply.yml deleted file mode 100644 index 3269162..0000000 --- a/ansible/playbooks/nodejs-demo-apply.yml +++ /dev/null @@ -1,48 +0,0 @@ ---- -# 部署:docs/00-05 §2 步骤 3——应用单文件 demo;整链验收优先 scripts/verify.sh run 04-01。 -# 一键应用 Node.js demo 清单(与 docs/04-01~04-13 + ansible/files/04-01-nodejs-demo 对齐) -# -# 执行(在仓库根目录): -# ansible-playbook -i ansible/inventory.ini ansible/playbooks/nodejs-demo-apply.yml \ -# -e nodejs_demo_manifest=04-01-nodejs-demo.yaml -# -# 默认清单:04-01-nodejs-demo.yaml -- name: Apply nodejs-demo Kubernetes manifests - hosts: k3s_server - become: true - run_once: true - vars: - k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - nodejs_demo_manifest: "04-01-nodejs-demo.yaml" - manifests_dir: "{{ playbook_dir }}/../files/04-01-nodejs-demo" - tasks: - - name: Ensure manifest file exists - ansible.builtin.stat: - path: "{{ manifests_dir }}/{{ nodejs_demo_manifest }}" - register: nodejs_manifest_stat - delegate_to: localhost - become: false - - - name: Fail if manifest not found - ansible.builtin.fail: - msg: "未找到 {{ manifests_dir }}/{{ nodejs_demo_manifest }},请从仓库根检查文件名" - when: not nodejs_manifest_stat.stat.exists - delegate_to: localhost - become: false - - - name: Copy manifest to control plane - ansible.builtin.copy: - src: "{{ manifests_dir }}/{{ nodejs_demo_manifest }}" - dest: "/tmp/{{ nodejs_demo_manifest }}" - mode: "0644" - - - name: kubectl apply nodejs-demo manifest - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f /tmp/{{ nodejs_demo_manifest }} - register: nodejs_apply - changed_when: "'configured' in nodejs_apply.stdout or 'created' in nodejs_apply.stdout" - - - name: Show kubectl apply output - ansible.builtin.debug: - var: nodejs_apply.stdout_lines diff --git a/ansible/playbooks/verify/00-01.yml b/ansible/playbooks/verify/00-01.yml deleted file mode 100644 index 15d87ff..0000000 --- a/ansible/playbooks/verify/00-01.yml +++ 
/dev/null @@ -1,10 +0,0 @@ -- name: "00-01 noop verify" - hosts: localhost - gather_facts: false - vars: - repo_root: "{{ playbook_dir }}/../../.." - doc_id: "00-01" - doc_filename: "00-01-k3s-基础概念.md" - tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - diff --git a/ansible/playbooks/verify/00-04.yml b/ansible/playbooks/verify/00-04.yml deleted file mode 100644 index 2644d28..0000000 --- a/ansible/playbooks/verify/00-04.yml +++ /dev/null @@ -1,10 +0,0 @@ -- name: "00-04 noop verify" - hosts: localhost - gather_facts: false - vars: - repo_root: "{{ playbook_dir }}/../../.." - doc_id: "00-04" - doc_filename: "00-04-部署环境说明.md" - tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - diff --git a/ansible/playbooks/verify/01-03.yml b/ansible/playbooks/verify/01-03.yml index 92d39a4..b996325 100644 --- a/ansible/playbooks/verify/01-03.yml +++ b/ansible/playbooks/verify/01-03.yml @@ -1,10 +1,128 @@ -- name: "01-03 noop verify" +# SKIP_ARMV7=1(默认):仅 noop(文档 + ansible/files)。 +# SKIP_ARMV7=0 且设置 ARMV7_SSH:经 SSH 在 armv7/arm32 主机上 dnf 安装 docker 并校验(Fedora/RHEL 系,见 docs/01-03)。 +- name: 01-03 armv7 Docker(矩阵 + 可选远程安装) hosts: localhost gather_facts: false vars: repo_root: "{{ playbook_dir }}/../../.." 
doc_id: "01-03" doc_filename: "01-03-armv7-standalone-docker.md" + skip_armv7: "{{ lookup('env', 'SKIP_ARMV7') | default('1', true) | trim }}" + armv7_ssh: "{{ lookup('env', 'ARMV7_SSH') | default('', true) | trim }}" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Baseline docs/files checks + block: + - name: Assert docs file exists + ansible.builtin.stat: + path: "{{ repo_root }}/docs/{{ doc_filename }}" + register: _doc_stat + - name: Fail when docs file missing + ansible.builtin.assert: + that: + - _doc_stat.stat.exists + fail_msg: "docs file missing: docs/{{ doc_filename }}" + + - name: Find matching ansible/files doc_id directory + ansible.builtin.find: + paths: "{{ repo_root }}/ansible/files" + file_type: directory + patterns: "{{ doc_id }}" + use_regex: false + register: _files_dirs + + - name: Fail when ansible/files doc_id directory missing + ansible.builtin.assert: + that: + - _files_dirs.matched | int >= 1 + fail_msg: "ansible/files missing doc_id directory: ansible/files/{{ doc_id }}" + + - name: Show noop verification summary + ansible.builtin.debug: + msg: + - "doc_id={{ doc_id }}" + - "doc={{ doc_filename }}" + - "files_dirs={{ _files_dirs.files | map(attribute='path') | list }}" + + - name: Verify cluster reachable (kubectl get nodes) [runbook baseline] + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl get nodes + args: + executable: /bin/bash + delegate_to: "{{ groups['k3s_server'][0] }}" + become: true + run_once: true + changed_when: false + + - name: Verify core namespace exists (kube-system) [runbook baseline] + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl get ns kube-system + args: + executable: /bin/bash + delegate_to: "{{ groups['k3s_server'][0] }}" + become: true + run_once: true + changed_when: false + + - name: Find YAML manifests under 
ansible/files doc_id dirs + ansible.builtin.find: + paths: "{{ _files_dirs.files | map(attribute='path') | list }}" + file_type: file + patterns: + - "*.yml" + - "*.yaml" + recurse: true + use_regex: false + register: _files_manifests + + - name: Show manifest count summary + ansible.builtin.debug: + msg: + - "doc_id={{ doc_id }}" + - "manifest_files={{ _files_manifests.matched | default(0) }}" + - "manifest_paths={{ (_files_manifests.files | map(attribute='path') | list)[:12] }}" + + - name: Server-side dry-run apply (kubectl apply --dry-run=server) [doc assertion] + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} \ + kubectl apply --dry-run=server -f "{{ item.path }}" + args: + executable: /bin/bash + loop: "{{ _files_manifests.files }}" + loop_control: + label: "{{ item.path }}" + delegate_to: "{{ groups['k3s_server'][0] }}" + become: true + run_once: true + changed_when: false + when: (_files_manifests.matched | default(0) | int) > 0 + + - name: Fail when SKIP_ARMV7=0 but ARMV7_SSH empty + ansible.builtin.fail: + msg: "SKIP_ARMV7=0 但未设置 ARMV7_SSH(见 scripts/.env.verify.example)" + when: skip_armv7 == '0' and armv7_ssh | length == 0 + + - name: Note skipping remote arm install + ansible.builtin.debug: + msg: "SKIP_ARMV7={{ skip_armv7 }}:跳过 arm 远程安装。若需安装:SKIP_ARMV7=0 且 export ARMV7_SSH='ssh -o BatchMode=yes user@arm-host'" + when: skip_armv7 != '0' or armv7_ssh | length == 0 + + - name: Remote Docker install (dnf on arm) + when: skip_armv7 == '0' and armv7_ssh | length > 0 + block: + - name: Check docker on armv7 host + ansible.builtin.shell: "{{ armv7_ssh }} docker version" + register: armv7_docker_check + changed_when: false + failed_when: false + + - name: Install Docker and enable service (dnf) + ansible.builtin.shell: "{{ armv7_ssh }} 'sudo dnf install -y docker && sudo systemctl enable --now docker'" + when: armv7_docker_check.rc != 0 + + - name: Verify docker version and ps + 
ansible.builtin.shell: "{{ armv7_ssh }} docker version && {{ armv7_ssh }} docker ps -a" + changed_when: false diff --git a/ansible/playbooks/verify/01-04.yml b/ansible/playbooks/verify/01-04.yml deleted file mode 100644 index 9668d3e..0000000 --- a/ansible/playbooks/verify/01-04.yml +++ /dev/null @@ -1,10 +0,0 @@ -- name: "01-04 noop verify" - hosts: localhost - gather_facts: false - vars: - repo_root: "{{ playbook_dir }}/../../.." - doc_id: "01-04" - doc_filename: "01-04-双控制节点ha.md" - tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - diff --git a/ansible/playbooks/verify/01-05.yml b/ansible/playbooks/verify/01-05.yml index dea0ecc..497e58e 100644 --- a/ansible/playbooks/verify/01-05.yml +++ b/ansible/playbooks/verify/01-05.yml @@ -1,10 +1,145 @@ -- name: "01-05 noop verify" +# SKIP_ARMV7=1(默认):仅 noop。 +# SKIP_ARMV7=0 且 ARMV7_NFS_SSH 或 ARMV7_SSH:经 SSH 在 arm 上 dnf 装 nfs-utils、写 /etc/exports、exportfs(见 docs/01-05)。 +# 导出路径/网段:ARMV7_NFS_EXPORT_PATH(默认 /sdcard)、ARMV7_NFS_CLIENT_SUBNET(默认 192.168.2.0/24) +- name: 01-05 armv7 NFS(矩阵 + 可选远程安装) hosts: localhost gather_facts: false vars: repo_root: "{{ playbook_dir }}/../../.." 
doc_id: "01-05" doc_filename: "01-05-armv7-nfs服务安装.md" + skip_armv7: "{{ lookup('env', 'SKIP_ARMV7') | default('1', true) | trim }}" + armv7_ssh: "{{ lookup('env', 'ARMV7_SSH') | default('', true) | trim }}" + armv7_nfs_export_path: "{{ lookup('env', 'ARMV7_NFS_EXPORT_PATH') | default('/sdcard', true) | trim }}" + armv7_nfs_client_subnet: "{{ lookup('env', 'ARMV7_NFS_CLIENT_SUBNET') | default('192.168.2.0/24', true) | trim }}" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Resolve ARMV7_NFS_SSH from env # the set tags end with a dash so the spaces produced by YAML folding are stripped and an unset value renders as an empty string + ansible.builtin.set_fact: + armv7_nfs_ssh: >- + {% set n = lookup('env', 'ARMV7_NFS_SSH') | default('', true) | trim -%} + {% set b = lookup('env', 'ARMV7_SSH') | default('', true) | trim -%} + {{ n if n | length > 0 else b }} + - name: Baseline docs/files checks + block: + - name: Assert docs file exists + ansible.builtin.stat: + path: "{{ repo_root }}/docs/{{ doc_filename }}" + register: _doc_stat + + - name: Fail when docs file missing + ansible.builtin.assert: + that: + - _doc_stat.stat.exists + fail_msg: "docs file missing: docs/{{ doc_filename }}" + + - name: Find matching ansible/files doc_id directory + ansible.builtin.find: + paths: "{{ repo_root }}/ansible/files" + file_type: directory + patterns: "{{ doc_id }}" + use_regex: false + register: _files_dirs + + - name: Fail when ansible/files doc_id directory missing + ansible.builtin.assert: + that: + - _files_dirs.matched | int >= 1 + fail_msg: "ansible/files missing doc_id directory: ansible/files/{{ doc_id }}" + + - name: Show noop verification summary + ansible.builtin.debug: + msg: + - "doc_id={{ doc_id }}" + - "doc={{ doc_filename }}" + - "files_dirs={{ _files_dirs.files | map(attribute='path') | list }}" + + - name: Verify cluster reachable (kubectl get nodes) [runbook baseline] + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl get nodes + args: + executable: /bin/bash + delegate_to:
"{{ groups['k3s_server'][0] }}" + become: true + run_once: true + changed_when: false + + - name: Verify core namespace exists (kube-system) [runbook baseline] + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl get ns kube-system + args: + executable: /bin/bash + delegate_to: "{{ groups['k3s_server'][0] }}" + become: true + run_once: true + changed_when: false + + - name: Find YAML manifests under ansible/files doc_id dirs + ansible.builtin.find: + paths: "{{ _files_dirs.files | map(attribute='path') | list }}" + file_type: file + patterns: + - "*.yml" + - "*.yaml" + recurse: true + use_regex: false + register: _files_manifests + + - name: Show manifest count summary + ansible.builtin.debug: + msg: + - "doc_id={{ doc_id }}" + - "manifest_files={{ _files_manifests.matched | default(0) }}" + - "manifest_paths={{ (_files_manifests.files | map(attribute='path') | list)[:12] }}" + + - name: Server-side dry-run apply (kubectl apply --dry-run=server) [doc assertion] + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} \ + kubectl apply --dry-run=server -f "{{ item.path }}" + args: + executable: /bin/bash + loop: "{{ _files_manifests.files }}" + loop_control: + label: "{{ item.path }}" + delegate_to: "{{ groups['k3s_server'][0] }}" + become: true + run_once: true + changed_when: false + when: (_files_manifests.matched | default(0) | int) > 0 + + - name: Fail when SKIP_ARMV7=0 but no ARMV7_SSH / ARMV7_NFS_SSH + ansible.builtin.fail: + msg: "SKIP_ARMV7=0 但未设置 ARMV7_SSH(或 ARMV7_NFS_SSH 指向 NFS 所在 arm 主机)" + when: skip_armv7 == '0' and armv7_nfs_ssh | length == 0 + + - name: Note skipping remote NFS setup + ansible.builtin.debug: + msg: "SKIP_ARMV7={{ skip_armv7 }}:跳过 arm NFS 远程配置。" + when: skip_armv7 != '0' or armv7_nfs_ssh | length == 0 + + - name: Remote NFS install (dnf on arm) + when: skip_armv7 == '0' and armv7_nfs_ssh | length > 
0 + block: + - name: Install nfs-utils and enable nfs-server + ansible.builtin.shell: "{{ armv7_nfs_ssh }} 'sudo dnf install -y nfs-utils && sudo systemctl enable --now nfs-server'" + + - name: Check if export path already in /etc/exports + ansible.builtin.shell: "{{ armv7_nfs_ssh }} sudo grep -qF {{ armv7_nfs_export_path | quote }} /etc/exports" + register: armv7_exports_grep + failed_when: false + changed_when: false + + - name: Append NFS export line # the pipeline is passed to ssh as ONE quoted argument; wrapping it in "bash -c" made the remote shell run a bare echo and append a blank line + ansible.builtin.shell: "{{ armv7_nfs_ssh }} 'echo \"{{ armv7_nfs_export_path }} {{ armv7_nfs_client_subnet }}(rw,sync,no_subtree_check,no_root_squash)\" | sudo tee -a /etc/exports'" + when: armv7_exports_grep.rc != 0 + + - name: Apply exportfs + ansible.builtin.shell: "{{ armv7_nfs_ssh }} sudo exportfs -rav" + changed_when: true + + - name: Verify showmount + ansible.builtin.shell: "{{ armv7_nfs_ssh }} showmount -e localhost" + changed_when: false diff --git a/ansible/playbooks/verify/01-06.yml b/ansible/playbooks/verify/01-06.yml index 1d0c0bd..1a07537 100644 --- a/ansible/playbooks/verify/01-06.yml +++ b/ansible/playbooks/verify/01-06.yml @@ -1,3 +1,138 @@ +--- +# Single-file layout notes: +# - By default 01-06.yml still performs only the "minimal verify" (kube-system pods) +# - To prepare the data disk / install K3s, the switches must be enabled explicitly: +# -e k3s_do_prepare_storage=true # inlines the former 01-06-prepare-storage.yml +# -e k3s_do_install=true # inlines the former 01-06-install.yml + +- name: Prepare data disk and mount to k3s_data_dir (opt-in) + hosts: k3s_nodes + become: true + vars: + _k3s_do_prepare_storage: "{{ k3s_do_prepare_storage | default(false) | bool }}" # private name: a play var templated from itself triggers a recursive-loop error when no -e override exists, and the bool cast makes "-e ...=false" behave as false + pre_tasks: + - name: Gate - skip prepare storage when k3s_do_prepare_storage=false + when: not _k3s_do_prepare_storage + block: + - ansible.builtin.debug: + msg: "[GATE] skipped doc_id=01-06 action=prepare-storage var=k3s_do_prepare_storage" + - meta: end_play + tasks: + - name: Skip notice when storage prep disabled + ansible.builtin.debug: + msg: "k3s_prepare_storage is false — skipping (see group_vars/all.yml)" + when: not (k3s_prepare_storage |
default(false) | bool) + + - name: Prepare block storage for k3s_data_dir + when: k3s_prepare_storage | default(false) | bool + block: + - name: Require k3s_data_disk_device when k3s_prepare_storage is true + ansible.builtin.assert: + that: + - k3s_data_disk_device is defined + - (k3s_data_disk_device | string | length) > 0 + fail_msg: "Set k3s_data_disk_device (e.g. /dev/vdb) in group_vars or host_vars" + + - name: Verify k3s_data_disk_device is a block device + ansible.builtin.command: test -b {{ k3s_data_disk_device }} + changed_when: false + + - name: Check whether k3s_data_dir is already a mountpoint + ansible.builtin.command: mountpoint -q {{ k3s_data_dir }} + register: mp_k3s + changed_when: false + failed_when: false + + - name: Skip when k3s_data_dir already mounted + ansible.builtin.debug: + msg: "{{ k3s_data_dir }} already mounted — skipping partitioning on {{ inventory_hostname }}" + when: mp_k3s.rc == 0 + + - name: Install partitioning and filesystem tools + ansible.builtin.package: + name: + - parted + - e2fsprogs + state: present + when: mp_k3s.rc != 0 + + - name: Compute first partition path (nvme*n* -> p1, else 1) # generalized: any disk name ending in a digit (nvme0n1, mmcblk0, loop0, md0) gets the "p" separator, sdX/vdX get a bare "1" + ansible.builtin.set_fact: + k3s_data_partition: >- + {{ k3s_data_disk_device }}{{ 'p1' if (k3s_data_disk_device | regex_search('[0-9]$')) else '1' }} + when: mp_k3s.rc != 0 + + - name: Create GPT and single ext4 partition + ansible.builtin.command: >- + parted -s {{ k3s_data_disk_device }} mklabel gpt mkpart primary ext4 0% 100% + args: + creates: "{{ k3s_data_partition }}" + when: mp_k3s.rc != 0 + + - name: Wait for partition node in /dev + ansible.builtin.wait_for: + path: "{{ k3s_data_partition }}" + state: present + timeout: 60 + when: mp_k3s.rc != 0 + + - name: Detect existing filesystem on partition + ansible.builtin.command: blkid -s TYPE -o value {{ k3s_data_partition }} + register: fs_type + changed_when: false + failed_when: false + when: mp_k3s.rc != 0 + + - name: Create ext4 on partition +
ansible.builtin.command: mkfs.ext4 -F {{ k3s_data_partition }} + when: + - mp_k3s.rc != 0 + - (fs_type.stdout | default('') | trim | length) == 0 + + - name: Read UUID of partition + ansible.builtin.command: blkid -s UUID -o value {{ k3s_data_partition }} + register: blk_uuid + changed_when: false + when: mp_k3s.rc != 0 + + - name: Ensure mount directory exists + ansible.builtin.file: + path: "{{ k3s_data_dir }}" + state: directory + mode: "0755" + when: mp_k3s.rc != 0 + + - name: Add fstab entry for k3s_data_dir + ansible.builtin.lineinfile: + path: /etc/fstab + regexp: "^UUID={{ blk_uuid.stdout | trim }}\\s" + line: "UUID={{ blk_uuid.stdout | trim }} {{ k3s_data_dir }} ext4 defaults,nofail 0 2" + create: true + mode: "0644" + when: mp_k3s.rc != 0 + + - name: Mount all from fstab + ansible.builtin.command: mount -a + changed_when: true + when: mp_k3s.rc != 0 + +- name: Install K3s (opt-in) + hosts: k3s_nodes + become: true + vars: + _k3s_do_install: "{{ k3s_do_install | default(false) | bool }}" # private name: a play var templated from itself triggers a recursive-loop error when no -e override exists, and the bool cast makes "-e ...=false" behave as false + pre_tasks: + - name: Gate - skip install when k3s_do_install=false + when: not _k3s_do_install + block: + - ansible.builtin.debug: + msg: "[GATE] skipped doc_id=01-06 action=install var=k3s_do_install" + - meta: end_play + tasks: + - name: Placeholder (install content inlined in following plays) + ansible.builtin.debug: + msg: "[RUN] doc_id=01-06 action=install-start" + - name: "01-06 k3s baseline verify (kube-system pods)" hosts: k3s_server become: true diff --git a/ansible/playbooks/verify/01-07.yml b/ansible/playbooks/verify/01-07.yml index d38a38d..a9c3bb6 100644 --- a/ansible/playbooks/verify/01-07.yml +++ b/ansible/playbooks/verify/01-07.yml @@ -6,5 +6,5 @@ doc_id: "01-07" doc_filename: "01-07-openwrt-haproxy.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/01-08.yml
b/ansible/playbooks/verify/01-08.yml new file mode 100644 index 0000000..70bb069 --- /dev/null +++ b/ansible/playbooks/verify/01-08.yml @@ -0,0 +1,26 @@ +- name: "01-08 noop verify" + hosts: localhost + gather_facts: false + vars: + repo_root: "{{ playbook_dir }}/../../.." + doc_id: "01-08" + doc_filename: "01-08-双控制节点ha.md" + tasks: + - name: Assert docs file exists + ansible.builtin.stat: + path: "{{ repo_root }}/docs/{{ doc_filename }}" + register: _doc + + - name: Fail when docs file missing + ansible.builtin.assert: + that: + - _doc.stat.exists + fail_msg: "docs missing: docs/{{ doc_filename }}" + + - name: Summary + ansible.builtin.debug: + msg: + - "doc_id={{ doc_id }} (manual runbook / HA exercise)" + - "This verify case only asserts docs file exists." + - "HA join/switch must be exercised manually per docs/{{ doc_filename }}." + diff --git a/ansible/playbooks/verify/02-00.yml b/ansible/playbooks/verify/02-00.yml deleted file mode 100644 index 455cec9..0000000 --- a/ansible/playbooks/verify/02-00.yml +++ /dev/null @@ -1,10 +0,0 @@ -- name: "02-00 noop verify" - hosts: localhost - gather_facts: false - vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "02-00" - doc_filename: "02-00-nginx-系列说明.md" - tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - diff --git a/ansible/playbooks/verify/02-01.yml b/ansible/playbooks/verify/02-01.yml index ff51ce8..fd6bd97 100644 --- a/ansible/playbooks/verify/02-01.yml +++ b/ansible/playbooks/verify/02-01.yml @@ -4,7 +4,7 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - manifest_src: "{{ playbook_dir }}/../../files/02-05-nginx-matrix/01-control-ingress.yaml" + manifest_src: "{{ playbook_dir }}/../../files/02-05/01-control-ingress.yaml" manifest_dest: /tmp/nginx-m1.yaml tasks: - name: Copy manifest diff --git a/ansible/playbooks/verify/02-02.yml b/ansible/playbooks/verify/02-02.yml index 12e4178..c0dddb1 100644 --- a/ansible/playbooks/verify/02-02.yml +++ b/ansible/playbooks/verify/02-02.yml @@ -8,7 +8,7 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - manifest_src: "{{ playbook_dir }}/../../files/02-05-nginx-matrix/02-control-ingressroute.yaml" + manifest_src: "{{ playbook_dir }}/../../files/02-05/02-control-ingressroute.yaml" manifest_dest: /tmp/nginx-m2.yaml tasks: - name: Copy manifest diff --git a/ansible/playbooks/verify/02-03.yml b/ansible/playbooks/verify/02-03.yml index ca55b5b..4298833 100644 --- a/ansible/playbooks/verify/02-03.yml +++ b/ansible/playbooks/verify/02-03.yml @@ -4,7 +4,7 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - manifest_src: "{{ playbook_dir }}/../../files/02-05-nginx-matrix/03-worker-ingress.yaml" + manifest_src: "{{ playbook_dir }}/../../files/02-05/03-worker-ingress.yaml" manifest_dest: /tmp/nginx-m3.yaml tasks: - name: Copy manifest diff --git a/ansible/playbooks/verify/02-04.yml b/ansible/playbooks/verify/02-04.yml index 3f5c179..737e451 100644 --- a/ansible/playbooks/verify/02-04.yml +++ b/ansible/playbooks/verify/02-04.yml @@ -4,7 +4,7 @@ run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - manifest_src: "{{ playbook_dir 
}}/../../files/02-05-nginx-matrix/04-worker-ingressroute.yaml" + manifest_src: "{{ playbook_dir }}/../../files/02-05/04-worker-ingressroute.yaml" manifest_dest: /tmp/nginx-m4.yaml tasks: - name: Copy manifest diff --git a/ansible/playbooks/verify/02-05.yml b/ansible/playbooks/verify/02-05.yml index 99afbb6..ff9c14e 100644 --- a/ansible/playbooks/verify/02-05.yml +++ b/ansible/playbooks/verify/02-05.yml @@ -1,65 +1,165 @@ -- import_playbook: "{{ playbook_dir }}/../nginx-matrix-deploy.yml" +--- +# 合并说明: +# - 原 02-05.yml 仅 import 02-05-deploy.yml + 02-01..02-04 +# - 现已把 02-05-deploy.yml 内联到本文件,保持 scripts/verify.sh run 02-05 的语义不变 -- name: Verify 02-05 nginx matrix (HTTP paths) +- name: Deploy nginx matrix (M1~M4) hosts: k3s_server become: true run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - verify_entry_base: "{{ nginx_entry_base | default('http://' ~ k3s_server_ip) }}" + manifests_path: "{{ playbook_dir }}/../../files/02-05" tasks: - - name: Verify M1~M4 deployments ready - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nginx-m1 -n default --timeout=120s - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nginx-m2 -n default --timeout=120s - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nginx-m3 -n default --timeout=180s - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nginx-m4 -n default --timeout=180s - args: - executable: /bin/bash - changed_when: false + - name: Ensure manifests path exists + ansible.builtin.stat: + path: "{{ manifests_path }}" + register: manifests_stat + delegate_to: localhost + run_once: true - - name: HTTP check 4 paths (expect 200 and X-Backend marker) - ansible.builtin.shell: | - set +e - base="{{ verify_entry_base | trim | regex_replace('/+$','') }}" - fail=0 - for id in 1 2 3 4; do - url="$base/demo-m$id/" - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo 
"000") - echo "$url -> $code" - if [ "$code" != "200" ]; then - echo "$url -> unexpected http_code=$code" - fail=1 - continue - fi - backend=$(curl -sS -D - -o /dev/null --connect-timeout 3 --max-time 8 "$url" 2>/dev/null \ - | awk -F': ' '/^X-Backend:/{print $2; exit}' \ - | tr -d '\r' || true) - echo "$url -> X-Backend: ${backend:-}" - if [ "$backend" != "M$id" ]; then - fail=1 - fi - done - exit $fail - args: - executable: /bin/bash - changed_when: false + - name: Fail if manifests not found + ansible.builtin.fail: + msg: "manifests 未找到: {{ manifests_path }},请从仓库根目录或 ansible 同级执行" + when: not manifests_stat.stat.exists + delegate_to: localhost + run_once: true -- name: Teardown 02-05 nginx matrix (optional) - hosts: k3s_server - become: true - run_once: true - vars: - k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - tasks: - - name: Delete nginx matrix resources when VERIFY_TEARDOWN=1 - when: verify_teardown == "1" + - name: Ensure control-plane label on k3s_server nodes (for M1) ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f /tmp/nginx-matrix/ -R --ignore-not-found=true - args: - executable: /bin/bash + KUBECONFIG={{ k3s_kubeconfig }} kubectl label node {{ item }} node-role.kubernetes.io/control-plane= --overwrite + loop: "{{ groups['k3s_server'] | default([]) }}" + + - name: Ensure worker label on k3s_worker nodes (for M3) + ansible.builtin.shell: | + KUBECONFIG={{ k3s_kubeconfig }} kubectl label node {{ item }} node-role.kubernetes.io/worker= --overwrite + loop: "{{ groups['k3s_worker'] | default([]) }}" + + - name: Copy nginx matrix manifests to server + ansible.builtin.copy: + src: "{{ manifests_path }}/" + dest: /tmp/nginx-matrix/ + mode: "0644" + + - name: Delete all nginx matrix deployments before apply + ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl delete deployment nginx-m1 nginx-m2 nginx-m3 nginx-m4 -n default 
--ignore-not-found=true + register: del_nginx + changed_when: "'deleted' in del_nginx.stdout" + + - name: kubectl apply nginx matrix + ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f /tmp/nginx-matrix/ -R + register: k8s_apply + changed_when: "'configured' in k8s_apply.stdout or 'created' in k8s_apply.stdout" + + - name: Restart nginx deployments so pods pick up ConfigMap (M1~M4 标识) + ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout restart deployment nginx-m1 nginx-m2 nginx-m3 nginx-m4 -n default + register: restart_out changed_when: true + - name: Wait for nginx pods to be ready # the waits are chained so a timeout on any deployment fails the task instead of only the last exit status counting + ansible.builtin.shell: | + KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ + -l app=nginx-m1 --timeout=60s && + KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ + -l app=nginx-m2 --timeout=60s && + KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ + -l app=nginx-m3 --timeout=120s && + KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod \ + -l app=nginx-m4 --timeout=120s + register: wait_result + changed_when: false + + - name: Verify nginx matrix + ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl get pod,svc,ing,ingressroute -n default -o wide + register: verify + changed_when: false + + - name: ">>> nginx matrix 资源" + ansible.builtin.debug: + msg: "{{ item }}" + loop: "{{ verify.stdout_lines }}" + + - name: 验证 Pod 节点分布(M1/M2 应在控制节点,M3/M4 应在工作节点) + ansible.builtin.shell: | + KUBECONFIG={{ k3s_kubeconfig }} kubectl get pod -n default -o custom-columns='NAME:.metadata.name,APP:.metadata.labels.app,NODE:.spec.nodeName' | grep -E '^(NAME|nginx-m)' + register: pod_placement + changed_when: false + + - name: ">>> Pod 节点分布" + ansible.builtin.debug: + msg: "{{ item }}" + loop: "{{ pod_placement.stdout_lines }}" + + - name: M1 容器内诊断(排查为何仍为 nginx 欢迎页) + ansible.builtin.shell: | + echo "========== 1.
M1 容器内 /usr/share/nginx/html/ 目录 ==========" + KUBECONFIG={{ k3s_kubeconfig }} kubectl exec -n default deployment/nginx-m1 -- ls -la /usr/share/nginx/html/ 2>/dev/null || echo "(exec 失败)" + echo "" + echo "========== 2. M1 容器内 index.html 内容(前 5 行)==========" + KUBECONFIG={{ k3s_kubeconfig }} kubectl exec -n default deployment/nginx-m1 -- cat /usr/share/nginx/html/index.html 2>/dev/null | head -5 || echo "(exec 失败)" + echo "" + echo "========== 3. M1 容器内 /etc/nginx/conf.d/ 目录 ==========" + KUBECONFIG={{ k3s_kubeconfig }} kubectl exec -n default deployment/nginx-m1 -- ls -la /etc/nginx/conf.d/ 2>/dev/null || echo "(exec 失败)" + echo "" + echo "========== 4. M1 容器内 default.conf 内容 ==========" + KUBECONFIG={{ k3s_kubeconfig }} kubectl exec -n default deployment/nginx-m1 -- cat /etc/nginx/conf.d/default.conf 2>/dev/null || echo "(exec 失败)" + echo "" + echo "========== 5. M1 容器内 nginx 生效配置中的 server 块(前 40 行)==========" + KUBECONFIG={{ k3s_kubeconfig }} kubectl exec -n default deployment/nginx-m1 -- nginx -T 2>/dev/null | grep -A 200 "server {" | head -40 || echo "(exec 失败)" + register: m1_diag + changed_when: false + failed_when: false + + - name: ">>> M1 容器内诊断结果(若 M1 仍为欢迎页,请根据此处输出排查)" + ansible.builtin.debug: + msg: "{{ item }}" + loop: "{{ m1_diag.stdout_lines }}" + + - name: 验证 M1~M4 标识(Pod 内 index.html 含 Mx、响应头 X-Backend) + ansible.builtin.shell: | + base="{{ groups['k3s_nodes'] | map('extract', hostvars) | map(attribute='ansible_host') | first }}" + for id in 1 2 3 4; do + echo "=== M$id Pod 内 index.html 前 2 行 ===" + KUBECONFIG={{ k3s_kubeconfig }} kubectl exec -n default deployment/nginx-m$id -- cat /usr/share/nginx/html/index.html 2>/dev/null | head -2 || echo "(exec 失败)" + echo "=== M$id 响应头 X-Backend ===" + curl -sI "http://$base/demo-m$id/" 2>/dev/null | grep -i x-backend || echo "(未看到 X-Backend)" + echo "" + done + register: m_check + changed_when: false + failed_when: false + + - name: ">>> M1~M4 验证" + ansible.builtin.debug: + msg: "{{ item }}" + loop: "{{ 
m_check.stdout_lines }}" + + - name: curl 验证(16 个目标:4 节点 × 4 路径) + ansible.builtin.shell: | + bases="{{ groups['k3s_nodes'] | map('extract', hostvars) | map(attribute='ansible_host') | join(' ') }}" + count=0 + ok=0 + echo "=== 16 个目标 (4 节点 × 4 路径) ===" + echo "节点 M1(控制+Ingress) M2(控制+IR) M3(工作+Ingress) M4(工作+IR)" + for base in $bases; do + m1=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 2 http://$base/demo-m1 2>/dev/null) || m1="fail" + m2=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 2 http://$base/demo-m2 2>/dev/null) || m2="fail" + m3=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 2 http://$base/demo-m3 2>/dev/null) || m3="fail" + m4=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 2 http://$base/demo-m4 2>/dev/null) || m4="fail" + printf "%-12s %-16s %-11s %-16s %s\n" "$base" "$m1" "$m2" "$m3" "$m4" + for c in $m1 $m2 $m3 $m4; do count=$((count+1)); [ "$c" = "200" ] && ok=$((ok+1)); done + done + echo "---" + echo "共验证 $count 个目标,$ok 个返回 200" + register: curl_result + changed_when: false + + - name: ">>> curl 矩阵" + ansible.builtin.debug: + msg: "{{ item }}" + loop: "{{ curl_result.stdout_lines }}" + +- import_playbook: 02-01.yml +- import_playbook: 02-02.yml +- import_playbook: 02-03.yml +- import_playbook: 02-04.yml diff --git a/ansible/playbooks/verify/03-01.yml b/ansible/playbooks/verify/03-01.yml index 761c95b..21442be 100644 --- a/ansible/playbooks/verify/03-01.yml +++ b/ansible/playbooks/verify/03-01.yml @@ -1,10 +1,11 @@ -- name: Deploy 03-01 Traefik Dashboard +--- +- name: Deploy 03-01 Traefik Dashboard (HelmChartConfig + IngressRoute) hosts: k3s_server become: true run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - manifest_src: "{{ playbook_dir }}/../../files/03-01-traefik-dashboard/traefik-dashboard.yaml" + manifest_src: "{{ playbook_dir }}/../../files/03-01/traefik-dashboard.yaml" manifest_dest: /tmp/traefik-dashboard.yaml tasks: - name: Copy manifest @@ -13,34 +14,41 @@ dest: "{{ 
manifest_dest }}" mode: "0644" - - name: Apply manifest + restart traefik + - name: Apply manifest ansible.builtin.shell: | set -e KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ manifest_dest }} - KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system rollout restart deploy/traefik || true args: executable: /bin/bash changed_when: true -- name: Verify 03-01 Traefik Dashboard +- name: Verify 03-01 Traefik Dashboard (resource existence + rollout) hosts: k3s_server become: true run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml tasks: - - name: Wait traefik rollout + - name: Rollout status traefik (kube-system) ansible.builtin.shell: | set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system rollout status deploy/traefik --timeout=180s + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/traefik -n kube-system --timeout=240s args: executable: /bin/bash changed_when: false - - name: Assert traefik-dashboard IngressRoute exists + - name: Assert HelmChartConfig exists ansible.builtin.shell: | set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system get ingressroute.traefik.io/traefik-dashboard + KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system get helmchartconfig traefik + args: + executable: /bin/bash + changed_when: false + + - name: Assert IngressRoute traefik-dashboard exists + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system get ingressroute traefik-dashboard args: executable: /bin/bash changed_when: false diff --git a/ansible/playbooks/verify/03-02.yml b/ansible/playbooks/verify/03-02.yml index 0115cf4..cc5b696 100644 --- a/ansible/playbooks/verify/03-02.yml +++ b/ansible/playbooks/verify/03-02.yml @@ -1,74 +1,212 @@ -- name: Deploy 03-02 Traefik ACME (gated) +--- +- name: Deploy 03-02 Traefik ACME (HelmChartConfig) hosts: k3s_server become: true run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - manifest_src: "{{ playbook_dir 
}}/../../files/03-02-traefik-acme/traefik-acme.yaml" + manifest_src: "{{ playbook_dir }}/../../files/03-02/traefik-acme.yaml" manifest_dest: /tmp/traefik-acme.yaml - acme_email: "{{ ACME_EMAIL | default('') }}" + acme_email: "{{ lookup('env', 'ACME_EMAIL') | default('', true) }}" tasks: - - name: "Gate - require ACME_EMAIL and cloudflare-api-token secret" - ansible.builtin.shell: | - set -e - test -n "{{ acme_email }}" - KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system get secret cloudflare-api-token >/dev/null - args: - executable: /bin/bash - register: acme_gate - changed_when: false - failed_when: false + - name: "Gate - skip apply when ACME_EMAIL missing" + when: acme_email | trim == "" + block: + - ansible.builtin.debug: + msg: "[GATE] skipped doc_id=03-02 reason=missing_env missing=ACME_EMAIL" + - meta: end_play - name: Copy manifest - when: acme_gate.rc == 0 ansible.builtin.copy: src: "{{ manifest_src }}" dest: "{{ manifest_dest }}" mode: "0644" - name: Replace ACME email placeholder - when: acme_gate.rc == 0 - ansible.builtin.shell: | - set -e - sed -i "s//{{ acme_email | replace('/', '\\/') }}/g" {{ manifest_dest }} - args: - executable: /bin/bash - changed_when: true + ansible.builtin.replace: + path: "{{ manifest_dest }}" + regexp: "" + replace: "{{ acme_email | trim }}" - - name: Apply manifest + restart traefik - when: acme_gate.rc == 0 + - name: Apply manifest ansible.builtin.shell: | set -e KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ manifest_dest }} - KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system rollout restart deploy/traefik || true args: executable: /bin/bash changed_when: true -- name: Verify 03-02 Traefik ACME (gated) +- name: Deploy or cleanup nginx matrix TLS (opt-in) hosts: k3s_server become: true run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - acme_email: "{{ ACME_EMAIL | default('') }}" + nginx_matrix_tls_enabled: "{{ nginx_matrix_tls_enable | default(false) | bool }}" # distinct name: a play var must not template itself + manifests_path: "{{
playbook_dir }}/../../files/03-02" + tls_domains: + - test01.jackadam.top + - test02.jackadam.top + - test03.jackadam.top + - test04.jackadam.top + pre_tasks: + - name: Gate - skip nginx matrix TLS when nginx_matrix_tls_enable=false + when: not nginx_matrix_tls_enabled + block: + - ansible.builtin.debug: + msg: "[GATE] skipped doc_id=03-02 action=nginx-matrix-tls var=nginx_matrix_tls_enable" + - meta: end_play tasks: - - name: "Gate - require ACME_EMAIL and cloudflare-api-token secret" + - name: Deploy nginx matrix TLS (mode=deploy) + when: (mode | default('deploy')) == 'deploy' + block: + - name: Ensure manifests path exists + ansible.builtin.stat: + path: "{{ manifests_path }}" + register: manifests_stat + + - name: Fail if manifests not found + ansible.builtin.fail: + msg: "manifests 未找到: {{ manifests_path }},请从仓库根目录或 ansible 同级执行" + when: not manifests_stat.stat.exists + + - name: Ensure control-plane label on k3s_server nodes (for M1) + ansible.builtin.shell: | + KUBECONFIG={{ k3s_kubeconfig }} kubectl label node {{ item }} node-role.kubernetes.io/control-plane= --overwrite + loop: "{{ groups['k3s_server'] | default([]) }}" + + - name: Ensure worker label on k3s_worker nodes (for M3) + ansible.builtin.shell: | + KUBECONFIG={{ k3s_kubeconfig }} kubectl label node {{ item }} node-role.kubernetes.io/worker= --overwrite + loop: "{{ groups['k3s_worker'] | default([]) }}" + + - name: Copy nginx matrix TLS manifests to server + ansible.builtin.copy: + src: "{{ manifests_path }}/" + dest: /tmp/nginx-matrix-tls/ + mode: "0644" + + - name: Delete non-TLS nginx matrix if present (deployments, ingress, ingressroute, middleware, configmaps) + ansible.builtin.shell: | + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete deployment,svc -n default nginx-m1 nginx-m2 nginx-m3 nginx-m4 --ignore-not-found=true + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete ingress -n default nginx-m1 nginx-m3 --ignore-not-found=true + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete ingressroute -n
default nginx-m2 nginx-m4 --ignore-not-found=true + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete middleware -n default stripprefix-m1 stripprefix-m2 stripprefix-m3 stripprefix-m4 --ignore-not-found=true + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete configmap -n default nginx-m1-html nginx-m2-html nginx-m3-html nginx-m4-html --ignore-not-found=true + register: del_non_tls + changed_when: "'deleted' in del_non_tls.stdout" + + - name: kubectl apply nginx matrix TLS + HTTP-only + ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f /tmp/nginx-matrix-tls/ -R + register: k8s_apply + changed_when: "'configured' in k8s_apply.stdout or 'created' in k8s_apply.stdout" + + - name: Restart nginx deployments so pods pick up ConfigMap (M1~M4 标识) + ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout restart deployment nginx-m1 nginx-m2 nginx-m3 nginx-m4 -n default + changed_when: true + + - name: Wait for nginx pods to be ready + ansible.builtin.shell: | + KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod -l app=nginx-m1 --timeout=60s + KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod -l app=nginx-m2 --timeout=60s + KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod -l app=nginx-m3 --timeout=120s + KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod -l app=nginx-m4 --timeout=120s + changed_when: false + + - name: Verify nginx matrix TLS resources + ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl get pod,svc,ing,ingressroute -n default -o wide + register: verify_tls + changed_when: false + + - name: ">>> nginx matrix TLS 资源" + ansible.builtin.debug: + msg: "{{ item }}" + loop: "{{ verify_tls.stdout_lines }}" + + - name: HTTP curl 验证(HTTP-only:16 个目标,所有节点 × 4 域名) + ansible.builtin.shell: | + bases="{{ groups['k3s_nodes'] | map('extract', hostvars) | map(attribute='ansible_host') | join(' ') }}" + count=0 + ok=0 + echo "=== 16 个目标 (4 节点 × 4 域名) 
HTTP ===" + echo "节点 M1(test01) M2(test02) M3(test03) M4(test04)" + for base in $bases; do + m1=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://test01.jackadam.top/ --resolve "test01.jackadam.top:80:$base" 2>/dev/null) || m1="fail" + m2=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://test02.jackadam.top/ --resolve "test02.jackadam.top:80:$base" 2>/dev/null) || m2="fail" + m3=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://test03.jackadam.top/ --resolve "test03.jackadam.top:80:$base" 2>/dev/null) || m3="fail" + m4=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://test04.jackadam.top/ --resolve "test04.jackadam.top:80:$base" 2>/dev/null) || m4="fail" + printf "%-12s %-14s %-14s %-14s %s\n" "$base" "$m1" "$m2" "$m3" "$m4" + for c in $m1 $m2 $m3 $m4; do count=$((count+1)); [ "$c" = "200" ] && ok=$((ok+1)); done + done + echo "---" + echo "共验证 $count 个目标,$ok 个返回 200" + changed_when: false + failed_when: false + + - name: HTTPS curl 验证(16 个目标:所有节点 × 4 域名,所有节点均为入口点) + ansible.builtin.shell: | + bases="{{ groups['k3s_nodes'] | map('extract', hostvars) | map(attribute='ansible_host') | join(' ') }}" + count=0 + ok=0 + echo "=== 16 个目标 (4 节点 × 4 域名) HTTPS ===" + echo "节点 M1(test01) M2(test02) M3(test03) M4(test04)" + for base in $bases; do + m1=$(curl -sk -o /dev/null -w "%{http_code}" --connect-timeout 5 https://test01.jackadam.top/ --resolve "test01.jackadam.top:443:$base" 2>/dev/null) || m1="fail" + m2=$(curl -sk -o /dev/null -w "%{http_code}" --connect-timeout 5 https://test02.jackadam.top/ --resolve "test02.jackadam.top:443:$base" 2>/dev/null) || m2="fail" + m3=$(curl -sk -o /dev/null -w "%{http_code}" --connect-timeout 5 https://test03.jackadam.top/ --resolve "test03.jackadam.top:443:$base" 2>/dev/null) || m3="fail" + m4=$(curl -sk -o /dev/null -w "%{http_code}" --connect-timeout 5 https://test04.jackadam.top/ --resolve "test04.jackadam.top:443:$base" 2>/dev/null) || m4="fail" + printf "%-12s 
%-14s %-14s %-14s %s\n" "$base" "$m1" "$m2" "$m3" "$m4" + for c in $m1 $m2 $m3 $m4; do count=$((count+1)); [ "$c" = "200" ] && ok=$((ok+1)); done + done + echo "---" + echo "共验证 $count 个目标,$ok 个返回 200" + changed_when: false + failed_when: false + + - name: Cleanup nginx matrix TLS (mode=cleanup) + when: (mode | default('deploy')) == 'cleanup' + block: + - name: Delete nginx matrix TLS + HTTP-only resources (deployments, ingress, ingressroute, configmaps) + ansible.builtin.shell: | + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete deployment,svc -n default nginx-m1 nginx-m2 nginx-m3 nginx-m4 --ignore-not-found=true + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete ingress -n default nginx-m1 nginx-m3 nginx-m1-http nginx-m3-http --ignore-not-found=true + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete ingressroute -n default nginx-m2 nginx-m4 nginx-m2-http nginx-m4-http --ignore-not-found=true + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete configmap -n default nginx-m1-html nginx-m2-html nginx-m3-html nginx-m4-html --ignore-not-found=true + register: del_tls + changed_when: "'deleted' in del_tls.stdout" + + - name: Remove copied nginx matrix TLS manifests directory + ansible.builtin.file: + path: /tmp/nginx-matrix-tls + state: absent + +- name: Verify 03-02 Traefik ACME (rollout + secret) + hosts: k3s_server + become: true + run_once: true + vars: + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + acme_email: "{{ lookup('env', 'ACME_EMAIL') | default('', true) }}" + tasks: + - name: "Gate - skip verify when ACME_EMAIL missing" + when: acme_email | trim == "" + block: + - ansible.builtin.debug: + msg: "[GATE] skipped doc_id=03-02 reason=missing_env missing=ACME_EMAIL" + - meta: end_play + + - name: Assert Cloudflare token secret exists ansible.builtin.shell: | set -e - test -n "{{ acme_email }}" - KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system get secret cloudflare-api-token >/dev/null + KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system get secret 
cloudflare-api-token args: executable: /bin/bash - register: acme_gate changed_when: false - failed_when: false - - name: Wait traefik rollout - when: acme_gate.rc == 0 + - name: Rollout status traefik (kube-system) ansible.builtin.shell: | set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system rollout status deploy/traefik --timeout=180s + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/traefik -n kube-system --timeout=300s args: executable: /bin/bash changed_when: false @@ -80,19 +218,19 @@ vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + acme_email: "{{ lookup('env', 'ACME_EMAIL') | default('', true) }}" manifest_dest: /tmp/traefik-acme.yaml - acme_email: "{{ ACME_EMAIL | default('') }}" tasks: + - name: Skip teardown when gated + when: acme_email | trim == "" + meta: end_play + - name: Delete resources when VERIFY_TEARDOWN=1 when: verify_teardown == "1" ansible.builtin.shell: | set -e - # gated:只有在 deploy gate 通过且文件存在时才清理;否则跳过,避免 fail-fast。 - test -n "{{ acme_email }}" - test -f "{{ manifest_dest }}" KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ manifest_dest }} --ignore-not-found=true args: executable: /bin/bash changed_when: true - failed_when: false diff --git a/ansible/playbooks/verify/03-03.yml b/ansible/playbooks/verify/03-03.yml index 2a48166..6514a95 100644 --- a/ansible/playbooks/verify/03-03.yml +++ b/ansible/playbooks/verify/03-03.yml @@ -6,5 +6,5 @@ doc_id: "03-03" doc_filename: "03-03-k3s-traefik-dashboard-acme.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/03-04.yml b/ansible/playbooks/verify/03-04.yml index 733802d..918efcc 100644 --- a/ansible/playbooks/verify/03-04.yml +++ b/ansible/playbooks/verify/03-04.yml @@ -6,5 +6,5 @@ doc_id: "03-04" doc_filename: 
"03-04-k3s-cloudflare-tunnel-配置接入.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/03-05.yml b/ansible/playbooks/verify/03-05.yml index 84465ab..905034d 100644 --- a/ansible/playbooks/verify/03-05.yml +++ b/ansible/playbooks/verify/03-05.yml @@ -1,19 +1,62 @@ -- name: Deploy 03-05 local-path PVC demo +--- +- name: Apply local-path-config lab JSON (opt-in) hosts: k3s_server become: true run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - manifest_src: "{{ playbook_dir }}/../../files/03-05-local-path-demo/local-path-pvc-demo.yaml" + local_path_apply_lab_config_enabled: "{{ local_path_apply_lab_config | default(false) | bool }}" + local_path_json_src: "{{ playbook_dir }}/../../files/03-05/local-path-config-lab.json" + local_path_json_dest: /root/local-path-config-lab.json + pre_tasks: + - name: Gate - skip apply local-path lab config when local_path_apply_lab_config=false + when: not local_path_apply_lab_config_enabled + block: + - ansible.builtin.debug: + msg: "[GATE] skipped doc_id=03-05 action=apply-local-path-config var=local_path_apply_lab_config" + - meta: end_play + tasks: + - name: Copy local-path lab json + ansible.builtin.copy: + src: "{{ local_path_json_src }}" + dest: "{{ local_path_json_dest }}" + mode: "0644" + + - name: Apply local-path-config ConfigMap + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system create configmap local-path-config \ + --from-file=config.json={{ local_path_json_dest }} \ + --dry-run=client -o yaml | KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f - + args: + executable: /bin/bash + changed_when: true + + - name: Restart local-path-provisioner if present + ansible.builtin.shell: | + KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system rollout restart deploy/local-path-provisioner + args: + executable: 
/bin/bash + register: lp_restart + failed_when: false + changed_when: lp_restart.rc == 0 + +- name: Deploy 03-05 local-path pvc demo + hosts: k3s_server + become: true + run_once: true + vars: + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + manifest_src: "{{ playbook_dir }}/../../files/03-05/local-path-pvc-demo.yaml" manifest_dest: /tmp/local-path-pvc-demo.yaml tasks: - - name: Copy manifest to server + - name: Copy manifest ansible.builtin.copy: src: "{{ manifest_src }}" dest: "{{ manifest_dest }}" mode: "0644" - - name: kubectl apply + - name: Apply manifest ansible.builtin.shell: | set -e KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ manifest_dest }} @@ -21,32 +64,34 @@ executable: /bin/bash changed_when: true -- name: Verify 03-05 local-path PVC demo +- name: Verify 03-05 local-path pvc demo hosts: k3s_server become: true run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml tasks: - - name: Wait nginx-local-pvc-demo deployment ready + - name: Wait pvc Bound ansible.builtin.shell: | set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nginx-local-pvc-demo -n default --timeout=180s + KUBECONFIG={{ k3s_kubeconfig }} kubectl get pvc local-pvc-demo -n default -o jsonpath='{.status.phase}' + args: + executable: /bin/bash + register: pvc_phase + changed_when: false + until: pvc_phase.stdout | trim == "Bound" + retries: 30 + delay: 2 + + - name: Rollout status nginx-local-pvc-demo + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nginx-local-pvc-demo -n default --timeout=240s args: executable: /bin/bash changed_when: false - - name: Assert PVC is Bound - ansible.builtin.shell: | - set -e - phase=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get pvc local-pvc-demo -n default -o jsonpath='{.status.phase}') - echo "pvc phase=$phase" - test "$phase" = "Bound" - args: - executable: /bin/bash - changed_when: false - -- name: Teardown 03-05 local-path PVC demo (optional) +- name: Teardown 
03-05 local-path pvc demo (optional) hosts: k3s_server become: true run_once: true diff --git a/ansible/playbooks/verify/03-06.yml b/ansible/playbooks/verify/03-06.yml index 55e4b4a..5248de3 100644 --- a/ansible/playbooks/verify/03-06.yml +++ b/ansible/playbooks/verify/03-06.yml @@ -1,34 +1,41 @@ -- name: Deploy 03-06 NFS PV/PVC demo (gated) +--- +- name: Deploy 03-06 nfs pv+pvc demo (gated by env) hosts: k3s_server become: true run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - nfs_server_ip: "{{ NFS_SERVER_IP | default('') }}" - nfs_export_path: "{{ NFS_EXPORT_PATH | default('') }}" - manifest_src: "{{ playbook_dir }}/../../files/03-06-nfs-demo/nfs-pv-pvc-demo.yaml" + manifest_src: "{{ playbook_dir }}/../../files/03-06/nfs-pv-pvc-demo.yaml" manifest_dest: /tmp/nfs-pv-pvc-demo.yaml + nfs_server_ip: "{{ lookup('env', 'NFS_SERVER_IP') | default('', true) }}" + nfs_export_path: "{{ lookup('env', 'NFS_EXPORT_PATH') | default('', true) }}" tasks: - - name: "Gate - require NFS_SERVER_IP and NFS_EXPORT_PATH" - ansible.builtin.shell: | - set -e - test -n "{{ nfs_server_ip }}" - test -n "{{ nfs_export_path }}" - args: - executable: /bin/bash - register: nfs_gate - changed_when: false - failed_when: false + - name: "Gate - skip apply when NFS vars missing" + when: (nfs_server_ip | trim == "") or (nfs_export_path | trim == "") + block: + - ansible.builtin.debug: + msg: "[GATE] skipped doc_id=03-06 reason=missing_env missing=NFS_SERVER_IP,NFS_EXPORT_PATH" + - meta: end_play - name: Copy manifest - when: nfs_gate.rc == 0 ansible.builtin.copy: src: "{{ manifest_src }}" dest: "{{ manifest_dest }}" mode: "0644" - - name: kubectl apply - when: nfs_gate.rc == 0 + - name: Replace NFS placeholders + ansible.builtin.replace: + path: "{{ manifest_dest }}" + regexp: "" + replace: "{{ nfs_server_ip | trim }}" + + - name: Replace NFS export path placeholder + ansible.builtin.replace: + path: "{{ manifest_dest }}" + regexp: "" + replace: "{{ nfs_export_path | trim }}" + + - 
name: Apply manifest ansible.builtin.shell: | set -e KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ manifest_dest }} @@ -36,59 +43,55 @@ executable: /bin/bash changed_when: true -- name: Verify 03-06 NFS PV/PVC demo (gated) +- name: Verify 03-06 nfs pvc demo (Bound) hosts: k3s_server become: true run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - nfs_server_ip: "{{ NFS_SERVER_IP | default('') }}" - nfs_export_path: "{{ NFS_EXPORT_PATH | default('') }}" + nfs_server_ip: "{{ lookup('env', 'NFS_SERVER_IP') | default('', true) }}" + nfs_export_path: "{{ lookup('env', 'NFS_EXPORT_PATH') | default('', true) }}" tasks: - - name: "Gate - require NFS_SERVER_IP and NFS_EXPORT_PATH" + - name: "Gate - skip verify when NFS vars missing" + when: (nfs_server_ip | trim == "") or (nfs_export_path | trim == "") + block: + - ansible.builtin.debug: + msg: "[GATE] skipped doc_id=03-06 reason=missing_env missing=NFS_SERVER_IP,NFS_EXPORT_PATH" + - meta: end_play + + - name: Wait pvc Bound ansible.builtin.shell: | set -e - test -n "{{ nfs_server_ip }}" - test -n "{{ nfs_export_path }}" + KUBECONFIG={{ k3s_kubeconfig }} kubectl get pvc nfs-pvc-demo -n default -o jsonpath='{.status.phase}' args: executable: /bin/bash - register: nfs_gate + register: pvc_phase changed_when: false - failed_when: false + until: pvc_phase.stdout | trim == "Bound" + retries: 40 + delay: 3 - - name: Assert PVC Bound - when: nfs_gate.rc == 0 - ansible.builtin.shell: | - set -e - phase=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl -n default get pvc nfs-pvc-demo -o jsonpath='{.status.phase}') - echo "pvc phase=$phase" - test "$phase" = "Bound" - args: - executable: /bin/bash - changed_when: false - -- name: Teardown 03-06 NFS PV/PVC demo (optional) +- name: Teardown 03-06 nfs pv+pvc demo (optional) hosts: k3s_server become: true run_once: true vars: k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nfs_server_ip: "{{ lookup('env', 
'NFS_SERVER_IP') | default('', true) }}" + nfs_export_path: "{{ lookup('env', 'NFS_EXPORT_PATH') | default('', true) }}" manifest_dest: /tmp/nfs-pv-pvc-demo.yaml - nfs_server_ip: "{{ NFS_SERVER_IP | default('') }}" - nfs_export_path: "{{ NFS_EXPORT_PATH | default('') }}" tasks: + - name: Skip teardown when gated + when: (nfs_server_ip | trim == "") or (nfs_export_path | trim == "") + meta: end_play + - name: Delete resources when VERIFY_TEARDOWN=1 when: verify_teardown == "1" ansible.builtin.shell: | set -e - # gated:只有在 deploy gate 通过且文件存在时才清理;否则跳过,避免 fail-fast。 - test -n "{{ nfs_server_ip }}" - test -n "{{ nfs_export_path }}" - test -f "{{ manifest_dest }}" KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ manifest_dest }} --ignore-not-found=true args: executable: /bin/bash changed_when: true - failed_when: false diff --git a/ansible/playbooks/verify/03-07.yml b/ansible/playbooks/verify/03-07.yml index dbc6016..67256c7 100644 --- a/ansible/playbooks/verify/03-07.yml +++ b/ansible/playbooks/verify/03-07.yml @@ -1,4 +1,248 @@ -- import_playbook: "{{ playbook_dir }}/../longhorn-install.yml" +--- +- name: Longhorn node packages (iSCSI, NFS client) + hosts: k3s_nodes + become: true + tasks: + - name: Install Longhorn OS dependencies + when: longhorn_install_node_packages | default(true) | bool + block: + - name: Install iscsi + nfs (dnf/yum) + ansible.builtin.package: + name: + - iscsi-initiator-utils + - nfs-utils + state: present + + - name: Enable iscsid + ansible.builtin.systemd: + name: iscsid + enabled: true + state: started + + - name: Ensure Longhorn data subdirectory exists on all nodes + ansible.builtin.file: + path: "{{ k3s_data_dir }}/longhorn" + state: directory + mode: "0700" + + - name: Pre-pull Longhorn images on all nodes (optional, avoid DockerHub EOF/ImagePullBackOff) + when: longhorn_prepull_images | default(true) | bool + ansible.builtin.shell: | + set -e + CTR="ctr --address /run/k3s/containerd/containerd.sock -n k8s.io" + + imgs=( + 
"docker.io/longhornio/longhorn-manager:v{{ longhorn_chart_version }}" + "docker.io/longhornio/longhorn-ui:v{{ longhorn_chart_version }}" + "docker.io/longhornio/longhorn-share-manager:v{{ longhorn_chart_version }}" + "docker.io/longhornio/longhorn-engine:v{{ longhorn_chart_version }}" + "docker.io/longhornio/longhorn-instance-manager:v{{ longhorn_chart_version }}" + "docker.io/longhornio/backing-image-manager:v{{ longhorn_chart_version }}" + "docker.io/longhornio/support-bundle-kit:v0.0.45" + ) + + for img in "${imgs[@]}"; do + ok=0 + for i in 1 2 3 4 5; do + echo "[pull] $img (try $i/5)" + if $CTR images pull "$img"; then + ok=1 + break + fi + sleep $((i * 3)) + done + if [ "$ok" -ne 1 ]; then + echo "[ERR] failed pulling $img after retries" + exit 1 + fi + done + args: + executable: /bin/bash + changed_when: true + +- name: Install Longhorn with Helm on first server + hosts: k3s_server + become: true + run_once: true + vars: + longhorn_values_src: "{{ playbook_dir }}/../../files/03-07/values-lab.yaml" + longhorn_values_dest: /root/longhorn-values-lab.yaml + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + tasks: + - name: Install helm package (Fedora/RHEL family) + ansible.builtin.package: + name: helm + state: present + ignore_errors: true + register: helm_pkg + + - name: Hint if helm package install failed (install Helm 3 manually if needed) + ansible.builtin.debug: + msg: "dnf/yum 未装上 helm 时,请见 https://helm.sh/docs/intro/install/" + when: helm_pkg.failed | default(false) + + - name: Fail if helm binary still unavailable + ansible.builtin.command: which helm + register: helm_which + changed_when: false + failed_when: helm_which.rc != 0 + + - name: Copy lab values to server + ansible.builtin.copy: + src: "{{ longhorn_values_src }}" + dest: "{{ longhorn_values_dest }}" + mode: "0600" + + - name: Ensure longhorn-system namespace is not stuck Terminating (force finalize if needed) + ansible.builtin.shell: | + set -e + export KUBECONFIG={{ k3s_kubeconfig }} + 
ns="longhorn-system" + phase="$(kubectl get ns "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)" + if [ "$phase" = "Terminating" ]; then + echo "[WARN] namespace $ns is Terminating; force finalize to unblock install" + kubectl get ns "$ns" -o json > /tmp/ns.json + python3 -c "import json; obj=json.load(open('/tmp/ns.json')); obj.setdefault('spec',{}); obj['spec']['finalizers']=[]; json.dump(obj, open('/tmp/ns-finalize.json','w'))" + kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f /tmp/ns-finalize.json >/dev/null + fi + args: + executable: /bin/bash + changed_when: true + failed_when: false + + - name: Ensure longhorn Helm repo + ansible.builtin.shell: | + set -e + if ! helm repo list 2>/dev/null | grep -q '^longhorn'; then + helm repo add longhorn https://charts.longhorn.io + fi + helm repo update + environment: + KUBECONFIG: "{{ k3s_kubeconfig }}" + args: + executable: /bin/bash + changed_when: true + + - name: Delete leftover longhorn PriorityClass (cluster-scoped) to avoid Helm ownership conflicts + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete priorityclass longhorn-critical --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true + failed_when: false + + - name: Delete leftover Longhorn CRDs (cluster-scoped, opt-in) + when: longhorn_force_crd_reset | default(false) | bool + ansible.builtin.shell: | + set -e + export KUBECONFIG={{ k3s_kubeconfig }} + crd_list="$(kubectl get crd -o name 2>/dev/null | grep 'longhorn.io' || true)" + if [ -n "$crd_list" ]; then + echo "$crd_list" | while read -r crd; do + [ -z "$crd" ] && continue + timeout 20s kubectl delete "$crd" --ignore-not-found=true || true + done + fi + args: + executable: /bin/bash + changed_when: true + failed_when: false + + - name: Delete leftover Longhorn ClusterRole/ClusterRoleBinding (cluster-scoped) + ansible.builtin.shell: | + set -e + export KUBECONFIG={{ k3s_kubeconfig }} + + role_list="$(kubectl get
clusterrole -o name 2>/dev/null | grep 'longhorn' || true)" + if [ -n "$role_list" ]; then + echo "$role_list" | while read -r role; do + [ -z "$role" ] && continue + timeout 20s kubectl delete "$role" --ignore-not-found=true || true + done + fi + + binding_list="$(kubectl get clusterrolebinding -o name 2>/dev/null | grep 'longhorn' || true)" + if [ -n "$binding_list" ]; then + echo "$binding_list" | while read -r binding; do + [ -z "$binding" ] && continue + timeout 20s kubectl delete "$binding" --ignore-not-found=true || true + done + fi + args: + executable: /bin/bash + changed_when: true + failed_when: false + + - name: Cleanup leftover Helm release records for Longhorn (default + longhorn-system) + ansible.builtin.shell: | + set -e + export KUBECONFIG={{ k3s_kubeconfig }} + + for ns in longhorn-system default; do + if helm -n "$ns" list --all 2>/dev/null | grep -q '^longhorn'; then + timeout 120s helm -n "$ns" uninstall longhorn --no-hooks || true + fi + + sec_list="$(kubectl -n "$ns" get secret -o name 2>/dev/null | grep '^secret/sh\.helm\.release\.v1\.longhorn\.'
|| true)" + if [ -n "$sec_list" ]; then + echo "$sec_list" | xargs -n1 kubectl -n "$ns" delete --ignore-not-found=true + fi + done + environment: + KUBECONFIG: "{{ k3s_kubeconfig }}" + args: + executable: /bin/bash + changed_when: true + failed_when: false + + - name: Helm upgrade/install Longhorn(失败兜底:install --replace) + ansible.builtin.shell: | + set -e + helm upgrade --install longhorn longhorn/longhorn --namespace longhorn-system --create-namespace -f {{ longhorn_values_dest }} --version {{ longhorn_chart_version }} --wait --timeout 15m || helm install --replace longhorn longhorn/longhorn --namespace longhorn-system --create-namespace -f {{ longhorn_values_dest }} --version {{ longhorn_chart_version }} --wait --timeout 15m + environment: + KUBECONFIG: "{{ k3s_kubeconfig }}" + args: + executable: /bin/bash + register: helm_longhorn + changed_when: true + +- name: Apply local-path-config lab defaults (optional) + hosts: k3s_server + become: true + run_once: true + vars: + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + local_path_json_src: "{{ playbook_dir }}/../../files/03-05/local-path-config-lab.json" # same lab JSON path that verify/03-05.yml uses + local_path_json_dest: /root/local-path-config-lab.json + tasks: + - name: Apply local-path-config lab defaults (optional) + when: longhorn_apply_local_path_lab | default(false) | bool + block: + - name: Copy local-path lab json + ansible.builtin.copy: + src: "{{ local_path_json_src }}" + dest: "{{ local_path_json_dest }}" + mode: "0644" + + - name: Apply local-path-config ConfigMap + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system create configmap local-path-config \ + --from-file=config.json={{ local_path_json_dest }} \ + --dry-run=client -o yaml | KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f - + args: + executable: /bin/bash + changed_when: true + + - name: Restart local-path-provisioner if present + ansible.builtin.shell: | + KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system rollout
restart deploy/local-path-provisioner + args: + executable: /bin/bash + register: lp_restart + failed_when: false + changed_when: lp_restart.rc == 0 - name: Verify 03-07 Longhorn (namespace pods) hosts: k3s_server diff --git a/ansible/playbooks/verify/03-08.yml b/ansible/playbooks/verify/03-08.yml index 24f7770..ce62189 100644 --- a/ansible/playbooks/verify/03-08.yml +++ b/ansible/playbooks/verify/03-08.yml @@ -6,5 +6,5 @@ doc_id: "03-08" doc_filename: "03-08-k3s-ha-集群配置与切换.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/03-09.yml b/ansible/playbooks/verify/03-09.yml index 92e7316..71bbab7 100644 --- a/ansible/playbooks/verify/03-09.yml +++ b/ansible/playbooks/verify/03-09.yml @@ -6,5 +6,5 @@ doc_id: "03-09" doc_filename: "03-09-k3s-gitops-集群配置管理.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/03-10.yml b/ansible/playbooks/verify/03-10.yml index 90f9628..4d7e688 100644 --- a/ansible/playbooks/verify/03-10.yml +++ b/ansible/playbooks/verify/03-10.yml @@ -6,5 +6,5 @@ doc_id: "03-10" doc_filename: "03-10-k3s-traefik-custom-ports.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/04-01.yml b/ansible/playbooks/verify/04-01.yml index 554ac5e..2c8cb86 100644 --- a/ansible/playbooks/verify/04-01.yml +++ b/ansible/playbooks/verify/04-01.yml @@ -1,6 +1,45 @@ -- import_playbook: "{{ playbook_dir }}/../nodejs-demo-apply.yml" +--- +- name: Apply nodejs-demo Kubernetes manifests + hosts: k3s_server + become: true + run_once: true + vars: + k3s_kubeconfig: 
/etc/rancher/k3s/k3s.yaml + nodejs_demo_manifest: "04-01-nodejs-demo.yaml" + manifests_dir: "{{ playbook_dir }}/../../files/04-01" + tasks: + - name: Ensure manifest file exists + ansible.builtin.stat: + path: "{{ manifests_dir }}/{{ nodejs_demo_manifest }}" + register: nodejs_manifest_stat + delegate_to: localhost + become: false -- name: Verify 04-01 nodejs demo (rollout + HTTP) + - name: Fail if manifest not found + ansible.builtin.fail: + msg: "未找到 {{ manifests_dir }}/{{ nodejs_demo_manifest }},请从仓库根检查文件名" + when: not nodejs_manifest_stat.stat.exists + delegate_to: localhost + become: false + + - name: Copy manifest to control plane + ansible.builtin.copy: + src: "{{ manifests_dir }}/{{ nodejs_demo_manifest }}" + dest: "/tmp/{{ nodejs_demo_manifest }}" + mode: "0644" + + - name: kubectl apply nodejs-demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f /tmp/{{ nodejs_demo_manifest }} + register: nodejs_apply + changed_when: "'configured' in nodejs_apply.stdout or 'created' in nodejs_apply.stdout" + + - name: Show kubectl apply output + ansible.builtin.debug: + var: nodejs_apply.stdout_lines + +- name: Verify 04-01 nodejs base (HTTP 200) hosts: k3s_server become: true run_once: true @@ -11,39 +50,21 @@ - name: Rollout status nodejs-demo ansible.builtin.shell: | set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=240s args: executable: /bin/bash changed_when: false - - name: HTTP check /node (expect 200 and Hello World) + - name: HTTP check /node ansible.builtin.shell: | set -e base="{{ verify_entry_base | trim | regex_replace('/+$','') }}" url="$base/node" - code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 10 "$url" 
2>/dev/null || echo "000") echo "$url -> $code" test "$code" = "200" - body=$(curl -sS --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || true) - echo "$body" | grep -q "Hello World from Node.js" args: executable: /bin/bash changed_when: false -- name: Teardown 04-01 nodejs demo (optional) - hosts: k3s_server - become: true - run_once: true - vars: - k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml - verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" - tasks: - - name: Delete nodejs-demo resources when VERIFY_TEARDOWN=1 - when: verify_teardown == "1" - ansible.builtin.shell: | - set -e - KUBECONFIG={{ k3s_kubeconfig }} kubectl delete deploy/nodejs-demo svc/nodejs-demo ing/nodejs-demo -n default --ignore-not-found=true - args: - executable: /bin/bash - changed_when: true diff --git a/ansible/playbooks/verify/04-02.yml b/ansible/playbooks/verify/04-02.yml index fc0123f..97f33dc 100644 --- a/ansible/playbooks/verify/04-02.yml +++ b/ansible/playbooks/verify/04-02.yml @@ -1,10 +1,16 @@ -- name: "04-02 noop verify" - hosts: localhost - gather_facts: false +- name: Deploy+Verify 04-02 nodejs ports + Service + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "04-02" - doc_filename: "04-02-nodejs-镜像与运行命令.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-02-nodejs-demo.yaml" + nodejs_manifest_dest: /tmp/nodejs-demo-04-02.yaml + nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" + nodejs_verify_path: "/node" + nodejs_expected_target_port: 3000 tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Include nodejs deploy+verify template + ansible.builtin.include_tasks: tasks/nodejs-demo-deploy-verify.yml diff --git a/ansible/playbooks/verify/04-03.yml b/ansible/playbooks/verify/04-03.yml index 7af2a77..11894b4 100644 --- a/ansible/playbooks/verify/04-03.yml +++ b/ansible/playbooks/verify/04-03.yml @@ -1,10 +1,91 @@ -- name: "04-03 noop verify" - hosts: localhost - gather_facts: false +- name: Deploy+Verify 04-03 nodejs image + command/args + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "04-03" - doc_filename: "04-03-nodejs-环境变量与配置注入.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-03-nodejs-demo.yaml" + nodejs_manifest_dest: /tmp/nodejs-demo-04-03.yaml + nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" + nodejs_verify_path: "/node" + nodejs_expected_target_port: 8080 tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Copy nodejs demo manifest + ansible.builtin.copy: + src: "{{ nodejs_manifest_src }}" + dest: "{{ nodejs_manifest_dest }}" + mode: "0644" + + - name: Apply nodejs demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + + - name: Rollout status nodejs-demo + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s + args: + executable: /bin/bash + changed_when: false + + - name: Assert Service targetPort matches expected (optional) + when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 + ansible.builtin.shell: | + set -euo pipefail + exp="{{ nodejs_expected_target_port | int }}" + got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') + echo "svc/nodejs-demo targetPort=$got expected=$exp" + test "$got" = "$exp" + args: + executable: /bin/bash + changed_when: false + + - name: Assert Endpoints exist + ansible.builtin.shell: | + set -euo pipefail + eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) + echo "endpoints.ip=$eps" + test -n "$eps" + args: + executable: /bin/bash + changed_when: false + + - name: HTTP check 
nodejs demo (path/host optional) + when: nodejs_http_check_enabled | default(true) | bool # bool cast so -e nodejs_http_check_enabled=false (a string) really skips the check + ansible.builtin.shell: | + set -euo pipefail + base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" + path="{{ nodejs_verify_path | default('/node') }}" + url="$base${path}" + host="{{ nodejs_verify_host | default('') | trim }}" + + ok=0 + for i in 1 2 3 4 5 6 7 8 9 10; do + if [ -n "$host" ]; then + code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + else + code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + fi + echo "try $i: $url host=${host:-} -> $code" + if [ "$code" = "200" ]; then ok=1; break; fi + sleep 2 + done + test "$ok" = "1" + args: + executable: /bin/bash + changed_when: false + + - name: Teardown when VERIFY_TEARDOWN=1 + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/04-04.yml b/ansible/playbooks/verify/04-04.yml index a988d41..60219f3 100644 --- a/ansible/playbooks/verify/04-04.yml +++ b/ansible/playbooks/verify/04-04.yml @@ -1,10 +1,91 @@ -- name: "04-04 noop verify" - hosts: localhost - gather_facts: false +- name: Deploy+Verify 04-04 nodejs env + config injection + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "04-04" - doc_filename: "04-04-nodejs-端口与Service.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-04-nodejs-demo.yaml" + nodejs_manifest_dest: /tmp/nodejs-demo-04-04.yaml + nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" + nodejs_verify_path: "/node" + nodejs_expected_target_port: 8080 tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Copy nodejs demo manifest + ansible.builtin.copy: + src: "{{ nodejs_manifest_src }}" + dest: "{{ nodejs_manifest_dest }}" + mode: "0644" + + - name: Apply nodejs demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + + - name: Rollout status nodejs-demo + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s + args: + executable: /bin/bash + changed_when: false + + - name: Assert Service targetPort matches expected (optional) + when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 + ansible.builtin.shell: | + set -euo pipefail + exp="{{ nodejs_expected_target_port | int }}" + got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') + echo "svc/nodejs-demo targetPort=$got expected=$exp" + test "$got" = "$exp" + args: + executable: /bin/bash + changed_when: false + + - name: Assert Endpoints exist + ansible.builtin.shell: | + set -euo pipefail + eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) + echo "endpoints.ip=$eps" + test -n "$eps" + args: + executable: /bin/bash + changed_when: false + + - name: HTTP check 
nodejs demo (path/host optional) + when: nodejs_http_check_enabled | default(true) | bool # bool cast so -e nodejs_http_check_enabled=false (a string) really skips the check + ansible.builtin.shell: | + set -euo pipefail + base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" + path="{{ nodejs_verify_path | default('/node') }}" + url="$base${path}" + host="{{ nodejs_verify_host | default('') | trim }}" + + ok=0 + for i in 1 2 3 4 5 6 7 8 9 10; do + if [ -n "$host" ]; then + code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + else + code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + fi + echo "try $i: $url host=${host:-} -> $code" + if [ "$code" = "200" ]; then ok=1; break; fi + sleep 2 + done + test "$ok" = "1" + args: + executable: /bin/bash + changed_when: false + + - name: Teardown when VERIFY_TEARDOWN=1 + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/04-05.yml b/ansible/playbooks/verify/04-05.yml index 7660e06..dc97da2 100644 --- a/ansible/playbooks/verify/04-05.yml +++ b/ansible/playbooks/verify/04-05.yml @@ -1,10 +1,91 @@ -- name: "04-05 noop verify" - hosts: localhost - gather_facts: false +- name: Deploy+Verify 04-05 nodejs probes + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "04-05" - doc_filename: "04-05-nodejs-资源请求与限制.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-05-nodejs-demo.yaml" + nodejs_manifest_dest: /tmp/nodejs-demo-04-05.yaml + nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" + nodejs_verify_path: "/node" + nodejs_expected_target_port: 8080 tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Copy nodejs demo manifest + ansible.builtin.copy: + src: "{{ nodejs_manifest_src }}" + dest: "{{ nodejs_manifest_dest }}" + mode: "0644" + + - name: Apply nodejs demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + + - name: Rollout status nodejs-demo + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s + args: + executable: /bin/bash + changed_when: false + + - name: Assert Service targetPort matches expected (optional) + when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 + ansible.builtin.shell: | + set -euo pipefail + exp="{{ nodejs_expected_target_port | int }}" + got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') + echo "svc/nodejs-demo targetPort=$got expected=$exp" + test "$got" = "$exp" + args: + executable: /bin/bash + changed_when: false + + - name: Assert Endpoints exist + ansible.builtin.shell: | + set -euo pipefail + eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) + echo "endpoints.ip=$eps" + test -n "$eps" + args: + executable: /bin/bash + changed_when: false + + - name: HTTP check nodejs 
demo (path/host optional) + when: nodejs_http_check_enabled | default(true) | bool # bool cast so -e nodejs_http_check_enabled=false (a string) really skips the check + ansible.builtin.shell: | + set -euo pipefail + base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" + path="{{ nodejs_verify_path | default('/node') }}" + url="$base${path}" + host="{{ nodejs_verify_host | default('') | trim }}" + + ok=0 + for i in 1 2 3 4 5 6 7 8 9 10; do + if [ -n "$host" ]; then + code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + else + code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + fi + echo "try $i: $url host=${host:-} -> $code" + if [ "$code" = "200" ]; then ok=1; break; fi + sleep 2 + done + test "$ok" = "1" + args: + executable: /bin/bash + changed_when: false + + - name: Teardown when VERIFY_TEARDOWN=1 + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/04-06.yml b/ansible/playbooks/verify/04-06.yml index 6cad20d..03b53df 100644 --- a/ansible/playbooks/verify/04-06.yml +++ b/ansible/playbooks/verify/04-06.yml @@ -1,10 +1,91 @@ -- name: "04-06 noop verify" - hosts: localhost - gather_facts: false +- name: Deploy+Verify 04-06 nodejs replicas + rolling update + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "04-06" - doc_filename: "04-06-nodejs-探针与健康检查.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-06-nodejs-demo.yaml" + nodejs_manifest_dest: /tmp/nodejs-demo-04-06.yaml + nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" + nodejs_verify_path: "/node" + nodejs_expected_target_port: 8080 tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Copy nodejs demo manifest + ansible.builtin.copy: + src: "{{ nodejs_manifest_src }}" + dest: "{{ nodejs_manifest_dest }}" + mode: "0644" + + - name: Apply nodejs demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + + - name: Rollout status nodejs-demo + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s + args: + executable: /bin/bash + changed_when: false + + - name: Assert Service targetPort matches expected (optional) + when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 + ansible.builtin.shell: | + set -euo pipefail + exp="{{ nodejs_expected_target_port | int }}" + got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') + echo "svc/nodejs-demo targetPort=$got expected=$exp" + test "$got" = "$exp" + args: + executable: /bin/bash + changed_when: false + + - name: Assert Endpoints exist + ansible.builtin.shell: | + set -euo pipefail + eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) + echo "endpoints.ip=$eps" + test -n "$eps" + args: + executable: /bin/bash + changed_when: false + + - name: HTTP check nodejs 
demo (path/host optional) + when: nodejs_http_check_enabled | default(true) | bool # bool cast so -e nodejs_http_check_enabled=false (a string) really skips the check + ansible.builtin.shell: | + set -euo pipefail + base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" + path="{{ nodejs_verify_path | default('/node') }}" + url="$base${path}" + host="{{ nodejs_verify_host | default('') | trim }}" + + ok=0 + for i in 1 2 3 4 5 6 7 8 9 10; do + if [ -n "$host" ]; then + code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + else + code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + fi + echo "try $i: $url host=${host:-} -> $code" + if [ "$code" = "200" ]; then ok=1; break; fi + sleep 2 + done + test "$ok" = "1" + args: + executable: /bin/bash + changed_when: false + + - name: Teardown when VERIFY_TEARDOWN=1 + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/04-07.yml b/ansible/playbooks/verify/04-07.yml index 6acff34..00537ea 100644 --- a/ansible/playbooks/verify/04-07.yml +++ b/ansible/playbooks/verify/04-07.yml @@ -1,10 +1,91 @@ -- name: "04-07 noop verify" - hosts: localhost - gather_facts: false +- name: Deploy+Verify 04-07 nodejs Ingress + Traefik + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "04-07" - doc_filename: "04-07-nodejs-调度与亲和.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-07-nodejs-demo.yaml" + nodejs_manifest_dest: /tmp/nodejs-demo-04-07.yaml + nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" + nodejs_verify_path: "/api/" + nodejs_verify_host: "{{ nodejs_entry_host | default('app.example.local') }}" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Copy nodejs demo manifest + ansible.builtin.copy: + src: "{{ nodejs_manifest_src }}" + dest: "{{ nodejs_manifest_dest }}" + mode: "0644" + + - name: Apply nodejs demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + + - name: Rollout status nodejs-demo + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s + args: + executable: /bin/bash + changed_when: false + + - name: Assert Service targetPort matches expected (optional) + when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 + ansible.builtin.shell: | + set -euo pipefail + exp="{{ nodejs_expected_target_port | int }}" + got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') + echo "svc/nodejs-demo targetPort=$got expected=$exp" + test "$got" = "$exp" + args: + executable: /bin/bash + changed_when: false + + - name: Assert Endpoints exist + ansible.builtin.shell: | + set -euo pipefail + eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) + echo "endpoints.ip=$eps" + test -n "$eps" + args: + executable: /bin/bash + 
changed_when: false + + - name: HTTP check nodejs demo (path/host optional) + when: nodejs_http_check_enabled | default(true) | bool # bool cast so -e nodejs_http_check_enabled=false (a string) really skips the check + ansible.builtin.shell: | + set -euo pipefail + base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" + path="{{ nodejs_verify_path | default('/node') }}" + url="$base${path}" + host="{{ nodejs_verify_host | default('') | trim }}" + + ok=0 + for i in 1 2 3 4 5 6 7 8 9 10; do + if [ -n "$host" ]; then + code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + else + code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + fi + echo "try $i: $url host=${host:-} -> $code" + if [ "$code" = "200" ]; then ok=1; break; fi + sleep 2 + done + test "$ok" = "1" + args: + executable: /bin/bash + changed_when: false + + - name: Teardown when VERIFY_TEARDOWN=1 + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/04-08.yml b/ansible/playbooks/verify/04-08.yml index 41010ab..8b4370b 100644 --- a/ansible/playbooks/verify/04-08.yml +++ b/ansible/playbooks/verify/04-08.yml @@ -1,10 +1,91 @@ -- name: "04-08 noop verify" - hosts: localhost - gather_facts: false +- name: Deploy+Verify 04-08 nodejs resources requests/limits + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "04-08" - doc_filename: "04-08-nodejs-安全上下文.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-08-nodejs-demo.yaml" + nodejs_manifest_dest: /tmp/nodejs-demo-04-08.yaml + nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" + nodejs_verify_path: "/node" + nodejs_expected_target_port: 8080 tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Copy nodejs demo manifest + ansible.builtin.copy: + src: "{{ nodejs_manifest_src }}" + dest: "{{ nodejs_manifest_dest }}" + mode: "0644" + + - name: Apply nodejs demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + + - name: Rollout status nodejs-demo + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s + args: + executable: /bin/bash + changed_when: false + + - name: Assert Service targetPort matches expected (optional) + when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 + ansible.builtin.shell: | + set -euo pipefail + exp="{{ nodejs_expected_target_port | int }}" + got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') + echo "svc/nodejs-demo targetPort=$got expected=$exp" + test "$got" = "$exp" + args: + executable: /bin/bash + changed_when: false + + - name: Assert Endpoints exist + ansible.builtin.shell: | + set -euo pipefail + eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) + echo "endpoints.ip=$eps" + test -n "$eps" + args: + executable: /bin/bash + changed_when: false + + - name: HTTP check nodejs 
demo (path/host optional) + when: nodejs_http_check_enabled | default(true) | bool # bool cast so -e nodejs_http_check_enabled=false (a string) really skips the check + ansible.builtin.shell: | + set -euo pipefail + base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" + path="{{ nodejs_verify_path | default('/node') }}" + url="$base${path}" + host="{{ nodejs_verify_host | default('') | trim }}" + + ok=0 + for i in 1 2 3 4 5 6 7 8 9 10; do + if [ -n "$host" ]; then + code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + else + code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + fi + echo "try $i: $url host=${host:-} -> $code" + if [ "$code" = "200" ]; then ok=1; break; fi + sleep 2 + done + test "$ok" = "1" + args: + executable: /bin/bash + changed_when: false + + - name: Teardown when VERIFY_TEARDOWN=1 + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/04-09.yml b/ansible/playbooks/verify/04-09.yml index 5e5148b..d615d6d 100644 --- a/ansible/playbooks/verify/04-09.yml +++ b/ansible/playbooks/verify/04-09.yml @@ -1,10 +1,91 @@ -- name: "04-09 noop verify" - hosts: localhost - gather_facts: false +- name: Deploy+Verify 04-09 nodejs scheduling/affinity + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "04-09" - doc_filename: "04-09-nodejs-存储与卷.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-09-nodejs-demo.yaml" + nodejs_manifest_dest: /tmp/nodejs-demo-04-09.yaml + nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" + nodejs_verify_path: "/node" + nodejs_expected_target_port: 8080 tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Copy nodejs demo manifest + ansible.builtin.copy: + src: "{{ nodejs_manifest_src }}" + dest: "{{ nodejs_manifest_dest }}" + mode: "0644" + + - name: Apply nodejs demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + + - name: Rollout status nodejs-demo + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s + args: + executable: /bin/bash + changed_when: false + + - name: Assert Service targetPort matches expected (optional) + when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 + ansible.builtin.shell: | + set -euo pipefail + exp="{{ nodejs_expected_target_port | int }}" + got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') + echo "svc/nodejs-demo targetPort=$got expected=$exp" + test "$got" = "$exp" + args: + executable: /bin/bash + changed_when: false + + - name: Assert Endpoints exist + ansible.builtin.shell: | + set -euo pipefail + eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) + echo "endpoints.ip=$eps" + test -n "$eps" + args: + executable: /bin/bash + changed_when: false + + - name: HTTP check nodejs 
demo (path/host optional) + when: nodejs_http_check_enabled | default(true) | bool # bool cast so -e nodejs_http_check_enabled=false (a string) really skips the check + ansible.builtin.shell: | + set -euo pipefail + base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" + path="{{ nodejs_verify_path | default('/node') }}" + url="$base${path}" + host="{{ nodejs_verify_host | default('') | trim }}" + + ok=0 + for i in 1 2 3 4 5 6 7 8 9 10; do + if [ -n "$host" ]; then + code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + else + code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + fi + echo "try $i: $url host=${host:-} -> $code" + if [ "$code" = "200" ]; then ok=1; break; fi + sleep 2 + done + test "$ok" = "1" + args: + executable: /bin/bash + changed_when: false + + - name: Teardown when VERIFY_TEARDOWN=1 + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/04-10.yml b/ansible/playbooks/verify/04-10.yml index 392d332..773785f 100644 --- a/ansible/playbooks/verify/04-10.yml +++ b/ansible/playbooks/verify/04-10.yml @@ -1,10 +1,91 @@ -- name: "04-10 noop verify" - hosts: localhost - gather_facts: false +- name: Deploy+Verify 04-10 nodejs securityContext + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "04-10" - doc_filename: "04-10-nodejs-Ingress与Traefik.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-10-nodejs-demo.yaml" + nodejs_manifest_dest: /tmp/nodejs-demo-04-10.yaml + nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" + nodejs_verify_path: "/node" + nodejs_expected_target_port: 8080 tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Copy nodejs demo manifest + ansible.builtin.copy: + src: "{{ nodejs_manifest_src }}" + dest: "{{ nodejs_manifest_dest }}" + mode: "0644" + + - name: Apply nodejs demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + + - name: Rollout status nodejs-demo + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s + args: + executable: /bin/bash + changed_when: false + + - name: Assert Service targetPort matches expected (optional) + when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 + ansible.builtin.shell: | + set -euo pipefail + exp="{{ nodejs_expected_target_port | int }}" + got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') + echo "svc/nodejs-demo targetPort=$got expected=$exp" + test "$got" = "$exp" + args: + executable: /bin/bash + changed_when: false + + - name: Assert Endpoints exist + ansible.builtin.shell: | + set -euo pipefail + eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) + echo "endpoints.ip=$eps" + test -n "$eps" + args: + executable: /bin/bash + changed_when: false + + - name: HTTP check 
nodejs demo (path/host optional) + when: nodejs_http_check_enabled | default(true) | bool # bool cast so -e nodejs_http_check_enabled=false (a string) really skips the check + ansible.builtin.shell: | + set -euo pipefail + base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" + path="{{ nodejs_verify_path | default('/node') }}" + url="$base${path}" + host="{{ nodejs_verify_host | default('') | trim }}" + + ok=0 + for i in 1 2 3 4 5 6 7 8 9 10; do + if [ -n "$host" ]; then + code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + else + code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + fi + echo "try $i: $url host=${host:-} -> $code" + if [ "$code" = "200" ]; then ok=1; break; fi + sleep 2 + done + test "$ok" = "1" + args: + executable: /bin/bash + changed_when: false + + - name: Teardown when VERIFY_TEARDOWN=1 + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/04-11.yml b/ansible/playbooks/verify/04-11.yml index cd56169..2264ed2 100644 --- a/ansible/playbooks/verify/04-11.yml +++ b/ansible/playbooks/verify/04-11.yml @@ -1,10 +1,91 @@ -- name: "04-11 noop verify" - hosts: localhost - gather_facts: false +- name: Deploy+Verify 04-11 nodejs storage/volumes + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "04-11" - doc_filename: "04-11-nodejs-副本与滚动发布.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-11-nodejs-demo.yaml" + nodejs_manifest_dest: /tmp/nodejs-demo-04-11.yaml + nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" + nodejs_verify_path: "/node" + nodejs_expected_target_port: 8080 tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Copy nodejs demo manifest + ansible.builtin.copy: + src: "{{ nodejs_manifest_src }}" + dest: "{{ nodejs_manifest_dest }}" + mode: "0644" + + - name: Apply nodejs demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + + - name: Rollout status nodejs-demo + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s + args: + executable: /bin/bash + changed_when: false + + - name: Assert Service targetPort matches expected (optional) + when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 + ansible.builtin.shell: | + set -euo pipefail + exp="{{ nodejs_expected_target_port | int }}" + got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') + echo "svc/nodejs-demo targetPort=$got expected=$exp" + test "$got" = "$exp" + args: + executable: /bin/bash + changed_when: false + + - name: Assert Endpoints exist + ansible.builtin.shell: | + set -euo pipefail + eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) + echo "endpoints.ip=$eps" + test -n "$eps" + args: + executable: /bin/bash + changed_when: false + + - name: HTTP check nodejs 
demo (path/host optional) + when: nodejs_http_check_enabled | default(true) + ansible.builtin.shell: | + set -euo pipefail + base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" + path="{{ nodejs_verify_path | default('/node') }}" + url="$base${path}" + host="{{ nodejs_verify_host | default('') | trim }}" + + ok=0 + for i in 1 2 3 4 5 6 7 8 9 10; do + if [ -n "$host" ]; then + code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + else + code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + fi + echo "try $i: $url host=${host:-} -> $code" + if [ "$code" = "200" ]; then ok=1; break; fi + sleep 2 + done + test "$ok" = "1" + args: + executable: /bin/bash + changed_when: false + + - name: Teardown when VERIFY_TEARDOWN=1 + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/04-12.yml b/ansible/playbooks/verify/04-12.yml index 8ab00e2..e6e0f47 100644 --- a/ansible/playbooks/verify/04-12.yml +++ b/ansible/playbooks/verify/04-12.yml @@ -1,10 +1,101 @@ -- name: "04-12 noop verify" - hosts: localhost - gather_facts: false +- name: Deploy+Verify 04-12 nodejs TLS Ingress + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "04-12" - doc_filename: "04-12-nodejs-TLS与证书.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-12-nodejs-demo.yaml" + nodejs_manifest_dest: /tmp/nodejs-demo-04-12.yaml + # 默认不强行跑 HTTPS curl(需要 DNS/证书/入口);提供环境变量时再启用 + nodejs_http_check_enabled: "{{ (NODEJS_TLS_ENTRY_BASE is defined) and (NODEJS_TLS_HOST is defined) }}" + nodejs_verify_entry_base: "{{ NODEJS_TLS_ENTRY_BASE | default('https://app.example.local') }}" + nodejs_verify_path: "/api/" + nodejs_verify_host: "{{ NODEJS_TLS_HOST | default('app.example.local') }}" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Assert TLS secret exists (nodejs-demo-tls) + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl -n default get secret nodejs-demo-tls + args: + executable: /bin/bash + changed_when: false + + - name: Copy nodejs demo manifest + ansible.builtin.copy: + src: "{{ nodejs_manifest_src }}" + dest: "{{ nodejs_manifest_dest }}" + mode: "0644" + + - name: Apply nodejs demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + + - name: Rollout status nodejs-demo + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s + args: + executable: /bin/bash + changed_when: false + + - name: Assert Service targetPort matches expected (optional) + when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 + ansible.builtin.shell: | + set -euo pipefail + exp="{{ nodejs_expected_target_port | int }}" + got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') + echo "svc/nodejs-demo targetPort=$got expected=$exp" + test "$got" 
= "$exp" + args: + executable: /bin/bash + changed_when: false + + - name: Assert Endpoints exist + ansible.builtin.shell: | + set -euo pipefail + eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) + echo "endpoints.ip=$eps" + test -n "$eps" + args: + executable: /bin/bash + changed_when: false + + - name: HTTP check nodejs demo (path/host optional) + when: nodejs_http_check_enabled | default(true) + ansible.builtin.shell: | + set -euo pipefail + base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" + path="{{ nodejs_verify_path | default('/node') }}" + url="$base${path}" + host="{{ nodejs_verify_host | default('') | trim }}" + + ok=0 + for i in 1 2 3 4 5 6 7 8 9 10; do + if [ -n "$host" ]; then + code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + else + code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + fi + echo "try $i: $url host=${host:-} -> $code" + if [ "$code" = "200" ]; then ok=1; break; fi + sleep 2 + done + test "$ok" = "1" + args: + executable: /bin/bash + changed_when: false + + - name: Teardown when VERIFY_TEARDOWN=1 + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/04-13.yml b/ansible/playbooks/verify/04-13.yml index 643ef9c..41d94bf 100644 --- a/ansible/playbooks/verify/04-13.yml +++ b/ansible/playbooks/verify/04-13.yml @@ -1,10 +1,99 @@ -- name: "04-13 noop verify" - hosts: localhost - gather_facts: false +- name: Deploy+Verify 04-13 nodejs HPA + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.." 
- doc_id: "04-13" - doc_filename: "04-13-nodejs-HPA.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-13-nodejs-demo.yaml" + nodejs_manifest_dest: /tmp/nodejs-demo-04-13.yaml + nodejs_verify_entry_base: "{{ nodejs_entry_base | default('http://' ~ k3s_server_ip) }}" + nodejs_verify_path: "/node" + nodejs_expected_target_port: 8080 tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Copy nodejs demo manifest + ansible.builtin.copy: + src: "{{ nodejs_manifest_src }}" + dest: "{{ nodejs_manifest_dest }}" + mode: "0644" + + - name: Apply nodejs demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + + - name: Rollout status nodejs-demo + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s + args: + executable: /bin/bash + changed_when: false + + - name: Assert Service targetPort matches expected (optional) + when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 + ansible.builtin.shell: | + set -euo pipefail + exp="{{ nodejs_expected_target_port | int }}" + got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') + echo "svc/nodejs-demo targetPort=$got expected=$exp" + test "$got" = "$exp" + args: + executable: /bin/bash + changed_when: false + + - name: Assert Endpoints exist + ansible.builtin.shell: | + set -euo pipefail + eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) + echo "endpoints.ip=$eps" + test -n "$eps" + args: + executable: /bin/bash + changed_when: false + + - name: HTTP check nodejs demo 
(path/host optional) + when: nodejs_http_check_enabled | default(true) + ansible.builtin.shell: | + set -euo pipefail + base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" + path="{{ nodejs_verify_path | default('/node') }}" + url="$base${path}" + host="{{ nodejs_verify_host | default('') | trim }}" + + ok=0 + for i in 1 2 3 4 5 6 7 8 9 10; do + if [ -n "$host" ]; then + code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + else + code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + fi + echo "try $i: $url host=${host:-} -> $code" + if [ "$code" = "200" ]; then ok=1; break; fi + sleep 2 + done + test "$ok" = "1" + args: + executable: /bin/bash + changed_when: false + + - name: Assert HPA exists # must run before teardown, which deletes everything in the manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl -n default get hpa nodejs-demo + args: + executable: /bin/bash + changed_when: false + + - name: Teardown when VERIFY_TEARDOWN=1 # keep last: later tasks would find nothing to verify + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/04-14.yml b/ansible/playbooks/verify/04-14.yml index 642fc26..2a93201 100644 --- a/ansible/playbooks/verify/04-14.yml +++ b/ansible/playbooks/verify/04-14.yml @@ -1,10 +1,14 @@ - name: "04-14 noop verify" - hosts: localhost - gather_facts: false + hosts: k3s_server + become: true + run_once: true vars: - repo_root: "{{ playbook_dir }}/../../.."
- doc_id: "04-14" - doc_filename: "04-14-nodejs-GitOps与CI流水线.md" + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}" + nodejs_manifest_src: "{{ playbook_dir }}/../../files/04-01/04-14-nodejs-demo.yaml" + nodejs_manifest_dest: /tmp/nodejs-demo-04-14.yaml + nodejs_http_check_enabled: false tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" + - name: Include nodejs deploy+verify template + ansible.builtin.include_tasks: tasks/nodejs-demo-deploy-verify.yml diff --git a/ansible/playbooks/verify/05-01.yml b/ansible/playbooks/verify/05-01.yml index f276930..184e751 100644 --- a/ansible/playbooks/verify/05-01.yml +++ b/ansible/playbooks/verify/05-01.yml @@ -6,5 +6,5 @@ doc_id: "05-01" doc_filename: "05-01-k3s-部署homer首页面板.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml \ No newline at end of file diff --git a/ansible/playbooks/verify/05-02.yml b/ansible/playbooks/verify/05-02.yml index ba0b74b..aec8e7f 100644 --- a/ansible/playbooks/verify/05-02.yml +++ b/ansible/playbooks/verify/05-02.yml @@ -6,5 +6,5 @@ doc_id: "05-02" doc_filename: "05-02-onenav首页面板.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/05-03.yml b/ansible/playbooks/verify/05-03.yml index 512b725..d382717 100644 --- a/ansible/playbooks/verify/05-03.yml +++ b/ansible/playbooks/verify/05-03.yml @@ -6,5 +6,5 @@ doc_id: "05-03" doc_filename: "05-03-k3s-安装gitlab-含runner.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/05-04.yml 
b/ansible/playbooks/verify/05-04.yml index 2141884..0e8f26a 100644 --- a/ansible/playbooks/verify/05-04.yml +++ b/ansible/playbooks/verify/05-04.yml @@ -6,5 +6,5 @@ doc_id: "05-04" doc_filename: "05-04-k3s-配置gitlab-cicd.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/05-05.yml b/ansible/playbooks/verify/05-05.yml index 6722153..e393d72 100644 --- a/ansible/playbooks/verify/05-05.yml +++ b/ansible/playbooks/verify/05-05.yml @@ -6,5 +6,5 @@ doc_id: "05-05" doc_filename: "05-05-prometheus与grafana.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/05-06.yml b/ansible/playbooks/verify/05-06.yml index 45beede..2d9000e 100644 --- a/ansible/playbooks/verify/05-06.yml +++ b/ansible/playbooks/verify/05-06.yml @@ -6,5 +6,5 @@ doc_id: "05-06" doc_filename: "05-06-openlist挂载网盘与自动备份.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/05-07.yml b/ansible/playbooks/verify/05-07.yml index 42c277b..15c0513 100644 --- a/ansible/playbooks/verify/05-07.yml +++ b/ansible/playbooks/verify/05-07.yml @@ -6,5 +6,5 @@ doc_id: "05-07" doc_filename: "05-07-openclaw应用部署.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/05-08.yml b/ansible/playbooks/verify/05-08.yml index bacc73e..e76217f 100644 --- a/ansible/playbooks/verify/05-08.yml +++ b/ansible/playbooks/verify/05-08.yml @@ -6,5 +6,5 @@ doc_id: "05-08" doc_filename: 
"05-08-openclaw-k3s-实验部署.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/05-09.yml b/ansible/playbooks/verify/05-09.yml index 1a1b15c..2c352fe 100644 --- a/ansible/playbooks/verify/05-09.yml +++ b/ansible/playbooks/verify/05-09.yml @@ -6,5 +6,5 @@ doc_id: "05-09" doc_filename: "05-09-openclaw-web-小游戏网页平台.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/06-01.yml b/ansible/playbooks/verify/06-01.yml index 0ee5483..2304fca 100644 --- a/ansible/playbooks/verify/06-01.yml +++ b/ansible/playbooks/verify/06-01.yml @@ -6,5 +6,5 @@ doc_id: "06-01" doc_filename: "06-01-k3s-networkpolicy-故障排查.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/06-02.yml b/ansible/playbooks/verify/06-02.yml index def2a6d..34290f5 100644 --- a/ansible/playbooks/verify/06-02.yml +++ b/ansible/playbooks/verify/06-02.yml @@ -6,5 +6,5 @@ doc_id: "06-02" doc_filename: "06-02-运维小结.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/06-03.yml b/ansible/playbooks/verify/06-03.yml index 73f87e2..51bb90f 100644 --- a/ansible/playbooks/verify/06-03.yml +++ b/ansible/playbooks/verify/06-03.yml @@ -6,5 +6,5 @@ doc_id: "06-03" doc_filename: "06-03-k3s-自动备份与恢复-openlist-webdav.md" tasks: - - ansible.builtin.import_tasks: "{{ playbook_dir }}/_noop-tasks.yml" - + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: 
tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/07-01.yml b/ansible/playbooks/verify/07-01.yml new file mode 100644 index 0000000..7b3351d --- /dev/null +++ b/ansible/playbooks/verify/07-01.yml @@ -0,0 +1,10 @@ +- name: "07-01 noop verify" + hosts: localhost + gather_facts: false + vars: + repo_root: "{{ playbook_dir }}/../../.." + doc_id: "07-01" + doc_filename: "07-01-k3s-calico-dualstack.md" + tasks: + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/07-02.yml b/ansible/playbooks/verify/07-02.yml new file mode 100644 index 0000000..5775bd1 --- /dev/null +++ b/ansible/playbooks/verify/07-02.yml @@ -0,0 +1,10 @@ +- name: "07-02 noop verify" + hosts: localhost + gather_facts: false + vars: + repo_root: "{{ playbook_dir }}/../../.." + doc_id: "07-02" + doc_filename: "07-02-k3s-cilium-dualstack-ebpf.md" + tasks: + - name: Include noop doc verify tasks + ansible.builtin.include_tasks: tasks/noop-doc-verify.yml diff --git a/ansible/playbooks/verify/_noop-tasks.yml b/ansible/playbooks/verify/_noop-tasks.yml deleted file mode 100644 index b50fb63..0000000 --- a/ansible/playbooks/verify/_noop-tasks.yml +++ /dev/null @@ -1,32 +0,0 @@ -- name: Assert docs file exists - ansible.builtin.stat: - path: "{{ repo_root }}/docs/{{ doc_filename }}" - register: _doc_stat - -- name: Fail when docs file missing - ansible.builtin.assert: - that: - - _doc_stat.stat.exists - fail_msg: "docs file missing: docs/{{ doc_filename }}" - -- name: Find matching ansible/files doc_id directory - ansible.builtin.find: - paths: "{{ repo_root }}/ansible/files" - file_type: directory - patterns: "{{ doc_id }}-*" - use_regex: false - register: _files_dirs - -- name: Fail when ansible/files doc_id directory missing - ansible.builtin.assert: - that: - - _files_dirs.matched | int >= 1 - fail_msg: "ansible/files missing doc_id directory: ansible/files/{{ doc_id }}-*" - -- name: Show noop verification 
summary - ansible.builtin.debug: - msg: - - "doc_id={{ doc_id }}" - - "doc={{ doc_filename }}" - - "files_dirs={{ _files_dirs.files | map(attribute='path') | list }}" - diff --git a/ansible/playbooks/verify/tasks/nodejs-demo-deploy-verify.yml b/ansible/playbooks/verify/tasks/nodejs-demo-deploy-verify.yml new file mode 100644 index 0000000..66648de --- /dev/null +++ b/ansible/playbooks/verify/tasks/nodejs-demo-deploy-verify.yml @@ -0,0 +1,77 @@ +- name: Copy nodejs demo manifest + ansible.builtin.copy: + src: "{{ nodejs_manifest_src }}" + dest: "{{ nodejs_manifest_dest }}" + mode: "0644" + +- name: Apply nodejs demo manifest + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f {{ nodejs_manifest_dest }} + args: + executable: /bin/bash + changed_when: true + +- name: Rollout status nodejs-demo + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/nodejs-demo -n default --timeout=180s + args: + executable: /bin/bash + changed_when: false + +- name: Assert Service targetPort matches expected (optional) + when: nodejs_expected_target_port is defined and (nodejs_expected_target_port | int) > 0 + ansible.builtin.shell: | + set -euo pipefail + exp="{{ nodejs_expected_target_port | int }}" + got=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get svc nodejs-demo -n default -o jsonpath='{.spec.ports[0].targetPort}') + echo "svc/nodejs-demo targetPort=$got expected=$exp" + test "$got" = "$exp" + args: + executable: /bin/bash + changed_when: false + +- name: Assert Endpoints exist + ansible.builtin.shell: | + set -euo pipefail + eps=$(KUBECONFIG={{ k3s_kubeconfig }} kubectl get endpoints nodejs-demo -n default -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true) + echo "endpoints.ip=$eps" + test -n "$eps" + args: + executable: /bin/bash + changed_when: false + +- name: HTTP check nodejs demo (path/host optional) + when: nodejs_http_check_enabled | default(true) + ansible.builtin.shell: | 
+ set -euo pipefail + base="{{ nodejs_verify_entry_base | trim | regex_replace('/+$','') }}" + path="{{ nodejs_verify_path | default('/node') }}" + url="$base${path}" + host="{{ nodejs_verify_host | default('') | trim }}" + + ok=0 + for i in 1 2 3 4 5 6 7 8 9 10; do + if [ -n "$host" ]; then + code=$(curl -s -o /dev/null -w "%{http_code}" -H "Host: ${host}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + else + code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 --max-time 8 "$url" 2>/dev/null || echo "000") + fi + echo "try $i: $url host=${host:-} -> $code" + if [ "$code" = "200" ]; then ok=1; break; fi + sleep 2 + done + test "$ok" = "1" + args: + executable: /bin/bash + changed_when: false + +- name: Teardown when VERIFY_TEARDOWN=1 + when: verify_teardown == "1" + ansible.builtin.shell: | + set -e + KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ nodejs_manifest_dest }} --ignore-not-found=true + args: + executable: /bin/bash + changed_when: true diff --git a/ansible/playbooks/verify/tasks/noop-doc-verify.yml b/ansible/playbooks/verify/tasks/noop-doc-verify.yml new file mode 100644 index 0000000..a5fb5dc --- /dev/null +++ b/ansible/playbooks/verify/tasks/noop-doc-verify.yml @@ -0,0 +1,87 @@ +- name: Assert docs file exists + ansible.builtin.stat: + path: "{{ repo_root }}/docs/{{ doc_filename }}" + register: _doc_stat + +- name: Fail when docs file missing + ansible.builtin.assert: + that: + - _doc_stat.stat.exists + fail_msg: "docs file missing: docs/{{ doc_filename }}" + +- name: Find matching ansible/files doc_id directory + ansible.builtin.find: + paths: "{{ repo_root }}/ansible/files" + file_type: directory + patterns: "{{ doc_id }}" + use_regex: false + register: _files_dirs + +- name: Fail when ansible/files doc_id directory missing + ansible.builtin.assert: + that: + - _files_dirs.matched | int >= 1 + fail_msg: "ansible/files missing doc_id directory: ansible/files/{{ doc_id }}" + +- name: Show noop verification 
summary + ansible.builtin.debug: + msg: + - "doc_id={{ doc_id }}" + - "doc={{ doc_filename }}" + - "files_dirs={{ _files_dirs.files | map(attribute='path') | list }}" + +- name: Verify cluster reachable (kubectl get nodes) [runbook baseline] + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl get nodes + args: + executable: /bin/bash + delegate_to: "{{ groups['k3s_server'][0] }}" + become: true + run_once: true + changed_when: false + +- name: Verify core namespace exists (kube-system) [runbook baseline] + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} kubectl get ns kube-system + args: + executable: /bin/bash + delegate_to: "{{ groups['k3s_server'][0] }}" + become: true + run_once: true + changed_when: false + +- name: Find YAML manifests under ansible/files doc_id dirs + ansible.builtin.find: + paths: "{{ _files_dirs.files | map(attribute='path') | list }}" + file_type: file + patterns: + - "*.yml" + - "*.yaml" + recurse: true + use_regex: false + register: _files_manifests + +- name: Show manifest count summary + ansible.builtin.debug: + msg: + - "doc_id={{ doc_id }}" + - "manifest_files={{ _files_manifests.matched | default(0) }}" + - "manifest_paths={{ (_files_manifests.files | map(attribute='path') | list)[:12] }}" + +- name: Server-side dry-run apply (kubectl apply --dry-run=server) [doc assertion] + ansible.builtin.shell: | + set -euo pipefail + KUBECONFIG={{ k3s_kubeconfig | default('/etc/rancher/k3s/k3s.yaml') }} \ + kubectl apply --dry-run=server -f "{{ item.path }}" + args: + executable: /bin/bash + loop: "{{ _files_manifests.files }}" + loop_control: + label: "{{ item.path }}" + delegate_to: "{{ groups['k3s_server'][0] }}" + become: true + run_once: true + changed_when: false + when: (_files_manifests.matched | default(0) | int) > 0 diff --git a/bmad.list.md b/bmad.list.md new file mode 100644 index 
0000000..0718f81 --- /dev/null +++ b/bmad.list.md @@ -0,0 +1,99 @@ +# BMAD / WDS 技能清单(Markdown) + +本文件由根目录 `bmad.list` 对应的 **技能 id** 展开为说明文档。每个技能在仓库中的实现路径为: + +`.cursor/skills/<技能 id>/SKILL.md` + +**用法提示**:在 Cursor 中可通过 `@` 引用技能目录,或按描述中的触发语选用对应工作流。 + +--- + +## 技能总表 + + +| 技能 ID | 中文标签 | 能力简介 | +| ---------------------------------------- | -------------------- | --------------------------------------------------------------- | +| `bmad-advanced-elicitation` | BMAD 高级需求获取 | 推动模型对近期输出再思考、精炼与改进;适用于苏格拉底、第一性原理、事前验尸、红队等深度批判场景。 | +| `bmad-agent-analyst` | BMAD 代理分析师 | 战略业务分析与需求专家;用户可点名与 **Mary** 对话。 | +| `bmad-agent-architect` | BMAD 代理架构师 | 系统架构与技术设计牵头;用户可点名 **Winston**。 | +| `bmad-agent-dev` | BMAD 代理开发 | 资深软件工程师,按故事执行实现;用户可点名 **Amelia**。 | +| `bmad-agent-pm` | BMAD 代理产品经理 | PRD 与需求发现;用户可点名 **John**。 | +| `bmad-agent-qa` | BMAD 代理质量保证 | 测试自动化与覆盖;用户可点名 **Quinn**。 | +| `bmad-agent-quick-flow-solo-dev` | BMAD 代理快速流程独立开发 | 精英全栈:快速规格 + 落地实现;用户可点名 **Barry**。 | +| `bmad-agent-sm` | BMAD 代理 Scrum Master | 冲刺计划、故事准备与敏捷仪式;用户可点名 **Bob**。 | +| `bmad-agent-tech-writer` | BMAD 代理技术撰稿人 | 技术文档与知识策展;用户可点名 **Paige**。 | +| `bmad-agent-ux-designer` | BMAD 代理用户体验设计师 | UX/UI 专项;用户可点名 **Sally**。 | +| `bmad-brainstorming` | BMAD 头脑风暴 | 用多种创意技法引导互动式头脑风暴与发散。 | +| `bmad-check-implementation-readiness` | BMAD 检查实施准备情况 | 校验 PRD、UX、架构与史诗等是否已具备开工条件。 | +| `bmad-cis-agent-brainstorming-coach` | BMAD CIS 头脑风暴教练 | CIS 专项:高水平头脑风暴与引导;用户可点名 **Carson**。 | +| `bmad-cis-agent-creative-problem-solver` | BMAD CIS 创意问题解决 | CIS 专项:系统化方法论破解复杂问题;用户可点名 **Dr. 
Quinn**。 | +| `bmad-cis-agent-design-thinking-coach` | BMAD CIS 设计思维教练 | CIS 专项:人本设计流程引导;用户可点名 **Maya**。 | +| `bmad-cis-agent-innovation-strategist` | BMAD CIS 创新策略师 | CIS 专项:颠覆式创新与商业模式;用户可点名 **Victor**。 | +| `bmad-cis-agent-presentation-master` | BMAD CIS 演示大师 | CIS 专项:幻灯片、路演与视觉叙事;用户可点名 **Caravaggio**。 | +| `bmad-cis-agent-storyteller` | BMAD CIS 故事讲述者 | CIS 专项:成熟叙事框架讲故事;用户可点名 **Sophia**。 | +| `bmad-cis-design-thinking` | BMAD CIS 设计思维 | 以共情驱动的人本设计流程工作坊。 | +| `bmad-cis-innovation-strategy` | BMAD CIS 创新策略 | 识别颠覆机会并设计商业模式创新路径。 | +| `bmad-cis-problem-solving` | BMAD CIS 问题解决 | 结构化问题解决方法论(引导式)。 | +| `bmad-cis-storytelling` | BMAD CIS 故事讲述 | 用故事框架组织叙事与表达。 | +| `bmad-code-review` | BMAD 代码审查 | 多层并行对抗评审(如盲审、边界猎人、验收审计)并分类为可执行项。 | +| `bmad-correct-course` | BMAD 纠正路线 | 冲刺执行中的重大变更与路线纠偏管理。 | +| `bmad-create-architecture` | BMAD 创建架构 | 产出解决方案/技术架构决策,便于 AI 代理一致遵循。 | +| `bmad-create-epics-and-stories` | BMAD 创建史诗和用户故事 | 将需求拆解为史诗与用户故事列表。 | +| `bmad-create-prd` | BMAD 创建产品文档 | 从零撰写产品需求文档(PRD)。 | +| `bmad-create-story` | BMAD 创建用户故事 | 生成带完整实现上下文的独立故事文件。 | +| `bmad-create-ux-design` | BMAD 创建用户体验设计 | 规划 UX 模式与设计规格。 | +| `bmad-dev-story` | BMAD 开发用户故事 | 按已填上下文的故事规格执行开发与实现。 | +| `bmad-distillator` | BMAD 蒸馏器 | 对源材料做面向 LLM 的「无损压缩」式蒸馏。 | +| `bmad-document-project` | BMAD 文档项目 | 为棕地项目生成面向 AI 的文档与扫描产出(本技能遵循 `workflow.md` / `instructions.md`)。 | +| `bmad-domain-research` | BMAD 领域研究 | 领域与行业调研。 | +| `bmad-edit-prd` | BMAD 编辑 PRD | 在已有 PRD 上迭代编辑。 | +| `bmad-editorial-review-prose` | BMAD 编辑审核散文 | 从文笔与沟通效果角度审校文本。 | +| `bmad-editorial-review-structure` | BMAD 编辑审核结构 | 从结构上做删减、重组与简化,保持可读性。 | +| `bmad-generate-project-context` | BMAD 生成项目上下文 | 生成含 AI 协作规则的 `project-context.md`。 | +| `bmad-help` | BMAD 帮助 | 根据当前状态与问题推荐下一步或合适技能。 | +| `bmad-index-docs` | BMAD 索引文档 | 为指定文件夹生成或更新 `index.md` 文档索引。 | +| `bmad-init` | BMAD 初始化 | 初始化 BMad 项目配置并加载各模块所需变量。 | +| `bmad-market-research` | BMAD 市场调研 | 竞品、客户与市场向调研。 | +| `bmad-party-mode` | BMAD 派对模式 | 编排多个 BMAD 角色进行自然的多代理对话。 | +| `bmad-product-brief` | BMAD 
产品简报 | 引导或自主完成产品简报的创建与更新。 | +| `bmad-qa-generate-e2e-tests` | BMAD 质量保证生成端到端测试 | 为既有功能生成 E2E 自动化测试。 | +| `bmad-quick-dev` | BMAD 快速开发 | 在现有架构与约定下实现需求、修缺陷、做小范围功能改动。 | +| `bmad-retrospective` | BMAD 回顾 | 史诗结束后复盘、提炼经验与评估成效。 | +| `bmad-review-adversarial-general` | BMAD 对抗性通用评审 | 「唱反调」式批判评审并输出发现报告。 | +| `bmad-review-edge-case-hunter` | BMAD 边缘案例猎手评审 | 穷尽分支与边界,仅报告未覆盖的边界情况(方法驱动)。 | +| `bmad-shard-doc` | BMAD 分片文档 | 按二级标题等规则将大型 Markdown 拆成多篇小文件。 | +| `bmad-sprint-planning` | BMAD 冲刺计划 | 由史诗生成冲刺状态跟踪与计划。 | +| `bmad-sprint-status` | BMAD 冲刺状态 | 汇总当前冲刺进度与风险。 | +| `bmad-tea` | BMAD 测试架构师 | 测试总架构与质量顾问(风险驱动测试、质量门禁等);用户可点名 **Murat**。 | +| `bmad-teach-me-testing` | BMAD 教我测试 | 分阶段、结构化教授测试实践。 | +| `bmad-technical-research` | BMAD 技术研究 | 技术与架构方向的调研报告。 | +| `bmad-testarch-atdd` | BMAD 测试架构 ATDD | 以 ATDD/TDD 节奏生成可失败的验收测试。 | +| `bmad-testarch-automate` | BMAD 测试架构自动化 | 扩展代码库的自动化测试覆盖。 | +| `bmad-testarch-ci` | BMAD 测试架构 CI | 搭建包含测试执行环节的 CI/CD 质量流水线。 | +| `bmad-testarch-framework` | BMAD 测试架构框架 | 初始化 Playwright、Cypress 等测试框架。 | +| `bmad-testarch-nfr` | BMAD 测试架构 NFR | 评估性能、安全、可靠性等非功能需求。 | +| `bmad-testarch-test-design` | BMAD 测试架构测试设计 | 系统级或史诗级测试计划与测试策略。 | +| `bmad-testarch-test-review` | BMAD 测试架构测试审查 | 按最佳实践评审测试设计与实现质量。 | +| `bmad-testarch-trace` | BMAD 测试架构跟踪 | 生成追溯矩阵与质量门禁结论。 | +| `bmad-validate-prd` | BMAD 验证产品文档 | 按规范校验 PRD 是否达标。 | +| `wds-0-alignment-signoff` | WDS 0 对齐签核 | 在正式启动前对齐想法、目标与边界。 | +| `wds-0-project-setup` | WDS 0 项目设置 | 项目接入:判断类型、复杂度、技术栈并路由到合适阶段。 | +| `wds-1-project-brief` | WDS 1 项目简报 | 建立项目上下文,作为后续设计工作的基础。 | +| `wds-2-trigger-mapping` | WDS 2 触发器映射 | 通过结构化工作坊将业务目标映射到用户心理与触发点。 | +| `wds-3-scenarios` | WDS 3 场景 | 基于触发地图产出带微步骤的 UX 场景大纲。 | +| `wds-4-ux-design` | WDS 4 用户体验设计 | 在场景驱动下产出详细的视觉与交互规格。 | +| `wds-5-agentic-development` | WDS 5 智能开发 | 多智能体协作下的开发、测试与逆向分析。 | +| `wds-6-asset-generation` | WDS 6 资产生成 | 按规格通过 AI 生成视觉与文案类资产。 | +| `wds-7-design-system` | WDS 7 设计系统 | 设计系统组件与设计 Token 的创建、导入与维护。 | +| `wds-8-product-evolution` | WDS 8 产品演进 | 针对棕地产品的精简版全流程改进管道。 | +| 
`wds-agent-freya-ux` | WDS 智能体 Freya 用户体验 | WDS 侧战略 UX 与设计思维伙伴;用户可点名 **Freya**。 | +| `wds-agent-saga-analyst` | WDS 智能体 Saga 分析师 | WDS 侧战略分析与产品发现伙伴;用户可点名 **Saga**。 | + + +--- + +## 与纯文本 `bmad.list` 的关系 + +- `bmad.list`:仅含 **技能 id + 简短中文标签**,适合脚本解析或快速检索。 +- `bmad.list.md`(本文件):在同一套 id 上补充 **能力简介** 与可选角色名,便于人类阅读与选型。 + diff --git a/docs/00-00-构建总览.md b/docs/00-00-构建总览.md index d958cad..0576fd7 100644 --- a/docs/00-00-构建总览.md +++ b/docs/00-00-构建总览.md @@ -2,74 +2,132 @@ > 本仓库主文档入口。建议从这里开始阅读。 +## TL;DR + +- **本文性质**:目录导航/阅读顺序说明(不对应矩阵验收用例) +- **真机一键验收**:`./scripts/acceptance.sh`(可选铺栈)或 `./scripts/verify.sh full`(仅验收) +- **成功判据**:你能从本文快速定位到要跑的 `doc_id`、入口脚本与下一步文档 +- **排障**:若验收失败,先跑 `./scripts/verify.sh preflight`,再看对应 `doc_id` 的 playbook 输出 + +**路径约定**:本文档中的链接,凡写作 `docs/...`、`ansible/...`、`scripts/...`、`project-context.md` 的,均相对于**仓库根目录**(与在 `docs/` 内打开本文件时的相对路径无关,避免混用 `../`)。 + ## 目录约定 - 文档:`docs/`(Kubernetes 等可复用清单见 `ansible/files/`,与 Ansible playbook 共用) - 脚本:`scripts/` - 脚本入口:`scripts/README.md` +- 仓库契约(实现与改动规则):`project-context.md` ### 编号含义速查 -- `00-**`:总览与基础概念(入口、索引、验证矩阵、部署环境说明、未来规划) -- `01-**`:安装与基础环境(控制节点/工作节点/OpenWrt HAProxy/Cloudflare/NFS 等) +- `00-**`:基本知识与基础环境(入口、索引、部署环境说明、验证框架与准备清单) +- `01-**`:安装与基础环境(见 `01-00` 系列说明页) - `02-**`:Nginx 矩阵**分篇场景说明**(M1~M4 各场景独立页;**综合一键部署见 `02-05`**) -- `03-**`:集群侧配置扩展;**03-04~03-10 按推荐阅读顺序编号**(Traefik 自定义端口 → Tunnel → local-path → NFS → Longhorn → HA → GitOps) -- `04-**`:Node.js 高级部署(`04-01` 主入口 + `04-02`~`04-14` 部署分项;与 nginx 矩阵编号无强制对应) -- `05-**`:常用应用部署(Homer、OneNav、GitLab、监控、openlist 等) -- `06-**`:排障与运维总结(NetworkPolicy 排查、运维小结) +- `03-**`:集群侧配置扩展(见 `03-00` 系列说明页) +- `04-**`:Node.js 高级部署(见 `04-00` 系列说明页) +- `05-**`:常用应用部署(见 `05-00` 系列说明页) +- `06-**`:排障与运维总结(见 `06-00` 系列说明页) +- `07-**`:网络与 CNI 实验(见 `07-00` 系列说明页;多为破坏性/换 CNI 场景) +- `00-**` 的 `verify/00-*.yml` 仅做文档/路径一致性校验,不做 `kubectl apply`、Helm 安装或任何变更集群状态的动作 --- -## 推荐安装顺序 +## 学习主线(6 步) + +以下为仓库**默认主线**;完整扩展示意见下方流程图。更长的按主题展开顺序见 **附录:完整推荐阅读顺序**。 + +## 排障 + +- 
**不知道该跑哪个命令**:先按本文“学习主线(6 步)”找到对应 `doc_id`,再用 `./scripts/verify.sh run <doc_id>` 或 `./scripts/verify.sh full`。 +- **verify.sh 报缺 playbook**:确认存在 `ansible/playbooks/verify/<doc_id>.yml`,并可运行 `python3 scripts/validate_matrix_playbooks.py` 做存在性校验。 +- **连接不上集群**:在控制端执行 `./scripts/verify.sh preflight`,优先修复 inventory/SSH/私钥权限问题。 + +### 流程图(主线与分叉) + +```mermaid +flowchart TD + S1[1 总览 + 按需 00-02 环境说明] + S2[2 概念 00-01] + S3[3 装集群 01-06 或 01-01+01-02] + S4[4 kubectl 节点全 Ready] + S5[5 Nginx 02-00 → 02-05] + S6[6 Node.js 04-01] + S1 --> S2 + S2 --> S3 + S3 --> S4 + S4 --> S5 + S5 --> S6 + S6 --> B1[分叉: Traefik 03-01 / 03-02 等] + S6 --> B2[分叉: 存储 03-05~03-07 等] + S6 --> B3[分叉: 应用 05-xx] + S6 --> B4[排障: 06-xx / scripts] + S1 -.->|30分钟版跳过| S3 + S2 -.->|30分钟版跳过| S3 +``` + +### 6 步说明 + +1. **总览与环境**:精读本篇;需要对照节点、IP、版本时打开 `00-02-部署环境说明.md`。 +2. **概念速查**:`00-01-k3s-基础概念.md`。**时间紧可跳过**,卡术语再读。 +3. **安装 K3s**:`01-01-k3s-控制节点含traefik.md` + `01-02-k3s-工作节点.md`,或一键 `01-06-节点初始化-ansible-实践.md`(仓库根亦可配合 `scripts/deploy-lab.sh`)。 +4. **验收**:`kubectl get nodes`,所有节点 Ready。 +5. **HTTP 入口验证**:`02-00-nginx-系列说明.md` → `02-05-nginx-验证矩阵-一键部署.md`(清单真源:`ansible/files/02-05/`)。可选:`./scripts/verify.sh run 02-05`。 +6. **工作负载主线**:`04-01-k3s-nodejs-高级部署.md`。`04-02`~`04-14` 为分项,**按需阅读**,不列入主线编号。 + +> 推进「已验证」的前置见 `00-04-待验证项-验证前准备.md`。 + +### 30 分钟快速路径(4 步) + +相当于**跳过第 2 步**并**压缩第 1 步**(只抓总览要点):装集群 → Ready → `02-05` → 不通则 `scripts/README.md` 排障。与仓库根 `README.md` 中「30 分钟快速通关」一致。 + +--- + +## 附录:完整推荐阅读顺序(按主题展开) + +下列顺序适合**已走完 6 步主线**、或需要按编号通读全库时查阅;**不必一次性做完**。 1. `00-01-k3s-基础概念.md` 2. `01-01-k3s-控制节点含traefik.md`(或直接用 `01-06-节点初始化-ansible-实践.md` 一键自动化) 3. `01-02-k3s-工作节点.md` 4. `01-03-armv7-standalone-docker.md` 5. `01-07-openwrt-haproxy.md`(按需:网关负载均衡) -6. `04-03-k3s-nginx-demo.md` +6. `02-00-nginx-系列说明.md` → `02-05-nginx-验证矩阵-一键部署.md`(HTTP 矩阵与入口验证;清单真源:`ansible/files/02-05/`) 7. `04-01-k3s-nodejs-高级部署.md` -8. `04-02-nodejs-镜像与运行命令.md` -9. `04-03-nodejs-环境变量与配置注入.md` -10. `04-04-nodejs-端口与Service.md` -11. 
`04-05-nodejs-资源请求与限制.md` -13. `04-06-nodejs-探针与健康检查.md` -14. `04-07-nodejs-调度与亲和.md` -15. `04-08-nodejs-安全上下文.md` -16. `04-09-nodejs-存储与卷.md` -17. `04-10-nodejs-Ingress与Traefik.md` -18. `04-11-nodejs-副本与滚动发布.md` -19. `04-12-nodejs-TLS与证书.md` -20. `04-13-nodejs-HPA.md` -21. `04-14-nodejs-GitOps与CI流水线.md` -22. `02-05-nginx-验证矩阵-一键部署.md`(建议先读 `02-00-nginx-系列说明.md`) -23. `03-01-k3s-traefik-dashboard.md` -24. `03-02-k3s-traefik-acme.md` -25. `03-03-k3s-traefik-dashboard-acme.md`(推荐顺序:先 03-01、03-02) -26. `03-04-k3s-cloudflare-tunnel-配置接入.md`(按需:Cloudflare Tunnel 接入集群) -27. `03-05-k3s-local-path-pvc.md`(K3s 自带 local-path,单副本本地持久化) -28. `03-06-k3s-使用nfs存储.md`(按需:已有 NFS 时 PV/PVC) -29. `03-07-k3s-longhorn-持久化存储.md`(重状态、快照/备份,建议部署 GitLab 等前统一规划) -30. `03-08-k3s-ha-集群配置与切换.md`(按需:双控制节点 HA,配合 `01-04`) -31. `03-09-k3s-gitops-集群配置管理.md`(框架草案:Argo CD / Flux) - -> 想确认这些步骤是否已经在真实环境验证,请查看 `00-02-验证矩阵.md`。 -> 本仓库验证环境说明见 `00-04-部署环境说明.md`。 +8. `04-02-nodejs-端口与Service.md` +9. `04-03-nodejs-镜像与运行命令.md` +10. `04-04-nodejs-环境变量与配置注入.md` +11. `04-05-nodejs-探针与健康检查.md` +12. `04-06-nodejs-副本与滚动发布.md` +13. `04-07-nodejs-Ingress与Traefik.md` +14. `04-08-nodejs-资源请求与限制.md` +15. `04-09-nodejs-调度与亲和.md` +16. `04-10-nodejs-安全上下文.md` +17. `04-11-nodejs-存储与卷.md` +18. `04-12-nodejs-TLS与证书.md` +19. `04-13-nodejs-HPA.md` +20. `04-14-nodejs-GitOps与CI流水线.md` +21. `03-01-k3s-traefik-dashboard.md` +22. `03-02-k3s-traefik-acme.md` +23. `03-03-k3s-traefik-dashboard-acme.md`(推荐顺序:先 03-01、03-02) +24. `03-04-k3s-cloudflare-tunnel-配置接入.md`(按需:Cloudflare Tunnel 接入集群) +25. `03-05-k3s-local-path-pvc.md`(K3s 自带 local-path,单副本本地持久化) +26. `03-06-k3s-使用nfs存储.md`(按需:已有 NFS 时 PV/PVC) +27. `03-07-k3s-longhorn-持久化存储.md`(重状态、快照/备份,建议部署 GitLab 等前统一规划) +28. `03-08-k3s-ha-集群配置与切换.md`(按需:双控制节点 HA,配合 `01-08`) +29. 
`03-09-k3s-gitops-集群配置管理.md`(框架草案:Argo CD / Flux) --- -## 主线导航 +## 主线之后的分叉(按需) -- `01-02-k3s-工作节点.md` -- `03-01-k3s-traefik-dashboard.md` -- `04-03-k3s-nginx-demo.md` -- `04-01-k3s-nodejs-高级部署.md`(文末:`04-02`~`04-14` Node.js 部署分项) -- `03-02-k3s-traefik-acme.md` -- `03-04-k3s-cloudflare-tunnel-配置接入.md` -- `03-05-k3s-local-path-pvc.md` -- `03-06-k3s-使用nfs存储.md` -- `03-07-k3s-longhorn-持久化存储.md` -- `06-01-k3s-networkpolicy-故障排查.md` +不占主线 6 步序号;按目标点击即可。 + +- **Traefik / 入口**:`03-01-k3s-traefik-dashboard.md`、`03-02-k3s-traefik-acme.md`、`03-04-k3s-cloudflare-tunnel-配置接入.md` +- **存储**:`03-05-k3s-local-path-pvc.md`、`03-06-k3s-使用nfs存储.md`、`03-07-k3s-longhorn-持久化存储.md` +- **高可用 / GitOps**:`01-08-双控制节点ha.md`、`03-08-k3s-ha-集群配置与切换.md`、`03-09-k3s-gitops-集群配置管理.md` +- **工作节点与 HTTP 验证**(主线中已覆盖,此处为直达):`01-02-k3s-工作节点.md`、`02-05-nginx-验证矩阵-一键部署.md`(可先读 `02-00-nginx-系列说明.md`) +- **Node.js 分项**(`04-02`~`04-14`,从 `04-01` 文末进入) +- **排障**:`06-01-k3s-networkpolicy-故障排查.md`、`scripts/README.md` --- @@ -84,16 +142,16 @@ - `02-04-nginx-worker-ingressroute.md`(M4) - `02-05-nginx-验证矩阵-一键部署.md`(HTTP-only 综合部署) -> **说明**:若曾规划「Node.js 与 nginx 对照」的 M1~M4 矩阵独立文档,**尚未在本仓库落盘**;当前 **`04-05`~`04-08` 已用于** Node.js **部署分项**(资源/探针/调度/安全)。后续若补充 Node.js 矩阵,请**另起编号**(例如 `04-20` 起或归入专题),避免与现有 `04-**` 冲突。 +> **说明**:HTTP/nginx 的 M1~M4 分篇与综合一键部署以 **`02-01`~`02-05`** 为真源(与 `04-**` 编号无关)。若曾规划「与 nginx 矩阵逐条对照的 Node.js 独立分册」,**尚未在本仓库落盘**。`04-02`~`04-14` 为 Node.js **分项**(已按 Core→Plus→Pro 从简到繁编号,与 nginx 矩阵无逐条对应)。后续若补充 Node.js 专题矩阵,请**另起编号**(例如 `04-20` 起或归入专题),避免与现有 `04-**` 冲突。 --- ## 专题导航 -- `00-04-部署环境说明.md`(节点布局、IP、OS、K3s 版本等,便于对照与复现) +- `00-02-部署环境说明.md`(节点布局、IP、OS、K3s 版本等,便于对照与复现) - `01-06-节点初始化-ansible-实践.md`(Ansible 一键安装 k3s 集群,已验证) - `01-07-openwrt-haproxy.md`(按需:网关负载均衡) -- nginx 矩阵:`ansible/playbooks/nginx-matrix-deploy.yml`(02-05)、`ansible/playbooks/nginx-matrix-tls-deploy.yml`(03-02) +- nginx 矩阵:`ansible/playbooks/verify/02-05.yml`(薄封装导入 `ansible/playbooks/verify/02-05.yml`);TLS 铺栈另见 
`ansible/playbooks/verify/03-02.yml`(`deploy-lab nginx-matrix-tls`)。 - `03-04-k3s-cloudflare-tunnel-配置接入.md`(Cloudflare Tunnel 完整流程:Zero Trust + 集群接入) - `05-03-k3s-安装gitlab-含runner.md` @@ -106,7 +164,7 @@ - `01-05-armv7-nfs服务安装.md` - `05-06-openlist挂载网盘与自动备份.md` - `06-02-运维小结.md` -- `01-04-双控制节点ha.md` +- `01-08-双控制节点ha.md` - `03-08-k3s-ha-集群配置与切换.md` - `03-09-k3s-gitops-集群配置管理.md`(框架草案) @@ -121,5 +179,13 @@ ## 未来规划 -- `00-03-未来规划与待补功能.md`:记录还没做、但已经想到的能力清单与路线图,方便以后按需补齐。 +- 未来规划与维护者审查结论已收敛到 `project-context.md`(作为仓库契约与长期可维护约束),避免 `00` 系列入口层堆积噪音。 +--- + +## 网络与 CNI 实验(07-*,按需) + +- `07-01-k3s-calico-dualstack.md`(Calico 双栈实验) +- `07-02-k3s-cilium-dualstack-ebpf.md`(Cilium 双栈与 eBPF) + +> 与主线 `01-06` + Flannel 安装路径不同;仅在独立实验环境或充分备份后阅读、操作。验证矩阵与 `verify.sh` 对 `07-*` 仅为 **noop(文档 + 占位目录存在性)**。 diff --git a/docs/00-01-k3s-基础概念.md b/docs/00-01-k3s-基础概念.md index 6e77cf3..8fe7961 100644 --- a/docs/00-01-k3s-基础概念.md +++ b/docs/00-01-k3s-基础概念.md @@ -2,11 +2,23 @@ > 入门速查:先把核心概念看明白,再去做安装与排障。 +## TL;DR + +- **本文性质**:概念/术语速查(不对应独立铺栈) +- **推荐动作**:按 `00-00-构建总览.md` 进入主线;真机验收用 `./scripts/verify.sh full` +- **成功判据**:能看懂后续文档中的 K3s/K8s 术语(node/pod/service/ingress 等) +- **排障**:执行类问题请转到对应实验篇(`01-*`/`02-*`/`03-*`)的「排障」 + ## 阅读建议 - 新手按本页顺序读完即可 - 遇到术语不懂,先回这里再继续操作文档 +## 排障 + +- **概念读不懂**:先看 `00-02-部署环境说明.md` 了解本仓库“实验室约定”,再回到本篇对照术语与节点角色。 +- **想跑命令但本篇没有**:本篇不提供部署/验收命令;按 `00-00` 找到对应实验篇,再跑 `./scripts/verify.sh run <doc_id>`。 + ## 1. K3s 是什么 - 轻量 Kubernetes 发行版,适合 Homelab。 @@ -116,7 +128,7 @@ K3s 自带 **local-path-provisioner**:当你创建 PVC 且不指定 `storageCl - **K3s 不会自动帮你搬本地数据**:调度器只管 Pod 放哪台节点,不会同步 `/var/lib/...` 或自建目录;所以“节点故障自动漂移”和“数据高可用”是两件事,要分别设计。 - **常见做法**:重要数据用共享存储(NFS / 云盘 / CSI),通过 PV/PVC 给 Pod 用(参考 `01-05`、`03-07`);缓存、临时文件用本地目录(`emptyDir` 或 `hostPath`),接受节点挂了可丢;或靠备份/同步把本地目录定期同步到别处,再在新节点恢复。 -**用途**:搞清楚数据放哪、节点挂了会不会丢,才能设计备份和高可用,不踩坑。 +**用途**:搞清楚数据放哪、节点挂了会不会丢,才能设计备份和高可用,避免常见存储与可用性误区。 ## 9. 
删除部署 @@ -124,8 +136,8 @@ K3s 自带 **local-path-provisioner**:当你创建 PVC 且不指定 `storageCl - **用法**:用部署时的 YAML 删除,与 `apply` 一一对应;或按资源类型和名称逐个删除。 - **示例**: - `kubectl delete -f nginx-matrix.yaml`:删除该文件定义的所有资源 - - `kubectl delete -f ansible/files/02-05-nginx-matrix/ -R`:递归删除该目录下所有 manifest 定义的资源(02-05 矩阵) - - `kubectl delete -f ansible/files/03-02-nginx-matrix-tls/ -R`:删除 03-02 TLS 矩阵(或见该文档 / playbook `nginx-matrix-tls-deploy.yml -e mode=cleanup`) + - `kubectl delete -f ansible/files/02-05/ -R`:递归删除该目录下所有 manifest 定义的资源(02-05 矩阵) + - `kubectl delete -f ansible/files/03-02/ -R`:删除 03-02 TLS 矩阵(或见该文档 / playbook `03-02.yml -e nginx_matrix_tls_enable=true -e mode=cleanup`) - `kubectl delete deployment nginx-m1 -n default`:按名称删除单个 Deployment - **用途**:清理测试应用、下线服务、重装部署前先删除旧资源。资源删除后对应 Pod 会被终止,数据(etcd 中记录)一并移除;若用了 PVC,PVC 本身通常需单独删除。 diff --git a/docs/00-04-部署环境说明.md b/docs/00-02-部署环境说明.md similarity index 51% rename from docs/00-04-部署环境说明.md rename to docs/00-02-部署环境说明.md index 847843d..277276e 100644 --- a/docs/00-04-部署环境说明.md +++ b/docs/00-02-部署环境说明.md @@ -2,6 +2,13 @@ > 本文描述本仓库文档所针对的**验证环境**:节点布局、IP、OS、K3s 版本等。其他环境按需对照调整。 +## TL;DR + +- **自动化验收(基线)**:本篇为环境说明文档,不参与 `verify.sh run-all/full`;按本文完成环境核对即可 +- **你需要准备**:`ansible/inventory.ini` 可 SSH;(可选)每台 k3s 节点独立挂载 `/storage` +- **成功判据**:文档存在且 `ansible/files/00-02` 目录可追溯;节点规划/IP/角色与本文一致(或你已按本文做差异化对照) +- **失败排障**:见本文「排障」小节(SSH/inventory/私钥权限、/storage 挂载、k3s API 连通) + ## 1. 
节点与角色 | 主机名 | IP | 角色 | 说明 | @@ -16,7 +23,48 @@ - Kubernetes 中的节点名使用短主机名(**仅** `ylc61`~`ylc64` 四类 K3s 机器),与 inventory 中 `[k3s_server]` / `[k3s_worker]` 一致;`ylc65` **不是**集群成员。便于配合 Cloudflare CDN(若计算机 hostname 为 FQDN,本机解析会优先走本地导致无法访问)。 -- **控制机**(运行 `ansible-playbook`、`verify.sh`):推荐 **`ylc65`(Linux 工作机)** 或 ylc61;亦可在你的本机 Linux 上执行,只要装好 Ansible、能 SSH 到 inventory 中的节点。 +- **控制机**(运行 `ansible-playbook`、`verify.sh`):推荐 **`ylc65`(Linux 工作机)** 或 ylc61;亦可在你的本机 Linux 上执行,只要满足下节 **§1.1** 依赖并能 SSH 到 inventory 中的节点。 + +## 排障 + +- **`verify.sh preflight` / `run` 报 inventory 私钥不存在/权限过宽**:按报错提示 `chmod 600`,并检查 `ansible/inventory.ini` 中 `ansible_ssh_private_key_file` 路径是否在当前控制端存在。 +- **k3s API 不可达(6443)**:确认控制节点防火墙放行、IP/主机名解析正确;在控制端 `curl -k https://:6443/ping` 快速自检。 +- **DiskPressure / 系统盘被写满**:确认 `/storage` 为独立挂载点(见 §3.1),并确保 server/worker 都使用一致的 `--data-dir`。 + +### 1.1 Linux 工作机(ylc65)软件依赖 + +在 **`ylc65` 上执行本仓库各步骤**(克隆仓库、`./scripts/deploy-lab.sh`、`./scripts/verify.sh`、`ansible-playbook …`)时,该主机是 **Ansible 控制端**,需预先安装下列组件(版本可与 §2 对照,其他发行版用等价包名即可): + +| 用途 | 说明 | +|------|------| +| **Git** | 克隆 / 更新本仓库;排障时对比本地与远端分支。 | +| **Ansible**(`ansible-core` + `ansible-playbook`) | 执行 `ansible/playbooks/*`、`scripts/deploy-lab.sh`、`scripts/verify.sh` 所调用的 playbook;与 §2「Ansible ansible-core 2.18」一致即可。 | +| **OpenSSH 客户端**(`ssh`、`scp`、`ssh-keygen`) | 按 `ansible/inventory.ini` 连接 `ylc61`~`ylc64`(通常为 root + 私钥);`scripts/ssh/test-ssh.sh` 等亦依赖本机 `ssh`。运行 `scripts/ssh/setup-k3s-workers-ssh.sh` 预配密钥时同样只需 OpenSSH;**不要求** PuTTY(仅当该脚本交互中勾选「生成 PuTTY .ppk」供 Windows 使用时,才需额外安装 `puttygen`)。 | +| **Bash** | 仓库脚本为 `#!/usr/bin/env bash`;勿在仅 `sh` 的环境强行执行。 | +| **curl** | 部分验证与文档示例;`verify` playbook 在远端执行 curl 时由节点侧提供,控制端亦建议具备以便自检。 | + +**Fedora / RHEL 系示例**(在 ylc65 上): + +```bash +sudo dnf install -y git ansible-core openssh-clients curl bash +``` + +**Debian / Ubuntu 系示例**: + +```bash +sudo apt update +sudo apt install -y git ansible openssh-client curl bash +``` + +安装后自检: + +```bash +git --version 
+ansible-playbook --version +ssh -V +``` + +Python 3 会作为 **Ansible 控制端**依赖被包管理器一并拉取,一般无需单独指定版本。若仅在 **ylc61 本机**跑 Ansible 而不使用 ylc65,同样需满足上表(在控制节点上安装等价软件)。 ## 2. 软件版本(已验证) @@ -52,10 +100,10 @@ lsblk -f ### 3.2 推荐自动化顺序 -1. (可选)`ansible/playbooks/k3s-prepare-storage.yml`:声明 `k3s_data_disk_device` 并启用 `k3s_prepare_storage` 时,幂等准备 `/storage`。 -2. `ansible/playbooks/k3s-init-and-install.yml`:安装 K3s(可开启 `k3s_verify_storage_mount` 校验挂载)。 -3. (可选)`ansible/playbooks/longhorn-install.yml`:Helm 安装 Longhorn(`ansible/files/03-07-longhorn/values-lab.yaml`)。 -4. (可选)按 `03-05` 应用本仓库 **local-path** ConfigMap 真源(`ansible/files/03-05-local-path-config/local-path-config-lab.json`)。 +1. (可选)`ansible/playbooks/verify/01-06.yml`:声明 `k3s_data_disk_device` 并启用 `k3s_prepare_storage` 时,幂等准备 `/storage`。 +2. `ansible/playbooks/verify/01-06.yml`:安装 K3s(可开启 `k3s_verify_storage_mount` 校验挂载)。 +3. (可选)`ansible/playbooks/verify/03-07.yml`:Helm 安装 Longhorn(`ansible/files/03-07/values-lab.yaml`)。 +4. (可选)按 `03-05` 应用本仓库 **local-path** ConfigMap 真源(`ansible/files/03-05/local-path-config-lab.json`)。 ## 4. 防火墙 @@ -67,13 +115,15 @@ lsblk -f ## 5. 
Ansible 相关 +控制端软件要求见 **§1.1**(`ylc65` 或任意运行 `ansible-playbook` 的机器)。 + - **inventory**:`ansible/inventory.ini`,分组 `k3s_server`、`k3s_worker`、`k3s_nodes`(**勿**将 `ylc65` 列入 K3s 分组;工作机只作为 Ansible 控制端) - **变量**:`ansible/group_vars/all.yml`,含 `k3s_data_dir`、`k3s_server_ip`、`k3s_manage_`* 等 -- **playbook(k3s)**:`ansible/playbooks/k3s-init-and-install.yml` -- **playbook(数据盘,可选)**:`ansible/playbooks/k3s-prepare-storage.yml` -- **playbook(Longhorn,可选)**:`ansible/playbooks/longhorn-install.yml`(Helm + `ansible/files/03-07-longhorn/values-lab.yaml`,文档 `03-07`) -- **playbook(nginx 矩阵)**:`ansible/playbooks/nginx-matrix-deploy.yml`(manifests 在 `ansible/files/02-05-nginx-matrix/`,文档 `02-05`) -- **playbook(nginx TLS 矩阵)**:`ansible/playbooks/nginx-matrix-tls-deploy.yml`(manifests 在 `ansible/files/03-02-nginx-matrix-tls/`,文档 `03-02`(02-05 升级版)) +- **playbook(k3s)**:`ansible/playbooks/verify/01-06.yml` +- **playbook(数据盘,可选)**:`ansible/playbooks/verify/01-06.yml` +- **playbook(Longhorn,可选)**:`ansible/playbooks/verify/03-07.yml`(Helm + `ansible/files/03-07/values-lab.yaml`,文档 `03-07`) +- **playbook(nginx 矩阵)**:`ansible/playbooks/verify/02-05.yml`(manifests 在 `ansible/files/02-05/`,文档 `02-05`) +- **playbook(nginx TLS 矩阵)**:`ansible/playbooks/verify/03-02.yml`(manifests 在 `ansible/files/03-02/`,文档 `03-02`(02-05 升级版)) - **SSH**:root 连接,`scripts/ssh/setup-k3s-workers-ssh.sh` 预配密钥 ## 6. 
验证时间 diff --git a/docs/00-02-验证矩阵.md b/docs/00-02-验证矩阵.md deleted file mode 100644 index abf9ff1..0000000 --- a/docs/00-02-验证矩阵.md +++ /dev/null @@ -1,222 +0,0 @@ -# 00-02-验证矩阵 - -> 这一页只做一件事:**集中标记每篇关键文档是否已经在真实环境验证过**。 -> -> **清单位置**:可部署的 Kubernetes YAML 以仓库 [`ansible/files/`](../ansible/files/) 为唯一真源(与 `docs/` 交叉引用);验证时请以该目录下文件为准。 -> -> 写文档的人、做实验的人,都以这里为准,不用在每篇文档里翻记录。 -> -> 本页当前以“待验证列表”为主:在你的实验环境中按每篇文档从头到尾走通一次,然后把状态从“未验证/部分验证”补成“已验证”。 -> -> 自动化验证入口:`scripts/verify.sh`(在控制节点仓库根执行;按本矩阵顺序逐个 `doc_id` 跑 `ansible/playbooks/verify/.yml`,缺 playbook 即失败)。分层说明见 [`docs/00-05-测试与验证框架.md`](../docs/00-05-测试与验证框架.md)。 - -## 状态说明 - -- **❓ 未验证**:内容结构与命令已经写好,但**还没有**在目标环境完整跑通一次。 -- **⚠️ 部分验证**:只验证了其中一部分场景(例如只在单节点环境跑过,或只验证了 HTTP 未验证 HTTPS),备注里会写明覆盖范围。 -- **✅ 已验证**:按该文档从头到尾在指定环境走完一遍,达到预期结果,备注里会带上环境与日期。 - -建议习惯: - -- 真机按文档全部走完后,再把状态从“未验证/部分验证”改成“已验证”,并写清 **OS / K3s 版本 / 时间**。 -- 以后如果对文档步骤做了较大调整,记得把这里对应条目先打回“未验证”或“部分验证”,等新流程再跑一遍。 - -### 编排约定与文档 id - -- 下文每条 `docs/XX-YY-*.md` 的 **id 约定为文件名中的 `XX-YY`**,与 `scripts/verify.sh` 选用的 playbook 文件名一致;矩阵正文仍以人工结论为准,脚本结果写在各条「备注」里。 - ---- - -## 1. 
主线安装(01-*) - -- `00-01-k3s-基础概念.md` - - 状态:✅ 已验证 - - 备注:概念性文档,不涉及命令执行。 -- `00-04-部署环境说明.md` - - 状态:✅ 已验证 - - 备注:说明性文档,描述本仓库验证环境(ylc61~64、Fedora、K3s v1.34.5+k3s1、每节点 **10G 系统盘 + 32G 数据盘挂载 `/storage`** 等),与当前实际部署对照调整;**2026-03-25** 记录的集群仍为四节点 Ready。 -- `01-01-k3s-控制节点含traefik.md` - - 状态:✅ 已验证 - - 备注:Fedora 43 Server + K3s v1.34.5+k3s1,单控制节点 61,Traefik 与节点入口 80/443 可达(404 为无路由时的正常表现);**2026-03-25** 与 `verify-g1-baseline` 复验一致。 -- `01-06-节点初始化-ansible-实践.md` - - 状态:✅ 已验证 - - 备注:Fedora + K3s,4 节点(ylc61~64),Ansible `k3s-init-and-install.yml` 完成 server/agent、firewalld、CNI trusted、CoreDNS、Traefik 及 playbook 内置验证;**2026-03-25** 于 ylc61 复跑 `phase2-k3s`(`PLAY RECAP` 全节点 `failed=0`)。 -- `01-02-k3s-工作节点.md` - - 状态:✅ 已验证 - - 备注:ylc62~ylc64 工作节点加入同一集群,`kubectl get nodes` 四节点 Ready;K3s v1.34.5+k3s1;**2026-03-25** 与 `check-cluster` 复验(早期记录曾为双节点)。 -- `01-03-armv7-standalone-docker.md` - - 状态:❓ 未验证 - - 备注:待在实际 armv7 设备上按文档安装 Docker 并跑一两个容器后更新。**2026-03-25** ylc61 `verify.sh` noop(未在 armv7 设备执行文档步骤)。 -- `01-07-openwrt-haproxy.md` - - 状态:⚠️ 部分验证 - - 备注:ImmortalWrt + HAProxy(如 18080/18443)曾实机验证过;当前仓库未提供对应自动化脚本。**2026-03-25** ylc61 `verify.sh` noop;仍以 onecloud 等第三方机 curl 手工为准。 - ---- - -## 2. 
简单部署nginx(02-*) - -- `02-00-nginx-系列说明.md` - - 状态:⚠️ 部分验证(说明性文档) - - 备注:内容与 02-01~02-04 一致。**2026-03-25** ylc61 `verify.sh` noop;未重读全文,依赖历史核对。 -- `02-01-nginx-control-ingress.md` - - 状态:✅ 已验证 - - 备注:**本仓库约定验收**:`scripts/verify.sh` → `ansible/playbooks/verify/02-01.yml`(M1:`ansible/files/02-05-nginx-matrix/01-control-ingress.yaml` apply → rollout → 入口 HTTP 校验 `X-Backend: M1` → teardown)。**2026-03-25** ylc61 四节点集群跑通。历史上另有 onecloud curl;四路径总览见 [`02-05`](02-05-nginx-验证矩阵-一键部署.md)。HTTPS 不在本篇,见 `03-02`。 -- `02-02-nginx-control-ingressroute.md` - - 状态:✅ 已验证 - - 备注:同上,playbook `verify/02-02.yml`,清单 `02-control-ingressroute.yaml`,`X-Backend: M2`。**2026-03-25** ylc61。 -- `02-03-nginx-worker-ingress.md` - - 状态:✅ 已验证 - - 备注:同上,`verify/02-03.yml`,`03-worker-ingress.yaml`,`X-Backend: M3`。**2026-03-25** ylc61。 -- `02-04-nginx-worker-ingressroute.md` - - 状态:✅ 已验证 - - 备注:同上,`verify/02-04.yml`,`04-worker-ingressroute.yaml`,`X-Backend: M4`。**2026-03-25** ylc61。 -- `02-05-nginx-验证矩阵-一键部署.md` - - 状态:✅ 已验证 - - 备注:**本仓库约定验收**:`verify.sh` 串跑 `02-01`~`02-04`(或等价 `ansible/playbooks/nginx-matrix-deploy.yml` 一次部署四路径)+ 各路径 HTTP `X-Backend` + teardown。**2026-03-25** ylc61 `run-all` 通过。TLS/域名与 `03-02` 衔接另用 `nginx-matrix-tls-deploy.yml` / `verify/03-02.yml` 等验。 ---- - -## 3. 
k3s 常用配置 - -- `03-01-k3s-traefik-dashboard.md` - - 状态:⚠️ 部分验证 - - 备注:模板见 `ansible/files/03-01-traefik-dashboard/`。**2026-03-25** 仅确认集群内 `traefik` Deployment 可用(`verify-g3`),未按文档重新 apply Dashboard Ingress/IngressRoute 并浏览器验收。 -- `03-02-k3s-traefik-acme.md` - - 状态:⚠️ 部分验证 - - 备注:历史上 TLS + ACME 曾跑通;**2026-03-25** 办公机 `ACME_EMAIL` 未配置,未复验 Let's Encrypt 签发;恢复 ✅ 需按文档 + 有效邮箱与 DNS。 -- `03-03-k3s-traefik-dashboard-acme.md` - - 状态:⚠️ 部分验证 - - 备注:合并版 YAML 仍在 `ansible/files/03-03-traefik-dashboard-acme/`。**2026-03-25** ylc61 `verify.sh` noop(未实机 apply 合并栈)。 -- `03-04-k3s-cloudflare-tunnel-配置接入.md` - - 状态:⚠️ 部分验证 - - 备注:历史上实验室曾跑通 Tunnel + Traefik。**2026-03-25** ylc61 `verify.sh` noop;`CF_TUNNEL_TOKEN` 等未加载时亦不会自动复验隧道。 -- `03-05-k3s-local-path-pvc.md` - - 状态:⚠️ 部分验证 - - 备注:**2026-03-25** ylc61 `verify.sh`:demo 清单 apply → Deployment rollout → PVC `Bound` → teardown。此前 `verify-g1-baseline` 亦确认 `local-path` 就绪。 -- `03-06-k3s-使用nfs存储.md` - - 状态:❓ 未验证 - - 备注:待在实际 NFS 服务器 + K3s 集群上完成 PV/PVC + Pod 挂载验证。**2026-03-25** ylc61 `verify.sh`:因 `NFS_SERVER_IP` / `NFS_EXPORT_PATH` 未配齐 gate 跳过;teardown 已对齐无文件不删。playbook 亦支持仓库 `03-06-nfs-demo` 清单路径。 -- `03-07-k3s-longhorn-持久化存储.md` - - 状态:⚠️ 部分验证 - - 备注:**2026-03-25** ylc61 `verify.sh`:`longhorn-install.yml` 安装 + `longhorn-system` Pod 列表明细 + teardown(Helm uninstall/删 ns 带超时)。**未**按文档完整跑 PVC 业务读写与灾备流程,故不设 ✅。 -- `03-08-k3s-ha-集群配置与切换.md` - - 状态:❓ 未验证 - - 备注:HA 场景步骤已整理,尚未在当前环境完成双 server + 切换演练。**2026-03-25** `verify.sh` 仅为 noop(docs + `ansible/files` 目录存在性)。 -- `03-09-k3s-gitops-集群配置管理.md` - - 状态:❓ 未验证 - - 备注:框架草案,待选定 Argo CD 或 Flux 后细化。**2026-03-25** `verify.sh` 仅为 noop。 -- `03-10-k3s-traefik-custom-ports.md` - - 状态:❓ 未验证 - - 备注:需在实际环境应用 `HelmChartConfig` 并确认 Traefik Service/入口端口。**2026-03-25** `verify.sh` 仅为 noop(清单在 `ansible/files/03-10-traefik-custom-ports/`)。 - -### 可选:依赖文档 - -- `01-04-双控制节点ha.md` - - 状态:❓ 未验证 - - 备注:文档已拆分安装/配置流程,尚未在双控制节点 + 外部 LB 的完整场景下全链路验证。**2026-03-25** ylc61 `verify.sh` noop。 -- `01-05-armv7-nfs服务安装.md` - - 状态:❓ 未验证 - - 
备注:NFS 安装命令已经过以往经验验证,本仓库对应 armv7 环境需再跑一遍确认导出与权限。**2026-03-25** ylc61 `verify.sh` noop(未在 armv7 实机执行文档步骤)。 - ---- - -## 4. 高级 Node.js(04-01~04-14) - -- `04-01-k3s-nodejs-高级部署.md` - - 状态:⚠️ 部分验证 - - 备注:主入口。**2026-03-25** ylc61 `verify.sh`:`04-01` 累积清单 apply → `/node` HTTP(Hello World)→ teardown。`04-02`~`04-14` 分项仍待按文档逐项实机。 -- `04-02-nodejs-镜像与运行命令.md` - - 状态:❓ 未验证 - - 备注:镜像 tag/`imagePullPolicy`/`command`/`args` 在实机拉取与启动验证。**2026-03-25** `verify.sh` 仅为 noop(仓库结构)。 -- `04-03-nodejs-环境变量与配置注入.md` - - 状态:❓ 未验证 - - 备注:ConfigMap/Secret 注入与 `printenv`/`curl` 结果一致。 -- `04-04-nodejs-端口与Service.md` - - 状态:❓ 未验证 - - 备注:`targetPort` 与进程监听一致;Endpoints 有地址。**2026-03-25** `verify.sh` noop。 -- `04-05-nodejs-资源请求与限制.md` - - 状态:❓ 未验证 - - 备注:`kubectl top` 与 OOM/节流行为符合预期。**2026-03-25** `verify.sh` noop。 -- `04-06-nodejs-探针与健康检查.md` - - 状态:❓ 未验证 - - 备注:readiness/liveness 与 Endpoint/重启行为验证。**2026-03-25** `verify.sh` noop。 -- `04-07-nodejs-调度与亲和.md` - - 状态:❓ 未验证 - - 备注:`nodeSelector`/亲和/容忍与节点标签实机一致。**2026-03-25** `verify.sh` noop。 -- `04-08-nodejs-安全上下文.md` - - 状态:❓ 未验证 - - 备注:非 root/只读根等策略下应用仍可运行。**2026-03-25** `verify.sh` noop。 -- `04-09-nodejs-存储与卷.md` - - 状态:❓ 未验证 - - 备注:PVC/emptyDir 挂载与读写、配合 `03-05`/`03-07` 存储选型。**2026-03-25** `verify.sh` noop。 -- `04-10-nodejs-Ingress与Traefik.md` - - 状态:❓ 未验证 - - 备注:path/host/入口点注解与 Traefik 路由一致。**2026-03-25** `verify.sh` noop。 -- `04-11-nodejs-副本与滚动发布.md` - - 状态:❓ 未验证 - - 备注:多副本与 `rollout`/`undo` 实机验证。**2026-03-25** `verify.sh` noop。 -- `04-12-nodejs-TLS与证书.md` - - 状态:❓ 未验证 - - 备注:HTTPS 与 `03-02` ACME/Secret 配合验证证书与域名。**2026-03-25** `verify.sh` noop。 -- `04-13-nodejs-HPA.md` - - 状态:❓ 未验证 - - 备注:metrics-server 可用;压测触发扩缩。**2026-03-25** `verify.sh` noop。 -- `04-14-nodejs-GitOps与CI流水线.md` - - 状态:❓ 未验证 - - 备注:流程文档;按 `05-03`/`05-04`/`03-09` 任选一条链路实机跑通后更新。**2026-03-25** `verify.sh` noop。 - ---- - -## 5. 
常用应用与监控(05-*) - -- `05-01-k3s-部署homer首页面板.md` - - 状态:❓ 未验证 - - 备注:待在集群内按文档部署 Homer,并确认首页可访问。**2026-03-25** `verify.sh` noop。 -- `05-02-onenav首页面板.md` - - 状态:❓ 未验证 - - 备注:包含 armv7 独立部署 + K3s 反向代理两个部分,需分别验证。**2026-03-25** `verify.sh` noop。 -- `05-03-k3s-安装gitlab-含runner.md` - - 状态:❓ 未验证 - - 备注:待完成 GitLab + Runner 安装与基础流水线运行。**2026-03-25** `verify.sh` noop。 -- `05-04-k3s-配置gitlab-cicd.md` - - 状态:❓ 未验证 - - 备注:需在真实仓库上跑通一次 K3s 部署流水线。**2026-03-25** `verify.sh` noop。 -- `05-05-prometheus与grafana.md` - - 状态:❓ 未验证 - - 备注:待完成 kube-prometheus-stack 安装与 Dashboard 访问。**2026-03-25** `verify.sh` noop。 -- `05-06-openlist挂载网盘与自动备份.md` - - 状态:❓ 未验证 - - 备注:待在实际网盘与备份目录上验证周期备份任务。**2026-03-25** `verify.sh` noop。 -- `05-07-openclaw应用部署.md` - - 状态:❓ 未验证 - - 备注:待在 x86 主机用 Docker 部署 OpenClaw,并在 K3s 中完成静态转发验证。**2026-03-25** `verify.sh` noop。 -- `05-08-openclaw-k3s-实验部署.md` - - 状态:❓ 未验证 - - 备注:待在 K3s 内按实验文档直接部署 OpenClaw Gateway,并确认入口可访问。**2026-03-25** `verify.sh` noop。 -- `05-09-openclaw-web-小游戏网页平台.md` - - 状态:❓ 未验证 - - 备注:前端示例清单在 `ansible/files/05-09-openclaw-web-小游戏网页平台/`;**2026-03-25** `verify.sh` noop(未 apply 镜像示例)。 - ---- - -## 6. 排障与运维(06-*) - -- `06-01-k3s-networkpolicy-故障排查.md` - - 状态:✅ 已验证 - - 备注:已在 Fedora 43 + K3s 环境排查并修复过“62:80 不通 / firewalld 拦截 flannel.1 <-> cni0”的问题,脚本与命令均来自实战过程。 -- `06-02-运维小结.md` - - 状态:❓ 未验证 - - 备注:运维建议为经验总结,后续可在日常巡检/备份流程固化后逐条打勾。**2026-03-25** `verify.sh` noop。 -- `06-03-k3s-自动备份与恢复-openlist-webdav.md` - - 状态:❓ 未验证 - - 备注:按文档配置 WebDAV 备份与恢复 Job/CronJob,并验证一次完整链路。**2026-03-25** `verify.sh` noop(清单真源见 `ansible/files/06-03-openlist-webdav/` 与桥接目录 `06-03-k3s-自动备份与恢复-openlist-webdav/`)。 - ---- - -## 8. 
如何更新本矩阵 - -- 修改某篇文档的关键步骤(尤其是“操作步骤 / 验证命令 / 预期”)时: - - 记得同步更新这里对应条目的“状态”和“备注”。 - - 大改后建议先把状态退回“未验证”或“部分验证”,等新流程在实机跑完再改回“已验证”。 -- 执行中文文档一键安全对齐或大规模内容调整时,建议把 **验证矩阵** 一起纳入检查范围,避免出现“文档已经改了,但矩阵还显示已验证”的错觉。 - - diff --git a/docs/00-03-未来规划与待补功能.md b/docs/00-03-未来规划与待补功能.md deleted file mode 100644 index a27f00f..0000000 --- a/docs/00-03-未来规划与待补功能.md +++ /dev/null @@ -1,111 +0,0 @@ -# 00-03-未来规划与待补功能 - -> 给未来的自己:这里不是“必须现在就做完”的清单,而是把你已经想到、但还没系统实现的能力先写下来,等有时间再一项项补。 - -## 1. 日志与审计体系 - -- **现状** - - 主要依赖 `kubectl logs` + 节点本地日志 + Prometheus/Grafana 指标。 - - 没有集中日志查询入口,也没有明确的“关键操作审计”路径。 -- **规划方向** - - 引入轻量日志聚合(例如 Loki 或 ELK 中的一个最小栈),统一收集: - - K3s 控制面与核心组件日志; - - 关键应用(GitLab、openlist、OpenClaw 等)的访问/错误日志。 - - 为“集群操作日志”(如 `kubectl apply/delete`)预留出口,后续可结合 GitOps 做审计。 -- **建议文档** - - `05-09-k3s-集中日志与查询-loki.md`(示例名称) - -## 2. 统一身份与权限管理(SSO) - -- **现状** - - GitLab、Grafana、Homer、openlist 等各自维护账号。 - - Cloudflare Zero Trust 只覆盖到部分 Web 入口,没有形成统一的“家庭账号体系”。 -- **规划方向** - - 引入一个轻量 IdP(如 Keycloak / Authentik),集中管理家庭成员账号与 OAuth/OIDC 客户端。 - - 按优先级为关键组件接入 SSO: - - GitLab、Grafana、Homer 优先; - - 其余应用按需要接入。 -- **建议文档** - - `05-10-homelab-sso-keycloak-部署与接入.md` - -## 3. 运维自动化与 GitOps - -- **现状** - - 节点初始化、K3s 配置和应用部署以“手工 + scripts/”为主。 - - 没有一套“从裸机/虚机到完整环境”的幂等自动化流程。 -- **规划方向** - - **节点侧**:✅ 已完成 `01-06-节点初始化-ansible-实践.md`,Ansible 一键完成初始化 + k3s 安装 + firewalld 基线 + Traefik 标签(含 8472/udp、6443/tcp 端口开放)。 - - **集群侧**:引入 GitOps(Argo CD / Flux 二选一)管理: - - K3s 核心配置与 CRD; - - Ingress/IngressRoute、Traefik 配置; - - 常用应用(Homer、openlist、监控、GitLab 等)的清单。 -- **建议文档** - - `03-09-k3s-gitops-集群配置管理.md` - -## 4. 网络边界与安全基线 - -- **现状** - - 已有 NetworkPolicy 排障文档 `06-01-k3s-networkpolicy-故障排查.md`。 - - 家庭网络与实验网段的边界、安全分区(IoT 设备、访客网络等)主要依赖网关/OpenWrt,尚未在本仓库中系统描述。 -- **规划方向** - - 定义一份“最小可接受安全基线”: - - 命名空间隔离与默认拒绝策略; - - 仅对入口、监控、GitLab 等核心组件放行必须的东西; - - 节点对外暴露端口白名单。 - - 梳理家庭网络拓扑与 K3s 网络在其中的位置: - - 内/外网、IoT 网段、Admin 网段; - - 哪些通过 Cloudflare、哪些只允许 VPN。 -- **建议文档** - - `06-04-homelab-网络分区与安全基线.md` - -## 5. 
备份与灾难恢复(超越单应用) - -- **现状** - - `06-03-k3s-自动备份与恢复-openlist-webdav.md` 已覆盖 openlist 的备份/恢复实践。 - - 尚未有一份“集群级 + 存储级 + 应用级”的整体 DR 方案。 -- **规划方向** - - 明确几类不同的“灾难级别”与对应恢复路径: - 1. 单个 Pod/Deployment 配置误操作; - 2. 某一节点(worker/server)硬件/系统损坏; - 3. 存储节点(NFS/硬盘阵列)损坏; - 4. 整个 K3s 集群需要在新环境中重建。 - - 对应规划: - - K3s datastore/外部数据库定期备份; - - NFS/重要 hostPath 目录的文件级备份或异地同步; - - 关键应用(GitLab、openlist、openclaw workspace 等)的专项恢复演练。 -- **建议文档** - - `06-05-k3s-集群级备份与灾难恢复设计.md` - -## 6. 远程访问形态:Tunnel + VPN 双轨 - -- **现状** - - 通过 Cloudflare Tunnel 提供部分 Web 入口访问。 - - 管理/运维时仍主要依赖局域网直接访问。 -- **规划方向** - - 保持**Cloudflare Tunnel 作为零信任 Web 入口**方案; - - 额外增加一条 **WireGuard/OpenVPN 运维 VPN** 路径: - - 只向极少数管理设备开放; - - 主要用途为 SSH、kubeconfig、底层网络排障。 -- **建议文档** - - `01-07-wireguard-运维vpn-接入与实践.md` - -## 7. 其他可选实验方向 - -> 这些不是“缺失”,而是你以后如果有时间,可以尝试的升级路线。 - -- **多集群/多环境管理**: - - 在本地再起一个极简 K3s/Kind,用作“预生产/实验”环境,通过 GitOps 控制与主集群的差异。 -- **存储升级**: - - 从基础 NFS 逐步尝试 Longhorn、Rook-Ceph 或轻量分布式存储,评估在家庭环境下的性价比与复杂度。 -- **可观测性增强**: - - 在现有 Prometheus/Grafana 基础上补充 Alertmanager 与简单告警策略(如节点离线、磁盘空间、关键 Pod 异常)。 - ---- - -## 8. 使用方式建议 - -- 不必一次全部实现,可按“对你当前使用最有帮助的”优先级来选; -- 每当某个方向完成初版实践时,在 `00-02-验证矩阵.md` 中补充状态与备注; -- 新增文档时记得回到 `00-00-构建总览.md`,把入口挂上。 - - diff --git a/docs/00-05-测试与验证框架.md b/docs/00-03-测试与验证框架.md similarity index 69% rename from docs/00-05-测试与验证框架.md rename to docs/00-03-测试与验证框架.md index 6eb68b3..74148ad 100644 --- a/docs/00-05-测试与验证框架.md +++ b/docs/00-03-测试与验证框架.md @@ -2,14 +2,21 @@ > 本页是“测试与验证框架”的设计说明,并与仓库里已落地的 `scripts/verify.sh` + `ansible/playbooks/verify/` 对齐。 + +## TL;DR + +- **本文性质**:说明/索引类文档(不承载一键部署动作) +- **推荐动作**:按 `00-00-构建总览.md` 进入主线;需要真机验收用 `./scripts/verify.sh full` +- **成功判据**:你能据本文定位到下一步文档与对应入口脚本 +- **排障**:执行失败请查看对应实验篇的「排障」与 playbook 输出 + ## 1. 为什么需要它 -仓库里 `docs/00-02-验证矩阵.md` 目前扮演“待验证列表/状态记录”的角色,用来回答: +本仓库选择**抛弃“验证矩阵/状态板”**:不再维护一份集中式“已验证/未验证”列表。 -- 这篇文档(`XX-YY`)是否已经在你的实验环境中从头到尾跑通? -- 如果没跑通,缺口在哪里? 
+本页只回答一件事:**如何把文档(`doc_id=XX-YY`)与可执行的验证入口对齐**,并把验证能力收敛为可维护的自动化资产。 -而“自动化执行”和“状态记录”是两件不同的事。测试框架需要把自动化执行能力,拆成可维护的小块,并通过统一的 id/索引把文档与用例关联起来。 +实机验证前需具备的环境与外部依赖,见 [`00-04-待验证项-验证前准备.md`](00-04-待验证项-验证前准备.md)。 ## 2. 自动化验证流程(一般步骤) @@ -23,25 +30,26 @@ 2. **环境与前置清理(按验证目标选择深度)** - **基本检查**:`kubectl get nodes`、磁盘/内核版本、防火墙与文档是否一致;必要时对照 `00-04`。 - **轻量清理(本仓库 `verify.sh` 的常态)**:默认不卸载整个 K3s;每个 `verify/XX-YY.yml` 在 **teardown** 阶段只删除**本篇** apply 过的资源(或 gate 未执行 apply 时跳过删除),避免污染下一用例。 - - **重度清理(重装/复现安装文档时)**:若你要从「空机」验证 `01-01` 等**整集群安装**流程,才需要按文档执行 `k3s-uninstall.sh`、删数据目录、清 iptables 等——这与日常「矩阵逐项验收」是**不同场景**,不要默认混进每一次 `run-all`。 + - **重度清理(重装/复现安装文档时)**:若你要从「空机」验证 `01-01` 等**整集群安装**流程,才需要按文档执行 `k3s-uninstall.sh`、删数据目录、清 iptables 等——这与日常 `run-all` 的“逐篇快速验收”是**不同场景**,不要默认混进每一次 `run-all`。 3. **部署** - - **推荐(本仓库)**:用 Ansible playbook 部署——要么是正式安装/初始化类(如 `k3s-init-and-install.yml`),要么是验证用例里的 `kubectl apply` / `helm install` / `import_playbook`。 + - **推荐(本仓库)**:用 Ansible playbook 部署——要么是正式安装/初始化类(如 `verify/01-06.yml -e k3s_do_install=true`,或 `./scripts/deploy-lab.sh` 调用之),要么是验证用例里的 `kubectl apply` / `helm install` / `import_playbook`。 - **文档中的 bash 一键命令**:仍可按 `docs/` 逐步执行;适合排障或 playbook 尚未覆盖的边角。自动化验收应尽量**收敛进** `ansible/playbooks/verify/*.yml`,避免「文档一套、手敲一套」长期分叉。 4. **按设计目标做断言** - **集群侧**:`kubectl get` / `describe` / `logs`、`kubectl rollout status`、必要时看事件与 `Endpoints`。 - **入口侧**:在控制节点或文档指定的入口上对 `Service`/`Ingress`/`IngressRoute` 做 `curl`(本仓库 nginx 矩阵等用响应头 `X-Backend` 或状态码区分路径)。 - **Helm / 存储 / 网络**:按该篇文档的「预期」增查命令(如 `helm list`、`PVC Bound`、跨节点 curl)。 - - 依赖外部云账号、NFS、ACME 邮箱等时:未满足条件可用 **gate 跳过** apply,并在矩阵备注中写明「未配变量未验」。 + - 依赖外部云账号、NFS、ACME 邮箱等时:未满足条件可用 **gate 跳过** apply,并在实验篇文档里写明「未配变量未验」(或保留日志)。 5. 
**收尾与记录** - - 默认 **`VERIFY_TEARDOWN=1`**:验证通过后删除临时资源,减少对共享实验集群的干扰;调试时可设 `0` 保留现场。 - - 将结论写回 [`docs/00-02-验证矩阵.md`](00-02-验证矩阵.md)(状态与备注),必要时更新对应 `docs/XX-YY-*.md` 中的命令或版本说明。 + - **`verify.sh`** 默认 **`VERIFY_TEARDOWN=1`**(显式传入 Ansible):验证通过后删除临时资源;调试时可设 `0` 保留现场。**`deploy-lab.sh` 铺栈默认 `VERIFY_TEARDOWN=0`**,避免误删已部署资源。 + - 将结论写回对应实验篇文档(或保留日志),必要时更新 `docs/XX-YY-*.md` 中的命令或版本说明。 6. **本仓库一键串联** - **部署**(步骤 3):`./scripts/deploy-lab.sh k3s` 等,见 [`scripts/README.md`](../scripts/README.md)。 - - **验证**(步骤 4~6):在仓库根执行 `./scripts/verify.sh run-all`(或 `run `),按矩阵顺序重复「断言 → teardown」;缺 playbook **fail-fast**。跑全量前可 `./scripts/verify.sh preflight`;`./scripts/verify.sh flow` 可打印与本节对应的步骤摘要。 + - **验证**(步骤 4~6):在仓库根执行 **`./scripts/verify.sh full`**(推荐:**preflight +** `run-all`,缺 playbook **fail-fast**);或仅 `./scripts/verify.sh run-all`(不跑 preflight);单篇用 `./scripts/verify.sh run `。`./scripts/verify.sh flow` 可打印与本节对应的步骤摘要。 + - **范围说明**:`run-all` 的范围由 `ansible/playbooks/verify/` 目录内存在的 `XX-YY.yml` 自动决定;`full` **不会**自动执行 `deploy-lab.sh`,仍假设集群与铺栈(步骤 3)已由操作者完成。 ### 2.1 局限与约定补全(建议在文档与 `verify/XX-YY.yml` 中写死) @@ -50,10 +58,10 @@ | 主题 | 建议约定 | |------|----------| | **多节点:在哪台机器 `curl`** | **默认**:在 inventory 的 **`k3s_server`(控制节点)** 上,对 **集群入口** 发 HTTP(如 `nginx_entry_base` / `http://<控制节点或 LB IP>`),与「从集群外经 NodePort/主机网络进 Traefik」一致。**例外**(必须显式写):要验 worker 仅内网、跨节点路径、或「必须从某台 agent 访问」时,在 playbook 里对指定 host 执行 `curl`(或 `delegate_to` / 专用 play),并在文档「验证命令」中写明 **执行主机与目标 URL**,避免隐含「任意节点等价」。 | -| **TLS / SNI** | 自签或跳过校验仅用于排障:`curl -k`。**验收**应优先:真实证书路径下用 `curl -v` 看证书链;或用 `curl --resolve <域名>:443:<入口IP> https://<域名>/...` 在 **无 DNS** 时模拟 SNI。需要时用 `openssl s_client -connect host:443 -servername <域名> :443:<入口IP> https://<域名>/...` 在 **无 DNS** 时模拟 SNI。需要时用 `openssl s_client -connect host:443 -servername <域名> ansible/playbooks/verify/.yml`;缺对应 playbook 则 **fail-fast**。 -3. **`ansible/playbooks/verify/.yml`**:单篇用例,通常拆成「部署 → 验证 → 清理」多个 **play**(默认 **`VERIFY_TEARDOWN=1`** 做 teardown)。 -4. 
**特例**:无集群动作的文档可走 **`verify/_noop-tasks.yml`**(仓库路径/文件存在性);依赖 NFS、ACME、Cloudflare 等外部条件的可用 **gate 跳过** apply,teardown 需避免「无清单仍删」类失败(各 playbook 已按此收敛)。 +1. **`scripts/verify.sh`**:调用 `ansible-playbook -i ansible/playbooks/verify/.yml`;缺对应 playbook 则 **fail-fast**。 +2. **`ansible/playbooks/verify/.yml`**:单篇用例,通常拆成「部署 → 验证 → 清理」多个 **play**(默认 **`VERIFY_TEARDOWN=1`** 做 teardown)。 +3. **特例**:无集群动作的文档可走 **`verify/_noop-tasks.yml`**(仓库路径/文件存在性);依赖 NFS、ACME、Cloudflare 等外部条件的可用 **gate 跳过** apply,teardown 需避免「无清单仍删」类失败(各 playbook 已按此收敛)。 -**真源**:可部署清单以 **`ansible/files/`** 为准;`docs/XX-YY-*.md` 与矩阵通过同一 **`doc_id`** 与 playbook 对齐。矩阵里的状态/备注仍建议 **手工** 维护(见 §7)。 +**真源**:可部署清单以 **`ansible/files/`** 为准;`docs/XX-YY-*.md` 与验证用例通过同一 **`doc_id`** 与 playbook 对齐。 ## 4. 文档 id 与用例索引 约定: - `docs/XX-YY-*.md` 的文档 id 为 `XX-YY`(例如 `02-05`)。 -- 自动化用例文件名为 `verify/XX-YY.yml`,与 `doc_id` 一致即可(playbook 内不必再写 YAML 字段 `doc_id`,除非你想自检)。 -- 框架通过 `doc_id` 把“文档”映射到 `verify/.yml`,从而实现按篇自动执行(`verify.sh`)。 +- 自动化用例:每个 `doc_id` 对应唯一 playbook:`ansible/playbooks/verify/.yml`。 +- 框架通过 `doc_id` 把“文档”映射到可执行 playbook,从而实现按篇自动执行(`verify.sh`)。 +- **00 系列边界**:`verify/00-*.yml` 仅允许文档验收(`assert`/`stat`/路径一致性);禁止在 00 系列执行 `kubectl apply/delete`、Helm 安装卸载或其他集群变更动作。 这样你在 `00-02` 里更新状态时,不需要关心脚本内部结构;只要 id 一致就能追溯。 ## 5. 
用例数据模型(建议) -建议把用例写成“按文档 id 编排的任务集合”。在本仓库里,**用例落在** `ansible/playbooks/verify/.yml`,不再使用单体 `verify-matrix.yml`(已移除,避免与拆分后的 playbook 双份维护)。 - -- 文档 id `XX-YY` → 文件 `verify/XX-YY.yml` +建议把用例写成“按文档 id 编排的任务集合”。在本仓库里,执行路径固定为 `verify/.yml`,不再维护额外映射文件。 - 每个文件内一般拆为三段(多个 play 或顺序 tasks): -示例(02-05):`./scripts/verify.sh run 02-05` 执行 `ansible/playbooks/verify/02-05.yml`(内部 `import_playbook` `nginx-matrix-deploy.yml`,再 HTTP 校验四路径,最后 teardown)。`02-01`~`02-04` 另有单路径 playbook,便于单独调试。 +示例(02-05):`./scripts/verify.sh run 02-05` 执行 `ansible/playbooks/verify/02-05.yml`(HTTP 校验四路径,最后 teardown)。`02-01`~`02-04` 另有单路径 playbook,便于单独调试。 每个 `verify/XX-YY.yml` 的典型结构为三段: @@ -102,7 +108,7 @@ description: "02-05 四条路径均返回 200,并区分后端内容" apply: paths: - - "ansible/files/02-05-nginx-matrix/" + - "ansible/files/02-05/" strategy: "apply-r" wait: @@ -147,16 +153,39 @@ http_check: ## 7. 状态记录与写回策略 -你已经在 `docs/00-02-验证矩阵.md` 里定义了状态含义(未验证/部分验证/已验证)。 +如果你希望记录“已验证/未验证”,建议就近写在对应实验篇文档里(或在你的环境里维护日志),避免引入集中式状态板带来的维护分叉。 建议未来的写回策略分两步: - 首先:仍然由你手工更新 `00-02`(减少自动化失败导致的误写) - 之后:如果要自动写回,则需要明确“失败判定标准、覆盖范围、并发策略”,避免多个执行器同时写同一条状态。 +## 7.1 文档结构规范(Runbook 优先,强绑定) + +本仓库将 `docs/` 视为 **Runbook**(可执行手册),并对每篇可执行文档施加 **强绑定**: + +- **每个 `doc_id` 必须可执行**:存在 `verify/.yml` 的 `doc_id` 必须能跑 `./scripts/verify.sh run `,且 playbook 内含明确断言(不得仅做“文件存在性”)。 +- **文档与断言一致**:文档的“验证命令/预期”必须与对应 playbook 的断言一致;冲突时先修 playbook,再改文档对齐。 + +推荐每篇 `docs/-*.md` 采用以下结构(可复制粘贴作为模板): + +- **H1**:`# -<标题>` +- **TL;DR(必选,3–8 行)** + - 自动化入口:`./scripts/verify.sh run `(必要时补 `export ...`) + - 最关键 3 条前置(变量/Secret/挂载) + - 成功判据一句话 + - 失败去哪看(链接到本篇“排障”) +- **范围与非目标** +- **前置条件**(环境/变量与密钥/工具/执行位置) +- **步骤**(只留最短主线;与 playbook 一致) +- **验证**(明确执行位置、命令、判据;与 playbook 断言一致) +- **清理**(说明 `VERIFY_TEARDOWN`) +- **排障**(按症状列常见检查命令) +- **附录(可选)**(背景解释、通用技巧如远程 kubectl 等,避免污染主线) + ## 8. 与旧自动化的关系 -- `docs/00-02-验证矩阵.md` 不承担执行细节,只作待验证列表与状态记录 +(已抛弃集中式“矩阵状态板”,因此不再有“待验证列表”文档。) - 自动化执行以 `scripts/verify.sh` 与 `ansible/playbooks/verify/*.yml` 为准;本页描述其约定与扩展方式 ## 9. 
可选扩展(未落地) @@ -195,3 +224,7 @@ http_check: yamllint、ansible-lint、schema 校验等 **不放进 `verify.sh`** 亦可:在 GitHub Actions / 本地 pre-commit 里单独跑即可。与 §3 一致——与运行时验证 **并列**,互不嵌套。 +## 排障 + +- **你在找执行命令**:本文为说明/索引;执行入口见 `00-00-构建总览.md` 与 `scripts/README.md`。 +- **verify.sh 报错**:先跑 `./scripts/verify.sh preflight`,再根据提示修复 inventory/SSH/变量。 diff --git a/docs/00-04-待验证项-验证前准备.md b/docs/00-04-待验证项-验证前准备.md new file mode 100644 index 0000000..bdd29ed --- /dev/null +++ b/docs/00-04-待验证项-验证前准备.md @@ -0,0 +1,124 @@ +# 00-04-待验证项:验证前准备任务列表 + +> 本页是“验证前准备清单”:在实机推进到「已验证」之前,需要具备的环境、变量与动作列表。 +> +> 自动化说明见 [`00-03-测试与验证框架.md`](00-03-测试与验证框架.md);环境变量模板见 [`scripts/.env.verify.example`](../scripts/.env.verify.example)。 + + +## TL;DR + +- **本文性质**:说明/索引类文档(不承载一键部署动作) +- **推荐动作**:按 `00-00-构建总览.md` 进入主线;需要真机验收用 `./scripts/verify.sh full` +- **成功判据**:你能据本文定位到下一步文档与对应入口脚本 +- **排障**:执行失败请查看对应实验篇的「排障」与 playbook 输出 + +--- + +## A. 全局共用准备(跑任何扩展验证前建议具备) + +| 准备项 | 说明 | +|--------|------| +| 控制机 | 与 [`ansible/inventory.ini`](../ansible/inventory.ini) 一致:SSH 私钥存在且 **`chmod 600`**;仓库根执行 [`scripts/verify.sh`](../scripts/verify.sh)。 | +| Ansible 配置 | [`scripts/lib-ansible-lab.sh`](../scripts/lib-ansible-lab.sh) 会设置 `ANSIBLE_CONFIG` 指向 [`ansible/ansible.cfg`](../ansible/ansible.cfg);无写 `~/.ansible` 权限时可设 `ANSIBLE_LOCAL_TMP=$PWD/.ansible-tmp`(仓库 [`.gitignore`](../.gitignore) 已忽略 `.ansible-tmp/`)。 | +| HTTP 类 | [`scripts/.env.verify`](../scripts/.env.verify) 或环境中设 **`nginx_entry_base`**(如 `http://192.168.2.61`),与 `verify/02-0x.yml` 等一致。 | +| 串联验证 | **`VERIFY_TEARDOWN=1`**(勿在 `.env.verify` 中长期设 `0`),避免用例互相污染。 | +| 预检 | 可选 `VERIFY_PREFLIGHT_CLUSTER=1 ./scripts/verify.sh preflight` 确认集群 Ready。 | + +--- + +## B. 按文档主题的准备(做什么才能「真验证」) + +### B1. 
特殊硬件 / 拓扑 + +- **01-03、01-05**:准备 **armv7 实机**;01-03 装 Docker 并跑容器;01-05 在 armv7 上跑通 NFS 导出与权限(与 03-06 可联动)。**默认** `SKIP_ARMV7=1` 时 verify 仅做文档/文件检查;**可选** 在 `.env.verify` 设 `SKIP_ARMV7=0` 并配置 `ARMV7_SSH`(01-05 可另设 `ARMV7_NFS_SSH`)后,`verify.sh run 01-03` / `01-05` 会经 SSH 在 arm 上走 **dnf** 路径(Fedora/RHEL 系),见 **§E**。 +- **01-08、03-08**:准备 **双 control-plane + 外部 LB(或等价)** 的可丢环境;按文档做加入/切换演练(当前自动化仅做基线可达性断言,加入/切换演练需按文档手工执行并补齐自动化)。 +- **07-01、07-02**:准备 **可重建的实验集群**(换 CNI/双栈);写好回滚;这两篇验证多为 **noop**,建议以手工记录为准。 + +### B2. 环境变量 / 外部服务(不配则 gate 跳过或只能 noop) + +- **03-06**:在 `.env.verify` 配齐 **`NFS_SERVER_IP`(或 HOST)、`NFS_EXPORT_PATH`**;NFS 服务端导出与防火墙放行;再 `./scripts/verify.sh run 03-06`。 +- **03-02、04-12**:有效 **`ACME_EMAIL`**、公网 **80/443**、**DNS**(及文档中的 Cloudflare/Secret);与 `scripts/.env.verify.example` 中 CF 相关变量对齐。 +- **03-04**:**`CF_TUNNEL_TOKEN`**(或等价)与隧道侧配置;办公机/第三方探测路径按文档约定执行。 +- **06-03**:按 [`06-03-k3s-自动备份与恢复-openlist-webdav.md`](06-03-k3s-自动备份与恢复-openlist-webdav.md) 准备 **WebDAV 端点与凭据**;清单真源见 `ansible/files/06-03/`。 + +### B3. 部分验证补全为「已验证」(已有集群即可,偏手工/浏览器) + +- **01-07**:**onecloud**(或文档约定第三方机)对 OpenWrt **18080/18443** 做 curl;`scripts/.env.verify.example` 中 **`ONECLOUD_SSH`**。 +- **02-00**:通读说明文档,与 02-01~02-05 结论对齐。 +- **03-01**:按文档 apply `ansible/files/03-01/`,**浏览器**验收 Dashboard(非仅 Deployment 存在)。 +- **03-03**:实机 apply `ansible/files/03-03/` 并验收。 +- **03-05**:playbook 层已有较完整验证;若要更“真验证”:按文档补全「业务读写/边界」。 +- **03-07**:playbook 已能装删 Longhorn;要 ✅:按 [`03-07-k3s-longhorn-持久化存储.md`](03-07-k3s-longhorn-持久化存储.md) 做 **PVC 读写、副本/故障** 等文档级验收(`longhorn_force_crd_reset` 仅在 CRD 与 Helm 严重冲突时于 `group_vars` 设为 `true`)。 + +### B4. 
Node.js 系列(04-01~04-14) + +- **共用准备**:可访问的 **镜像仓库**(或私有 registry)、`nodejs_entry_base`、足够节点与资源做调度/HPA。 +- **04-01**:已部分验证;04-02~04-14 当前 **`verify.sh` 多为 noop**(仅目录/文档存在性)。要逐项 ✅:**按各篇文档手工或用例化命令验证**,并在 `ansible/playbooks/verify/` **补 `04-0x.yml` 真实 deploy/verify/teardown**(工作量最大的一块)。 +- **04-13**:集群需 **metrics-server**;准备压测工具以触发 HPA。 +- **04-14**:依赖 **GitLab CI / GitOps** 任选一条实链路(与 05-03/05-04/03-09 联动)。 + +### B5. 应用与监控(05-01~05-09) + +- **共用**:镜像拉取、持久化存储类(Longhorn/local-path/NFS)、Ingress 入口与 DNS(若对外)。 +- **05-02**:**armv7 段 + K3s 段** 分两环境验证;arm 段与 **§E** 相同:默认跳过远程步骤,按需 `SKIP_ARMV7=0` + SSH 变量启用 01-03/01-05 类自动化后,再在 K3s 侧验证。 +- **05-03~05-04**:**大内存/磁盘** 规划、GitLab **域名/证书**、Runner **注册 token**、测试仓库与 `.gitlab-ci.yml`。 +- **05-05**:Prometheus Operator 资源与 **Grafana 管理员密码**;Ingress 或 NodePort 访问策略。 +- **05-06**:真实 **网盘凭据**、备份目标目录与 Cron 窗口。 +- **05-07~05-09**:x86 Docker 主机、K3s 内 **镜像构建/拉取**、OpenClaw 相关 **密钥与入口**;05-09 需 apply 示例清单并验收页面。 + +### B6. 运维与概念(06-02、03-09) + +- **06-02**:经验文档;「验证」可定义为巡检/备份 SOP 在现网执行一轮并在文档里记录结论。 +- **03-09**:先 **选定 Argo CD 或 Flux** 并落地最小 GitOps 回路,再谈 ✅。 + +--- + +## C. 建议执行顺序(减少重复准备) + +```mermaid +flowchart TD + base[全局 SSH 与 env.verify] + nfs[NFS 与 03-06] + acme[ACME 与 03-02] + node[04-01 基线加扩 04-02 起] + apps[05-xx 大应用] + ha[HA 与 CNI 实验集群] + base --> nfs + base --> acme + base --> node + acme --> node + nfs --> apps + base --> ha +``` + +1. 先固化 **A** + **03-06 NFS**(若要做存储类应用)。 +2. 再做 **03-02 ACME**,解锁 **04-12** 与 TLS 矩阵深度验收。 +3. **04-02~04-11** 在 **04-01** 基线上增量加 playbook 或手工矩阵。 +4. **05-03/05-04**、**05-05** 单独排期(资源与时间最长)。 +5. **HA / 07-xx** 独立维护窗口与回滚。 + +--- + +## D. 与「自动化」对齐的预期 + +- **`verify.sh run XX-YY` 已通过** 只保证 **该 playbook 已实现** 的步骤通过;**noop** 文档不会替你完成文档内全部操作。 +- 若你要做“已验证”记录,建议在对应实验篇文档里写清:**环境、日期、是「仅脚本」还是「脚本 + 手工浏览器/第三方机」**。 + +--- + +## E. 
armv7 / arm32(可选经 `verify.sh` + SSH 远程安装) + +实验室矩阵里的 **01-03**(Docker)、**01-05**(NFS)、**05-02** 中的 arm 段等,依赖 **32 位 ARM(文档多写 armv7)实机**,与四节点 x86_64 K3s 主线 **不在同一 inventory**。 + +| 项 | 说明 | +|------|------| +| 默认 | **`SKIP_ARMV7=1`(或未设)**:`01-03.yml`、`01-05.yml` 仍跑矩阵基线(含 `_noop-tasks.yml` 文档/文件检查),**不经 SSH 改 arm 机**。 | +| 启用远程步骤 | **`SKIP_ARMV7=0`** 且 **`ARMV7_SSH`** 为一行可执行的 `ssh ...`(BatchMode 建议与 [`scripts/.env.verify.example`](../scripts/.env.verify.example) 一致):`verify.sh run 01-03` 会经该 SSH 在 arm 上 **dnf 装 docker** 并校验;`run 01-05` 用 **`ARMV7_NFS_SSH`**(若为空则回退 **`ARMV7_SSH`**)装 **nfs-utils**、写 **`/etc/exports`**(路径/客户端网段见 **`ARMV7_NFS_EXPORT_PATH`**、**`ARMV7_NFS_CLIENT_SUBNET`**,默认 `/sdcard` 与 `192.168.2.0/24`)。 | +| 约束 | 远程路径假定 **Fedora/RHEL 系 + dnf**;**Debian/apt 未分支**。`verify.sh` 在 `source scripts/.env.verify` 后调用 `ansible-playbook`,子进程继承上述环境变量。 | +| 门控 | **`SKIP_ARMV7=0` 但未配置有效 SSH** 时,01-03 / 01-05 会 **fail**,避免 `run-all` / `full` 在误以为已启用 arm 时静默跳过。 | +| 手工 | 仍可按 [`01-03-armv7-standalone-docker.md`](01-03-armv7-standalone-docker.md)、[`01-05-armv7-nfs服务安装.md`](01-05-armv7-nfs服务安装.md) 全手工走通后在文档中记录环境与结论。 | + +## 排障 + +- **你在找执行命令**:本文为说明/索引;执行入口见 `00-00-构建总览.md` 与 `scripts/README.md`。 +- **verify.sh 报错**:先跑 `./scripts/verify.sh preflight`,再根据提示修复 inventory/SSH/变量。 diff --git a/docs/01-00-安装与基础环境-系列说明.md b/docs/01-00-安装与基础环境-系列说明.md new file mode 100644 index 0000000..9b03d69 --- /dev/null +++ b/docs/01-00-安装与基础环境-系列说明.md @@ -0,0 +1,36 @@ +# 01-00 安装与基础环境(系列说明) + +> 本系列覆盖:安装 K3s、工作节点加入、基础网络/入口、以及与主线兼容的可选项(armv7、OpenWrt HAProxy、HA 准备等)。 + +## TL;DR + +- **从零装集群(推荐自动化)**:`./scripts/deploy-lab.sh k3s`(文档:`01-06`) +- **本页定位**:仅系列导航,不参与 `verify.sh run-all/full` +- **子篇执行入口**:按下表执行 `./scripts/verify.sh run ` +- **阅读完成判据**:能按索引定位到对应子篇并完成执行 + +## 范围与非目标 + +- 本页是 **01 系列入口/导航页**(`YY=00`),不要求具备独立执行器,且不参与 `verify.sh run-all/full`。 +- 具体操作步骤与 YAML 真源请进入各分项文档;`YY>0` 的分项必须包含可执行物(YAML 路径或命令块)。 + +## 01 系列索引(按推荐顺序) + + +| doc_id | 主题 | 子篇执行入口 | +| ------ | 
--------------------------- | ------------------------------- | +| 01-06 | 节点初始化与 k3s 自动安装(Ansible) | `./scripts/verify.sh run 01-06` | +| 01-01 | 控制节点安装(含 Traefik) | `./scripts/verify.sh run 01-01` | +| 01-02 | 工作节点加入与验证 | `./scripts/verify.sh run 01-02` | +| 01-03 | armv7 standalone docker(可选) | `./scripts/verify.sh run 01-03` | +| 01-05 | armv7 NFS 服务安装(可选) | `./scripts/verify.sh run 01-05` | +| 01-07 | OpenWrt + HAProxy(可选) | `./scripts/verify.sh run 01-07` | +| 01-08 | 双控制节点 HA:安装与准备(可选) | `./scripts/verify.sh run 01-08` | + + +## 真源位置 + +- **Kubernetes YAML**:优先以 `labs/**/manifests/` 与 `ansible/files/` 为准(文档内会给具体路径) +- **自动化执行**:`ansible/playbooks/verify/` +- **脚本入口**:`scripts/README.md` + diff --git a/docs/01-01-k3s-控制节点含traefik.md b/docs/01-01-k3s-控制节点含traefik.md index 6379f8c..fe234f8 100644 --- a/docs/01-01-k3s-控制节点含traefik.md +++ b/docs/01-01-k3s-控制节点含traefik.md @@ -1,8 +1,13 @@ # 01-01-k3s-控制节点含traefik -> 在控制节点安装 K3s Server,确认基础组件与 Traefik 可用。 -> -> 若需一键自动化安装多节点集群,可直接用 `01-06-节点初始化-ansible-实践.md`。 +## TL;DR + +- **自动化验收**:在控制端(如 `ylc65`)执行 `./scripts/verify.sh run 01-01` +- **手工安装**:控制节点执行 `curl -sfL https://get.k3s.io | sh -s - server --data-dir=/storage`(或默认路径) +- **成功判据**:node 为 `Ready`;`kube-system` 中 `coredns` / `traefik` Deployment 存在;Traefik 入口可响应(常见为 `404`) +- **失败排障**:见本文「排障」小节(事件/Pod/日志/磁盘压力) + +> 说明:本篇聚焦 **单控制节点安装与基础验收**。若要一键自动化安装多节点集群,见 `01-06-节点初始化-ansible-实践.md`。 ## 前置条件 @@ -20,7 +25,7 @@ K3s 默认将数据(含 local-path 卷)放在 `--data-dir` 下。系统盘 | **方案一(默认)** | `/var/lib/rancher/k3s` | 系统盘空间充足 | | **方案二(数据盘)** | `/storage` | 系统盘小,数据盘单独挂载在 `/storage` | -> 自定义 `/storage` 仅解决单节点内系统盘/数据盘分离;节点或数据盘重建后数据不会自动迁移,高可用与备份见 `01-04`、`06-03`。 +> 自定义 `/storage` 仅解决单节点内系统盘/数据盘分离;节点或数据盘重建后数据不会自动迁移,高可用与备份见 `01-08`、`06-03`。 ## 操作步骤 @@ -42,7 +47,7 @@ curl -sfL https://get.k3s.io | sh - curl -sfL https://get.k3s.io | sh -s - server --data-dir=/storage ``` -- 使用方案二时,token 路径为 `/storage/server/token`(供 01-02 工作节点加入与 01-04 HA 使用)。 +- 使用方案二时,token 路径为 
`/storage/server/token`(供 01-02 工作节点加入与 01-08 HA 使用)。 ## 配置 kubectl(供当前用户使用) @@ -151,6 +156,16 @@ curl -I --max-time 3 http://127.0.0.1:80 - `kube-system` 命名空间核心组件正常运行 - Traefik 服务已创建并可响应(常见为 `404`,表示入口已通) +## 清理 + +本篇为安装类文档:手工安装后一般 **不卸载 K3s**,而是继续后续实验。若你仅为排障临时验收,可在运行 `./scripts/verify.sh run 01-01` 时设 `VERIFY_TEARDOWN=0` 保留现场(本篇用例默认不做破坏性清理)。 + +## 排障 + +- **节点不 Ready / DiskPressure**:优先确认 `/storage` 为独立挂载点(见 `docs/00-04-部署环境说明.md`),再看 `df -h`、`kubectl describe node ` 事件。 +- **Traefik 不就绪**:`kubectl -n kube-system get pods -o wide`;必要时 `kubectl -n kube-system logs deploy/traefik --tail=200`。 +- **CoreDNS 解析异常(影响后续 ACME)**:见下节「CoreDNS 上游 DNS」。 + ## CoreDNS 上游 DNS(ACME 用) 若后续要按 `03-02` / `03-03` 配置 ACME(Let's Encrypt),需确保集群内能解析 `acme-v02.api.letsencrypt.org`。宿主机若使用 **IPv6 DNS**(`/etc/resolv.conf` 含 `240e:...` 等),K3s Pod 网络仅 IPv4 时无法访问,ACME 会报 `server misbehaving` 或 `network is unreachable`。 diff --git a/docs/01-02-k3s-工作节点.md b/docs/01-02-k3s-工作节点.md index 290b778..a21db3d 100644 --- a/docs/01-02-k3s-工作节点.md +++ b/docs/01-02-k3s-工作节点.md @@ -5,6 +5,13 @@ > > 若需一键自动化安装多节点集群,可直接用 `01-06-节点初始化-ansible-实践.md`。 +## TL;DR + +- **自动化验收**:在控制端执行 `./scripts/verify.sh run 01-02` +- **手工加入 worker**:在 worker 上按本文执行 `k3s agent ...`(注意 token 路径与 `/storage` 方案) +- **成功判据**:`kubectl get nodes` 中 worker 为 `Ready`;`kube-system` 中 Traefik 正常;入口 `:80` 可达(按本文验收命令) +- **失败排障**:见本文「排障」小节(token/防火墙/flannel/cni0/调度入口节点) + ## 前置条件 - 已完成 `01-01-k3s-控制节点含traefik.md` @@ -89,13 +96,13 @@ kubectl label node ylc62 svccontroller.k3s.cattle.io/lbpool=edge --overwrite ### 3.2 Ansible 方式(推荐,集中管理入口节点) -也可以在 [`ansible/group_vars/all.yml`](../ansible/group_vars/all.yml) 中配置入口节点列表 `k3s_ingress_nodenames`(示例:`ylc61`、`ylc62`),由 `k3s-init-and-install.yml` 自动打标签。 +也可以在 [`ansible/group_vars/all.yml`](../ansible/group_vars/all.yml) 中配置入口节点列表 `k3s_ingress_nodenames`(示例:`ylc61`、`ylc62`),由 `01-06.yml`(`-e k3s_do_install=true`)自动打标签。 运行: ```bash cd ansible -ansible-playbook -i inventory.ini 
playbooks/k3s-init-and-install.yml +ansible-playbook -i inventory.ini playbooks/verify/01-06.yml -e k3s_do_install=true ``` 若 `k3s_ingress_nodenames` 为空(默认),Ansible 会对**所有节点**打入口标签,与早期行为一致; @@ -133,11 +140,11 @@ curl -I --max-time 3 http://192.168.2.62:80 - `trusted` 中可看到 `flannel.1 cni0` - 被标记为入口节点的 IP:80(示例中为 `192.168.2.61`、`192.168.2.62`)可返回 Traefik 响应(常见 `404`) -## 失败排查 +## 排障 - 若出现 `502/跨节点不通/admin-prohibited`,看:`06-01-k3s-networkpolicy-故障排查.md` ## 下一步 +- `02-00-nginx-系列说明.md` → `02-05-nginx-验证矩阵-一键部署.md`(HTTP 入口与 Traefik 路由最小验证) - `03-01-k3s-traefik-dashboard.md` -- `04-03-k3s-nginx-demo.md` diff --git a/docs/01-03-armv7-standalone-docker.md b/docs/01-03-armv7-standalone-docker.md index af38ab7..aa4a574 100644 --- a/docs/01-03-armv7-standalone-docker.md +++ b/docs/01-03-armv7-standalone-docker.md @@ -2,6 +2,14 @@ > armv7 节点不加入 K3s,单独运行 Docker 服务(NFS、OneNav、openlist 等)。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 01-03` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - armv7 节点网络可达 @@ -29,3 +37,9 @@ docker ps - `05-02-onenav首页面板.md` - `01-05-armv7-nfs服务安装.md` + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n <namespace> describe ...`、`kubectl -n <namespace> logs ... 
--tail=200`。 diff --git a/docs/01-05-armv7-nfs服务安装.md b/docs/01-05-armv7-nfs服务安装.md index 3dbe39d..c79c249 100644 --- a/docs/01-05-armv7-nfs服务安装.md +++ b/docs/01-05-armv7-nfs服务安装.md @@ -2,6 +2,14 @@ > 本文只讲 armv7 主机侧 NFS 服务安装与导出配置,目标是把 **`/sdcard`** 作为 NFS 共享目录导出给 K3s 节点使用。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 01-05` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成 `01-03-armv7-standalone-docker.md` @@ -260,3 +268,8 @@ mount | grep nfs - `03-06-k3s-使用nfs存储.md` +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/01-06-节点初始化-ansible-实践.md b/docs/01-06-节点初始化-ansible-实践.md index a329812..e36ff4f 100644 --- a/docs/01-06-节点初始化-ansible-实践.md +++ b/docs/01-06-节点初始化-ansible-实践.md @@ -3,7 +3,15 @@ > 目标:给一组已经装好 OS、可以 SSH 的裸金属/虚机,**一键完成基础初始化 + 安装 k3s server/worker**,得到与 `01-01`、`01-02` 文档一致的集群(含 `/storage` 数据盘方案)。 > > **状态:已验证**(2026-03,Fedora + K3s,4 节点 61~64)。 -> 部署环境详见 `00-04-部署环境说明.md`。 +> 部署环境详见 `00-02-部署环境说明.md`。 + +## TL;DR + +- **一键安装**:`./scripts/deploy-lab.sh k3s` +- **一键验收**:`./scripts/verify.sh run 01-06`(或直接 `./scripts/verify.sh full`) +- **关键前置**:控制端可 SSH 所有节点;`ansible/inventory.ini` 私钥路径存在且权限正确;(可选)每台节点已挂载 `/storage` 或启用 `K3S_PREPARE_STORAGE=true` +- **成功判据**:所有节点 `Ready`;kube-system 核心组件就绪;后续按 `02-05` 可跑入口验证 +- **失败排障**:见本文「排障」小节(SSH/私钥、/storage、firewalld、k3s service) ## 1. 
适用边界与前提 @@ -22,7 +30,7 @@ - **数据盘**:若使用 `/storage` 方案,每台节点须将**独立数据盘**挂载到 `/storage`(与 `/` 不同设备),详见 `00-04` 与下文「数据盘准备」。 - 不覆盖: - 从「完全裸铁 + 无系统」开始的 PXE 装机; - - 高级 HA(多 server + 外部 datastore)——仍按 `01-04`、`03-10` 执行。 + - 高级 HA(多 server + 外部 datastore)——仍按 `01-08`、`03-08` 执行。 ### 1.1 数据盘准备(手工,或与自动化二选一) @@ -43,17 +51,17 @@ XFS 用户将 `mkfs.ext4` / `fstab` 类型改为 `xfs` 即可(Longhorn 支持 **自动化(可选)**:在 `group_vars/all.yml` 中设置 `k3s_prepare_storage: true` 与 `k3s_data_disk_device: /dev/vdb`(四台盘符一致时一条即可;不一致则用 `host_vars/<hostname>.yml` 覆盖),然后执行: ```bash -ansible-playbook -i inventory.ini playbooks/k3s-prepare-storage.yml +ansible-playbook -i inventory.ini playbooks/verify/01-06.yml -e k3s_do_prepare_storage=true ``` 该 playbook 在 `/storage` 已是独立挂载时会跳过,避免重复执行。 ### 1.2 推荐执行顺序(10G + 32G 四节点) -1. (可选)`playbooks/k3s-prepare-storage.yml` -2. `playbooks/k3s-init-and-install.yml`(可在 `group_vars` 中设 `k3s_verify_storage_mount: true` 强制校验 `/` 与 `/storage` 不同源) -3. (可选)`playbooks/longhorn-install.yml`(Helm,见 `03-07`) -4. (可选)`playbooks/apply-local-path-config-lab.yml`,或 `longhorn_apply_local_path_lab: true` 随 Longhorn 一并应用(真源:`files/kube-system/local-path-config-lab.json`,见 `03-05`) +1. (可选)`playbooks/verify/01-06.yml -e k3s_do_prepare_storage=true`(准备数据盘) +2. `playbooks/verify/01-06.yml -e k3s_do_install=true`(可在 `group_vars` 中设 `k3s_verify_storage_mount: true` 强制校验 `/` 与 `/storage` 不同源) +3. (可选)`playbooks/verify/03-07.yml`(Helm,见 `03-07`) +4. (可选)`playbooks/verify/03-05.yml`,或 `longhorn_apply_local_path_lab: true` 随 Longhorn 一并应用(真源:`files/kube-system/local-path-config-lab.json`,见 `03-05`) ## 2. 
目录结构 @@ -66,10 +74,10 @@ ansible/ group_vars/ all.yml playbooks/ - k3s-prepare-storage.yml # 可选:第二块盘分区、挂载 /storage - k3s-init-and-install.yml # 标准 IPv4 安装 - longhorn-install.yml # 可选:Helm 安装 Longhorn - apply-local-path-config-lab.yml # 可选:仅应用 local-path 实验室 ConfigMap + verify/ + 01-06.yml # 标准 IPv4 安装(-e k3s_do_install=true);可选准备数据盘(-e k3s_do_prepare_storage=true) + 03-07.yml # 可选:Helm 安装 Longhorn + 03-05.yml # 可选:仅应用 local-path 实验室 ConfigMap(-e local_path_apply_lab_config=true) files/ longhorn/values-lab.yaml # 实验室 Helm values kube-system/local-path-config-lab.json @@ -116,11 +124,11 @@ k3s_worker **存储挂载校验**(推荐实验室开启): -- `k3s_verify_storage_mount: true`:在 `k3s-init-and-install.yml` 安装 k3s **之前**,断言 `/storage` 为挂载点且与 `/` 不同块设备;失败时提示查阅 `00-04`。已有「目录式假 /storage」的旧环境可临时设为 `false`。 +- `k3s_verify_storage_mount: true`:在 `01-06.yml` 安装 k3s(`-e k3s_do_install=true`)**之前**,断言 `/storage` 为挂载点且与 `/` 不同块设备;失败时提示查阅 `00-04`。已有「目录式假 /storage」的旧环境可临时设为 `false`。 **数据盘自动化**(可选): -- `k3s_prepare_storage: true` 且 `k3s_data_disk_device: /dev/vdb`:由 `k3s-prepare-storage.yml` 执行(见 §1.1)。 +- `k3s_prepare_storage: true` 且 `k3s_data_disk_device: /dev/vdb`:由 `01-06.yml -e k3s_do_prepare_storage=true` 执行(见 §1.1)。 ## 5. 
执行流程概览 @@ -159,11 +167,11 @@ playbook 依次执行: ```bash cd ansible # (可选)先准备数据盘挂载 /storage -# ansible-playbook -i inventory.ini playbooks/k3s-prepare-storage.yml +# ansible-playbook -i inventory.ini playbooks/verify/01-06.yml -e k3s_do_prepare_storage=true # 标准 IPv4 安装 -ansible-playbook -i inventory.ini playbooks/k3s-init-and-install.yml +ansible-playbook -i inventory.ini playbooks/verify/01-06.yml -e k3s_do_install=true # (可选)Helm 安装 Longhorn -# ansible-playbook -i inventory.ini playbooks/longhorn-install.yml +# ansible-playbook -i inventory.ini playbooks/verify/03-07.yml ``` 执行结束后,playbook 会输出: @@ -193,3 +201,9 @@ KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl get pods -n kube-system -o wide - `03-09-k3s-gitops-集群配置管理.md`:用 Argo CD/Flux 管理 Traefik、监控、应用清单; - `01-01`、`01-02` 中的验证命令与入口验证。 +## 排障 + +- **Ansible 连不上节点**:先在控制端跑 `./scripts/verify.sh preflight`;检查 `ansible/inventory.ini` 主机名/IP、`ansible_user`、私钥路径与权限(600)。 +- **/storage 校验失败**:确认每台节点 `/storage` 为独立挂载点;必要时先跑 `K3S_PREPARE_STORAGE=true ./scripts/deploy-lab.sh k3s` 或单独跑 `ansible/playbooks/verify/01-06.yml -e k3s_do_prepare_storage=true`。 +- **kube-system 组件不就绪**:在 server 上 `journalctl -u k3s -n 200 --no-pager`,以及 `kubectl -n kube-system get pods -o wide`/`describe` 查看事件。 + diff --git a/docs/01-07-openwrt-haproxy.md b/docs/01-07-openwrt-haproxy.md index 7856cd4..f277504 100644 --- a/docs/01-07-openwrt-haproxy.md +++ b/docs/01-07-openwrt-haproxy.md @@ -2,6 +2,14 @@ > 在 OpenWrt 上安装并配置 HAProxy,将 80/443 流量转发到 K3s 集群节点(Traefik 入口),实现单一入口与负载均衡。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 01-07` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - OpenWrt 与 K3s 节点同网段(如 192.168.2.0/24),OpenWrt 通常为网关(如 192.168.2.1) @@ -20,9 +28,9 @@ opkg install haproxy 编辑 `/etc/haproxy.cfg` 或包提供的配置路径(部分 OpenWrt 使用 `/etc/haproxy/haproxy.cfg`)。可在 `/etc/init.d/haproxy` 中查看实际配置文件路径。 -**配置目录说明与「cfg 是否正确」的验证层次**:见 
`ansible/files/01-07-haproxy/README.md`(**仅语法**:`./scripts/01-07-verify-haproxy.sh --cfg-only`)。 +**配置目录说明与「cfg 是否正确」的验证层次**:见 
`ansible/files/01-07/`(**仅语法**:`./scripts/01-07-verify-haproxy.sh --cfg-only`)。 -**无健康检查最简配置**:`ansible/files/01-07-haproxy/haproxy-no-check.cfg`(与 Ansible 共用,可复制到 OpenWrt 或通过 playbook 下发)。将 `192.168.2.61`~`192.168.2.64` 按实际 K3s 节点 IP 修改。如需健康检查见第 3 节;如需真实客户端 IP 见第 5 节 PROXY Protocol。 +**无健康检查最简配置**:`ansible/files/01-07/haproxy-no-check.cfg`(与 Ansible 共用,可复制到 OpenWrt 或通过 playbook 下发)。将 `192.168.2.61`~`192.168.2.64` 按实际 K3s 节点 IP 修改。如需健康检查见第 3 节;如需真实客户端 IP 见第 5 节 PROXY Protocol。 ## 3. 健康检查 @@ -43,15 +51,15 @@ opkg install haproxy ### 3.2 HTTP(80 明文) -完整配置:`ansible/files/01-07-haproxy/haproxy-http.cfg`。`backend k3s_http` 开头加 `option httpchk GET /`,`k3s_https` 仍为 TCP 检查。 +完整配置:`ansible/files/01-07/haproxy-http.cfg`。`backend k3s_http` 开头加 `option httpchk GET /`,`k3s_https` 仍为 TCP 检查。 ### 3.3 TLS(443 握手,`mode tcp`) -完整配置:`ansible/files/01-07-haproxy/haproxy-tls.cfg`。`backend k3s_https` 中加 `option ssl-hello-chk`,做 TLS 握手层检查。 +完整配置:`ansible/files/01-07/haproxy-tls.cfg`。`backend k3s_https` 中加 `option ssl-hello-chk`,做 TLS 握手层检查。 ### 3.4 HTTPS(443 应用层,`mode http` + `ssl`) -完整配置:`ansible/files/01-07-haproxy/haproxy-https.cfg`。适用于 **HAProxy 在 443 终结 TLS(由 HAProxy 提供证书)** 的场景(frontend 需 `bind *:443 ssl crt ...`)。需与 Traefik 路由匹配的 `Host`;自签/内网 CA 用 `verify none`,生产建议 `ca-file`。若仍为 TCP 透传,用 3.3 即可。 +完整配置:`ansible/files/01-07/haproxy-https.cfg`。适用于 **HAProxy 在 443 终结 TLS(由 HAProxy 提供证书)** 的场景(frontend 需 `bind *:443 ssl crt ...`)。需与 Traefik 路由匹配的 `Host`;自签/内网 CA 用 `verify none`,生产建议 `ca-file`。若仍为 TCP 透传,用 3.3 即可。 ## 4. 启动与验证 @@ -66,13 +74,13 @@ opkg install haproxy **验证**:经 **ssh onecloud**(或你可访问的第三方机器)发起 curl,验证 `http://:18080` 与 `https://<域名>:18443`(HTTPS 需正确设置 Host/SNI,例如 `curl --https-hosts ...`)。不部署、不改端口;需 OpenWrt HAProxy 已按 18080/18443 配置。 -验证通过后,请**手工**在 `docs/00-02-验证矩阵.md` 补充状态与备注(当前仓库已下线“自动更新矩阵”的执行入口)。 +验证通过后,建议你在本篇文档中**手工**补充状态与备注(环境/日期/覆盖范围)。 ## 5. 
PROXY Protocol(可选) 若 Traefik 需获取真实客户端 IP,可在 HAProxy 后端每个 `server` 行添加 `send-proxy-v2`,并在 Traefik 配置 `trustedIPs` 包含 OpenWrt 网段(见 `03-02-k3s-traefik-acme.md`)。 -**完整配置**:`ansible/files/01-07-haproxy/haproxy-proxy-http-tls.cfg`(HTTP 检查 + TLS 检查 + PROXY)。 +**完整配置**:`ansible/files/01-07/haproxy-proxy-http-tls.cfg`(HTTP 检查 + TLS 检查 + PROXY)。 Traefik 端需启用 PROXY protocol 监听并信任 OpenWrt 的 IP,否则会报错。UCI 配置需参考 OpenWrt HAProxy 文档中的相应选项。 @@ -115,3 +123,9 @@ curl -v http://192.168.2.61/ - `01-02-k3s-工作节点.md`:Traefik 入口与 LB 基线 - `02-05-nginx-验证矩阵-一键部署.md`:验证矩阵(按入口 IP 访问) - `03-02-k3s-traefik-acme.md`:PROXY protocol、trustedIPs + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/01-04-双控制节点ha.md b/docs/01-08-双控制节点ha.md similarity index 74% rename from docs/01-04-双控制节点ha.md rename to docs/01-08-双控制节点ha.md index 09c14e1..04ff5fe 100644 --- a/docs/01-04-双控制节点ha.md +++ b/docs/01-08-双控制节点ha.md @@ -1,4 +1,11 @@ -# 01-04-双控制节点HA(安装与准备) +# 01-08-双控制节点HA(安装与准备) + +## TL;DR + +- **自动化验收(基线)**:`./scripts/verify.sh run 01-08`(只做集群可达性基线;HA 加入/切换需按本文手工演练) +- **你需要准备**:第二个 server、外部 datastore、`6443` LB(HAProxy 等)、维护窗口与备份 +- **成功判据**:能按本文完成外部 datastore 与 LB 的准备清单;并在 `03-08` 中完成加入/切换演练 +- **失败排障**:见本文「排障」小节(datastore/LB/tls-san/6443) > 本文只讲双控制节点 HA 的安装前准备与基础环境搭建。 > 具体集群参数切换、server 加入与迁移步骤见 `03-08-k3s-ha-集群配置与切换.md`。 @@ -53,7 +60,7 @@ sudo k3s server \ 1. **确认 worker 节点健康**: - 已按 `01-02-k3s-工作节点.md` 正常加入集群; - 无关键 Pod 仅运行在该节点(可先用 `kubectl drain` 或手动迁移工作负载)。 -2. **在 `01-04` 阶段完成外部 datastore 与 LB 准备**: +2. **在 `01-08` 阶段完成外部 datastore 与 LB 准备**: - 不要立即改动现有 server/worker 的 systemd 配置,只确保 datastore/LB 均已就绪。 3. 
**在 `03-08` 中按步骤将该 worker 替换为 server**: - 停止该节点上的 `k3s-agent` 服务(或执行官方卸载脚本); @@ -76,5 +83,11 @@ kubectl get pods -A ## 下一步 +## 排障 + +- **LB 6443 不通**:先在客户端 `curl -k https://<LB地址>:6443/ping`;再在各 server 检查监听与防火墙放行。 +- **加入第二个 server 后 kubeconfig 指向错误地址**:确认 `--tls-san` 包含 LB IP/域名与各 server IP,并更新 kubeconfig server 地址。 +- **外部 datastore 连接失败**:检查连接串、网络 ACL、防火墙、账号权限;在 server 上用 `psql/mysql` 先手工连通再跑 k3s 参数。 + - `03-08-k3s-ha-集群配置与切换.md` diff --git a/docs/02-00-nginx-系列说明.md b/docs/02-00-nginx-系列说明.md index 1ebdb01..d20bae3 100644 --- a/docs/02-00-nginx-系列说明.md +++ b/docs/02-00-nginx-系列说明.md @@ -1,6 +1,14 @@ # 02-00 Nginx 矩阵系列说明(节点 + Ingress / IngressRoute) -> 目的:先把本系列(02-01~02-05)共用的“节点调度规则”和“Traefik 路由对象差异”讲清楚,后面的 02-01~02-04 分篇才能读得更快、更少踩坑。 +> 目的:先把本系列(02-01~02-05)共用的“节点调度规则”和“Traefik 路由对象差异”讲清楚,后面的 02-01~02-04 分篇才能读得更快、减少常见误区。 + + +## TL;DR + +- **自动化验收**:本篇为系列说明页,不参与 `verify.sh run-all/full` +- **关键前置**:按本文说明准备节点标签、路由对象与入口路径认知 +- **成功判据**:你能区分 M1~M4 的节点落点与 Ingress/IngressRoute 差异,并据此进入 `02-01~02-05` +- **排障**:见本文「排障」 --- @@ -97,7 +105,7 @@ kubectl describe node <节点名> 通用删除建议使用 manifests 目录(一键清理同一个场景): ```bash -kubectl delete -f ansible/files/02-05-nginx-matrix/ -R +kubectl delete -f ansible/files/02-05/ -R ``` 或按具体文件删单个场景(见各分篇的 `## 删除` 小节)。 @@ -110,3 +118,8 @@ kubectl delete -f ansible/files/02-05-nginx-matrix/ -R 2. `02-01~02-04`(按你关心的节点落点/路由对象读) 3. `02-05-nginx-验证矩阵-一键部署.md`(最终整合、一次验证 4 个场景) +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n <namespace> describe ...`、`kubectl -n <namespace> logs ... 
--tail=200`。 diff --git a/docs/02-01-nginx-control-ingress.md b/docs/02-01-nginx-control-ingress.md index ec94664..cd25d30 100644 --- a/docs/02-01-nginx-control-ingress.md +++ b/docs/02-01-nginx-control-ingress.md @@ -2,6 +2,13 @@ > 场景:nginx 落在控制节点(`nodeSelector: node-role.kubernetes.io/control-plane`),使用标准 Ingress 暴露 `/demo-m1`。整合于 `02-05-nginx-验证矩阵-一键部署.md`。 +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 02-01` +- **你需要准备**:入口节点 `:80` 可达;(可选)`nginx_entry_base=http://<入口IP>` 用于脚本侧 HTTP 校验 +- **成功判据**:`/demo-m1/` 返回 `200` 且能区分后端(见本篇“验证命令/预期”与 playbook 断言) +- **排障**:见本文「排障」 + ## 前置条件 - 已完成 `01-02-k3s-工作节点.md` @@ -14,12 +21,12 @@ 2. 创建 Middleware + Ingress(`/demo-m1` -> nginx-m1:80) 3. 等待 Pod 与 Ingress 就绪 -示例 YAML 见 `ansible/files/02-05-nginx-matrix/01-control-ingress.yaml`。 +示例 YAML 见 `ansible/files/02-05/01-control-ingress.yaml`。 ## 部署命令 ```bash -kubectl apply -f ansible/files/02-05-nginx-matrix/01-control-ingress.yaml +kubectl apply -f ansible/files/02-05/01-control-ingress.yaml ``` ## 验证命令 @@ -37,10 +44,10 @@ curl -i --max-time 3 http://<入口节点IP>/demo-m1/ ## 删除 ```bash -kubectl delete -f ansible/files/02-05-nginx-matrix/01-control-ingress.yaml +kubectl delete -f ansible/files/02-05/01-control-ingress.yaml ``` -## 失败排查 +## 排障 - 确认 Traefik 接管 Ingress、Service/Endpoint 正常 - 参考 `06-01-k3s-networkpolicy-故障排查.md` diff --git a/docs/02-02-nginx-control-ingressroute.md b/docs/02-02-nginx-control-ingressroute.md index 98f6f60..6bb8480 100644 --- a/docs/02-02-nginx-control-ingressroute.md +++ b/docs/02-02-nginx-control-ingressroute.md @@ -2,6 +2,14 @@ > 场景:nginx 指定一台控制节点(`nodeSelector: kubernetes.io/hostname: ylc61`),路由使用 Traefik CRD `IngressRoute`,暴露 `/demo-m2`。整合于 `02-05-nginx-验证矩阵-一键部署.md`。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 02-02` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成 `01-02-k3s-工作节点.md` @@ -14,12 +22,12 @@ 2. 创建 Middleware + IngressRoute(`PathPrefix(/demo-m2)`) 3. 
等待资源就绪 -示例 YAML 见 `ansible/files/02-05-nginx-matrix/02-control-ingressroute.yaml`。 +示例 YAML 见 `ansible/files/02-05/02-control-ingressroute.yaml`。 ## 部署命令 ```bash -kubectl apply -f ansible/files/02-05-nginx-matrix/02-control-ingressroute.yaml +kubectl apply -f ansible/files/02-05/02-control-ingressroute.yaml ``` ## 验证命令 @@ -38,7 +46,7 @@ curl -i --max-time 3 http://<入口节点IP>/demo-m2/ ## 删除 ```bash -kubectl delete -f ansible/files/02-05-nginx-matrix/02-control-ingressroute.yaml +kubectl delete -f ansible/files/02-05/02-control-ingressroute.yaml ``` ## 失败排查 @@ -49,3 +57,9 @@ kubectl delete -f ansible/files/02-05-nginx-matrix/02-control-ingressroute.yaml ## 下一步 - 返回 `02-05-nginx-验证矩阵-一键部署.md` 或 `00-00-构建总览.md` + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/02-03-nginx-worker-ingress.md b/docs/02-03-nginx-worker-ingress.md index d71b42a..8916374 100644 --- a/docs/02-03-nginx-worker-ingress.md +++ b/docs/02-03-nginx-worker-ingress.md @@ -2,6 +2,14 @@ > 场景:nginx 随机一台工作节点(`nodeSelector: node-role.kubernetes.io/worker: ""`),跨节点 Ingress 暴露 `/demo-m3`。整合于 `02-05-nginx-验证矩阵-一键部署.md`。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 02-03` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成 `01-02-k3s-工作节点.md` @@ -14,12 +22,12 @@ 2. 创建 Middleware + Ingress(`/demo-m3` -> nginx-m3:80) 3. 
等待资源就绪 -示例 YAML 见 `ansible/files/02-05-nginx-matrix/03-worker-ingress.yaml`。 +示例 YAML 见 `ansible/files/02-05/03-worker-ingress.yaml`。 ## 部署命令 ```bash -kubectl apply -f ansible/files/02-05-nginx-matrix/03-worker-ingress.yaml +kubectl apply -f ansible/files/02-05/03-worker-ingress.yaml ``` ## 验证命令 @@ -37,7 +45,7 @@ curl -i --max-time 3 http://<入口节点IP>/demo-m3/ ## 删除 ```bash -kubectl delete -f ansible/files/02-05-nginx-matrix/03-worker-ingress.yaml +kubectl delete -f ansible/files/02-05/03-worker-ingress.yaml ``` ## 失败排查 @@ -48,3 +56,9 @@ kubectl delete -f ansible/files/02-05-nginx-matrix/03-worker-ingress.yaml ## 下一步 - 返回 `02-05-nginx-验证矩阵-一键部署.md` 或 `00-00-构建总览.md` + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/02-04-nginx-worker-ingressroute.md b/docs/02-04-nginx-worker-ingressroute.md index 80af58b..6964c27 100644 --- a/docs/02-04-nginx-worker-ingressroute.md +++ b/docs/02-04-nginx-worker-ingressroute.md @@ -2,6 +2,14 @@ > 场景:nginx 指定落在 ylc64(`nodeSelector: kubernetes.io/hostname: ylc64`),跨节点 IngressRoute 暴露 `/demo-m4`。整合于 `02-05-nginx-验证矩阵-一键部署.md`。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 02-04` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成 `01-02-k3s-工作节点.md` @@ -15,12 +23,12 @@ 2. 创建 Middleware + IngressRoute(`PathPrefix(/demo-m4)`) 3. 
等待资源就绪 -示例 YAML 见 `ansible/files/02-05-nginx-matrix/04-worker-ingressroute.yaml`。 +示例 YAML 见 `ansible/files/02-05/04-worker-ingressroute.yaml`。 ## 部署命令 ```bash -kubectl apply -f ansible/files/02-05-nginx-matrix/04-worker-ingressroute.yaml +kubectl apply -f ansible/files/02-05/04-worker-ingressroute.yaml ``` ## 验证命令 @@ -39,7 +47,7 @@ curl -i --max-time 3 http://<入口节点IP>/demo-m4/ ## 删除 ```bash -kubectl delete -f ansible/files/02-05-nginx-matrix/04-worker-ingressroute.yaml +kubectl delete -f ansible/files/02-05/04-worker-ingressroute.yaml ``` ## 失败排查 @@ -51,3 +59,9 @@ kubectl delete -f ansible/files/02-05-nginx-matrix/04-worker-ingressroute.yaml ## 下一步 - 返回 `02-05-nginx-验证矩阵-一键部署.md` 或 `00-00-构建总览.md` + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/02-05-nginx-验证矩阵-一键部署.md b/docs/02-05-nginx-验证矩阵-一键部署.md index e8be19c..c1ccd61 100644 --- a/docs/02-05-nginx-验证矩阵-一键部署.md +++ b/docs/02-05-nginx-验证矩阵-一键部署.md @@ -2,6 +2,14 @@ > **定位**:02 系列尾部,整合 02-01~02-04 的综合一键部署。4 种组合(控制节点/工作节点 × Ingress/IngressRoute)均有具体 Deployment + Service + 路由,节点 IP 访问(如 `http://入口IP/demo-m1/`)。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 02-05` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成 `01-02-k3s-工作节点.md`(Traefik 与 LB 可用) @@ -23,7 +31,7 @@ ## 完整配置(与 Ansible 共用) -配置位于 `ansible/files/02-05-nginx-matrix/`(4 个文件对应 M1~M4),文档与 Ansible 共用此目录: +配置位于 `ansible/files/02-05/`(4 个文件对应 M1~M4),文档与 Ansible 共用此目录: | 文件 | 场景 | 路径 | 节点 | |------|------|------|------| @@ -37,7 +45,7 @@ ## 部署 ```bash -kubectl apply -f ansible/files/02-05-nginx-matrix/ -R +kubectl apply -f ansible/files/02-05/ -R kubectl get pod,svc,ing,ingressroute -n default -o wide ``` @@ -110,7 +118,7 @@ Manifest 里四份写法一致,若只有 M1 仍显示默认页,多半是集 ```bash # 在仓库根目录执行时: -kubectl apply -f 
ansible/files/02-05-nginx-matrix/ -R +kubectl apply -f ansible/files/02-05/ -R # 若当前在 ansible/ 目录下,改用: kubectl apply -f files/02-05-nginx-matrix/ -R @@ -128,7 +136,7 @@ kubectl delete deployment nginx-m1 -n default kubectl apply -f files/02-05-nginx-matrix/01-control-ingress.yaml # 若在仓库根目录: -# kubectl apply -f ansible/files/02-05-nginx-matrix/01-control-ingress.yaml +# kubectl apply -f ansible/files/02-05/01-control-ingress.yaml # 3. 等 Pod Running 后验证 kubectl get pod -n default -l app=nginx-m1 @@ -142,19 +150,19 @@ kubectl exec -n default deployment/nginx-m1 -- cat /etc/nginx/conf.d/default.con 可使用 Ansible playbook 自动完成复制 manifests、apply、等待 Pod 就绪及 curl 验证: -- **Playbook**:`ansible/playbooks/nginx-matrix-deploy.yml` -- **Manifests 位置**:`ansible/files/02-05-nginx-matrix/`(M1 control-plane / M2 M4 节点名 ylc61、ylc64,M3 worker;按实际修改 M2/M4 节点名) +- **Playbook**:`ansible/playbooks/verify/02-05.yml` +- **Manifests 位置**:`ansible/files/02-05/`(M1 control-plane / M2 M4 节点名 ylc61、ylc64,M3 worker;按实际修改 M2/M4 节点名) - **执行(在 ansible/ 目录下)**: ```bash cd ansible -ansible-playbook -i inventory.ini playbooks/nginx-matrix-deploy.yml +ansible-playbook -i inventory.ini playbooks/verify/02-05.yml ``` 若 manifests 目录未找到,可改为在仓库根目录执行: ```bash -ansible-playbook -i ansible/inventory.ini ansible/playbooks/nginx-matrix-deploy.yml +ansible-playbook -i ansible/inventory.ini ansible/playbooks/verify/02-05.yml ``` Playbook 会:拷贝 manifests 到控制节点 → **先删除全部 nginx 矩阵 Deployment**(nginx-m1~m4,若存在)→ `kubectl apply` → `rollout restart` M1~M4 → 等待 Pod 就绪 → 输出 16 个目标(4 节点 × 4 路径)的 curl 矩阵。先删再 apply 可避免旧 ReplicaSet 导致任一 Mx 仍显示默认页。 @@ -164,14 +172,14 @@ Playbook 会:拷贝 manifests 到控制节点 → **先删除全部 nginx 矩 **手动 kubectl apply 的**:用同一目录删除 ```bash -kubectl delete -f ansible/files/02-05-nginx-matrix/ -R +kubectl delete -f ansible/files/02-05/ -R ``` **Ansible playbook 部署的**:在仓库根或 ansible 同级的机器上,用 manifests 删除(需配置 KUBECONFIG) ```bash export KUBECONFIG=/etc/rancher/k3s/k3s.yaml # 或从控制节点拷贝 kubeconfig -kubectl delete -f 
ansible/files/02-05-nginx-matrix/ -R +kubectl delete -f ansible/files/02-05/ -R ``` 若控制节点上 `/tmp/nginx-matrix/` 仍存在,也可在控制节点执行: @@ -200,3 +208,9 @@ kubectl delete configmap -n default nginx-m1-html nginx-m2-html nginx-m3-html ng - `02-01`~`02-04`:分篇说明(路径 /demo-m1~m4、nodeSelector 与本文一致) - `03-02-k3s-traefik-acme.md` - `06-01-k3s-networkpolicy-故障排查.md` + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/03-00-集群侧配置扩展-系列说明.md b/docs/03-00-集群侧配置扩展-系列说明.md new file mode 100644 index 0000000..bf38618 --- /dev/null +++ b/docs/03-00-集群侧配置扩展-系列说明.md @@ -0,0 +1,34 @@ +# 03-00 集群侧配置扩展(系列说明) + +> 本系列覆盖:Traefik 扩展(Dashboard/ACME/自定义端口)、Tunnel、存储(local-path/NFS/Longhorn)、HA/GitOps 等。 + +## TL;DR + +- **本页定位**:仅系列导航,不参与 `verify.sh run-all/full` +- **子篇执行入口**:按下表执行 `./scripts/verify.sh run ` + +## 范围与非目标 + +- 本页是 **03 系列入口/导航页**(`YY=00`),不要求具备独立执行器,且不参与 `verify.sh run-all/full`。 +- `YY>0` 的分项必须包含可执行物(YAML 路径或命令块)。 + +## 03 系列索引(按推荐顺序) + +| doc_id | 主题 | 子篇执行入口 | 备注 | +|-------:|------|------------|------| +| 03-01 | Traefik Dashboard | `./scripts/verify.sh run 03-01` | 浏览器 UI 另验 | +| 03-02 | Traefik ACME(含 TLS 矩阵) | `./scripts/verify.sh run 03-02` | 依赖 ACME/Cloudflare 条件 | +| 03-03 | Dashboard + ACME(组合) | `./scripts/verify.sh run 03-03` | 目前以文档为主 | +| 03-04 | Cloudflare Tunnel 接入 | `./scripts/verify.sh run 03-04` | 依赖 token 等外部条件 | +| 03-05 | local-path PVC demo | `./scripts/verify.sh run 03-05` | | +| 03-06 | NFS PVC demo | `./scripts/verify.sh run 03-06` | 依赖 NFS 变量 | +| 03-07 | Longhorn | `./scripts/verify.sh run 03-07` | 耗时较长 | +| 03-08 | K3s HA:集群配置与切换 | `./scripts/verify.sh run 03-08` | 建议手工演练为主 | +| 03-09 | GitOps:集群配置管理 | `./scripts/verify.sh run 03-09` | | +| 03-10 | Traefik 自定义端口 | `./scripts/verify.sh run 03-10` | | + +## 真源位置 + +- Traefik:`labs/traefik/manifests/`(部分仍在 
`ansible/files/03-xx/`) +- 存储:`labs/storage/manifests/`、`labs/longhorn/manifests/` + diff --git a/docs/03-01-k3s-traefik-dashboard.md b/docs/03-01-k3s-traefik-dashboard.md index d5f1f89..12ca6e0 100644 --- a/docs/03-01-k3s-traefik-dashboard.md +++ b/docs/03-01-k3s-traefik-dashboard.md @@ -2,6 +2,14 @@ > 启用并访问 Traefik Dashboard,用于查看路由与服务状态。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 03-01` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - Traefik 已正常运行 @@ -20,10 +28,10 @@ - **默认路径**:`/var/lib/rancher/k3s/server/manifests/traefik-dashboard.yaml` - **自定义 data-dir**(如 `--data-dir=/storage`):`/server/manifests/traefik-dashboard.yaml` - **唯一真源(勿与文档内联重复)**:[HelmChartConfig + IngressRoute 完整 YAML](../../ansible/files/03-01-traefik-dashboard/traefik-dashboard.yaml)。复制到上述 manifests 路径,或在仓库根执行: + **唯一真源(勿与文档内联重复)**:[HelmChartConfig + IngressRoute 完整 YAML](../ansible/files/03-01/traefik-dashboard.yaml)。复制到上述 manifests 路径,或在仓库根执行: ```bash - kubectl apply -f ansible/files/03-01-traefik-dashboard/traefik-dashboard.yaml + kubectl apply -f ansible/files/03-01/traefik-dashboard.yaml ``` 2. 应用配置并等待 Traefik 重载(按实际路径选择其一复制执行): @@ -89,5 +97,11 @@ sudo rm -f /storage/server/manifests/traefik-dashboard.yaml ## 下一步 -- `04-03-k3s-nginx-demo.md` +- `02-05-nginx-验证矩阵-一键部署.md`(若尚未做 HTTP 矩阵验证;可先读 `02-00-nginx-系列说明.md`) - `04-01-k3s-nodejs-高级部署.md` + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n <namespace> describe ...`、`kubectl -n <namespace> logs ... 
--tail=200`。 diff --git a/docs/03-02-k3s-traefik-acme.md b/docs/03-02-k3s-traefik-acme.md index 30925f9..fb22537 100644 --- a/docs/03-02-k3s-traefik-acme.md +++ b/docs/03-02-k3s-traefik-acme.md @@ -1,11 +1,19 @@ # 03-02-k3s Traefik ACME -> **状态:✅ 验证已完成**(2026-03,K3s 4 节点 ylc61~ylc64,Cloudflare DNS-01、Let’s Encrypt 证书、TLS 矩阵 `test01~test04.jackadam.top`,HTTPS 与 HTTP-only 各 16 目标均 200;Ansible `nginx-matrix-tls-deploy.yml` 已实机跑通。) +> **状态:✅ 验证已完成**(2026-03,K3s 4 节点 ylc61~ylc64,Cloudflare DNS-01、Let’s Encrypt 证书、TLS 矩阵 `test01~test04.jackadam.top`,HTTPS 与 HTTP-only 各 16 目标均 200;Ansible `03-02.yml -e nginx_matrix_tls_enable=true` 已实机跑通。) > 为 Traefik 配置 ACME 自动证书(Let's Encrypt + Cloudflare DNS 验证),并部署 **TLS 矩阵**。 > > **为 02-05 的升级版**:02-05 为 HTTP-only(节点 IP、无域名);本页在其基础上增加 ACME 证书、域名、根路径 `/`,用于有域名时的学习或生产。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 03-02` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + --- ## 前置条件 @@ -21,7 +29,7 @@ - **Pod / 部署**:ACME 配置通过 `HelmChartConfig` 注入到 **同一个 Traefik Deployment**。**副本数为 chart 默认值 1**(即 `deployment.replicas` 未在 values 里写时默认为 1),所以只有 1 个 Traefik Pod;与 03-01 的 Traefik 是同一套 Deployment,只是 values 里多了 ACME 参数与 env。 - **配置存在哪里**:`HelmChartConfig` 存在 **etcd**(控制节点);K3s 的 chart 控制器据此更新 Traefik 的部署参数,Traefik 进程从 **Kubernetes API** 读取 Ingress/IngressRoute,无需多 Pod 间同步。 -- **ACME 存储(证书与账户)**:`acme.storage` 指向容器内 **`/data/acme.json`**。未配 hostPath 时,K3s 默认会为 Traefik 挂载卷到 `/data`(如 emptyDir 或默认持久卷),**仅当前这一个 Traefik Pod 可写**,Pod 重建后若卷不持久则需重新申请证书。若在 values 里配置了 **hostPath**(见本页可选配置),则 `/data` 对应宿主机目录,证书写在物理机路径,便于备份与复用;Traefik 仍为 1 个 Pod,不存在多副本间同步 acme.json 的问题。**推荐**:Dashboard + ACME 场景直接用 **同一份** [`traefik-dashboard-acme.yaml`](../ansible/files/03-03-traefik-dashboard-acme/traefik-dashboard-acme.yaml)(已含 **`persistence`(local-path)+ ACME**),见 `03-05-k3s-local-path-pvc.md`。不要 Dashboard 时按该文件头注释删减。 +- **ACME 存储(证书与账户)**:`acme.storage` 指向容器内 **`/data/acme.json`**。未配 hostPath 时,K3s 默认会为 Traefik 
挂载卷到 `/data`(如 emptyDir 或默认持久卷),**仅当前这一个 Traefik Pod 可写**,Pod 重建后若卷不持久则需重新申请证书。若在 values 里配置了 **hostPath**(见本页可选配置),则 `/data` 对应宿主机目录,证书写在物理机路径,便于备份与复用;Traefik 仍为 1 个 Pod,不存在多副本间同步 acme.json 的问题。**推荐**:Dashboard + ACME 场景直接用 **同一份** [`traefik-dashboard-acme.yaml`](../ansible/files/03-03/traefik-dashboard-acme.yaml)(已含 **`persistence`(local-path)+ ACME**),见 `03-05-k3s-local-path-pvc.md`。不要 Dashboard 时按该文件头注释删减。 - **第一次部署随机节点、重启后怎么办**:Traefik 未指定 nodeSelector 时,首次会**随机调度**到某一节点。若使用了 **hostPath**,证书只存在于该节点的磁盘上;**Pod 被调度到其他节点**(重启、驱逐、缩容再扩容)时,新节点上的同名 hostPath 是另一块盘,**证书不会跟着走**,可能需重新申请。若希望重启或节点故障后仍保留证书,可:**① 把 Traefik 固定到某一节点**(在 HelmChartConfig 的 `deployment` 下配 `nodeSelector`,例如 `nodeSelector: { kubernetes.io/hostname: ylc61 }(节点名使用短主机名 ylc61~ylc64,便于配合 Cloudflare CDN)`),使 hostPath 始终落在同一台机;**② 或不用 hostPath**,依赖 K3s 默认持久卷(若为 local-path,则卷仍绑定某节点,Pod 重建到同节点可复用);**③ 或改用 NFS 等共享存储**挂到 `/data`,多节点可读同一证书(需自行在 values 里配 PVC/volume)。 --- @@ -86,11 +94,11 @@ kubectl -n kube-system get secret cloudflare-api-token \ ## 配置 HelmChartConfig -> **重要**:同一 chart 只能有一份 `HelmChartConfig`(如 `name: traefik`)。若已按 03-01 部署了 Dashboard,再单独 apply 本文件的配置会**覆盖**掉 03-01,Dashboard 会失效。此时应二选一:**要么**使用 `03-03-k3s-traefik-dashboard-acme.md` 中的合并 YAML(Dashboard + ACME 一份搞定),**要么**把本页的 ACME 配置合并进已有 03-01 的 `traefik-dashboard.yaml`,只保留一个 manifest 文件。 +> **重要**:同一 chart 只能有一份 `HelmChartConfig`(如 `name: traefik`)。若已按 03-01 部署了 Dashboard,再单独 apply 本文件的配置会**覆盖**掉 03-01,Dashboard 会失效。此时应二选一:**要么**使用 `03-03-k3s-traefik-dashboard-acme.md` 中的合并 YAML(Dashboard + ACME 合并为单一清单),**要么**把本页的 ACME 配置合并进已有 03-01 的 `traefik-dashboard.yaml`,只保留一个 manifest 文件。 > > **文件选择**:K3s 自带的 `traefik.yaml` 会被 K3s 覆盖,**不要修改**。所有自定义配置(ACME、nodeSelector、hostPath 以及其他扩展配置)都应写在 **`traefik-acme.yaml`** 这一份 HelmChartConfig 里,与默认 chart 合并生效。 -1. 
在控制节点创建 `traefik-acme.yaml`,推荐放入 K3s manifests 目录(路径同 03-01)。**完整配置见 `ansible/files/03-02-traefik-acme/traefik-acme.yaml`**(与 Ansible 共用),复制后替换 `` 等占位符即可。若走 **Dashboard + ACME** 且需 **证书落盘 local-path PVC**,直接用 [`traefik-dashboard-acme.yaml`](../ansible/files/03-03-traefik-dashboard-acme/traefik-dashboard-acme.yaml)(已内置 persistence,说明见 `03-05-k3s-local-path-pvc.md`)。**仅 ACME、无 Dashboard** 时仍可用本目录 [`traefik-acme.yaml`](../ansible/files/03-02-traefik-acme/traefik-acme.yaml),并自行按 `03-05` 在 Helm values 中增加 `persistence` 块(与 `/data/acme.json` 一致)。 +1. 在控制节点创建 `traefik-acme.yaml`,推荐放入 K3s manifests 目录(路径同 03-01)。**完整配置见 `ansible/files/03-02/traefik-acme.yaml`**(与 Ansible 共用),复制后替换 `` 等占位符即可。若走 **Dashboard + ACME** 且需 **证书落盘 local-path PVC**,直接用 [`traefik-dashboard-acme.yaml`](../ansible/files/03-03/traefik-dashboard-acme.yaml)(已内置 persistence,说明见 `03-05-k3s-local-path-pvc.md`)。**仅 ACME、无 Dashboard** 时仍可用本目录 [`traefik-acme.yaml`](../ansible/files/03-02/traefik-acme.yaml),并自行按 `03-05` 在 Helm values 中增加 `persistence` 块(与 `/data/acme.json` 一致)。 > 将 `` 改为你的邮箱。`/data/acme.json` 为容器内路径;`caserver` 为测试服务器(staging),正式上线前删除该行即切回生产 CA。Traefik 在容器内监听 8000/8443,由 Service 和 svclb 映射到节点 80/443。 > @@ -185,7 +193,7 @@ kubectl -n kube-system exec -it "$POD" -- nslookup acme-v02.api.letsigncrypt.org ## TLS 矩阵清单(02-05 升级版) -> **唯一真源**:[`ansible/files/03-02-nginx-matrix-tls/`](../../ansible/files/03-02-nginx-matrix-tls/)(`01-control-ingress.yaml`~`04-worker-ingressroute.yaml`),与 [`ansible/playbooks/nginx-matrix-tls-deploy.yml`](../../ansible/playbooks/nginx-matrix-tls-deploy.yml) 共用;**本文不再内联整份 YAML**。 +> **唯一真源**:[`ansible/files/03-02/`](../../ansible/files/03-02/)(`01-control-ingress.yaml`~`04-worker-ingressroute.yaml`),与 [`ansible/playbooks/verify/03-02.yml`](../../ansible/playbooks/verify/03-02.yml) 共用;**本文不再内联整份 YAML**。 **相对 02-05 的差异摘要**:基于域名根路径 `/`;TLS 仅绑 `websecure`;含 HTTP-only(仅 `web`)路由;与 02-05 的 `/demo-mx` 为两套资源;M2/M4 节点名与域名请在清单内编辑。 @@ -197,12 +205,12 @@ kubectl -n kube-system exec 
-it "$POD" -- nslookup acme-v02.api.letsigncrypt.org **方式一:使用仓库 YAML 目录(推荐与文档一致)** -1. 在仓库中编辑 [`ansible/files/03-02-nginx-matrix-tls/`](../../ansible/files/03-02-nginx-matrix-tls/) 内各文件(M2/M4 节点名、域名等)。 -2. 按 k3s 存储方案可将整个目录复制到控制节点 manifests,或直接在仓库根执行 `kubectl apply -f ansible/files/03-02-nginx-matrix-tls/ -R`(与 `01-01-k3s-控制节点含traefik.md` 存储路径说明一致)。 +1. 在仓库中编辑 [`ansible/files/03-02/`](../../ansible/files/03-02/) 内各文件(M2/M4 节点名、域名等)。 +2. 按 k3s 存储方案可将整个目录复制到控制节点 manifests,或直接在仓库根执行 `kubectl apply -f ansible/files/03-02/ -R`(与 `01-01-k3s-控制节点含traefik.md` 存储路径说明一致)。 3. 清理示例(路径与 apply 时一致): ```bash - kubectl delete -f ansible/files/03-02-nginx-matrix-tls/ -R --ignore-not-found=true + kubectl delete -f ansible/files/03-02/ -R --ignore-not-found=true ``` 或沿用下文按资源名删除。 或按资源名删除(与路径无关): @@ -217,12 +225,12 @@ kubectl -n kube-system exec -it "$POD" -- nslookup acme-v02.api.letsigncrypt.org - 直接使用仓库中已合并好的 4 个文件(每个 Mx 含 TLS + HTTP-only),在**仓库根目录**执行: ```bash - kubectl apply -f ansible/files/03-02-nginx-matrix-tls/ -R + kubectl apply -f ansible/files/03-02/ -R ``` 需保证当前环境已设置 KUBECONFIG 或 `kubectl` 已指向目标集群(例如在控制节点上或已配置远程 kubeconfig)。 - 一键部署/清理推荐用 Playbook(会先删 02-05 残留、再 apply、并做就绪与 curl 验证): - - 在 `ansible/` 目录下:`ansible-playbook -i inventory.ini playbooks/nginx-matrix-tls-deploy.yml` - - 在仓库根目录:`ansible-playbook -i ansible/inventory.ini ansible/playbooks/nginx-matrix-tls-deploy.yml` + - 在 `ansible/` 目录下:`ansible-playbook -i inventory.ini playbooks/verify/03-02.yml` + - 在仓库根目录:`ansible-playbook -i ansible/inventory.ini ansible/playbooks/verify/03-02.yml` - 清理:同上命令后加 `-e mode=cleanup`。 验证 HTTP 与 HTTPS 是否正常(将 `192.168.2.61 … 192.168.2.64` 按实际入口节点 IP 修改): @@ -258,7 +266,7 @@ done ```bash kubectl get ingress -n default nginx-m1 -o yaml | grep -A5 "tls:\|host:\|certresolver" ``` - 若无 `tls` / `host` / `certresolver`,说明当前是 02-05 的非 TLS Ingress,需执行 `kubectl apply -f ansible/files/03-02-nginx-matrix-tls/ -R`(或跑 Ansible playbook `nginx-matrix-tls-deploy.yml`)。 + 若无 `tls` / `host` / 
`certresolver`,说明当前是 02-05 的非 TLS Ingress,需执行 `kubectl apply -f ansible/files/03-02/ -R`(或跑 Ansible playbook `03-02.yml -e nginx_matrix_tls_enable=true`)。 2. **看 Traefik 是否尝试/成功申请证书**: ```bash @@ -281,7 +289,7 @@ done 6. **日志出现 “service not found” / “kubernetes service not found: default/nginx-m2” / “middleware … does not exist”**:说明 Ingress/IngressRoute 已存在,但对应的 **Service 或 Middleware 缺失**(例如只 apply 了部分 TLS 矩阵,或先删后 apply 时 Traefik 在中间时刻读到不完整状态)。需**完整** apply TLS 矩阵,保证 M1~M4 的 Deployment、Service、Middleware、Ingress/IngressRoute 一起就绪: ```bash - kubectl apply -f ansible/files/03-02-nginx-matrix-tls/ -R + kubectl apply -f ansible/files/03-02/ -R kubectl get svc,middleware -n default | grep -E "nginx-m|stripprefix" ``` 确认 nginx-m1~m4 的 Service 与 stripprefix-m1~m4 的 Middleware 均存在后,Traefik 会重新同步路由;证书仍需按上一步确保 ACME 配置生效。 @@ -311,18 +319,18 @@ done 可使用 Ansible 自动部署 / 清理 TLS 矩阵(test01~test04.jackadam.top)并做 HTTPS 验证: -- **Playbook**:`ansible/playbooks/nginx-matrix-tls-deploy.yml` -- **Manifests**:`ansible/files/03-02-nginx-matrix-tls/`(M1~M4 带 TLS,域名为 test01~test04.jackadam.top;按实际修改 M2/M4 节点名 ylc61/ylc64) +- **Playbook**:`ansible/playbooks/verify/03-02.yml` +- **Manifests**:`ansible/files/03-02/`(M1~M4 带 TLS,域名为 test01~test04.jackadam.top;按实际修改 M2/M4 节点名 ylc61/ylc64) - **前置**:已按本页完成 ACME 配置,且 test01~test04.jackadam.top 已解析到入口 IP ```bash # 一键部署 TLS 矩阵 cd ansible -ansible-playbook -i inventory.ini playbooks/nginx-matrix-tls-deploy.yml -e mode=deploy +ansible-playbook -i inventory.ini playbooks/verify/03-02.yml -e mode=deploy # 一键删除 TLS 矩阵 cd ansible -ansible-playbook -i inventory.ini playbooks/nginx-matrix-tls-deploy.yml -e mode=cleanup +ansible-playbook -i inventory.ini playbooks/verify/03-02.yml -e mode=cleanup ``` Playbook 在 `mode=deploy` 时会:拷贝 TLS manifests 到控制节点 → **若存在不含 TLS 的 nginx 矩阵(02-05),先按资源名删除**(deployments、svc、ingress、ingressroute、configmaps 共 M1~M4)→ `kubectl apply` TLS 矩阵 → 等待 Pod 就绪 → 对**所有 k3s_nodes 节点**做 HTTPS 验证(4 节点 × 4 域名 = 16 个目标,与 02-05 HTTP 
矩阵一致,所有节点均为入口点)。`mode=cleanup` 时则按资源名删除 TLS 矩阵相关 Deployment/Service/Ingress/IngressRoute/ConfigMap,并清理 `/tmp/nginx-matrix-tls` 目录,恢复到未部署 TLS 矩阵前的状态。 @@ -382,3 +390,8 @@ sudo rm -f /storage/server/manifests/traefik-acme.yaml - 返回 00-00-构建总览.md,按导航继续。 +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/03-03-k3s-traefik-dashboard-acme.md b/docs/03-03-k3s-traefik-dashboard-acme.md index 6f2f683..89babd5 100644 --- a/docs/03-03-k3s-traefik-dashboard-acme.md +++ b/docs/03-03-k3s-traefik-dashboard-acme.md @@ -2,6 +2,14 @@ > 按顺序完成 Traefik Dashboard 与 ACME 自动证书,为后续应用(GitLab、Homer 等)提供 HTTPS 能力。**ACME 配置与 03-03 已实机验证**(2026-03,K3s 4 节点、Cloudflare DNS-01、Let's Encrypt),本页为 Dashboard + ACME 合并版。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 03-03` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成 `01-02-k3s-工作节点.md`(集群与 Traefik 可用) @@ -20,7 +28,7 @@ kubectl -n kube-system create secret generic cloudflare-api-token \ > 说明:Traefik 的 `HelmChartConfig` 只能有一份,Dashboard 与 ACME 需合并在同一文件中。**ACME 配置基于 03-03 实机验证**(递归 DNS、propagation 等待、ping、PROXY protocol、nodeSelector)。 -创建 `traefik-dashboard-acme.yaml`,推荐放入 K3s manifests 目录(路径同 03-02)。**唯一真源**(已含 **`persistence`(local-path)+ ACME + Dashboard + IngressRoute**,证书落盘 `/data/acme.json`):[`traefik-dashboard-acme.yaml`](../ansible/files/03-03-traefik-dashboard-acme/traefik-dashboard-acme.yaml);复制后替换 `` 等占位符,或在仓库根执行 `kubectl apply -f ansible/files/03-03-traefik-dashboard-acme/traefik-dashboard-acme.yaml`。细节见 `03-05-k3s-local-path-pvc.md`。 +创建 `traefik-dashboard-acme.yaml`,推荐放入 K3s manifests 目录(路径同 03-02)。**唯一真源**(已含 **`persistence`(local-path)+ ACME + Dashboard + IngressRoute**,证书落盘 `/data/acme.json`):[`traefik-dashboard-acme.yaml`](../ansible/files/03-03/traefik-dashboard-acme.yaml);复制后替换 
`` 等占位符,或在仓库根执行 `kubectl apply -f ansible/files/03-03/traefik-dashboard-acme.yaml`。细节见 `03-05-k3s-local-path-pvc.md`。 > 将 `` 替换为你的邮箱。正式上线前删除 `caserver` 该行即切回生产 Let's Encrypt。**ACME 排障**(DNS 解析错误、证书解析器不存在等)见 `03-02-k3s-traefik-acme.md` 中「常见问题」与「排查」小节。 @@ -76,12 +84,12 @@ kubectl -n kube-system logs deploy/traefik --tail=100 | grep -i acme > 本节给出一个**完整、独立**的 Tomcat 示例:包含 Deployment + Service + Ingress(三段 YAML),域名为 `test05.jackadam.top`。前提是已经按本页前文配置并成功加载了 ACME(`traefik-acme.yaml` 或 `traefik-dashboard-acme.yaml`)。 -1. **唯一真源**:[`ansible/files/03-03-traefik-dashboard-acme/tomcat-acme.yaml`](../ansible/files/03-03-traefik-dashboard-acme/tomcat-acme.yaml)。将其中域名改成你实际解析到集群入口 IP 的 FQDN。 +1. **唯一真源**:[`ansible/files/03-03/tomcat-acme.yaml`](../ansible/files/03-03/tomcat-acme.yaml)。将其中域名改成你实际解析到集群入口 IP 的 FQDN。 2. 应用并查看 ACME 日志 + 访问验证: ```bash -kubectl apply -f ansible/files/03-03-traefik-dashboard-acme/tomcat-acme.yaml +kubectl apply -f ansible/files/03-03/tomcat-acme.yaml # 查看 ACME 相关日志(证书申请、签发情况) kubectl -n kube-system logs deploy/traefik --tail=200 | grep -i acme || true @@ -147,3 +155,8 @@ sudo rm -f /storage/server/manifests/traefik-dashboard-acme.yaml - `03-04-k3s-cloudflare-tunnel-配置接入.md`:若需 Cloudflare Tunnel 接入 - `01-07-openwrt-haproxy.md`:如需调整外部端口/防火墙,参考 HAProxy 监听与转发(第 6 节) +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/03-04-k3s-cloudflare-tunnel-配置接入.md b/docs/03-04-k3s-cloudflare-tunnel-配置接入.md index 947f8d6..87b6308 100644 --- a/docs/03-04-k3s-cloudflare-tunnel-配置接入.md +++ b/docs/03-04-k3s-cloudflare-tunnel-配置接入.md @@ -2,7 +2,15 @@ > 本文覆盖 Tunnel 完整流程:Zero Trust 云端创建、域名映射,以及将 `cloudflared` 安装到 K3s 并跑起 Pod,使 **Traefik 通过 Tunnel 对外提供服务**。 > -> **状态:已验证**(2026-03,本仓库实验室 K3s 集群;详见 `00-02-验证矩阵.md`)。 +> **状态:已验证**(2026-03,本仓库实验室 K3s 集群)。 + + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 03-04` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 --- @@ -45,7 +53,7 @@ Traefik 是唯一入口。所有流量经 Tunnel 进入后,由 Traefik 的 Ing ### 3. 部署 cloudflared 到 K3s -1. 从 **唯一真源** 复制清单:[`ansible/files/03-04-cloudflare-tunnel/cloudflared.yaml`](../ansible/files/03-04-cloudflare-tunnel/cloudflared.yaml) +1. 从 **唯一真源** 复制清单:[`ansible/files/03-04/cloudflared.yaml`](../ansible/files/03-04/cloudflared.yaml) 2. 将 `TUNNEL_TOKEN` 占位符替换为前述 Zero Trust 中复制的 Token 3. 应用并等待 Pod 就绪(按实际 manifests 路径选择其一): @@ -81,7 +89,7 @@ Tunnel 后端应指向 **集群内的 Traefik 入口**,常用写法: **和仓库里哪份 YAML 的关系** -- 本仓库的 [`cloudflared.yaml`](../ansible/files/03-04-cloudflare-tunnel/cloudflared.yaml) **只** 定义 `cloudflared` 的 Deployment/Secret,**不包含** Traefik Service;Tunnel 后端地址写的是 **集群里已存在的 Traefik Service**,不是 `cloudflared.yaml` 里的某一行。 +- 本仓库的 [`cloudflared.yaml`](../ansible/files/03-04/cloudflared.yaml) **只** 定义 `cloudflared` 的 Deployment/Secret,**不包含** Traefik Service;Tunnel 后端地址写的是 **集群里已存在的 Traefik Service**,不是 `cloudflared.yaml` 里的某一行。 - Traefik 的 **Service** 由 K3s 内置 Traefik(HelmChart)安装时创建,资源名一般为 **`traefik`**,命名空间 **`kube-system`**。若你改过 chart 或 Service 名,以下 FQDN 与端口要以 **实际 `kubectl get svc` 输出** 为准。 **与 `kubectl get svc traefik -o yaml` 里哪些字段对应** @@ -239,3 +247,9 @@ Zero Trust 向导顺序为:选择类型 → 命名 → **安装并运行 Conne - 其他应用(GitLab、Homer 等):在集群内创建 IngressRoute/Ingress 指定 Host 与后端,再在 Zero Trust 中添加对应子域的 Public Hostname 即可 - `05-03-k3s-安装gitlab-含runner.md` - `05-01-k3s-部署homer首页面板.md` 
+ +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n <namespace> describe ...`、`kubectl -n <namespace> logs ... --tail=200`。 diff --git a/docs/03-05-k3s-local-path-pvc.md b/docs/03-05-k3s-local-path-pvc.md index a0d386b..8da7a9b 100644 --- a/docs/03-05-k3s-local-path-pvc.md +++ b/docs/03-05-k3s-local-path-pvc.md @@ -2,6 +2,14 @@ > 这篇只讲一件事:在 K3s 里,如何控制“数据最终落到宿主机哪个目录”。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 03-05` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 先说结论 - **方法一(推荐)`local-path-config`**:给 `local-path` 动态供给指定“基路径”。 @@ -34,9 +42,9 @@ kubectl -n kube-system get configmap local-path-config -o yaml > /tmp/local-path kubectl -n kube-system edit configmap local-path-config ``` -**本仓库实验室真源**(四节点 **10G+32G**、K3s `--data-dir=/storage` 统一拓扑):[`ansible/files/03-05-local-path-config/local-path-config-lab.json`](../ansible/files/03-05-local-path-config/local-path-config-lab.json) —— 仅含 **`DEFAULT_PATH_FOR_NON_LISTED_NODES` → `/storage/storage`**。应用方式: +**本仓库实验室真源**(四节点 **10G+32G**、K3s `--data-dir=/storage` 统一拓扑):[`ansible/files/03-05/local-path-config-lab.json`](../ansible/files/03-05/local-path-config-lab.json) —— 仅含 **`DEFAULT_PATH_FOR_NON_LISTED_NODES` → `/storage/storage`**。应用方式: -- Ansible:`ansible-playbook -i inventory.ini playbooks/apply-local-path-config-lab.yml`,或在 `group_vars/all.yml` 设 `longhorn_apply_local_path_lab: true` 后执行 `longhorn-install.yml`(见 `01-06`、`03-07`)。 +- Ansible:`ansible-playbook -i inventory.ini playbooks/verify/03-05.yml`,或在 `group_vars/all.yml` 设 `longhorn_apply_local_path_lab: true` 后执行 `03-07.yml`(见 `01-06`、`03-07`)。 - 手工:备份后编辑 ConfigMap,将 `config.json` 与真源 JSON 对齐,再 `rollout restart` provisioner。 配置结构示意(**四节点统一基路径**时只需 `DEFAULT` 一条;请与现有 JSON 合并,不要盲目整段覆盖): @@ -65,12 +73,12 @@ kubectl -n kube-system rollout restart deploy/local-path-provisioner 2>/dev/null ### 3) 用 demo 
验证(PVC -> PV -> 节点 -> 落地目录) -Demo 清单:[`ansible/files/03-05-local-path-demo/local-path-pvc-demo.yaml`](../ansible/files/03-05-local-path-demo/local-path-pvc-demo.yaml) +Demo 清单:[`ansible/files/03-05/local-path-pvc-demo.yaml`](../ansible/files/03-05/local-path-pvc-demo.yaml) > 该 demo 已包含 `nodeSelector` 固定节点(示例为 `ylc61`),使用前请按你的节点主机名修改。 ```bash -kubectl apply -f ansible/files/03-05-local-path-demo/local-path-pvc-demo.yaml +kubectl apply -f ansible/files/03-05/local-path-pvc-demo.yaml kubectl rollout status deploy/nginx-local-pvc-demo --timeout=180s ``` @@ -148,7 +156,8 @@ echo "node=$NODE path=$PATH_ON_NODE" 适用:单节点、实验环境,或者你明确要写死某个宿主机目录。 -最小完整示例(可直接 `kubectl apply -f`,非空行逐行注释): +最小完整示例真源:[`ansible/files/03-05/nginx-hostpath-demo.yaml`](../ansible/files/03-05/nginx-hostpath-demo.yaml) +(下方保留带注释版本便于学习;执行时请优先使用真源文件): ```yaml apiVersion: apps/v1 # Deployment 所属 API 版本 @@ -199,8 +208,8 @@ spec: # Service 规格 应用与最小验证: ```bash -# 1) 保存上面的 YAML 到 /tmp/nginx-hostpath-demo.yaml 后应用 -kubectl apply -f /tmp/nginx-hostpath-demo.yaml +# 1) 直接使用仓库真源应用 +kubectl apply -f ansible/files/03-05/nginx-hostpath-demo.yaml # 2) 等待就绪 kubectl rollout status deploy/nginx-hostpath-demo --timeout=180s @@ -250,3 +259,8 @@ kubectl run curl-test --image=curlimages/curl:8.8.0 --restart=Never -it --rm -- - `03-06-k3s-使用nfs存储.md`(多节点共享目录) - `03-07-k3s-longhorn-持久化存储.md`(重状态、快照、备份) +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/03-06-k3s-使用nfs存储.md b/docs/03-06-k3s-使用nfs存储.md index 9adac41..3637392 100644 --- a/docs/03-06-k3s-使用nfs存储.md +++ b/docs/03-06-k3s-使用nfs存储.md @@ -2,6 +2,14 @@ > 本文只讲 K3s 集群侧如何使用已安装好的 NFS。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 03-06` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成 `01-05-armv7-nfs服务安装.md` @@ -66,9 +74,11 @@ spec: # Deployment 规格 path: # NFS 导出目录或子目录(应用前替换) ``` +可执行真源:[`ansible/files/03-06/nfs-direct-demo.yaml`](../ansible/files/03-06/nfs-direct-demo.yaml)。 + ### 方式 2:静态 NFS(PV + PVC,推荐) -**唯一真源**:[`ansible/files/03-06-nfs-demo/nfs-pv-pvc-demo.yaml`](../ansible/files/03-06-nfs-demo/nfs-pv-pvc-demo.yaml)。 +**唯一真源**:[`ansible/files/03-06/nfs-pv-pvc-demo.yaml`](../ansible/files/03-06/nfs-pv-pvc-demo.yaml)。 > 为减少硬编码,示例清单已改为占位符:``、``。 > 应用前必须先替换(例如 `192.168.2.22`、`/sdcard`)。 @@ -133,10 +143,12 @@ spec: storage: 5Gi ``` +可执行真源:[`ansible/files/03-06/nfs-dynamic-pvc-demo.yaml`](../ansible/files/03-06/nfs-dynamic-pvc-demo.yaml)。 + 应用并验证: ```bash -kubectl apply -f /tmp/nfs-dynamic-pvc-demo.yaml +kubectl apply -f ansible/files/03-06/nfs-dynamic-pvc-demo.yaml kubectl get pvc nfs-dynamic-pvc-demo -n default kubectl get pv | grep nfs-dynamic-pvc-demo ``` @@ -164,10 +176,10 @@ kubectl get pv | grep nfs-dynamic-pvc-demo ```bash # 仓库根直接应用 -# 先替换 ansible/files/03-06-nfs-demo/nfs-pv-pvc-demo.yaml 里的占位符 +# 先替换 ansible/files/03-06/nfs-pv-pvc-demo.yaml 里的占位符 # -> 例如 192.168.2.22 # -> 例如 /sdcard -kubectl apply -f ansible/files/03-06-nfs-demo/nfs-pv-pvc-demo.yaml +kubectl apply -f ansible/files/03-06/nfs-pv-pvc-demo.yaml ``` ```bash @@ -186,7 +198,7 @@ kubectl describe pv nfs-pv-demo ```bash # 对方式 1(Pod 直挂)可这样验证 -kubectl apply -f /tmp/nfs-direct-demo.yaml +kubectl apply -f ansible/files/03-06/nfs-direct-demo.yaml kubectl rollout status deploy/nfs-direct-demo --timeout=180s kubectl exec deploy/nfs-direct-demo -- sh -c 'echo nfs-direct-ok > 
/usr/share/nginx/html/nfs.txt && cat /usr/share/nginx/html/nfs.txt' ``` @@ -208,3 +220,8 @@ kubectl exec deploy/nfs-direct-demo -- sh -c 'echo nfs-direct-ok > /usr/share/ng - `03-05-k3s-local-path-pvc.md`:单副本应用用 K3s 自带 local-path 即可,无需 NFS - `05-06-openlist挂载网盘与自动备份.md` +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/03-07-k3s-longhorn-持久化存储.md b/docs/03-07-k3s-longhorn-持久化存储.md index b20b651..7cf8b9b 100644 --- a/docs/03-07-k3s-longhorn-持久化存储.md +++ b/docs/03-07-k3s-longhorn-持久化存储.md @@ -1,6 +1,14 @@ # 03-07-k3s Longhorn 持久化存储(4 节点实验环境) -> 本实验:**K3s 四节点集群**、**每节点 8 GiB 内存**(与下文「系统资源」建议档一致)、**磁盘基线为约 10G 系统盘 + 32G 数据盘**,**`/storage`** 必须挂在独立数据盘上(与 `/` 不同设备),详见 `00-04-部署环境说明.md`。**没有 NFS**,用 Longhorn 做集群内动态块存储;若后续要部署 GitLab 等重状态系统,可在此基础上接 PVC。副本数可按实验目标在「省空间」与「模拟高可用」之间取舍。 +> 本实验:**K3s 四节点集群**、**每节点 8 GiB 内存**(与下文「系统资源」建议档一致)、**磁盘基线为约 10G 系统盘 + 32G 数据盘**,**`/storage`** 必须挂在独立数据盘上(与 `/` 不同设备),详见 `00-02-部署环境说明.md`。**没有 NFS**,用 Longhorn 做集群内动态块存储;若后续要部署 GitLab 等重状态系统,可在此基础上接 PVC。副本数可按实验目标在「省空间」与「模拟高可用」之间取舍。 + + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 03-07` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 --- @@ -8,9 +16,9 @@ Longhorn 与 K3s 的 `containerd`/镜像、local-path 都会大量占用 **`k3s_data_dir`(本仓库默认 `/storage`)**。若 `/storage` 只是根分区上的普通目录,控制节点易出现 **DiskPressure / Evicted**。 -- **请先阅读**:[`00-04-部署环境说明.md`](00-04-部署环境说明.md)(四节点统一拓扑、自检命令、推荐 playbook 顺序)。 +- **请先阅读**:[`00-02-部署环境说明.md`](00-02-部署环境说明.md)(四节点统一拓扑、自检命令、推荐 playbook 顺序)。 - **自检**(每台节点):`mountpoint -q /storage && findmnt -n -o SOURCE /` 与 `findmnt -n -o SOURCE /storage` 输出须**不同**。 -- **Ansible**:`k3s-init-and-install.yml` 在 `k3s_verify_storage_mount: true`(`group_vars/all.yml` 默认)时会在安装前校验上述条件;可选先跑 `k3s-prepare-storage.yml` 准备第二块盘,见 `01-06-节点初始化-ansible-实践.md`。 +- 
**Ansible**:`01-06.yml -e k3s_do_install=true` 在 `k3s_verify_storage_mount: true`(`group_vars/all.yml` 默认)时会在安装前校验上述条件;可选先跑 `01-06.yml -e k3s_do_prepare_storage=true -e k3s_prepare_storage=true` 准备第二块盘,见 `01-06-节点初始化-ansible-实践.md`。 - Longhorn 数据目录建议为 **`/storage/longhorn`**(与 Helm `values-lab.yaml` 一致),勿与系统盘混用。 **容量与副本数**:每节点数据盘约 **32G** 时,`defaultReplicaCount` 为 **2 或 3** 会使同一份逻辑卷在集群内占用 **多倍物理空间**(各副本落在不同节点上各占一份),且 Longhorn 元数据与系统组件仍有开销;实验环境可先用副本 **1**,要演练跨节点冗余再调高并预留磁盘。 @@ -156,10 +164,10 @@ done ```bash cd ansible -ansible-playbook -i inventory.ini playbooks/longhorn-install.yml +ansible-playbook -i inventory.ini playbooks/verify/03-07.yml ``` -该 playbook 会在各节点安装 iSCSI/NFS 依赖、在控制节点安装 Helm(若 `dnf/yum` 无 `helm` 包则需按 [Helm 安装文档](https://helm.sh/docs/intro/install/)手工安装后重跑)、再 `helm upgrade --install`。values 真源:[`ansible/files/03-07-longhorn/values-lab.yaml`](../ansible/files/03-07-longhorn/values-lab.yaml)。可选:`longhorn_apply_local_path_lab: true` 时一并应用 `03-05` 中的 local-path 实验室 ConfigMap。 +该 playbook 会在各节点安装 iSCSI/NFS 依赖、在控制节点安装 Helm(若 `dnf/yum` 无 `helm` 包则需按 [Helm 安装文档](https://helm.sh/docs/intro/install/)手工安装后重跑)、再 `helm upgrade --install`。values 真源:[`ansible/files/03-07/values-lab.yaml`](../ansible/files/03-07/values-lab.yaml)。可选:`longhorn_apply_local_path_lab: true` 时一并应用 `03-05` 中的 local-path 实验室 ConfigMap。 - **手工 Helm**(在 **`ylc61`** 或任意已配置 `KUBECONFIG` 的机器上;kubeconfig 见 `01-01-k3s-控制节点含traefik.md`): @@ -169,7 +177,7 @@ export KUBECONFIG=/etc/rancher/k3s/k3s.yaml helm repo add longhorn https://charts.longhorn.io helm repo update -# 将仓库内 ansible/files/03-07-longhorn/values-lab.yaml 拷到本机路径后: +# 将仓库内 ansible/files/03-07/values-lab.yaml 拷到本机路径后: helm upgrade --install longhorn longhorn/longhorn \ --namespace longhorn-system --create-namespace \ -f ./values-lab.yaml \ @@ -203,11 +211,11 @@ kubectl -n longhorn-system rollout status deploy/longhorn-ui --timeout=300s **不是**在每台机器上各执行一遍安装:无论 **Helm** 还是 **`kubectl apply`**,**对整个集群只做一次**(在能访问 API 的机器上执行即可),Longhorn 
的控制面、CSI、DaemonSet 等会由 Kubernetes 统一下发。 -**每台节点**仍要做的是:前文「磁盘前提」「前置条件」里的 **OS 依赖**与 **`/storage/longhorn` 目录**(Ansible `longhorn-install.yml` 会创建)。若节点在集群内且未被 cordon,Longhorn 的 DaemonSet 往往也会在**各节点**拉起组件 Pod,因此各节点通常都要能 **拉取镜像**;只有「存数据」可以只在部分节点上开启(见下文「只让有大盘的节点承载数据」)。 +**每台节点**仍要做的是:前文「磁盘前提」「前置条件」里的 **OS 依赖**与 **`/storage/longhorn` 目录**(Ansible `03-07.yml` 会创建)。若节点在集群内且未被 cordon,Longhorn 的 DaemonSet 往往也会在**各节点**拉起组件 Pod,因此各节点通常都要能 **拉取镜像**;只有「存数据」可以只在部分节点上开启(见下文「只让有大盘的节点承载数据」)。 ### 首选:Helm + `values-lab.yaml` -与上文 **「SSH 试跑顺序 §2」**一致:使用 **`ansible-playbook ... longhorn-install.yml`** 或手工 **`helm upgrade --install ... -f values-lab.yaml`**。Chart 仓库:`https://charts.longhorn.io`;values 字段以 [Longhorn Helm Chart](https://github.com/longhorn/longhorn/tree/master/chart) 当前版本为准。 +与上文 **「SSH 试跑顺序 §2」**一致:使用 **`ansible-playbook ... playbooks/verify/03-07.yml`**(或 `labs/longhorn-stack.yml`)或手工 **`helm upgrade --install ... -f values-lab.yaml`**。Chart 仓库:`https://charts.longhorn.io`;values 字段以 [Longhorn Helm Chart](https://github.com/longhorn/longhorn/tree/master/chart) 当前版本为准。 ### 备选:`kubectl apply` 官方清单 @@ -496,4 +504,8 @@ Longhorn UI 的部署核对、port-forward、NodePort 与界面内验证要点 - `03-05-k3s-local-path-pvc.md`:单副本、无快照需求时,用 K3s 自带 local-path 即可 - 返回后续 GitOps / 业务部署(如 GitLab)章节 +## 排障 +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/03-08-k3s-ha-集群配置与切换.md b/docs/03-08-k3s-ha-集群配置与切换.md index 10d3000..80f6ab9 100644 --- a/docs/03-08-k3s-ha-集群配置与切换.md +++ b/docs/03-08-k3s-ha-集群配置与切换.md @@ -1,10 +1,18 @@ -# 03-09-k3s HA 集群配置与切换 +# 03-08-k3s HA 集群配置与切换 > 本文只讲双控制节点 HA 的集群配置与切换步骤。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 03-08` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 -- 已完成 `01-04-双控制节点ha.md` 安装准备 +- 已完成 `01-08-双控制节点ha.md` 安装准备 - 外部 datastore 与 `6443` LB 已可用 - 已确认可执行变更窗口 @@ -72,10 +80,16 @@ kubectl get pods -A ## 参考 -- `01-04-双控制节点ha.md` +- `01-08-双控制节点ha.md` - `01-01-k3s-控制节点含traefik.md` - `01-02-k3s-工作节点.md` ## 下一步 - 返回 00-00-构建总览.md,按导航继续。 + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/03-09-k3s-gitops-集群配置管理.md b/docs/03-09-k3s-gitops-集群配置管理.md index c1f7e09..906b538 100644 --- a/docs/03-09-k3s-gitops-集群配置管理.md +++ b/docs/03-09-k3s-gitops-集群配置管理.md @@ -3,6 +3,14 @@ > 本文先给出 GitOps 管理 k3s 集群的大致框架,后续可以按需要再细化成完整实践。 > 目标:在 `01-06` 自动装好 k3s 之后,由 GitOps 工具(Argo CD / Flux)自动把 Traefik、监控、应用等 YAML 下发到集群。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 03-09` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 1. 选型与边界 - GitOps 工具二选一: @@ -54,3 +62,8 @@ homelab-k3s-gitops/ - GitOps 仓库的完整示例结构; - 与 Cloudflare Tunnel、监控、openlist 等现有专题的映射关系。 +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/03-10-k3s-traefik-custom-ports.md b/docs/03-10-k3s-traefik-custom-ports.md index 25e8d23..8fc44dc 100644 --- a/docs/03-10-k3s-traefik-custom-ports.md +++ b/docs/03-10-k3s-traefik-custom-ports.md @@ -2,6 +2,14 @@ > 为 K3s 内置 Traefik 增加 **自定义 entrypoints**(额外暴露端口),用于多入口/旁路调试/特定设备转发等场景。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 03-10` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成 `01-01-k3s-控制节点含traefik.md`,集群内 `kube-system` 命名空间的 Traefik 正常运行。 @@ -11,9 +19,9 @@ | 项 | 路径 | |----|------| -| Traefik 自定义端口(HelmChartConfig) | [`ansible/files/03-10-traefik-custom-ports/traefik-custom-ports.yaml`](../ansible/files/03-10-traefik-custom-ports/traefik-custom-ports.yaml) | -| 应用 | `kubectl apply -f ansible/files/03-10-traefik-custom-ports/traefik-custom-ports.yaml` | -| 删除 | `kubectl delete -f ansible/files/03-10-traefik-custom-ports/traefik-custom-ports.yaml` | +| Traefik 自定义端口(HelmChartConfig) | [`ansible/files/03-10/traefik-custom-ports.yaml`](../ansible/files/03-10/traefik-custom-ports.yaml) | +| 应用 | `kubectl apply -f ansible/files/03-10/traefik-custom-ports.yaml` | +| 删除 | `kubectl delete -f ansible/files/03-10/traefik-custom-ports.yaml` | ## 做了什么 @@ -27,7 +35,7 @@ ## 部署与验证 ```bash -kubectl apply -f ansible/files/03-10-traefik-custom-ports/traefik-custom-ports.yaml +kubectl apply -f ansible/files/03-10/traefik-custom-ports.yaml # 等待 Traefik 重载(可能触发 rollout) kubectl -n kube-system rollout status deploy/traefik --timeout=180s @@ -51,3 +59,8 @@ kubectl -n kube-system describe svc traefik | sed -n '/Ports:/,/Selector:/p' - `03-02-k3s-traefik-acme.md` - `01-07-openwrt-haproxy.md` +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n <namespace> describe ...`、`kubectl -n <namespace> logs ... 
--tail=200`。 diff --git a/docs/04-00-nodejs-系列说明.md b/docs/04-00-nodejs-系列说明.md new file mode 100644 index 0000000..3241807 --- /dev/null +++ b/docs/04-00-nodejs-系列说明.md @@ -0,0 +1,39 @@ +# 04-00 Node.js 高级部署(系列说明) + +> 本系列以 `nodejs-demo` 为基线(`04-01`),后续分项在同一套累积 YAML 上做增量变更(见 `ansible/files/04-01/`)。 + +## TL;DR + +- **基线入口**:`04-01-k3s-nodejs-高级部署.md` +- **本页定位**:仅系列导航,不参与 `verify.sh run-all/full` +- **子篇执行入口**:`./scripts/verify.sh run 04-01` ~ `./scripts/verify.sh run 04-14` + +## 范围与非目标 + +- 本页是 **04 系列入口/导航页**(`YY=00`),不要求具备独立执行器,且不参与 `verify.sh run-all/full`。 +- `YY>0` 的分项必须包含可执行物(YAML 路径或命令块)。 + +## 04 系列索引 + +| doc_id | 主题 | 子篇执行入口 | +|-------:|------|------------| +| 04-01 | 基线:Deployment + Service + Ingress | `./scripts/verify.sh run 04-01` | +| 04-02 | 端口与 Service | `./scripts/verify.sh run 04-02` | +| 04-03 | 镜像与运行命令 | `./scripts/verify.sh run 04-03` | +| 04-04 | 环境变量与配置注入 | `./scripts/verify.sh run 04-04` | +| 04-05 | 探针与健康检查 | `./scripts/verify.sh run 04-05` | +| 04-06 | 副本与滚动发布 | `./scripts/verify.sh run 04-06` | +| 04-07 | Ingress 与 Traefik | `./scripts/verify.sh run 04-07` | +| 04-08 | 资源请求与限制 | `./scripts/verify.sh run 04-08` | +| 04-09 | 调度与亲和 | `./scripts/verify.sh run 04-09` | +| 04-10 | 安全上下文 | `./scripts/verify.sh run 04-10` | +| 04-11 | 存储与卷 | `./scripts/verify.sh run 04-11` | +| 04-12 | TLS 与证书 | `./scripts/verify.sh run 04-12` | +| 04-13 | HPA | `./scripts/verify.sh run 04-13` | +| 04-14 | GitOps 与 CI 流水线 | `./scripts/verify.sh run 04-14` | + +## 真源位置 + +- 累积清单:`ansible/files/04-01/` +- 每个 `04-NN-nodejs-demo.yaml` 的增量变更点见对应分项文档中的「相对上一篇的变更」表 + diff --git a/docs/04-01-k3s-nodejs-高级部署.md b/docs/04-01-k3s-nodejs-高级部署.md index 3cd2f97..83d8b25 100644 --- a/docs/04-01-k3s-nodejs-高级部署.md +++ b/docs/04-01-k3s-nodejs-高级部署.md @@ -1,8 +1,16 @@ -# 04-01-k3s Node.js 高级部署 +# 04-01-k3s-nodejs-高级部署 > Node.js 属于 `04` 高级部署序列。 > 本文作为 Node.js 主入口:先跑通基础链路,再扩展到自定义端口、存储与构建。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-01` +-
**关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成 `01-02-k3s-工作节点.md` @@ -22,11 +30,11 @@ | 项 | 路径 / 命令 | |----|-------------| -| 清单文件 | [`ansible/files/04-01-nodejs-demo/04-01-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-01-nodejs-demo.yaml) | -| 手工应用 | `kubectl apply -f ansible/files/04-01-nodejs-demo/04-01-nodejs-demo.yaml` | -| Ansible | `ansible-playbook -i ansible/inventory.ini ansible/playbooks/nodejs-demo-apply.yml -e nodejs_demo_manifest=04-01-nodejs-demo.yaml` | +| 清单文件 | [`ansible/files/04-01/04-01-nodejs-demo.yaml`](../ansible/files/04-01/04-01-nodejs-demo.yaml) | +| 手工应用 | `kubectl apply -f ansible/files/04-01/04-01-nodejs-demo.yaml` | +| Ansible | `ansible-playbook -i ansible/inventory.ini ansible/playbooks/verify/04-01.yml -e nodejs_demo_manifest=04-01-nodejs-demo.yaml` | -索引与累积说明见 [`ansible/files/04-01-nodejs-demo/README.md`](../ansible/files/04-01-nodejs-demo/README.md)。 +索引与累积说明见 `ansible/files/04-01/` 目录内的累积清单文件(`04-01`~`04-14`)。 ### 相对上游 @@ -41,7 +49,7 @@ 应用方式: ```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-01-nodejs-demo.yaml +kubectl apply -f ansible/files/04-01/04-01-nodejs-demo.yaml ``` ## 基础验证 @@ -61,18 +69,18 @@ curl -s --max-time 3 http://192.168.2.62/node/ ## 部署阶段扩展(分项导航) -在本文 `nodejs-demo` 基线上按主题增量实践(建议顺序大致由上到下)。**每篇分项均链接到 `ansible/files/04-01-nodejs-demo/` 下累积清单**,并附 **相对上一篇的变更表**;与 [`ansible/playbooks/nodejs-demo-apply.yml`](../ansible/playbooks/nodejs-demo-apply.yml) 共用。 +在本文 `nodejs-demo` 基线上按主题增量实践(**`04-02`~`04-14` 已按 Core→Plus→Pro、从简到繁编号**)。**每篇分项均链接到 `ansible/files/04-01/` 下累积清单**,并附 **相对上一篇的变更表**;与 [`ansible/playbooks/verify/04-01.yml`](../ansible/playbooks/verify/04-01.yml) 共用。 -- `04-02-nodejs-镜像与运行命令.md`:镜像 tag、`imagePullPolicy`、`command`/`args` -- `04-03-nodejs-环境变量与配置注入.md`:ConfigMap/Secret、`env`/`envFrom` -- `04-04-nodejs-端口与Service.md`:`containerPort` 与 Service/Ingress 端口对应 -- `04-05-nodejs-资源请求与限制.md`:`resources`、OOM/CPU 节流 -- 
`04-06-nodejs-探针与健康检查.md`:存活/就绪/启动探针 -- `04-07-nodejs-调度与亲和.md`:`nodeSelector`、亲和、容忍 -- `04-08-nodejs-安全上下文.md`:`securityContext`、非 root、只读根等 -- `04-09-nodejs-存储与卷.md`:`emptyDir`、PVC、配置卷挂载 -- `04-10-nodejs-Ingress与Traefik.md`:路径、主机名、`web`/`websecure` -- `04-11-nodejs-副本与滚动发布.md`:`replicas`、滚动策略 +- `04-02-nodejs-端口与Service.md`:`containerPort` 与 Service/Ingress 端口对应 +- `04-03-nodejs-镜像与运行命令.md`:镜像 tag、`imagePullPolicy`、`command`/`args` +- `04-04-nodejs-环境变量与配置注入.md`:ConfigMap/Secret、`env`/`envFrom` +- `04-05-nodejs-探针与健康检查.md`:存活/就绪/启动探针 +- `04-06-nodejs-副本与滚动发布.md`:`replicas`、滚动策略 +- `04-07-nodejs-Ingress与Traefik.md`:路径、主机名、`web`/`websecure` +- `04-08-nodejs-资源请求与限制.md`:`resources`、OOM/CPU 节流 +- `04-09-nodejs-调度与亲和.md`:`nodeSelector`、亲和、容忍 +- `04-10-nodejs-安全上下文.md`:`securityContext`、非 root、只读根等 +- `04-11-nodejs-存储与卷.md`:`emptyDir`、PVC、配置卷挂载 - `04-12-nodejs-TLS与证书.md`:Ingress `tls`、HTTPS(与 `03-02` ACME 配合) - `04-13-nodejs-HPA.md`:水平自动扩缩容 - `04-14-nodejs-GitOps与CI流水线.md`:构建镜像、GitOps/CI 闭环 @@ -80,3 +88,9 @@ curl -s --max-time 3 http://192.168.2.62/node/ ## 下一步 - 返回 `00-00-构建总览.md`,按导航继续。 + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/04-04-nodejs-端口与Service.md b/docs/04-02-nodejs-端口与Service.md similarity index 65% rename from docs/04-04-nodejs-端口与Service.md rename to docs/04-02-nodejs-端口与Service.md index a08a01f..8961f71 100644 --- a/docs/04-04-nodejs-端口与Service.md +++ b/docs/04-02-nodejs-端口与Service.md @@ -1,7 +1,15 @@ -# 04-04-nodejs-端口与Service +# 04-02-nodejs-端口与Service > 理清 **容器监听端口**、**Service 端口** 与 **Ingress backend 端口** 三者对应关系;在 `04-01` 基线上做最小调整。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-02` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已部署 `nodejs-demo`(`04-01`)。 @@ -10,10 +18,10 @@ | 项 | 路径 | |----|------| -| 本篇完整清单(累积至 04-04) | [`ansible/files/04-01-nodejs-demo/04-04-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-04-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01-nodejs-demo/04-04-nodejs-demo.yaml` | +| 本篇完整清单(累积至 04-02) | [`ansible/files/04-01/04-02-nodejs-demo.yaml`](../ansible/files/04-01/04-02-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-01/04-02-nodejs-demo.yaml` | -自 **04-04** 起,累积清单中应用监听 **8080**(与 `04-01` 文档中的 3000 不同,便于与后续探针、分项对齐)。 +自 **本篇(04-02)** 起,累积清单中应用监听 **8080**(与 `04-01` 文档中的 3000 不同,便于与后续探针、分项对齐)。 ## 场景说明(白话) @@ -27,9 +35,9 @@ **改应用监听端口时**:容器监听、`containerPort`、`targetPort` 要一致;Ingress 只要还指向 Service 的 `port: 80`,通常不用动。 -### 相对 `04-03` 的变更(原文 → 新文) +### 相对 `04-01` 的变更(原文 → 新文) -| 位置 | 原文(`04-03`) | 新文(`04-04`) | +| 位置 | 原文(`04-01`) | 新文(`04-02`) | |------|-----------------|-----------------| | 容器内监听 | `.listen(3000)` | `.listen(8080)` | | `containerPort` | `3000` | `8080` | @@ -45,7 +53,7 @@ ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-04-nodejs-demo.yaml +kubectl apply -f ansible/files/04-01/04-02-nodejs-demo.yaml kubectl get svc nodejs-demo -n default -o wide kubectl get endpoints nodejs-demo -n default curl -s --max-time 3 http://<节点IP>/node/ @@ -64,5 +72,11 @@ curl -s --max-time 3 
http://<节点IP>/node/ ## 相关文档 -- [`04-10-nodejs-Ingress与Traefik.md`](04-10-nodejs-Ingress与Traefik.md) +- [`04-07-nodejs-Ingress与Traefik.md`](04-07-nodejs-Ingress与Traefik.md) - [`04-01-k3s-nodejs-高级部署.md`](04-01-k3s-nodejs-高级部署.md) + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n <ns> describe ...`、`kubectl -n <ns> logs ... --tail=200`。 diff --git a/docs/04-02-nodejs-镜像与运行命令.md b/docs/04-03-nodejs-镜像与运行命令.md similarity index 60% rename from docs/04-02-nodejs-镜像与运行命令.md rename to docs/04-03-nodejs-镜像与运行命令.md index 2a3c8a8..8b8c229 100644 --- a/docs/04-02-nodejs-镜像与运行命令.md +++ b/docs/04-03-nodejs-镜像与运行命令.md @@ -1,7 +1,15 @@ -# 04-02-nodejs-镜像与运行命令 +# 04-03-nodejs-镜像与运行命令 > 在 [`04-01-k3s-nodejs-高级部署.md`](04-01-k3s-nodejs-高级部署.md) 的 `nodejs-demo` 基线上,调整**镜像**与**进程启动方式**。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-03` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已按 `04-01` 部署并验证 `curl` 可达。 @@ -10,9 +18,9 @@ | 项 | 路径 / 命令 | |----|-------------| -| 本篇完整清单(累积至 04-02) | [`ansible/files/04-01-nodejs-demo/04-02-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-02-nodejs-demo.yaml) | -| 手工应用 | `kubectl apply -f ansible/files/04-01-nodejs-demo/04-02-nodejs-demo.yaml` | -| Ansible | `ansible-playbook -i ansible/inventory.ini ansible/playbooks/nodejs-demo-apply.yml -e nodejs_demo_manifest=04-02-nodejs-demo.yaml` | +| 本篇完整清单(累积至 04-03) | [`ansible/files/04-01/04-03-nodejs-demo.yaml`](../ansible/files/04-01/04-03-nodejs-demo.yaml) | +| 手工应用 | `kubectl apply -f ansible/files/04-01/04-03-nodejs-demo.yaml` | +| Ansible | `ansible-playbook -i ansible/inventory.ini ansible/playbooks/verify/04-01.yml -e nodejs_demo_manifest=04-03-nodejs-demo.yaml` | 若你更喜欢命令行换镜像,文末也给了 **`kubectl set image`**,可不改仓库清单。 @@ -21,20 +29,20 @@ - **换镜像版本**:就像本地 `docker pull node:18.20-alpine`,K8s 里改 `image:`
一行即可;写死版本号比总写 `latest` 更容易排查「昨天还能跑今天不行」。 - **何时拉镜像(imagePullPolicy)**:节点上还没有这个镜像时肯定要拉;若 CI 总往同一个 tag 里覆盖推送,一般要 **`Always`**,否则会用到旧层。 - **改启动命令**:镜像自带的入口不满足时,用 `command` / `args` 告诉 K8s「用哪条命令起 Node」;和 Docker 里覆盖 `ENTRYPOINT`/`CMD` 一个意思。 -- **NODE_OPTIONS 等**:适合放在环境变量里,见 [`04-03-nodejs-环境变量与配置注入.md`](04-03-nodejs-环境变量与配置注入.md)。 +- **NODE_OPTIONS 等**:适合放在环境变量里,见 [`04-04-nodejs-环境变量与配置注入.md`](04-04-nodejs-环境变量与配置注入.md)。 -### 相对 `04-01` 的变更(原文 → 新文) +### 相对 `04-02` 的变更(原文 → 新文) -| 位置 | 原文(`04-01`) | 新文(`04-02`) | +| 位置 | 原文(`04-02`) | 新文(`04-03`) | |------|-----------------|-----------------| | `containers[].image` | `node:18-alpine` | `node:18.20-alpine` | | `containers[].imagePullPolicy` | (默认) | `IfNotPresent` | -| `containers[].command` / `args` | 单行 `["node","-e","...Hello World...listen(3000)"]` | `command: ["node"]` + `args` 两段,`res.end('Hello from pinned image')` | +| `containers[].command` / `args` | 单行 `["node","-e","...listen(8080)"]` | `command: ["node"]` + `args` 两段,`listen(8080)`,`res.end('Hello from pinned image')` | 应用: ```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-02-nodejs-demo.yaml +kubectl apply -f ansible/files/04-01/04-03-nodejs-demo.yaml # 或仅打补丁(示意) kubectl set image deployment/nodejs-demo nodejs-demo=node:18.20-alpine -n default ``` @@ -63,5 +71,11 @@ kubectl rollout undo deployment/nodejs-demo -n default ## 相关文档 -- [`04-03-nodejs-环境变量与配置注入.md`](04-03-nodejs-环境变量与配置注入.md) -- [`04-05-nodejs-资源请求与限制.md`](04-05-nodejs-资源请求与限制.md) +- [`04-04-nodejs-环境变量与配置注入.md`](04-04-nodejs-环境变量与配置注入.md) +- [`04-08-nodejs-资源请求与限制.md`](04-08-nodejs-资源请求与限制.md) + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/04-03-nodejs-环境变量与配置注入.md b/docs/04-04-nodejs-环境变量与配置注入.md similarity index 52% rename from docs/04-03-nodejs-环境变量与配置注入.md rename to docs/04-04-nodejs-环境变量与配置注入.md index 2ab4656..1deac3a 100644 --- a/docs/04-03-nodejs-环境变量与配置注入.md +++ b/docs/04-04-nodejs-环境变量与配置注入.md @@ -1,7 +1,15 @@ -# 04-03-nodejs-环境变量与配置注入 +# 04-04-nodejs-环境变量与配置注入 > 在 [`04-01-k3s-nodejs-高级部署.md`](04-01-k3s-nodejs-高级部署.md) 基线上,用 **ConfigMap / Secret** 与 **`env` / `envFrom`** 注入配置,避免把敏感信息写进镜像或 Deployment 明文。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-04` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已部署 `nodejs-demo`(`04-01`)。 @@ -10,29 +18,29 @@ | 项 | 路径 / 命令 | |----|-------------| -| 本篇完整清单(累积至 04-03,含 ConfigMap + Deployment + Service + Ingress) | [`ansible/files/04-01-nodejs-demo/04-03-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-03-nodejs-demo.yaml) | -| Secret 示例(勿提交真密钥) | [`ansible/files/04-01-nodejs-demo/nodejs-demo-secret.example.yaml`](../ansible/files/04-01-nodejs-demo/nodejs-demo-secret.example.yaml) | -| 手工应用 | `kubectl apply -f ansible/files/04-01-nodejs-demo/04-03-nodejs-demo.yaml` | -| Ansible | `ansible-playbook ... -e nodejs_demo_manifest=04-03-nodejs-demo.yaml` | +| 本篇完整清单(累积至 04-04,含 ConfigMap + Deployment + Service + Ingress) | [`ansible/files/04-01/04-04-nodejs-demo.yaml`](../ansible/files/04-01/04-04-nodejs-demo.yaml) | +| Secret 示例(勿提交真密钥) | [`ansible/files/04-01/nodejs-demo-secret.example.yaml`](../ansible/files/04-01/nodejs-demo-secret.example.yaml) | +| 手工应用 | `kubectl apply -f ansible/files/04-01/04-04-nodejs-demo.yaml` | +| Ansible | `ansible-playbook ... 
-e nodejs_demo_manifest=04-04-nodejs-demo.yaml` | ## 场景说明(白话) - **普通配置**(提示文案、开关、非密钥连接串):用 **ConfigMap**;改完 `kubectl apply`,Pod 滚动后生效(是否自动重启取决于你怎么挂载/引用)。 - **密钥类**:用 **Secret**;内容和 ConfigMap 类似,但要更严格管控权限与存储位置。 -- **在 Node 里怎么用**:和在本机设环境变量一样,例如 `NODE_ENV`、`PORT`、`NODE_OPTIONS`;启动命令怎么写见 [`04-02-nodejs-镜像与运行命令.md`](04-02-nodejs-镜像与运行命令.md)。 +- **在 Node 里怎么用**:和在本机设环境变量一样,例如 `NODE_ENV`、`PORT`、`NODE_OPTIONS`;启动命令怎么写见 [`04-03-nodejs-镜像与运行命令.md`](04-03-nodejs-镜像与运行命令.md)。 -### 相对 `04-02` 的变更(原文 → 新文) +### 相对 `04-03` 的变更(原文 → 新文) -| 位置 | 原文(`04-02`) | 新文(`04-03`) | +| 位置 | 原文(`04-03`) | 新文(`04-04`) | |------|-----------------|-----------------| | 新增资源 | (无) | `ConfigMap` `nodejs-demo-config`,`APP_MSG` | | `containers[].env` | (无) | `APP_MSG` 来自 `configMapKeyRef` | -| `containers[].command` | `["node"]` + `args` 单行脚本 | `node` + 多行 `-e` 脚本,读 `process.env.APP_MSG`,仍监听 **3000** | +| `containers[].command` | `["node"]` + `args` 内联脚本 | `node` + 多行 `-e` 脚本,读 `process.env.APP_MSG`,仍监听 **8080** | 应用: ```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-03-nodejs-demo.yaml +kubectl apply -f ansible/files/04-01/04-04-nodejs-demo.yaml ``` ## 验证 @@ -45,7 +53,7 @@ curl -s --max-time 3 http://<节点IP>/node/ ## Secret 示例(仅示意) -**说明**:示例文件为 [`nodejs-demo-secret.example.yaml`](../ansible/files/04-01-nodejs-demo/nodejs-demo-secret.example.yaml);也可 `kubectl create secret generic ...`。在 Pod 中用 `env.valueFrom.secretKeyRef` 引用;验证 `printenv API_TOKEN`(注意日志勿打印密钥)。 +**说明**:示例文件为 [`nodejs-demo-secret.example.yaml`](../ansible/files/04-01/nodejs-demo-secret.example.yaml);也可 `kubectl create secret generic ...`。在 Pod 中用 `env.valueFrom.secretKeyRef` 引用;验证 `printenv API_TOKEN`(注意日志勿打印密钥)。 ## 删除 @@ -61,5 +69,11 @@ kubectl delete secret nodejs-demo-secret -n default --ignore-not-found ## 相关文档 -- [`04-09-nodejs-存储与卷.md`](04-09-nodejs-存储与卷.md)(文件挂载另一种注入方式) +- [`04-11-nodejs-存储与卷.md`](04-11-nodejs-存储与卷.md)(文件挂载另一种注入方式) - [`04-12-nodejs-TLS与证书.md`](04-12-nodejs-TLS与证书.md) + +## 排障 + +- **先看 playbook 
输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n <ns> describe ...`、`kubectl -n <ns> logs ... --tail=200`。 diff --git a/docs/04-06-nodejs-探针与健康检查.md b/docs/04-05-nodejs-探针与健康检查.md similarity index 62% rename from docs/04-06-nodejs-探针与健康检查.md rename to docs/04-05-nodejs-探针与健康检查.md index 5acad9d..e8d2369 100644 --- a/docs/04-06-nodejs-探针与健康检查.md +++ b/docs/04-05-nodejs-探针与健康检查.md @@ -1,17 +1,25 @@ -# 04-06-nodejs-探针与健康检查 +# 04-05-nodejs-探针与健康检查 > 为 `nodejs-demo` 配置 **存活 / 就绪 / 启动** 探针,使 kubelet 能在异常时重启容器,并在未就绪时从 Service **Endpoints** 摘除流量。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-05` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已部署 `nodejs-demo`(`04-01`);应用需暴露可探测的 HTTP 路径(示例用根路径 `/`)。 ## 清单路径(唯一真源) -| 本篇完整清单 | [`ansible/files/04-01-nodejs-demo/04-06-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-06-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01-nodejs-demo/04-06-nodejs-demo.yaml` | +| 本篇完整清单 | [`ansible/files/04-01/04-05-nodejs-demo.yaml`](../ansible/files/04-01/04-05-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-01/04-05-nodejs-demo.yaml` | -探针端口与累积清单一致,为 **8080**(自 `04-04` 起)。 +探针端口与累积清单一致,为 **8080**(自 `04-02` 起与监听端口对齐)。 ## 场景说明(白话) @@ -23,9 +31,9 @@ Kubernetes 会**周期性访问**你指定的地址,判断容器该不该重 | **就绪 readiness** | 「能接客了吗?」失败时**不放进 Service 负载均衡**(流量不打进来)。 | | **启动 startup** | 「是不是还在慢启动?」启动阶段先由它把关,避免被 liveness **误杀**。 | -### 相对 `04-05` 的变更(原文 → 新文) +### 相对 `04-04` 的变更(原文 → 新文) -| 位置 | 原文(`04-05`) | 新文(`04-06`) | +| 位置 | 原文(`04-04`) | 新文(`04-05`) | |------|-----------------|-----------------| | `livenessProbe` / `readinessProbe` | (无) | `httpGet` 路径 `/`,端口 **8080**,`initialDelaySeconds`/`periodSeconds` 见清单文件 | @@ -34,7 +42,7 @@ Kubernetes 会**周期性访问**你指定的地址,判断容器该不该重 ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-06-nodejs-demo.yaml +kubectl
apply -f ansible/files/04-01/04-05-nodejs-demo.yaml kubectl describe pod -l app=nodejs-demo -n default | sed -n '/Liveness/,/Events/p' kubectl get endpoints nodejs-demo -n default ``` @@ -53,5 +61,11 @@ kubectl get endpoints nodejs-demo -n default ## 相关文档 -- [`04-04-nodejs-端口与Service.md`](04-04-nodejs-端口与Service.md) -- [`04-11-nodejs-副本与滚动发布.md`](04-11-nodejs-副本与滚动发布.md) +- [`04-02-nodejs-端口与Service.md`](04-02-nodejs-端口与Service.md) +- [`04-06-nodejs-副本与滚动发布.md`](04-06-nodejs-副本与滚动发布.md) + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/04-11-nodejs-副本与滚动发布.md b/docs/04-06-nodejs-副本与滚动发布.md similarity index 68% rename from docs/04-11-nodejs-副本与滚动发布.md rename to docs/04-06-nodejs-副本与滚动发布.md index cb98f31..968570a 100644 --- a/docs/04-11-nodejs-副本与滚动发布.md +++ b/docs/04-06-nodejs-副本与滚动发布.md @@ -1,7 +1,15 @@ -# 04-11-nodejs-副本与滚动发布 +# 04-06-nodejs-副本与滚动发布 > 调整 `nodejs-demo` 的 **副本数** 与 **滚动更新策略**,实现多实例与可控发布。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-06` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已部署 `nodejs-demo`(`04-01`)。 @@ -9,8 +17,8 @@ ## 清单路径(唯一真源) -| 本篇完整清单 | [`ansible/files/04-01-nodejs-demo/04-11-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-11-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01-nodejs-demo/04-11-nodejs-demo.yaml` | +| 本篇完整清单 | [`ansible/files/04-01/04-06-nodejs-demo.yaml`](../ansible/files/04-01/04-06-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-01/04-06-nodejs-demo.yaml` | `replicas` 与 `strategy` 在 **Deployment.spec** 下,与 `selector` / `template` 同级。 @@ -19,9 +27,9 @@ - **多副本**:同样应用跑多份,一台挂了别的还能接客;配合 Service 做负载均衡。 - **滚动发布**:换新版本时**一个一个 Pod 换**,而不是全停再起(可通过 `maxSurge` / `maxUnavailable` 调「多激进」)。 -### 相对 `04-10` 的变更(原文 → 新文) +### 相对 
`04-05` 的变更(原文 → 新文) -| 位置 | 原文(`04-10`) | 新文(`04-11`) | +| 位置 | 原文(`04-05`) | 新文(`04-06`) | |------|-----------------|-----------------| | `spec.replicas` | `1` | `3` | | `spec.strategy` | (默认 RollingUpdate) | 显式 `RollingUpdate`,`maxSurge: 1`,`maxUnavailable: 0` | @@ -66,5 +74,11 @@ curl -s --max-time 3 -H "Host: app.example.local" "http://<节点IP>/api/" ## 相关文档 -- [`04-06-nodejs-探针与健康检查.md`](04-06-nodejs-探针与健康检查.md) +- [`04-05-nodejs-探针与健康检查.md`](04-05-nodejs-探针与健康检查.md) - [`04-13-nodejs-HPA.md`](04-13-nodejs-HPA.md) + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/04-10-nodejs-Ingress与Traefik.md b/docs/04-07-nodejs-Ingress与Traefik.md similarity index 70% rename from docs/04-10-nodejs-Ingress与Traefik.md rename to docs/04-07-nodejs-Ingress与Traefik.md index 89b54f2..911d7d2 100644 --- a/docs/04-10-nodejs-Ingress与Traefik.md +++ b/docs/04-07-nodejs-Ingress与Traefik.md @@ -1,15 +1,23 @@ -# 04-10-nodejs-Ingress与Traefik +# 04-07-nodejs-Ingress与Traefik > 在 K3s 默认 **Traefik** 下,为 `nodejs-demo` 调整 **路径、主机名、入口点**;并了解标准 `Ingress` 与 **IngressRoute**(CRD)的差异入口。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-07` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已部署 `04-01` 中的 `Ingress`;可选:`03-01-k3s-traefik-dashboard.md` 观察路由。 ## 清单路径(唯一真源) -| 本篇完整清单(含 Ingress `host` + `/api`) | [`ansible/files/04-01-nodejs-demo/04-10-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-10-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01-nodejs-demo/04-10-nodejs-demo.yaml` | +| 本篇完整清单(含 Ingress `host` + `/api`) | [`ansible/files/04-01/04-07-nodejs-demo.yaml`](../ansible/files/04-01/04-07-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-01/04-07-nodejs-demo.yaml` | `host` / `path` 可按环境修改清单;`curl` 用 IP 访问时需带 
**`Host`** 头。 @@ -24,9 +32,9 @@ - 注解 `traefik.ingress.kubernetes.io/router.entrypoints: web` 将路由绑定到 **HTTP** 入口(常见名 `web`)。 - HTTPS 入口通常为 **`websecure`**,与 TLS 配合见 `04-12`。 -### 相对 `04-09` 的变更(原文 → 新文) +### 相对 `04-06` 的变更(原文 → 新文) -| 位置 | 原文(`04-09`) | 新文(`04-10`) | +| 位置 | 原文(`04-06`) | 新文(`04-07`) | |------|-----------------|-----------------| | Ingress `spec.rules` | 仅 `http.paths`,无 `host`,path `/node` | `host: app.example.local`,path **`/api`** | @@ -44,7 +52,7 @@ Traefik 原生 CRD 可做中间件、多规则组合等;集群需已安装对 ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-10-nodejs-demo.yaml +kubectl apply -f ansible/files/04-01/04-07-nodejs-demo.yaml kubectl describe ing nodejs-demo -n default # --- 情况 A:仍是 04-01 的 Ingress(无 rules.host,path=/node)--- @@ -61,11 +69,17 @@ curl -s -o /dev/null -w "%{http_code}\n" --max-time 3 \ ## 失败排查 - **404**:路径/host 与规则不一致;Traefik 未加载该 Ingress(namespace、ingressClass)。 -- **502**:Service 无 Endpoints(见 `04-04`、`04-06`)。 +- **502**:Service 无 Endpoints(见 `04-02` 端口对齐、`04-05` 探针与 Endpoints)。 - `06-01-k3s-networkpolicy-故障排查.md` - 集群级 Traefik:`03-01`、`03-02` ## 相关文档 - [`04-12-nodejs-TLS与证书.md`](04-12-nodejs-TLS与证书.md) -- [`04-04-nodejs-端口与Service.md`](04-04-nodejs-端口与Service.md) +- [`04-02-nodejs-端口与Service.md`](04-02-nodejs-端口与Service.md) + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/04-05-nodejs-资源请求与限制.md b/docs/04-08-nodejs-资源请求与限制.md similarity index 61% rename from docs/04-05-nodejs-资源请求与限制.md rename to docs/04-08-nodejs-资源请求与限制.md index d120d6d..a1f52ba 100644 --- a/docs/04-05-nodejs-资源请求与限制.md +++ b/docs/04-08-nodejs-资源请求与限制.md @@ -1,15 +1,23 @@ -# 04-05-nodejs-资源请求与限制 +# 04-08-nodejs-资源请求与限制 > 为 `nodejs-demo` 配置 `resources.requests` / `resources.limits`,便于调度与避免单个 Pod 占满节点;为后续 **HPA**([`04-13-nodejs-HPA.md`](04-13-nodejs-HPA.md))提供基础。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-08` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已部署 `nodejs-demo`(`04-01`)。 ## 清单路径(唯一真源) -| 本篇完整清单 | [`ansible/files/04-01-nodejs-demo/04-05-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-05-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01-nodejs-demo/04-05-nodejs-demo.yaml` | +| 本篇完整清单 | [`ansible/files/04-01/04-08-nodejs-demo.yaml`](../ansible/files/04-01/04-08-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-01/04-08-nodejs-demo.yaml` | ## 场景说明(白话) @@ -17,16 +25,16 @@ - **limits(上限)**:内存超过上限,容器可能被 **OOM 杀掉**;CPU 超过上限会被 **限流**,变慢但不一定重启。 - **Node 堆**:还可以用 `NODE_OPTIONS=--max-old-space-size=...` 限制 V8 堆,和容器内存 limit 配合用(见 `04-03`)。 -### 相对 `04-04` 的变更(原文 → 新文) +### 相对 `04-07` 的变更(原文 → 新文) -| 位置 | 原文(`04-04`) | 新文(`04-05`) | +| 位置 | 原文(`04-07`) | 新文(`04-08`) | |------|-----------------|-----------------| | `containers[].resources` | (无) | `requests` cpu 50m / memory 64Mi;`limits` cpu 500m / memory 256Mi | ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-05-nodejs-demo.yaml +kubectl apply -f ansible/files/04-01/04-08-nodejs-demo.yaml kubectl describe pod -l app=nodejs-demo -n default | grep -A5 "Limits\|Requests" ``` @@ -49,4 +57,10 @@ kubectl top pod -l app=nodejs-demo -n default ## 相关文档 - [`04-13-nodejs-HPA.md`](04-13-nodejs-HPA.md) -- [`04-06-nodejs-探针与健康检查.md`](04-06-nodejs-探针与健康检查.md) +- 
[`04-05-nodejs-探针与健康检查.md`](04-05-nodejs-探针与健康检查.md) + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n <ns> describe ...`、`kubectl -n <ns> logs ... --tail=200`。 diff --git a/docs/04-07-nodejs-调度与亲和.md b/docs/04-09-nodejs-调度与亲和.md similarity index 67% rename from docs/04-07-nodejs-调度与亲和.md rename to docs/04-09-nodejs-调度与亲和.md index 550806f..1b7ac9f 100644 --- a/docs/04-07-nodejs-调度与亲和.md +++ b/docs/04-09-nodejs-调度与亲和.md @@ -1,15 +1,23 @@ -# 04-07-nodejs-调度与亲和 +# 04-09-nodejs-调度与亲和 > 控制 `nodejs-demo` **落在哪些节点**:`nodeSelector`、`affinity`、`tolerations`。常用于与 Traefik、存储或合规区域对齐。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-09` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已部署 `nodejs-demo`(`04-01`);集群至少一个节点带可区分 **label**(例如 `kubectl get nodes --show-labels`)。 ## 清单路径(唯一真源) -| 本篇完整清单 | [`ansible/files/04-01-nodejs-demo/04-07-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-07-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01-nodejs-demo/04-07-nodejs-demo.yaml` | +| 本篇完整清单 | [`ansible/files/04-01/04-09-nodejs-demo.yaml`](../ansible/files/04-01/04-09-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-01/04-09-nodejs-demo.yaml` | 清单中默认 `nodeSelector: kubernetes.io/hostname: ylc62`,请改为本集群节点名。 @@ -19,9 +27,9 @@ - **规则更复杂**(尽量分散、尽量和某类 Pod 同机架等):用 **affinity(亲和)**。 - **节点有「污点」**:像「专属机器」,Pod 必须配置 **容忍污点(tolerations)** 才能调度上去。 -### 相对 `04-06` 的变更(原文 → 新文) +### 相对 `04-08` 的变更(原文 → 新文) -| 位置 | 原文(`04-06`) | 新文(`04-07`) | +| 位置 | 原文(`04-08`) | 新文(`04-09`) | |------|-----------------|-----------------| | `template.spec.nodeSelector` | (无) | `kubernetes.io/hostname: ylc62`(请按环境修改) | @@ -38,7 +46,7 @@ ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-07-nodejs-demo.yaml +kubectl apply -f ansible/files/04-01/04-09-nodejs-demo.yaml kubectl get pod
-l app=nodejs-demo -n default -o wide ``` @@ -54,3 +62,9 @@ kubectl get pod -l app=nodejs-demo -n default -o wide - `01-02-k3s-工作节点.md` - `02-00-nginx-系列说明.md`(调度思路通用) - `06-01-k3s-networkpolicy-故障排查.md` + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n <ns> describe ...`、`kubectl -n <ns> logs ... --tail=200`。 diff --git a/docs/04-08-nodejs-安全上下文.md b/docs/04-10-nodejs-安全上下文.md similarity index 61% rename from docs/04-08-nodejs-安全上下文.md rename to docs/04-10-nodejs-安全上下文.md index 1ec764a..bf70e75 100644 --- a/docs/04-08-nodejs-安全上下文.md +++ b/docs/04-10-nodejs-安全上下文.md @@ -1,16 +1,24 @@ -# 04-08-nodejs-安全上下文 +# 04-10-nodejs-安全上下文 > 为 `nodejs-demo` 配置 **Pod / 容器级 `securityContext`**:非 root、只读根文件系统、降权能力等。**以集群 PSP/约束与实际镜像为准**,逐步收紧。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-10` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已部署 `nodejs-demo`(`04-01`)。 -- 注意:`node:18-alpine` 默认用户可能为 root;非 root 运行需镜像内已有可写目录或使用 `emptyDir` 挂载(见 [`04-09-nodejs-存储与卷.md`](04-09-nodejs-存储与卷.md))。 +- 注意:`node:18-alpine` 默认用户可能为 root;非 root 运行需镜像内已有可写目录或使用 `emptyDir` 挂载(见 [`04-11-nodejs-存储与卷.md`](04-11-nodejs-存储与卷.md))。 ## 清单路径(唯一真源) -| 本篇完整清单 | [`ansible/files/04-01-nodejs-demo/04-08-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-08-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01-nodejs-demo/04-08-nodejs-demo.yaml` | +| 本篇完整清单 | [`ansible/files/04-01/04-10-nodejs-demo.yaml`](../ansible/files/04-01/04-10-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-01/04-10-nodejs-demo.yaml` | ## 场景说明(白话) @@ -18,9 +26,9 @@ - **只读根盘**:系统目录不让写;应用要写临时文件,必须单独挂 **可写卷**(示例用 `/tmp` 的 `emptyDir`)。 - **渐进收紧**:先在一个测试命名空间试,再推广;强策略集群可能被准入控制器拦截。 -### 相对 `04-07` 的变更(原文 → 新文) +### 相对 `04-09` 的变更(原文 → 新文) -| 位置 | 原文(`04-07`) | 新文(`04-08`) | +| 位置 | 原文(`04-09`) | 新文(`04-10`) |
|------|-----------------|-----------------| | `template.spec.securityContext` | (无) | `fsGroup: 1000` | | `containers[].securityContext` | (无) | `runAsNonRoot` / `runAsUser: 1000` / `readOnlyRootFilesystem: true` 等 | @@ -31,7 +39,7 @@ ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-08-nodejs-demo.yaml +kubectl apply -f ansible/files/04-01/04-10-nodejs-demo.yaml kubectl get pod -l app=nodejs-demo -n default kubectl exec deploy/nodejs-demo -n default -- id ``` @@ -47,5 +55,11 @@ kubectl exec deploy/nodejs-demo -n default -- id ## 相关文档 -- [`04-09-nodejs-存储与卷.md`](04-09-nodejs-存储与卷.md) -- [`04-05-nodejs-资源请求与限制.md`](04-05-nodejs-资源请求与限制.md) +- [`04-11-nodejs-存储与卷.md`](04-11-nodejs-存储与卷.md) +- [`04-08-nodejs-资源请求与限制.md`](04-08-nodejs-资源请求与限制.md) + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/04-09-nodejs-存储与卷.md b/docs/04-11-nodejs-存储与卷.md similarity index 59% rename from docs/04-09-nodejs-存储与卷.md rename to docs/04-11-nodejs-存储与卷.md index 01a64d6..dfb2544 100644 --- a/docs/04-09-nodejs-存储与卷.md +++ b/docs/04-11-nodejs-存储与卷.md @@ -1,7 +1,15 @@ -# 04-09-nodejs-存储与卷 +# 04-11-nodejs-存储与卷 > 为 Node.js 工作负载挂载 **临时卷** 或 **持久卷(PVC)**:日志、上传目录、`/tmp`、只读配置目录等。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-11` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已部署 `nodejs-demo`(`04-01`)。 @@ -9,8 +17,8 @@ ## 清单路径(唯一真源) -| 本篇完整清单(含 PVC + `/data` 挂载,默认 `storageClassName: local-path`) | [`ansible/files/04-01-nodejs-demo/04-09-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-09-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01-nodejs-demo/04-09-nodejs-demo.yaml` | +| 本篇完整清单(含 PVC + `/data` 挂载,默认 `storageClassName: local-path`) | 
[`ansible/files/04-01/04-11-nodejs-demo.yaml`](../ansible/files/04-01/04-11-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-01/04-11-nodejs-demo.yaml` | emptyDir、仅 ConfigMap 卷等变体可在该清单基础上自行删减 PVC 与 `volumeMounts` 做实验。 @@ -20,20 +28,20 @@ emptyDir、仅 ConfigMap 卷等变体可在该清单基础上自行删减 PVC - **PVC**:数据由存储驱动落到盘里,Pod 重建还可能挂上同一块盘(取决于存储类型与访问模式)。 - **ConfigMap 挂成文件**:适合「配置文件」形态,只读挂载很常见。 -### 相对 `04-08` 的变更(原文 → 新文) +### 相对 `04-10` 的变更(原文 → 新文) -| 位置 | 原文(`04-08`) | 新文(`04-09`) | +| 位置 | 原文(`04-10`) | 新文(`04-11`) | |------|-----------------|-----------------| | 资源列表 | 无 PVC | 新增 `PersistentVolumeClaim` `nodejs-demo-data` | | `volumeMounts` | 仅 `/tmp` | 增加 `/data` | | `volumes` | 仅 `tmp` emptyDir | 增加 `persistentVolumeClaim` | -**emptyDir 缓存卷**、**ConfigMap 只读挂载** 的片段写法见 Kubernetes 文档;可在 [`04-09-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-09-nodejs-demo.yaml) 上自行合并实验。 +**emptyDir 缓存卷**、**ConfigMap 只读挂载** 的片段写法见 Kubernetes 文档;可在 [`04-11-nodejs-demo.yaml`](../ansible/files/04-01/04-11-nodejs-demo.yaml) 上自行合并实验。 ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-09-nodejs-demo.yaml +kubectl apply -f ansible/files/04-01/04-11-nodejs-demo.yaml kubectl get pvc -n default kubectl exec deploy/nodejs-demo -n default -- df -h /data ``` @@ -54,5 +62,11 @@ kubectl delete pvc nodejs-demo-data -n default ## 相关文档 -- [`04-03-nodejs-环境变量与配置注入.md`](04-03-nodejs-环境变量与配置注入.md) -- [`04-08-nodejs-安全上下文.md`](04-08-nodejs-安全上下文.md) +- [`04-04-nodejs-环境变量与配置注入.md`](04-04-nodejs-环境变量与配置注入.md) +- [`04-10-nodejs-安全上下文.md`](04-10-nodejs-安全上下文.md) + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/04-12-nodejs-TLS与证书.md b/docs/04-12-nodejs-TLS与证书.md index 96e9660..4a7e403 100644 --- a/docs/04-12-nodejs-TLS与证书.md +++ b/docs/04-12-nodejs-TLS与证书.md @@ -2,6 +2,14 @@ > 为 `nodejs-demo` 的 **Ingress** 启用 **HTTPS**:`spec.tls` + 证书 **Secret**。集群侧 Traefik **ACME 自动证书** 以 [`03-02-k3s-traefik-acme.md`](03-02-k3s-traefik-acme.md) 为主;本篇侧重 **应用 Ingress 如何声明 TLS** 与验证。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-12` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成 `03-02`(推荐):Traefik 已配置 `websecure` 与证书解析器;或你已手动/其他方式准备好 TLS Secret。 @@ -9,8 +17,8 @@ ## 清单路径(唯一真源) -| 本篇完整清单(Ingress 已切 **websecure** + `spec.tls`;**不含** Secret 内容) | [`ansible/files/04-01-nodejs-demo/04-12-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-12-nodejs-demo.yaml) | -| 应用 | 先创建 TLS Secret(见下),再 `kubectl apply -f ansible/files/04-01-nodejs-demo/04-12-nodejs-demo.yaml` | +| 本篇完整清单(Ingress 已切 **websecure** + `spec.tls`;**不含** Secret 内容) | [`ansible/files/04-01/04-12-nodejs-demo.yaml`](../ansible/files/04-01/04-12-nodejs-demo.yaml) | +| 应用 | 先创建 TLS Secret(见下),再 `kubectl apply -f ansible/files/04-01/04-12-nodejs-demo.yaml` | **证书 Secret**:使用命令创建(不提交私钥到 Git): @@ -59,5 +67,11 @@ curl -vk --max-time 5 https://app.example.local/api/ ## 相关文档 -- [`04-10-nodejs-Ingress与Traefik.md`](04-10-nodejs-Ingress与Traefik.md) +- [`04-07-nodejs-Ingress与Traefik.md`](04-07-nodejs-Ingress与Traefik.md) - [`04-01-k3s-nodejs-高级部署.md`](04-01-k3s-nodejs-高级部署.md) + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/04-13-nodejs-HPA.md b/docs/04-13-nodejs-HPA.md index 35b7047..67ca12c 100644 --- a/docs/04-13-nodejs-HPA.md +++ b/docs/04-13-nodejs-HPA.md @@ -2,16 +2,24 @@ > 为 `nodejs-demo` 配置 **HorizontalPodAutoscaler**,按 CPU/内存等指标在 `minReplicas`~`maxReplicas` 间自动伸缩。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-13` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 集群已安装 **metrics-server**(K3s 常默认启用;`kubectl top nodes` 可用即基本就绪)。 -- Deployment 已配置 **`resources.requests`**(CPU 指标 HPA 依赖 requests),见 [`04-05-nodejs-资源请求与限制.md`](04-05-nodejs-资源请求与限制.md)。 -- 建议已配置 **readinessProbe**([`04-06-nodejs-探针与健康检查.md`](04-06-nodejs-探针与健康检查.md)),避免扩容出未就绪 Pod。 +- Deployment 已配置 **`resources.requests`**(CPU 指标 HPA 依赖 requests),见 [`04-08-nodejs-资源请求与限制.md`](04-08-nodejs-资源请求与限制.md)。 +- 建议已配置 **readinessProbe**([`04-05-nodejs-探针与健康检查.md`](04-05-nodejs-探针与健康检查.md)),避免扩容出未就绪 Pod。 ## 清单路径(唯一真源) -| 本篇完整清单(含 Deployment/Service/Ingress/PVC/CM + **HPA**) | [`ansible/files/04-01-nodejs-demo/04-13-nodejs-demo.yaml`](../ansible/files/04-01-nodejs-demo/04-13-nodejs-demo.yaml) | -| 应用 | `kubectl apply -f ansible/files/04-01-nodejs-demo/04-13-nodejs-demo.yaml`(若用 `04-12`,需先有 TLS Secret) | +| 本篇完整清单(含 Deployment/Service/Ingress/PVC/CM + **HPA**) | [`ansible/files/04-01/04-13-nodejs-demo.yaml`](../ansible/files/04-01/04-13-nodejs-demo.yaml) | +| 应用 | `kubectl apply -f ansible/files/04-01/04-13-nodejs-demo.yaml`(若用 `04-12`,需先有 TLS Secret) | ## 场景说明(白话) @@ -27,7 +35,7 @@ ## 部署与验证 ```bash -kubectl apply -f ansible/files/04-01-nodejs-demo/04-13-nodejs-demo.yaml +kubectl apply -f ansible/files/04-01/04-13-nodejs-demo.yaml kubectl get hpa -n default kubectl describe hpa nodejs-demo -n default ``` @@ -49,5 +57,11 @@ kubectl delete hpa nodejs-demo -n default ## 相关文档 -- [`04-11-nodejs-副本与滚动发布.md`](04-11-nodejs-副本与滚动发布.md) +- [`04-06-nodejs-副本与滚动发布.md`](04-06-nodejs-副本与滚动发布.md) - 
[`05-05-prometheus与grafana.md`](05-05-prometheus与grafana.md)(自定义 metrics 进阶,本文不展开) + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/04-14-nodejs-GitOps与CI流水线.md b/docs/04-14-nodejs-GitOps与CI流水线.md index a2131ee..d68ba56 100644 --- a/docs/04-14-nodejs-GitOps与CI流水线.md +++ b/docs/04-14-nodejs-GitOps与CI流水线.md @@ -2,6 +2,14 @@ > 从 **Node.js 应用仓库** 视角串联:**持续集成(CI)** 构建镜像并推送仓库,**持续交付** 通过 **GitOps** 或流水线步骤把声明式清单下发到 K3s。细节以仓库内 GitLab/GitOps 文档为准,本篇给 **最小闭环与引用**。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 04-14` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 集群可拉取镜像(私有仓库需 `imagePullSecrets`,见 `04-02` 相关说明)。 @@ -10,7 +18,7 @@ ## 清单与仓库(唯一真源) - **本文无独立流水线 YAML**(GitLab CI、Argo CD、Flux 随版本变化大);流程见 **`05-04`**、**`03-09`**。 -- **应用清单真源**:[`ansible/files/04-01-nodejs-demo/`](../ansible/files/04-01-nodejs-demo/)(例如 `04-01-nodejs-demo.yaml`)。将 **该目录或单文件** 纳入 Git,由 CI 改 `image:` tag 或由 GitOps 同步到集群。 +- **应用清单真源**:[`ansible/files/04-01/`](../ansible/files/04-01/)(例如 `04-14-nodejs-demo.yaml`)。将 **该目录或单文件** 纳入 Git,由 CI 改 `image:` tag 或由 GitOps 同步到集群。 ## 场景说明(白话) @@ -70,4 +78,10 @@ GitLab CI 示例结构与 Runner 注册见 **`05-04`**。 - [`05-03-k3s-安装gitlab-含runner.md`](05-03-k3s-安装gitlab-含runner.md) - [`05-04-k3s-配置gitlab-cicd.md`](05-04-k3s-配置gitlab-cicd.md) - [`03-09-k3s-gitops-集群配置管理.md`](03-09-k3s-gitops-集群配置管理.md) -- [`04-02-nodejs-镜像与运行命令.md`](04-02-nodejs-镜像与运行命令.md) +- [`04-03-nodejs-镜像与运行命令.md`](04-03-nodejs-镜像与运行命令.md) + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/05-00-常用应用部署-系列说明.md b/docs/05-00-常用应用部署-系列说明.md new file mode 100644 index 0000000..fd35262 --- /dev/null +++ b/docs/05-00-常用应用部署-系列说明.md @@ -0,0 +1,28 @@ +# 05-00 常用应用部署(系列说明) + +> 本系列覆盖:首页面板、GitLab、监控、openlist、openclaw 等常见应用在 K3s 的部署实践。 + +## TL;DR + +- **本页定位**:仅系列导航,不参与 `verify.sh run-all/full` +- **子篇执行入口**:按下表执行 `./scripts/verify.sh run ` + +## 范围与非目标 + +- 本页是 **05 系列入口/导航页**(`YY=00`),不要求具备独立执行器,且不参与 `verify.sh run-all/full`。 +- `YY>0` 的分项必须包含可执行物(YAML 路径或命令块)。 + +## 05 系列索引 + +| doc_id | 主题 | 子篇执行入口 | +|-------:|------|------------| +| 05-01 | Homer 首页面板 | `./scripts/verify.sh run 05-01` | +| 05-02 | OneNav 首页面板 | `./scripts/verify.sh run 05-02` | +| 05-03 | GitLab(含 Runner) | `./scripts/verify.sh run 05-03` | +| 05-04 | GitLab CI/CD 配置 | `./scripts/verify.sh run 05-04` | +| 05-05 | Prometheus + Grafana | `./scripts/verify.sh run 05-05` | +| 05-06 | openlist 挂载网盘与自动备份 | `./scripts/verify.sh run 05-06` | +| 05-07 | openclaw 应用部署 | `./scripts/verify.sh run 05-07` | +| 05-08 | openclaw k3s 实验部署 | `./scripts/verify.sh run 05-08` | +| 05-09 | openclaw web 小游戏平台 | `./scripts/verify.sh run 05-09` | + diff --git a/docs/05-01-k3s-部署homer首页面板.md b/docs/05-01-k3s-部署homer首页面板.md index 78e7c2b..53e48b9 100644 --- a/docs/05-01-k3s-部署homer首页面板.md +++ b/docs/05-01-k3s-部署homer首页面板.md @@ -2,6 +2,14 @@ > 在 K3s 中部署 Homer,作为家庭实验室的统一导航页。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 05-01` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + --- ## Homer 相对「纯书签」的优势 @@ -24,10 +32,10 @@ ```bash kubectl create ns homer -kubectl apply -f ansible/files/05-01-homer/homer.yaml +kubectl apply -f ansible/files/05-01/homer.yaml ``` -**唯一真源**:[`ansible/files/05-01-homer/homer.yaml`](../ansible/files/05-01-homer/homer.yaml)(ConfigMap + Deployment + Service + Ingress)。 +**唯一真源**:[`ansible/files/05-01/homer.yaml`](../ansible/files/05-01/homer.yaml)(ConfigMap + Deployment + Service + Ingress)。 ### 
自定义导航(config.yml) @@ -37,7 +45,7 @@ kubectl apply -f ansible/files/05-01-homer/homer.yaml - 修改后重新应用并滚动 Pod 生效: ```bash -kubectl apply -f ansible/files/05-01-homer/homer.yaml +kubectl apply -f ansible/files/05-01/homer.yaml kubectl -n homer rollout restart deploy/homer ``` @@ -49,28 +57,11 @@ kubectl -n homer rollout restart deploy/homer **示例(摘自 Homer 文档形态,按需改 `url`):** -```yaml -- name: "System Metrics" - type: "Glances" - icon: "fa-solid fa-heart-pulse" - url: "https://glances.example.com" # 须指向 Glances 提供的 Web/API 基址 - stats: [cpu, mem] # 可选:load, cpu, mem, swap(均来自下方「目标」) -``` +示例真源:[`ansible/files/05-01/homer-glances-item.example.yaml`](../ansible/files/05-01/homer-glances-item.example.yaml) **在目标机用 Docker 跑 Glances(Web 模式)示例**(Homer 的 `url` 需指向该服务可访问地址,默认端口以镜像说明为准): -```yaml -services: - glances: - image: nicolargo/glances:latest - container_name: glances - environment: - - TZ=Asia/Shanghai - - GLANCES_OPT=-w - ports: - - "61208:61208" - restart: unless-stopped -``` +示例真源:[`ansible/files/05-01/glances-docker-compose.example.yaml`](../ansible/files/05-01/glances-docker-compose.example.yaml) **要点:** @@ -94,3 +85,9 @@ curl -I --max-time 3 http://192.168.2.61/ ## 下一步 - `05-02-onenav首页面板.md` + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/05-02-onenav首页面板.md b/docs/05-02-onenav首页面板.md index 935e2d3..9229598 100644 --- a/docs/05-02-onenav首页面板.md +++ b/docs/05-02-onenav首页面板.md @@ -2,6 +2,14 @@ > OneNav 运行在集群外(如 armv7 主机),通过 K3s Traefik 做静态转发接入。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 05-02` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + --- ## 在 armv7 部署 OneNav @@ -19,12 +27,12 @@ docker run -d --name onenav \ ## 在 K3s 做静态转发 -**唯一真源**:[`ansible/files/05-02-onenav/onenav-proxy.yaml`](../ansible/files/05-02-onenav/onenav-proxy.yaml)(修改 `Endpoints` IP 与 `Ingress` host)。 +**唯一真源**:[`ansible/files/05-02/onenav-proxy.yaml`](../ansible/files/05-02/onenav-proxy.yaml)(修改 `Endpoints` IP 与 `Ingress` host)。 应用方式: ```bash -kubectl apply -f ansible/files/05-02-onenav/onenav-proxy.yaml +kubectl apply -f ansible/files/05-02/onenav-proxy.yaml ``` --- @@ -43,3 +51,8 @@ curl -I --max-time 3 http://192.168.2.61/ - `05-01-k3s-部署homer首页面板.md` - `03-04-k3s-cloudflare-tunnel-配置接入.md` +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/05-03-k3s-安装gitlab-含runner.md b/docs/05-03-k3s-安装gitlab-含runner.md index db70b5f..04288d3 100644 --- a/docs/05-03-k3s-安装gitlab-含runner.md +++ b/docs/05-03-k3s-安装gitlab-含runner.md @@ -2,6 +2,14 @@ > 通过 Helm 在 K3s 部署 GitLab,并接入 Runner 执行 CI 任务。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 05-03` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + --- ## 前置条件 @@ -131,7 +139,7 @@ sudo gitlab-runner register \ --non-interactive ``` -在 `.gitlab-ci.yml` 中即可按需指定不同架构运行 Job,示例见 [`ansible/files/05-03-gitlab-runner/gitlab-ci-runner-tags.example.yml`](../ansible/files/05-03-gitlab-runner/gitlab-ci-runner-tags.example.yml)。 +在 `.gitlab-ci.yml` 中即可按需指定不同架构运行 Job,示例见 [`ansible/files/05-03/gitlab-ci-runner-tags.example.yml`](../ansible/files/05-03/gitlab-ci-runner-tags.example.yml)。 --- @@ -154,3 +162,8 @@ kubectl -n gitlab get svc - `05-04-k3s-配置gitlab-cicd.md` - `03-04-k3s-cloudflare-tunnel-配置接入.md` +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/05-04-k3s-配置gitlab-cicd.md b/docs/05-04-k3s-配置gitlab-cicd.md index f1da222..152838a 100644 --- a/docs/05-04-k3s-配置gitlab-cicd.md +++ b/docs/05-04-k3s-配置gitlab-cicd.md @@ -2,6 +2,14 @@ > 把 manifests 与 values 纳入 GitLab 仓库,通过流水线自动部署到 K3s。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 05-04` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + --- ## 建议仓库结构 @@ -33,8 +41,8 @@ homelab-config/ **唯一真源(示例流水线)**: -- 最小:[`ansible/files/05-04-gitlab-cicd/gitlab-ci-minimal.example.yml`](../ansible/files/05-04-gitlab-cicd/gitlab-ci-minimal.example.yml) -- 多架构 deploy:[`ansible/files/05-04-gitlab-cicd/gitlab-ci-multi-arch-deploy.example.yml`](../ansible/files/05-04-gitlab-cicd/gitlab-ci-multi-arch-deploy.example.yml) +- 最小:[`ansible/files/05-04/gitlab-ci-minimal.example.yml`](../ansible/files/05-04/gitlab-ci-minimal.example.yml) +- 多架构 deploy:[`ansible/files/05-04/gitlab-ci-multi-arch-deploy.example.yml`](../ansible/files/05-04/gitlab-ci-multi-arch-deploy.example.yml) 复制到仓库根为 `.gitlab-ci.yml`(或 `include` 引用),并配好 Runner 与 `KUBE_CONFIG_CONTENT` 等变量。 @@ -52,3 +60,9 @@ homelab-config/ - `05-05-prometheus与grafana.md` - `06-02-运维小结.md` + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/05-05-prometheus与grafana.md b/docs/05-05-prometheus与grafana.md index 2455d19..d24a7dd 100644 --- a/docs/05-05-prometheus与grafana.md +++ b/docs/05-05-prometheus与grafana.md @@ -2,6 +2,14 @@ > 使用 `kube-prometheus-stack` 建立基础可观测能力。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 05-05` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 集群已正常运行 @@ -34,3 +42,9 @@ kubectl -n monitoring get svc ## 下一步 - `06-02-运维小结.md` + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/05-06-openlist挂载网盘与自动备份.md b/docs/05-06-openlist挂载网盘与自动备份.md index 82cc9a2..8238315 100644 --- a/docs/05-06-openlist挂载网盘与自动备份.md +++ b/docs/05-06-openlist挂载网盘与自动备份.md @@ -2,6 +2,14 @@ > 使用 openlist 聚合网盘,再通过定时任务实现自动备份。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 05-06` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - openlist 已部署并可访问 @@ -13,12 +21,12 @@ 2. 选择备份方式(`rclone` / cron 脚本 / GitLab CI) 3. 
设置定时执行策略 -**唯一真源**:[`ansible/files/05-06-openlist/openlist-backup-cronjob.yaml`](../ansible/files/05-06-openlist/openlist-backup-cronjob.yaml)(CronJob 示例;替换镜像与 PVC)。 +**唯一真源**:[`ansible/files/05-06/openlist-backup-cronjob.yaml`](../ansible/files/05-06/openlist-backup-cronjob.yaml)(CronJob 示例;替换镜像与 PVC)。 应用方式: ```bash -kubectl apply -f ansible/files/05-06-openlist/openlist-backup-cronjob.yaml +kubectl apply -f ansible/files/05-06/openlist-backup-cronjob.yaml ``` ## 验证命令 @@ -33,3 +41,9 @@ kubectl apply -f ansible/files/05-06-openlist/openlist-backup-cronjob.yaml ## 下一步 - `06-02-运维小结.md` + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/05-07-openclaw应用部署.md b/docs/05-07-openclaw应用部署.md index 936f647..9d8a699 100644 --- a/docs/05-07-openclaw应用部署.md +++ b/docs/05-07-openclaw应用部署.md @@ -3,6 +3,14 @@ > 本文采用 **Docker 独立部署 + K3s Traefik 反向代理** 的方式集成 OpenClaw: > OpenClaw Gateway 本身运行在一台 x86 主机上,K3s 只负责入口统一转发,并不直接在集群内以 Pod 形式运行 OpenClaw。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 05-07` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成基础集群安装:`01-01`、`01-02` @@ -43,12 +51,12 @@ docker compose run --rm openclaw-cli dashboard --no-open ## 在 K3s 中做静态转发 -**唯一真源**:[`ansible/files/05-07-openclaw/openclaw-proxy.yaml`](../ansible/files/05-07-openclaw/openclaw-proxy.yaml)(按实际 IP、端口、`host` 修改)。 +**唯一真源**:[`ansible/files/05-07/openclaw-proxy.yaml`](../ansible/files/05-07/openclaw-proxy.yaml)(按实际 IP、端口、`host` 修改)。 ## 部署命令 ```bash -kubectl apply -f ansible/files/05-07-openclaw/openclaw-proxy.yaml +kubectl apply -f ansible/files/05-07/openclaw-proxy.yaml ``` ## 验证命令 @@ -68,4 +76,8 @@ curl -I --max-time 3 http://openclaw.example.com/ - 如需为 OpenClaw 容器挂载额外挂载目录或持久化 `/home/node`,请参考官方 Docker 文档中的 `OPENCLAW_EXTRA_MOUNTS` 与 
`OPENCLAW_HOME_VOLUME` 说明。 - 返回 `00-00-构建总览.md`,按导航继续阅读其它应用部署文档。 +## 排障 +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/05-08-openclaw-k3s-实验部署.md b/docs/05-08-openclaw-k3s-实验部署.md index 42aae27..d129c0c 100644 --- a/docs/05-08-openclaw-k3s-实验部署.md +++ b/docs/05-08-openclaw-k3s-实验部署.md @@ -3,6 +3,14 @@ > **实验性文档**:尝试将 OpenClaw Gateway 直接以 Deployment/Service/Ingress 的形式运行在 K3s 中。 > 官方推荐的方式仍是 **Docker / Docker Compose 部署**(参考 `05-07-openclaw应用部署.md` 与官方文档),本篇仅供有 K8s 经验的读者实验与参考。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 05-08` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成基础集群安装:`01-01`、`01-02` @@ -42,14 +50,14 @@ docker push registry.local/openclaw:local 下面是一个**高度简化的实验性 Deployment/Service/Ingress 示例**,其目标只是让你能在集群内/通过 Traefik 访问 OpenClaw Gateway 控制界面。 -**唯一真源**:[`ansible/files/05-08-openclaw/openclaw-k3s-experimental.yaml`](../ansible/files/05-08-openclaw/openclaw-k3s-experimental.yaml)(实验用;替换镜像与域名)。 +**唯一真源**:[`ansible/files/05-08/openclaw-k3s-experimental.yaml`](../ansible/files/05-08/openclaw-k3s-experimental.yaml)(实验用;替换镜像与域名)。 > 说明:示例使用 `emptyDir`;若要持久化请改为 PVC/hostPath。 应用部署: ```bash -kubectl apply -f ansible/files/05-08-openclaw/openclaw-k3s-experimental.yaml +kubectl apply -f ansible/files/05-08/openclaw-k3s-experimental.yaml ``` --- @@ -97,6 +105,11 @@ curl -I --max-time 5 http://openclaw-k3s.example.com/ ## 下一步 -- 若你验证了本实验方案在自己的环境中稳定可用,可以在 `00-02-验证矩阵.md` 中为本篇补充状态与备注(注明镜像版本、节点架构与日期)。 +- 若你验证了本实验方案在自己的环境中稳定可用,建议在本篇文档中补充验证备注(注明镜像版本、节点架构与日期)。 - 返回 `00-00-构建总览.md`,按导航继续阅读其它运维/排障文档。 +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/05-09-openclaw-web-小游戏网页平台.md b/docs/05-09-openclaw-web-小游戏网页平台.md index e3ff9f6..03ef74c 100644 --- a/docs/05-09-openclaw-web-小游戏网页平台.md +++ b/docs/05-09-openclaw-web-小游戏网页平台.md @@ -2,6 +2,14 @@ > 在 K3s 中部署一个简单的 OpenClaw Web 前端(示例 Deployment/Service/Ingress),用于演示“静态站点/前端应用通过 Traefik 暴露”这一类场景。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 05-09` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成基础集群安装:`01-01`、`01-02` @@ -12,14 +20,14 @@ | 项 | 路径 | |----|------| -| 本篇清单 | [`ansible/files/05-09-openclaw-web-小游戏网页平台/openclaw-web.yml`](../ansible/files/05-09-openclaw-web-小游戏网页平台/openclaw-web.yml) | -| 应用 | `kubectl apply -f ansible/files/05-09-openclaw-web-小游戏网页平台/openclaw-web.yml` | -| 删除 | `kubectl delete -f ansible/files/05-09-openclaw-web-小游戏网页平台/openclaw-web.yml` | +| 本篇清单 | [`ansible/files/05-09/openclaw-web.yml`](../ansible/files/05-09/openclaw-web.yml) | +| 应用 | `kubectl apply -f ansible/files/05-09/openclaw-web.yml` | +| 删除 | `kubectl delete -f ansible/files/05-09/openclaw-web.yml` | ## 部署与验证 ```bash -kubectl apply -f ansible/files/05-09-openclaw-web-小游戏网页平台/openclaw-web.yml +kubectl apply -f ansible/files/05-09/openclaw-web.yml kubectl get deploy,svc,ing -n default | grep -i openclaw-web curl -I --max-time 5 http://openclaw.example.com/ ``` @@ -35,3 +43,8 @@ curl -I --max-time 5 http://openclaw.example.com/ - **502**:Service 无 Endpoints(Pod 未 Ready 或 selector 不匹配)。 - **ImagePullBackOff**:镜像仓库不可达或未配置鉴权。 +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/06-00-排障与运维-系列说明.md b/docs/06-00-排障与运维-系列说明.md new file mode 100644 index 0000000..6fe6557 --- /dev/null +++ b/docs/06-00-排障与运维-系列说明.md @@ -0,0 +1,22 @@ +# 06-00 排障与运维(系列说明) + +> 本系列覆盖:连通性/策略排障、运维总结、备份与恢复等。 + +## TL;DR + +- **本页定位**:仅系列导航,不参与 `verify.sh run-all/full` +- **子篇执行入口**:按下表执行 `./scripts/verify.sh run <doc_id>` + +## 范围与非目标 + +- 本页是 **06 系列入口/导航页**(`YY=00`),不要求具备独立执行器,且不参与 `verify.sh run-all/full`。 +- `YY>0` 的分项必须包含可执行物(YAML 路径或命令块)。 + +## 06 系列索引 + +| doc_id | 主题 | 子篇执行入口 | +|-------:|------|------------| +| 06-01 | NetworkPolicy 使用与故障排查 | `./scripts/verify.sh run 06-01` | +| 06-02 | 运维小结 | `./scripts/verify.sh run 06-02` | +| 06-03 | 自动备份与恢复(openlist WebDAV) | `./scripts/verify.sh run 06-03` | + diff --git a/docs/06-01-k3s-networkpolicy-故障排查.md b/docs/06-01-k3s-networkpolicy-故障排查.md index 26292a8..5f1161d 100644 --- a/docs/06-01-k3s-networkpolicy-故障排查.md +++ b/docs/06-01-k3s-networkpolicy-故障排查.md @@ -3,6 +3,14 @@ > 本文只负责 **网络策略与连通性排障**。 > 若你要做 Traefik 部署、ServiceLB 池配置,请看 `01-02-k3s-工作节点.md`。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 06-01` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已完成 `01-02-k3s-工作节点.md` @@ -88,10 +96,16 @@ sudo firewall-cmd --get-active-zones ## 关联文档 - `01-02-k3s-工作节点.md` -- `04-03-k3s-nginx-demo.md` +- `02-05-nginx-验证矩阵-一键部署.md`(入口与路由基线) - `04-01-k3s-nodejs-高级部署.md` - `scripts/README.md` ## 下一步 - 返回 00-00-构建总览.md,按导航继续。 + +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n <ns> describe ...`、`kubectl -n <ns> logs ... 
--tail=200`。 diff --git a/docs/06-02-运维小结.md b/docs/06-02-运维小结.md index d53d1ca..a3c72a8 100644 --- a/docs/06-02-运维小结.md +++ b/docs/06-02-运维小结.md @@ -2,6 +2,14 @@ > 日常运维建议:检查项、变更记录、备份策略。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 06-02` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 日常检查 - `kubectl get nodes` 是否全部 `Ready` @@ -29,7 +37,7 @@ - **SSH 与 Ansible** - `bash scripts/ssh/test-ssh.sh` - `ssh -i ~/.ssh/id_ed25519_k3s_*.61 root@192.168.2.61` - - `ansible-playbook -i ansible/inventory.ini ansible/playbooks/nginx-matrix-tls-deploy.yml` + - `ansible-playbook -i ansible/inventory.ini ansible/playbooks/verify/03-02.yml` ## 建议的日常清理 @@ -58,3 +66,8 @@ - 返回 00-00-构建总览.md,按导航继续。 +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/docs/06-03-k3s-自动备份与恢复-openlist-webdav.md b/docs/06-03-k3s-自动备份与恢复-openlist-webdav.md index 21718c1..81fadb2 100644 --- a/docs/06-03-k3s-自动备份与恢复-openlist-webdav.md +++ b/docs/06-03-k3s-自动备份与恢复-openlist-webdav.md @@ -3,6 +3,14 @@ > 本文专注一件事:**如何为使用本地目录的工作负载补上一条“自动备份 + 半自动恢复”的安全网**。 > 核心工具:openlist 聚合网盘 + WebDAV 暴露 + `rclone` / CronJob / CI 脚本。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 06-03` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + ## 前置条件 - 已部署 openlist,并通过它聚合了至少一个云盘 @@ -53,12 +61,12 @@ rclone ls openlist-webdav: ### 2.2 使用 CronJob 定期备份(集群内) -如果你希望在 K3s 内部完成备份,可以将 `rclone` 封装到容器镜像中。**唯一真源(CronJob)**:[`ansible/files/06-03-openlist-webdav/app-data-backup-cronjob.yaml`](../ansible/files/06-03-openlist-webdav/app-data-backup-cronjob.yaml)。 +如果你希望在 K3s 内部完成备份,可以将 `rclone` 封装到容器镜像中。**唯一真源(CronJob)**:[`ansible/files/06-03/app-data-backup-cronjob.yaml`](../ansible/files/06-03/app-data-backup-cronjob.yaml)。 应用方式: ```bash -kubectl apply -f 
ansible/files/06-03-openlist-webdav/app-data-backup-cronjob.yaml +kubectl apply -f ansible/files/06-03/app-data-backup-cronjob.yaml ``` > 提示:如果你的应用使用的是 PVC,而不是 `hostPath`,则可以将 `volumes.hostPath` 改为 `persistentVolumeClaim`。 @@ -69,12 +77,12 @@ kubectl apply -f ansible/files/06-03-openlist-webdav/app-data-backup-cronjob.yam ### 3.1 恢复 Job 示例 -当某个节点发生故障、你将应用调度到另一节点后,可以通过一次性 Job 拉回备份。**唯一真源(Job)**:[`ansible/files/06-03-openlist-webdav/app-data-restore-job.yaml`](../ansible/files/06-03-openlist-webdav/app-data-restore-job.yaml)。 +当某个节点发生故障、你将应用调度到另一节点后,可以通过一次性 Job 拉回备份。**唯一真源(Job)**:[`ansible/files/06-03/app-data-restore-job.yaml`](../ansible/files/06-03/app-data-restore-job.yaml)。 执行恢复: ```bash -kubectl apply -f ansible/files/06-03-openlist-webdav/app-data-restore-job.yaml +kubectl apply -f ansible/files/06-03/app-data-restore-job.yaml kubectl -n default logs -f job/app-data-restore ``` @@ -132,4 +140,8 @@ kubectl rollout restart deploy your-app-deployment -n default - `03-06-k3s-使用nfs存储.md`:更推荐的共享存储方案,适合重要业务数据。 - `06-02-运维小结.md`:运维/备份总体建议。 +## 排障 +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... 
--tail=200`。 diff --git a/docs/07-00-网络与CNI实验-系列说明.md b/docs/07-00-网络与CNI实验-系列说明.md new file mode 100644 index 0000000..4f89679 --- /dev/null +++ b/docs/07-00-网络与CNI实验-系列说明.md @@ -0,0 +1,22 @@ +# 07-00 网络与 CNI 实验(系列说明) + +> 本系列是 **破坏性/换 CNI/双栈** 等实验文档集合;与默认 Flannel 主线不同,建议在独立实验环境中演练,并准备回滚方案。 + +## TL;DR + +- **本页定位**:仅系列导航,不参与 `verify.sh run-all/full` +- **子篇执行入口**:按下表执行 `./scripts/verify.sh run <doc_id>`(若 playbook 为 noop,仍应以文档手工步骤为准) +- **成功判据**:按文档命令块完成实验且具备回滚;不要把“脚本 exit 0”误当“已验证完成” + +## 范围与非目标 + +- 本页是 **07 系列入口/导航页**(`YY=00`),不要求具备独立执行器,且不参与 `verify.sh run-all/full`。 +- `YY>0` 的分项必须包含可执行物(YAML 路径或命令块)。 + +## 07 系列索引 + +| doc_id | 主题 | 自动化入口 | +|-------:|------|------------| +| 07-01 | Calico 双栈实验 | `./scripts/verify.sh run 07-01` | +| 07-02 | Cilium 双栈与 eBPF | `./scripts/verify.sh run 07-02` | + diff --git a/docs/07-01-k3s-calico-dualstack.md b/docs/07-01-k3s-calico-dualstack.md index 2172c2f..bff5f16 100644 --- a/docs/07-01-k3s-calico-dualstack.md +++ b/docs/07-01-k3s-calico-dualstack.md @@ -2,6 +2,14 @@ > 草稿占位:本节用于在后续版本中整理 **k3s + Calico** 的 IPv4/IPv6 双栈网络实验,包括安装参数、clusterCIDR/serviceCIDR 设计、Pod IPv6 出网与 NetworkPolicy 示例。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 07-01` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + - **目标**:在独立的实验环境中,替换默认 flannel,为 k3s 配置 Calico 双栈网络,并验证: - Pod 获得 IPv4+IPv6 地址 - Pod 之间 IPv6 互通 @@ -10,3 +18,8 @@ - **当前状态**:仅预留文档入口;具体步骤将在完成 IPv4-only 实验稳定后补充。 +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n <ns> describe ...`、`kubectl -n <ns> logs ... 
--tail=200`。 diff --git a/docs/07-02-k3s-cilium-dualstack-ebpf.md b/docs/07-02-k3s-cilium-dualstack-ebpf.md index 648397c..cc09ab8 100644 --- a/docs/07-02-k3s-cilium-dualstack-ebpf.md +++ b/docs/07-02-k3s-cilium-dualstack-ebpf.md @@ -2,6 +2,14 @@ > 草稿占位:本节用于后续整理 **k3s + Cilium** 的 IPv4/IPv6 双栈与 eBPF 网络实验,作为在 Calico 双栈实验基础上的进阶篇。 + +## TL;DR + +- **自动化验收**:`./scripts/verify.sh run 07-02` +- **关键前置**:按本文「前置条件」准备环境变量/Secret/入口 IP +- **成功判据**:达到本文「预期」且 playbook 断言通过 +- **排障**:见本文「排障」 + - **目标**: - 使用 Cilium 作为 CNI,为 k3s 集群提供 IPv4/IPv6 双栈 Pod 网络 - 体验 Cilium 的 L3/L4/L7 NetworkPolicy 与 Hubble 流量观测 @@ -9,3 +17,8 @@ - **当前状态**:仅预留文档入口;待基础实验完成后,结合实际环境补充详细步骤与示例。 +## 排障 + +- **先看 playbook 输出**:失败时先定位是 deploy/wait/http_check 哪一步。 +- **集群侧总览**:`kubectl get nodes -o wide`、`kubectl -n kube-system get pods -o wide`。 +- **事件与日志**:`kubectl -n describe ...`、`kubectl -n logs ... --tail=200`。 diff --git a/project-context.md b/project-context.md new file mode 100644 index 0000000..d38a7c9 --- /dev/null +++ b/project-context.md @@ -0,0 +1,174 @@ +--- +project_name: "Deploy-Laboratory" +user_name: "Jack" +date: "2026-03-26" +sections_completed: + - discovery + - technology_stack + - critical_implementation_rules +existing_patterns_found: + naming_and_ids: true + truth_sources: true + verify_framework: true + gating_and_noop: true +--- + +## 项目上下文(给 AI Agent 的实现规则底座) + +本文件用于沉淀本仓库中 **不直观但非常关键** 的约定与模式,避免 AI Agent 在实现/改动代码或文档时走偏。 + +--- + +## 技术栈与版本(来自 `docs/00-04-部署环境说明.md`) + +- **操作系统(验证环境)**:Fedora 43 Server(CoreOS) +- **K3s**:v1.34.5+k3s1 +- **Ansible**:ansible-core 2.18 +- **控制端依赖(典型)**:Git、OpenSSH client、Bash、curl(用于在工作机/控制节点执行 `ansible-playbook` 与仓库脚本) +- **配置入口**: + - `ansible/ansible.cfg`:已关闭 `host_key_checking`(实验室环境约定) + - `ansible/group_vars/all.yml`:k3s/Longhorn/防火墙等关键默认参数 + +--- + +## 仓库结构与“真源”约定 + +- **文档入口**:`README.md` → `docs/00-00-构建总览.md` +- **YAML/配置唯一真源(强约束)**:`ansible/files/**` 为本仓库 Kubernetes YAML / Helm values / 示例配置的**唯一真源**。 + - 文档中引用清单路径时,必须引用 
`ansible/files//...`(禁止在 `docs/` 内复制粘贴出第二份 YAML)。 + - 例外:非 YAML 的源码/生成器/模板(如 `scripts/gen-*.py`)不受此条限制。 +- **执行器唯一真源(强约束)**:`ansible/playbooks/verify/.yml` 为该 `doc_id` 的唯一 Ansible 入口。 + - `scripts/verify.sh` 只基于 `ansible/playbooks/verify/` 自动发现并执行(缺 playbook 必须 fail-fast)。 +- **存在性校验**:`scripts/validate_matrix_playbooks.py`(历史文件名保留)只做 “`verify/.yml` ↔ `docs/-*.md` 存在性” 校验。 +- **目录命名硬约束**:`ansible/files/` 下仅允许 `XX-YY/`(只用 `doc_id`),内部用文件名区分;不再允许 `XX-YY-slug/` 或 `XX-YY-xxx/`。 + +--- + +## doc_id 与验证框架(必须遵循) + +- **doc_id 规则**:`docs/-*.md` 中的 `` 固定为 `XX-YY`。 +- **编号语义(极简、一目了然)**: + - **`XX=00`**:纯文档域(框架/说明/索引/备忘),不要求执行器,且**不参与自动验证**。 + - **`XX>0 且 YY=00`**:系列入口/说明页,不要求执行器,且**不参与自动验证**。 + - **`XX>0 且 YY>0`**:必须至少有一种执行器(见下)。 +- **自动验证范围(强约束)**:仅 `XX>0 且 YY>0` 的文档进入 `scripts/verify.sh` 自动验证范围。 +- **执行器判定口径(不新增关系文件)**: + - **Ansible 执行器**:存在 `ansible/playbooks/verify/.yml` 且可 `./scripts/verify.sh run `。 + - **脚本/SSH 执行器**:文档 TL;DR 写清入口命令(`./scripts/...`),且脚本存在、用退出码表达成功/失败。 +- **统一入口**: + - `scripts/deploy-lab.sh`:铺栈/安装入口(默认保留资源:`DEPLOY_VERIFY_TEARDOWN=0`)。 + - `scripts/verify.sh`:按 `doc_id` 验收入口(默认清理:`VERIFY_TEARDOWN=1`)。 +- **fail-fast(执行域)**:`verify.sh run ` 对 `XX>0 且 YY>0` 条目,若缺少 `ansible/playbooks/verify/.yml` 必须直接失败。 + +--- + +## gate / 开关(避免误触发重操作) + +- **默认安全**:`verify.sh run ` 默认应只做“验收”或“轻量可逆操作”。 +- **重操作必须显式开关**(典型:分区/格式化、全集群安装、TLS 矩阵铺栈等): + - `01-06.yml`:`k3s_do_prepare_storage=true` / `k3s_do_install=true` + - `03-02.yml`:`nginx_matrix_tls_enable=true`(TLS 矩阵铺栈/清理仍用 `mode=deploy|cleanup`) + - `03-05.yml`:`local_path_apply_lab_config=true` +- **gate 语义**:外部依赖未满足可 `meta: end_play` 跳过,但必须输出可 grep 的 `[GATE] ...` 信息,避免“看似通过”。 + +--- + +## 环境变量与密钥安全(强约束) + +- **永不提交真实环境变量文件**:`scripts/.env.verify`(仓库 `.gitignore` 已忽略) +- **仅提交模板**:`scripts/.env.verify.example` +- `scripts/.env.verify` 可能包含外部系统 token/凭据(Cloudflare、WebDAV 等),任何自动化/文档都应默认它只存在于本机 + +--- + +## Critical Implementation Rules + +### 规则 0:本仓库不是“应用代码仓库”,优先保证可重复验证与真源一致 + +- **真源**: + - **YAML 
唯一真源(强约束)**:`ansible/files/**` + - `docs/`:说明/操作手册/验收判据(不得复制出第二套 YAML 真源) +- **入口**: + - `scripts/deploy-lab.sh`:铺栈(默认保留资源,便于持续使用) + - `scripts/verify.sh`:按 doc_id 验收(`list/run/run-all/full`;默认清理本篇临时资源) + +### 规则 0.1:维护者备忘的归位(替代 docs/00-06) + +将“仓库审查结论/维护者备忘”收敛在本文件,避免在 `docs/00-*` 入口层长期堆积噪音(`docs/00-06-仓库审查备忘.md` 已移除)。以下为保留的最小结论集: + +- **破坏性操作隔离**:磁盘分区/格式化、换 CNI、HA 切换等必须 gate,且默认不进入 `verify.sh full/run-all` 主线(见规则 5)。 +- **Ansible shell 使用口径**: + - “分支探测/兼容性场景”可用 `failed_when: false`;但后续必须有明确断言,避免“静默失败”。 + - 清理类任务允许 `failed_when: false`,但应 `register` 并输出关键 rc/stdout/stderr(便于审计与排障)。 + - 优先保持 `verify/<doc_id>.yml` 轻编排;高重复模式应收敛到 `ansible/playbooks/verify/tasks/` 共享片段,避免模板漂移。 +- **API/版本兼容性复核建议**:升级 K3s/Traefik 大版本后,至少复核一次:(1)`Ingress` API(`networking.k8s.io/v1`)字段结构(尤其 `pathType`、backend 端口);(2)Traefik CRD(`IngressRoute`/`Middleware`)是否仍存在且版本一致;(3)K3s `HelmChartConfig`(`helm.cattle.io/v1`)行为是否变化;(4)Longhorn 与 K3s 版本兼容(升级前对照 Longhorn support matrix)。 + +### 规则 1:`doc_id` / verify 目录 / 解析器(防误触) + +- `doc_id` 固定为 `XX-YY`(来自 `docs/XX-YY-*.md` 文件名),`verify.sh run <doc_id>` 必须存在 `ansible/playbooks/verify/<doc_id>.yml`,否则 **fail-fast**。 + +### 规则 2:gate / ✅ 的“证据标准”(防“看似通过”) + +- **禁止“通用 noop 模板”**:每个 `verify/<doc_id>.yml` 必须自包含并体现该文档目标的断言;不得用“仅存在性检查”冒充验收。 +- **gate**:外部依赖未满足时允许跳过 apply/断言,但必须满足: + - playbook 退出码可为 0(表示“框架跑通/条件不足跳过”) + - 文档中应保持 **⚠️**(不可标 ✅) + - teardown 必须“安全”:仅在 gate 通过且 manifest/资源确实存在时才执行 delete,且删除失败不得导致整条用例 fail-fast +- **✅ 已验证** 的最小证据建议(写在文档备注里): + - 说明是 **自动化实测**(deploy+verify+teardown)还是 **noop/gate**(后者不得 ✅) + - 说明是否包含 **第三方/集群外视角**(见规则 3) + +### 规则 2.1:✅ 已验证的“硬门槛”(建议逐步收敛为统一口径) + +将条目从 **⚠️/❓ → ✅** 时,建议满足下列最小门槛(能写进 `verify/<doc_id>.yml` 的尽量写进 playbook,写不进的写在文档备注并注明“手工”): + +- **至少 1 条集群侧断言**(非存在性): + - 例如:`kubectl rollout status` 成功、PVC `Bound`、Pod 就绪、关键 CRD/资源 `Established/Ready` 等 +- **至少 1 条入口/可用性断言**(与该文档目标一致): + - HTTP:`curl` 返回码 + 关键响应标记(例如 `X-Backend`、body contains) + - TLS:必须包含 SNI/证书链验证信号(如 `curl --resolve`/`openssl s_client` 关键字段),仅 `rollout` 不足以标 ✅ +- **明确执行位置**: + - 
若目标是“集群外链路”,必须经 `ONECLOUD_SSH`(或等价第三方机)执行探测;仅控制节点自测不足以标 ✅ +- **资源清理策略说明**: + - 若 `VERIFY_TEARDOWN=0` 保留现场调试,文档需写明(否则容易污染后续用例,导致“假通过/假失败”) +- **外部依赖说明**: + - 若依赖 Cloudflare/ACME/NFS/WebDAV 等外部系统,文档需写明依赖已满足与证据(例如:已创建 secret、已开公网端口、已配置 DNS) + +> 目标:让 ✅ 的含义从“脚本退出码 0”收敛为“有可复现证据的实测通过”。 + +### 规则 3:执行位置与“集群外视角”(防自测冒充真实路径) + +- 默认约定:在 `k3s_server`(如 ylc61)执行 `kubectl/helm/curl`(`KUBECONFIG=/etc/rancher/k3s/k3s.yaml`)。 +- 若用例声明需要“集群外/第三方机”视角(例如家庭网络真实访问路径、OpenWrt 入口、外部 curl),必须显式经 `ONECLOUD_SSH`(或等价变量)执行探测: + - **不得**用“控制节点本机 curl”替代“第三方机 curl”并仍标记为已验证 + +### 规则 4:verify playbook 结构与可靠性约定(可复制模式) + +- 推荐三段式:**deploy → verify → teardown**(或多个 play 分段),并遵守: + - rollout/就绪检查要带合理 `--timeout` + - HTTP 探测要有连接/总超时,并对关键判据做显式断言(例如 nginx 矩阵用 `X-Backend`) + - teardown 默认开启(`VERIFY_TEARDOWN=1`),调试才允许关闭并需在备注说明可能污染后续用例 +- 失败恢复:对“部分已 apply 的临时清单”允许用清理任务兜底(例如清理 `/tmp/...`),但不要吞掉核心错误。 + +### 规则 5:破坏性操作隔离(必须 gate + 默认不进主线) + +- 破坏性内容(如磁盘分区/格式化、换 CNI、HA 切换)必须: + - 用显式开关 gate(例如 `k3s_prepare_storage=true`)才能触发 + - 默认不应混入 `verify.sh full/run-all` 的“安全验收主线” +- 文档里写清“可重建环境/回滚前置”,并区分“生产主线”与“破坏性实验” + - 与主线不同的实验(尤其 `07-*`)默认不提供“一键安装”,不可误做“apply 即生产可用” + +### 规则 6:密钥与敏感信息(强约束) + +- `scripts/.env.verify` 只允许本机存在,**永不提交**;仓库只保留 `scripts/.env.verify.example`(`.gitignore` 已忽略 `scripts/.env.verify`)。 +- inventory 若声明 `ansible_ssh_private_key_file`,控制端必须确保文件存在且权限仅所有者可读(建议 600/400);否则应在 preflight 阶段直接失败(见 `scripts/lib-ansible-lab.sh` 的检查逻辑)。 + +### 规则 7:验证环境基线(避免“跑得通但不复现”) + +- 验证环境的机器角色与约定(例如 `ylc65` 仅作为 Ansible 控制端,不是 K3s 节点)以 `docs/00-04-部署环境说明.md` 为准;新增文档/用例若依赖“在哪台机器执行”,必须写清。 +- `ansible/group_vars/all.yml` 中的关键默认值(如 `k3s_data_dir=/storage`、`k3s_verify_storage_mount=true`、`k3s_manage_firewalld=true`、CoreDNS forward)会影响大量文档与用例,修改这些值相当于“改了实验室契约”,应同步更新文档并回退相关条目的验证状态。 + +### 规则 8:Ansible 控制端连接约定(实验室特化) + +- `ansible/ansible.cfg` 已设置 `host_key_checking=False` 且默认 inventory 为同目录的 `inventory.ini`(实验室特化);不要把依赖写死到 `~/.ansible.cfg`。 + diff --git a/scripts/.env.verify.example b/scripts/.env.verify.example index 
9db4c6d..80a23e8 100644 --- a/scripts/.env.verify.example +++ b/scripts/.env.verify.example @@ -1,5 +1,6 @@ -# 验证矩阵 / 编排脚本用环境变量模板 +# 验证 / 编排脚本用环境变量模板 # --------------------------------------------------------------------------- +# 各变量与「未验证 / 部分验证」文档的对应关系、缺省时的行为,见 docs/00-07-待验证项-验证前准备.md # 使用:复制为本目录下的 .env.verify(勿提交 Git),在仓库根执行: # set -a && source scripts/.env.verify && set +a # 或在 bash 中:source scripts/.env.verify @@ -23,10 +24,9 @@ export K3S_DATA_DIR="${K3S_DATA_DIR:-/storage}" # --- Ansible(安装/复验 k3s;对应 docs/00-05 §2 步骤 3)--- export ANSIBLE_INVENTORY="${ANSIBLE_INVENTORY:-$(pwd)/ansible/inventory.ini}" -# deploy-lab.sh k3s / ssh/run-phase2-k3s-on-ylc61-as-jack.sh: -# 为 true 时先跑 k3s-prepare-storage.yml(传 -e k3s_prepare_storage=true;磁盘变量见 group_vars) +# deploy-lab.sh k3s:为 true 时先在 01-06.yml 内启用准备数据盘(传 -e k3s_do_prepare_storage=true -e k3s_prepare_storage=true;磁盘变量见 group_vars) export K3S_PREPARE_STORAGE="${K3S_PREPARE_STORAGE:-false}" -# 建议在控制节点或 Linux 工作机执行 deploy-lab.sh;办公机可用 run-phase2-k3s-on-ylc61-as-jack.sh 触发远端 +# 建议在控制节点或 Linux 工作机仓库根执行 deploy-lab.sh # --- SSH 密钥命名(与 scripts/ssh/test-ssh.sh 默认一致;脚本内尚为硬编码路径)--- # test-ssh 使用:$K3S_SSH_KEY_DIR/${K3S_SSH_KEY_PREFIX} @@ -39,8 +39,12 @@ export SSH_USER="${SSH_USER:-jack}" export TIMEOUT_SEC="${TIMEOUT_SEC:-5}" # --- 矩阵验证(docs/00-05 §2 步骤 4~6)--- -# ./scripts/verify.sh run | run-all | preflight | flow -# VERIFY_TEARDOWN(默认 1)、VERIFY_PREFLIGHT_CLUSTER(为 1 时 preflight 额外 kubectl get nodes) +# ./scripts/verify.sh run | run-all | preflight | full | flow +# verify.sh 默认 VERIFY_TEARDOWN=1:每篇 playbook 末尾 teardown,再跑下一 doc_id;勿设为 0(资源残留会干扰后续用例) +export VERIFY_TEARDOWN="${VERIFY_TEARDOWN:-1}" +# 02-xx / 03-02 等经 Ingress 校验 HTTP;填控制节点或入口 URL(与 inventory 中 k3s_server IP 一致) +export nginx_entry_base="${nginx_entry_base:-http://192.168.2.61}" +# VERIFY_PREFLIGHT_CLUSTER(为 1 时 preflight 额外 kubectl get nodes) # --- SSH:第三方验证机 onecloud(不忽略:矩阵里多处依赖「集群外」curl/探测)--- # 用途示例:02-xx nginx 矩阵从第三方访问 Ingress;01-07 经 onecloud 对 
OpenWrt:18080/18443 发 curl; @@ -76,18 +80,21 @@ export TRAEFIK_NAMESPACE="${TRAEFIK_NAMESPACE:-kube-system}" # 逗号分隔,与 ZONE_NAME 下实际 DNS 记录一致;勿提交敏感子域若需可只写本机 export VERIFY_TLS_HOSTS="${VERIFY_TLS_HOSTS:-test01.jackadam.top,test02.jackadam.top,test03.jackadam.top,test04.jackadam.top}" -# --- Longhorn(03-07 / ansible longhorn-install)--- +# --- Longhorn(03-07 / ansible verify/stack-longhorn-install)--- export LONGHORN_NAMESPACE="${LONGHORN_NAMESPACE:-longhorn-system}" -# --- 可选跳过(仅下列项;ONECLOUD 与 ARMV7 实机路径不在此列——见下方)--- +# --- 可选跳过(编排占位;当前 verify.sh 未实现 HA/GitOps 门控时可忽略)--- export SKIP_HA="${SKIP_HA:-1}" -# 跑 01-03 / 01-05 时设为 0,并填写 ARMV7_*;编排脚本不得在无设备时假装通过 -export SKIP_ARMV7="${SKIP_ARMV7:-1}" export SKIP_GITOPS="${SKIP_GITOPS:-1}" -# --- armv7(01-03 Docker、01-05 NFS):不忽略;与 ONECLOUD 一样,编排须显式走 ARMV7_*,不得跳过 --- -# 跑 01-03 / 01-05 时:SKIP_ARMV7=0,并填写 ARMV7_SSH(可与 onecloud 同主机、或直连 arm;按你环境二选一)。 -# ARMV7_NFS_SSH 默认同 ARMV7_SSH;若 NFS 在另一台 arm 上再单独覆盖。 +# --- armv7 / arm32(01-03 Docker、01-05 NFS、05-02 部分)--- +# 默认 SKIP_ARMV7=1:verify/01-03、01-05 仅跑矩阵基线(文档/文件检查),不经 SSH 改 arm 机。 +# 设 SKIP_ARMV7=0 且 ARMV7_SSH 非空:run 01-03 时经该 SSH 在 arm 上 dnf 装 docker 并校验(假定 Fedora/RHEL 系,见 docs/01-03)。 +# 01-05:同上,但 NFS 所在主机可用 ARMV7_NFS_SSH;未设则回退为 ARMV7_SSH;会写 /etc/exports、exportfs(见 docs/01-05)。 +# export ARMV7_NFS_EXPORT_PATH="/sdcard" +# export ARMV7_NFS_CLIENT_SUBNET="192.168.2.0/24" +# verify.sh 在 source .env.verify 后执行 playbook,子进程会继承下列变量(无需 verify.sh 单独传参)。 +export SKIP_ARMV7="${SKIP_ARMV7:-1}" export ARMV7_SSH="${ARMV7_SSH:-}" export ARMV7_NFS_SSH="${ARMV7_NFS_SSH:-$ARMV7_SSH}" @@ -99,10 +106,8 @@ export ARMV7_NFS_SSH="${ARMV7_NFS_SSH:-$ARMV7_SSH}" # export OPENWRT_VERIFY_HTTPS_HOSTS="test01.jackadam.top,..." 
# --- 与 scripts/*.sh 对照 --- -# verify.sh → VERIFY_TEARDOWN, VERIFY_PREFLIGHT_CLUSTER, nginx_entry_base, ANSIBLE_INVENTORY +# verify.sh → VERIFY_TEARDOWN, VERIFY_PREFLIGHT_CLUSTER, nginx_entry_base, ANSIBLE_INVENTORY;01-03/01-05 另读 SKIP_ARMV7、ARMV7_SSH、ARMV7_NFS_SSH 等(见上文 armv7 段) # deploy-lab.sh → ANSIBLE_INVENTORY, K3S_PREPARE_STORAGE -# ssh/run-phase2-k3s-on-ylc61-as-jack.sh → LAB_REPO_ROOT, K3S_PREPARE_STORAGE(传远端) -# ssh/smoke-verify-matrix-on-ylc61.sh → VERIFY_REPO_ROOT, VERIFY_TEARDOWN, nginx_entry_base # cloudflare-delete-acme-challenge-dns.sh → CF_API_TOKEN, ZONE_NAME, ZONE_ID # k3s-delete-lab-stacks.sh → KUBECONFIG # ssh/test-ssh.sh → TIMEOUT_SEC;密钥路径当前固定为 $HOME/.ssh/id_ed25519_k3s_ diff --git a/scripts/README.md b/scripts/README.md index 6d4f5d5..4060a94 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -2,7 +2,7 @@ 本目录集中维护通用运维脚本。约定:**在仓库根目录执行**,使用 `./scripts/...` 路径调用。 -流程说明与「部署 / 验证」分工以 [`docs/00-05-测试与验证框架.md`](../docs/00-05-测试与验证框架.md) **§2 自动化验证流程** 为准;下表与之一一对应。 +流程说明与「部署 / 验证」分工以 [`docs/00-03-测试与验证框架.md`](../docs/00-03-测试与验证框架.md) **§2 自动化验证流程** 为准;下表与之一一对应。 | §2 步骤 | 含义 | 本仓库入口 | |--------|------|------------| @@ -10,18 +10,23 @@ | 2 环境/清理 | 轻量:各 verify 的 teardown;重度:清实验负载 / 重装 K3s | 轻量:`VERIFY_TEARDOWN`(默认 1);重度:`k3s-delete-lab-stacks.sh`、文档中的 `k3s-uninstall`(**勿**默认进 `run-all`) | | 3 部署 | K3s、Longhorn、nginx 矩阵等铺栈 | **`./scripts/deploy-lab.sh`**(`k3s` / `longhorn` / `nginx-matrix` / `nginx-matrix-tls`) | | 4~5 断言与收尾 | 按 doc 目标 kubectl/curl;本篇 teardown | **`./scripts/verify.sh`** `run` / `run-all` | -| 6 一键串联 | 按矩阵顺序跑全部 verify | **`./scripts/verify.sh run-all`** | +| 6 一键串联 | 按 doc_id 顺序跑全部 verify(可先 preflight) | **`./scripts/verify.sh full`**(推荐,= preflight + run-all)或 **`./scripts/verify.sh run-all`** | + +真机一键验收(可选先铺栈再全量验收): + +- `./scripts/acceptance.sh`:可选先执行 `deploy-lab.sh`(由环境变量开关),再执行 `verify.sh full`。 辅助命令: - `./scripts/verify.sh flow` — 打印与 §2 对齐的流程说明(不接 Ansible)。 -- `./scripts/verify.sh preflight` — 检查 
`ansible-playbook`、矩阵、`inventory`,并对 `k3s_server` 执行 `ping`;若已装集群可设 `VERIFY_PREFLIGHT_CLUSTER=1` 再执行 `kubectl get nodes`。 +- `./scripts/verify.sh preflight` — 检查 `ansible-playbook` 与 `inventory`,并对 `k3s_server` 执行 `ping`;若已装集群可设 `VERIFY_PREFLIGHT_CLUSTER=1` 再执行 `kubectl get nodes`。 +- `./scripts/verify.sh list --series 04 --exclude-noop` — 支持按主序列与 noop 过滤查看执行集合。 ## 验证编排环境变量(可选) 复制 [`scripts/.env.verify.example`](.env.verify.example) 为 `scripts/.env.verify` 并填写本机值;**勿提交** `scripts/.env.verify`(已在仓库 `.gitignore` 中忽略)。 -其中 **`ONECLOUD_SSH`** 用于矩阵里**集群外**第三方 curl 等;**`ARMV7_SSH` / `ARMV7_NFS_SSH`** 用于 `01-03` / `01-05` 实机;**`ACME_EMAIL`** 供 Traefik ACME(`03-02` / `03-03`);另有 **`VERIFY_TLS_HOSTS`**、`K3S_SERVER_HOSTNAME`、`TIMEOUT_SEC`、`LONGHORN_NAMESPACE` 等,完整列表见 [`.env.verify.example`](.env.verify.example) 文末注释。 +其中 **`ONECLOUD_SSH`** 用于**集群外**第三方 curl 等;**`SKIP_ARMV7` / `ARMV7_SSH` / `ARMV7_NFS_SSH`**(及 01-05 的 **`ARMV7_NFS_EXPORT_PATH`**、**`ARMV7_NFS_CLIENT_SUBNET`**)由 playbook 通过环境变量读取:`SKIP_ARMV7=1`(默认)时 `01-03`/`01-05` 仅基线检查;**`SKIP_ARMV7=0` 且 SSH 已配置** 时会对 arm 主机执行 dnf 路径(Fedora/RHEL 系,见 `docs/00-05` §E)。**`ACME_EMAIL`** 供 Traefik ACME(`03-02` / `03-03`);另有 **`VERIFY_TLS_HOSTS`**、`K3S_SERVER_HOSTNAME`、`TIMEOUT_SEC`、`LONGHORN_NAMESPACE` 等,完整列表见 [`.env.verify.example`](.env.verify.example) 文末注释。 ```bash set -a && source scripts/.env.verify && set +a @@ -29,19 +34,23 @@ set -a && source scripts/.env.verify && set +a ## 部署 K3s(推荐在控制节点或 Linux 工作机) -1. 在仓库根(或 `cd ansible` 后改用相对路径)执行 **`./scripts/deploy-lab.sh k3s`**。若需先准备数据盘,在 **本机或 `.env.verify`** 中设 `K3S_PREPARE_STORAGE=true`(会传 `-e k3s_prepare_storage=true` 跑 `k3s-prepare-storage.yml`)。 -2. 
办公机无 Ansible 时,可 SSH 到 ylc61 再执行上述命令,或使用: +在仓库根(或 `cd ansible` 后改用相对路径)执行 **`./scripts/deploy-lab.sh k3s`**。若需先准备数据盘,在 **本机或 `.env.verify`** 中设 `K3S_PREPARE_STORAGE=true`(会传 `-e k3s_prepare_storage=true` 跑 `ansible/playbooks/verify/01-06.yml`)。 + +**密钥与执行用户**:`inventory.ini` 中私钥路径随执行用户变化;在目标节点上以非 root 用户执行时,注意私钥路径与 `ansible_user` 与文档一致。 + +## 验证(run-all / full) + +**推荐一行**(在仓库根;需已安装 Ansible、[`ansible/inventory.ini`](../ansible/inventory.ini) 可达、`k3s_server` 可 ping;集群与入口变量已按 [`00-04`](../docs/00-04-部署环境说明.md) 与 `.env.verify` 配好): ```bash -export K3S_PREPARE_STORAGE=false # 或 true -./scripts/ssh/run-phase2-k3s-on-ylc61-as-jack.sh +./scripts/verify.sh full ``` -环境变量 **`LAB_REPO_ROOT`**(仅 SSH 包装脚本使用)可指定远端仓库路径,默认 `/home/jack/实验室建设`。 +`full` = `preflight` + `run-all`。若 `nginx_entry_base`、`nodejs_entry_base` 等未写入 `scripts/.env.verify`,可先 `export nginx_entry_base=http://<入口IP>` 再执行。仅跑用例、跳过 preflight 时用 `./scripts/verify.sh run-all`。`list/run-all/full` 均支持筛选参数:`--series `、`--id-regex `、`--exclude-noop`、`--require-teardown`。 -**密钥与执行用户**:`inventory.ini` 中私钥路径随执行用户变化;在 ylc61 上以 `jack` 执行可避免解析到 `/root/.ssh/`。 +将准备项(NFS、ACME、armv7、noop 文档等)补齐后再推进“已验证”,见 [`docs/00-04-待验证项-验证前准备.md`](../docs/00-04-待验证项-验证前准备.md)。 -## 验证矩阵(run-all / 抽样 smoke) +等价多行写法(与 `full` 相同): ```bash ./scripts/verify.sh preflight @@ -49,20 +58,10 @@ export nginx_entry_base=http://192.168.2.61 ./scripts/verify.sh run-all ``` -从办公机经 SSH 在远端仓库根跑抽样用例: - -```bash -./scripts/ssh/smoke-verify-matrix-on-ylc61.sh -``` - -可用 **`VERIFY_REPO_ROOT`** 指定远端路径(默认 `/home/jack/实验室建设`)。 - ## 目录与脚本对照 -- **`verify.sh`** — 矩阵验收:`flow` / `preflight` / `list` / `run` / `run-all` +- **`verify.sh`** — doc_id 验收:`flow` / `preflight` / `full` / `list` / `run` / `run-all` - **`deploy-lab.sh`** — 安装/铺栈:`k3s` / `longhorn` / `nginx-matrix` / `nginx-matrix-tls` -- **`ssh/run-phase2-k3s-on-ylc61-as-jack.sh`** — 办公机触发远端 `deploy-lab.sh k3s` -- **`ssh/smoke-verify-matrix-on-ylc61.sh`** — 办公机触发远端若干 `verify.sh run` - **`k3s-delete-lab-stacks.sh`** 
— 按 kubectl 实况清命名空间负载(重度清场,非默认 teardown) - **`cloudflare-delete-acme-challenge-dns.sh`** — 清理 CF 上 `_acme-challenge` DNS - **`ssh/setup-k3s-workers-ssh.sh`** — 为 K3s 节点预配 SSH(配合 `01-06`) @@ -80,6 +79,6 @@ export nginx_entry_base=http://192.168.2.61 ## 说明文档 -- 验证框架:[`docs/00-05-测试与验证框架.md`](../docs/00-05-测试与验证框架.md) -- 验证矩阵状态:[`docs/00-02-验证矩阵.md`](../docs/00-02-验证矩阵.md) -- 主文档入口:`docs/00-00-构建总览.md`(若存在) +- 验证框架:[`docs/00-03-测试与验证框架.md`](../docs/00-03-测试与验证框架.md) +- 验证前准备:[`docs/00-04-待验证项-验证前准备.md`](../docs/00-04-待验证项-验证前准备.md) +- 主文档入口:`docs/00-00-构建总览.md` diff --git a/scripts/acceptance.sh b/scripts/acceptance.sh new file mode 100755 index 0000000..e7fceee --- /dev/null +++ b/scripts/acceptance.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +# 真机一键验收入口:可选铺栈 + 矩阵全量验收(verify.sh full)。 +# 默认:只做验收(不强制重装/重铺栈),避免误伤现有环境。 +# +# 开关(环境变量): +# ACCEPT_DEPLOY=1 先执行 deploy-lab(默认 0) +# ACCEPT_DEPLOY_K3S=1 部署/复验 K3s(默认 1;仅在 ACCEPT_DEPLOY=1 时生效) +# ACCEPT_DEPLOY_LONGHORN=1 部署 Longhorn(默认 0) +# ACCEPT_DEPLOY_NGINX_MATRIX=1 部署 nginx 矩阵(默认 0) +# ACCEPT_DEPLOY_NGINX_MATRIX_TLS=1 部署 TLS nginx 矩阵(默认 0) +# VERIFY_TEARDOWN=1 验收后清理临时资源(沿用 verify.sh 默认;可设 0 保留现场) +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +load_env() { + if [[ -f "${ROOT}/scripts/.env.verify" ]]; then + set -a + # shellcheck disable=SC1091 + source "${ROOT}/scripts/.env.verify" + set +a + echo "[OK] 已加载 scripts/.env.verify" + fi +} + +usage() { + cat <<'EOF' +用法:scripts/acceptance.sh + +说明: + - 真机「一键验收」:可选先铺栈(deploy-lab),再跑矩阵全量验收(verify.sh full) + - 默认不铺栈(避免误改现网);只执行 ./scripts/verify.sh full + +常用示例: + # 只验收(推荐默认) + ./scripts/acceptance.sh + + # 先复验/安装 K3s,再全量验收 + ACCEPT_DEPLOY=1 ./scripts/acceptance.sh + + # 铺栈(K3s + Longhorn + nginx-matrix),然后全量验收 + ACCEPT_DEPLOY=1 ACCEPT_DEPLOY_LONGHORN=1 ACCEPT_DEPLOY_NGINX_MATRIX=1 ./scripts/acceptance.sh + + # 验收不清理(保留现场排障) + VERIFY_TEARDOWN=0 ./scripts/acceptance.sh +EOF +} + +need_cmd() { + if ! 
command -v "$1" >/dev/null 2>&1; then + echo "[ERR] 未找到命令:$1" >&2 + exit 1 + fi +} + +main() { + load_env + + local cmd="${1:-}" + case "$cmd" in + "" ) ;; + -h|--help|help) usage; exit 0 ;; + *) echo "[ERR] unknown arg: $cmd" >&2; usage; exit 1 ;; + esac + + need_cmd bash + need_cmd python3 + need_cmd ansible-playbook + + if [[ "${ACCEPT_DEPLOY:-0}" == "1" ]]; then + echo "########################################## deploy (optional)" + if [[ "${ACCEPT_DEPLOY_K3S:-1}" == "1" ]]; then + ./scripts/deploy-lab.sh k3s + fi + if [[ "${ACCEPT_DEPLOY_LONGHORN:-0}" == "1" ]]; then + ./scripts/deploy-lab.sh longhorn + fi + if [[ "${ACCEPT_DEPLOY_NGINX_MATRIX:-0}" == "1" ]]; then + ./scripts/deploy-lab.sh nginx-matrix + fi + if [[ "${ACCEPT_DEPLOY_NGINX_MATRIX_TLS:-0}" == "1" ]]; then + ./scripts/deploy-lab.sh nginx-matrix-tls + fi + fi + + echo "" + echo "########################################## verify full (matrix)" + ./scripts/verify.sh full +} + +main "$@" + diff --git a/scripts/deploy-lab.sh b/scripts/deploy-lab.sh index d241a95..fe4e2ed 100644 --- a/scripts/deploy-lab.sh +++ b/scripts/deploy-lab.sh @@ -1,14 +1,18 @@ #!/usr/bin/env bash -# 实验室「正式部署」入口(对应 docs/00-05 §2 步骤 1~3,与 verify.sh 的逐条验收 teardown 不同)。 +# 实验室「正式部署」入口(对应 docs/00-04 §2 步骤 1~3)。 # 在仓库根执行:./scripts/deploy-lab.sh <子命令> # -# 步骤对应关系(详见 docs/00-05-测试与验证框架.md §2): +# 步骤对应关系(详见 docs/00-03-测试与验证框架.md §2): # 1 接入 — 本机有 ansible-playbook、inventory 可达;可选加载 scripts/.env.verify -# 2 前置 — 可选 k3s-prepare-storage(磁盘 → /storage),非 k3s-uninstall 类重度清理 -# 3 部署 — k3s-init-and-install 或其它子命令所列 playbook +# 2 前置 — 可选 01-06-prepare-storage(磁盘 → /storage),非 k3s-uninstall 类重度清理 +# 3 部署 — 调用 ansible/playbooks/verify/ 下 playbook;默认 -e VERIFY_TEARDOWN=0(铺栈后保留资源)。 +# 需要验收后自动清理时,用 ./scripts/verify.sh run (默认 VERIFY_TEARDOWN=1)。 set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +# shellcheck disable=SC1091 +source "${ROOT}/scripts/lib-ansible-lab.sh" +ansible_lab_export_config load_env() { if [[ -f "${ROOT}/scripts/.env.verify" ]]; then @@ -25,17 +29,18 @@ usage() { 用法:scripts/deploy-lab.sh <子命令> 子命令: - k3s 安装/复验 K3s(可选先跑数据盘准备,见环境变量) - longhorn Helm 安装 Longhorn(ansible/playbooks/longhorn-install.yml) - nginx-matrix 部署 HTTP nginx 矩阵(ansible/playbooks/nginx-matrix-deploy.yml) - nginx-matrix-tls 部署 TLS nginx 矩阵(ansible/playbooks/nginx-matrix-tls-deploy.yml) + k3s 安装/复验 K3s(verify/01-06.yml;可选先数据盘准备) + longhorn Helm 安装 Longhorn + 健康检查(verify/03-07.yml,VERIFY_TEARDOWN=0) + nginx-matrix HTTP nginx 矩阵 + 校验(verify/02-05.yml,VERIFY_TEARDOWN=0) + nginx-matrix-tls TLS nginx 矩阵(verify/03-02.yml,需显式 -e nginx_matrix_tls_enable=true) 环境变量(节选,完整见 scripts/.env.verify.example): - ANSIBLE_INVENTORY 默认 <仓库>/ansible/inventory.ini - K3S_PREPARE_STORAGE 为 true 时先执行 k3s-prepare-storage.yml,并传 -e k3s_prepare_storage=true + ANSIBLE_INVENTORY 默认 <仓库>/ansible/inventory.ini + K3S_PREPARE_STORAGE 为 true 时在 01-06.yml 内启用准备数据盘(传 -e k3s_do_prepare_storage=true -e k3s_prepare_storage=true) + DEPLOY_VERIFY_TEARDOWN 默认 0;若设为 1,则与 verify 子命令一并传入 Ansible(longhorn/nginx-matrix 会执行卸载类 teardown) 说明: - 「矩阵级验收」请用 ./scripts/verify.sh run / run-all;本脚本只做安装/铺栈,不负责按 doc_id 做断言与 teardown。 + 铺栈入口与验收入口共用 verify 下 playbook;区别为本脚本固定默认 VERIFY_TEARDOWN=0。验收请用 ./scripts/verify.sh full / run 。 EOF } @@ -49,15 +54,17 @@ ansible_wrap() { echo "[ERR] 未找到 ansible-playbook,请先安装 Ansible" >&2 exit 1 fi - echo "[RUN] ansible-playbook -i $inv $*" - ansible-playbook -i "$inv" "$@" + ansible_lab_check_inventory_keys "$inv" || exit 1 + local td="${DEPLOY_VERIFY_TEARDOWN:-0}" + echo "[RUN] ansible-playbook -i $inv -e VERIFY_TEARDOWN=$td $*" + ansible-playbook -i "$inv" -e "VERIFY_TEARDOWN=$td" "$@" } cmd_k3s() { if [[ "${K3S_PREPARE_STORAGE:-false}" == "true" ]]; then - ansible_wrap "${ROOT}/ansible/playbooks/k3s-prepare-storage.yml" -e 'k3s_prepare_storage=true' + ansible_wrap 
"${ROOT}/ansible/playbooks/verify/01-06.yml" -e 'k3s_do_prepare_storage=true' -e 'k3s_prepare_storage=true' fi - ansible_wrap "${ROOT}/ansible/playbooks/k3s-init-and-install.yml" + ansible_wrap "${ROOT}/ansible/playbooks/verify/01-06.yml" -e 'k3s_do_install=true' } main() { @@ -66,9 +73,9 @@ main() { case "$sub" in ""|-h|--help) usage ;; k3s) cmd_k3s ;; - longhorn) ansible_wrap "${ROOT}/ansible/playbooks/longhorn-install.yml" ;; - nginx-matrix) ansible_wrap "${ROOT}/ansible/playbooks/nginx-matrix-deploy.yml" ;; - nginx-matrix-tls) ansible_wrap "${ROOT}/ansible/playbooks/nginx-matrix-tls-deploy.yml" ;; + longhorn) ansible_wrap "${ROOT}/ansible/playbooks/verify/03-07.yml" ;; + nginx-matrix) ansible_wrap "${ROOT}/ansible/playbooks/verify/02-05.yml" ;; + nginx-matrix-tls) ansible_wrap "${ROOT}/ansible/playbooks/verify/03-02.yml" -e 'nginx_matrix_tls_enable=true' ;; *) echo "[ERR] 未知子命令:$sub" >&2 usage diff --git a/scripts/fix-04-doc-refs.py b/scripts/fix-04-doc-refs.py new file mode 100644 index 0000000..14373ea --- /dev/null +++ b/scripts/fix-04-doc-refs.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +"""After 04-xx renumbering: fix cross-links and per-doc manifest names across the repo.""" +from __future__ import annotations + +import re +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] + +# Old canonical .md basename -> new basename (apply before per-file yaml fix) +MD_MAP: list[tuple[str, str]] = [ + ("04-11-nodejs-副本与滚动发布.md", "04-06-nodejs-副本与滚动发布.md"), + ("04-10-nodejs-Ingress与Traefik.md", "04-07-nodejs-Ingress与Traefik.md"), + ("04-09-nodejs-存储与卷.md", "04-11-nodejs-存储与卷.md"), + ("04-08-nodejs-安全上下文.md", "04-10-nodejs-安全上下文.md"), + ("04-07-nodejs-调度与亲和.md", "04-09-nodejs-调度与亲和.md"), + ("04-06-nodejs-探针与健康检查.md", "04-05-nodejs-探针与健康检查.md"), + ("04-05-nodejs-资源请求与限制.md", "04-08-nodejs-资源请求与限制.md"), + ("04-04-nodejs-端口与Service.md", "04-02-nodejs-端口与Service.md"), + ("04-03-nodejs-环境变量与配置注入.md", "04-04-nodejs-环境变量与配置注入.md"), + 
("04-02-nodejs-镜像与运行命令.md", "04-03-nodejs-镜像与运行命令.md"), +] + +SKIP_DIR_NAMES = {".git", "node_modules", "logs"} +TEXT_SUFFIXES = {".md", ".yml", ".yaml", ".sh", ".txt", ".example"} + + +def iter_files(): + for p in ROOT.rglob("*"): + if not p.is_file(): + continue + if any(x in p.parts for x in SKIP_DIR_NAMES): + continue + if p.suffix.lower() not in TEXT_SUFFIXES and p.name not in ( + ".env.verify.example", + ): + continue + yield p + + +def apply_md_map(content: str) -> str: + for old, new in MD_MAP: + content = content.replace(old, new) + return content + + +def fix_doc_manifests(content: str, doc_id: str) -> str: + return re.sub( + r"04-\d{2}-nodejs-demo\.yaml", + f"04-{doc_id}-nodejs-demo.yaml", + content, + ) + + +def fix_title(content: str, title_body: str) -> str: + lines = content.splitlines() + if lines and lines[0].startswith("# "): + lines[0] = f"# {title_body}" + return "\n".join(lines) + ("\n" if content.endswith("\n") else "") + return content + + +def main() -> None: + for path in iter_files(): + raw = path.read_text(encoding="utf-8") + new = apply_md_map(raw) + if path.parent.name == "docs" and re.match(r"04-\d{2}-", path.name): + m = re.match(r"04-(\d{2})-", path.name) + if m: + doc_id = m.group(1) + new = fix_doc_manifests(new, doc_id) + base = path.name.removesuffix(".md") + new = fix_title(new, base) + path.write_text(new, encoding="utf-8") + + # verify playbooks: doc_filename must match renumbered docs + vf = ROOT / "ansible/playbooks/verify" + for yml in sorted(vf.glob("04-*.yml")): + m = re.match(r"04-(\d{2})\.yml$", yml.name) + if not m: + continue + nid = m.group(1) + text = yml.read_text(encoding="utf-8") + # find docs/04-NN-*.md in file after md_map would already be applied + dm = re.search(r'doc_filename:\s*"([^"]+)"', text) + if not dm: + continue + old_fn = dm.group(1) + if not old_fn.startswith(f"04-{nid}-"): + # pick any docs/04-NN-*.md with this NN + docs_dir = ROOT / "docs" + matches = list(docs_dir.glob(f"04-{nid}-*.md")) + if 
len(matches) == 1: + text = re.sub( + r'doc_filename:\s*"[^"]+"', + f'doc_filename: "{matches[0].name}"', + text, + count=1, + ) + yml.write_text(text, encoding="utf-8") + + +if __name__ == "__main__": + main() diff --git a/scripts/gen-nodejs-demo-yaml.py b/scripts/gen-nodejs-demo-yaml.py new file mode 100644 index 0000000..7d58e6f --- /dev/null +++ b/scripts/gen-nodejs-demo-yaml.py @@ -0,0 +1,478 @@ +#!/usr/bin/env python3 +"""Generate cumulative 04-02..04-11 nodejs-demo YAML (Core→Plus→Pro doc order).""" +from pathlib import Path +from textwrap import dedent + +DIR = Path(__file__).resolve().parents[1] / "labs/nodejs/manifests" + +CM = dedent( + """\ + apiVersion: v1 # ConfigMap API 版本 + kind: ConfigMap # 配置资源:ConfigMap + metadata: # ConfigMap 元信息 + name: nodejs-demo-config # ConfigMap 名称 + namespace: default # 命名空间 + data: # 配置键值 + APP_MSG: "Hello from ConfigMap" # 注入给应用的消息内容 + """ +).strip() + +SVC_8080 = dedent( + """\ + apiVersion: v1 # Service API 版本 + kind: Service # Service 资源 + metadata: # Service 元信息 + name: nodejs-demo # Service 名称 + namespace: default # 命名空间 + spec: # Service 规格 + selector: # 选择后端 Pod + app: nodejs-demo # 选中 app=nodejs-demo + ports: # 端口映射 + - port: 80 # Service 暴露端口 + targetPort: 8080 # 转发到容器端口 + """ +).strip() + +ING_NODE = dedent( + """\ + apiVersion: networking.k8s.io/v1 # Ingress API 版本 + kind: Ingress # Ingress 资源 + metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: default # 命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 + spec: # Ingress 规则 + rules: # 规则列表 + - http: # HTTP 路由 + paths: # 路径列表 + - path: /node # 匹配路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 + """ +).strip() + +ING_HOST = dedent( + """\ + apiVersion: networking.k8s.io/v1 # Ingress API 版本 + kind: Ingress # Ingress 资源 + metadata: # Ingress 元信息 + name: nodejs-demo # Ingress 名称 + namespace: 
default # 命名空间 + annotations: # Traefik 注解 + traefik.ingress.kubernetes.io/router.entrypoints: web # 使用 web(HTTP) 入口 + spec: # Ingress 规则 + rules: # 规则列表 + - host: app.example.local # 主机名匹配 + http: # HTTP 路由 + paths: # 路径列表 + - path: /api # 匹配 API 路径前缀 + pathType: Prefix # 前缀匹配 + backend: # 后端目标 + service: # 后端 Service + name: nodejs-demo # Service 名称 + port: # Service 端口 + number: 80 # 端口号 + """ +).strip() + +# 与 Deployment 模板中 ` ports:` 同级(勿对整段 dedent,否则会剥掉缩进) +PROBES = ( + " livenessProbe: # 存活探针\n" + " httpGet: # HTTP 探测\n" + " path: / # 探测路径\n" + " port: 8080 # 探测端口\n" + " initialDelaySeconds: 3 # 初始延迟\n" + " periodSeconds: 10 # 探测周期\n" + " readinessProbe: # 就绪探针\n" + " httpGet: # HTTP 探测\n" + " path: / # 探测路径\n" + " port: 8080 # 探测端口\n" + " initialDelaySeconds: 2 # 初始延迟\n" + " periodSeconds: 5 # 探测周期\n" +) + +RES = ( + " resources: # 资源请求与限制\n" + " requests: # 最小资源请求\n" + " cpu: \"50m\" # 请求 CPU\n" + " memory: \"64Mi\" # 请求内存\n" + " limits: # 资源上限\n" + " cpu: \"500m\" # CPU 限制\n" + " memory: \"256Mi\" # 内存限制\n" +) + + +def main() -> None: + # 04-02: 01 + 仅改监听 8080(无 ConfigMap) + doc2 = dedent( + """\ + # 对应文档:docs/04-02-nodejs-端口与Service.md + # 累积:04-01 + 容器与 Service 改监听 8080(与后续探针一致) + apiVersion: apps/v1 # Deployment API 版本 + kind: Deployment # 工作负载:Deployment + metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 + spec: # Deployment 规格 + replicas: 1 # 副本数 + selector: # Deployment 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18-alpine # Node.js 镜像 + command: ["node", "-e", "require('http').createServer((req,res)=>res.end('Hello World from Node.js')).listen(8080)"] # 内联 HTTP 服务改监听 8080 + ports: # 容器端口 + - containerPort: 8080 # 应用监听端口 + --- + """ + ) + SVC_8080 + "\n---\n" + ING_NODE + "\n" + + # 04-03: + 
固定镜像 tag、command/args(与旧 04-02 等价,端口 8080) + doc3 = dedent( + """\ + # 对应文档:docs/04-03-nodejs-镜像与运行命令.md + # 累积:04-02 + 固定镜像 tag、imagePullPolicy、command/args + apiVersion: apps/v1 # Deployment API 版本 + kind: Deployment # 工作负载:Deployment + metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 + spec: # Deployment 规格 + replicas: 1 # 副本数 + selector: # Deployment 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # 固定 tag 的 Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略:本地有则不重复拉取 + command: ["node"] # 主命令 + args: # 命令参数 + - "-e" # 执行内联脚本 + - "require('http').createServer((req,res)=>res.end('Hello from pinned image')).listen(8080)" # Node.js 内联服务逻辑 + ports: # 容器端口 + - containerPort: 8080 # 应用监听端口 + --- + """ + ) + SVC_8080 + "\n---\n" + ING_NODE + "\n" + + # 04-04: + ConfigMap(等同旧 04-04 主体) + doc4 = ( + f"# 对应文档:docs/04-04-nodejs-环境变量与配置注入.md\n" + f"# 累积:04-03 + ConfigMap + 通过 env 注入 APP_MSG\n---\n{CM}\n---\n" + + dedent( + """\ + apiVersion: apps/v1 # Deployment API 版本 + kind: Deployment # 工作负载:Deployment + metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 + spec: # Deployment 规格 + replicas: 1 # 副本数 + selector: # Deployment 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略 + env: # 环境变量注入 + - name: APP_MSG # 环境变量名 + valueFrom: # 从资源引用取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 运行 node + - "-e" # 
执行内联脚本 + - | # 多行 JS 脚本(内部内容不改动) + const http=require('http'); + const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(8080); + ports: # 容器端口 + - containerPort: 8080 # 应用监听端口 + --- + """ + ) + + SVC_8080 + + "\n---\n" + + ING_NODE + + "\n" + ) + + # 04-05: + 探针(无 resources) + doc5 = ( + f"# 对应文档:docs/04-05-nodejs-探针与健康检查.md\n" + f"# 累积:04-04 + livenessProbe/readinessProbe(端口 8080,路径 /)\n---\n{CM}\n---\n" + + dedent( + """\ + apiVersion: apps/v1 # Deployment API 版本 + kind: Deployment # 工作负载:Deployment + metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 + spec: # Deployment 规格 + replicas: 1 # 副本数 + selector: # Deployment 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略 + env: # 环境变量注入 + - name: APP_MSG # 环境变量名 + valueFrom: # 从资源引用取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 运行 node + - "-e" # 执行内联脚本 + - | # 多行 JS 脚本(内部内容不改动) + const http=require('http'); + const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(8080); + ports: # 容器端口 + - containerPort: 8080 # 应用监听端口 + """ + ).rstrip() + + "\n" + + PROBES + + "\n" + + dedent( + """\ + --- + """ + ) + + SVC_8080 + + "\n---\n" + + ING_NODE + + "\n" + ) + + # 04-06: + replicas:3 + RollingUpdate,Ingress 仍为 /node + doc6 = ( + f"# 对应文档:docs/04-06-nodejs-副本与滚动发布.md\n" + f"# 累积:04-05 + replicas: 3 + RollingUpdate(maxSurge:1 maxUnavailable:0)\n---\n{CM}\n---\n" + + dedent( + """\ + apiVersion: apps/v1 # Deployment API 版本 + kind: Deployment # 工作负载:Deployment + metadata: # Deployment 元信息 + name: nodejs-demo # Deployment 名称 + namespace: default # 命名空间 + 
spec: # Deployment 规格 + replicas: 3 # 副本数(高可用) + strategy: # 更新策略 + type: RollingUpdate # 滚动更新 + rollingUpdate: # 滚动更新参数 + maxSurge: 1 # 更新时最多额外增加 1 个 Pod + maxUnavailable: 0 # 更新时不可用 Pod 数为 0 + selector: # Pod 选择器 + matchLabels: # 标签匹配集合 + app: nodejs-demo # 匹配 app=nodejs-demo 的 Pod + template: # Pod 模板 + metadata: # Pod 元信息 + labels: # Pod 标签 + app: nodejs-demo # 与 selector.matchLabels 对齐 + spec: # Pod 规格 + containers: # 容器列表 + - name: nodejs-demo # 容器名 + image: node:18.20-alpine # Node.js 镜像 + imagePullPolicy: IfNotPresent # 拉取策略 + env: # 环境变量注入 + - name: APP_MSG # 环境变量名 + valueFrom: # 从资源引用取值 + configMapKeyRef: # 从 ConfigMap key 读取 + name: nodejs-demo-config # ConfigMap 名称 + key: APP_MSG # ConfigMap 键名 + command: # 启动命令 + - node # 运行 node + - "-e" # 执行内联脚本 + - | # 多行 JS 脚本(内部内容不改动) + const http=require('http'); + const msg=process.env.APP_MSG||'no env'; + http.createServer((q,s)=>s.end(msg)).listen(8080); + ports: # 容器端口 + - containerPort: 8080 # 应用监听端口 + """ + ).rstrip() + + "\n" + + PROBES + + "\n" + + dedent( + """\ + --- + """ + ) + + SVC_8080 + + "\n---\n" + + ING_NODE + + "\n" + ) + + # 04-07: Ingress host + /api + doc7 = doc6.replace( + "# 对应文档:docs/04-06-nodejs-副本与滚动发布.md\n" + "# 累积:04-05 + replicas: 3 + RollingUpdate(maxSurge:1 maxUnavailable:0)\n", + "# 对应文档:docs/04-07-nodejs-Ingress与Traefik.md\n" + "# 累积:04-06 + Ingress 增加 host、path 改为 /api(访问需 Host: app.example.local)\n", + ) + doc7 = doc7.replace("---\n" + ING_NODE + "\n", "---\n" + ING_HOST + "\n") + + # 04-08: + resources + c8 = ( + " ports: # 容器端口\n" + " - containerPort: 8080 # 应用监听端口\n" + ) + c8r = ( + " ports: # 容器端口\n" + " - containerPort: 8080 # 应用监听端口\n" + RES + ) + doc8 = doc7.replace( + "# 对应文档:docs/04-07-nodejs-Ingress与Traefik.md\n" + "# 累积:04-06 + Ingress 增加 host、path 改为 /api(访问需 Host: app.example.local)\n", + "# 对应文档:docs/04-08-nodejs-资源请求与限制.md\n" + "# 累积:04-07 + resources.requests/limits\n", + ).replace(c8, c8r) + + # 04-09: + nodeSelector + doc9 = doc8.replace( + "# 
对应文档:docs/04-08-nodejs-资源请求与限制.md\n" + "# 累积:04-07 + resources.requests/limits\n", + "# 对应文档:docs/04-09-nodejs-调度与亲和.md\n" + "# 累积:04-08 + nodeSelector(默认 ylc62,请改为本集群节点短主机名)\n", + ).replace( + " spec: # Pod 规格\n containers: # 容器列表\n", + " spec: # Pod 规格\n nodeSelector: # 调度到指定节点\n" + " kubernetes.io/hostname: ylc62 # 节点主机名(按实际修改)\n" + " containers: # 容器列表\n", + ) + + # 04-10: + securityContext + tmp volume + doc10 = doc9.replace( + "# 对应文档:docs/04-09-nodejs-调度与亲和.md\n" + "# 累积:04-08 + nodeSelector(默认 ylc62,请改为本集群节点短主机名)\n", + "# 对应文档:docs/04-10-nodejs-安全上下文.md\n" + "# 累积:04-09 + pod securityContext.fsGroup、容器 securityContext、只读根、/tmp emptyDir\n", + ).replace( + " spec: # Pod 规格\n nodeSelector: # 调度到指定节点\n" + " kubernetes.io/hostname: ylc62 # 节点主机名(按实际修改)\n" + " containers: # 容器列表\n", + " spec: # Pod 规格\n nodeSelector: # 调度到指定节点\n" + " kubernetes.io/hostname: ylc62 # 节点主机名(按实际修改)\n" + " securityContext: # Pod 级安全上下文\n" + " fsGroup: 1000 # 挂载卷文件组 ID\n" + " containers: # 容器列表\n", + ) + doc10 = doc10.replace( + " - name: nodejs-demo # 容器名\n image: node:18.20-alpine # Node.js 镜像\n" + " imagePullPolicy: IfNotPresent # 拉取策略\n env:", + " - name: nodejs-demo # 容器名\n image: node:18.20-alpine # Node.js 镜像\n" + " imagePullPolicy: IfNotPresent # 拉取策略\n" + " securityContext: # 容器级安全上下文\n" + " allowPrivilegeEscalation: false # 禁止提权\n" + " runAsNonRoot: true # 强制非 root 运行\n" + " runAsUser: 1000 # 运行用户 UID\n" + " readOnlyRootFilesystem: true # 根文件系统只读\n" + " env:", + ) + doc10 = doc10.replace( + " periodSeconds: 5 # 探测周期\n\n---\n", + " periodSeconds: 5 # 探测周期\n" + " volumeMounts: # 卷挂载\n" + " - name: tmp # 引用临时卷\n" + " mountPath: /tmp # 容器内临时目录\n" + " volumes: # 卷定义\n" + " - name: tmp # 临时卷名称\n" + " emptyDir: {} # 空目录卷(Pod 生命周期内)\n\n---\n", + ) + + pvc = dedent( + """\ + apiVersion: v1 # PVC API 版本 + kind: PersistentVolumeClaim # 持久卷声明 + metadata: # PVC 元信息 + name: nodejs-demo-data # PVC 名称 + namespace: default # 命名空间 + spec: # PVC 规格 + accessModes: # 访问模式 + - ReadWriteOnce # 
RWO:同一时间仅单节点挂载读写 + storageClassName: local-path # 存储类(按集群可改) + resources: # 资源请求 + requests: # 配额请求 + storage: 1Gi # 申请容量 + --- + """ + ).strip() + doc11 = doc10.replace( + "# 对应文档:docs/04-10-nodejs-安全上下文.md\n" + "# 累积:04-09 + pod securityContext.fsGroup、容器 securityContext、只读根、/tmp emptyDir\n", + "# 对应文档:docs/04-11-nodejs-存储与卷.md\n" + "# 累积:04-10 + PVC nodejs-demo-data(默认 storageClassName: local-path)+ 挂载 /data\n", + ) + doc11 = doc11.replace( + "---\n" + CM + "\n---\n", + "---\n" + pvc + "\n" + CM + "\n---\n", + 1, + ) + doc11 = doc11.replace( + " volumeMounts: # 卷挂载\n" + " - name: tmp # 引用临时卷\n" + " mountPath: /tmp # 容器内临时目录\n", + " volumeMounts: # 卷挂载\n" + " - name: tmp # 临时卷名称\n" + " mountPath: /tmp # 容器内临时目录\n" + " - name: data # 数据卷名称\n" + " mountPath: /data # 容器内数据目录\n", + ) + doc11 = doc11.replace( + " volumes: # 卷定义\n - name: tmp # 临时卷名称\n" + " emptyDir: {} # 空目录卷(Pod 生命周期内)\n", + " volumes: # 卷定义\n - name: tmp # 临时卷\n emptyDir: {} # 空目录卷\n" + " - name: data # 数据卷\n persistentVolumeClaim: # 卷来源为 PVC\n" + " claimName: nodejs-demo-data # 绑定 PVC 名称\n", + ) + + DIR.mkdir(parents=True, exist_ok=True) + (DIR / "04-02-nodejs-demo.yaml").write_text(doc2, encoding="utf-8") + (DIR / "04-03-nodejs-demo.yaml").write_text(doc3, encoding="utf-8") + (DIR / "04-04-nodejs-demo.yaml").write_text(doc4, encoding="utf-8") + (DIR / "04-05-nodejs-demo.yaml").write_text(doc5, encoding="utf-8") + (DIR / "04-06-nodejs-demo.yaml").write_text(doc6, encoding="utf-8") + (DIR / "04-07-nodejs-demo.yaml").write_text(doc7, encoding="utf-8") + (DIR / "04-08-nodejs-demo.yaml").write_text(doc8, encoding="utf-8") + (DIR / "04-09-nodejs-demo.yaml").write_text(doc9, encoding="utf-8") + (DIR / "04-10-nodejs-demo.yaml").write_text(doc10, encoding="utf-8") + (DIR / "04-11-nodejs-demo.yaml").write_text(doc11, encoding="utf-8") + + +if __name__ == "__main__": + main() diff --git a/scripts/lib-ansible-lab.sh b/scripts/lib-ansible-lab.sh new file mode 100644 index 0000000..496f384 --- /dev/null +++ 
b/scripts/lib-ansible-lab.sh @@ -0,0 +1,37 @@ +# shellcheck shell=bash +# 仓库根 Ansible:从任意 cwd 调用时仍使用 ansible/ansible.cfg(如 host_key_checking=False)。 +ansible_lab_export_config() { + export ANSIBLE_CONFIG="${ROOT}/ansible/ansible.cfg" +} + +# 若 inventory 为各主机声明了 ansible_ssh_private_key_file,则在本机检查文件存在(避免 ssh 报 no such identity)。 +ansible_lab_check_inventory_keys() { + local inv="$1" + local line path exp + [[ -f "$inv" ]] || return 0 + while IFS= read -r line || [[ -n "$line" ]]; do + [[ "$line" =~ ^[[:space:]]*# ]] && continue + [[ "$line" =~ ansible_ssh_private_key_file=([^[:space:]]+) ]] || continue + path="${BASH_REMATCH[1]}" + exp="${path/#\~/$HOME}" + if [[ ! -f "$exp" ]]; then + echo "[ERR] SSH 私钥不存在:$exp(inventory 中为 $path)" >&2 + echo " 将密钥放到该路径并 chmod 600,或改 ansible/inventory.ini 中的 ansible_ssh_private_key_file。" >&2 + echo " 生成/分发可参考:scripts/ssh/setup-k3s-workers-ssh.sh、docs/01-06-节点初始化-ansible-实践.md" >&2 + return 1 + fi + # OpenSSH 拒绝 group/other 可读的私钥(常见误为 0644),须 600 或 400 + local mode + mode=$(stat -c '%a' "$exp" 2>/dev/null) || mode="" + case "$mode" in + 600|400) ;; + *) + echo "[ERR] SSH 私钥权限过宽(当前 ${mode:-?},须仅所有者可读):$exp" >&2 + echo " 执行:chmod 600 $exp" >&2 + echo " 若需一次修正本仓库 inventory 中各节点密钥:chmod 600 ~/.ssh/id_ed25519_k3s_192.168.2.61 ~/.ssh/id_ed25519_k3s_192.168.2.62 ~/.ssh/id_ed25519_k3s_192.168.2.63 ~/.ssh/id_ed25519_k3s_192.168.2.64" >&2 + return 1 + ;; + esac + done < "$inv" + return 0 +} diff --git a/scripts/resolve_verify_playbook.py b/scripts/resolve_verify_playbook.py new file mode 100644 index 0000000..3ab72d1 --- /dev/null +++ b/scripts/resolve_verify_playbook.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +"""将 doc_id 解析为 verify playbook 绝对路径(唯一真源:ansible/playbooks/verify/.yml)。 + +历史上曾解析 labs/matrix-doc-playbooks.yml(“验证矩阵”);该概念已废弃。 +""" +from __future__ import annotations + +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent + + +def main() -> None: + if len(sys.argv) != 2: + print("用法: 
resolve_verify_playbook.py ", file=sys.stderr) + sys.exit(2) + doc_id = sys.argv[1].strip() + if not doc_id: + sys.exit(2) + p = ROOT / "ansible" / "playbooks" / "verify" / f"{doc_id}.yml" + if not p.is_file(): + print(f"ERR: playbook 不存在:{p}", file=sys.stderr) + sys.exit(2) + print(p.resolve()) + + +if __name__ == "__main__": + main() diff --git a/scripts/ssh/run-phase2-k3s-on-ylc61-as-jack.sh b/scripts/ssh/run-phase2-k3s-on-ylc61-as-jack.sh deleted file mode 100644 index f482113..0000000 --- a/scripts/ssh/run-phase2-k3s-on-ylc61-as-jack.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash -# 在办公机执行:经 SSH 在控制节点 ylc61 上跑「K3s 安装/复验」部署流水线。 -# 对应 docs/00-05 §2 步骤 1(接入)+ 3(部署)中的 k3s 路径;与 verify.sh 矩阵验收分离。 -# -# 依赖:本机可 ssh ylc61(BatchMode);远端仓库路径存在且含 scripts/deploy-lab.sh。 -set -euo pipefail - -REMOTE_ROOT="${LAB_REPO_ROOT:-/home/jack/实验室建设}" -# 将当前 shell 中的 K3S_PREPARE_STORAGE 传到远端(未设置则默认 false) -KS_PREP="${K3S_PREPARE_STORAGE:-false}" - -exec ssh -o BatchMode=yes ylc61 bash -lc " - set -euo pipefail - cd '${REMOTE_ROOT}' - chmod +x scripts/deploy-lab.sh 2>/dev/null || true - if [[ -f scripts/.env.verify ]]; then - set -a - # shellcheck disable=SC1091 - source scripts/.env.verify - set +a - fi - # 办公机传入的 K3S_PREPARE_STORAGE 覆盖远端 .env 中的默认 - export K3S_PREPARE_STORAGE='${KS_PREP}' - ./scripts/deploy-lab.sh k3s -" diff --git a/scripts/ssh/setup-k3s-workers-ssh.sh b/scripts/ssh/setup-k3s-workers-ssh.sh index c9cec6b..731f5b4 100644 --- a/scripts/ssh/setup-k3s-workers-ssh.sh +++ b/scripts/ssh/setup-k3s-workers-ssh.sh @@ -1,4 +1,9 @@ #!/usr/bin/env bash +# 为 inventory 中 k3s 各节点配置 SSH 密钥(jack + root),供 Ansible 使用。 +# +# 依赖:OpenSSH(ssh、ssh-keygen、scp);可选 sshpass(一次输入密码模式);可选 puttygen(仅当在交互中选择生成 .ppk 时)。 +# 默认 **不需要** PuTTY:在 Linux 工作机(如 ylc65)上用 ssh/ansible 即可。只有需要在 **Windows 上用 PuTTY/Pageant** 加载私钥时, +# 才在提示「是否生成 PuTTY 私钥」时选 y,此时需安装 puttygen(Fedora: dnf install putty;Debian: apt install putty-tools)。 set -euo pipefail ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" @@ -41,11 +46,11 @@ gen_key_if_missing() { ssh-keygen -t ed25519 -f "$key_path" -C "k3s-cluster" -N "" chmod 600 "$key_path" 2>/dev/null || true - # 如选择生成 PuTTY 私钥,则调用 puttygen;若命令不存在则给出安装提示并退出 + # 仅当用户在交互中明确选择生成 .ppk(Windows PuTTY)时调用 puttygen;Linux 默认不生成 if [[ "${GENERATE_PUTTY_PPK:-n}" == "y" ]]; then if ! command -v puttygen >/dev/null 2>&1; then - echo "[ERR] 已选择生成 PuTTY 私钥,但当前系统未安装 puttygen。" >&2 - echo " 请先安装 puttygen 后重新运行本脚本,或在提示时选择不生成 PuTTY 私钥。" >&2 + echo "[ERR] 已选择生成 PuTTY 私钥(.ppk),但当前系统未安装 puttygen。" >&2 + echo " 请先安装 putty-tools/putty 包后重试,或在提示时选「不生成」PuTTY 私钥(OpenSSH 默认已够用)。" >&2 echo "" >&2 echo " 常见系统安装示例:" >&2 echo " Fedora / CentOS / RHEL : sudo dnf install putty 或 sudo dnf install putty-tools" >&2 @@ -121,7 +126,6 @@ copy_key_to_host() { print_title "K3s 节点 SSH 密钥批量配置(控制节点 + 工作节点,每节点一把密钥)" ensure_cmd ssh-keygen -ensure_cmd ssh-copy-id ensure_cmd ssh # 默认显示相对路径(相对于仓库根) @@ -130,7 +134,7 @@ INVENTORY_PATH="$(ask_default "Ansible inventory 路径(相对仓库根 ${ROOT [[ "$INVENTORY_PATH" != /* ]] && INVENTORY_PATH="${ROOT_DIR}/${INVENTORY_PATH}" [[ ! 
-f "$INVENTORY_PATH" ]] && { echo "[ERR] 找不到 inventory: $INVENTORY_PATH" >&2; exit 1; } -# 交互输入:用户名(有默认值)、密码(可选,用于后续 SSH/sudo)、是否生成 PuTTY 私钥 +# 交互输入:用户名(有默认值)、密码(可选,用于后续 SSH/sudo)、是否额外生成 PuTTY .ppk(可选,默认否) echo "" echo "--- 交互输入用户名与密码 ---" SSH_USER="$(ask_default "SSH 登录用户名(直接回车使用默认 jack)" "$SSH_USER_DEFAULT")" @@ -174,8 +178,8 @@ if [[ -n "${JACK_PASS:-}" ]]; then fi fi -# 是否同时为每把新生成的 OpenSSH 密钥生成一份 PuTTY 私钥(.ppk) -GENERATE_PUTTY_PPK="$(ask_default "是否为新密钥同时生成 PuTTY 私钥(.ppk)?(y/N)" "N")" +# 是否同时为每把新生成的 OpenSSH 密钥生成一份 PuTTY 私钥(.ppk);Linux/默认回车即可 +GENERATE_PUTTY_PPK="$(ask_default "是否额外生成 PuTTY 私钥(.ppk,仅 Windows PuTTY 需要;Linux 用 OpenSSH 请选 N)[y/N]" "N")" GENERATE_PUTTY_PPK="${GENERATE_PUTTY_PPK,,}" # 转小写 if [[ "$GENERATE_PUTTY_PPK" == "y" ]]; then GENERATE_PUTTY_PPK="y" diff --git a/scripts/ssh/smoke-verify-matrix-on-ylc61.sh b/scripts/ssh/smoke-verify-matrix-on-ylc61.sh deleted file mode 100644 index 285b5a4..0000000 --- a/scripts/ssh/smoke-verify-matrix-on-ylc61.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash -# 从办公机 Git Bash 执行:SSH 到 ylc61 上跑若干样板 verify(对应 docs/00-05 §2 步骤 4~6 的抽样)。 -# 步骤 1~3 需已满足:远端仓库路径正确、已 deploy K3s、可选 scripts/.env.verify。 -# -# 环境变量: -# VERIFY_REPO_ROOT 远端仓库根目录(默认 /home/jack/实验室建设) -# VERIFY_TEARDOWN / nginx_entry_base / nodejs_entry_base 传给远端 verify.sh -set -euo pipefail - -REMOTE_ROOT="${VERIFY_REPO_ROOT:-/home/jack/实验室建设}" -TEARDOWN="${VERIFY_TEARDOWN:-1}" -NGX="${nginx_entry_base:-http://192.168.2.61}" -NODE="${nodejs_entry_base:-http://192.168.2.61}" - -exec ssh -o BatchMode=yes ylc61 bash -lc " - set -euo pipefail - cd '${REMOTE_ROOT}' - chmod +x scripts/verify.sh 2>/dev/null || true - export VERIFY_TEARDOWN='${TEARDOWN}' - export nginx_entry_base='${NGX}' - export nodejs_entry_base='${NODE}' - if [[ -f scripts/.env.verify ]]; then - set -a - # shellcheck disable=SC1091 - source scripts/.env.verify - set +a - fi - export VERIFY_TEARDOWN='${TEARDOWN}' - export nginx_entry_base='${NGX}' - export nodejs_entry_base='${NODE}' - 
./scripts/verify.sh run 02-05 - ./scripts/verify.sh run 03-05 - ./scripts/verify.sh run 03-07 - ./scripts/verify.sh run 04-01 -" diff --git a/scripts/test-all.sh b/scripts/test-all.sh new file mode 100755 index 0000000..b0e8d86 --- /dev/null +++ b/scripts/test-all.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# 离线「全量」自检:与 CI 同源(labs 索引、verify 清单校验)+ +# 对关键 playbook 执行 ansible-playbook --syntax-check。 +# 不连接集群、不执行 kubectl;真机验收仍用 ./scripts/verify.sh full。 +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +# shellcheck disable=SC1091 +source "${ROOT}/scripts/lib-ansible-lab.sh" +ansible_lab_export_config + +INV="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}" + +need_cmd() { + if ! command -v "$1" >/dev/null 2>&1; then + echo "[ERR] 未找到命令:$1" >&2 + exit 1 + fi +} + +echo "########################################## 1/2 verify playbook ↔ docs 文件存在性" +need_cmd python3 +python3 "${ROOT}/scripts/validate_matrix_playbooks.py" + +echo "" +echo "########################################## 2/2 Ansible syntax-check(verify playbook)" +need_cmd ansible-playbook +[[ -f "$INV" ]] || { + echo "[ERR] inventory 不存在:$INV(syntax-check 仍需 -i)" >&2 + exit 1 +} + +mapfile -t PBS < <( + ROOT="${ROOT}" python3 -c " +import os +from pathlib import Path + +root = Path(os.environ['ROOT']) +pbs = [] + +for p in sorted((root / 'ansible' / 'playbooks' / 'verify').glob('*.yml')): + if p.name.startswith('_'): + continue + pbs.append(p.relative_to(root).as_posix()) + +seen = set() +for rel in pbs: + if rel in seen: + continue + seen.add(rel) + print(rel) +" +) + +n=${#PBS[@]} +i=0 +for rel in "${PBS[@]}"; do + i=$((i + 1)) + pb="${ROOT}/${rel}" + echo "[$i/$n] -- $rel" + ansible-playbook -i "$INV" "$pb" --syntax-check +done + +echo "" +echo "[OK] 全量离线检查通过(${n} 条 playbook syntax-check)" diff --git a/scripts/validate_matrix_playbooks.py b/scripts/validate_matrix_playbooks.py new file mode 100644 index 0000000..57c2163 --- /dev/null +++ b/scripts/validate_matrix_playbooks.py 
@@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +"""校验 verify playbook 清单(抛弃“验证矩阵”概念后的替代校验)。 + +规则(最小可用): +- ansible/playbooks/verify/ 目录下所有形如 XX-YY.yml 的文件,都必须存在对应 docs/XX-YY-*.md 文档 +- 仅检查“存在性 + 1:1 对齐”,不解析 Markdown 内容 + +历史上本脚本用于校验 docs/00-03-验证矩阵.md ↔ labs/matrix-doc-playbooks.yml; +该概念已废弃,但保留脚本名以减少 CI/用户习惯改动。 +""" +from __future__ import annotations + +import re +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +VERIFY_DIR = ROOT / "ansible" / "playbooks" / "verify" +DOCS_DIR = ROOT / "docs" + +EXEC_ID_RE = re.compile(r"^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])$") + + +def is_exec_domain(doc_id: str) -> bool: + return EXEC_ID_RE.fullmatch(doc_id) is not None + + +def main() -> None: + if not VERIFY_DIR.is_dir(): + print(f"ERR: 缺少目录 {VERIFY_DIR}", file=sys.stderr) + sys.exit(2) + if not DOCS_DIR.is_dir(): + print(f"ERR: 缺少目录 {DOCS_DIR}", file=sys.stderr) + sys.exit(2) + + doc_ids: list[str] = [] + invalid_verify_names: list[str] = [] + for p in VERIFY_DIR.iterdir(): + if p.is_file() and len(p.name) == len("00-00.yml") and p.name[2:3] == "-" and p.name[5:] == ".yml": + if is_exec_domain(p.stem): + doc_ids.append(p.stem) + else: + invalid_verify_names.append(p.name) + + missing_docs: list[str] = [] + missing_files_dir: list[str] = [] + weak_doc_exec_refs: list[str] = [] + for did in sorted(set(doc_ids)): + matches = sorted(DOCS_DIR.glob(f"{did}-*.md")) + if not matches: + missing_docs.append(did) + continue + doc = matches[0] + content = doc.read_text(encoding="utf-8", errors="ignore") + if f"ansible/files/{did}/" not in content and "```yaml" in content: + weak_doc_exec_refs.append(did) + expects_files_dir = (f"ansible/files/{did}/" in content) or ("```yaml" in content) + if expects_files_dir and not (ROOT / "ansible" / "files" / did).is_dir(): + missing_files_dir.append(did) + + if invalid_verify_names: + print( + f"ERR: verify 仅允许执行域命名(XX>0 且 YY>0),以下文件不合规: {sorted(invalid_verify_names)}", + file=sys.stderr, + ) + sys.exit(2) + if 
missing_docs: + print(f"ERR: 存在 verify/.yml 但缺少 docs/-*.md: {missing_docs}", file=sys.stderr) + sys.exit(2) + if missing_files_dir: + print(f"ERR: 缺少 ansible/files// 目录: {missing_files_dir}", file=sys.stderr) + sys.exit(2) + if weak_doc_exec_refs: + print( + f"ERR: 文档包含 YAML 代码块但未引用 ansible/files// 真源: {weak_doc_exec_refs}", + file=sys.stderr, + ) + sys.exit(2) + + print(f"[OK] 执行域 verify/doc/files 一致性通过({len(sorted(set(doc_ids)))} 条)") + + +if __name__ == "__main__": + main() diff --git a/scripts/verify.sh b/scripts/verify.sh old mode 100644 new mode 100755 index 96518cb..22f0806 --- a/scripts/verify.sh +++ b/scripts/verify.sh @@ -1,15 +1,19 @@ #!/usr/bin/env bash -# 验证矩阵自动化入口(对应 docs/00-05 §2「自动化验证流程」步骤 4~6 的一键串联): -# 4 断言 — 各 verify/XX-YY.yml 内 kubectl / curl / helm 等 -# 5 收尾 — 默认 VERIFY_TEARDOWN=1 做本篇资源清理(非整集群卸载) -# 6 串联 — run-all 按 docs/00-02-验证矩阵.md 顺序 fail-fast +# 验证入口(以 ansible/playbooks/verify/.yml 为唯一执行真源): +# - run :执行单篇验证 playbook +# - run-all:按 verify 目录中存在的 .yml 顺序执行(仅执行域:XX>0 && YY>0) +# - full:preflight + run-all # -# 步骤 1~3(接入、环境/轻量清理、部署)由操作者或 scripts/deploy-lab.sh 完成;本脚本不执行 k3s-uninstall。 -# 推荐在 Linux 工作机(如 ylc65)或控制节点仓库根执行。 +# 说明: +# - 本脚本不再解析任何“矩阵/状态板”文档;验证清单从 verify playbook 自动得出。 +# - 步骤 1~3(接入、环境/轻量清理、部署)由操作者或 scripts/deploy-lab.sh 完成;本脚本不执行 k3s-uninstall。 +# - 推荐在 Linux 工作机或控制节点仓库根执行。 set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -MATRIX_MD="${ROOT}/docs/00-02-验证矩阵.md" +# shellcheck disable=SC1091 +source "${ROOT}/scripts/lib-ansible-lab.sh" +ansible_lab_export_config # 默认与 §2 一致:验证后清理临时资源 export VERIFY_TEARDOWN="${VERIFY_TEARDOWN:-1}" @@ -25,44 +29,82 @@ load_env() { export VERIFY_TEARDOWN="${VERIFY_TEARDOWN:-1}" } -parse_doc_ids_from_matrix() { - if [[ ! 
-f "${MATRIX_MD}" ]]; then - echo "[ERR] matrix 不存在:${MATRIX_MD}" >&2 - exit 1 - fi - # shellcheck disable=SC2016 - awk ' - match($0, /`[0-9][0-9]-[0-9][0-9]-[^`]+\.md`/) { - s = substr($0, RSTART+1, RLENGTH-2); - id = substr(s, 1, 5); - if (!seen[id]++) print id; - } - ' "${MATRIX_MD}" +DOC_ID_EXEC_RE='^(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9])$' + +is_exec_doc_id() { + local doc_id="$1" + [[ "$doc_id" =~ $DOC_ID_EXEC_RE ]] +} + +list_doc_ids_from_verify_dir() { + # 只列出执行域(XX>0 && YY>0)的 verify 清单;支持筛选 + local series="${1:-}" + local id_regex="${2:-}" + local exclude_noop="${3:-0}" + local require_teardown="${4:-0}" + ROOT="${ROOT}" SERIES="${series}" ID_REGEX="${id_regex}" EXCLUDE_NOOP="${exclude_noop}" REQUIRE_TEARDOWN="${require_teardown}" python3 - <<'PY' +import os +import re +from pathlib import Path + +root = Path(os.environ["ROOT"]) +verify_dir = root / "ansible" / "playbooks" / "verify" +series = os.environ.get("SERIES", "").strip() +id_regex = os.environ.get("ID_REGEX", "").strip() +exclude_noop = os.environ.get("EXCLUDE_NOOP", "0") == "1" +require_teardown = os.environ.get("REQUIRE_TEARDOWN", "0") == "1" + +pat = re.compile(r"^(?P(0[1-9]|[1-9][0-9])-(0[1-9]|[1-9][0-9]))\.yml$") +id_pat = re.compile(id_regex) if id_regex else None + +ids = [] +for p in verify_dir.iterdir(): + m = pat.match(p.name) + if not m: + continue + doc_id = m.group("id") + if series and not doc_id.startswith(f"{series}-"): + continue + if id_pat and not id_pat.search(doc_id): + continue + + if exclude_noop or require_teardown: + content = p.read_text(encoding="utf-8", errors="ignore") + if exclude_noop and "noop verify" in content: + continue + if require_teardown and ("VERIFY_TEARDOWN" not in content and "verify_teardown" not in content): + continue + + ids.append(doc_id) + +for x in sorted(set(ids)): + print(x) +PY } print_flow() { cat < / run-all → ansible/playbooks/verify/.yml - 5 收尾与记录 VERIFY_TEARDOWN;矩阵状态见 docs/00-02-验证矩阵.md(建议手工写回) - 6 一键串联 $0 run-all + 4 断言 本脚本 run / run-all 
→ ansible/playbooks/verify/.yml + 5 收尾与记录 VERIFY_TEARDOWN;验证结论建议写回对应实验篇文档(或单独记录日志) + 6 一键串联 $0 full(推荐)或 $0 run-all -相关脚本:deploy-lab.sh(安装/铺栈)、ssh/run-phase2-k3s-on-ylc61-as-jack.sh(办公机触发远端 deploy k3s) +相关脚本:deploy-lab.sh(安装/铺栈) EOF } -preflight() { +run_preflight() { local inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}" if ! command -v ansible-playbook >/dev/null 2>&1; then echo "[ERR] 未找到 ansible-playbook" >&2 exit 1 fi - [[ -f "${MATRIX_MD}" ]] || { echo "[ERR] 缺少验证矩阵:${MATRIX_MD}" >&2; exit 1; } [[ -f "$inv" ]] || { echo "[ERR] inventory 不存在:$inv" >&2; exit 1; } + ansible_lab_check_inventory_keys "$inv" || exit 1 echo "[RUN] ansible k3s_server -m ping" ansible k3s_server -i "$inv" -m ping @@ -81,29 +123,52 @@ preflight() { echo "[OK] preflight 通过" } +run_all_verify() { + local series="${1:-}" + local id_regex="${2:-}" + local exclude_noop="${3:-0}" + local require_teardown="${4:-0}" + local id + while IFS= read -r id; do + echo "" + echo "########################################## $id" + ansible_verify "$id" + done < <(list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown") +} + usage() { cat <<'EOF' 用法:scripts/verify.sh <命令> [...] 
命令: flow 打印与 docs/00-05 §2 对齐的「验证流程」说明(不接 Ansible) - preflight 检查 ansible-playbook、矩阵与 inventory;对 k3s_server 做 ping + preflight 检查 ansible-playbook 与 inventory;对 k3s_server 做 ping 若 VERIFY_PREFLIGHT_CLUSTER=1,额外 kubectl get nodes(未装集群会失败) - list 从验证矩阵列出 doc_id(顺序同 run-all) - run 运行指定 doc_id(ansible/playbooks/verify/.yml) - run-all 按验证矩阵顺序运行全部 doc_id(fail-fast) + full 先 preflight,再按 doc_id 顺序运行全部 verify(= preflight + run-all,推荐) + list [筛选参数] 列出可执行 doc_id(仅执行域) + run 运行指定 doc_id(ansible/playbooks/verify/.yml) + run-all [筛选参数] 按 doc_id 顺序运行 verify playbook(fail-fast),不做 preflight + +筛选参数(可用于 list / run-all / full): + --series 只运行某个主序列(例如 04) + --id-regex 仅保留匹配 doc_id 的条目(例如 '^04-(0[2-9]|1[0-4])$') + --exclude-noop 排除 noop verify + --require-teardown 仅保留包含 teardown gate 的条目 环境变量: VERIFY_TEARDOWN=1 验证后清理本篇资源(默认 1,对应 §2 轻量 teardown) VERIFY_PREFLIGHT_CLUSTER 为 1 时 preflight 额外执行 kubectl get nodes - ANSIBLE_INVENTORY 默认 <仓库>/ansible/inventory.ini + ANSIBLE_INVENTORY 默认 <仓库>/ansible/inventory.ini(其中 ansible_ssh_private_key_file 须在本机存在) nginx_entry_base 例如 http://192.168.2.61(02-xx / 03-02 等 HTTP 校验) nodejs_entry_base 例如 http://192.168.2.61(04-01) + SKIP_ARMV7 默认 1;为 0 时 01-03/01-05 若未配 ARMV7_SSH(01-05 可用 ARMV7_NFS_SSH)会失败 + ARMV7_SSH / ARMV7_NFS_SSH 一行 ssh 命令;与 SKIP_ARMV7=0 配合时 01-03/01-05 经 SSH 在 arm 上 dnf 安装(见 docs/00-07 §E) -与「部署」分工:安装 K3s / Longhorn / nginx 铺栈请用 ./scripts/deploy-lab.sh;矩阵验收请用本脚本。 +与「部署」分工:安装 K3s / Longhorn / nginx 铺栈请用 ./scripts/deploy-lab.sh;验收请用本脚本。 示例: ./scripts/verify.sh flow + ./scripts/verify.sh full ./scripts/verify.sh preflight export nginx_entry_base=http://192.168.2.61 ./scripts/verify.sh run 02-05 @@ -112,41 +177,88 @@ EOF ansible_verify() { local doc_id="$1" + if ! is_exec_doc_id "$doc_id"; then + echo "[ERR] 非执行域 doc_id:$doc_id(仅允许 XX>0 且 YY>0)" >&2 + exit 1 + fi local inv="${ANSIBLE_INVENTORY:-${ROOT}/ansible/inventory.ini}" local pb_single="${ROOT}/ansible/playbooks/verify/${doc_id}.yml" + if [[ ! 
-f "$pb_single" ]]; then + echo "[ERR] verify playbook 不存在:$pb_single" >&2 + echo "[TIP] 可用 '$0 list' 查看可执行 doc_id" >&2 + exit 1 + fi if [[ ! -f "$inv" ]]; then echo "[ERR] inventory 不存在:$inv" >&2 exit 1 fi - if [[ ! -f "$pb_single" ]]; then - echo "[ERR] verify playbook 不存在(fail-fast):$pb_single" >&2 - exit 1 - fi - echo "[RUN] ansible-playbook -i $inv $pb_single" - ansible-playbook -i "$inv" "$pb_single" + local td="${VERIFY_TEARDOWN:-1}" + echo "[RUN] ansible-playbook -i $inv -e VERIFY_TEARDOWN=$td $pb_single" + ansible-playbook -i "$inv" -e "VERIFY_TEARDOWN=$td" "$pb_single" } main() { load_env local cmd="${1:-}" + shift || true + + local series="" + local id_regex="" + local exclude_noop=0 + local require_teardown=0 + + parse_filter_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --series) + series="${2:-}" + [[ -n "$series" ]] || { echo "[ERR] --series 需要参数" >&2; exit 1; } + [[ "$series" =~ ^(0[1-9]|[1-9][0-9])$ ]] || { echo "[ERR] --series 仅允许 01..99" >&2; exit 1; } + shift 2 + ;; + --id-regex) + id_regex="${2:-}" + [[ -n "$id_regex" ]] || { echo "[ERR] --id-regex 需要参数" >&2; exit 1; } + shift 2 + ;; + --exclude-noop) + exclude_noop=1 + shift + ;; + --require-teardown) + require_teardown=1 + shift + ;; + *) + echo "[ERR] 未知参数:$1" >&2 + exit 1 + ;; + esac + done + } + case "$cmd" in ""|-h|--help) usage ;; flow) print_flow ;; - preflight) preflight ;; + preflight) run_preflight ;; + full) + parse_filter_args "$@" + run_preflight + echo "" + echo "########################################## run-all" + run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown" + ;; list) - parse_doc_ids_from_matrix + parse_filter_args "$@" + list_doc_ids_from_verify_dir "$series" "$id_regex" "$exclude_noop" "$require_teardown" ;; run) - local doc_id="${2:?need doc_id like 02-05}" + local doc_id="${1:?need doc_id like 02-05}" ansible_verify "$doc_id" ;; run-all) - local id - while IFS= read -r id; do - echo "" - echo "########################################## 
$id" - ansible_verify "$id" - done < <(parse_doc_ids_from_matrix) + parse_filter_args "$@" + run_all_verify "$series" "$id_regex" "$exclude_noop" "$require_teardown" ;; *) echo "[ERR] unknown cmd: $cmd" >&2