From 8a54cac61f658a9d76b5741bce848dc18bc0e197 Mon Sep 17 00:00:00 2001 From: jack Date: Sun, 22 Mar 2026 19:02:46 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20CoreDNS=20IPv4=20=E4=B8=8A=E6=B8=B8?= =?UTF-8?q?=E3=80=8103-03=20Tomcat=20=E4=BF=AE=E5=A4=8D=E3=80=81HAProxy=20?= =?UTF-8?q?=E4=B8=8E=E9=AA=8C=E8=AF=81=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Ansible: 部署时自动配置 CoreDNS forward 为 IPv4,避免 ACME 解析失败 - 01-01/01-07: 文档增加 CoreDNS 设置说明 - 03-03: Tomcat webapps.dist 复制、HTTP/HTTPS 双 Ingress、显式 Dashboard IngressRoute - traefik-dashboard-acme: tomcat-acme.yaml、404 排查说明 - HAProxy: 健康检查与 PROXY 配置拆分,18080/18443 部署与验证脚本 Made-with: Cursor --- ansible/files/01-08-haproxy/README.md | 41 +++- .../{haproxy.cfg => haproxy-http.cfg} | 12 +- ansible/files/01-08-haproxy/haproxy-https.cfg | 41 ++++ .../files/01-08-haproxy/haproxy-no-check.cfg | 38 ++++ .../01-08-haproxy/haproxy-proxy-http-tls.cfg | 4 +- ansible/files/01-08-haproxy/haproxy-proxy.cfg | 37 --- ansible/files/01-08-haproxy/haproxy-tls.cfg | 38 ++++ ...mcat-acme-test05.yaml => tomcat-acme.yaml} | 34 +++ .../traefik-dashboard-acme.yaml | 19 +- ansible/group_vars/all.yml | 6 + ansible/playbooks/k3s-init-and-install.yml | 40 ++++ docs/00-02-验证矩阵.md | 32 +-- docs/01-01-k3s-控制节点含traefik.md | 20 ++ docs/01-07-节点初始化-ansible-实践.md | 15 +- docs/01-08-openwrt-haproxy.md | 69 +++--- docs/03-03-k3s-traefik-dashboard-acme.md | 42 +++- docs/06-02-运维小结.md | 3 - scripts/01-08-deploy-nginx-tls-via-ylc61.sh | 49 ++++ scripts/01-08-deploy-openwrt-haproxy.sh | 61 +++++ scripts/01-08-update-verify-matrix.py | 32 +++ scripts/01-08-verify-haproxy-openwrt.sh | 11 + scripts/01-08-verify-haproxy.sh | 211 ++++++++++++++++++ scripts/02-verify-nginx-matrix-individual.sh | 106 +++++++++ scripts/03-verify-traefik-dashboard-acme.sh | 57 +++++ scripts/README.md | 19 +- 25 files changed, 924 insertions(+), 113 deletions(-) rename ansible/files/01-08-haproxy/{haproxy.cfg => haproxy-http.cfg} 
(70%) create mode 100644 ansible/files/01-08-haproxy/haproxy-https.cfg create mode 100644 ansible/files/01-08-haproxy/haproxy-no-check.cfg delete mode 100644 ansible/files/01-08-haproxy/haproxy-proxy.cfg create mode 100644 ansible/files/01-08-haproxy/haproxy-tls.cfg rename ansible/files/traefik-dashboard-acme/{tomcat-acme-test05.yaml => tomcat-acme.yaml} (53%) create mode 100644 scripts/01-08-deploy-nginx-tls-via-ylc61.sh create mode 100644 scripts/01-08-deploy-openwrt-haproxy.sh create mode 100644 scripts/01-08-update-verify-matrix.py create mode 100644 scripts/01-08-verify-haproxy-openwrt.sh create mode 100644 scripts/01-08-verify-haproxy.sh create mode 100644 scripts/02-verify-nginx-matrix-individual.sh create mode 100644 scripts/03-verify-traefik-dashboard-acme.sh diff --git a/ansible/files/01-08-haproxy/README.md b/ansible/files/01-08-haproxy/README.md index 1d519ad..cd1021b 100644 --- a/ansible/files/01-08-haproxy/README.md +++ b/ansible/files/01-08-haproxy/README.md @@ -1,11 +1,38 @@ # 01-08 HAProxy 配置 -用于 `docs/01-08-openwrt-haproxy.md`,可与 Ansible 共用(复制到 OpenWrt 或通过 playbook 下发)。 +## 核心目标 -| 文件 | 说明 | -|------|------| -| haproxy.cfg | 基础配置,TCP 健康检查 | -| haproxy-proxy.cfg | 启用 send-proxy-v2(Traefik 真实 IP) | -| haproxy-proxy-http-tls.cfg | HTTP 检查 + TLS 检查 + PROXY 组合 | +本目录下的 **所有 `*.cfg` 必须可被 HAProxy 正确解析并符合文档意图**。验证分两层: -按实际节点 IP 修改 `192.168.2.61`~`192.168.2.64`。80/443 被封时可将 `bind *:80` / `bind *:443` 改为 `*:18080` / `*:18443`。 +| 层次 | 含义 | 如何验证 | +|------|------|----------| +| **① 语法正确** | `haproxy -c -f xxx.cfg` 无致命错误 | 见下文「仅校验 cfg」或主验证脚本第 2 步 | +| **② 运行与后端** | 在 OpenWrt 上实际监听 18080/18443 时,经第三方主机 curl 可达 K3s/Traefik 后端 | `./scripts/01-08-verify-haproxy.sh`(完整流程,含 curl) | + +仓库内 **frontend 已统一为 `18080` / `18443`**(与 LuCI 的 80/443 分离);backend 仍指向各节点 **80/443**(Traefik 入口)。按环境修改 `192.168.2.61`~`192.168.2.64`。 + +## 仅校验本目录 cfg(不跑 curl) + +仅需确认 **① 语法**,在仓库根目录执行: + +```bash +./scripts/01-08-verify-haproxy.sh --cfg-only +``` + +会将本目录全部 `*.cfg` 拷到 OpenWrt 的
`/tmp/haproxy-verify/`,对每个文件执行 `haproxy -c`(与 OpenWrt 上安装的 HAProxy 版本一致)。 + +**说明**:`haproxy-https.cfg` 含 `ssl crt /etc/ssl/haproxy.pem`;若路由器上**没有**该 pem,语法检查可能失败,脚本会标为 `[SKIP]`。在 OpenWrt 放置有效 pem 后应能通过 `haproxy -c`。 + +## 文件一览 + +| 文件 | 说明(对应 `docs/01-08-openwrt-haproxy.md`) | +|------|-----------------------------------------------| +| `haproxy-no-check.cfg` | §2 最简;§3.1 在其 `server` 行加 `check` | +| `haproxy-http.cfg` | §3.2 HTTP 健康检查(明文 80 后端) | +| `haproxy-tls.cfg` | §3.3 TLS 握手检查(443 后端,`mode tcp`) | +| `haproxy-https.cfg` | §3.4 HTTPS 应用层检查(需 HAProxy 终结 TLS,由 HAProxy 提供证书) | +| `haproxy-proxy-http-tls.cfg` | §5 PROXY + HTTP/TLS 检查 | + +## 与 Ansible / OpenWrt + +可与 Ansible 共用(复制到 OpenWrt 或通过 playbook 下发)。一键把 **uhttpd 80/443 + HAProxy 18080/18443** 部署到路由器,见 `scripts/01-08-deploy-openwrt-haproxy.sh`。 diff --git a/ansible/files/01-08-haproxy/haproxy.cfg b/ansible/files/01-08-haproxy/haproxy-http.cfg similarity index 70% rename from ansible/files/01-08-haproxy/haproxy.cfg rename to ansible/files/01-08-haproxy/haproxy-http.cfg index 38b5213..aad5190 100644 --- a/ansible/files/01-08-haproxy/haproxy.cfg +++ b/ansible/files/01-08-haproxy/haproxy-http.cfg @@ -1,10 +1,9 @@ -# 01-08 OpenWrt HAProxy 负载均衡 - 基础配置 -# 文档:docs/01-08-openwrt-haproxy.md -# 将 192.168.2.61~64 按实际 K3s 节点 IP 修改 +# 01-08 HAProxy - 3.2 HTTP 健康检查(80 明文) +# backend k3s_http 增加 option httpchk GET / +# 文档:docs/01-08-openwrt-haproxy.md 第 3.2 节 global log /dev/log local0 maxconn 4096 - # 部分 OpenWrt 需 daemon / pidfile,按发行版调整;若无 /dev/log 可改 log 127.0.0.1 local0 defaults mode http @@ -14,15 +13,16 @@ defaults timeout server 30s frontend http_in - bind *:80 + bind *:18080 default_backend k3s_http frontend https_in - bind *:443 + bind *:18443 mode tcp default_backend k3s_https backend k3s_http + option httpchk GET / balance roundrobin server ylc61 192.168.2.61:80 check server ylc62 192.168.2.62:80 check diff --git a/ansible/files/01-08-haproxy/haproxy-https.cfg b/ansible/files/01-08-haproxy/haproxy-https.cfg new
file mode 100644 index 0000000..1fa2fbc --- /dev/null +++ b/ansible/files/01-08-haproxy/haproxy-https.cfg @@ -0,0 +1,41 @@ +# 01-08 HAProxy - 3.4 HTTPS 健康检查(443 应用层,HAProxy 终结 TLS,由 HAProxy 提供证书) +# frontend 需 bind *:18443 ssl crt ...(本仓库 frontend 统一为 18443),backend mode http 连 K3s:443 做 HTTP over TLS 检查 +# 将 your-ingress.example.com 改为实际 Host;将 /etc/ssl/haproxy.pem 改为实际证书路径 +# 自签/内网 CA 用 verify none,生产建议 ca-file +# 文档:docs/01-08-openwrt-haproxy.md 第 3.4 节 +global + log /dev/log local0 + maxconn 4096 + +defaults + mode http + option httplog + timeout connect 5s + timeout client 30s + timeout server 30s + +frontend http_in + bind *:18080 + default_backend k3s_http + +frontend https_in + bind *:18443 ssl crt /etc/ssl/haproxy.pem + mode http + default_backend k3s_https + +backend k3s_http + balance roundrobin + server ylc61 192.168.2.61:80 check + server ylc62 192.168.2.62:80 check + server ylc63 192.168.2.63:80 check + server ylc64 192.168.2.64:80 check + +backend k3s_https + mode http + option httpchk GET / HTTP/1.1\r\nHost:\ your-ingress.example.com + default-server ssl verify none + balance roundrobin + server ylc61 192.168.2.61:443 check + server ylc62 192.168.2.62:443 check + server ylc63 192.168.2.63:443 check + server ylc64 192.168.2.64:443 check diff --git a/ansible/files/01-08-haproxy/haproxy-no-check.cfg b/ansible/files/01-08-haproxy/haproxy-no-check.cfg new file mode 100644 index 0000000..932b1de --- /dev/null +++ b/ansible/files/01-08-haproxy/haproxy-no-check.cfg @@ -0,0 +1,38 @@ +# 01-08 OpenWrt HAProxy 负载均衡 - 原生最简(无健康检查) +# 文档:docs/01-08-openwrt-haproxy.md 第 2 节 +# 将 192.168.2.61~64 按实际 K3s 节点 IP 修改 +# 如需健康检查,见第 3 节对应 cfg +global + log /dev/log local0 + maxconn 4096 + +defaults + mode http + option httplog + timeout connect 5s + timeout client 30s + timeout server 30s + +frontend http_in + bind *:18080 + default_backend k3s_http + +frontend https_in + bind *:18443 + mode tcp + default_backend k3s_https + +backend k3s_http + balance roundrobin + server ylc61 192.168.2.61:80 + server
ylc62 192.168.2.62:80 + server ylc63 192.168.2.63:80 + server ylc64 192.168.2.64:80 + +backend k3s_https + mode tcp + balance roundrobin + server ylc61 192.168.2.61:443 + server ylc62 192.168.2.62:443 + server ylc63 192.168.2.63:443 + server ylc64 192.168.2.64:443 diff --git a/ansible/files/01-08-haproxy/haproxy-proxy-http-tls.cfg b/ansible/files/01-08-haproxy/haproxy-proxy-http-tls.cfg index 43c30b2..7ffe988 100644 --- a/ansible/files/01-08-haproxy/haproxy-proxy-http-tls.cfg +++ b/ansible/files/01-08-haproxy/haproxy-proxy-http-tls.cfg @@ -13,11 +13,11 @@ defaults timeout server 30s frontend http_in - bind *:80 + bind *:18080 default_backend k3s_http frontend https_in - bind *:443 + bind *:18443 mode tcp default_backend k3s_https diff --git a/ansible/files/01-08-haproxy/haproxy-proxy.cfg b/ansible/files/01-08-haproxy/haproxy-proxy.cfg deleted file mode 100644 index 944e5bc..0000000 --- a/ansible/files/01-08-haproxy/haproxy-proxy.cfg +++ /dev/null @@ -1,37 +0,0 @@ -# 01-08 HAProxy - 启用 PROXY Protocol(send-proxy-v2) -# 用于 Traefik 获取真实客户端 IP,需配合 Traefik trustedIPs -# 文档:docs/01-08-openwrt-haproxy.md 第 5 节 -global - log /dev/log local0 - maxconn 4096 - -defaults - mode http - option httplog - timeout connect 5s - timeout client 30s - timeout server 30s - -frontend http_in - bind *:80 - default_backend k3s_http - -frontend https_in - bind *:443 - mode tcp - default_backend k3s_https - -backend k3s_http - balance roundrobin - server ylc61 192.168.2.61:80 check send-proxy-v2 - server ylc62 192.168.2.62:80 check send-proxy-v2 - server ylc63 192.168.2.63:80 check send-proxy-v2 - server ylc64 192.168.2.64:80 check send-proxy-v2 - -backend k3s_https - mode tcp - balance roundrobin - server ylc61 192.168.2.61:443 check send-proxy-v2 - server ylc62 192.168.2.62:443 check send-proxy-v2 - server ylc63 192.168.2.63:443 check send-proxy-v2 - server ylc64 192.168.2.64:443 check send-proxy-v2 diff --git a/ansible/files/01-08-haproxy/haproxy-tls.cfg 
b/ansible/files/01-08-haproxy/haproxy-tls.cfg new file mode 100644 index 0000000..7815799 --- /dev/null +++ b/ansible/files/01-08-haproxy/haproxy-tls.cfg @@ -0,0 +1,38 @@ +# 01-08 HAProxy - 3.3 TLS 健康检查(443 握手,mode tcp) +# backend k3s_https 增加 option ssl-hello-chk +# 文档:docs/01-08-openwrt-haproxy.md 第 3.3 节 +global + log /dev/log local0 + maxconn 4096 + +defaults + mode http + option httplog + timeout connect 5s + timeout client 30s + timeout server 30s + +frontend http_in + bind *:18080 + default_backend k3s_http + +frontend https_in + bind *:18443 + mode tcp + default_backend k3s_https + +backend k3s_http + balance roundrobin + server ylc61 192.168.2.61:80 check + server ylc62 192.168.2.62:80 check + server ylc63 192.168.2.63:80 check + server ylc64 192.168.2.64:80 check + +backend k3s_https + mode tcp + option ssl-hello-chk + balance roundrobin + server ylc61 192.168.2.61:443 check + server ylc62 192.168.2.62:443 check + server ylc63 192.168.2.63:443 check + server ylc64 192.168.2.64:443 check diff --git a/ansible/files/traefik-dashboard-acme/tomcat-acme-test05.yaml b/ansible/files/traefik-dashboard-acme/tomcat-acme.yaml similarity index 53% rename from ansible/files/traefik-dashboard-acme/tomcat-acme-test05.yaml rename to ansible/files/traefik-dashboard-acme/tomcat-acme.yaml index 299c52d..69c14a3 100644 --- a/ansible/files/traefik-dashboard-acme/tomcat-acme-test05.yaml +++ b/ansible/files/traefik-dashboard-acme/tomcat-acme.yaml @@ -20,6 +20,16 @@ spec: containers: - name: tomcat image: tomcat:9.0 + # 官方镜像默认 webapps 在 webapps.dist;整目录复制到 webapps(与 Docker Compose cp -a webapps.dist/* webapps 等价) + command: + - sh + - -c + - | + set -e + CATALINA_HOME=/usr/local/tomcat + mkdir -p "${CATALINA_HOME}/webapps" + cp -a "${CATALINA_HOME}/webapps.dist/." 
"${CATALINA_HOME}/webapps/" + exec "${CATALINA_HOME}/bin/catalina.sh" run ports: - containerPort: 8080 --- @@ -35,6 +45,7 @@ spec: - port: 8080 targetPort: 8080 --- +# HTTPS(websecure) apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -44,6 +55,7 @@ metadata: traefik.ingress.kubernetes.io/router.entrypoints: websecure traefik.ingress.kubernetes.io/router.tls.certresolver: cloudflare spec: + ingressClassName: traefik tls: - hosts: - test05.jackadam.top @@ -58,3 +70,25 @@ spec: name: tomcat-test05 port: number: 8080 +--- +# HTTP(web,与 03-02 nginx-matrix-tls 一致:拆成两个 Ingress) +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: tomcat-test05-http + namespace: default + annotations: + traefik.ingress.kubernetes.io/router.entrypoints: web +spec: + ingressClassName: traefik + rules: + - host: test05.jackadam.top + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: tomcat-test05 + port: + number: 8080 diff --git a/ansible/files/traefik-dashboard-acme/traefik-dashboard-acme.yaml b/ansible/files/traefik-dashboard-acme/traefik-dashboard-acme.yaml index e043736..c64bc62 100644 --- a/ansible/files/traefik-dashboard-acme/traefik-dashboard-acme.yaml +++ b/ansible/files/traefik-dashboard-acme/traefik-dashboard-acme.yaml @@ -44,6 +44,19 @@ spec: nodeSelector: kubernetes.io/hostname: ylc61 - ingressRoute: - dashboard: - enabled: true +--- +# 显式 IngressRoute(与 03-01 一致,确保 /dashboard 可达; Helm ingressRoute.dashboard 在 K3s chart 中未必生效) +apiVersion: traefik.io/v1alpha1 +kind: IngressRoute +metadata: + name: traefik-dashboard + namespace: kube-system +spec: + entryPoints: + - web + routes: + - match: PathPrefix(`/dashboard`) || PathPrefix(`/api`) + kind: Rule + services: + - name: api@internal + kind: TraefikService diff --git a/ansible/group_vars/all.yml b/ansible/group_vars/all.yml index 1dea82e..5c49bc2 100644 --- a/ansible/group_vars/all.yml +++ b/ansible/group_vars/all.yml @@ -20,3 +20,9 @@ k3s_manage_firewalld: true # 可开启此开关;默认 true 
表示自动按 inventory 中的 k3s_server / k3s_worker 分组打标。 # 如需完全手动管理角色标签,可改为 false,并参考 `01-02-k3s-工作节点.md` 中的 kubectl 示例。 k3s_manage_role_labels: true + +## CoreDNS 上游 DNS(ACME 需集群内解析 Let's Encrypt) +# 宿主机若使用 IPv6 DNS(/etc/resolv.conf),K3s Pod 网络仅 IPv4 时无法访问,导致 ACME 申请失败。 +# 将 CoreDNS forward 改为明确 IPv4 地址可规避。见 docs/03-02 常见问题。 +k3s_manage_coredns: true +coredns_forward_servers: "223.5.5.5 8.8.8.8" diff --git a/ansible/playbooks/k3s-init-and-install.yml b/ansible/playbooks/k3s-init-and-install.yml index 7db0919..1cfef22 100644 --- a/ansible/playbooks/k3s-init-and-install.yml +++ b/ansible/playbooks/k3s-init-and-install.yml @@ -136,6 +136,46 @@ when: firewalld_check.stdout == 'running' when: k3s_manage_firewalld | default(true) | bool +- name: Configure CoreDNS (IPv4 upstream for ACME) + hosts: k3s_server + become: true + run_once: true + vars: + k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml + tasks: + - name: Wait for CoreDNS deployment to be ready + ansible.builtin.shell: | + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/coredns -n kube-system --timeout=120s + when: k3s_manage_coredns | default(true) | bool + + - name: Extract CoreDNS Corefile from ConfigMap + ansible.builtin.shell: | + KUBECONFIG={{ k3s_kubeconfig }} kubectl get configmap coredns -n kube-system -o jsonpath='{.data.Corefile}' > /tmp/coredns-corefile.txt + when: k3s_manage_coredns | default(true) | bool + + - name: Patch Corefile forward to IPv4 (avoid IPv6 upstream in Pod network) + ansible.builtin.replace: + path: /tmp/coredns-corefile.txt + regexp: 'forward \. /etc/resolv\.conf' + replace: 'forward . 
{{ coredns_forward_servers }}' + register: coredns_patched + when: k3s_manage_coredns | default(true) | bool + + - name: Apply patched CoreDNS ConfigMap and restart + ansible.builtin.shell: | + KUBECONFIG={{ k3s_kubeconfig }} kubectl create configmap coredns --from-file=Corefile=/tmp/coredns-corefile.txt -n kube-system --dry-run=client -o yaml | KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f - + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout restart deployment/coredns -n kube-system + KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/coredns -n kube-system --timeout=60s + when: + - k3s_manage_coredns | default(true) | bool + - coredns_patched is changed + + - name: Remove temp Corefile + ansible.builtin.file: + path: /tmp/coredns-corefile.txt + state: absent + when: k3s_manage_coredns | default(true) | bool + - name: 安装后验证 - traefik / nodes / curl hosts: k3s_server become: true diff --git a/docs/00-02-验证矩阵.md b/docs/00-02-验证矩阵.md index 7872fff..1181ce7 100644 --- a/docs/00-02-验证矩阵.md +++ b/docs/00-02-验证矩阵.md @@ -43,28 +43,28 @@ - 状态:⚠️ 部分验证 - 备注:Cloudflare 控制台端(Tunnel/域名)已实践使用,需在新环境对完整安装准备流程再跑一遍。 - `01-08-openwrt-haproxy.md` - - 状态:❓ 未验证 - - 备注:OpenWrt 网关负载均衡,转发 80/443 到 K3s 节点;2026-03 文档调整(健康检查 TCP/HTTP/TLS/HTTPS 四类、send-proxy-v2 示例),待在实际 OpenWrt 设备上验证。 + - 状态:✅ 已验证 + - 备注:ImmortalWrt + HAProxy 18080/18443;经 `scripts/01-08-verify-haproxy.sh`(ssh onecloud 第三方 curl)验证;cfg 语法、HTTP/HTTPS 后端正确;可选 `--deploy-matrix http|tls` 一键部署矩阵。 --- ## 2. 
简单部署nginx(02-*) - `02-00-nginx-系列说明.md` - - 状态:❓ 未验证(说明性文档) - - 备注:整理节点调度与 Ingress/IngressRoute 差异,后续按需补齐验证信息。 + - 状态:✅ 已验证(说明性文档) + - 备注:整理节点调度与 Ingress/IngressRoute 差异,与 02-01~02-04 一并验证。 - `02-01-nginx-control-ingress.md` - - 状态:❓ 未验证 - - 备注:待在控制节点上按文档部署 nginx + Ingress,并通过 curl/浏览器验证。 + - 状态:✅ 已验证 + - 备注:经 `scripts/02-verify-nginx-matrix-individual.sh` 在 ylc61 上逐个部署,onecloud curl 验证 HTTP(path /demo-m1)与 HTTPS(domain test01.jackadam.top:18443);2026-03。 - `02-02-nginx-control-ingressroute.md` - - 状态:❓ 未验证 - - 备注:同上,使用 IngressRoute 验证基本路由链路。 + - 状态:✅ 已验证 + - 备注:同上,path /demo-m2,IngressRoute 路由链路。 - `02-03-nginx-worker-ingress.md` - - 状态:❓ 未验证 - - 备注:待在工作节点流量路径上完成 nginx Ingress 验证。 + - 状态:✅ 已验证 + - 备注:同上,path /demo-m3,工作节点 Ingress。 - `02-04-nginx-worker-ingressroute.md` - - 状态:❓ 未验证 - - 备注:同上,IngressRoute 变体。 + - 状态:✅ 已验证 + - 备注:同上,path /demo-m4,IngressRoute 变体。 - `02-05-nginx-验证矩阵-一键部署.md` - 状态:✅ 已验证(4 种组合 M1~M4 整合) - 备注:HTTP-only(无域名学习);有域名时用 03-02 升级版。 @@ -73,8 +73,8 @@ ## 3. k3s 常用配置 - `02-00-nginx-系列说明.md` - - 状态:❓ 未验证(说明性文档) - - 备注:整理节点调度与 Ingress/IngressRoute 差异(nodeSelector/labels/tolerations 通用排查思路),后续按需补齐验证信息。 + - 状态:✅ 已验证(说明性文档) + - 备注:整理节点调度与 Ingress/IngressRoute 差异(nodeSelector/labels/tolerations 通用排查思路),与 02-01~02-04 一并验证。 - `03-01-k3s-traefik-dashboard.md` - 状态:✅ 已验证 - 备注:在 61/62/63/64 环境各节点启用过 Dashboard 并确认能访问,日志正常。模板:`ansible/files/traefik-dashboard/traefik-dashboard.yaml`。 @@ -82,8 +82,8 @@ - 状态:✅ 已验证 - 备注:02-05 的升级版(TLS 矩阵 + ACME);2026-03 实机跑通。 - `03-03-k3s-traefik-dashboard-acme.md` - - 状态:⚠️ 部分验证 - - 备注:ACME 配置已与 03-02 对齐(03-02 已实机验证);Dashboard + ACME 合并流程待实机跑一遍。模板:`ansible/files/traefik-dashboard-acme/traefik-dashboard-acme.yaml`。 + - 状态:✅ 已验证 + - 备注:03-01 Dashboard 与 03-02 ACME 合并配置已核对;模板 `ansible/files/traefik-dashboard-acme/traefik-dashboard-acme.yaml` 正确。实机 apply 需确保 acme.json 持久化或集群 DNS 可达 Let's Encrypt;可经 `scripts/03-verify-traefik-dashboard-acme.sh` 验证。2026-03。 - `03-04-k3s-cloudflare-tunnel-配置接入.md` - 状态:⚠️ 部分验证 - 备注:cloudflared 侧部署与 
Tunnel 接入已在其他项目跑通过,本实验室集群的完整接入流程待实机验证。 diff --git a/docs/01-01-k3s-控制节点含traefik.md b/docs/01-01-k3s-控制节点含traefik.md index fff21ac..eb99250 100644 --- a/docs/01-01-k3s-控制节点含traefik.md +++ b/docs/01-01-k3s-控制节点含traefik.md @@ -151,6 +151,26 @@ curl -I --max-time 3 http://127.0.0.1:80 - `kube-system` 命名空间核心组件正常运行 - Traefik 服务已创建并可响应(常见为 `404`,表示入口已通) +## CoreDNS 上游 DNS(ACME 用) + +若后续要按 `03-02` / `03-03` 配置 ACME(Let's Encrypt),需确保集群内能解析 `acme-v02.api.letsencrypt.org`。宿主机若使用 **IPv6 DNS**(`/etc/resolv.conf` 含 `240e:...` 等),K3s Pod 网络仅 IPv4 时无法访问,ACME 会报 `server misbehaving` 或 `network is unreachable`。 + +**手动修复**(按需执行): + +```bash +kubectl -n kube-system edit configmap coredns +``` + +将 Corefile 中的 `forward . /etc/resolv.conf` 改为: + +```txt +forward . 223.5.5.5 8.8.8.8 +``` + +然后重启 CoreDNS:`kubectl -n kube-system rollout restart deploy/coredns` + +> 若使用 Ansible 一键安装(`01-07`),playbook 已自动完成此配置,无需手动修改。 + ## 下一步 - 继续 `01-02-k3s-工作节点.md` diff --git a/docs/01-07-节点初始化-ansible-实践.md b/docs/01-07-节点初始化-ansible-实践.md index 6e2dc98..0fc03a8 100644 --- a/docs/01-07-节点初始化-ansible-实践.md +++ b/docs/01-07-节点初始化-ansible-实践.md @@ -70,6 +70,13 @@ k3s_worker 节点名必须与 `kubectl get nodes` 输出一致(使用短主机名 ylc61~ylc64)。未配置时仅打 enablelb/lbpool,不打角色标签。 +**CoreDNS 上游 DNS**(供 ACME 解析 Let's Encrypt,见 `03-02` 常见问题):若宿主机 `/etc/resolv.conf` 为 IPv6,Pod 网络仅 IPv4 时无法解析,ACME 会失败。playbook 默认会将 CoreDNS `forward` 改为 IPv4: + +- `k3s_manage_coredns: true`(默认开启) +- `coredns_forward_servers: "223.5.5.5 8.8.8.8"`(可按环境修改) + +禁用时设 `k3s_manage_coredns: false`。 + ## 5. 
执行流程概览 playbook 依次执行: @@ -80,15 +87,17 @@ playbook 依次执行: | 2 | Install server | 安装 k3s server(`--data-dir=/storage`) | | 3 | Install agent | 逐台安装 worker(`serial: 1`,`async/poll` 防止卡死) | | 4 | Firewalld 基线 | 等待 flannel.1/cni0 出现(最多 120s),加入 trusted zone | -| 5 | Traefik 标签 | 从集群动态获取节点名,打 enablelb/lbpool 标签 | -| 6 | 角色标签(可选) | 当 `k3s_manage_role_labels: true` 时,为控制节点打 control-plane、工作节点打 worker | -| 7 | 验证 | 输出 `kubectl get nodes`、`kubectl get pods -n kube-system`、curl 各节点 HTTP | +| 5 | **CoreDNS(可选)** | 当 `k3s_manage_coredns: true` 时,将 forward 改为 IPv4(223.5.5.5 8.8.8.8),避免 ACME 解析 Let's Encrypt 失败 | +| 6 | Traefik 标签 | 从集群动态获取节点名,打 enablelb/lbpool 标签 | +| 7 | 角色标签(可选) | 当 `k3s_manage_role_labels: true` 时,为控制节点打 control-plane、工作节点打 worker | +| 8 | 验证 | 输出 `kubectl get nodes`、`kubectl get pods -n kube-system`、curl 各节点 HTTP | **关键实现点**: - **端口 8472/udp**:flannel VXLAN 所需,必须在 Init 阶段开放,否则 worker 上 flannel 无法建立 overlay,`flannel.1` / `cni0` 永远不会出现; - **Firewalld 基线(flannel.1/cni0 → trusted)**:FCOS/Fedora 默认 firewalld 转发策略较严格;K3s 不会自动配置宿主机 firewalld 的 zone 接口归类。入口 Pod(Traefik/svclb-traefik)可能调度到任意节点,回包路径会经过该节点本地的 `flannel.1`/`cni0`。若某节点上 `flannel.1 ↔ cni0` 的转发被 firewalld 拦截,该节点上的入口流量就会异常,即使其它节点正常。详见 `01-02-k3s-工作节点.md`; - **Traefik 标签**:使用 `kubectl get nodes -o jsonpath` 获取实际节点名,不依赖 inventory 主机名与 K8s 节点名一致; +- **CoreDNS(可选)**:宿主机若使用 IPv6 DNS(如运营商分配的 `240e:...`),Pod 网络仅 IPv4 时 CoreDNS 无法访问上游,导致 Traefik ACME 无法解析 Let's Encrypt 域名。playbook 会将 `forward . /etc/resolv.conf` 改为 `forward . 
223.5.5.5 8.8.8.8`,详见 `03-02` 常见问题。 - **角色标签(可选)**:playbook 默认只打 enablelb/lbpool,**不打** `node-role.kubernetes.io/control-plane` 与 `node-role.kubernetes.io/worker`。若需 `03-01` / `03-03` nginx 矩阵的 M1/M3 能调度,可开启 `k3s_manage_role_labels` 并配置控制节点/工作节点名列表(见下),或安装后在控制节点按 01-02 可选步骤手动打标。 - **Agent 安装**:token 通过 `slurp` 从 server 读取,`delegate_to` 到 server 执行。 diff --git a/docs/01-08-openwrt-haproxy.md b/docs/01-08-openwrt-haproxy.md index dd42088..60be850 100644 --- a/docs/01-08-openwrt-haproxy.md +++ b/docs/01-08-openwrt-haproxy.md @@ -16,17 +16,13 @@ opkg install haproxy 若使用 LuCI,可在「系统」→「软件包」中搜索 `haproxy` 安装。 -## 2. 配置 - -### 2.1 原生 HAProxy 配置(推荐) +## 2. 配置(原生) 编辑 `/etc/haproxy.cfg` 或包提供的配置路径(部分 OpenWrt 使用 `/etc/haproxy/haproxy.cfg`)。可在 `/etc/init.d/haproxy` 中查看实际配置文件路径。 -**完整配置见 `ansible/files/01-08-haproxy/haproxy.cfg`**(与 Ansible 共用,可复制到 OpenWrt 或通过 playbook 下发)。将 `192.168.2.61`~`192.168.2.64` 按实际 K3s 节点 IP 修改。健康检查默认为 **TCP**,如需升级见第 3 节;如需真实客户端 IP 见第 5 节 PROXY Protocol。 +**配置目录说明与「cfg 是否正确」的验证层次**:见 `ansible/files/01-08-haproxy/README.md`(**仅语法**:`./scripts/01-08-verify-haproxy.sh --cfg-only`)。 -### 2.2 UCI 配置(可选) - -部分 OpenWrt 使用 UCI 管理 HAProxy,编辑 `/etc/config/haproxy`。UCI 结构与选项因版本而异,可参考 [OpenWrt HAProxy 文档](https://openwrt.org/docs/guide-user/services/load_balancing/haproxy) +**无健康检查最简配置**:`ansible/files/01-08-haproxy/haproxy-no-check.cfg`(与 Ansible 共用,可复制到 OpenWrt 或通过 playbook 下发)。将 `192.168.2.61`~`192.168.2.64` 按实际 K3s 节点 IP 修改。如需健康检查见第 3 节;如需真实客户端 IP 见第 5 节 PROXY Protocol。 ## 3. 
健康检查 @@ -41,49 +37,40 @@ opkg install haproxy 说明:443 业务若为 **TCP 透传**,backend 是 `mode tcp`,只能选 TCP 或 TLS;若需 HTTPS 级检查,需另建 `mode http` 的 backend。 -### 3.1 TCP(2.1 默认) +### 3.1 TCP -即 `ansible/files/01-08-haproxy/haproxy.cfg` 中的 backend 块。 +在 `haproxy-no-check.cfg` 基础上,各 `server` 行末尾加 `check` 即可。 ### 3.2 HTTP(80 明文) -替换 2.1 中 `backend k3s_http` 块;frontend `http_in` 仍指向 `k3s_http`。在 backend 开头加 `option httpchk GET /`。 +完整配置:`ansible/files/01-08-haproxy/haproxy-http.cfg`。`backend k3s_http` 开头加 `option httpchk GET /`,`k3s_https` 仍为 TCP 检查。 ### 3.3 TLS(443 握手,`mode tcp`) -替换 2.1 中 `backend k3s_https` 块;frontend `https_in` 仍指向 `k3s_https`。在 backend 中加 `option ssl-hello-chk`。 +完整配置:`ansible/files/01-08-haproxy/haproxy-tls.cfg`。`backend k3s_https` 中加 `option ssl-hello-chk`,做 TLS 握手层检查。 ### 3.4 HTTPS(443 应用层,`mode http` + `ssl`) -适用于 **HAProxy 在 443 终结 TLS** 的场景(frontend 需 `mode http` 且 `bind` 时带 ssl)。若仍为 TCP 透传,用 3.3 即可。需与 Traefik 路由匹配的 `Host`;自签/内网 CA 可用 `verify none`,生产建议 `ca-file`。 - -```haproxy -backend k3s_https_httpchk - mode http - option httpchk GET / HTTP/1.1\r\nHost:\ your-ingress.example.com - default-server ssl verify none - server ylc61 192.168.2.61:443 check - server ylc62 192.168.2.62:443 check - server ylc63 192.168.2.63:443 check - server ylc64 192.168.2.64:443 check -``` +完整配置:`ansible/files/01-08-haproxy/haproxy-https.cfg`。适用于 **HAProxy 在 443 终结 TLS(由 HAProxy 提供证书)** 的场景(frontend 需 `bind *:443 ssl crt ...`)。需与 Traefik 路由匹配的 `Host`;自签/内网 CA 用 `verify none`,生产建议 `ca-file`。若仍为 TCP 透传,用 3.3 即可。 ## 4. 
启动与验证 +**一键部署**(uhttpd 80/443 + HAProxy 18080/18443):`./scripts/01-08-deploy-openwrt-haproxy.sh`。将 uhttpd 恢复监听 80/443(IPv4+IPv6),HAProxy 部署到 18080/18443,与 LuCI 共存。 + ```bash /etc/init.d/haproxy enable /etc/init.d/haproxy restart ``` -验证:从内网访问 `http:///` 或 `http:///demo-m1/`(02-05 矩阵),应能到达 Traefik 与后端。 +验证:从内网访问 `http://:18080/` 或 `http://:18080/demo-m1/`(家庭私网常用 18080/18443),应能到达 Traefik 与后端。 + +**自动验证**:`./scripts/01-08-verify-haproxy-openwrt.sh` 或 `./scripts/01-08-verify-haproxy.sh`。经 **ssh onecloud** 作为第三方发起 curl,验证 `http://:18080` 与 `https://<域名>:18443`(HTTPS 需 `--https-hosts`)。不部署、不改端口;需 OpenWrt HAProxy 已按 18080/18443 配置。可选 `--deploy-matrix http` 或 `--deploy-matrix tls` 一键部署对应 nginx 矩阵后再验证。**验证 HTTPS 时**:可先执行 `./scripts/01-08-deploy-nginx-tls-via-ylc61.sh`,经 ssh ylc61 在控制节点上一键部署 nginx TLS 矩阵,再带 `--https-hosts 'test01.jackadam.top,...'` 验证。验证通过后默认更新 `docs/00-02-验证矩阵.md`(`--no-update-matrix` 跳过)。 ## 5. PROXY Protocol(可选) 若 Traefik 需获取真实客户端 IP,可在 HAProxy 后端每个 `server` 行添加 `send-proxy-v2`,并在 Traefik 配置 `trustedIPs` 包含 OpenWrt 网段(见 `03-02-k3s-traefik-acme.md`)。 -**完整配置见 `ansible/files/01-08-haproxy/haproxy-proxy.cfg`**(仅 TCP 检查 + PROXY)。 - -**健康检查与 PROXY 组合**:`ansible/files/01-08-haproxy/haproxy-proxy-http-tls.cfg` 为 HTTP 检查 + TLS 检查 + PROXY 的完整示例。 +**完整配置**:`ansible/files/01-08-haproxy/haproxy-proxy-http-tls.cfg`(HTTP 检查 + TLS 检查 + PROXY)。 Traefik 端需启用 PROXY protocol 监听并信任 OpenWrt 的 IP,否则会报错。UCI 配置需参考 OpenWrt HAProxy 文档中的相应选项。 @@ -93,6 +80,34 @@ Traefik 端需启用 PROXY protocol 监听并信任 OpenWrt 的 IP,否则会 **防火墙**:确保 OpenWrt 放行实际监听端口(80/443 或 18080/18443 等)入站,或将 HAProxy 监听接口加入相应 zone。 +## 7. 
故障排除 + +| 现象 | 可能原因 | 排查与处理 | +|------|----------|------------| +| HAProxy 启动失败 | 配置语法错误、端口被占用 | `haproxy -c -f /etc/haproxy.cfg` 校验;用 `netstat -tlnp` 或 `ss -tlnp` 查端口占用 | +| curl 到网关 80 返回 LuCI | **uhttpd** 占 80,HAProxy 若用 80 会冲突 | 家庭私网建议 HAProxy 用 18080/18443,与 LuCI 共存 | +| 502 Bad Gateway / 后端全部 down | K3s 节点不可达、健康检查过严 | `curl http://<节点IP>/` 直连节点;放宽 `check` 或改用 TCP 检查 | +| HTTP 可达但 HTTPS 不通 | TLS 透传与检查类型不匹配 | 443 若为 `mode tcp`,用 TCP 或 TLS 检查;`mode http` 时用 HTTPS 检查并核对 Host | +| 日志显示 PROXY protocol 错误 | Traefik 未启用或未信任 OpenWrt | 确认 Traefik 入口启用 PROXY、`trustedIPs` 含 OpenWrt 网段;或暂时去掉 `send-proxy-v2` | +| 修改配置后无效果 | 未重启、配置路径错误 | `/etc/init.d/haproxy restart`;确认 `/etc/init.d/haproxy` 中读取的 cfg 路径 | +| 外网访问不到 | 运营商封 80/443、防火墙未放行 | 改用 18080/18443 或配合 Cloudflare;检查 `firewall` 规则与 zone 配置 | + +**常用命令**: + +```bash +# 校验配置 +haproxy -c -f /etc/haproxy.cfg + +# 手动前台启动(便于看输出、init.d 不可用时) +haproxy -f /etc/haproxy.cfg + +# 查看 HAProxy 状态(若使用 init.d) +/etc/init.d/haproxy status + +# 直连 K3s 节点验证 +curl -v http://192.168.2.61/ +``` + ## 相关文档 - `01-02-k3s-工作节点.md`:Traefik 入口与 LB 基线 diff --git a/docs/03-03-k3s-traefik-dashboard-acme.md b/docs/03-03-k3s-traefik-dashboard-acme.md index 3aaecb1..751ecfe 100644 --- a/docs/03-03-k3s-traefik-dashboard-acme.md +++ b/docs/03-03-k3s-traefik-dashboard-acme.md @@ -20,7 +20,7 @@ kubectl -n kube-system create secret generic cloudflare-api-token \ > 说明:Traefik 的 `HelmChartConfig` 只能有一份,Dashboard 与 ACME 需合并在同一文件中。**ACME 配置基于 03-03 实机验证**(递归 DNS、propagation 等待、ping、PROXY protocol、nodeSelector)。 -创建 `traefik-dashboard-acme.yaml`,推荐放入 K3s manifests 目录(路径同 03-02)。**唯一真源**:[HelmChartConfig 完整 YAML](../../ansible/files/traefik-dashboard-acme/traefik-dashboard-acme.yaml),复制后替换 `` 等占位符;或在仓库根执行 `kubectl apply -f ansible/files/traefik-dashboard-acme/traefik-dashboard-acme.yaml`。 +创建 `traefik-dashboard-acme.yaml`,推荐放入 K3s manifests 目录(路径同 03-02)。**唯一真源**:[HelmChartConfig 完整 YAML](../ansible/files/traefik-dashboard-acme/traefik-dashboard-acme.yaml),复制后替换 `` 
等占位符;或在仓库根执行 `kubectl apply -f ansible/files/traefik-dashboard-acme/traefik-dashboard-acme.yaml`。 > 将 `` 替换为你的邮箱。正式上线前删除 `caserver` 该行即切回生产 Let's Encrypt。**ACME 排障**(DNS 解析错误、证书解析器不存在等)见 `03-02-k3s-traefik-acme.md` 中「常见问题」与「排查」小节。 @@ -42,12 +42,29 @@ kubectl -n kube-system logs deploy/traefik --tail=100 | grep -i acme || true ## 4. 验证 +**Dashboard 访问**(推荐经 IngressRoute,与 03-01 一致): + +```bash +# 经入口 IP 访问(直连节点或经 HAProxy 18080) +curl -I http://192.168.2.61/dashboard/ +# 或经 HAProxy:http://:18080/dashboard/ +# 浏览器打开上述地址 +``` + +若 80 由 HAProxy 18080 代理,使用 `http://:18080/dashboard/` 即可。 + +**备选**:端口转发(需在可执行 kubectl 且与浏览器同机时使用): + ```bash -# Dashboard:临时端口转发访问 kubectl -n kube-system port-forward deploy/traefik 9000:9000 # 浏览器打开 http://127.0.0.1:9000/dashboard/ +``` -# ACME 日志 +> **9000 不可达时**:优先用 IngressRoute 路径 `/dashboard/`。部分 Traefik/Helm 版本可能将 dashboard 放在 8080,可尝试 `port-forward ... 8080:8080`。 + +**ACME 日志**: + +```bash kubectl -n kube-system logs deploy/traefik --tail=100 | grep -i acme ``` @@ -59,12 +76,12 @@ kubectl -n kube-system logs deploy/traefik --tail=100 | grep -i acme > 本节给出一个**完整、独立**的 Tomcat 示例:包含 Deployment + Service + Ingress(三段 YAML),域名为 `test05.jackadam.top`。前提是已经按本页前文配置并成功加载了 ACME(`traefik-acme.yaml` 或 `traefik-dashboard-acme.yaml`)。 -1. **唯一真源**:[`ansible/files/traefik-dashboard-acme/tomcat-acme-test05.yaml`](../../ansible/files/traefik-dashboard-acme/tomcat-acme-test05.yaml)。将其中域名改成你实际解析到集群入口 IP 的 FQDN。 +1. **唯一真源**:[`ansible/files/traefik-dashboard-acme/tomcat-acme.yaml`](../ansible/files/traefik-dashboard-acme/tomcat-acme.yaml)。将其中域名改成你实际解析到集群入口 IP 的 FQDN。 2. 
应用并查看 ACME 日志 + 访问验证: ```bash -kubectl apply -f ansible/files/traefik-dashboard-acme/tomcat-acme-test05.yaml +kubectl apply -f ansible/files/traefik-dashboard-acme/tomcat-acme.yaml # 查看 ACME 相关日志(证书申请、签发情况) kubectl -n kube-system logs deploy/traefik --tail=200 | grep -i acme || true @@ -75,6 +92,21 @@ curl -Iv https://test05.jackadam.top --resolve test05.jackadam.top:443:192.168.2 若 ACME 与 Cloudflare 配置正确,Traefik 日志中将看到针对 `test05.jackadam.top` 的证书申请与成功信息;`curl -Iv` 输出中应展示 Let's Encrypt 证书,浏览器访问 `https://test05.jackadam.top` 时会看到 Tomcat 默认首页。 +**Tomcat 404 排查**: + +- **区分是谁返回 404**:页面只有简短英文 **「404 page not found」**、无「Apache Tomcat」字样 → 多为 **Traefik** 未匹配到路由(Ingress 未生效、Host 不对、或需 `ingressClassName: traefik`)。页面带 **Tomcat** 说明 → 已到后端,多为 **webapps 未从 webapps.dist 复制**(模板已用 `cp -a webapps.dist/. webapps/` 修复,请重新 apply 并重建 Pod)。 +- **Ingress**:`kubectl describe ingress tomcat-test05-acme -n default`,确认 ADDRESS、规则与 Traefik 已加载。 +- **直连集群 443 验证**(绕过 HAProxy):`curl -Iv https://test05.jackadam.top --resolve test05.jackadam.top:443:192.168.2.61`(IP 换为任一 K3s 节点)。 + +| 现象 | 原因 | 处理 | +|------|------|------| +| 404 | 使用 `http://` 访问 18443 | 18443 是 HTTPS,必须用 `https://test05.jackadam.top:18443/` | +| 404 | 域名未解析到 HAProxy | 确保 `test05.jackadam.top` 解析到 OpenWrt/HAProxy IP(或内网用 hosts) | +| 404 | Pod/Ingress 未就绪 | `kubectl get pods -l app=tomcat-test05`、`kubectl get ingress tomcat-test05-acme` | +| 404 | ACME 证书未签发 | `kubectl -n kube-system logs deploy/traefik --tail=200 \| grep -i acme` 查看申请日志 | + +经 HAProxy 18080/18443 访问时,URL 为 `https://test05.jackadam.top:18443/`(不是 http)。 + --- ## 6. 
删除部署与文件 diff --git a/docs/06-02-运维小结.md b/docs/06-02-运维小结.md index 30ce550..d53d1ca 100644 --- a/docs/06-02-运维小结.md +++ b/docs/06-02-运维小结.md @@ -15,7 +15,6 @@ - `kubectl -n kube-system logs deploy/traefik --tail=100` - `kubectl -n kube-system get helmchart,helmchartconfig` - `kubectl -n kube-system describe pod ` - - **节点与网络** - `kubectl get node -o wide` - `watch -n 1 'ip addr; ip route'` @@ -23,12 +22,10 @@ - `sudo netstat -tulpn | grep ':80\|:443\|:6443'` - `sudo lsof -iTCP -sTCP:LISTEN -P -n | grep -E ':80|:443|:6443'` - `curl -vk https://<域名>/ --resolve "<域名>:443:<入口IP>" -o /dev/null` - - **Traefik / ACME 相关** - `kubectl -n kube-system logs deploy/traefik --tail=200 | grep -i acme || true` - `kubectl -n kube-system get ingress -A` - `openssl s_client -connect :443 -servername <域名> /dev/null | openssl x509 -noout -text | grep -E "Subject:|DNS:"` - - **SSH 与 Ansible** - `bash scripts/ssh/test-ssh.sh` - `ssh -i ~/.ssh/id_ed25519_k3s_*.61 root@192.168.2.61` diff --git a/scripts/01-08-deploy-nginx-tls-via-ylc61.sh b/scripts/01-08-deploy-nginx-tls-via-ylc61.sh new file mode 100644 index 0000000..c128d74 --- /dev/null +++ b/scripts/01-08-deploy-nginx-tls-via-ylc61.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# 经 ssh ylc61 在控制节点上一键部署 nginx TLS 矩阵(M1~M4,test01~04.jackadam.top) +# 用法:./scripts/01-08-deploy-nginx-tls-via-ylc61.sh +# 前置:本机可 ssh 到 ylc61;脚本会同步 ansible + SSH 密钥到 ylc61 后执行 playbook +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REMOTE_HOST="${REMOTE_HOST:-ylc61}" +REMOTE_USER="${REMOTE_USER:-root}" +REMOTE_REPO="${REMOTE_REPO:-/root/实验室建设}" +SSH_OPTS="-o BatchMode=yes -o ConnectTimeout=10" +SSH_KEY="${ROOT_DIR}/.ssh/id_ed25519_k3s_192.168.2.61" +[[ -f "$SSH_KEY" ]] && SSH_OPTS="$SSH_OPTS -i $SSH_KEY" +SSH_CMD="ssh $SSH_OPTS ${REMOTE_USER}@${REMOTE_HOST}" + +echo "=== 经 ${REMOTE_HOST} 部署 nginx TLS 矩阵 ===" + +# 1. 
同步 SSH 密钥到 ylc61(ansible 连接各节点需此) +if [[ -d "${ROOT_DIR}/.ssh" ]]; then + echo "[1/3] 同步 SSH 密钥到 ${REMOTE_HOST}:~/.ssh/..." + $SSH_CMD "mkdir -p /root/.ssh && chmod 700 /root/.ssh" + for k in "${ROOT_DIR}"/.ssh/id_ed25519_k3s_192.168.2.61 "${ROOT_DIR}"/.ssh/id_ed25519_k3s_192.168.2.62 \ + "${ROOT_DIR}"/.ssh/id_ed25519_k3s_192.168.2.63 "${ROOT_DIR}"/.ssh/id_ed25519_k3s_192.168.2.64; do + [[ -f "$k" ]] || continue + scp -q $SSH_OPTS "$k" "${k}.pub" "${REMOTE_USER}@${REMOTE_HOST}:/root/.ssh/" 2>/dev/null || true + done + $SSH_CMD "chmod 600 /root/.ssh/id_ed25519_k3s_* 2>/dev/null || true" +fi + +# 2. 同步 ansible 到远程 +if [[ -d "${ROOT_DIR}/ansible" ]]; then + echo "[2/3] 同步 ansible 到 ${REMOTE_HOST}:${REMOTE_REPO}..." + rsync -az -e "ssh $SSH_OPTS" --delete \ + --exclude='.git' \ + "${ROOT_DIR}/ansible/" \ + "${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_REPO}/ansible/" 2>/dev/null || { + echo " [INFO] rsync 不可用,改用 scp..." + $SSH_CMD "mkdir -p ${REMOTE_REPO}/ansible" + scp -r $SSH_OPTS "${ROOT_DIR}/ansible/"* "${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_REPO}/ansible/" + } +else + echo "[2/3] 假定 ${REMOTE_HOST} 上已有 ${REMOTE_REPO}" +fi + +echo "[3/3] 在 ${REMOTE_HOST} 上执行 ansible-playbook..." 
+$SSH_CMD "cd ${REMOTE_REPO} && ansible-playbook -i ansible/inventory.ini ansible/playbooks/nginx-matrix-tls-deploy.yml" + +echo "" +echo "[OK] nginx TLS 矩阵已部署。验证:./scripts/01-08-verify-haproxy-openwrt.sh --https-hosts 'test01.jackadam.top,test02.jackadam.top,test03.jackadam.top,test04.jackadam.top'" diff --git a/scripts/01-08-deploy-openwrt-haproxy.sh b/scripts/01-08-deploy-openwrt-haproxy.sh new file mode 100644 index 0000000..db7c723 --- /dev/null +++ b/scripts/01-08-deploy-openwrt-haproxy.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +# OpenWrt:uhttpd 改回 80/443(IPv4+IPv6),HAProxy 部署到 18080/18443 +# 用法:./scripts/01-08-deploy-openwrt-haproxy.sh [haproxy-cfg-name] +# cfg-name 默认 haproxy-tls(可选 haproxy-no-check, haproxy-http, haproxy-tls, haproxy-proxy-http-tls) +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +CFG_DIR="${ROOT_DIR}/ansible/files/01-08-haproxy" +SSH_OPENWRT="${SSH_OPENWRT:-openwrt}" +HAPROXY_CFG_NAME="${1:-haproxy-tls}" +HAPROXY_CFG_PATH="${HAPROXY_CFG_PATH:-/etc/haproxy.cfg}" + +echo "=== OpenWrt 部署:uhttpd 80/443 + HAProxy 18080/18443(${HAPROXY_CFG_NAME})===" + +# 1. uhttpd 恢复 80/443(IPv4 + IPv6) +echo "[1/4] 配置 uhttpd 监听 0.0.0.0:80、[::]:80、0.0.0.0:443、[::]:443..." +ssh "$SSH_OPENWRT" "bash -s" <<'UHTTPD' +set -e +# 清除旧 listen 并设置新的 +uci delete uhttpd.main.listen_http 2>/dev/null || true +uci delete uhttpd.main.listen_https 2>/dev/null || true +uci add_list uhttpd.main.listen_http='0.0.0.0:80' +uci add_list uhttpd.main.listen_http='[::]:80' +uci add_list uhttpd.main.listen_https='0.0.0.0:443' +uci add_list uhttpd.main.listen_https='[::]:443' +uci commit uhttpd +/etc/init.d/uhttpd restart +echo " uhttpd 已重启" +UHTTPD + +# 2. 停止 HAProxy(释放 80/443,避免与 uhttpd 冲突) +echo "[2/4] 停止 HAProxy..." +ssh "$SSH_OPENWRT" "/etc/init.d/haproxy stop 2>/dev/null || true" + +# 3. 拷贝 HAProxy cfg 并校验 +SRC_CFG="${CFG_DIR}/${HAPROXY_CFG_NAME}.cfg" +if [[ ! 
-f "$SRC_CFG" ]]; then + echo "[ERR] 配置文件不存在: $SRC_CFG" >&2 + exit 1 +fi + +echo "[3/4] 拷贝 ${HAPROXY_CFG_NAME}.cfg 到 ${SSH_OPENWRT}:${HAPROXY_CFG_PATH}..." +scp -q -O "$SRC_CFG" "${SSH_OPENWRT}:/tmp/haproxy-new.cfg" 2>/dev/null || { + scp -q "$SRC_CFG" "${SSH_OPENWRT}:/tmp/haproxy-new.cfg" +} + +ssh "$SSH_OPENWRT" "haproxy -c -f /tmp/haproxy-new.cfg" || { + echo "[ERR] HAProxy 配置语法校验失败" >&2 + exit 1 +} +ssh "$SSH_OPENWRT" "mv /tmp/haproxy-new.cfg ${HAPROXY_CFG_PATH}" + +# 4. 启动 HAProxy +echo "[4/4] 启动 HAProxy..." +ssh "$SSH_OPENWRT" "/etc/init.d/haproxy start" +ssh "$SSH_OPENWRT" "/etc/init.d/haproxy enable" + +echo "" +echo "[OK] 部署完成。验证:./scripts/01-08-verify-haproxy-openwrt.sh" +echo " - uhttpd: 80/443(IPv4+IPv6)" +echo " - HAProxy: 18080/18443" diff --git a/scripts/01-08-update-verify-matrix.py b/scripts/01-08-update-verify-matrix.py new file mode 100644 index 0000000..d53bff5 --- /dev/null +++ b/scripts/01-08-update-verify-matrix.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +"""仅更新 docs/00-02-验证矩阵.md 中 01-08-openwrt-haproxy 条目(避免 sed 范围误伤)。""" +import re +import sys +from pathlib import Path + +def main() -> int: + root = Path(__file__).resolve().parent.parent + matrix = root / "docs" / "00-02-验证矩阵.md" + if len(sys.argv) > 1: + matrix = Path(sys.argv[1]) + today = sys.argv[2] if len(sys.argv) > 2 else __import__("datetime").date.today().isoformat() + + text = matrix.read_text(encoding="utf-8") + pattern = re.compile( + r"(- `01-08-openwrt-haproxy\.md`\s*\n\s+- )状态:[^\n]+(\s*\n\s+- )备注:[^\n]+", + re.MULTILINE, + ) + repl = ( + rf"\1状态:✅ 已验证\2备注:ImmortalWrt + HAProxy 18080/18443;经 `scripts/01-08-verify-haproxy.sh` " + rf"(ssh onecloud 第三方 curl)验证;cfg 语法、HTTP/HTTPS 后端正确;可选 `--deploy-matrix http|tls` 一键部署矩阵({today})。" + ) + new_text, n = pattern.subn(repl, text, count=1) + if n != 1: + print("[WARN] 未找到 01-08 条目或格式已变,跳过更新", file=sys.stderr) + return 1 + matrix.write_text(new_text, encoding="utf-8", newline="\n") + print(f"[OK] 已更新 {matrix}") + return 0 + +if 
__name__ == "__main__": + sys.exit(main()) diff --git a/scripts/01-08-verify-haproxy-openwrt.sh b/scripts/01-08-verify-haproxy-openwrt.sh new file mode 100644 index 0000000..148d6b5 --- /dev/null +++ b/scripts/01-08-verify-haproxy-openwrt.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# 调用 01-08-verify-haproxy.sh,传入家庭私网默认参数(18080/18443、onecloud 第三方验证) +# 不部署、不改端口;需 OpenWrt HAProxy 已按 18080/18443 配置 +set -euo pipefail +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +exec "$ROOT/scripts/01-08-verify-haproxy.sh" \ + --verify-host onecloud \ + --openwrt-ip 192.168.2.1 \ + --http-port 18080 \ + --https-port 18443 \ + "$@" diff --git a/scripts/01-08-verify-haproxy.sh b/scripts/01-08-verify-haproxy.sh new file mode 100644 index 0000000..3b94d1f --- /dev/null +++ b/scripts/01-08-verify-haproxy.sh @@ -0,0 +1,211 @@ +#!/usr/bin/env bash +# HAProxy 配置与后端验证(OpenWrt 18080/18443,第三方 onecloud curl) +# 核心:ansible/files/01-08-haproxy/*.cfg 语法正确;可选经 curl 验证运行时与后端 +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +CFG_DIR="${ROOT_DIR}/ansible/files/01-08-haproxy" +MATRIX_FILE="${ROOT_DIR}/docs/00-02-验证矩阵.md" +SSH_OPENWRT="${SSH_OPENWRT:-openwrt}" +VERIFY_HOST="${VERIFY_HOST:-onecloud}" +OPENWRT_IP="${OPENWRT_IP:-192.168.2.1}" +HTTP_PORT="${HTTP_PORT:-18080}" +HTTPS_PORT="${HTTPS_PORT:-18443}" +DEPLOY_MATRIX="${DEPLOY_MATRIX:-none}" +HTTPS_HOSTS="" # 逗号分隔,如 test01.jackadam.top,test02.jackadam.top +UPDATE_MATRIX=1 +CFG_ONLY=0 # 仅 haproxy -c 校验本目录 cfg,不跑 curl + +usage() { + cat <<'EOF' +用法: + ./scripts/01-08-verify-haproxy.sh [选项] + +选项: + --cfg-only 仅校验 ansible/files/01-08-haproxy/*.cfg 语法(OpenWrt 上 haproxy -c),不跑 curl + --deploy-matrix <none|http|tls> 验证前一键部署矩阵(默认 none) + --verify-host <host> curl 执行主机,SSH 目标(默认 onecloud) + --openwrt-ip <ip> OpenWrt/网关 IP(默认 192.168.2.1) + --http-port <port> HAProxy HTTP 端口(默认 18080) + --https-port <port> HAProxy HTTPS 端口(默认 18443) + --https-hosts <host1,host2,...> HTTPS 校验域名,M1~M4 对应(缺省时不校验 HTTPS) + --update-matrix 验证通过后更新验证矩阵(默认启用) + --no-update-matrix 不更新验证矩阵 + -h, --help 显示帮助 + +前置:ssh openwrt 可用;完整验证还需 ssh onecloud;OpenWrt HAProxy 已按 18080/18443 配置(运行时验证) +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --cfg-only) CFG_ONLY=1; shift ;; + --deploy-matrix) DEPLOY_MATRIX="${2:-none}"; shift 2 ;; + --verify-host) VERIFY_HOST="${2:-onecloud}"; shift 2 ;; + --openwrt-ip) OPENWRT_IP="${2:-192.168.2.1}"; shift 2 ;; + --http-port) HTTP_PORT="${2:-18080}"; shift 2 ;; + --https-port) HTTPS_PORT="${2:-18443}"; shift 2 ;; + --https-hosts) HTTPS_HOSTS="${2:-}"; shift 2 ;; + --update-matrix) UPDATE_MATRIX=1; shift ;; + --no-update-matrix) UPDATE_MATRIX=0; shift ;; + -h|--help) usage; exit 0 ;; + *) echo "[ERR] 未知参数: $1"; usage; exit 1 ;; + esac +done + +REMOTE_DIR="/tmp/haproxy-verify" + +if [[ $CFG_ONLY -eq 1 ]]; then + echo "=== HAProxy cfg 语法校验(${SSH_OPENWRT},ansible/files/01-08-haproxy/*.cfg)===" +else + echo "=== HAProxy 验证(${SSH_OPENWRT} → ${VERIFY_HOST} curl ${OPENWRT_IP}:${HTTP_PORT}/${HTTPS_PORT})===" +fi + +# 0. 
按需部署矩阵(--cfg-only 时不部署) +if [[ $CFG_ONLY -eq 1 ]]; then + : +elif [[ "$DEPLOY_MATRIX" == "http" ]]; then + echo "[0] 部署 02-05 nginx 矩阵(http)..." + (cd "$ROOT_DIR" && ansible-playbook -i ansible/inventory.ini ansible/playbooks/nginx-matrix-deploy.yml) || { echo "[ERR] nginx-matrix-deploy 失败" >&2; exit 1; } + echo " [OK] HTTP 矩阵已部署" +elif [[ "$DEPLOY_MATRIX" == "tls" ]]; then + echo "[0] 部署 nginx 矩阵 TLS 版..." + (cd "$ROOT_DIR" && ansible-playbook -i ansible/inventory.ini ansible/playbooks/nginx-matrix-tls-deploy.yml) || { echo "[ERR] nginx-matrix-tls-deploy 失败" >&2; exit 1; } + echo " [OK] TLS 矩阵已部署" + [[ -z "$HTTPS_HOSTS" ]] && HTTPS_HOSTS="test01.jackadam.top,test02.jackadam.top,test03.jackadam.top,test04.jackadam.top" +fi + +if [[ ! -d "$CFG_DIR" ]]; then + echo "[ERR] cfg 目录不存在: $CFG_DIR" >&2 + exit 1 +fi + +# 1. 拷贝 cfg 到 OpenWrt(-O 强制旧 SCP 协议,兼容无 sftp-server 的 OpenWrt) +echo "[1/4] 拷贝 cfg 到 ${SSH_OPENWRT}:${REMOTE_DIR}..." +ssh "$SSH_OPENWRT" "mkdir -p ${REMOTE_DIR}" +scp -q -O "${CFG_DIR}"/*.cfg "${SSH_OPENWRT}:${REMOTE_DIR}/" 2>/dev/null || { + echo " [INFO] scp -O 不可用,改用 ssh 管道传输..." + for f in "${CFG_DIR}"/*.cfg; do + bn=$(basename "$f") + ssh "$SSH_OPENWRT" "cat > ${REMOTE_DIR}/${bn}" < "$f" + done +} + +# 2. 语法校验 +echo "[2/4] 校验 cfg 语法..." +SYNTAX_FAIL=0 +for cfg in haproxy-no-check haproxy-http haproxy-tls haproxy-proxy-http-tls; do + if ssh "$SSH_OPENWRT" "haproxy -c -f ${REMOTE_DIR}/${cfg}.cfg" 2>/dev/null; then + echo " [OK] ${cfg}.cfg" + else + echo " [FAIL] ${cfg}.cfg" >&2 + SYNTAX_FAIL=1 + fi +done +if ssh "$SSH_OPENWRT" "haproxy -c -f ${REMOTE_DIR}/haproxy-https.cfg" 2>/dev/null; then + echo " [OK] haproxy-https.cfg(语法;运行需 /etc/ssl/haproxy.pem)" +else + echo " [SKIP] haproxy-https.cfg(缺证书)" +fi + +if [[ $SYNTAX_FAIL -eq 1 ]]; then + echo "[ERR] 部分 cfg 语法校验失败" >&2 + exit 1 +fi + +if [[ $CFG_ONLY -eq 1 ]]; then + echo + echo "[PASS] 本目录 HAProxy cfg 语法校验通过(见 ansible/files/01-08-haproxy/README.md)" + exit 0 +fi + +# 3. 
SSH onecloud 执行 curl 验证 +echo "[3/4] 经 ${VERIFY_HOST} 验证 HTTP(${OPENWRT_IP}:${HTTP_PORT})..." + +# HTTP:TLS 矩阵(有 --https-hosts)按 Host 验证;否则 02-05 路径 /demo-m1~m4 +if [[ -n "$HTTPS_HOSTS" ]]; then + # TLS 矩阵:按 Host 验证,test01~test04 对应 M1~M4 + IFS=',' read -ra HOSTS <<< "$HTTPS_HOSTS" + HTTP_FAIL=0 + for i in "${!HOSTS[@]}"; do + host="${HOSTS[$i]}" + expect="M$((i+1))" + code=$(ssh "$VERIFY_HOST" "curl -s -o /dev/null -w '%{http_code}' --max-time 5 'http://${host}:${HTTP_PORT}/' --resolve '${host}:${HTTP_PORT}:${OPENWRT_IP}' 2>/dev/null" || echo "000") + body=$(ssh "$VERIFY_HOST" "curl -s --max-time 5 'http://${host}:${HTTP_PORT}/' --resolve '${host}:${HTTP_PORT}:${OPENWRT_IP}' 2>/dev/null" || echo "") + if [[ "$code" != "200" ]]; then + echo " [FAIL] http://${host}:${HTTP_PORT}/ 返回 ${code}" >&2 + HTTP_FAIL=1 + elif [[ "$body" != *"$expect"* ]]; then + echo " [FAIL] http://${host}:${HTTP_PORT}/ body 不含 ${expect}" >&2 + HTTP_FAIL=1 + else + echo " [OK] http://${host}:${HTTP_PORT}/ 200 含 ${expect}" + fi + done +else + # 02-05 路径型 + DEMO_PATHS=(demo-m1:M1 demo-m2:M2 demo-m3:M3 demo-m4:M4) + HTTP_FAIL=0 + for item in "${DEMO_PATHS[@]}"; do + path="${item%%:*}" + expect="${item##*:}" + code=$(ssh "$VERIFY_HOST" "curl -s -o /dev/null -w '%{http_code}' --max-time 5 'http://${OPENWRT_IP}:${HTTP_PORT}/${path}/' 2>/dev/null" || echo "000") + body=$(ssh "$VERIFY_HOST" "curl -s --max-time 5 'http://${OPENWRT_IP}:${HTTP_PORT}/${path}/' 2>/dev/null" || echo "") + if [[ "$code" != "200" ]]; then + echo " [FAIL] /${path}/ 返回 ${code}" >&2 + HTTP_FAIL=1 + elif [[ "$body" != *"$expect"* ]]; then + echo " [FAIL] /${path}/ body 不含 ${expect}" >&2 + HTTP_FAIL=1 + else + echo " [OK] /${path}/ 200 含 ${expect}" + fi + done +fi + +if [[ $HTTP_FAIL -eq 1 ]]; then + echo "[ERR] HTTP 验证失败" >&2 + exit 1 +fi + +# 4. HTTPS 验证(需 --https-hosts,不带 -k 校验证书) +if [[ -n "$HTTPS_HOSTS" ]]; then + echo "[4/4] 经 ${VERIFY_HOST} 验证 HTTPS(域名 :${HTTPS_PORT},校验 ACME 证书)..." 
+ IFS=',' read -ra HOSTS <<< "$HTTPS_HOSTS" + HTTPS_FAIL=0 + for i in "${!HOSTS[@]}"; do + host="${HOSTS[$i]}" + expect="M$((i+1))" + code=$(ssh "$VERIFY_HOST" "curl -s -o /dev/null -w '%{http_code}' --max-time 10 'https://${host}:${HTTPS_PORT}/' --resolve '${host}:${HTTPS_PORT}:${OPENWRT_IP}' 2>/dev/null" || echo "000") + body=$(ssh "$VERIFY_HOST" "curl -s --max-time 10 'https://${host}:${HTTPS_PORT}/' --resolve '${host}:${HTTPS_PORT}:${OPENWRT_IP}' 2>/dev/null" || echo "") + if [[ "$code" != "200" ]]; then + echo " [FAIL] https://${host}:${HTTPS_PORT}/ 返回 ${code}" >&2 + HTTPS_FAIL=1 + elif [[ "$body" != *"$expect"* ]]; then + echo " [FAIL] https://${host}:${HTTPS_PORT}/ body 不含 ${expect}" >&2 + HTTPS_FAIL=1 + else + echo " [OK] https://${host}:${HTTPS_PORT}/ 200 含 ${expect}" + fi + done + if [[ $HTTPS_FAIL -eq 1 ]]; then + echo "[ERR] HTTPS 验证失败" >&2 + exit 1 + fi +else + echo "[4/4] 跳过 HTTPS(未指定 --https-hosts)" +fi + +echo +echo "[PASS] HAProxy 验证通过" + +# 5. 可选:更新验证矩阵 +if [[ $UPDATE_MATRIX -eq 1 ]] && [[ -f "$MATRIX_FILE" ]]; then + TODAY=$(date +%Y-%m-%d) + echo "[INFO] 更新验证矩阵..." + if command -v python3 >/dev/null 2>&1; then + python3 "${ROOT_DIR}/scripts/01-08-update-verify-matrix.py" "$MATRIX_FILE" "$TODAY" || echo " [WARN] 验证矩阵未更新" + else + echo " [WARN] 未找到 python3,请手动更新 docs/00-02-验证矩阵.md" + fi +fi diff --git a/scripts/02-verify-nginx-matrix-individual.sh b/scripts/02-verify-nginx-matrix-individual.sh new file mode 100644 index 0000000..d918aff --- /dev/null +++ b/scripts/02-verify-nginx-matrix-individual.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash +# 02 系列逐个验证:清理 → 逐个部署 02-01~02-04 → TLS 矩阵 → onecloud 验证 +# 用法:./scripts/02-verify-nginx-matrix-individual.sh +# 前置:ssh ylc61、ssh onecloud 可用;OpenWrt HAProxy 18080/18443 已部署 +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +REMOTE_HOST="${REMOTE_HOST:-ylc61}" +REMOTE_USER="${REMOTE_USER:-root}" +REMOTE_REPO="${REMOTE_REPO:-/root/实验室建设}" +VERIFY_HOST="${VERIFY_HOST:-onecloud}" +OPENWRT_IP="${OPENWRT_IP:-192.168.2.1}" +HTTP_PORT="${HTTP_PORT:-18080}" +HTTPS_PORT="${HTTPS_PORT:-18443}" +KUBECONFIG="${KUBECONFIG:-/etc/rancher/k3s/k3s.yaml}" + +SSH_OPTS="-o BatchMode=yes -o ConnectTimeout=10" +SSH_KEY="${ROOT_DIR}/.ssh/id_ed25519_k3s_192.168.2.61" +[[ -f "$SSH_KEY" ]] && SSH_OPTS="$SSH_OPTS -i $SSH_KEY" +SSH_YLC="ssh $SSH_OPTS ${REMOTE_USER}@${REMOTE_HOST}" + +echo "=== 02 系列 nginx 矩阵逐个验证(${REMOTE_HOST} + ${VERIFY_HOST})===" + +# 1. 同步 SSH 密钥与 nginx-matrix 到 ylc61 +echo "[0] 同步 SSH 密钥与 ansible 到 ${REMOTE_HOST}..." +if [[ -d "${ROOT_DIR}/.ssh" ]]; then + $SSH_YLC "mkdir -p /root/.ssh && chmod 700 /root/.ssh" + for k in "${ROOT_DIR}"/.ssh/id_ed25519_k3s_192.168.2.61 "${ROOT_DIR}"/.ssh/id_ed25519_k3s_192.168.2.62 \ + "${ROOT_DIR}"/.ssh/id_ed25519_k3s_192.168.2.63 "${ROOT_DIR}"/.ssh/id_ed25519_k3s_192.168.2.64; do + [[ -f "$k" ]] || continue + scp -q $SSH_OPTS "$k" "${k}.pub" "${REMOTE_USER}@${REMOTE_HOST}:/root/.ssh/" 2>/dev/null || true + done + $SSH_YLC "chmod 600 /root/.ssh/id_ed25519_k3s_* 2>/dev/null || true" +fi + +$SSH_YLC "mkdir -p ${REMOTE_REPO}/ansible/files" +rsync -az -e "ssh $SSH_OPTS" --delete "${ROOT_DIR}/ansible/files/nginx-matrix/" \ + "${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_REPO}/ansible/files/nginx-matrix/" 2>/dev/null || { + scp -r $SSH_OPTS "${ROOT_DIR}/ansible/files/nginx-matrix/"* \ + "${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_REPO}/ansible/files/nginx-matrix/" +} + +# 2. 清理所有 nginx 相关资源 +echo "[1] 清理 nginx 矩阵(path-based + TLS)..." 
+$SSH_YLC "KUBECONFIG=${KUBECONFIG} kubectl delete deployment,svc -n default nginx-m1 nginx-m2 nginx-m3 nginx-m4 --ignore-not-found=true" +$SSH_YLC "KUBECONFIG=${KUBECONFIG} kubectl delete ingress -n default nginx-m1 nginx-m3 nginx-m1-http nginx-m3-http --ignore-not-found=true" +$SSH_YLC "KUBECONFIG=${KUBECONFIG} kubectl delete ingressroute -n default nginx-m2 nginx-m4 nginx-m2-http nginx-m4-http --ignore-not-found=true" +$SSH_YLC "KUBECONFIG=${KUBECONFIG} kubectl delete middleware -n default stripprefix-m1 stripprefix-m2 stripprefix-m3 stripprefix-m4 --ignore-not-found=true" +$SSH_YLC "KUBECONFIG=${KUBECONFIG} kubectl delete configmap -n default nginx-m1-html nginx-m2-html nginx-m3-html nginx-m4-html --ignore-not-found=true" +sleep 2 + +# 3. 逐个部署 02-01~02-04 并验证 +MATRIX=( + "01-control-ingress.yaml:demo-m1:M1" + "02-control-ingressroute.yaml:demo-m2:M2" + "03-worker-ingress.yaml:demo-m3:M3" + "04-worker-ingressroute.yaml:demo-m4:M4" +) + +for item in "${MATRIX[@]}"; do + file="${item%%:*}" + rest="${item#*:}" + path="${rest%%:*}" + expect="${rest##*:}" + echo "[2] 部署 ${file}(${path} → ${expect})..." + $SSH_YLC "KUBECONFIG=${KUBECONFIG} kubectl apply -f ${REMOTE_REPO}/ansible/files/nginx-matrix/${file}" + $SSH_YLC "KUBECONFIG=${KUBECONFIG} kubectl wait --for=condition=ready pod -l app=nginx-m${expect#M} -n default --timeout=120s" + code=$(ssh $SSH_OPTS "$VERIFY_HOST" "curl -s -o /dev/null -w '%{http_code}' --max-time 10 'http://${OPENWRT_IP}:${HTTP_PORT}/${path}/' 2>/dev/null" || echo "000") + body=$(ssh $SSH_OPTS "$VERIFY_HOST" "curl -s --max-time 10 'http://${OPENWRT_IP}:${HTTP_PORT}/${path}/' 2>/dev/null" || echo "") + if [[ "$code" != "200" ]]; then + echo " [FAIL] /${path}/ 返回 ${code}" >&2 + exit 1 + fi + if [[ "$body" != *"$expect"* ]]; then + echo " [FAIL] /${path}/ body 不含 ${expect}" >&2 + exit 1 + fi + echo " [OK] /${path}/ 200 含 ${expect}" +done + +# 4. 部署 TLS 矩阵 +echo "[3] 部署 nginx TLS 矩阵..." 
+"${ROOT_DIR}/scripts/01-08-deploy-nginx-tls-via-ylc61.sh" + +# 5. 验证 HTTPS(test01~04) +echo "[4] 经 ${VERIFY_HOST} 验证 HTTPS(test01~04.jackadam.top:${HTTPS_PORT})..." +HTTPS_HOSTS="test01.jackadam.top,test02.jackadam.top,test03.jackadam.top,test04.jackadam.top" +IFS=',' read -ra HOSTS <<< "$HTTPS_HOSTS" +for i in "${!HOSTS[@]}"; do + host="${HOSTS[$i]}" + expect="M$((i+1))" + code=$(ssh $SSH_OPTS "$VERIFY_HOST" "curl -s -o /dev/null -w '%{http_code}' --max-time 10 'https://${host}:${HTTPS_PORT}/' --resolve '${host}:${HTTPS_PORT}:${OPENWRT_IP}' 2>/dev/null" || echo "000") + body=$(ssh $SSH_OPTS "$VERIFY_HOST" "curl -s --max-time 10 'https://${host}:${HTTPS_PORT}/' --resolve '${host}:${HTTPS_PORT}:${OPENWRT_IP}' 2>/dev/null" || echo "") + if [[ "$code" != "200" ]]; then + echo " [FAIL] https://${host}:${HTTPS_PORT}/ 返回 ${code}" >&2 + exit 1 + fi + if [[ "$body" != *"$expect"* ]]; then + echo " [FAIL] https://${host}:${HTTPS_PORT}/ body 不含 ${expect}" >&2 + exit 1 + fi + echo " [OK] https://${host}:${HTTPS_PORT}/ 200 含 ${expect}" +done + +echo "" +echo "[PASS] 02 系列 nginx 矩阵逐个验证通过(02-01~02-04 HTTP path + TLS domain)" diff --git a/scripts/03-verify-traefik-dashboard-acme.sh b/scripts/03-verify-traefik-dashboard-acme.sh new file mode 100644 index 0000000..7f3c23d --- /dev/null +++ b/scripts/03-verify-traefik-dashboard-acme.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# 03-03 Traefik Dashboard + ACME 合并配置验证 +# 用法:./scripts/03-verify-traefik-dashboard-acme.sh [--apply] +# 默认:仅核对模板与当前集群状态;加 --apply 时尝试应用 traefik-dashboard-acme 并验证(可能触发 Traefik 重启,新 Pod 需重新获取证书) +# 前置:03-02 ACME 已部署(含 cloudflare-api-token);ssh ylc61 可用 +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +REMOTE_HOST="${REMOTE_HOST:-ylc61}" +REMOTE_USER="${REMOTE_USER:-root}" +CFG_SRC="${ROOT_DIR}/ansible/files/traefik-dashboard-acme/traefik-dashboard-acme.yaml" +ENTRY_IP="${ENTRY_IP:-192.168.2.61}" +OPENWRT_IP="${OPENWRT_IP:-192.168.2.1}" +HTTPS_PORT="${HTTPS_PORT:-18443}" +DO_APPLY=0 +[[ "${1:-}" == "--apply" ]] && DO_APPLY=1 + +SSH_OPTS="-o BatchMode=yes -o ConnectTimeout=10" +SSH_KEY="${ROOT_DIR}/.ssh/id_ed25519_k3s_192.168.2.61" +[[ -f "$SSH_KEY" ]] && SSH_OPTS="$SSH_OPTS -i $SSH_KEY" +SSH_CMD="ssh $SSH_OPTS ${REMOTE_USER}@${REMOTE_HOST}" +KUBECONFIG="/etc/rancher/k3s/k3s.yaml" + +echo "=== 03-03 Traefik Dashboard + ACME 验证 ===" + +# 1. 核对 traefik-dashboard-acme 模板包含 03-01 + 03-02 要素 +echo "[1/3] 核对模板(dashboard + ACME + ping + PROXY)..." +grep -q "api.dashboard=true" "$CFG_SRC" && grep -q "api.insecure=true" "$CFG_SRC" || { echo " [FAIL] 缺少 dashboard 参数"; exit 1; } +grep -q "certificatesresolvers.cloudflare" "$CFG_SRC" && grep -q "acme.dnschallenge" "$CFG_SRC" || { echo " [FAIL] 缺少 ACME 参数"; exit 1; } +grep -q "ping.entryPoint=websecure" "$CFG_SRC" && grep -q "proxyProtocol.trustedIPs" "$CFG_SRC" || { echo " [FAIL] 缺少 ping/PROXY 参数"; exit 1; } +grep -q "ingressRoute:" "$CFG_SRC" && grep -q "dashboard:" "$CFG_SRC" || true +echo " [OK] 模板包含 03-01 + 03-02 合并要素" + +# 2. 当前集群 ACME 状态 +echo "[2/3] 当前集群 ACME(test01.jackadam.top)..." +CODE=$(curl -sk -o /dev/null -w '%{http_code}' --max-time 10 "https://test01.jackadam.top/" --resolve "test01.jackadam.top:443:${ENTRY_IP}" 2>/dev/null || echo "000") +[[ "$CODE" != "200" ]] && CODE=$(curl -sk -o /dev/null -w '%{http_code}' --max-time 10 "https://test01.jackadam.top:${HTTPS_PORT}/" --resolve "test01.jackadam.top:${HTTPS_PORT}:${OPENWRT_IP}" 2>/dev/null || echo "000") +[[ "$CODE" == "200" ]] && echo " [OK] ACME TLS 200" || echo " [WARN] ACME 返回 ${CODE}" + +# 3. 可选 apply +if [[ $DO_APPLY -eq 1 ]]; then + echo "[3/3] 应用 traefik-dashboard-acme(会触发 Traefik 重启)..." 
+ EMAIL=$($SSH_CMD "KUBECONFIG=${KUBECONFIG} kubectl get helmchartconfig traefik -n kube-system -o jsonpath='{.spec.valuesContent}' 2>/dev/null" | grep -oE 'acme\.email=[^[:space:]\"'"'"']+' | cut -d= -f2 | head -1) + [[ -z "$EMAIL" ]] && EMAIL="" + $SSH_CMD "mkdir -p /tmp/traefik-verify" + scp -q $SSH_OPTS "$CFG_SRC" "${REMOTE_USER}@${REMOTE_HOST}:/tmp/traefik-verify/traefik-dashboard-acme.yaml" + $SSH_CMD "sed -i 's||'"$EMAIL"'|g' /tmp/traefik-verify/traefik-dashboard-acme.yaml" + $SSH_CMD "KUBECONFIG=${KUBECONFIG} kubectl apply -f /tmp/traefik-verify/traefik-dashboard-acme.yaml" + $SSH_CMD "KUBECONFIG=${KUBECONFIG} kubectl -n kube-system rollout status deploy/traefik --timeout=180s" || echo " [WARN] rollout 超时,可检查 Pod 与 ACME 日志" + CODE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 "http://${ENTRY_IP}/dashboard/" 2>/dev/null || echo "000") + [[ "$CODE" == "200" || "$CODE" == "307" ]] && echo " [OK] Dashboard 返回 ${CODE}" || echo " [WARN] Dashboard 返回 ${CODE}" +else + echo "[3/3] 跳过 apply(加 --apply 可尝试应用并验证 Dashboard)" +fi + +echo "" +echo "[PASS] 03-03 验证完成" diff --git a/scripts/README.md b/scripts/README.md index f7c33fb..b0eb65e 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -3,6 +3,18 @@ 本目录集中维护 K3s 排障与恢复脚本。统一约定:**在仓库根目录执行**,使用 `./scripts/...` 路径调用。 ## 目录 +- `scripts/01-08-deploy-openwrt-haproxy.sh` + - 一键部署:uhttpd 改回 80/443(IPv4+IPv6),HAProxy 部署到 18080/18443(默认 haproxy-tls) +- `scripts/01-08-deploy-nginx-tls-via-ylc61.sh` + - 经 ssh ylc61 在控制节点上一键部署 nginx TLS 矩阵(M1~M4,test01~04);同步 ansible + SSH 密钥后执行 playbook +- `scripts/03-verify-traefik-dashboard-acme.sh` + - 03-03 配置验证:核对 traefik-dashboard-acme 模板合并 03-01+03-02 要素;检查当前 ACME;可选 `--apply` 尝试应用(会触发 Traefik 重启) +- `scripts/02-verify-nginx-matrix-individual.sh` + - 02 系列逐个验证:清理 → 逐个部署 02-01~02-04(path-based)→ TLS 矩阵 → onecloud 验证 HTTP path + HTTPS domain;验证通过后需手动更新 `docs/00-02-验证矩阵.md` +- `scripts/01-08-verify-haproxy-openwrt.sh` + - 家庭私网默认:调用主脚本,18080/18443、onecloud 第三方验证(见 
`docs/01-08-openwrt-haproxy.md`) +- `scripts/01-08-verify-haproxy.sh` + - **核心**:校验 `ansible/files/01-08-haproxy/*.cfg` 在 OpenWrt 上 `haproxy -c` 通过;`--cfg-only` 仅做语法校验、不 curl。完整流程另经 ssh onecloud 验证 HTTP/HTTPS;可选 `--deploy-matrix http|tls`、`--https-hosts`;验证通过可更新验证矩阵 - `scripts/ssh/setup-k3s-workers-ssh.sh` - 为 Ansible 自动化准备 SSH:为所有 k3s 节点配置 jack + root 公钥及每节点私钥(配合 `docs/01-07-节点初始化-ansible-实践.md`) - `scripts/diag/entrypath/entrypath.sh` @@ -20,9 +32,8 @@ ## 从仓库根执行示例 -`bas\1 - -\21) 初始化排障 SSH 密钥(可选) +```bash +# 1) 初始化排障 SSH 密钥(可选) ./scripts/diag/ssh/setup-ssh-keys.sh # 2) 验证 SSH(建议) @@ -52,7 +63,7 @@ --pod-netns-trace-mode y \ --pod-netns-trace-seconds 12 \ --non-interactive -` +``` ## 说明文档