Files
Deploy-Laboratory/ansible/playbooks/verify/03-03.yml
2026-03-29 09:08:01 +08:00

232 lines
9.8 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
---
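# 03-03 Traefik Dashboard + ACME (merged HelmChartConfig version)
# Shares the Traefik HelmChartConfig resource name (traefik) with 03-02. Do not alternate full verification runs of the two without coordination: whichever is applied last overwrites the other.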
#
- name: Deploy 03-03 Traefik Dashboard + ACME (HelmChartConfig)
  # Renders the 03-03 manifest into a scratch file on the k3s server and applies
  # it with kubectl. Inputs are read through the `env` lookup:
  #   ACME_EMAIL       - required; when empty the gate task below is included
  #   CF_API_TOKEN     - optional; when set, the Cloudflare Secret is ensured first
  #   ACME_CA_STAGING  - optional; "1" uncomments the Let's Encrypt staging CA flag
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
    manifest_src: "{{ playbook_dir }}/../../files/03-03/traefik-dashboard-acme.yaml"
    # The rendered copy is left in place after this play; the teardown play
    # deletes resources from the same path.
    manifest_dest: /tmp/traefik-dashboard-acme.yaml
    acme_email: "{{ lookup('env', 'ACME_EMAIL') | default('', true) }}"
    cf_api_token: "{{ lookup('env', 'CF_API_TOKEN') | default('', true) }}"
  tasks:
    # NOTE(review): gate-debug-end-play.yml is not visible here; judging by its
    # name it prints the message and ends the play - confirm in verify_common.
    - name: "Gate - skip apply when ACME_EMAIL missing"
      when: acme_email | trim == ""
      ansible.builtin.include_role:
        name: verify_common
        tasks_from: gate-debug-end-play.yml
      vars:
        verify_gate_message: "[GATE] skipped doc_id=03-03 reason=missing_env missing=ACME_EMAIL"

    - name: Copy manifest
      ansible.builtin.copy:
        src: "{{ manifest_src }}"
        dest: "{{ manifest_dest }}"
        mode: "0644"

    - name: Replace ACME email placeholder
      ansible.builtin.replace:
        path: "{{ manifest_dest }}"
        regexp: "<YOUR_REAL_EMAIL>"
        replace: "{{ acme_email | trim }}"

    # The indent is matched with [ \t]* rather than \s*: the pattern is applied
    # in multiline mode, where \s* could also swallow preceding blank lines.
    # The captured indent is re-emitted so the uncommented list item lands at
    # the column where the "#" was, instead of at a hard-coded column.
    # NOTE(review): assumes the "#" sits at the list-item column in the
    # manifest - confirm against files/03-03/traefik-dashboard-acme.yaml.
    - name: Enable ACME staging CA when ACME_CA_STAGING=1
      when: (lookup('env', 'ACME_CA_STAGING') | default('0', true) | trim) == "1"
      ansible.builtin.replace:
        path: "{{ manifest_dest }}"
        regexp: '^([ \t]*)# - "--certificatesresolvers\.cloudflare\.acme\.caserver=https://acme-staging-v02\.api\.letsencrypt\.org/directory".*$'
        replace: '\1- "--certificatesresolvers.cloudflare.acme.caserver=https://acme-staging-v02.api.letsencrypt.org/directory"'

    - name: Ensure Cloudflare API token Secret before Traefik ACME apply
      when: (cf_api_token | trim | length) > 0
      ansible.builtin.include_role:
        name: verify_common
        tasks_from: ensure-cloudflare-api-token-secret.yml
      vars:
        verify_cf_api_token: "{{ cf_api_token | trim }}"

    # A single kubectl invocation needs no shell; KUBECONFIG is passed through
    # the task environment. "changed" is derived from kubectl's per-resource
    # result lines so an apply that reports everything "unchanged" stays green.
    - name: Apply manifest
      ansible.builtin.command:
        argv:
          - kubectl
          - apply
          - "-f"
          - "{{ manifest_dest }}"
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      register: traefik_03_03_apply
      changed_when: >-
        ' created' in traefik_03_03_apply.stdout
        or ' configured' in traefik_03_03_apply.stdout
- name: Verify 03-03 Traefik Dashboard + ACME (rollout + dashboard http)
  # Verifies the 03-03 deployment: checks the cloudflare-api-token Secret,
  # resets the Traefik deployment via scale 0 -> 1, waits for rollout and a
  # Ready pod, then probes the dashboard over HTTP. The probe goes to
  # TRAEFIK_DASHBOARD_VERIFY_URL when that variable is set, otherwise through
  # a kubectl port-forward to the Traefik pod.
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
    acme_email: "{{ lookup('env', 'ACME_EMAIL') | default('', true) }}"
    cf_api_token: "{{ lookup('env', 'CF_API_TOKEN') | default('', true) }}"
    _traefik_dash_url_env: "{{ lookup('env', 'TRAEFIK_DASHBOARD_VERIFY_URL') | default('', true) | trim }}"
    # The fallback branch references k3s_server_ip, which is not defined in this
    # play; the tasks below only read this variable when the env URL is set.
    traefik_dashboard_probe_url: "{{ _traefik_dash_url_env if (_traefik_dash_url_env | length > 0) else ('http://' ~ k3s_server_ip ~ '/dashboard/') }}"
  tasks:
    # NOTE(review): gate-debug-end-play.yml is not visible here; judging by its
    # name it prints the message and ends the play - confirm in verify_common.
    - name: "Gate - skip verify when ACME_EMAIL missing"
      when: acme_email | trim == ""
      ansible.builtin.include_role:
        name: verify_common
        tasks_from: gate-debug-end-play.yml
      vars:
        verify_gate_message: "[GATE] skipped doc_id=03-03 reason=missing_env missing=ACME_EMAIL"

    - name: Ensure Cloudflare token Secret from CF_API_TOKEN (real-pass)
      when: (cf_api_token | trim | length) > 0
      ansible.builtin.include_role:
        name: verify_common
        tasks_from: ensure-cloudflare-api-token-secret.yml
      vars:
        verify_cf_api_token: "{{ cf_api_token | trim }}"

    # Never fails by itself; the two tasks that follow branch on the recorded rc.
    - name: Check cloudflare-api-token secret exists
      ansible.builtin.command:
        argv:
          - kubectl
          - "-n"
          - kube-system
          - get
          - secret
          - cloudflare-api-token
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      register: cloudflare_secret_check
      changed_when: false
      failed_when: false

    - name: Gate - no CF_API_TOKEN and secret missing
      when: cloudflare_secret_check.rc != 0 and (cf_api_token | trim | length) == 0
      ansible.builtin.include_role:
        name: verify_common
        tasks_from: gate-debug-end-play.yml
      vars:
        verify_gate_message: "[GATE] skipped doc_id=03-03 reason=missing_dependency missing=cloudflare-api-token skip_scope=traefik-dashboard-acme"

    - name: Fail when secret missing but CF_API_TOKEN was set
      when: cloudflare_secret_check.rc != 0 and (cf_api_token | trim | length) > 0
      ansible.builtin.fail:
        msg: "已设置 CF_API_TOKEN 但 cloudflare-api-token Secret 仍不可用,请检查 apiserver 权限与命名空间 kube-system"

    # Same approach as the 03-02 verify play: when an old ReplicaSet is pending
    # termination the rollout can hang forever. The scale reset briefly
    # interrupts ingress. The task is best-effort (failed_when: false).
    - name: Unstick Traefik deployment via scale down/up (kube-system)
      ansible.builtin.shell: |
        set -euo pipefail
        export KUBECONFIG={{ k3s_kubeconfig }}
        echo "[OC-ASSERT] assertion=traefik_rollout_unblock phase=scale_reset doc_id=03-03"
        kubectl scale deployment traefik -n kube-system --replicas=0
        # From here on the deployment sits at 0 replicas: guarantee it is scaled
        # back up even if a later step aborts the script under `set -e`.
        trap 'kubectl scale deployment traefik -n kube-system --replicas=1 || true' EXIT
        # Wait up to 90 x 2s for the replicas to drain. .status.replicas is
        # omitted from the object once it reaches zero, so an empty answer from
        # a successful query means "drained"; a failed query means "retry".
        for _ in $(seq 1 90); do
          if rep=$(kubectl get deployment traefik -n kube-system -o jsonpath='{.status.replicas}' 2>/dev/null); then
            if [ -z "$rep" ] || [ "$rep" = "0" ]; then
              break
            fi
          fi
          sleep 2
        done
        # Force-delete leftover pods, first by label, then by name pattern.
        for sel in "app.kubernetes.io/name=traefik" "app.kubernetes.io/instance=traefik"; do
          { kubectl get pods -n kube-system -l "$sel" -o name 2>/dev/null || true; } | while read -r p; do
            [ -z "$p" ] && continue
            kubectl delete "$p" -n kube-system --grace-period=0 --force --ignore-not-found=true || true
          done
        done
        { kubectl get pods -n kube-system --no-headers -o custom-columns=:metadata.name 2>/dev/null | grep -E '^traefik-[0-9a-f]+-' || true; } | while read -r n; do
          [ -z "$n" ] && continue
          kubectl delete pod "$n" -n kube-system --grace-period=0 --force --ignore-not-found=true || true
        done
        trap - EXIT
        kubectl scale deployment traefik -n kube-system --replicas=1
        sleep 3
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    - name: Rollout status traefik (kube-system)
      ansible.builtin.include_role:
        name: verify_common
        tasks_from: kubectl-rollout-status.yml
      vars:
        verify_rollout_ref: deployment/traefik
        verify_rollout_namespace: kube-system
        verify_rollout_timeout_s: 600

    # kubectl rollout status also reports success while spec.replicas=0, so
    # wait explicitly until a pod is Ready.
    - name: Wait for traefik Pod Ready (kube-system)
      ansible.builtin.command:
        argv:
          - kubectl
          - wait
          - "--for=condition=ready"
          - pod
          - "-l"
          - "app.kubernetes.io/name=traefik,app.kubernetes.io/instance=traefik-kube-system"
          - "-n"
          - kube-system
          - "--timeout=180s"
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      changed_when: false

    # External probe: issued from localhost (delegate_to) without privilege
    # escalation, only when TRAEFIK_DASHBOARD_VERIFY_URL is set.
    - name: HTTP probe Traefik Dashboard via TRAEFIK_DASHBOARD_VERIFY_URL (control 机)
      when: _traefik_dash_url_env | length > 0
      ansible.builtin.uri:
        url: "{{ traefik_dashboard_probe_url }}"
        method: GET
        follow_redirects: all
        status_code: [200]
        timeout: 15
      register: traefik_03_03_dashboard_http
      changed_when: false
      delegate_to: localhost
      become: false

    - name: OC3 summary for Traefik Dashboard HTTP (external URL)
      when: _traefik_dash_url_env | length > 0
      ansible.builtin.debug:
        msg: "[OC-ASSERT] assertion=traefik_03_03_dashboard_http phase=http probe=uri status={{ traefik_03_03_dashboard_http.status | default('') }} url={{ traefik_dashboard_probe_url }}"

    # Fallback probe: port-forward a random local port (32000-32767) to the
    # Traefik pod and request /dashboard/, trying container ports 8000, 8080
    # and 80 in turn. The task succeeds as soon as one port answers.
    - name: HTTP probe Traefik Dashboard (port-forward traefik Pod试 web 容器端口 8000/8080)
      when: _traefik_dash_url_env | length == 0
      ansible.builtin.shell: |
        set -euo pipefail
        export KUBECONFIG={{ k3s_kubeconfig }}
        POD=$(kubectl get pods -n kube-system \
          -l 'app.kubernetes.io/name=traefik,app.kubernetes.io/instance=traefik-kube-system' \
          -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
        test -n "$POD"
        local_port=$(shuf -i 32000-32767 -n 1)
        # Private scratch log instead of a fixed, shared /tmp file name.
        pf_log=$(mktemp /tmp/traefik-03-03-pf.XXXXXX)
        PF_PID=""
        # One exit handler stops any live port-forward and removes the log.
        cleanup() {
          [ -z "$PF_PID" ] || kill "$PF_PID" 2>/dev/null || true
          rm -f "$pf_log"
        }
        trap cleanup EXIT
        ok=0
        for cport in 8000 8080 80; do
          kubectl port-forward -n kube-system "pod/$POD" "${local_port}:${cport}" >"$pf_log" 2>&1 &
          PF_PID=$!
          # Wait up to 20s for the tunnel; give up early if port-forward exited.
          for _ in $(seq 1 20); do
            grep -q "Forwarding from" "$pf_log" 2>/dev/null && break
            kill -0 "$PF_PID" 2>/dev/null || break
            sleep 1
          done
          if curl -sfL --connect-timeout 3 --max-time 12 -o /dev/null "http://127.0.0.1:${local_port}/dashboard/" 2>/dev/null; then
            ok=1
            echo "[OC-ASSERT] assertion=traefik_03_03_dashboard_http phase=http probe=port_forward status=200 pod_port=${cport} local_port=${local_port}"
            break
          fi
          kill "$PF_PID" 2>/dev/null || true
          wait "$PF_PID" 2>/dev/null || true
          PF_PID=""
        done
        test "$ok" = "1"
      args:
        executable: /bin/bash
      changed_when: false
- name: Teardown 03-03 Traefik Dashboard + ACME (optional)
  # Deletes the resources described by the rendered manifest at manifest_dest.
  # The play stops immediately when ACME_EMAIL is empty, and the delete itself
  # only runs when verify_teardown evaluates to "1".
  hosts: k3s_server
  run_once: true
  become: true
  vars:
    manifest_dest: /tmp/traefik-dashboard-acme.yaml
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
    acme_email: "{{ lookup('env', 'ACME_EMAIL') | default('', true) }}"
    # VERIFY_TEARDOWN is read as an Ansible variable here (not through the env
    # lookup used for the other inputs); it defaults to '1' when undefined.
    verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}"
  tasks:
    - name: Skip teardown when gated
      ansible.builtin.meta: end_play
      when: acme_email | trim == ""

    - name: Delete resources when VERIFY_TEARDOWN=1
      ansible.builtin.shell:
        cmd: |
          set -e
          KUBECONFIG={{ k3s_kubeconfig }} kubectl delete -f {{ manifest_dest }} --ignore-not-found=true
        executable: /bin/bash
      changed_when: true
      when: verify_teardown == "1"