日常更新

This commit is contained in:
2026-03-29 09:08:01 +08:00
parent 31709425e2
commit befdefd222
224 changed files with 7240 additions and 3297 deletions

View File

@@ -8,13 +8,15 @@
manifest_src: "{{ playbook_dir }}/../../files/03-02/traefik-acme.yaml"
manifest_dest: /tmp/traefik-acme.yaml
acme_email: "{{ lookup('env', 'ACME_EMAIL') | default('', true) }}"
cf_api_token: "{{ lookup('env', 'CF_API_TOKEN') | default('', true) }}"
tasks:
- name: "Gate - skip apply when ACME_EMAIL missing"
when: acme_email | trim == ""
block:
- ansible.builtin.debug:
msg: "[GATE] skipped doc_id=03-02 reason=missing_env missing=ACME_EMAIL"
- meta: end_play
ansible.builtin.include_role:
name: verify_common
tasks_from: gate-debug-end-play.yml
vars:
verify_gate_message: "[GATE] skipped doc_id=03-02 reason=missing_env missing=ACME_EMAIL"
- name: Copy manifest
ansible.builtin.copy:
@@ -28,6 +30,21 @@
regexp: "<YOUR_REAL_EMAIL>"
replace: "{{ acme_email | trim }}"
- name: Enable ACME staging CA when ACME_CA_STAGING=1
when: (lookup('env', 'ACME_CA_STAGING') | default('0', true) | trim) == "1"
ansible.builtin.replace:
path: "{{ manifest_dest }}"
regexp: '^\s*# - "--certificatesresolvers\.cloudflare\.acme\.caserver=https://acme-staging-v02\.api\.letsencrypt\.org/directory".*$'
replace: ' - "--certificatesresolvers.cloudflare.acme.caserver=https://acme-staging-v02.api.letsencrypt.org/directory"'
- name: Ensure Cloudflare API token Secret before Traefik ACME apply
when: (cf_api_token | trim | length) > 0
ansible.builtin.include_role:
name: verify_common
tasks_from: ensure-cloudflare-api-token-secret.yml
vars:
verify_cf_api_token: "{{ cf_api_token | trim }}"
- name: Apply manifest
ansible.builtin.shell: |
set -e
@@ -42,7 +59,8 @@
run_once: true
vars:
k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
nginx_matrix_tls_enable: "{{ nginx_matrix_tls_enable | default(false) | bool }}"
_nginx_matrix_tls_enable: "{{ nginx_matrix_tls_enable | default((lookup('env', 'NGINX_MATRIX_TLS_ENABLE') | default('', true) | trim | lower in ['true', '1', 'yes']) | bool) }}"
nginx_matrix_tls_enabled: "{{ _nginx_matrix_tls_enable | bool }}"
manifests_path: "{{ playbook_dir }}/../../files/03-02"
tls_domains:
- test01.jackadam.top
@@ -51,24 +69,30 @@
- test04.jackadam.top
pre_tasks:
- name: Gate - skip nginx matrix TLS when nginx_matrix_tls_enable=false
when: not nginx_matrix_tls_enable
when: not nginx_matrix_tls_enabled
block:
- ansible.builtin.debug:
msg: "[GATE] skipped doc_id=03-02 action=nginx-matrix-tls var=nginx_matrix_tls_enable"
msg: "[SKIP] optional doc_id=03-02 action=nginx-matrix-tls var=nginx_matrix_tls_enable"
- meta: end_play
tasks:
- name: Deploy nginx matrix TLS (mode=deploy)
when: (mode | default('deploy')) == 'deploy'
block:
- name: Ensure manifests path exists
- name: Ensure manifests path exists (controller repo path)
ansible.builtin.stat:
path: "{{ manifests_path }}"
register: manifests_stat
delegate_to: localhost
become: false
run_once: true
- name: Fail if manifests not found
ansible.builtin.fail:
msg: "manifests 未找到: {{ manifests_path }},请从仓库根目录或 ansible 同级执行"
when: not manifests_stat.stat.exists
delegate_to: localhost
become: false
run_once: true
- name: Ensure control-plane label on k3s_server nodes (for M1)
ansible.builtin.shell: |
@@ -105,12 +129,21 @@
ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout restart deployment nginx-m1 nginx-m2 nginx-m3 nginx-m4 -n default
changed_when: true
- name: Wait for nginx pods to be ready
- name: Wait for nginx rollouts stable after ConfigMap restart
ansible.builtin.shell: |
KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod -l app=nginx-m1 --timeout=60s
KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod -l app=nginx-m2 --timeout=60s
KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod -l app=nginx-m3 --timeout=120s
KUBECONFIG={{ k3s_kubeconfig }} kubectl wait --for=condition=ready pod -l app=nginx-m4 --timeout=120s
set -euo pipefail
KCFG={{ k3s_kubeconfig | quote }}
export KUBECONFIG="$KCFG"
for dep in nginx-m1 nginx-m2; do
echo "[OC-ASSERT] assertion=nginx_matrix_tls_rollout deployment=${dep} timeout=60s"
kubectl rollout status "deployment/$dep" -n default --timeout=60s
done
for dep in nginx-m3 nginx-m4; do
echo "[OC-ASSERT] assertion=nginx_matrix_tls_rollout deployment=${dep} timeout=120s"
kubectl rollout status "deployment/$dep" -n default --timeout=120s
done
args:
executable: /bin/bash
changed_when: false
- name: Verify nginx matrix TLS resources
@@ -187,29 +220,85 @@
vars:
k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
acme_email: "{{ lookup('env', 'ACME_EMAIL') | default('', true) }}"
cf_api_token: "{{ lookup('env', 'CF_API_TOKEN') | default('', true) }}"
tasks:
- name: "Gate - skip verify when ACME_EMAIL missing"
when: acme_email | trim == ""
block:
- ansible.builtin.debug:
msg: "[GATE] skipped doc_id=03-02 reason=missing_env missing=ACME_EMAIL"
- meta: end_play
ansible.builtin.include_role:
name: verify_common
tasks_from: gate-debug-end-play.yml
vars:
verify_gate_message: "[GATE] skipped doc_id=03-02 reason=missing_env missing=ACME_EMAIL"
- name: Assert Cloudflare token secret exists
- name: Ensure Cloudflare token Secret from CF_API_TOKEN (real-pass)
when: (cf_api_token | trim | length) > 0
ansible.builtin.include_role:
name: verify_common
tasks_from: ensure-cloudflare-api-token-secret.yml
vars:
verify_cf_api_token: "{{ cf_api_token | trim }}"
- name: Check cloudflare-api-token secret exists
ansible.builtin.shell: |
set -e
set -euo pipefail
KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system get secret cloudflare-api-token
args:
executable: /bin/bash
changed_when: false
register: cloudflare_secret_check
failed_when: false
- name: Rollout status traefik (kube-system)
- name: Gate - no CF_API_TOKEN and secret missing
when: cloudflare_secret_check.rc != 0 and (cf_api_token | trim | length) == 0
ansible.builtin.include_role:
name: verify_common
tasks_from: gate-debug-end-play.yml
vars:
verify_gate_message: "[GATE] skipped doc_id=03-02 reason=missing_dependency missing=cloudflare-api-token skip_scope=traefik-acme"
- name: Fail when secret missing but CF_API_TOKEN was set
when: cloudflare_secret_check.rc != 0 and (cf_api_token | trim | length) > 0
ansible.builtin.fail:
msg: "已设置 CF_API_TOKEN 但 cloudflare-api-token Secret 仍不可用,请检查 apiserver 权限与命名空间 kube-system"
# Helm/ACME 换新 RS 时,旧 Pod 可能长期处于「pending termination」,导致 rollout status 永久卡住。
# 实验室验收:scale 0 → 清 Pod → scale 1(入口短暂不可用,可接受)。
- name: Unstick Traefik deployment via scale down/up (kube-system)
ansible.builtin.shell: |
set -e
KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/traefik -n kube-system --timeout=300s
set -euo pipefail
export KUBECONFIG={{ k3s_kubeconfig }}
echo "[OC-ASSERT] assertion=traefik_rollout_unblock phase=scale_reset"
kubectl scale deployment traefik -n kube-system --replicas=0
for i in $(seq 1 90); do
rep=$(kubectl get deployment traefik -n kube-system -o jsonpath='{.status.replicas}' 2>/dev/null || echo 1)
[ "${rep:-1}" = "0" ] && break
sleep 2
done
for sel in "app.kubernetes.io/name=traefik" "app.kubernetes.io/instance=traefik"; do
kubectl get pods -n kube-system -l "$sel" -o name 2>/dev/null | while read -r p; do
[ -z "$p" ] && continue
kubectl delete "$p" -n kube-system --grace-period=0 --force --ignore-not-found=true || true
done
done
{ kubectl get pods -n kube-system --no-headers -o custom-columns=:metadata.name 2>/dev/null | grep -E '^traefik-[0-9a-f]+-' || true; } | while read -r n; do
[ -z "$n" ] && continue
kubectl delete pod "$n" -n kube-system --grace-period=0 --force --ignore-not-found=true || true
done
kubectl scale deployment traefik -n kube-system --replicas=1
sleep 3
args:
executable: /bin/bash
changed_when: false
changed_when: true
failed_when: false
- name: Rollout status traefik (kube-system)
ansible.builtin.include_role:
name: verify_common
tasks_from: kubectl-rollout-status.yml
vars:
verify_rollout_ref: deployment/traefik
verify_rollout_namespace: kube-system
verify_rollout_timeout_s: 600
- name: Teardown 03-02 Traefik ACME (optional)
hosts: k3s_server