feat: 按 doc_id 重组 ansible/files 与验证框架

- ansible/files 改为与文档 XX-YY 对齐的目录结构,更新相关 playbook 路径
- 新增 scripts/verify.sh 与 ansible/playbooks/verify/*.yml,移除单体 verify-matrix.yml
- 补充 docs/00-02 矩阵状态、00-05 验证框架与流程、00-04 环境与 ylc65 工作机说明
- 增加 k3s 存储准备、Longhorn、local-path 等 playbook 与辅助脚本

Made-with: Cursor
This commit is contained in:
2026-03-26 07:01:14 +08:00
parent a67788de56
commit 8c43761962
192 changed files with 4006 additions and 320 deletions

View File

@@ -0,0 +1,251 @@
---
# Helm 安装 Longhorn(与 docs/03-07 一致)。在控制节点执行,依赖 KUBECONFIG=/etc/rancher/k3s/k3s.yaml
# 变量:group_vars/all.yml 中 longhorn_chart_version、longhorn_install_node_packages、longhorn_apply_local_path_lab
# Play: prepare every host in the k3s_nodes group for Longhorn.
# Variables read by this play:
#   longhorn_install_node_packages   (default: true)    install iSCSI/NFS packages and start iscsid
#   longhorn_prepull_images          (default: true)    pre-pull the Longhorn images with ctr
#   longhorn_chart_version           (no default)       used as the image tag, prefixed with "v"
#   longhorn_support_bundle_kit_tag  (default: v0.0.45) tag of the support-bundle-kit image
#   k3s_data_dir                     (no default)       parent directory of the Longhorn data dir
- name: Longhorn node packages (iSCSI, NFS client)
  hosts: k3s_nodes
  become: true
  tasks:
    # Both package tasks are skipped together when longhorn_install_node_packages is false.
    - name: Install Longhorn OS dependencies
      when: longhorn_install_node_packages | default(true) | bool
      block:
        - name: Install iscsi + nfs (dnf/yum)
          ansible.builtin.package:
            name:
              - iscsi-initiator-utils
              - nfs-utils
            state: present

        - name: Enable iscsid
          ansible.builtin.systemd:
            name: iscsid
            enabled: true
            state: started

    # Created regardless of the package switch above; owner-only permissions.
    - name: Ensure Longhorn data subdirectory exists on all nodes
      ansible.builtin.file:
        path: "{{ k3s_data_dir }}/longhorn"
        state: directory
        mode: "0700"

    # Pulls each image through the k3s containerd socket, retrying up to five
    # times with a growing back-off (3s, 6s, 9s, ...). The task fails as soon as
    # one image cannot be pulled after all retries.
    - name: Pre-pull Longhorn images on all nodes (optional, avoid DockerHub EOF/ImagePullBackOff)
      when: longhorn_prepull_images | default(true) | bool
      ansible.builtin.shell: |
        set -e
        CTR="ctr --address /run/k3s/containerd/containerd.sock -n k8s.io"
        imgs=(
          "docker.io/longhornio/longhorn-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-ui:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-share-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-engine:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-instance-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/backing-image-manager:v{{ longhorn_chart_version }}"
          # This image does not follow the chart version; override the tag via
          # longhorn_support_bundle_kit_tag when changing longhorn_chart_version.
          "docker.io/longhornio/support-bundle-kit:{{ longhorn_support_bundle_kit_tag | default('v0.0.45') }}"
        )
        for img in "${imgs[@]}"; do
          ok=0
          for i in 1 2 3 4 5; do
            echo "[pull] $img (try $i/5)"
            if $CTR images pull "$img"; then
              ok=1
              break
            fi
            sleep $((i * 3))
          done
          if [ "$ok" -ne 1 ]; then
            echo "[ERR] failed pulling $img after retries"
            exit 1
          fi
        done
      args:
        executable: /bin/bash
      # The pull is always re-run, so the task always reports "changed".
      changed_when: true
# Play: install Longhorn with Helm, once, on a host of the k3s_server group.
# Variables read by this play:
#   longhorn_chart_version  (no default)  chart version passed to `helm --version`
#   longhorn_values_src     local values file copied to the server
#   longhorn_values_dest    destination of the values file on the server
#   k3s_kubeconfig          kubeconfig exported to every kubectl/helm task
#
# NOTE(review): the cleanup tasks below carry no `when:` condition. On every run
# they uninstall an existing "longhorn" release and delete the Longhorn CRDs,
# ClusterRoles, ClusterRoleBindings and PriorityClass before reinstalling.
# Confirm this reset-then-install behaviour is intended before running this
# against a cluster whose Longhorn install should be kept.
- name: Install Longhorn with Helm on first server
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    longhorn_values_src: "{{ playbook_dir }}/../files/03-07-longhorn/values-lab.yaml"
    longhorn_values_dest: /root/longhorn-values-lab.yaml
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
  tasks:
    # A failed package install is tolerated here; the `which helm` task below
    # is the actual gate.
    - name: Install helm package (Fedora/RHEL family)
      ansible.builtin.package:
        name: helm
        state: present
      ignore_errors: true
      register: helm_pkg

    - name: Hint if helm package install failed (install Helm 3 manually if needed)
      ansible.builtin.debug:
        msg: "dnf/yum 未装上 helm 时,请见 https://helm.sh/docs/intro/install/"
      when: helm_pkg.failed | default(false)

    - name: Fail if helm binary still unavailable
      ansible.builtin.command: which helm
      register: helm_which
      changed_when: false
      failed_when: helm_which.rc != 0

    - name: Copy lab values to server
      ansible.builtin.copy:
        src: "{{ longhorn_values_src }}"
        dest: "{{ longhorn_values_dest }}"
        mode: "0600"

    # Best effort (failed_when: false). If the namespace is stuck in
    # Terminating, its spec.finalizers list is emptied through the /finalize
    # subresource. The JSON is piped between the commands instead of being
    # written to fixed file names under /tmp.
    - name: Ensure longhorn-system namespace is not stuck Terminating (force finalize if needed)
      ansible.builtin.shell: |
        set -eo pipefail
        ns="longhorn-system"
        phase="$(kubectl get ns "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
        if [ "$phase" = "Terminating" ]; then
          echo "[WARN] namespace $ns is Terminating; force finalize to unblock install"
          kubectl get ns "$ns" -o json \
            | python3 -c "import json, sys; obj = json.load(sys.stdin); obj.setdefault('spec', {})['finalizers'] = []; json.dump(obj, sys.stdout)" \
            | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f - >/dev/null
        fi
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    # The repo is added only when no repo is named exactly "longhorn"; a prefix
    # match would also accept names such as "longhorn-mirror".
    - name: Ensure longhorn Helm repo
      ansible.builtin.shell: |
        set -e
        if ! helm repo list 2>/dev/null | awk 'NR > 1 { print $1 }' | grep -qx 'longhorn'; then
          helm repo add longhorn https://charts.longhorn.io
        fi
        helm repo update
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      args:
        executable: /bin/bash
      changed_when: true

    # The four cleanup tasks below are best effort: failed_when is false and
    # each delete is allowed to fail or time out.
    - name: Delete leftover longhorn PriorityClass (cluster-scoped) to avoid Helm ownership conflicts
      ansible.builtin.shell: |
        set -e
        kubectl delete priorityclass longhorn-critical --ignore-not-found=true
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    # Deletes every CRD whose resource name contains "longhorn.io"; each delete
    # is capped at 20 seconds so a hanging finalizer cannot block the play.
    - name: Delete leftover Longhorn CRDs (cluster-scoped) to avoid Helm ownership conflicts
      ansible.builtin.shell: |
        set -e
        crd_list="$(kubectl get crd -o name 2>/dev/null | grep 'longhorn.io' || true)"
        if [ -n "$crd_list" ]; then
          echo "$crd_list" | while read -r crd; do
            [ -z "$crd" ] && continue
            timeout 20s kubectl delete "$crd" --ignore-not-found=true || true
          done
        fi
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    # Deletes every ClusterRole and ClusterRoleBinding whose resource name
    # contains "longhorn", with the same 20-second cap per delete.
    - name: Delete leftover Longhorn ClusterRole/ClusterRoleBinding (cluster-scoped)
      ansible.builtin.shell: |
        set -e
        role_list="$(kubectl get clusterrole -o name 2>/dev/null | grep 'longhorn' || true)"
        if [ -n "$role_list" ]; then
          echo "$role_list" | while read -r role; do
            [ -z "$role" ] && continue
            timeout 20s kubectl delete "$role" --ignore-not-found=true || true
          done
        fi
        binding_list="$(kubectl get clusterrolebinding -o name 2>/dev/null | grep 'longhorn' || true)"
        if [ -n "$binding_list" ]; then
          echo "$binding_list" | while read -r binding; do
            [ -z "$binding" ] && continue
            timeout 20s kubectl delete "$binding" --ignore-not-found=true || true
          done
        fi
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    - name: Cleanup leftover Helm release records for Longhorn (default + longhorn-system)
      ansible.builtin.shell: |
        set -e
        # Failed or interrupted installs can leave the release secret in
        # "default" or "longhorn-system", which later causes:
        #   - "cannot re-use a name that is still in use"
        #   - conflicting meta.helm.sh/release-namespace annotations on
        #     cluster-scoped resources
        for ns in longhorn-system default; do
          # --short prints release names only, so -x matches the exact name.
          if helm -n "$ns" list --all --short 2>/dev/null | grep -qx 'longhorn'; then
            # uninstall can hang (for example on the uninstall job/hook); cap
            # it so the rest of the automation is not blocked.
            timeout 120s helm -n "$ns" uninstall longhorn --no-hooks || true
          fi
          # Single backslashes: inside a YAML literal block and a single-quoted
          # shell string, "\\." would reach grep as "literal backslash + any
          # character" and never match a release secret name.
          sec_list="$(kubectl -n "$ns" get secret -o name 2>/dev/null | grep '^secret/sh\.helm\.release\.v1\.longhorn\.' || true)"
          if [ -n "$sec_list" ]; then
            echo "$sec_list" | xargs -n1 kubectl -n "$ns" delete --ignore-not-found=true
          fi
        done
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    # Tries `helm upgrade --install` first; if that fails, falls back to
    # `helm install --replace` with the same arguments. Each attempt waits up
    # to 15 minutes for the release to become ready.
    - name: Helm upgrade/install Longhorn失败兜底install --replace
      ansible.builtin.shell: |
        set -e
        helm_args=(
          longhorn longhorn/longhorn
          --namespace longhorn-system
          --create-namespace
          -f {{ longhorn_values_dest }}
          --version {{ longhorn_chart_version }}
          --wait
          --timeout 15m
        )
        helm upgrade --install "${helm_args[@]}" || helm install --replace "${helm_args[@]}"
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      args:
        executable: /bin/bash
      register: helm_longhorn
      changed_when: true
# Play: optionally replace the kube-system/local-path-config ConfigMap with the
# lab JSON and restart the provisioner. Runs once on a host of the k3s_server
# group and only when longhorn_apply_local_path_lab is true (default: false).
- name: Apply local-path-config lab defaults (optional)
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
    local_path_json_src: "{{ playbook_dir }}/../files/03-05-local-path-config/local-path-config-lab.json"
    local_path_json_dest: /root/local-path-config-lab.json
  tasks:
    - name: Apply local-path-config lab defaults (optional)
      when: longhorn_apply_local_path_lab | default(false) | bool
      block:
        - name: Copy local-path lab json
          ansible.builtin.copy:
            src: "{{ local_path_json_src }}"
            dest: "{{ local_path_json_dest }}"
            mode: "0644"

        # Renders the ConfigMap client-side from the copied JSON (stored under
        # the key config.json) and pipes the manifest into `kubectl apply`.
        - name: Apply local-path-config ConfigMap
          ansible.builtin.shell: |
            set -e
            kubectl -n kube-system create configmap local-path-config \
              --from-file=config.json={{ local_path_json_dest }} \
              --dry-run=client -o yaml \
              | kubectl apply -f -
          environment:
            KUBECONFIG: "{{ k3s_kubeconfig }}"
          args:
            executable: /bin/bash
          changed_when: true

        # Never fails the play; "changed" is reported only when the rollout
        # restart command exits with 0.
        - name: Restart local-path-provisioner if present
          ansible.builtin.shell: |
            kubectl -n kube-system rollout restart deploy/local-path-provisioner
          environment:
            KUBECONFIG: "{{ k3s_kubeconfig }}"
          args:
            executable: /bin/bash
          register: lp_restart
          changed_when: lp_restart.rc == 0
          failed_when: false