---
# Longhorn lab playbook (03-07).
#
# Plays, in order:
#   1. Prepare every k3s node (iSCSI/NFS packages, data directory, image pre-pull).
#   2. Install Longhorn with Helm from the first server, after cleaning up
#      leftovers of earlier installs.
#   3. Optionally apply the local-path-config lab defaults.
#   4. List the Longhorn pods.
#   5. Optionally tear Longhorn down again.
#
# Variables read but not defined in this file: k3s_data_dir,
# longhorn_chart_version, longhorn_install_node_packages,
# longhorn_prepull_images, longhorn_force_crd_reset,
# longhorn_apply_local_path_lab, longhorn_namespace, VERIFY_TEARDOWN.

- name: Longhorn node packages (iSCSI, NFS client)
  hosts: k3s_nodes
  become: true
  tasks:
    - name: Install Longhorn OS dependencies
      when: longhorn_install_node_packages | default(true) | bool
      block:
        - name: Install iscsi + nfs (dnf/yum)
          ansible.builtin.package:
            name:
              - iscsi-initiator-utils
              - nfs-utils
            state: present

        - name: Enable iscsid
          ansible.builtin.systemd:
            name: iscsid
            enabled: true
            state: started

    - name: Ensure Longhorn data subdirectory exists on all nodes
      ansible.builtin.file:
        path: "{{ k3s_data_dir }}/longhorn"
        state: directory
        mode: "0700"

    # Pulls through the k3s containerd socket, retrying each image up to five
    # times with a growing delay (3s, 6s, ...). Any image that still fails
    # aborts the task.
    - name: Pre-pull Longhorn images on all nodes (optional, avoid DockerHub EOF/ImagePullBackOff)
      when: longhorn_prepull_images | default(true) | bool
      ansible.builtin.shell: |
        set -e
        CTR="ctr --address /run/k3s/containerd/containerd.sock -n k8s.io"
        imgs=(
          "docker.io/longhornio/longhorn-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-ui:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-share-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-engine:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-instance-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/backing-image-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/support-bundle-kit:v0.0.45"
        )
        for img in "${imgs[@]}"; do
          ok=0
          for i in 1 2 3 4 5; do
            echo "[pull] $img (try $i/5)"
            if $CTR images pull "$img"; then
              ok=1
              break
            fi
            sleep $((i * 3))
          done
          if [ "$ok" -ne 1 ]; then
            echo "[ERR] failed pulling $img after retries"
            exit 1
          fi
        done
      args:
        executable: /bin/bash
      changed_when: true

- name: Install Longhorn with Helm on first server
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    longhorn_values_src: "{{ playbook_dir }}/../../files/03-07/values-lab.yaml"
    longhorn_values_dest: /root/longhorn-values-lab.yaml
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
  tasks:
    # A failed package install is tolerated here; the `which helm` task below
    # is what actually enforces that a helm binary exists.
    - name: Install helm package (Fedora/RHEL family)
      ansible.builtin.package:
        name: helm
        state: present
      ignore_errors: true
      register: helm_pkg

    - name: Hint if helm package install failed (install Helm 3 manually if needed)
      ansible.builtin.debug:
        msg: "dnf/yum 未装上 helm 时,请见 https://helm.sh/docs/intro/install/"
      when: helm_pkg.failed | default(false)

    - name: Fail if helm binary still unavailable
      ansible.builtin.command:
        cmd: which helm
      register: helm_which
      changed_when: false
      failed_when: helm_which.rc != 0

    - name: Copy lab values to server
      ansible.builtin.copy:
        src: "{{ longhorn_values_src }}"
        dest: "{{ longhorn_values_dest }}"
        mode: "0600"

    # If the namespace is stuck in Terminating, clear its finalizers through
    # the /finalize subresource, wait up to 60 x 2s for it to disappear, then
    # recreate it and check that it accepts writes.
    #
    # The script is a YAML literal block, so quotes must NOT be
    # backslash-escaped: a `\"` here reaches bash as a literal quote character.
    #
    # NOTE(review): `failed_when: false` means the `exit 1` aborts and the
    # write probe below never fail the play - confirm this is intended.
    - name: Recover longhorn-system namespace from Terminating and recreate cleanly
      ansible.builtin.shell: |
        set -e
        export KUBECONFIG={{ k3s_kubeconfig }}
        ns="longhorn-system"
        phase="$(kubectl get ns "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
        if [ "$phase" = "Terminating" ]; then
          echo "[WARN] namespace $ns is Terminating; force finalize and wait deletion"
          kubectl get ns "$ns" -o json > /tmp/ns.json
          python3 -c "import json; obj=json.load(open('/tmp/ns.json')); obj.setdefault('spec',{}); obj['spec']['finalizers']=[]; json.dump(obj, open('/tmp/ns-finalize.json','w'))"
          kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f /tmp/ns-finalize.json >/dev/null || true
          kubectl delete ns "$ns" --ignore-not-found=true --wait=false || true
          for i in $(seq 1 60); do
            if ! kubectl get ns "$ns" >/dev/null 2>&1; then
              break
            fi
            sleep 2
          done
        fi
        # Make sure Helm can write its release secret: the namespace must be
        # Active and able to accept new resources.
        if ! kubectl get ns "$ns" >/dev/null 2>&1; then
          kubectl create ns "$ns"
        fi
        phase_now="$(kubectl get ns "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
        deleting_now="$(kubectl get ns "$ns" -o jsonpath='{.metadata.deletionTimestamp}' 2>/dev/null || true)"
        if [ "$phase_now" = "Terminating" ]; then
          echo "[ERR] namespace $ns still Terminating after recovery; abort helm install"
          kubectl get ns "$ns" -o yaml || true
          exit 1
        fi
        if [ -n "$deleting_now" ]; then
          echo "[ERR] namespace $ns has deletionTimestamp=$deleting_now; abort helm install"
          kubectl get ns "$ns" -o yaml || true
          exit 1
        fi
        # Probe: confirm the namespace is writable now, rather than finding out
        # when Helm tries to create the release secret.
        kubectl -n "$ns" create configmap longhorn-write-probe --from-literal=ok=1 >/dev/null
        kubectl -n "$ns" delete configmap longhorn-write-probe --ignore-not-found=true >/dev/null
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    - name: Ensure longhorn Helm repo
      ansible.builtin.shell: |
        set -e
        if ! helm repo list 2>/dev/null | grep -q '^longhorn'; then
          helm repo add longhorn https://charts.longhorn.io
        fi
        helm repo update
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      args:
        executable: /bin/bash
      changed_when: true

    - name: Delete leftover longhorn PriorityClass (cluster-scoped) to avoid Helm ownership conflicts
      ansible.builtin.shell: |
        set -e
        KUBECONFIG={{ k3s_kubeconfig }} kubectl delete priorityclass longhorn-critical --ignore-not-found=true
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    # Opt-in only (longhorn_force_crd_reset). Each delete is capped at 20s and
    # individual failures are ignored.
    - name: Delete leftover Longhorn CRDs (cluster-scoped, opt-in)
      when: longhorn_force_crd_reset | default(false) | bool
      ansible.builtin.shell: |
        set -e
        export KUBECONFIG={{ k3s_kubeconfig }}
        crd_list="$(kubectl get crd -o name 2>/dev/null | grep 'longhorn.io' || true)"
        if [ -n "$crd_list" ]; then
          echo "$crd_list" | while read -r crd; do
            [ -z "$crd" ] && continue
            timeout 20s kubectl delete "$crd" --ignore-not-found=true || true
          done
        fi
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    # Deletes every ClusterRole and ClusterRoleBinding whose name contains
    # "longhorn". Each delete is capped at 20s and individual failures are
    # ignored.
    - name: Delete leftover Longhorn ClusterRole/ClusterRoleBinding (cluster-scoped)
      ansible.builtin.shell: |
        set -e
        export KUBECONFIG={{ k3s_kubeconfig }}
        role_list="$(kubectl get clusterrole -o name 2>/dev/null | grep 'longhorn' || true)"
        if [ -n "$role_list" ]; then
          echo "$role_list" | while read -r role; do
            [ -z "$role" ] && continue
            timeout 20s kubectl delete "$role" --ignore-not-found=true || true
          done
        fi
        binding_list="$(kubectl get clusterrolebinding -o name 2>/dev/null | grep 'longhorn' || true)"
        if [ -n "$binding_list" ]; then
          echo "$binding_list" | while read -r binding; do
            [ -z "$binding" ] && continue
            timeout 20s kubectl delete "$binding" --ignore-not-found=true || true
          done
        fi
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    # Destructive: uninstalls any release named "longhorn" found in either
    # namespace (without hooks), then deletes its remaining Helm release
    # secrets (sh.helm.release.v1.longhorn.*).
    - name: Cleanup leftover Helm release records for Longhorn (default + longhorn-system)
      ansible.builtin.shell: |
        set -e
        export KUBECONFIG={{ k3s_kubeconfig }}
        for ns in longhorn-system default; do
          if helm -n "$ns" list --all 2>/dev/null | grep -q '^longhorn'; then
            timeout 120s helm -n "$ns" uninstall longhorn --no-hooks || true
          fi
          sec_list="$(kubectl -n "$ns" get secret -o name 2>/dev/null | grep '^secret/sh\.helm\.release\.v1\.longhorn\.' || true)"
          if [ -n "$sec_list" ]; then
            echo "$sec_list" | xargs -n1 kubectl -n "$ns" delete --ignore-not-found=true
          fi
        done
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    # Up to five attempts. Only two error signatures are retried: the namespace
    # being terminated (recover it first) and the engineimages CRD not being
    # found yet. Any other Helm failure exits with Helm's return code.
    - name: "Helm upgrade/install Longhorn(失败兜底:install --replace)"
      ansible.builtin.shell: |
        set -e
        ns="longhorn-system"

        # Force-finalize a terminating namespace, wait up to 90 x 2s for it to
        # go away, then make sure the namespace exists again.
        recover_ns() {
          # Locals keep this function from overwriting the caller's retry
          # counter, which the back-off sleep below depends on.
          local phase deleting attempt
          phase="$(kubectl get ns "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
          deleting="$(kubectl get ns "$ns" -o jsonpath='{.metadata.deletionTimestamp}' 2>/dev/null || true)"
          if [ "$phase" = "Terminating" ] || [ -n "$deleting" ]; then
            kubectl get ns "$ns" -o json > /tmp/ns.json || true
            python3 -c "import json; obj=json.load(open('/tmp/ns.json')); obj.setdefault('spec',{}); obj['spec']['finalizers']=[]; json.dump(obj, open('/tmp/ns-finalize.json','w'))" || true
            kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f /tmp/ns-finalize.json >/dev/null || true
            kubectl delete ns "$ns" --ignore-not-found=true --wait=false || true
            for attempt in $(seq 1 90); do
              if ! kubectl get ns "$ns" >/dev/null 2>&1; then
                break
              fi
              sleep 2
            done
          fi
          kubectl get ns "$ns" >/dev/null 2>&1 || kubectl create ns "$ns"
        }

        for i in 1 2 3 4 5; do
          # Errexit is suspended so a Helm failure can be inspected instead of
          # ending the script.
          set +e
          out="$(helm upgrade --install longhorn longhorn/longhorn \
            --namespace "$ns" --create-namespace \
            -f {{ longhorn_values_dest }} \
            --version {{ longhorn_chart_version }} \
            --wait --timeout 15m 2>&1)"
          rc=$?
          set -e
          if [ $rc -eq 0 ]; then
            echo "$out"
            exit 0
          fi
          echo "$out"
          if echo "$out" | grep -q "is being terminated"; then
            echo "[WARN] namespace $ns is being terminated, recover and retry ($i/5)"
            recover_ns
            sleep $((i * 3))
            continue
          fi
          if echo "$out" | grep -q "engineimages.longhorn.io\" not found"; then
            echo "[WARN] longhorn CRD propagation not ready, retry ($i/5)"
            sleep $((i * 5))
            continue
          fi
          # Not one of the retryable errors above: fail immediately.
          exit $rc
        done
        # Fallback: every retry failed, so return non-zero.
        exit 1
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      args:
        executable: /bin/bash
      register: helm_longhorn
      changed_when: true

- name: Apply local-path-config lab defaults (optional)
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
    local_path_json_src: "{{ playbook_dir }}/../../files/03-05-local-path-config/local-path-config-lab.json"
    local_path_json_dest: /root/local-path-config-lab.json
  tasks:
    - name: Apply local-path-config lab defaults (optional)
      when: longhorn_apply_local_path_lab | default(false) | bool
      block:
        - name: Copy local-path lab json
          ansible.builtin.copy:
            src: "{{ local_path_json_src }}"
            dest: "{{ local_path_json_dest }}"
            mode: "0644"

        # pipefail makes a failure of the `create --dry-run` side fail the
        # task instead of being hidden by the exit status of `kubectl apply`.
        - name: Apply local-path-config ConfigMap
          ansible.builtin.shell: |
            set -e
            set -o pipefail
            KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system create configmap local-path-config \
              --from-file=config.json={{ local_path_json_dest }} \
              --dry-run=client -o yaml | KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f -
          args:
            executable: /bin/bash
          changed_when: true

        # Best effort: a non-zero exit is reported as "not changed", not as a
        # failure.
        - name: Restart local-path-provisioner if present
          ansible.builtin.shell: |
            KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system rollout restart deploy/local-path-provisioner
          args:
            executable: /bin/bash
          register: lp_restart
          failed_when: false
          changed_when: lp_restart.rc == 0

- name: Verify 03-07 Longhorn (namespace pods)
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
    longhorn_ns: "{{ longhorn_namespace | default('longhorn-system') }}"
  tasks:
    - name: Check longhorn pods
      ansible.builtin.shell: |
        set -e
        KUBECONFIG={{ k3s_kubeconfig }} kubectl get pods -n {{ longhorn_ns }} -o wide
      args:
        executable: /bin/bash
      changed_when: false

- name: Teardown 03-07 Longhorn (optional)
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
    # Defaults to "1", so the teardown runs unless VERIFY_TEARDOWN is set to
    # something else.
    verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}"
    longhorn_ns: "{{ longhorn_namespace | default('longhorn-system') }}"
  tasks:
    # Uninstalls the release without hooks (capped at 180s), then requests
    # namespace deletion without waiting for it to finish.
    - name: Uninstall longhorn helm release when VERIFY_TEARDOWN=1
      when: verify_teardown == "1"
      ansible.builtin.shell: |
        set -e
        export KUBECONFIG={{ k3s_kubeconfig }}
        if helm -n {{ longhorn_ns }} list 2>/dev/null | grep -q longhorn; then
          timeout 180s helm -n {{ longhorn_ns }} uninstall longhorn --no-hooks || true
        fi
        kubectl delete ns {{ longhorn_ns }} --ignore-not-found=true --wait=false || true
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false