Files
Deploy-Laboratory/ansible/playbooks/verify/03-07.yml
2026-03-29 09:08:01 +08:00

357 lines
13 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
---
# Play: prepare every host in k3s_nodes for Longhorn: install the iSCSI/NFS
# client packages, start iscsid, create the Longhorn data directory and
# (optionally) pre-pull the Longhorn images into the k3s containerd store.
- name: Longhorn node packages (iSCSI, NFS client)
  hosts: k3s_nodes
  become: true
  tasks:
    # Skipped as a whole when longhorn_install_node_packages is set to false
    # (defaults to true).
    - name: Install Longhorn OS dependencies
      when: longhorn_install_node_packages | default(true) | bool
      block:
        - name: Install iscsi + nfs (dnf/yum)
          ansible.builtin.package:
            name:
              - iscsi-initiator-utils
              - nfs-utils
            state: present

        - name: Enable iscsid
          ansible.builtin.systemd:
            name: iscsid
            enabled: true
            state: started

    # k3s_data_dir is not defined in this file; it must come from inventory
    # or group/host vars.
    - name: Ensure Longhorn data subdirectory exists on all nodes
      ansible.builtin.file:
        path: "{{ k3s_data_dir }}/longhorn"
        state: directory
        mode: "0700"

    # Pulls each image through ctr against the k3s containerd socket, with up
    # to 5 attempts per image and a linear back-off (3s, 6s, 9s, 12s) between
    # attempts. The task fails as soon as one image cannot be pulled.
    # Toggle: longhorn_prepull_images (defaults to true).
    # The support-bundle-kit tag does not follow longhorn_chart_version; it
    # can be overridden with longhorn_support_bundle_kit_version.
    - name: Pre-pull Longhorn images on all nodes (optional, avoid DockerHub EOF/ImagePullBackOff)
      when: longhorn_prepull_images | default(true) | bool
      ansible.builtin.shell: |
        set -e
        CTR="ctr --address /run/k3s/containerd/containerd.sock -n k8s.io"
        imgs=(
          "docker.io/longhornio/longhorn-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-ui:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-share-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-engine:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-instance-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/backing-image-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/support-bundle-kit:{{ longhorn_support_bundle_kit_version | default('v0.0.45') }}"
        )
        for img in "${imgs[@]}"; do
          ok=0
          for i in 1 2 3 4 5; do
            echo "[pull] $img (try $i/5)"
            if $CTR images pull "$img"; then
              ok=1
              break
            fi
            # Back off only when another attempt follows; sleeping after the
            # last try would just delay the failure.
            if [ "$i" -lt 5 ]; then
              sleep $((i * 3))
            fi
          done
          if [ "$ok" -ne 1 ]; then
            echo "[ERR] failed pulling $img after retries"
            exit 1
          fi
        done
      args:
        executable: /bin/bash
      changed_when: true
# Play: install (or upgrade) the Longhorn chart with Helm. run_once limits the
# tasks to a single host of k3s_server. Before installing, it clears leftovers
# of earlier runs (Terminating namespace, cluster-scoped objects, Helm release
# records) so Helm does not hit ownership conflicts.
# NOTE(review): the namespace is hard-coded to longhorn-system in this play,
# while the verify/teardown plays of this file honour longhorn_namespace.
- name: Install Longhorn with Helm on first server
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    longhorn_values_src: "{{ playbook_dir }}/../../files/03-07/values-lab.yaml"
    longhorn_values_dest: /root/longhorn-values-lab.yaml
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
  tasks:
    # Best effort: a failed package install is tolerated here and caught by
    # the `which helm` check two tasks below.
    - name: Install helm package (Fedora/RHEL family)
      ansible.builtin.package:
        name: helm
        state: present
      ignore_errors: true
      register: helm_pkg

    - name: Hint if helm package install failed (install Helm 3 manually if needed)
      ansible.builtin.debug:
        msg: "dnf/yum 未装上 helm 时,请见 https://helm.sh/docs/intro/install/"
      when: helm_pkg.failed | default(false)

    # Hard stop: everything below needs a helm binary on PATH.
    - name: Fail if helm binary still unavailable
      ansible.builtin.command: which helm
      register: helm_which
      changed_when: false
      failed_when: helm_which.rc != 0

    - name: Copy lab values to server
      ansible.builtin.copy:
        src: "{{ longhorn_values_src }}"
        dest: "{{ longhorn_values_dest }}"
        mode: "0600"

    # If the namespace is stuck in Terminating: empty spec.finalizers through
    # the /finalize subresource, delete it, wait up to 60 x 2s for it to
    # disappear, then recreate it and probe that it accepts writes.
    # NOTE(review): failed_when: false means the `exit 1` aborts in this
    # script do not stop the play; the Helm task further down carries its own
    # recover-and-retry loop. Confirm that best-effort behaviour is intended.
    - name: Recover longhorn-system namespace from Terminating and recreate cleanly
      ansible.builtin.shell: |
        set -e
        export KUBECONFIG={{ k3s_kubeconfig }}
        ns="longhorn-system"
        phase="$(kubectl get ns "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
        if [ "$phase" = "Terminating" ]; then
          echo "[WARN] namespace $ns is Terminating; force finalize and wait deletion"
          kubectl get ns "$ns" -o json > /tmp/ns.json
          python3 -c "import json; obj=json.load(open('/tmp/ns.json')); obj.setdefault('spec',{}); obj['spec']['finalizers']=[]; json.dump(obj, open('/tmp/ns-finalize.json','w'))"
          # Plain quotes: a YAML literal block passes backslashes to bash
          # unchanged, so \" would put quote characters into the request path.
          kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f /tmp/ns-finalize.json >/dev/null || true
          kubectl delete ns "$ns" --ignore-not-found=true --wait=false || true
          for i in $(seq 1 60); do
            if ! kubectl get ns "$ns" >/dev/null 2>&1; then
              break
            fi
            sleep 2
          done
        fi
        # Helm must be able to write its release secret: the namespace has to
        # be Active and accept new resources.
        if ! kubectl get ns "$ns" >/dev/null 2>&1; then
          kubectl create ns "$ns"
        fi
        phase_now="$(kubectl get ns "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
        deleting_now="$(kubectl get ns "$ns" -o jsonpath='{.metadata.deletionTimestamp}' 2>/dev/null || true)"
        if [ "$phase_now" = "Terminating" ]; then
          echo "[ERR] namespace $ns still Terminating after recovery; abort helm install"
          kubectl get ns "$ns" -o yaml || true
          exit 1
        fi
        if [ -n "$deleting_now" ]; then
          echo "[ERR] namespace $ns has deletionTimestamp=$deleting_now; abort helm install"
          kubectl get ns "$ns" -o yaml || true
          exit 1
        fi
        # Probe: confirm the namespace is writable, so the failure does not
        # first surface when Helm creates its release secret.
        kubectl -n "$ns" create configmap longhorn-write-probe --from-literal=ok=1 >/dev/null
        kubectl -n "$ns" delete configmap longhorn-write-probe --ignore-not-found=true >/dev/null
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    # Adds the repo only when `helm repo list` has no entry starting with
    # "longhorn", then refreshes the repo index.
    - name: Ensure longhorn Helm repo
      ansible.builtin.shell: |
        set -e
        if ! helm repo list 2>/dev/null | grep -q '^longhorn'; then
          helm repo add longhorn https://charts.longhorn.io
        fi
        helm repo update
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      args:
        executable: /bin/bash
      changed_when: true

    - name: Delete leftover longhorn PriorityClass (cluster-scoped) to avoid Helm ownership conflicts
      ansible.builtin.shell: |
        set -e
        KUBECONFIG={{ k3s_kubeconfig }} kubectl delete priorityclass longhorn-critical --ignore-not-found=true
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    # Destructive and therefore opt-in: runs only when
    # longhorn_force_crd_reset is true (defaults to false). Each delete is
    # capped at 20s and failures are ignored.
    - name: Delete leftover Longhorn CRDs (cluster-scoped, opt-in)
      when: longhorn_force_crd_reset | default(false) | bool
      ansible.builtin.shell: |
        set -e
        export KUBECONFIG={{ k3s_kubeconfig }}
        crd_list="$(kubectl get crd -o name 2>/dev/null | grep 'longhorn.io' || true)"
        if [ -n "$crd_list" ]; then
          echo "$crd_list" | while read -r crd; do
            [ -z "$crd" ] && continue
            timeout 20s kubectl delete "$crd" --ignore-not-found=true || true
          done
        fi
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    # Removes every ClusterRole and ClusterRoleBinding whose name contains
    # "longhorn". Each delete is capped at 20s and failures are ignored.
    - name: Delete leftover Longhorn ClusterRole/ClusterRoleBinding (cluster-scoped)
      ansible.builtin.shell: |
        set -e
        export KUBECONFIG={{ k3s_kubeconfig }}
        role_list="$(kubectl get clusterrole -o name 2>/dev/null | grep 'longhorn' || true)"
        if [ -n "$role_list" ]; then
          echo "$role_list" | while read -r role; do
            [ -z "$role" ] && continue
            timeout 20s kubectl delete "$role" --ignore-not-found=true || true
          done
        fi
        binding_list="$(kubectl get clusterrolebinding -o name 2>/dev/null | grep 'longhorn' || true)"
        if [ -n "$binding_list" ]; then
          echo "$binding_list" | while read -r binding; do
            [ -z "$binding" ] && continue
            timeout 20s kubectl delete "$binding" --ignore-not-found=true || true
          done
        fi
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    # For both namespaces: uninstall a release listed under a name starting
    # with "longhorn" (hooks skipped, capped at 120s), then delete any
    # remaining Helm v3 release secrets of the "longhorn" release.
    # NOTE: this uninstalls an existing longhorn release before the install
    # task below runs.
    - name: Cleanup leftover Helm release records for Longhorn (default + longhorn-system)
      ansible.builtin.shell: |
        set -e
        export KUBECONFIG={{ k3s_kubeconfig }}
        for ns in longhorn-system default; do
          if helm -n "$ns" list --all 2>/dev/null | grep -q '^longhorn'; then
            timeout 120s helm -n "$ns" uninstall longhorn --no-hooks || true
          fi
          # Single backslashes: inside single quotes bash hands the pattern to
          # grep verbatim, and \. is what matches a literal dot.
          sec_list="$(kubectl -n "$ns" get secret -o name 2>/dev/null | grep '^secret/sh\.helm\.release\.v1\.longhorn\.' || true)"
          if [ -n "$sec_list" ]; then
            echo "$sec_list" | xargs -n1 kubectl -n "$ns" delete --ignore-not-found=true
          fi
        done
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false

    # Up to 5 attempts of `helm upgrade --install`. Only two error classes are
    # retried: "is being terminated" (namespace recovered via recover_ns
    # first) and the engineimages CRD not being found yet. Any other error
    # fails the task at once with Helm's exit code.
    - name: Helm upgrade/install Longhorn失败兜底install --replace
      ansible.builtin.shell: |
        set -e
        ns="longhorn-system"
        # Force-finalize and delete a Terminating namespace (waits up to
        # 90 x 2s), then make sure the namespace exists again.
        recover_ns() {
          phase="$(kubectl get ns "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
          deleting="$(kubectl get ns "$ns" -o jsonpath='{.metadata.deletionTimestamp}' 2>/dev/null || true)"
          if [ "$phase" = "Terminating" ] || [ -n "$deleting" ]; then
            kubectl get ns "$ns" -o json > /tmp/ns.json || true
            python3 -c "import json; obj=json.load(open('/tmp/ns.json')); obj.setdefault('spec',{}); obj['spec']['finalizers']=[]; json.dump(obj, open('/tmp/ns-finalize.json','w'))" || true
            kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f /tmp/ns-finalize.json >/dev/null || true
            kubectl delete ns "$ns" --ignore-not-found=true --wait=false || true
            for i in $(seq 1 90); do
              if ! kubectl get ns "$ns" >/dev/null 2>&1; then
                break
              fi
              sleep 2
            done
          fi
          kubectl get ns "$ns" >/dev/null 2>&1 || kubectl create ns "$ns"
        }
        for i in 1 2 3 4 5; do
          # errexit is suspended so Helm's exit code can be captured.
          set +e
          out="$(helm upgrade --install longhorn longhorn/longhorn \
            --namespace "$ns" --create-namespace \
            -f {{ longhorn_values_dest }} \
            --version {{ longhorn_chart_version }} \
            --wait --timeout 15m 2>&1)"
          rc=$?
          set -e
          if [ $rc -eq 0 ]; then
            echo "$out"
            exit 0
          fi
          echo "$out"
          if echo "$out" | grep -q "is being terminated"; then
            echo "[WARN] namespace $ns is being terminated, recover and retry ($i/5)"
            recover_ns
            sleep $((i * 3))
            continue
          fi
          if echo "$out" | grep -q "engineimages.longhorn.io\" not found"; then
            echo "[WARN] longhorn CRD propagation not ready, retry ($i/5)"
            sleep $((i * 5))
            continue
          fi
          # Not one of the retryable errors above: fail immediately.
          exit $rc
        done
        # Fallback: still failing after all retries, return non-zero.
        exit 1
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      args:
        executable: /bin/bash
      register: helm_longhorn
      changed_when: true
# Play: optionally replace the local-path-config ConfigMap in kube-system with
# the lab JSON shipped next to the playbooks, then restart the provisioner.
# Nothing runs unless longhorn_apply_local_path_lab is true (default false).
- name: Apply local-path-config lab defaults (optional)
  hosts: k3s_server
  run_once: true
  become: true
  vars:
    local_path_json_src: "{{ playbook_dir }}/../../files/03-05-local-path-config/local-path-config-lab.json"
    local_path_json_dest: /root/local-path-config-lab.json
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
  tasks:
    - name: Apply local-path-config lab defaults (optional)
      block:
        - name: Copy local-path lab json
          ansible.builtin.copy:
            dest: "{{ local_path_json_dest }}"
            src: "{{ local_path_json_src }}"
            mode: "0644"

        # The manifest is rendered client-side (--dry-run=client) and piped
        # into `kubectl apply`.
        - name: Apply local-path-config ConfigMap
          ansible.builtin.shell:
            cmd: |
              set -e
              KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system create configmap local-path-config \
                --from-file=config.json={{ local_path_json_dest }} \
                --dry-run=client -o yaml | KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f -
            executable: /bin/bash
          changed_when: true

        # Never fails the play; reported as changed only when the rollout
        # restart command itself succeeded.
        - name: Restart local-path-provisioner if present
          ansible.builtin.shell:
            cmd: |
              KUBECONFIG={{ k3s_kubeconfig }} kubectl -n kube-system rollout restart deploy/local-path-provisioner
            executable: /bin/bash
          register: lp_restart
          changed_when: lp_restart.rc == 0
          failed_when: false
      when: longhorn_apply_local_path_lab | default(false) | bool
# Play: list the pods of the Longhorn namespace from a single k3s_server host.
# The namespace comes from longhorn_namespace and falls back to
# longhorn-system. The task only prints the pod list; it fails when kubectl
# itself returns non-zero and makes no assertion about pod status.
- name: Verify 03-07 Longhorn (namespace pods)
  hosts: k3s_server
  run_once: true
  become: true
  vars:
    longhorn_ns: "{{ longhorn_namespace | default('longhorn-system') }}"
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
  tasks:
    - name: Check longhorn pods
      ansible.builtin.shell:
        cmd: |
          set -e
          KUBECONFIG={{ k3s_kubeconfig }} kubectl get pods -n {{ longhorn_ns }} -o wide
        executable: /bin/bash
      changed_when: false
# Play: remove the Longhorn release and its namespace after verification.
# verify_teardown is VERIFY_TEARDOWN rendered as a string and defaults to '1',
# so the teardown runs unless VERIFY_TEARDOWN is overridden with another value.
- name: Teardown 03-07 Longhorn (optional)
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
    verify_teardown: "{{ (VERIFY_TEARDOWN | default('1')) | string }}"
    longhorn_ns: "{{ longhorn_namespace | default('longhorn-system') }}"
  tasks:
    # Best effort throughout: the uninstall is capped at 180s with hooks
    # skipped, the namespace delete does not wait, and the task never fails
    # the play.
    - name: Uninstall longhorn helm release when VERIFY_TEARDOWN=1
      when: verify_teardown == "1"
      ansible.builtin.shell: |
        set -e
        export KUBECONFIG={{ k3s_kubeconfig }}
        # --all also lists releases in pending/uninstalling states; the
        # anchored pattern matches the release-name column rather than the
        # namespace or chart columns.
        if helm -n {{ longhorn_ns }} list --all 2>/dev/null | grep -q '^longhorn'; then
          timeout 180s helm -n {{ longhorn_ns }} uninstall longhorn --no-hooks || true
        fi
        kubectl delete ns {{ longhorn_ns }} --ignore-not-found=true --wait=false || true
      args:
        executable: /bin/bash
      changed_when: true
      failed_when: false