feat: 按 doc_id 重组 ansible/files 与验证框架
- ansible/files 改为与文档 XX-YY 对齐的目录结构,更新相关 playbook 路径 - 新增 scripts/verify.sh 与 ansible/playbooks/verify/*.yml,移除单体 verify-matrix.yml - 补充 docs/00-02 矩阵状态、00-05 验证框架与流程、00-04 环境与 ylc65 工作机说明 - 增加 k3s 存储准备、Longhorn、local-path 等 playbook 与辅助脚本 Made-with: Cursor
This commit is contained in:
251
ansible/playbooks/longhorn-install.yml
Normal file
251
ansible/playbooks/longhorn-install.yml
Normal file
@@ -0,0 +1,251 @@
|
||||
---
|
||||
# Helm 安装 Longhorn(与 docs/03-07 一致)。在控制节点执行,依赖 KUBECONFIG=/etc/rancher/k3s/k3s.yaml
|
||||
# 变量:group_vars/all.yml 中 longhorn_chart_version、longhorn_install_node_packages、longhorn_prepull_images、longhorn_apply_local_path_lab;此外本 playbook 还会读取 k3s_data_dir
|
||||
|
||||
# Play 1: prepare every k3s node for Longhorn.
#   * installs the iSCSI initiator and NFS client packages and starts iscsid
#     (skipped when longhorn_install_node_packages is false),
#   * creates "{{ k3s_data_dir }}/longhorn",
#   * optionally pre-pulls the Longhorn images into k3s' containerd
#     (skipped when longhorn_prepull_images is false).
# NOTE(review): this play was reconstructed from a copy whose indentation was
# lost. The directory and pre-pull tasks are assumed to sit at task level
# (outside the "OS dependencies" block) -- confirm against the repository.
- name: Longhorn node packages (iSCSI, NFS client)
  hosts: k3s_nodes
  become: true
  tasks:
    - name: Install Longhorn OS dependencies
      when: longhorn_install_node_packages | default(true) | bool
      block:
        - name: Install iscsi + nfs (dnf/yum)
          ansible.builtin.package:
            name:
              - iscsi-initiator-utils
              - nfs-utils
            state: present

        - name: Enable iscsid
          ansible.builtin.systemd:
            name: iscsid
            enabled: true
            state: started

    - name: Ensure Longhorn data subdirectory exists on all nodes
      ansible.builtin.file:
        path: "{{ k3s_data_dir }}/longhorn"
        state: directory
        mode: "0700"

    - name: Pre-pull Longhorn images on all nodes (optional, avoid DockerHub EOF/ImagePullBackOff)
      when: longhorn_prepull_images | default(true) | bool
      ansible.builtin.shell: |
        set -e
        CTR="ctr --address /run/k3s/containerd/containerd.sock -n k8s.io"

        imgs=(
          "docker.io/longhornio/longhorn-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-ui:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-share-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-engine:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/longhorn-instance-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/backing-image-manager:v{{ longhorn_chart_version }}"
          "docker.io/longhornio/support-bundle-kit:{{ longhorn_support_bundle_kit_tag | default('v0.0.45') }}"
        )

        # Image refs already known to containerd; these are not pulled again,
        # so re-runs neither hit the registry nor report a change.
        present="$($CTR images ls -q)"

        for img in "${imgs[@]}"; do
          if grep -Fxq -- "$img" <<<"$present"; then
            echo "[skip] $img already present"
            continue
          fi

          ok=0
          # Up to five attempts with a growing pause (3s, 6s, ...) between them.
          for i in 1 2 3 4 5; do
            echo "[pull] $img (try $i/5)"
            if $CTR images pull "$img"; then
              ok=1
              echo "[pulled] $img"
              break
            fi
            sleep $((i * 3))
          done

          if [ "$ok" -ne 1 ]; then
            echo "[ERR] failed pulling $img after retries"
            exit 1
          fi
        done
      args:
        executable: /bin/bash
      register: longhorn_prepull
      # Only a real pull counts as a change; "[skip]" lines do not.
      changed_when: "'[pulled]' in (longhorn_prepull.stdout | default(''))"
|
||||
|
||||
# Play 2: (re)install Longhorn with Helm from one k3s server.
# Order of work: make sure helm exists, copy the lab values file, clear
# leftovers of earlier installs (stuck namespace, PriorityClass, CRDs,
# ClusterRoles/Bindings, Helm release records), then run the Helm install.
# The cleanup tasks are best-effort (failed_when: false).
# NOTE(review): the cleanup tasks run on every execution and delete the
# Longhorn CRDs and any existing "longhorn" release before installing.
# Deleting a CRD normally removes its custom resources too, so this play looks
# meant for lab (re)installs only -- confirm before using on a cluster with data.
- name: Install Longhorn with Helm on first server
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    longhorn_values_src: "{{ playbook_dir }}/../files/03-07-longhorn/values-lab.yaml"
    longhorn_values_dest: /root/longhorn-values-lab.yaml
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
  # One KUBECONFIG for every helm/kubectl call in this play, instead of
  # repeating unquoted "export KUBECONFIG=..." lines inside each script.
  environment:
    KUBECONFIG: "{{ k3s_kubeconfig }}"
  tasks:
    - name: Install helm package (Fedora/RHEL family)
      ansible.builtin.package:
        name: helm
        state: present
      # A missing package is tolerated here; the "which helm" task below is
      # the hard gate.
      ignore_errors: true
      register: helm_pkg

    - name: Hint if helm package install failed (install Helm 3 manually if needed)
      ansible.builtin.debug:
        msg: "dnf/yum 未装上 helm 时,请见 https://helm.sh/docs/intro/install/"
      when: helm_pkg.failed | default(false)

    - name: Fail if helm binary still unavailable
      ansible.builtin.command: which helm
      register: helm_which
      changed_when: false
      failed_when: helm_which.rc != 0

    - name: Copy lab values to server
      ansible.builtin.copy:
        src: "{{ longhorn_values_src }}"
        dest: "{{ longhorn_values_dest }}"
        mode: "0600"

    - name: Ensure longhorn-system namespace is not stuck Terminating (force finalize if needed)
      ansible.builtin.shell: |
        set -eo pipefail
        ns="longhorn-system"
        phase="$(kubectl get ns "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
        if [ "$phase" = "Terminating" ]; then
          echo "[WARN] namespace $ns is Terminating; force finalize to unblock install"
          # Stream the namespace object through python to empty
          # spec.finalizers and PUT it to the finalize subresource. Using a
          # pipe avoids fixed, predictable file names under /tmp.
          kubectl get ns "$ns" -o json \
            | python3 -c 'import json, sys; obj = json.load(sys.stdin); obj.setdefault("spec", {})["finalizers"] = []; json.dump(obj, sys.stdout)' \
            | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f - >/dev/null
        fi
      args:
        executable: /bin/bash
      register: longhorn_ns_finalize
      # Changed only when the Terminating branch actually ran.
      changed_when: "'[WARN]' in (longhorn_ns_finalize.stdout | default(''))"
      failed_when: false

    - name: Ensure longhorn Helm repo
      ansible.builtin.shell: |
        set -e
        if ! helm repo list 2>/dev/null | grep -q '^longhorn'; then
          helm repo add longhorn https://charts.longhorn.io
        fi
        helm repo update
      args:
        executable: /bin/bash
      changed_when: true

    - name: Delete leftover longhorn PriorityClass (cluster-scoped) to avoid Helm ownership conflicts
      ansible.builtin.command: kubectl delete priorityclass longhorn-critical --ignore-not-found=true
      register: longhorn_pc_cleanup
      changed_when: "'deleted' in (longhorn_pc_cleanup.stdout | default(''))"
      failed_when: false

    - name: Delete leftover Longhorn CRDs (cluster-scoped) to avoid Helm ownership conflicts
      ansible.builtin.shell: |
        set -e
        crd_list="$(kubectl get crd -o name 2>/dev/null | grep 'longhorn.io' || true)"
        if [ -n "$crd_list" ]; then
          echo "$crd_list" | while read -r crd; do
            [ -z "$crd" ] && continue
            # Bounded wait: a CRD held by finalizers must not block the play.
            timeout 20s kubectl delete "$crd" --ignore-not-found=true || true
          done
        fi
      args:
        executable: /bin/bash
      register: longhorn_crd_cleanup
      changed_when: "'deleted' in (longhorn_crd_cleanup.stdout | default(''))"
      failed_when: false

    - name: Delete leftover Longhorn ClusterRole/ClusterRoleBinding (cluster-scoped)
      ansible.builtin.shell: |
        set -e
        for kind in clusterrole clusterrolebinding; do
          obj_list="$(kubectl get "$kind" -o name 2>/dev/null | grep 'longhorn' || true)"
          if [ -n "$obj_list" ]; then
            echo "$obj_list" | while read -r obj; do
              [ -z "$obj" ] && continue
              timeout 20s kubectl delete "$obj" --ignore-not-found=true || true
            done
          fi
        done
      args:
        executable: /bin/bash
      register: longhorn_rbac_cleanup
      changed_when: "'deleted' in (longhorn_rbac_cleanup.stdout | default(''))"
      failed_when: false

    - name: Cleanup leftover Helm release records for Longhorn (default + longhorn-system)
      ansible.builtin.shell: |
        set -e

        # A failed or interrupted install can leave the release secret in
        # "default" or "longhorn-system", which later causes:
        #   - "cannot re-use a name that is still in use"
        #   - conflicting meta.helm.sh/release-namespace annotations on
        #     cluster-scoped resources
        for ns in longhorn-system default; do
          # --short prints release names only; -x matches exactly "longhorn".
          if helm -n "$ns" list --all --short 2>/dev/null | grep -qx 'longhorn'; then
            # uninstall may hang (uninstall job / hooks); do not let it block
            # the whole automated run.
            timeout 120s helm -n "$ns" uninstall longhorn --no-hooks || true
          fi

          # Single backslashes: this is a literal block scalar inside single
          # quotes, so grep receives the pattern verbatim. A doubled "\\."
          # would look for a literal backslash and never match a secret name.
          sec_list="$(kubectl -n "$ns" get secret -o name 2>/dev/null | grep '^secret/sh\.helm\.release\.v1\.longhorn\.' || true)"
          if [ -n "$sec_list" ]; then
            echo "$sec_list" | xargs -n1 kubectl -n "$ns" delete --ignore-not-found=true
          fi
        done
      args:
        executable: /bin/bash
      register: longhorn_release_cleanup
      changed_when: >-
        'uninstalled' in (longhorn_release_cleanup.stdout | default(''))
        or 'deleted' in (longhorn_release_cleanup.stdout | default(''))
      failed_when: false

    - name: Helm upgrade/install Longhorn(失败兜底:install --replace)
      ansible.builtin.shell: |
        set -e
        # Arguments shared by the primary command and its fallback.
        release_args=(
          longhorn longhorn/longhorn
          --namespace longhorn-system --create-namespace
          -f "{{ longhorn_values_dest }}"
          --version "{{ longhorn_chart_version }}"
          --wait --timeout 15m
        )
        # Fallback: if upgrade --install fails, retry as install --replace.
        helm upgrade --install "${release_args[@]}" || helm install --replace "${release_args[@]}"
      args:
        executable: /bin/bash
      register: helm_longhorn
      changed_when: true
|
||||
|
||||
# Play 3 (optional): replace the local-path-config ConfigMap in kube-system
# with the lab JSON shipped in this repository and restart the provisioner.
# Runs only when longhorn_apply_local_path_lab is true (default: false).
# NOTE(review): this play was reconstructed from a copy whose indentation was
# lost; all three tasks are assumed to live inside the conditional block --
# confirm against the repository.
- name: Apply local-path-config lab defaults (optional)
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
    local_path_json_src: "{{ playbook_dir }}/../files/03-05-local-path-config/local-path-config-lab.json"
    local_path_json_dest: /root/local-path-config-lab.json
  tasks:
    - name: Apply local-path-config lab defaults (optional)
      when: longhorn_apply_local_path_lab | default(false) | bool
      # Shared by every kubectl call in the block.
      environment:
        KUBECONFIG: "{{ k3s_kubeconfig }}"
      block:
        - name: Copy local-path lab json
          ansible.builtin.copy:
            src: "{{ local_path_json_src }}"
            dest: "{{ local_path_json_dest }}"
            mode: "0644"

        - name: Apply local-path-config ConfigMap
          ansible.builtin.shell: |
            set -eo pipefail
            # Render the ConfigMap client-side, then apply it, so the task
            # works whether or not the ConfigMap already exists. pipefail
            # makes a failure of the render step fail the task.
            kubectl -n kube-system create configmap local-path-config \
              --from-file=config.json="{{ local_path_json_dest }}" \
              --dry-run=client -o yaml | kubectl apply -f -
          args:
            executable: /bin/bash
          register: local_path_apply
          # kubectl apply prints "... unchanged" when nothing was modified.
          changed_when: "'unchanged' not in (local_path_apply.stdout | default(''))"

        - name: Restart local-path-provisioner if present
          ansible.builtin.command: kubectl -n kube-system rollout restart deploy/local-path-provisioner
          register: lp_restart
          # Best-effort: a missing deployment must not fail the play.
          failed_when: false
          changed_when: lp_restart.rc == 0
|
||||
Reference in New Issue
Block a user