Files
Deploy-Laboratory/ansible/playbooks/k3s-init-and-install.yml
jack 231b6713c4 chore: 对齐 00-05 §2 的部署与验证脚本
- 新增 deploy-lab.sh(k3s/longhorn/nginx 铺栈)与 ssh/run-phase2-k3s-on-ylc61-as-jack.sh
- verify.sh:flow/preflight、VERIFY_TEARDOWN 默认、注释与 §2 对应
- 更新 smoke-verify、README、.env.verify.example、根 README 与主要 playbook 头注释
- k3s-delete-lab-stacks 标明重度清场语义

Made-with: Cursor
2026-03-26 07:32:08 +08:00

271 lines
11 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
---
# Deployment (docs/00-05 §2, step 3, "formal installation"): cluster-wide K3s plus node preparation (not a single verify.sh teardown).
# Prerequisites: §2 step 1, access (inventory/SSH); step 2 (optional): scripts/deploy-lab.sh runs k3s-prepare-storage.yml first when K3S_PREPARE_STORAGE=true.
# Entry point: ./scripts/deploy-lab.sh k3s from the repository root, or: ansible-playbook -i ansible/inventory.ini ansible/playbooks/k3s-init-and-install.yml
- name: Verify /storage is a separate mount (optional)
  hosts: k3s_nodes
  become: true
  tasks:
    # Opt-in check: the whole block is skipped unless
    # k3s_verify_storage_mount is set to a truthy value.
    - name: Check / and /storage mount sources
      when: k3s_verify_storage_mount | default(false) | bool
      block:
        # Read-only probe of the source backing the root filesystem.
        - name: Get mount source for /
          ansible.builtin.command:
            argv:
              - findmnt
              - -n
              - -o
              - SOURCE
              - /
          changed_when: false
          register: mnt_root

        # A non-zero exit is tolerated here; the assertion below inspects rc.
        - name: Get mount source for /storage
          ansible.builtin.command:
            argv:
              - findmnt
              - -n
              - -o
              - SOURCE
              - /storage
          changed_when: false
          failed_when: false
          register: mnt_storage

        # Fails the host unless both lookups returned a non-empty source
        # and the two sources differ.
        - name: Assert /storage is mounted on a different device than /
          ansible.builtin.assert:
            fail_msg: >-
              /storage must be a mount point on a block device different from /.
              See docs/00-04-部署环境说明.md and docs/01-06-节点初始化-ansible-实践.md
            that:
              - mnt_storage.rc == 0
              - (mnt_root.stdout | trim | length) > 0
              - (mnt_storage.stdout | trim | length) > 0
              - (mnt_root.stdout | trim) != (mnt_storage.stdout | trim)
- name: Init base system
  hosts: k3s_nodes
  become: true
  tasks:
    # Probe firewalld once; later tasks only touch the firewall when it
    # reports "running". failed_when: false keeps a non-zero exit from
    # failing the host.
    - name: Check if firewalld is running
      ansible.builtin.command:
        cmd: firewall-cmd --state
      register: firewalld_state
      changed_when: false
      failed_when: false

    # Read the current timezone so set-timezone only runs (and only reports
    # "changed") when the value actually differs. If this probe fails,
    # stdout is empty and the timezone is set unconditionally, as before.
    - name: Get current timezone
      ansible.builtin.command:
        argv:
          - timedatectl
          - show
          - --property=Timezone
          - --value
      register: current_timezone
      changed_when: false
      failed_when: false
      when: timezone is defined and timezone != ""

    # Set the system timezone from the `timezone` variable (optional).
    - name: Set timezone
      ansible.builtin.command:
        argv:
          - timedatectl
          - set-timezone
          - "{{ timezone }}"
      when:
        - timezone is defined and timezone != ""
        - (current_timezone.stdout | default('') | trim) != timezone

    # Install the basic tools used by the k3s installation (curl, git).
    - name: Install basic packages
      ansible.builtin.package:
        name:
          - curl
          - git
        state: present

    # Ensure /etc/hosts resolves every k3s node (optional). The hostname is
    # regex-escaped so dots in an FQDN match literally instead of acting as
    # wildcards that could replace an unrelated line.
    - name: Ensure /etc/hosts has entries for all k3s nodes
      ansible.builtin.lineinfile:
        path: /etc/hosts
        regexp: '^\S+\s+{{ item | regex_escape }}\s*$'
        line: "{{ hostvars[item]['ansible_host'] }} {{ item }}"
        state: present
      loop: "{{ groups['k3s_nodes'] }}"
      when:
        - k3s_manage_hosts | default(true) | bool
        - hostvars[item]['ansible_host'] is defined

    # Ports required by k3s: 8472/udp (flannel VXLAN) on every node and
    # 6443/tcp (API) on the server only. They are opened before k3s is
    # installed so workers can join and flannel can build its overlay.
    # firewall-cmd prints an ALREADY_ENABLED warning on stderr when the port
    # is already present, which is used to report "changed" accurately.
    - name: Open flannel VXLAN port (8472/udp) on all k3s nodes
      ansible.builtin.command:
        cmd: firewall-cmd --permanent --add-port=8472/udp
      register: fw_flannel_port
      changed_when: "'ALREADY_ENABLED' not in fw_flannel_port.stderr"
      when:
        - k3s_manage_firewalld | default(true) | bool
        - firewalld_state.stdout | default('') == 'running'

    - name: Open k3s API port (6443/tcp) on server
      ansible.builtin.command:
        cmd: firewall-cmd --permanent --add-port=6443/tcp
      register: fw_api_port
      changed_when: "'ALREADY_ENABLED' not in fw_api_port.stderr"
      when:
        - k3s_manage_firewalld | default(true) | bool
        - inventory_hostname in groups['k3s_server']
        - firewalld_state.stdout | default('') == 'running'

    # Reload so the permanent rules become active. Kept unconditional (not
    # tied to the add-port results) so that a permanent rule written by an
    # earlier, interrupted run is still activated.
    - name: Reload firewalld after opening k3s ports
      ansible.builtin.command:
        cmd: firewall-cmd --reload
      when:
        - k3s_manage_firewalld | default(true) | bool
        - firewalld_state.stdout | default('') == 'running'
- name: Install k3s server
  hosts: k3s_server
  become: true
  tasks:
    # Download the k3s install script and run it in server mode.
    # `set -o pipefail` (bash) makes the task fail when curl fails; without
    # it the pipeline's status is that of `sh -`, which can succeed on empty
    # input and mask a failed download.
    # `creates` skips the task once the server data directory exists.
    - name: Download and install k3s server
      ansible.builtin.shell:
        cmd: |
          set -o pipefail
          curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --data-dir={{ k3s_data_dir }}" sh -
        executable: /bin/bash
        creates: "{{ k3s_data_dir }}/server"
- name: Install k3s agent (workers)
  hosts: k3s_worker
  become: true
  serial: 1  # install one worker at a time to limit parallel download load
  tasks:
    # Read the cluster join token from the first server node.
    # run_once applies per batch, so with serial: 1 this runs for each worker.
    # no_log keeps the base64-encoded token out of task output.
    - name: Read k3s token from first server
      ansible.builtin.slurp:
        src: "{{ k3s_data_dir }}/server/token"
      delegate_to: "{{ groups['k3s_server'][0] }}"
      run_once: true
      register: k3s_token_from_server
      no_log: true

    # Store the decoded token as a host fact for the install task.
    - name: Set fact for k3s token on workers
      ansible.builtin.set_fact:
        k3s_token: "{{ k3s_token_from_server.content | b64decode | trim }}"
      no_log: true

    # Download the k3s install script and run it in agent mode.
    # The URL, token and exec arguments are passed through `environment`
    # instead of being templated into the command line, so the token does not
    # appear in the recorded command and needs no shell quoting.
    # `set -o pipefail` (bash) makes a failed curl download fail the task.
    - name: Install k3s agent
      ansible.builtin.shell:
        cmd: |
          set -o pipefail
          curl -sfL https://get.k3s.io | sh -
        executable: /bin/bash
        creates: "{{ k3s_data_dir }}/agent"
      environment:
        K3S_URL: "https://{{ k3s_server_ip }}:6443"
        K3S_TOKEN: "{{ k3s_token }}"
        INSTALL_K3S_EXEC: "agent --data-dir={{ k3s_data_dir }}"
      async: 600
      poll: 15
- name: Configure firewalld baseline for k3s (flannel.1 / cni0 -> trusted)
  hosts: k3s_nodes
  become: true
  tasks:
    # firewalld baseline for k3s: put the flannel.1 and cni0 interfaces into
    # the trusted zone. Skipped entirely when k3s_manage_firewalld is false.
    - name: Trust k3s CNI interfaces in firewalld
      when: k3s_manage_firewalld | default(true) | bool
      block:
        # Probe firewalld; a non-zero exit is tolerated and simply causes
        # the remaining tasks to be skipped.
        - name: Check if firewalld is available
          ansible.builtin.command:
            cmd: firewall-cmd --state
          register: firewalld_check
          changed_when: false
          failed_when: false

        # Poll once per second, up to 120 times, until both interfaces
        # exist. The loop only reads state, so it never reports "changed".
        - name: Wait for CNI interfaces (flannel.1, cni0) to appear
          ansible.builtin.shell: |
            for i in $(seq 1 120); do
              ip link show flannel.1 >/dev/null 2>&1 && ip link show cni0 >/dev/null 2>&1 && exit 0
              sleep 1
            done
            exit 1
          changed_when: false
          when: firewalld_check.stdout | default('') == 'running'

        # Add each interface to the trusted zone in both the runtime and the
        # permanent configuration.
        - name: Add flannel.1 and cni0 to firewalld trusted zone (runtime + permanent)
          ansible.builtin.shell: |
            firewall-cmd --zone=trusted --add-interface={{ item }}
            firewall-cmd --permanent --zone=trusted --add-interface={{ item }}
          loop:
            - flannel.1
            - cni0
          when: firewalld_check.stdout | default('') == 'running'

        # Reload firewalld so the permanent configuration is applied.
        - name: Reload firewalld
          ansible.builtin.command:
            cmd: firewall-cmd --reload
          when: firewalld_check.stdout | default('') == 'running'
- name: Configure CoreDNS (IPv4 upstream for ACME)
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
  tasks:
    # Replace "forward . /etc/resolv.conf" in the CoreDNS Corefile with the
    # servers listed in coredns_forward_servers. Skipped entirely when
    # k3s_manage_coredns is false.
    # NOTE(review): k3s may re-apply its packaged CoreDNS manifest on server
    # restart, which would revert this patch - confirm for the version in use.
    - name: Patch CoreDNS forward upstreams
      when: k3s_manage_coredns | default(true) | bool
      block:
        # Read-only wait; never reports "changed".
        - name: Wait for CoreDNS deployment to be ready
          ansible.builtin.shell: |
            KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/coredns -n kube-system --timeout=120s
          changed_when: false

        # Dump the current Corefile to a scratch file; the file is removed
        # in the `always` section, so this is not reported as a change.
        - name: Extract CoreDNS Corefile from ConfigMap
          ansible.builtin.shell: |
            KUBECONFIG={{ k3s_kubeconfig }} kubectl get configmap coredns -n kube-system -o jsonpath='{.data.Corefile}' > /tmp/coredns-corefile.txt
          changed_when: false

        # Reports "changed" only while the original forward line is still
        # present, which gates the apply/restart task below.
        - name: Patch Corefile forward to IPv4 (avoid IPv6 upstream in Pod network)
          ansible.builtin.replace:
            path: /tmp/coredns-corefile.txt
            regexp: 'forward \. /etc/resolv\.conf'
            replace: 'forward . {{ coredns_forward_servers }}'
          register: coredns_patched

        # `set -eo pipefail` stops at the first failing command; without it
        # a failed create/apply would be hidden by the exit status of the
        # final `rollout status`.
        - name: Apply patched CoreDNS ConfigMap and restart
          ansible.builtin.shell:
            cmd: |
              set -eo pipefail
              KUBECONFIG={{ k3s_kubeconfig }} kubectl create configmap coredns --from-file=Corefile=/tmp/coredns-corefile.txt -n kube-system --dry-run=client -o yaml | KUBECONFIG={{ k3s_kubeconfig }} kubectl apply -f -
              KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout restart deployment/coredns -n kube-system
              KUBECONFIG={{ k3s_kubeconfig }} kubectl rollout status deployment/coredns -n kube-system --timeout=60s
            executable: /bin/bash
          when: coredns_patched is changed
      always:
        # Runs even when a task above fails, so the scratch file never
        # lingers in /tmp.
        - name: Remove temp Corefile
          ansible.builtin.file:
            path: /tmp/coredns-corefile.txt
            state: absent
- name: 安装后验证 - traefik / nodes / curl
  hosts: k3s_server
  become: true
  run_once: true
  vars:
    k3s_kubeconfig: /etc/rancher/k3s/k3s.yaml
  tasks:
    # Label every host in the k3s_server group as control-plane (needed by
    # matrix M1 in 02-05 for scheduling). Inventory names are used as node
    # names. kubectl prints "not labeled" when nothing was modified, which is
    # used to report "changed" accurately.
    - name: Label control-plane nodes (k3s 不默认打标M1 需此标签)
      ansible.builtin.shell: |
        KUBECONFIG={{ k3s_kubeconfig }} kubectl label node {{ item }} node-role.kubernetes.io/control-plane= --overwrite
      register: label_control_plane
      changed_when: "'not labeled' not in label_control_plane.stdout"
      loop: "{{ groups['k3s_server'] | default([]) }}"

    # Optional: label every host in the k3s_worker group as worker (needed
    # by matrix M3 in 02-05). Controlled by k3s_manage_role_labels.
    - name: 可选 - 为工作节点打 worker 标签02-05 矩阵 M3 需要)
      ansible.builtin.shell: |
        KUBECONFIG={{ k3s_kubeconfig }} kubectl label node {{ item }} node-role.kubernetes.io/worker= --overwrite
      register: label_worker
      changed_when: "'not labeled' not in label_worker.stdout"
      loop: "{{ groups['k3s_worker'] | default([]) }}"
      when: k3s_manage_role_labels | default(true) | bool

    # List the Traefik / svclb pods in kube-system (header line included).
    - name: kubectl get pods -n kube-systemtraefik / svclb
      ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl get pods -n kube-system -o wide | grep -E 'NAME|traefik|svclb'
      register: verify_traefik
      changed_when: false

    # Print the pod listing captured above, one line per loop item.
    - name: ">>> Traefik 相关 Pods"
      ansible.builtin.debug:
        msg: "{{ item }}"
      loop: "{{ verify_traefik.stdout_lines }}"

    # List the nodes currently registered in the cluster.
    - name: kubectl get nodes
      ansible.builtin.shell: KUBECONFIG={{ k3s_kubeconfig }} kubectl get nodes
      register: verify_nodes
      changed_when: false

    # Print the node listing so node status and roles can be checked.
    - name: ">>> kubectl get nodes"
      ansible.builtin.debug:
        msg: "{{ item }}"
      loop: "{{ verify_nodes.stdout_lines }}"

    # Probe ports 80 and 443 on every node with curl and print the HTTP
    # status, or "fail" when curl exits non-zero. Hosts without ansible_host
    # are left out instead of aborting the template, matching the guard used
    # by the /etc/hosts task.
    - name: curl 测试各节点 80/443 可达性
      ansible.builtin.shell: |
        for ip in {{ groups['k3s_nodes'] | map('extract', hostvars) | selectattr('ansible_host', 'defined') | map(attribute='ansible_host') | join(' ') }}; do
          c80=$(curl -sk -o /dev/null -w "%{http_code}" --connect-timeout 2 http://$ip 2>/dev/null) || c80="fail"
          c443=$(curl -sk -o /dev/null -w "%{http_code}" --connect-timeout 2 https://$ip 2>/dev/null) || c443="fail"
          echo "$ip: 80=$c80 443=$c443"
        done
      register: verify_curl
      changed_when: false

    # Print the per-node curl results.
    - name: ">>> curl 结果"
      ansible.builtin.debug:
        msg: "{{ item }}"
      loop: "{{ verify_curl.stdout_lines }}"