Files
Deploy-Laboratory/scripts/diag/recovery/k3s-recovery-reset.sh
2026-03-21 04:36:06 +08:00

149 lines
4.3 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
#
# Interactive K3s recovery helper.
# Strict mode: abort on unhandled errors, on unset variables, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Each run appends to its own timestamped log file under LOG_DIR.
LOG_DIR="/root/netpol-diag-logs"
LOG_FILE="${LOG_DIR}/recovery-$(date '+%Y%m%d-%H%M%S').log"
mkdir -p "${LOG_DIR}"

# From here on, stdout and stderr are both piped through tee, which echoes
# them to the original stdout and appends them to LOG_FILE.
exec > >(tee -a "${LOG_FILE}") 2>&1
# Print all arguments, joined by spaces, as one "[INFO] "-prefixed line.
info() {
  printf '[INFO] %s\n' "$*"
}
# Print all arguments, joined by spaces, as one "[WARN] "-prefixed line.
warn() {
  printf '[WARN] %s\n' "$*"
}
#######################################
# Ask a single yes/no style question on stdout and read the reply from stdin.
# Arguments: $1 - question text shown before the "type YES" hint
# Returns:   0 only when the reply is exactly "YES", 1 otherwise
#######################################
confirm_once() {
  local question="$1"
  local reply=""

  printf "%s (输入 YES 继续): " "${question}"
  read -r reply

  if [[ "${reply}" != "YES" ]]; then
    return 1
  fi
  return 0
}
#######################################
# Two-step confirmation for high-risk actions. Both prompts are always shown
# and both replies are always read, even when the first reply is wrong.
# Returns: 0 only when the first reply is exactly "YES" and the second is
#          exactly "RESET"; 1 otherwise
#######################################
confirm_twice_high_risk() {
  local first_reply=""
  local second_reply=""

  printf "高风险操作,第一次确认:输入 YES 执行: "
  read -r first_reply
  printf "第二次确认:输入 RESET 执行: "
  read -r second_reply

  [[ "${first_reply}" == "YES" ]] || return 1
  [[ "${second_reply}" == "RESET" ]] || return 1
  return 0
}
#######################################
# Delete the demo workloads and their network policies.
# Each table entry is "<kind> <namespace> <name>..." and becomes one
# `kubectl delete` call. Deletion is best-effort: a failing call is ignored
# so the remaining resources are still attempted.
# Returns: 0
#######################################
cleanup_demo_resources() {
  local -a targets=(
    "ingress default nginx-demo nodejs-demo"
    "ingressroute default nginx-demo nodejs-demo"
    "middleware default nginx-demo-stripprefix nodejs-demo-stripprefix"
    "service default nginx-demo nodejs-demo"
    "deployment default nginx-demo nodejs-demo"
    "networkpolicy default allow-traefik-to-nginx allow-traefik-to-nodejs"
    "networkpolicy kube-system allow-traefik-egress-to-services"
  )
  local target=""
  local -a fields=()

  info "清理 demo 资源default + kube-system"
  for target in "${targets[@]}"; do
    # fields[0] = kind, fields[1] = namespace, the rest are resource names.
    read -r -a fields <<< "${target}"
    kubectl delete "${fields[0]}" -n "${fields[1]}" "${fields[@]:2}" --ignore-not-found || true
  done
  info "demo 资源清理完成"
}
#######################################
# Restart Traefik, CoreDNS and any DaemonSets labelled k8s-app=kube-proxy in
# kube-system, then wait up to 180s each for the Traefik and CoreDNS rollouts.
# Every kubectl step is best-effort: a failure is tolerated so the remaining
# steps still run.
# Outputs: progress messages via info/warn, plus kubectl's own output
# Returns: 0
#######################################
restart_key_components() {
  local ds_names=""
  local ds=""

  info "重启关键组件"
  kubectl rollout restart deployment -n kube-system traefik || true
  kubectl rollout restart deployment -n kube-system coredns || true

  # Capture the DaemonSet list before looping instead of piping it into the
  # loop. Under `set -o pipefail` a failed `kubectl get` would fail the whole
  # pipeline and, with `set -e`, abort the script before the status checks.
  if ds_names="$(kubectl get ds -n kube-system -l k8s-app=kube-proxy -o name)"; then
    while IFS= read -r ds; do
      # An empty list still yields one empty line from the here-string.
      [[ -n "${ds}" ]] || continue
      kubectl rollout restart -n kube-system "${ds}" || true
    done <<< "${ds_names}"
  else
    warn "无法获取 kube-proxy DaemonSet 列表,已跳过"
  fi

  info "等待关键组件状态"
  kubectl rollout status deployment/traefik -n kube-system --timeout=180s || true
  kubectl rollout status deployment/coredns -n kube-system --timeout=180s || true
}
#######################################
# Print suggested manual iptables/ipset commands. Nothing is executed here:
# the lines are emitted verbatim for the operator to run by hand.
# Outputs: one warning via warn, then the guidance text on stdout
#######################################
network_rules_guidance() {
  # Single-quoted on purpose so $(date ...) is shown literally, not expanded.
  # shellcheck disable=SC2016
  local -a guidance=(
    '建议在控制节点人工执行并逐条确认:'
    '# 1) 备份当前规则'
    'iptables-save > /root/iptables-backup-$(date +%F-%H%M%S).txt'
    '# 2) 查看 KUBE-ROUTER 相关链(确认后再清理)'
    'iptables-save | grep KUBE-ROUTER || true'
    '# 3) 若你明确要清理 kube-router 规则(高风险)'
    '# iptables-save | grep -v KUBE-ROUTER | iptables-restore'
    '# 4) 查看并清理相关 ipset高风险按需逐个'
    "# ipset list -n | grep '^KUBE-'"
    '# ipset destroy <set-name>'
  )

  warn "该步骤仅打印建议命令,不自动执行。"
  printf '%s\n' "${guidance[@]}"
}
#######################################
# Print the step-by-step K3s rebuild runbook. The steps are only displayed;
# none of the commands in the text are run.
# Outputs: the runbook text on stdout
#######################################
print_rebuild_runbook() {
  local -a steps=(
    'K3s 重建步骤(只输出,不自动执行):'
    '1) 在 server 节点卸载:'
    '/usr/local/bin/k3s-uninstall.sh'
    '2) 在 agent 节点卸载:'
    '/usr/local/bin/k3s-agent-uninstall.sh'
    '3) 清理残留目录(确认后):'
    'rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/cni /opt/cni'
    '4) 重新安装 server带你当前需要的参数'
    '5) 重新 join agent'
    '6) 先部署 04-1 / 04-2 / 04-3再到 04-4 / 04-5'
    '7) 最后用 /root/check-nodejs-netpol.sh 复测'
  )

  printf '%s\n' "${steps[@]}"
}
#######################################
# Print the main menu: a blank line, the title, the five options, and a
# selection prompt that is left without a trailing newline.
# Outputs: the menu text on stdout
#######################################
show_menu() {
  cat <<'MENU'

===== K3s 恢复脚本(独立于诊断)=====
1) 仅清理 demo 资源(低风险)
2) 清理 demo + 重启关键组件(中风险)
3) 高风险网络规则清理(双重确认,默认仅打印建议)
4) 输出完整重建步骤(不自动执行)
0) 退出
MENU
  printf '%s' "请选择: "
}
#######################################
# Interactive menu loop: show the menu, read one choice from stdin, and
# dispatch it until the user selects 0 or the input ends.
#   1 -> confirm once, then clean up the demo resources
#   2 -> confirm once, then clean up and restart key components
#   3 -> confirm twice, then print the network-rule guidance
#   4 -> print the rebuild runbook
#   0 -> exit
# Globals: LOG_FILE (read)
# Returns: 0 when the user selects 0 or stdin reaches end-of-file
#######################################
main() {
  local choice=""

  info "日志文件: ${LOG_FILE}"
  while true; do
    show_menu
    # `read` fails at end-of-input (Ctrl-D, closed pipe). Unchecked, that
    # kills the script with status 1 under `set -e`, or spins forever on the
    # invalid-option arm without it. A final line lacking a trailing newline
    # is still processed; the next iteration then sees the real EOF.
    if ! read -r choice && [[ -z "${choice}" ]]; then
      echo
      warn "输入已结束,退出。日志已保存:${LOG_FILE}"
      break
    fi
    case "${choice}" in
      1)
        if confirm_once "确认执行“仅清理 demo 资源”吗?"; then
          cleanup_demo_resources
        else
          warn "已取消"
        fi
        ;;
      2)
        if confirm_once "确认执行“清理 demo + 重启关键组件”吗?"; then
          cleanup_demo_resources
          restart_key_components
        else
          warn "已取消"
        fi
        ;;
      3)
        if confirm_twice_high_risk; then
          network_rules_guidance
        else
          warn "高风险操作已取消"
        fi
        ;;
      4)
        print_rebuild_runbook
        ;;
      0)
        info "退出。日志已保存:${LOG_FILE}"
        break
        ;;
      *)
        warn "无效选项"
        ;;
    esac
  done
}
# Script entry point; main takes no arguments, so forwarding them is a no-op.
main "$@"