基本框架
This commit is contained in:
148
scripts/diag/recovery/k3s-recovery-reset.sh
Normal file
148
scripts/diag/recovery/k3s-recovery-reset.sh
Normal file
@@ -0,0 +1,148 @@
|
||||
#!/bin/bash
# Interactive K3s recovery helper: offers menu-driven cleanup of the demo
# resources, restart of key kube-system components, and printed (never
# auto-executed) guidance for high-risk network-rule cleanup and a rebuild.
#
# Strict mode: abort on unhandled errors, unset variables and failed pipelines.
set -euo pipefail

# Every run writes to its own timestamped log file under LOG_DIR.
LOG_DIR="/root/netpol-diag-logs"
mkdir -p "${LOG_DIR}"
LOG_FILE="${LOG_DIR}/recovery-$(date '+%Y%m%d-%H%M%S').log"

# From here on, stdout and stderr are both shown on the terminal and appended
# to LOG_FILE through tee.
exec > >(tee -a "${LOG_FILE}") 2>&1
|
||||
|
||||
# Logging helpers: print all arguments, joined by spaces, behind a level tag.
# Both write to stdout.
info() { printf '[INFO] %s\n' "$*"; }
warn() { printf '[WARN] %s\n' "$*"; }
|
||||
|
||||
#######################################
# Ask the operator for a single confirmation.
# Arguments: $1 - prompt text, printed followed by the "type YES" hint
# Outputs:   the prompt on stdout, without a trailing newline
# Returns:   0 if the line read from stdin equals YES (after read's default
#            whitespace trimming), non-zero otherwise
#######################################
confirm_once() {
  local prompt="$1"
  local answer=""
  printf "%s (输入 YES 继续): " "${prompt}"
  # A failed read (e.g. EOF) must count as "not confirmed" instead of
  # tripping errexit when the caller is not inside an `if` condition.
  read -r answer || true
  [[ "${answer}" == "YES" ]]
}
|
||||
|
||||
#######################################
# Two-step confirmation for high-risk actions.
# Both prompts are always shown; the first answer must be YES and the second
# must be RESET.
# Outputs:   the two prompts on stdout, without trailing newlines
# Returns:   0 only when both answers match, non-zero otherwise
#######################################
confirm_twice_high_risk() {
  local answer1=""
  local answer2=""
  printf "高风险操作,第一次确认:输入 YES 执行: "
  # Failed reads (e.g. EOF) leave the answer unmatched rather than tripping
  # errexit when the caller is not inside an `if` condition.
  read -r answer1 || true
  printf "第二次确认:输入 RESET 执行: "
  read -r answer2 || true
  [[ "${answer1}" == "YES" && "${answer2}" == "RESET" ]]
}
|
||||
|
||||
#######################################
# Delete the nginx/nodejs demo objects from the default namespace and the
# Traefik egress NetworkPolicy from kube-system.
# Every deletion is deliberately best-effort: a failing kubectl call is
# ignored (`|| true`) so one failure does not stop the remaining deletions
# or abort the script under errexit.
# Outputs:   progress messages via info, plus whatever kubectl prints
# Returns:   0
#######################################
cleanup_demo_resources() {
  info "清理 demo 资源(default + kube-system)"

  # Routing objects first, then the workloads behind them.
  kubectl delete ingress -n default nginx-demo nodejs-demo --ignore-not-found || true
  kubectl delete ingressroute -n default nginx-demo nodejs-demo --ignore-not-found || true
  kubectl delete middleware -n default nginx-demo-stripprefix nodejs-demo-stripprefix --ignore-not-found || true
  kubectl delete service -n default nginx-demo nodejs-demo --ignore-not-found || true
  kubectl delete deployment -n default nginx-demo nodejs-demo --ignore-not-found || true

  # NetworkPolicies in both namespaces.
  kubectl delete networkpolicy -n default allow-traefik-to-nginx allow-traefik-to-nodejs --ignore-not-found || true
  kubectl delete networkpolicy -n kube-system allow-traefik-egress-to-services --ignore-not-found || true

  info "demo 资源清理完成"
}
|
||||
|
||||
#######################################
# Restart Traefik, CoreDNS and any DaemonSet labelled k8s-app=kube-proxy in
# kube-system, then wait up to 180s each for the Traefik and CoreDNS rollouts.
# Every kubectl call is best-effort: failures are ignored so the remaining
# steps still run and the script is not aborted by errexit.
# Outputs:   progress messages via info/warn, plus whatever kubectl prints
# Returns:   0
#######################################
restart_key_components() {
  local ds=""
  local ds_names=""

  info "重启关键组件"
  kubectl rollout restart deployment -n kube-system traefik || true
  kubectl rollout restart deployment -n kube-system coredns || true

  # Capture the DaemonSet list first instead of piping it into the loop:
  # with `set -o pipefail` a failing `kubectl get` in a pipeline would abort
  # the whole script, unlike every other step here. Names already printed
  # before a failure are still kept and processed.
  ds_names="$(kubectl get ds -n kube-system -l k8s-app=kube-proxy -o name)" \
    || warn "无法列出 kube-proxy DaemonSet,已忽略并继续"
  while read -r ds; do
    # The here-string yields one empty line when nothing was listed.
    [[ -n "${ds}" ]] || continue
    kubectl rollout restart -n kube-system "${ds}" || true
  done <<< "${ds_names}"

  info "等待关键组件状态"
  kubectl rollout status deployment/traefik -n kube-system --timeout=180s || true
  kubectl rollout status deployment/coredns -n kube-system --timeout=180s || true
}
|
||||
|
||||
#######################################
# Print suggested commands for backing up and cleaning kube-router iptables
# rules and KUBE-* ipsets. Nothing is executed: the heredoc delimiter is
# quoted, so the $(date ...) inside is printed literally for the operator to
# copy and run by hand.
# Outputs:   a warning via warn, then the guidance text on stdout
#######################################
network_rules_guidance() {
  warn "该步骤仅打印建议命令,不自动执行。"
  cat <<'EOF'
建议在控制节点人工执行并逐条确认:

# 1) 备份当前规则
iptables-save > /root/iptables-backup-$(date +%F-%H%M%S).txt

# 2) 查看 KUBE-ROUTER 相关链(确认后再清理)
iptables-save | grep KUBE-ROUTER || true

# 3) 若你明确要清理 kube-router 规则(高风险)
# iptables-save | grep -v KUBE-ROUTER | iptables-restore

# 4) 查看并清理相关 ipset(高风险,按需逐个)
# ipset list -n | grep '^KUBE-'
# ipset destroy <set-name>

EOF
}
|
||||
|
||||
#######################################
# Print the manual K3s rebuild runbook (uninstall server/agent, remove
# leftover directories, reinstall, re-join, redeploy, re-test).
# The text is only printed; none of the listed commands are executed.
# Outputs:   the runbook on stdout
#######################################
print_rebuild_runbook() {
  cat <<'EOF'
K3s 重建步骤(只输出,不自动执行):

1) 在 server 节点卸载:
/usr/local/bin/k3s-uninstall.sh

2) 在 agent 节点卸载:
/usr/local/bin/k3s-agent-uninstall.sh

3) 清理残留目录(确认后):
rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/cni /opt/cni

4) 重新安装 server(带你当前需要的参数)
5) 重新 join agent
6) 先部署 04-1 / 04-2 / 04-3,再到 04-4 / 04-5
7) 最后用 /root/check-nodejs-netpol.sh 复测
EOF
}
|
||||
|
||||
#######################################
# Print the main menu: a leading blank line, the title, one line per option,
# and finally the selection prompt without a trailing newline.
# Outputs:   the menu on stdout
#######################################
show_menu() {
  printf '%s\n' \
    "" \
    "===== K3s 恢复脚本(独立于诊断)=====" \
    "1) 仅清理 demo 资源(低风险)" \
    "2) 清理 demo + 重启关键组件(中风险)" \
    "3) 高风险网络规则清理(双重确认,默认仅打印建议)" \
    "4) 输出完整重建步骤(不自动执行)" \
    "0) 退出"
  # The prompt stays on the same line as the operator's answer.
  printf "请选择: "
}
|
||||
|
||||
#######################################
# Interactive menu loop: show the menu, read one choice per iteration and
# dispatch to the matching action until the operator selects 0.
# Globals:   LOG_FILE (read)
# Returns:   0 after the operator chooses 0;
#            1 if stdin ends before that (a warning is logged first).
#######################################
main() {
  local choice=""

  info "日志文件: ${LOG_FILE}"
  while true; do
    show_menu
    # Handle end of input explicitly: a bare failing `read` would make
    # errexit kill the script without any message. A final line that lacks a
    # trailing newline still carries a choice and is processed normally.
    if ! read -r choice && [[ -z "${choice}" ]]; then
      warn "输入已结束(EOF),退出。日志已保存:${LOG_FILE}"
      return 1
    fi

    case "${choice}" in
      1)
        if confirm_once "确认执行“仅清理 demo 资源”吗?"; then
          cleanup_demo_resources
        else
          warn "已取消"
        fi
        ;;
      2)
        if confirm_once "确认执行“清理 demo + 重启关键组件”吗?"; then
          cleanup_demo_resources
          restart_key_components
        else
          warn "已取消"
        fi
        ;;
      3)
        # Even after both confirmations only guidance is printed.
        if confirm_twice_high_risk; then
          network_rules_guidance
        else
          warn "高风险操作已取消"
        fi
        ;;
      4)
        print_rebuild_runbook
        ;;
      0)
        info "退出。日志已保存:${LOG_FILE}"
        break
        ;;
      *)
        warn "无效选项"
        ;;
    esac
  done
}
|
||||
|
||||
# Script entry point: start the interactive menu loop.
main "$@"
|
||||
Reference in New Issue
Block a user