基本框架

This commit is contained in:
2026-03-21 04:36:06 +08:00
commit de1be1dbe5
125 changed files with 10302 additions and 0 deletions

View File

@@ -0,0 +1,148 @@
#!/bin/bash
# Script preamble: enable strict mode and set up per-run logging.
# From the `exec` line onward, everything written to stdout or stderr is both
# shown to the caller and appended to LOG_FILE.
set -o errexit -o nounset -o pipefail

# Directory that collects the log files (created on demand).
LOG_DIR="/root/netpol-diag-logs"
mkdir -p "${LOG_DIR}"

# One log file per run, named after the start time (YYYYmmdd-HHMMSS).
LOG_FILE="${LOG_DIR}/recovery-$(date '+%Y%m%d-%H%M%S').log"

# Route stdout through tee (append mode) and point stderr at the same stream.
exec > >(tee -a "${LOG_FILE}") 2>&1
# Print an informational message on stdout, prefixed with "[INFO] ".
# Arguments: the message words; they are joined via "$*".
info() {
  printf '[INFO] %s\n' "$*"
}
# Print a warning message on stdout, prefixed with "[WARN] ".
# Arguments: the message words; they are joined via "$*".
warn() {
  printf '[WARN] %s\n' "$*"
}
# Ask the operator for a single confirmation.
# Arguments: $1 - question text shown before the input hint.
# Outputs:   the prompt (without trailing newline) on stdout.
# Returns:   0 only when the line read from stdin is exactly YES
#            (after read's usual whitespace trimming), 1 otherwise.
confirm_once() {
  local prompt="$1"
  local reply=""

  printf '%s' "${prompt} (输入 YES 继续): "
  read -r reply

  if [[ "${reply}" != "YES" ]]; then
    return 1
  fi
  return 0
}
# Two-step confirmation used before the high-risk menu entry.
# Both prompts are always shown and both answers are always read, even when
# the first answer is already wrong.
# Outputs: the two prompts on stdout (no trailing newline).
# Returns: 0 only when the first answer is YES and the second is RESET.
confirm_twice_high_risk() {
  local first=""
  local second=""

  printf '%s' "高风险操作,第一次确认:输入 YES 执行: "
  read -r first
  printf '%s' "第二次确认:输入 RESET 执行: "
  read -r second

  if [[ "${first}" != "YES" ]]; then
    return 1
  fi
  [[ "${second}" == "RESET" ]]
}
# Delete the demo objects from the default and kube-system namespaces.
# Each deletion is best-effort: missing objects are ignored by kubectl
# (--ignore-not-found) and any other kubectl failure is swallowed so the
# remaining deletions still run.
cleanup_demo_resources() {
  # One row per kubectl call: "<kind> <namespace> <name> [<name>...]".
  local -a targets=(
    "ingress default nginx-demo nodejs-demo"
    "ingressroute default nginx-demo nodejs-demo"
    "middleware default nginx-demo-stripprefix nodejs-demo-stripprefix"
    "service default nginx-demo nodejs-demo"
    "deployment default nginx-demo nodejs-demo"
    "networkpolicy default allow-traefik-to-nginx allow-traefik-to-nodejs"
    "networkpolicy kube-system allow-traefik-egress-to-services"
  )
  local row
  local -a fields

  info "清理 demo 资源default + kube-system"
  for row in "${targets[@]}"; do
    # Split the row on spaces regardless of the caller's IFS.
    IFS=' ' read -r -a fields <<< "${row}"
    kubectl delete "${fields[0]}" -n "${fields[1]}" "${fields[@]:2}" --ignore-not-found || true
  done
  info "demo 资源清理完成"
}
# Restart traefik, coredns and any kube-proxy DaemonSets in kube-system, then
# wait (up to 180s each) for the traefik and coredns rollouts.
# Every step is best-effort: a failing kubectl call is tolerated so the
# remaining steps still run and the calling menu loop keeps going.
restart_key_components() {
  local ds
  local ds_list=""

  info "重启关键组件"
  kubectl rollout restart deployment -n kube-system traefik || true
  kubectl rollout restart deployment -n kube-system coredns || true

  # Collect the DaemonSet names first. Feeding `kubectl get` straight into a
  # `while read` pipeline would, under `set -e -o pipefail`, abort the whole
  # script when the lookup fails; here the failure is tolerated like every
  # other kubectl call in this function. Whatever was printed is still used.
  ds_list="$(kubectl get ds -n kube-system -l k8s-app=kube-proxy -o name)" || true
  while read -r ds; do
    # An empty list yields one empty line from the here-string; skip it.
    [[ -n "${ds}" ]] || continue
    kubectl rollout restart -n kube-system "${ds}" || true
  done <<< "${ds_list}"

  info "等待关键组件状态"
  kubectl rollout status deployment/traefik -n kube-system --timeout=180s || true
  kubectl rollout status deployment/coredns -n kube-system --timeout=180s || true
}
# Print manual guidance for inspecting and cleaning kube-router iptables/ipset
# state. Nothing is executed here; the commands are emitted as literal text.
network_rules_guidance() {
  # Literal lines: the $(date ...) below is meant to be printed, not expanded.
  # shellcheck disable=SC2016
  local -a advice=(
    '建议在控制节点人工执行并逐条确认:'
    '# 1) 备份当前规则'
    'iptables-save > /root/iptables-backup-$(date +%F-%H%M%S).txt'
    '# 2) 查看 KUBE-ROUTER 相关链(确认后再清理)'
    'iptables-save | grep KUBE-ROUTER || true'
    '# 3) 若你明确要清理 kube-router 规则(高风险)'
    '# iptables-save | grep -v KUBE-ROUTER | iptables-restore'
    '# 4) 查看并清理相关 ipset高风险按需逐个'
    "# ipset list -n | grep '^KUBE-'"
    '# ipset destroy <set-name>'
  )

  warn "该步骤仅打印建议命令,不自动执行。"
  printf '%s\n' "${advice[@]}"
}
# Print the K3s rebuild runbook as plain text, one step per line.
# The listed commands are only displayed; none of them is run here.
print_rebuild_runbook() {
  printf '%s\n' \
    'K3s 重建步骤(只输出,不自动执行):' \
    '1) 在 server 节点卸载:' \
    '/usr/local/bin/k3s-uninstall.sh' \
    '2) 在 agent 节点卸载:' \
    '/usr/local/bin/k3s-agent-uninstall.sh' \
    '3) 清理残留目录(确认后):' \
    'rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/cni /opt/cni' \
    '4) 重新安装 server带你当前需要的参数' \
    '5) 重新 join agent' \
    '6) 先部署 04-1 / 04-2 / 04-3再到 04-4 / 04-5' \
    '7) 最后用 /root/check-nodejs-netpol.sh 复测'
}
# Render the interactive menu: a blank line, the title, the five options, and
# finally the selection prompt (left without a trailing newline).
show_menu() {
  cat <<'MENU'

===== K3s 恢复脚本(独立于诊断)=====
1) 仅清理 demo 资源(低风险)
2) 清理 demo + 重启关键组件(中风险)
3) 高风险网络规则清理(双重确认,默认仅打印建议)
4) 输出完整重建步骤(不自动执行)
0) 退出
MENU
  printf '%s' "请选择: "
}
# Interactive entry point: show the menu, read one choice per iteration and
# dispatch to the matching action until the operator picks 0.
# Globals:  LOG_FILE (read) - path reported at start and on exit.
# Input:    menu choices (and confirmations) are read from stdin.
# Returns:  0 after a regular exit, including when stdin reaches end-of-file.
main() {
  local choice=""

  info "日志文件: ${LOG_FILE}"
  while true; do
    show_menu
    # At end-of-file `read` returns non-zero; under `set -e` that would kill
    # the script without the exit message (and loop forever without it).
    # Treat EOF as choosing "0", but still honour a final line that merely
    # lacks its trailing newline.
    if ! read -r choice; then
      [[ -n "${choice}" ]] || choice="0"
    fi
    case "${choice}" in
      1)
        if confirm_once "确认执行“仅清理 demo 资源”吗?"; then
          cleanup_demo_resources
        else
          warn "已取消"
        fi
        ;;
      2)
        if confirm_once "确认执行“清理 demo + 重启关键组件”吗?"; then
          cleanup_demo_resources
          restart_key_components
        else
          warn "已取消"
        fi
        ;;
      3)
        # Guarded by the double confirmation even though the action only
        # prints guidance.
        if confirm_twice_high_risk; then
          network_rules_guidance
        else
          warn "高风险操作已取消"
        fi
        ;;
      4)
        print_rebuild_runbook
        ;;
      0)
        info "退出。日志已保存:${LOG_FILE}"
        break
        ;;
      *)
        warn "无效选项"
        ;;
    esac
  done
}
# Script entry point: start the interactive menu loop.
main