基本框架

This commit is contained in:
2026-03-21 04:36:06 +08:00
commit de1be1dbe5
125 changed files with 10302 additions and 0 deletions

View File

@@ -0,0 +1,419 @@
#!/usr/bin/env bash
# Interactive diagnostic script: collects Kubernetes / Traefik / iptables state
# for one backend application, probes the request path step by step, and prints
# hints about likely causes of 404/502/503 responses.
set -euo pipefail
# Namespace and label selector used to locate the Traefik pods.
NS_TRAEFIK="kube-system"
APP_TRAEFIK_LABEL="app.kubernetes.io/name=traefik"
# Timeout value handed to the curl (-m) and wget (--timeout) probes.
TIMEOUT=3
# Limits applied to the Traefik `kubectl logs` calls (--tail / --since).
LOG_TAIL=200
LOG_SINCE="20m"
# Log destination; both are empty here and assigned later in the script.
LOG_DIR=""
LOG_FILE=""
# Probe results; each one stays "SKIP" unless the matching probe runs and
# overwrites it.
PROBE_CLIENT="SKIP"
PROBE_TRAEFIK_TO_SVC="SKIP"
PROBE_TRAEFIK_TO_POD="SKIP"
PROBE_TRAEFIK_DNS="SKIP"
# Print a section heading: one blank line followed by "=== <title> ===".
# Arguments: $1 - heading text
print_title() {
  printf '\n=== %s ===\n' "$1"
}
# Run the given command line and always report success, so a failing
# diagnostic command does not stop the script under `set -e`.
# Arguments: $@ - command and its arguments
safe_run() {
  if ! "$@"; then
    return 0
  fi
}
# Ensure a required command is available; otherwise print an error line and
# terminate the script with status 1.
# Arguments: $1 - command name to look up
require_cmd() {
  local tool="$1"
  command -v "${tool}" >/dev/null 2>&1 && return 0
  echo "[ERR] 缺少命令: ${tool}"
  exit 1
}
# Resolve the real binaries now, before the wrapper functions named `kubectl`
# and `iptables` are defined further down: once those functions exist,
# `command -v` would report the function name instead of a path and the
# wrappers would end up calling themselves.
# `|| true` keeps `set -e` from aborting here when a tool is missing; the
# value is then simply empty.
KUBECTL_PATH="$(command -v kubectl || true)"
IPTABLES_PATH="$(command -v iptables || true)"
# Non-empty means the wrappers must prefix their command with sudo.
USE_SUDO=""
# Verify required tools, decide whether sudo is needed, and choose LOG_DIR.
# Globals: EUID, HOME (read); USE_SUDO, LOG_DIR (written)
# Exits (via require_cmd) when a required command is missing.
init_runtime() {
  local tool
  for tool in kubectl iptables awk grep curl; do
    require_cmd "${tool}"
  done

  # Root needs no sudo and logs under /root.
  if [[ "${EUID}" -eq 0 ]]; then
    LOG_DIR="/root/netpol-diag-logs"
    return 0
  fi

  if command -v sudo >/dev/null 2>&1; then
    # Try non-interactively first; if that fails, ask for the password once.
    if ! sudo -n true 2>/dev/null; then
      echo "[INFO] 需要 sudo 权限以读取 iptables / kubectl 配置。"
      sudo -v
    fi
    USE_SUDO="1"
  fi

  # Non-root users log under HOME to avoid permission problems with /root.
  LOG_DIR="${HOME}/netpol-diag-logs"
}
# Central wrapper so the rest of the script never has to decide by hand
# whether sudo is required.
# Globals: USE_SUDO, KUBECTL_PATH (read)
# Arguments: $@ - passed through to the real kubectl binary
kubectl() {
  local -a launcher=("${KUBECTL_PATH}")
  [[ -z "${USE_SUDO}" ]] || launcher=(sudo "${launcher[@]}")
  "${launcher[@]}" "$@"
}
# Wrapper around the real iptables binary; prefixes sudo when USE_SUDO is set.
# Globals: USE_SUDO, IPTABLES_PATH (read)
# Arguments: $@ - passed through to the real iptables binary
iptables() {
  local -a launcher=("${IPTABLES_PATH}")
  [[ -z "${USE_SUDO}" ]] || launcher=(sudo "${launcher[@]}")
  "${launcher[@]}" "$@"
}
# Fetch a URL with wget from inside the Traefik deployment.
# Globals:   NS_TRAEFIK, TIMEOUT (read)
# Arguments: $1 - URL to fetch
# Outputs:   on success the response body, on failure the captured stderr,
#            both written to stdout
# Returns:   0 when the in-pod wget succeeded, 1 otherwise
probe_wget_from_traefik() {
  local url="$1"
  local out_file err_file
  local rc=0

  # Private temp files instead of fixed /tmp names: avoids clobbering between
  # concurrent runs, stale content from earlier runs, and symlink tricks.
  out_file="$(mktemp)" || {
    echo "mktemp failed" >&2
    return 1
  }
  err_file="$(mktemp)" || {
    rm -f -- "${out_file}"
    echo "mktemp failed" >&2
    return 1
  }

  # Options are placed before the URL so wget builds that do not reorder
  # arguments still honour the timeout.
  if kubectl exec -n "${NS_TRAEFIK}" deploy/traefik -- \
    wget -qO- --timeout="${TIMEOUT}" "${url}" >"${out_file}" 2>"${err_file}"; then
    cat -- "${out_file}"
  else
    rc=1
    cat -- "${err_file}"
  fi

  rm -f -- "${out_file}" "${err_file}"
  return "${rc}"
}
# Print a prompt, read one line from stdin and store it (or the default when
# the answer is empty) in the named global variable.
# A failed read (EOF, e.g. non-interactive stdin) is tolerated so the default
# is used instead of `set -e` aborting the script.
# Arguments: $1 - variable name, $2 - prompt text, $3 - default value
_scene_prompt() {
  local __answer=""
  printf '%s' "$2"
  read -r __answer || true
  printf -v "$1" '%s' "${__answer:-$3}"
}

# Assign the built-in nodejs-demo scenario (also the fallback scenario).
_scene_preset_nodejs() {
  NS_BACKEND="default"
  APP_NAME="nodejs-demo"
  APP_LABEL="app=nodejs-demo"
  SVC_NAME="nodejs-demo"
  PATH_PREFIX="/node/"
  POD_PORT="3000"
}

# Ask the user which scenario to diagnose and set the scenario globals.
# Globals written: CHOICE, NS_BACKEND, APP_NAME, APP_LABEL, SVC_NAME,
#                  PATH_PREFIX, POD_PORT, ENTRY_IP
# Input: answers are read line by line from stdin; an empty answer or EOF
#        selects the default shown in the prompt.
select_scene() {
  local scene

  echo "请选择诊断场景:"
  echo " 1) nginx-demo (/demo, 80)"
  echo " 2) nodejs-demo (/node, 3000)"
  echo " 3) 自定义"
  _scene_prompt CHOICE "输入序号 [1/2/3](默认 2: " "2"

  # Anything other than 1/2/3 falls back to the nodejs-demo preset.
  scene="${CHOICE}"
  if [[ "${scene}" != [123] ]]; then
    echo "[WARN] 无效选择,使用 nodejs-demo 默认场景。"
    scene="2"
  fi

  case "${scene}" in
    1)
      NS_BACKEND="default"
      APP_NAME="nginx-demo"
      APP_LABEL="app=nginx-demo"
      SVC_NAME="nginx-demo"
      PATH_PREFIX="/demo/"
      POD_PORT="80"
      ;;
    2)
      _scene_preset_nodejs
      ;;
    3)
      _scene_prompt NS_BACKEND "后端命名空间(默认 default: " "default"
      _scene_prompt APP_NAME "应用名Deployment/Service 名,示例 nodejs-demo: " "nodejs-demo"
      # Label and Service name default to values derived from the app name.
      _scene_prompt APP_LABEL "Pod 标签选择器(默认 app=<应用名>: " "app=${APP_NAME}"
      _scene_prompt SVC_NAME "Service 名(默认与应用名一致): " "${APP_NAME}"
      _scene_prompt PATH_PREFIX "入口路径前缀(默认 /: " "/"
      _scene_prompt POD_PORT "后端 Pod 端口(默认 80: " "80"
      ;;
  esac

  _scene_prompt ENTRY_IP "入口 IP用于本机 curl默认 192.168.2.61: " "192.168.2.61"
}
# Check tools / sudo and pick LOG_DIR, then ask which scenario to diagnose.
# Both run before output is redirected, so their prompts are not logged.
init_runtime
select_scene
# Create the log directory and build a per-run log file name from the current
# timestamp and the selected application name.
mkdir -p "${LOG_DIR}"
LOG_FILE="${LOG_DIR}/diag-$(date '+%Y%m%d-%H%M%S')-${APP_NAME}.log"
# From here on stdout and stderr are both shown and appended to LOG_FILE.
exec > >(tee -a "${LOG_FILE}") 2>&1
# Section 0: record the run context (time, log file, selected scenario, host).
print_title "0. 诊断上下文"
echo "TIME: $(date '+%F %T %Z')"
echo "LOG_FILE=${LOG_FILE}"
echo "SCENE_APP=${APP_NAME}"
echo "SCENE_NS=${NS_BACKEND}"
echo "SCENE_LABEL=${APP_LABEL}"
echo "SCENE_SVC=${SVC_NAME}"
echo "SCENE_PATH=${PATH_PREFIX}"
echo "SCENE_POD_PORT=${POD_PORT}"
echo "ENTRY_IP=${ENTRY_IP}"
echo "HOSTNAME=$(hostname)"
# Newer kubectl releases no longer accept --short (short output became the
# default); try the old flag quietly first and fall back to the plain form so
# version information is captured on both old and new clients.
if ! kubectl version --short 2>/dev/null; then
  safe_run kubectl version
fi
# Section 1: cluster and Traefik baseline. Every collection command is
# best-effort: a failure is tolerated and the script continues.
print_title "1. 集群与 Traefik 基线"
kubectl get nodes -o wide || true
kubectl get deploy -n "${NS_TRAEFIK}" traefik -o wide || true
kubectl get svc -n "${NS_TRAEFIK}" traefik -o wide || true
kubectl get pod -n "${NS_TRAEFIK}" -l "${APP_TRAEFIK_LABEL}" -o wide || true
kubectl get pods -n kube-system -o wide \
  | grep -E 'kube-router|flannel|traefik|svclb-traefik' \
  || true

# Name and IP of the first Traefik pod; empty when none is found.
TRAEFIK_POD="$(
  kubectl get pod -n "${NS_TRAEFIK}" -l "${APP_TRAEFIK_LABEL}" \
    -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true
)"
TRAEFIK_IP="$(
  kubectl get pod -n "${NS_TRAEFIK}" -l "${APP_TRAEFIK_LABEL}" \
    -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || true
)"

echo "--- 1.1 kube-proxy 基线 ---"
kubectl get pod -n kube-system -l k8s-app=kube-proxy -o wide || true
kubectl get configmap -n kube-system kube-proxy -o yaml || true
KPROXY_POD="$(
  kubectl get pod -n kube-system -l k8s-app=kube-proxy \
    -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true
)"
if [[ -z "${KPROXY_POD}" ]]; then
  echo "[WARN] 未找到 kube-proxy PodK3s 某些模式可忽略)"
else
  echo "--- 1.2 kube-proxy 日志关键字error|conntrack|iptables|ipvs|nft---"
  kubectl logs -n kube-system "${KPROXY_POD}" --tail=200 \
    | grep -Ei "error|fail|conntrack|iptables|ipvs|nft|sync" \
    || true
fi
# Section 2: collect the backend's Deployment / Service / Endpoints / Pods and
# related routing and policy objects, then derive a few key values.
print_title "2. 业务资源采集"
safe_run kubectl get deploy -n "${NS_BACKEND}" "${APP_NAME}" -o wide
safe_run kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o wide
safe_run kubectl get endpoints -n "${NS_BACKEND}" "${SVC_NAME}" -o wide
safe_run kubectl get endpointslice -n "${NS_BACKEND}" -l kubernetes.io/service-name="${SVC_NAME}" -o wide
safe_run kubectl get pod -n "${NS_BACKEND}" -l "${APP_LABEL}" -o wide
safe_run kubectl get pod -n "${NS_BACKEND}" -l "${APP_LABEL}" --show-labels
safe_run kubectl get ingress -n "${NS_BACKEND}"
safe_run kubectl get ingressroute -n "${NS_BACKEND}"
safe_run kubectl get networkpolicy -n "${NS_BACKEND}"
safe_run kubectl get networkpolicy -n "${NS_TRAEFIK}"
safe_run kubectl get ns "${NS_BACKEND}" "${NS_TRAEFIK}" --show-labels

BACKEND_POD_IP="$(kubectl get pod -n "${NS_BACKEND}" -l "${APP_LABEL}" -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || true)"
SVC_IP="$(kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o jsonpath='{.spec.clusterIP}' 2>/dev/null || true)"
# The kubectl call is guarded on its own: with `pipefail` + `set -e`, a failing
# kubectl inside an unguarded pipeline (e.g. the Endpoints object is missing)
# would abort the whole script right here instead of reporting 0 endpoints.
EP_IPS="$(kubectl get endpoints -n "${NS_BACKEND}" "${SVC_NAME}" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)"
# Count whitespace-separated addresses; `n + 0` prints 0 for empty input.
EP_COUNT="$(awk '{ n += NF } END { print n + 0 }' <<<"${EP_IPS}")"
EP_COUNT="${EP_COUNT:-0}"

echo "TRAEFIK_POD=${TRAEFIK_POD:-<none>}"
echo "TRAEFIK_IP=${TRAEFIK_IP:-<none>}"
echo "BACKEND_POD_IP=${BACKEND_POD_IP:-<none>}"
echo "SVC_IP=${SVC_IP:-<none>}"
echo "ENDPOINTS_COUNT=${EP_COUNT}"
echo "SERVICE_SELECTOR=$(kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o jsonpath='{.spec.selector}' 2>/dev/null || echo '{}')"

echo "--- 2.1 EndpointSlice 条件ready/serving/terminating/node---"
kubectl get endpointslice -n "${NS_BACKEND}" -l kubernetes.io/service-name="${SVC_NAME}" \
  -o jsonpath='{range .items[*]}{"slice="}{.metadata.name}{"\n"}{range .endpoints[*]}{" addr="}{.addresses[0]}{" ready="}{.conditions.ready}{" serving="}{.conditions.serving}{" terminating="}{.conditions.terminating}{" node="}{.nodeName}{"\n"}{end}{end}' \
  || true
# Section 3: probe the request path hop by hop and record each result in the
# PROBE_* variables (OK / OK_<http code> / FAIL; untouched probes stay SKIP).
print_title "3. 主链路连通性探测"

echo "--- 3.1 本机 -> 入口 (${ENTRY_IP}${PATH_PREFIX}) ---"
# Private temp directory instead of fixed /tmp names: no clobbering between
# concurrent runs and no stale body left over from an earlier run.
CLIENT_TMP_DIR="$(mktemp -d)"
if curl -sS -m "${TIMEOUT}" -o "${CLIENT_TMP_DIR}/body" -w "%{http_code}" \
  "http://${ENTRY_IP}${PATH_PREFIX}" \
  >"${CLIENT_TMP_DIR}/code" 2>"${CLIENT_TMP_DIR}/err"; then
  CODE="$(cat "${CLIENT_TMP_DIR}/code")"
  echo "HTTP_CODE=${CODE}"
  echo "BODY_PREVIEW:"
  # Only preview when a non-empty body was actually written.
  if [[ -s "${CLIENT_TMP_DIR}/body" ]]; then
    head -c 200 "${CLIENT_TMP_DIR}/body" || true
  fi
  echo
  PROBE_CLIENT="OK_${CODE}"
else
  cat "${CLIENT_TMP_DIR}/err"
  PROBE_CLIENT="FAIL"
fi
rm -rf -- "${CLIENT_TMP_DIR}"

# Probe the Service on its first declared port rather than assuming 80, so a
# Service listening elsewhere is not reported as unreachable. Falls back to 80
# when the port cannot be read.
SVC_PORT="$(kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o jsonpath='{.spec.ports[0].port}' 2>/dev/null || true)"
SVC_PORT="${SVC_PORT:-80}"

if [[ -n "${TRAEFIK_POD}" && -n "${SVC_IP}" ]]; then
  echo
  echo "--- 3.2 Traefik -> ServiceIP (${SVC_IP}:${SVC_PORT}) ---"
  if probe_wget_from_traefik "http://${SVC_IP}:${SVC_PORT}"; then
    PROBE_TRAEFIK_TO_SVC="OK"
  else
    PROBE_TRAEFIK_TO_SVC="FAIL"
  fi
else
  echo "[SKIP] 缺少 Traefik Pod 或 ServiceIP。"
fi

if [[ -n "${TRAEFIK_POD}" ]]; then
  echo
  echo "--- 3.3 Traefik -> Service DNS (${SVC_NAME}.${NS_BACKEND}.svc.cluster.local:${SVC_PORT}) ---"
  if probe_wget_from_traefik "http://${SVC_NAME}.${NS_BACKEND}.svc.cluster.local:${SVC_PORT}"; then
    PROBE_TRAEFIK_DNS="OK"
  else
    PROBE_TRAEFIK_DNS="FAIL"
  fi
else
  echo "[SKIP] 未找到 Traefik Pod。"
fi

if [[ -n "${TRAEFIK_POD}" && -n "${BACKEND_POD_IP}" ]]; then
  echo
  echo "--- 3.4 Traefik -> PodIP (${BACKEND_POD_IP}:${POD_PORT}) ---"
  if probe_wget_from_traefik "http://${BACKEND_POD_IP}:${POD_PORT}"; then
    PROBE_TRAEFIK_TO_POD="OK"
  else
    PROBE_TRAEFIK_TO_POD="FAIL"
  fi
else
  echo "[SKIP] 缺少 Traefik Pod 或后端 PodIP。"
fi
# Section 4: dump routing objects, Service/Endpoints, NetworkPolicies and
# recent events in full. Every command is best-effort via safe_run.
print_title "4. 路由与配置详情"

echo "--- 4.1 Ingress ---"
safe_run kubectl get ingress -n "${NS_BACKEND}" -o yaml

echo "--- 4.2 IngressRoute ---"
safe_run kubectl get ingressroute -n "${NS_BACKEND}" -o yaml

echo "--- 4.3 Service / Endpoints ---"
for _diag_kind in svc endpoints; do
  safe_run kubectl get "${_diag_kind}" -n "${NS_BACKEND}" "${SVC_NAME}" -o yaml
done
safe_run kubectl describe svc -n "${NS_BACKEND}" "${SVC_NAME}"

echo "--- 4.4 相关 NetworkPolicykube-system + backend---"
for _diag_ns in "${NS_TRAEFIK}" "${NS_BACKEND}"; do
  safe_run kubectl get networkpolicy -n "${_diag_ns}" -o yaml
done

echo "--- 4.5 近期事件backend + kube-system---"
for _diag_ns in "${NS_BACKEND}" kube-system; do
  safe_run kubectl get events -n "${_diag_ns}" --sort-by=.lastTimestamp
done
# Section 5: Traefik logs, shown in full and then through two filters.
print_title "5. Traefik 日志(最近 ${LOG_SINCE},最多 ${LOG_TAIL} 行)"
# Fetch the log window once and reuse it for every view below, so the filtered
# output always corresponds to the dump shown and the API is queried once.
# kubectl's stderr is not captured and still reaches the console/log.
TRAEFIK_LOGS="$(kubectl logs -n "${NS_TRAEFIK}" deploy/traefik --since="${LOG_SINCE}" --tail="${LOG_TAIL}" || true)"
if [[ -n "${TRAEFIK_LOGS}" ]]; then
  printf '%s\n' "${TRAEFIK_LOGS}"
fi

echo "--- 5.1 关键字过滤404|502|503|router|service|middleware|upstream|${SVC_NAME}|${PATH_PREFIX} ---"
# NOTE(review): SVC_NAME and PATH_PREFIX are interpolated as regex fragments;
# a PATH_PREFIX of "/" will match most access-log lines.
grep -Ei "404|502|503|router|service|middleware|upstream|endpoint|${SVC_NAME}|${PATH_PREFIX}" <<<"${TRAEFIK_LOGS}" || true

echo "--- 5.2 Traefik 访问日志候选status=404/502/503 ---"
grep -E "\" 404 |\" 502 |\" 503 " <<<"${TRAEFIK_LOGS}" || true

echo "--- 5.3 Traefik 上一次容器日志(若重启过) ---"
safe_run kubectl logs -n "${NS_TRAEFIK}" deploy/traefik --previous --tail=100
# Section 6: host firewall state, iptables chains used by kube-router,
# ipset names and conntrack data.
print_title "6. 防火墙与数据平面"

# Run a host-level tool under the same privilege rule as the kubectl/iptables
# wrappers: prefix sudo when USE_SUDO is set. Without this, firewall-cmd, ufw,
# ipset and conntrack run unprivileged for non-root users and their output is
# silently missing from the report.
run_privileged() {
  if [[ -n "${USE_SUDO}" ]]; then
    sudo "$@"
  else
    "$@"
  fi
}

# Print the first KUBE-POD-FW-* chain that KUBE-ROUTER-FORWARD jumps to for
# the given pod IP; prints nothing when there is no such rule or chain.
# Parses `iptables -S` (rule-spec form) and compares the -s/-d address
# exactly, so 10.0.0.1 does not match 10.0.0.10 and dots are not wildcards.
# awk reads all input (no early exit) and the pipeline ends in `|| true`, so
# neither SIGPIPE nor a missing chain can trip `pipefail` + `set -e`.
# Arguments: $1 - pod IP
find_pod_fw_chain() {
  local pod_ip="$1"
  iptables -S KUBE-ROUTER-FORWARD 2>/dev/null \
    | awk -v ip="${pod_ip}" '
        {
          matched = 0
          chain = ""
          for (i = 1; i < NF; i++) {
            if (($i == "-s" || $i == "-d") && ($(i + 1) == ip || $(i + 1) == ip "/32")) {
              matched = 1
            }
            if ($i == "-j" && $(i + 1) ~ /^KUBE-POD-FW-/) {
              chain = $(i + 1)
            }
          }
          if (matched && chain != "" && first == "") {
            first = chain
          }
        }
        END { if (first != "") print first }
      ' \
    || true
}

echo "--- 6.1 防火墙状态 ---"
if command -v firewall-cmd >/dev/null 2>&1; then
  safe_run run_privileged firewall-cmd --state
  safe_run run_privileged firewall-cmd --list-all
else
  echo "firewall-cmd: not found"
fi
if command -v ufw >/dev/null 2>&1; then
  safe_run run_privileged ufw status verbose
else
  echo "ufw: not found"
fi

echo "--- 6.2 FORWARD 与 KUBE-ROUTER-FORWARD ---"
safe_run iptables -L FORWARD -n -v --line-numbers
safe_run iptables -L KUBE-ROUTER-FORWARD -n -v --line-numbers
echo "--- 6.2.1 NAT 链KUBE-SERVICES---"
safe_run iptables -t nat -L KUBE-SERVICES -n -v --line-numbers
if [[ -n "${SVC_IP}" ]]; then
  echo "--- 6.2.2 NAT 链中 ServiceIP 相关规则 (${SVC_IP}) ---"
  # Fixed-string, whole-word match so the IP is not treated as a regex and
  # does not match longer addresses sharing the same prefix.
  iptables -t nat -S | grep -Fw -- "${SVC_IP}" || true
fi

TRAEFIK_CHAIN=""
BACKEND_CHAIN=""
if [[ -n "${TRAEFIK_IP}" ]]; then
  TRAEFIK_CHAIN="$(find_pod_fw_chain "${TRAEFIK_IP}")"
fi
if [[ -n "${BACKEND_POD_IP}" ]]; then
  BACKEND_CHAIN="$(find_pod_fw_chain "${BACKEND_POD_IP}")"
fi
echo "TRAEFIK_CHAIN=${TRAEFIK_CHAIN:-<not found>}"
echo "BACKEND_CHAIN=${BACKEND_CHAIN:-<not found>}"

if [[ -n "${TRAEFIK_CHAIN}" ]]; then
  echo "--- 6.3 Traefik Pod 链 ${TRAEFIK_CHAIN} ---"
  safe_run iptables -L "${TRAEFIK_CHAIN}" -n -v -x
  echo "--- 6.4 Traefik Pod 链规则定义 ---"
  safe_run iptables -S "${TRAEFIK_CHAIN}"
fi
if [[ -n "${BACKEND_CHAIN}" ]]; then
  echo "--- 6.5 Backend Pod 链 ${BACKEND_CHAIN} ---"
  safe_run iptables -L "${BACKEND_CHAIN}" -n -v -x
  echo "--- 6.6 Backend Pod 链规则定义 ---"
  safe_run iptables -S "${BACKEND_CHAIN}"
fi

echo "--- 6.7 ipsetKUBE-SRC/KUBE-DST---"
if command -v ipset >/dev/null 2>&1; then
  run_privileged ipset list -n | grep -E '^KUBE-(SRC|DST)-' || true
else
  echo "ipset: not found"
fi

echo "--- 6.8 conntrack容量与关键连接---"
if command -v conntrack >/dev/null 2>&1; then
  safe_run run_privileged conntrack -S
  safe_run sysctl net.netfilter.nf_conntrack_count
  safe_run sysctl net.netfilter.nf_conntrack_max
  if [[ -n "${SVC_IP}" ]]; then
    echo "conntrack by service ip (${SVC_IP}):"
    run_privileged conntrack -L -d "${SVC_IP}" 2>/dev/null | head -n 100 || true
  fi
  if [[ -n "${BACKEND_POD_IP}" ]]; then
    echo "conntrack by backend pod ip (${BACKEND_POD_IP}):"
    run_privileged conntrack -L -d "${BACKEND_POD_IP}" 2>/dev/null | head -n 100 || true
  fi
else
  echo "conntrack: not found"
fi
# Section 7: turn the probe results and the endpoint count into hints.
# Hints are independent of each other; several may be printed in one run.
print_title "7. 自动判读502/503/404"
printf '%s\n' \
  "探测结果:" \
  " CLIENT=${PROBE_CLIENT}" \
  " TRAEFIK_TO_SVC=${PROBE_TRAEFIK_TO_SVC}" \
  " TRAEFIK_DNS=${PROBE_TRAEFIK_DNS}" \
  " TRAEFIK_TO_POD=${PROBE_TRAEFIK_TO_POD}"

if [[ "${EP_COUNT}" == "0" ]]; then
  echo "- [高概率 503] Service 无可用 Endpoints。检查 Deployment 是否 Ready、selector 是否匹配。"
fi

# Hints keyed on the HTTP status returned through the entry point.
case "${PROBE_CLIENT}" in
  OK_404*)
    echo "- [高概率 404] 入口路由未命中。检查 Ingress/IngressRoute 的 path、host、middleware。"
    ;;
  OK_503*)
    echo "- [高概率 503] 入口已命中但后端不可用。优先看 Endpoints/EndpointSlice 条件与 Traefik 日志。"
    ;;
esac

# Hints keyed on the Traefik -> Service probe, refined by the other probes.
case "${PROBE_TRAEFIK_TO_SVC}" in
  FAIL)
    echo "- [高概率 502/503] Traefik 到 Service 不通。优先检查 NetworkPolicy、kube-router 链、DNS 53 放行。"
    if [[ "${PROBE_TRAEFIK_DNS}" == "FAIL" ]]; then
      echo "- [可能 DNS/服务发现问题] Traefik 到 Service DNS 与 ServiceIP 都失败。检查 CoreDNS、kube-system egress 53。"
    fi
    ;;
  OK)
    if [[ "${PROBE_CLIENT}" == "FAIL" ]]; then
      echo "- [可能入口层问题] 集群内后端可达但入口访问失败。检查控制节点防火墙、Traefik Service 暴露端口、外部路由。"
    fi
    if [[ "${PROBE_TRAEFIK_TO_POD}" == "FAIL" ]]; then
      echo "- [已知行为候选] Service 可达但 PodIP 直连失败,常见于 kube-router 同节点桥接路径。"
    fi
    ;;
esac

printf '%s\n' \
  "" \
  "下一步建议:" \
  "1) 先修复 Endpoints=0 / 404 路由不匹配。" \
  "2) 再看 Traefik -> Service 探测与 NetworkPolicy 命中。" \
  "3) 最后结合 KUBE-ROUTER-FORWARD、Pod 链、ipset 判断是否为 kube-router 行为问题。" \
  "" \
  "日志已保存:${LOG_FILE}"