#!/usr/bin/env bash
#
# Interactive diagnostic collector for the path
#   local host -> ingress IP -> Traefik -> Service -> backend Pod.
#
# It asks which backend to inspect, then gathers kubectl / iptables / ipset /
# conntrack state, runs connectivity probes (curl from this host, wget from
# inside the Traefik deployment) and finally prints heuristic hints for
# 404 / 502 / 503 responses. All output is mirrored into a timestamped log
# file under LOG_DIR.
#
# Required commands: kubectl, iptables, awk, grep, curl.
# Optional commands: sudo, firewall-cmd, ufw, ipset, conntrack.

set -euo pipefail

readonly NS_TRAEFIK="kube-system"
readonly APP_TRAEFIK_LABEL="app.kubernetes.io/name=traefik"
readonly TIMEOUT=3
readonly LOG_TAIL=200
readonly LOG_SINCE="20m"

LOG_DIR=""
LOG_FILE=""

# Probe verdicts; each stays "SKIP" unless the matching probe actually runs.
PROBE_CLIENT="SKIP"
PROBE_TRAEFIK_TO_SVC="SKIP"
PROBE_TRAEFIK_TO_POD="SKIP"
PROBE_TRAEFIK_DNS="SKIP"

# Scenario parameters, filled in by select_scene.
CHOICE=""
NS_BACKEND=""
APP_NAME=""
APP_LABEL=""
SVC_NAME=""
PATH_PREFIX=""
POD_PORT=""
ENTRY_IP=""

# `type -P` only searches PATH for executables, so the result is not affected
# by the kubectl()/iptables() wrapper functions defined further down.
KUBECTL_PATH="$(type -P kubectl || true)"
IPTABLES_PATH="$(type -P iptables || true)"
USE_SUDO=""

# Private scratch directory for probe output; removed on every exit path.
TMP_DIR="$(mktemp -d)"
trap 'rm -rf -- "${TMP_DIR}"' EXIT

# Print a section banner preceded by an empty line.
# Arguments: $1 - section title
print_title() {
  echo
  echo "=== $1 ==="
}

# Run a command and ignore its exit status, so that a failing best-effort
# collection step does not abort the script under `set -e`.
safe_run() {
  "$@" || true
}

# Abort with exit code 1 unless an executable named $1 exists in PATH.
# Shell functions and aliases do not count: the kubectl/iptables wrappers in
# this file would otherwise always satisfy the check.
require_cmd() {
  local c="$1"
  if ! type -P "$c" >/dev/null 2>&1; then
    echo "[ERR] 缺少命令: $c" >&2
    exit 1
  fi
}

# Verify prerequisites, decide whether privileged commands go through sudo,
# and choose the log directory.
# Globals: USE_SUDO (written), LOG_DIR (written)
init_runtime() {
  require_cmd kubectl
  require_cmd iptables
  require_cmd awk
  require_cmd grep
  require_cmd curl

  if [[ "${EUID}" -ne 0 ]] && command -v sudo >/dev/null 2>&1; then
    # Try non-interactively first; ask for the password once if that fails.
    if ! sudo -n true 2>/dev/null; then
      echo "[INFO] 需要 sudo 权限以读取 iptables / kubectl 配置。"
      sudo -v
    fi
    USE_SUDO="1"
  fi

  # Non-root users log under HOME to avoid permission problems with /root.
  if [[ "${EUID}" -eq 0 ]]; then
    LOG_DIR="/root/netpol-diag-logs"
  else
    LOG_DIR="${HOME}/netpol-diag-logs"
  fi
}

# Run a command through sudo when init_runtime decided sudo is needed,
# otherwise run it directly. Single place for the sudo decision.
as_root() {
  if [[ -n "${USE_SUDO}" ]]; then
    sudo "$@"
  else
    "$@"
  fi
}

# Wrappers so the rest of the script never decides about sudo by hand.
kubectl() {
  as_root "${KUBECTL_PATH}" "$@"
}

iptables() {
  as_root "${IPTABLES_PATH}" "$@"
}

# Fetch a URL with wget from inside the Traefik deployment.
# Arguments: $1 - URL to fetch
# Outputs:   response body on success, wget's stderr on failure
# Returns:   0 if the fetch succeeded, 1 otherwise
probe_wget_from_traefik() {
  local url="$1"
  local out_file="${TMP_DIR}/probe.out"
  local err_file="${TMP_DIR}/probe.err"

  if kubectl exec -n "${NS_TRAEFIK}" deploy/traefik -- \
    wget -qO- --timeout="${TIMEOUT}" "${url}" \
    >"${out_file}" 2>"${err_file}"; then
    cat "${out_file}"
    return 0
  fi
  cat "${err_file}"
  return 1
}

# Print the name of the first KUBE-POD-FW-* chain referenced by a rule in
# KUBE-ROUTER-FORWARD that mentions the given IP; prints nothing if none.
# Every field is scanned because the target column position depends on the
# listing flags (-v shifts it), and the IP is compared exactly (optionally
# with a /32 suffix) instead of as a regex.
# Arguments: $1 - pod IP
find_pod_fw_chain() {
  local ip="$1"
  # `|| true`: a missing chain, or iptables being cut off when awk exits
  # early, must not abort the script under `set -e` + pipefail.
  iptables -L KUBE-ROUTER-FORWARD -n --line-numbers 2>/dev/null \
    | awk -v ip="${ip}" '
        {
          hit = 0
          chain = ""
          for (i = 1; i <= NF; i++) {
            if ($i == ip || $i == ip "/32") hit = 1
            if (chain == "" && $i ~ /^KUBE-POD-FW-/) chain = $i
          }
          if (hit && chain != "") { print chain; exit }
        }' \
    || true
}

# Prompt for a value and store the reply (or the default when the reply is
# empty or stdin is at EOF) in the named global variable.
# Arguments: $1 - variable name, $2 - prompt text, $3 - default value
ask() {
  local __var="$1" __prompt="$2" __default="$3" __reply=""
  printf '%s' "${__prompt}"
  # read returns non-zero on EOF; fall through to the default instead of
  # letting `set -e` terminate the script.
  read -r __reply || true
  printf -v "${__var}" '%s' "${__reply:-${__default}}"
}

# Preset used for menu entry 2 and as the fallback for an invalid choice.
use_nodejs_demo_scene() {
  NS_BACKEND="default"
  APP_NAME="nodejs-demo"
  APP_LABEL="app=nodejs-demo"
  SVC_NAME="nodejs-demo"
  PATH_PREFIX="/node/"
  POD_PORT="3000"
}

# Ask which backend to diagnose and which ingress IP to curl.
# Globals written: NS_BACKEND, APP_NAME, APP_LABEL, SVC_NAME, PATH_PREFIX,
#                  POD_PORT, ENTRY_IP
select_scene() {
  echo "请选择诊断场景:"
  echo " 1) nginx-demo (/demo, 80)"
  echo " 2) nodejs-demo (/node, 3000)"
  echo " 3) 自定义"
  ask CHOICE "输入序号 [1/2/3](默认 2): " "2"

  case "${CHOICE}" in
    1)
      NS_BACKEND="default"
      APP_NAME="nginx-demo"
      APP_LABEL="app=nginx-demo"
      SVC_NAME="nginx-demo"
      PATH_PREFIX="/demo/"
      POD_PORT="80"
      ;;
    2)
      use_nodejs_demo_scene
      ;;
    3)
      ask NS_BACKEND "后端命名空间(默认 default): " "default"
      ask APP_NAME "应用名(Deployment/Service 名,示例 nodejs-demo): " "nodejs-demo"
      ask APP_LABEL "Pod 标签选择器(默认 app=<应用名>): " "app=${APP_NAME}"
      ask SVC_NAME "Service 名(默认与应用名一致): " "${APP_NAME}"
      ask PATH_PREFIX "入口路径前缀(默认 /): " "/"
      ask POD_PORT "后端 Pod 端口(默认 80): " "80"
      ;;
    *)
      echo "[WARN] 无效选择,使用 nodejs-demo 默认场景。"
      use_nodejs_demo_scene
      ;;
  esac

  ask ENTRY_IP "入口 IP(用于本机 curl,默认 192.168.2.61): " "192.168.2.61"
}

# Section 0: echo the chosen scenario and basic host/cluster identity.
diag_context() {
  print_title "0. 诊断上下文"
  echo "TIME: $(date '+%F %T %Z')"
  echo "LOG_FILE=${LOG_FILE}"
  echo "SCENE_APP=${APP_NAME}"
  echo "SCENE_NS=${NS_BACKEND}"
  echo "SCENE_LABEL=${APP_LABEL}"
  echo "SCENE_SVC=${SVC_NAME}"
  echo "SCENE_PATH=${PATH_PREFIX}"
  echo "SCENE_POD_PORT=${POD_PORT}"
  echo "ENTRY_IP=${ENTRY_IP}"
  echo "HOSTNAME=$(hostname)"
  # Newer kubectl releases no longer accept --short; fall back to the plain
  # form so the version is still recorded.
  kubectl version --short 2>/dev/null || safe_run kubectl version
}

# Section 1: node list, Traefik objects and kube-proxy state.
# Globals written: TRAEFIK_POD, TRAEFIK_IP, KPROXY_POD
diag_baseline() {
  print_title "1. 集群与 Traefik 基线"
  safe_run kubectl get nodes -o wide
  safe_run kubectl get deploy -n "${NS_TRAEFIK}" traefik -o wide
  safe_run kubectl get svc -n "${NS_TRAEFIK}" traefik -o wide
  safe_run kubectl get pod -n "${NS_TRAEFIK}" -l "${APP_TRAEFIK_LABEL}" -o wide
  kubectl get pods -n kube-system -o wide \
    | grep -E 'kube-router|flannel|traefik|svclb-traefik' || true

  TRAEFIK_POD="$(kubectl get pod -n "${NS_TRAEFIK}" -l "${APP_TRAEFIK_LABEL}" \
    -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
  TRAEFIK_IP="$(kubectl get pod -n "${NS_TRAEFIK}" -l "${APP_TRAEFIK_LABEL}" \
    -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || true)"

  echo "--- 1.1 kube-proxy 基线 ---"
  safe_run kubectl get pod -n kube-system -l k8s-app=kube-proxy -o wide
  safe_run kubectl get configmap -n kube-system kube-proxy -o yaml

  KPROXY_POD="$(kubectl get pod -n kube-system -l k8s-app=kube-proxy \
    -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
  if [[ -n "${KPROXY_POD}" ]]; then
    echo "--- 1.2 kube-proxy 日志关键字(error|conntrack|iptables|ipvs|nft)---"
    kubectl logs -n kube-system "${KPROXY_POD}" --tail=200 \
      | grep -Ei "error|fail|conntrack|iptables|ipvs|nft|sync" || true
  else
    echo "[WARN] 未找到 kube-proxy Pod(K3s 某些模式可忽略)"
  fi
}

# Section 2: backend workload, Service, endpoints and NetworkPolicy listing.
# Globals written: BACKEND_POD_IP, SVC_IP, SVC_PORT, EP_COUNT
diag_backend() {
  print_title "2. 业务资源采集"
  safe_run kubectl get deploy -n "${NS_BACKEND}" "${APP_NAME}" -o wide
  safe_run kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o wide
  safe_run kubectl get endpoints -n "${NS_BACKEND}" "${SVC_NAME}" -o wide
  safe_run kubectl get endpointslice -n "${NS_BACKEND}" \
    -l kubernetes.io/service-name="${SVC_NAME}" -o wide
  safe_run kubectl get pod -n "${NS_BACKEND}" -l "${APP_LABEL}" -o wide
  safe_run kubectl get pod -n "${NS_BACKEND}" -l "${APP_LABEL}" --show-labels
  safe_run kubectl get ingress -n "${NS_BACKEND}"
  safe_run kubectl get ingressroute -n "${NS_BACKEND}"
  safe_run kubectl get networkpolicy -n "${NS_BACKEND}"
  safe_run kubectl get networkpolicy -n "${NS_TRAEFIK}"
  safe_run kubectl get ns "${NS_BACKEND}" "${NS_TRAEFIK}" --show-labels

  BACKEND_POD_IP="$(kubectl get pod -n "${NS_BACKEND}" -l "${APP_LABEL}" \
    -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || true)"
  SVC_IP="$(kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" \
    -o jsonpath='{.spec.clusterIP}' 2>/dev/null || true)"
  # Probe the Service on the first port it declares; 80 is only the fallback
  # when the Service cannot be read.
  SVC_PORT="$(kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" \
    -o jsonpath='{.spec.ports[0].port}' 2>/dev/null || true)"
  SVC_PORT="${SVC_PORT:-80}"
  # `|| true` keeps a failing kubectl (e.g. Service missing) from aborting
  # the script through pipefail; an empty result then counts as 0 endpoints.
  EP_COUNT="$(kubectl get endpoints -n "${NS_BACKEND}" "${SVC_NAME}" \
    -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null \
    | awk '{print NF}' || true)"
  EP_COUNT="${EP_COUNT:-0}"

  echo "TRAEFIK_POD=${TRAEFIK_POD:-}"
  echo "TRAEFIK_IP=${TRAEFIK_IP:-}"
  echo "BACKEND_POD_IP=${BACKEND_POD_IP:-}"
  echo "SVC_IP=${SVC_IP:-}"
  echo "SVC_PORT=${SVC_PORT}"
  echo "ENDPOINTS_COUNT=${EP_COUNT}"
  echo "SERVICE_SELECTOR=$(kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o jsonpath='{.spec.selector}' 2>/dev/null || echo '{}')"

  echo "--- 2.1 EndpointSlice 条件(ready/serving/terminating/node)---"
  kubectl get endpointslice -n "${NS_BACKEND}" \
    -l kubernetes.io/service-name="${SVC_NAME}" \
    -o jsonpath='{range .items[*]}{"slice="}{.metadata.name}{"\n"}{range .endpoints[*]}{" addr="}{.addresses[0]}{" ready="}{.conditions.ready}{" serving="}{.conditions.serving}{" terminating="}{.conditions.terminating}{" node="}{.nodeName}{"\n"}{end}{end}' \
    || true
}

# Section 3: connectivity probes along the request path.
# Globals written: PROBE_CLIENT, PROBE_TRAEFIK_TO_SVC, PROBE_TRAEFIK_DNS,
#                  PROBE_TRAEFIK_TO_POD
diag_probes() {
  local body_file="${TMP_DIR}/client.out"
  local code_file="${TMP_DIR}/client.code"
  local err_file="${TMP_DIR}/client.err"
  local http_code
  local svc_dns="${SVC_NAME}.${NS_BACKEND}.svc.cluster.local"

  print_title "3. 主链路连通性探测"
  echo "--- 3.1 本机 -> 入口 (${ENTRY_IP}${PATH_PREFIX}) ---"
  if curl -sS -m "${TIMEOUT}" -o "${body_file}" -w "%{http_code}" \
    "http://${ENTRY_IP}${PATH_PREFIX}" >"${code_file}" 2>"${err_file}"; then
    http_code="$(cat "${code_file}")"
    echo "HTTP_CODE=${http_code}"
    echo "BODY_PREVIEW:"
    head -c 200 "${body_file}" || true
    echo
    PROBE_CLIENT="OK_${http_code}"
  else
    cat "${err_file}"
    PROBE_CLIENT="FAIL"
  fi

  if [[ -n "${TRAEFIK_POD}" && -n "${SVC_IP}" ]]; then
    echo
    echo "--- 3.2 Traefik -> ServiceIP (${SVC_IP}:${SVC_PORT}) ---"
    if probe_wget_from_traefik "http://${SVC_IP}:${SVC_PORT}"; then
      PROBE_TRAEFIK_TO_SVC="OK"
    else
      PROBE_TRAEFIK_TO_SVC="FAIL"
    fi
  else
    echo "[SKIP] 缺少 Traefik Pod 或 ServiceIP。"
  fi

  if [[ -n "${TRAEFIK_POD}" ]]; then
    echo
    echo "--- 3.3 Traefik -> Service DNS (${svc_dns}:${SVC_PORT}) ---"
    if probe_wget_from_traefik "http://${svc_dns}:${SVC_PORT}"; then
      PROBE_TRAEFIK_DNS="OK"
    else
      PROBE_TRAEFIK_DNS="FAIL"
    fi
  else
    echo "[SKIP] 未找到 Traefik Pod。"
  fi

  if [[ -n "${TRAEFIK_POD}" && -n "${BACKEND_POD_IP}" ]]; then
    echo
    echo "--- 3.4 Traefik -> PodIP (${BACKEND_POD_IP}:${POD_PORT}) ---"
    if probe_wget_from_traefik "http://${BACKEND_POD_IP}:${POD_PORT}"; then
      PROBE_TRAEFIK_TO_POD="OK"
    else
      PROBE_TRAEFIK_TO_POD="FAIL"
    fi
  else
    echo "[SKIP] 缺少 Traefik Pod 或后端 PodIP。"
  fi
}

# Section 4: full YAML of routing objects, policies and recent events.
diag_routing() {
  print_title "4. 路由与配置详情"
  echo "--- 4.1 Ingress ---"
  safe_run kubectl get ingress -n "${NS_BACKEND}" -o yaml
  echo "--- 4.2 IngressRoute ---"
  safe_run kubectl get ingressroute -n "${NS_BACKEND}" -o yaml
  echo "--- 4.3 Service / Endpoints ---"
  safe_run kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o yaml
  safe_run kubectl get endpoints -n "${NS_BACKEND}" "${SVC_NAME}" -o yaml
  safe_run kubectl describe svc -n "${NS_BACKEND}" "${SVC_NAME}"
  echo "--- 4.4 相关 NetworkPolicy(kube-system + backend)---"
  safe_run kubectl get networkpolicy -n "${NS_TRAEFIK}" -o yaml
  safe_run kubectl get networkpolicy -n "${NS_BACKEND}" -o yaml
  echo "--- 4.5 近期事件(backend + kube-system)---"
  safe_run kubectl get events -n "${NS_BACKEND}" --sort-by=.lastTimestamp
  safe_run kubectl get events -n kube-system --sort-by=.lastTimestamp
}

# Section 5: Traefik logs, raw and filtered.
diag_traefik_logs() {
  print_title "5. Traefik 日志(最近 ${LOG_SINCE},最多 ${LOG_TAIL} 行)"
  safe_run kubectl logs -n "${NS_TRAEFIK}" deploy/traefik \
    --since="${LOG_SINCE}" --tail="${LOG_TAIL}"

  echo "--- 5.1 关键字过滤(404|502|503|router|service|middleware|upstream|${SVC_NAME}|${PATH_PREFIX}) ---"
  kubectl logs -n "${NS_TRAEFIK}" deploy/traefik \
    --since="${LOG_SINCE}" --tail="${LOG_TAIL}" \
    | grep -Ei "404|502|503|router|service|middleware|upstream|endpoint|${SVC_NAME}|${PATH_PREFIX}" \
    || true

  echo "--- 5.2 Traefik 访问日志候选(status=404/502/503) ---"
  kubectl logs -n "${NS_TRAEFIK}" deploy/traefik \
    --since="${LOG_SINCE}" --tail="${LOG_TAIL}" \
    | grep -E "\" 404 |\" 502 |\" 503 " || true

  echo "--- 5.3 Traefik 上一次容器日志(若重启过) ---"
  safe_run kubectl logs -n "${NS_TRAEFIK}" deploy/traefik --previous --tail=100
}

# Section 6: host firewall, iptables chains, ipset and conntrack state.
# Globals written: TRAEFIK_CHAIN, BACKEND_CHAIN
diag_dataplane() {
  print_title "6. 防火墙与数据平面"
  echo "--- 6.1 防火墙状态 ---"
  if command -v firewall-cmd >/dev/null 2>&1; then
    safe_run as_root firewall-cmd --state
    safe_run as_root firewall-cmd --list-all
  else
    echo "firewall-cmd: not found"
  fi
  if command -v ufw >/dev/null 2>&1; then
    safe_run as_root ufw status verbose
  else
    echo "ufw: not found"
  fi

  echo "--- 6.2 FORWARD 与 KUBE-ROUTER-FORWARD ---"
  safe_run iptables -L FORWARD -n -v --line-numbers
  safe_run iptables -L KUBE-ROUTER-FORWARD -n -v --line-numbers
  echo "--- 6.2.1 NAT 链(KUBE-SERVICES)---"
  safe_run iptables -t nat -L KUBE-SERVICES -n -v --line-numbers
  if [[ -n "${SVC_IP}" ]]; then
    echo "--- 6.2.2 NAT 链中 ServiceIP 相关规则 (${SVC_IP}) ---"
    # -F: the IP is a literal string, its dots must not act as wildcards.
    iptables -t nat -S | grep -F -- "${SVC_IP}" || true
  fi

  TRAEFIK_CHAIN=""
  BACKEND_CHAIN=""
  if [[ -n "${TRAEFIK_IP}" ]]; then
    TRAEFIK_CHAIN="$(find_pod_fw_chain "${TRAEFIK_IP}")"
  fi
  if [[ -n "${BACKEND_POD_IP}" ]]; then
    BACKEND_CHAIN="$(find_pod_fw_chain "${BACKEND_POD_IP}")"
  fi
  echo "TRAEFIK_CHAIN=${TRAEFIK_CHAIN:-}"
  echo "BACKEND_CHAIN=${BACKEND_CHAIN:-}"

  if [[ -n "${TRAEFIK_CHAIN}" ]]; then
    echo "--- 6.3 Traefik Pod 链 ${TRAEFIK_CHAIN} ---"
    safe_run iptables -L "${TRAEFIK_CHAIN}" -n -v -x
    echo "--- 6.4 Traefik Pod 链规则定义 ---"
    safe_run iptables -S "${TRAEFIK_CHAIN}"
  fi
  if [[ -n "${BACKEND_CHAIN}" ]]; then
    echo "--- 6.5 Backend Pod 链 ${BACKEND_CHAIN} ---"
    safe_run iptables -L "${BACKEND_CHAIN}" -n -v -x
    echo "--- 6.6 Backend Pod 链规则定义 ---"
    safe_run iptables -S "${BACKEND_CHAIN}"
  fi

  echo "--- 6.7 ipset(KUBE-SRC/KUBE-DST)---"
  if command -v ipset >/dev/null 2>&1; then
    as_root ipset list -n | grep -E '^KUBE-(SRC|DST)-' || true
  else
    echo "ipset: not found"
  fi

  echo "--- 6.8 conntrack(容量与关键连接)---"
  if command -v conntrack >/dev/null 2>&1; then
    safe_run as_root conntrack -S
    safe_run sysctl net.netfilter.nf_conntrack_count
    safe_run sysctl net.netfilter.nf_conntrack_max
    if [[ -n "${SVC_IP}" ]]; then
      echo "conntrack by service ip (${SVC_IP}):"
      as_root conntrack -L -d "${SVC_IP}" 2>/dev/null | head -n 100 || true
    fi
    if [[ -n "${BACKEND_POD_IP}" ]]; then
      echo "conntrack by backend pod ip (${BACKEND_POD_IP}):"
      as_root conntrack -L -d "${BACKEND_POD_IP}" 2>/dev/null | head -n 100 || true
    fi
  else
    echo "conntrack: not found"
  fi
}

# Section 7: turn the probe verdicts and endpoint count into hints.
diag_verdict() {
  print_title "7. 自动判读(502/503/404)"
  echo "探测结果:"
  echo " CLIENT=${PROBE_CLIENT}"
  echo " TRAEFIK_TO_SVC=${PROBE_TRAEFIK_TO_SVC}"
  echo " TRAEFIK_DNS=${PROBE_TRAEFIK_DNS}"
  echo " TRAEFIK_TO_POD=${PROBE_TRAEFIK_TO_POD}"

  if [[ "${EP_COUNT}" == "0" ]]; then
    echo "- [高概率 503] Service 无可用 Endpoints。检查 Deployment 是否 Ready、selector 是否匹配。"
  fi
  if [[ "${PROBE_CLIENT}" == OK_404* ]]; then
    echo "- [高概率 404] 入口路由未命中。检查 Ingress/IngressRoute 的 path、host、middleware。"
  fi
  if [[ "${PROBE_CLIENT}" == OK_503* ]]; then
    echo "- [高概率 503] 入口已命中但后端不可用。优先看 Endpoints/EndpointSlice 条件与 Traefik 日志。"
  fi
  if [[ "${PROBE_TRAEFIK_TO_SVC}" == "FAIL" ]]; then
    echo "- [高概率 502/503] Traefik 到 Service 不通。优先检查 NetworkPolicy、kube-router 链、DNS 53 放行。"
  fi
  if [[ "${PROBE_TRAEFIK_DNS}" == "FAIL" && "${PROBE_TRAEFIK_TO_SVC}" == "FAIL" ]]; then
    echo "- [可能 DNS/服务发现问题] Traefik 到 Service DNS 与 ServiceIP 都失败。检查 CoreDNS、kube-system egress 53。"
  fi
  if [[ "${PROBE_TRAEFIK_TO_SVC}" == "OK" && "${PROBE_CLIENT}" == "FAIL" ]]; then
    echo "- [可能入口层问题] 集群内后端可达,但入口访问失败。检查控制节点防火墙、Traefik Service 暴露端口、外部路由。"
  fi
  if [[ "${PROBE_TRAEFIK_TO_SVC}" == "OK" && "${PROBE_TRAEFIK_TO_POD}" == "FAIL" ]]; then
    echo "- [已知行为候选] Service 可达但 PodIP 直连失败,常见于 kube-router 同节点桥接路径。"
  fi

  echo
  echo "下一步建议:"
  echo "1) 先修复 Endpoints=0 / 404 路由不匹配。"
  echo "2) 再看 Traefik -> Service 探测与 NetworkPolicy 命中。"
  echo "3) 最后结合 KUBE-ROUTER-FORWARD、Pod 链、ipset 判断是否为 kube-router 行为问题。"
  echo
  echo "日志已保存:${LOG_FILE}"
}

# Entry point: set up, ask for the scenario, then run every section in order
# with stdout/stderr mirrored into the log file.
main() {
  init_runtime
  select_scene

  mkdir -p "${LOG_DIR}"
  LOG_FILE="${LOG_DIR}/diag-$(date '+%Y%m%d-%H%M%S')-${APP_NAME}.log"
  exec > >(tee -a "${LOG_FILE}") 2>&1

  diag_context
  diag_baseline
  diag_backend
  diag_probes
  diag_routing
  diag_traefik_logs
  diag_dataplane
  diag_verdict
}

main "$@"