Files
Deploy-Laboratory/scripts/diag/netpol/check-net.sh
2026-03-21 04:36:06 +08:00

420 lines
15 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Interactive diagnostics for the client -> Traefik -> Service -> Pod path.
# Strict mode: abort on unhandled errors, unset variables and failed pipelines.
set -euo pipefail

# --- Where Traefik lives -----------------------------------------------------
NS_TRAEFIK='kube-system'
APP_TRAEFIK_LABEL='app.kubernetes.io/name=traefik'

# --- Probe and log-collection tuning -----------------------------------------
TIMEOUT=3        # seconds; passed to curl -m and wget --timeout
LOG_TAIL=200     # max lines requested from `kubectl logs`
LOG_SINCE='20m'  # time window requested from `kubectl logs`

# --- Log destination (filled in once the runtime is initialised) -------------
LOG_DIR=''
LOG_FILE=''

# --- Probe outcomes; each stays SKIP until the matching probe has run --------
PROBE_CLIENT='SKIP'
PROBE_TRAEFIK_TO_SVC='SKIP'
PROBE_TRAEFIK_TO_POD='SKIP'
PROBE_TRAEFIK_DNS='SKIP'
# Print a section banner: an empty line followed by "=== <title> ===".
# Arguments: $1 - section title
print_title() {
  printf '\n=== %s ===\n' "$1"
}
# Run the given command line best-effort: its output is passed through, but a
# non-zero exit status is swallowed so `set -e` never aborts the script.
# Arguments: $@ - command and its arguments
# Returns:   always 0
safe_run() {
  if ! "$@"; then
    return 0
  fi
}
# Abort the script unless an executable named $1 exists on PATH.
#
# `type -P` is used instead of `command -v` because the latter also reports
# shell functions: this file defines kubectl()/iptables() wrapper functions, so
# `command -v kubectl` would succeed even when no kubectl binary is installed.
#
# Arguments: $1 - command name to look up
# Outputs:   an "[ERR]" line on stdout when the command is missing
# Exits:     1 when the command is missing
require_cmd() {
  local c="$1"
  if ! type -P "$c" >/dev/null 2>&1; then
    echo "[ERR] 缺少命令: $c"
    exit 1
  fi
}
# Resolve the tool locations up front; an empty value means "not found".
# These lookups run before the kubectl()/iptables() wrapper functions below are
# defined, so `command -v` still reports what is on PATH at this point.
KUBECTL_PATH="$(command -v kubectl)" || KUBECTL_PATH=""
IPTABLES_PATH="$(command -v iptables)" || IPTABLES_PATH=""
# Non-empty once init_runtime decides that commands must go through sudo.
USE_SUDO=
# Verify required tools, decide whether sudo is needed and pick the log dir.
# Globals:  EUID, HOME (read); USE_SUDO, LOG_DIR (written)
# Outputs:  an "[INFO]" line when a sudo password prompt is about to appear
init_runtime() {
  local tool
  for tool in kubectl iptables awk grep curl; do
    require_cmd "${tool}"
  done

  if [[ "${EUID}" -ne 0 ]] && command -v sudo >/dev/null 2>&1; then
    # Try non-interactively first; only ask for the password if that fails.
    if ! sudo -n true 2>/dev/null; then
      echo "[INFO] 需要 sudo 权限以读取 iptables / kubectl 配置。"
      sudo -v
    fi
    USE_SUDO="1"
  fi

  # Non-root users log under HOME to avoid permission problems with /root.
  case "${EUID}" in
    0) LOG_DIR="/root/netpol-diag-logs" ;;
    *) LOG_DIR="${HOME}/netpol-diag-logs" ;;
  esac
}
# Single wrapper so the rest of the script never has to decide about sudo.
# Globals:   USE_SUDO, KUBECTL_PATH (read)
# Arguments: $@ - passed to the kubectl binary unchanged
# Returns:   exit status of the kubectl invocation
kubectl() {
  if [[ -z "${USE_SUDO}" ]]; then
    "${KUBECTL_PATH}" "$@"
    return
  fi
  sudo "${KUBECTL_PATH}" "$@"
}
# Run the iptables binary, through sudo when USE_SUDO is set.
# Globals:   USE_SUDO, IPTABLES_PATH (read)
# Arguments: $@ - passed to the iptables binary unchanged
# Returns:   exit status of the iptables invocation
iptables() {
  if [[ -z "${USE_SUDO}" ]]; then
    "${IPTABLES_PATH}" "$@"
    return
  fi
  sudo "${IPTABLES_PATH}" "$@"
}
# Fetch a URL with wget from inside the Traefik deployment.
#
# On success only the response body (wget's stdout) is printed; on failure only
# what the command wrote to stderr is printed.
#
# The captures go to private mktemp files rather than fixed /tmp names, so
# concurrent runs cannot clobber each other and a pre-created symlink in /tmp
# cannot redirect the write. Both files are removed before returning.
#
# Globals:   NS_TRAEFIK, TIMEOUT (read)
# Arguments: $1 - URL to request
# Returns:   0 when the in-pod wget succeeded, 1 otherwise
probe_wget_from_traefik() {
  local url="$1"
  local out_file err_file rc=0

  out_file="$(mktemp)" || return 1
  err_file="$(mktemp)" || {
    rm -f -- "${out_file}"
    return 1
  }

  if kubectl exec -n "${NS_TRAEFIK}" deploy/traefik -- wget -qO- "${url}" --timeout="${TIMEOUT}" >"${out_file}" 2>"${err_file}"; then
    cat -- "${out_file}"
  else
    rc=1
    cat -- "${err_file}"
  fi

  rm -f -- "${out_file}" "${err_file}"
  return "${rc}"
}
# Print PROMPT without a trailing newline, read one line from stdin and store
# it in the variable named VAR; DEFAULT is stored when the reply is empty.
# `read` returns non-zero at EOF (e.g. stdin is /dev/null); that is tolerated
# so a non-interactive run falls back to the defaults instead of being killed
# silently by `set -e`.
# Arguments: $1 - PROMPT, $2 - DEFAULT, $3 - VAR (name of a global variable)
_scene_ask() {
  local prompt="$1" default="$2" var="$3" reply=""
  printf '%s' "${prompt}"
  read -r reply || true
  printf -v "${var}" '%s' "${reply:-${default}}"
}

# Apply one of the built-in demo scenes: namespace "default", with the label
# selector and Service name both derived from the application name.
# Arguments: $1 - app name, $2 - ingress path prefix, $3 - pod port
_scene_preset() {
  NS_BACKEND="default"
  APP_NAME="$1"
  APP_LABEL="app=$1"
  SVC_NAME="$1"
  PATH_PREFIX="$2"
  POD_PORT="$3"
}

# Ask which scenario to diagnose and which entry IP to curl.
# Globals written: CHOICE, NS_BACKEND, APP_NAME, APP_LABEL, SVC_NAME,
#                  PATH_PREFIX, POD_PORT, ENTRY_IP
# Input:  answers are read line by line from stdin; an empty answer (or EOF)
#         selects the default shown in the prompt.
select_scene() {
  echo "请选择诊断场景:"
  echo " 1) nginx-demo (/demo, 80)"
  echo " 2) nodejs-demo (/node, 3000)"
  echo " 3) 自定义"
  _scene_ask "输入序号 [1/2/3](默认 2: " "2" CHOICE

  case "${CHOICE}" in
    1)
      _scene_preset "nginx-demo" "/demo/" "80"
      ;;
    2)
      _scene_preset "nodejs-demo" "/node/" "3000"
      ;;
    3)
      _scene_ask "后端命名空间(默认 default: " "default" NS_BACKEND
      _scene_ask "应用名Deployment/Service 名,示例 nodejs-demo: " "nodejs-demo" APP_NAME
      # The next two defaults depend on the application name entered above.
      _scene_ask "Pod 标签选择器(默认 app=<应用名>: " "app=${APP_NAME}" APP_LABEL
      _scene_ask "Service 名(默认与应用名一致): " "${APP_NAME}" SVC_NAME
      _scene_ask "入口路径前缀(默认 /: " "/" PATH_PREFIX
      _scene_ask "后端 Pod 端口(默认 80: " "80" POD_PORT
      ;;
    *)
      echo "[WARN] 无效选择,使用 nodejs-demo 默认场景。"
      _scene_preset "nodejs-demo" "/node/" "3000"
      ;;
  esac

  _scene_ask "入口 IP用于本机 curl默认 192.168.2.61: " "192.168.2.61" ENTRY_IP
}
# Bootstrap: check tooling / sudo (sets USE_SUDO and LOG_DIR), then ask which
# scenario to diagnose (sets APP_NAME and the other SCENE variables).
init_runtime
select_scene
# One log file per run, named after the start time and the selected app.
mkdir -p "${LOG_DIR}"
LOG_FILE="${LOG_DIR}/diag-$(date '+%Y%m%d-%H%M%S')-${APP_NAME}.log"
# From here on stdout and stderr are both piped through `tee -a`, so everything
# is shown on the terminal and appended to LOG_FILE.
exec > >(tee -a "${LOG_FILE}") 2>&1
# Section 0: record when, where and against which scenario the run happened.
print_title "0. 诊断上下文"
echo "TIME: $(date '+%F %T %Z')"
echo "LOG_FILE=${LOG_FILE}"
echo "SCENE_APP=${APP_NAME}"
echo "SCENE_NS=${NS_BACKEND}"
echo "SCENE_LABEL=${APP_LABEL}"
echo "SCENE_SVC=${SVC_NAME}"
echo "SCENE_PATH=${PATH_PREFIX}"
echo "SCENE_POD_PORT=${POD_PORT}"
echo "ENTRY_IP=${ENTRY_IP}"
echo "HOSTNAME=$(hostname)"
# Newer kubectl releases no longer accept `--short` (the short form became the
# default output), so fall back to plain `kubectl version` when the flag is
# rejected or the call fails for any other reason. The first attempt is
# captured so a failed attempt does not leave partial output in the log.
if KUBECTL_VERSION_OUT="$(kubectl version --short 2>/dev/null)"; then
  echo "${KUBECTL_VERSION_OUT}"
else
  safe_run kubectl version
fi
# Section 1: cluster nodes, Traefik objects and kube-proxy baseline.
print_title "1. 集群与 Traefik 基线"
safe_run kubectl get nodes -o wide
# The Traefik Deployment and Service are both named "traefik".
for TRAEFIK_RES in deploy svc; do
  safe_run kubectl get "${TRAEFIK_RES}" -n "${NS_TRAEFIK}" traefik -o wide
done
# Shared selector for every query that targets the Traefik pods.
TRAEFIK_POD_QUERY=(get pod -n "${NS_TRAEFIK}" -l "${APP_TRAEFIK_LABEL}")
safe_run kubectl "${TRAEFIK_POD_QUERY[@]}" -o wide
kubectl get pods -n kube-system -o wide \
  | grep -E 'kube-router|flannel|traefik|svclb-traefik' \
  || true
# Name and IP of the first matching Traefik pod; empty when none is found.
TRAEFIK_POD="$(kubectl "${TRAEFIK_POD_QUERY[@]}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
TRAEFIK_IP="$(kubectl "${TRAEFIK_POD_QUERY[@]}" -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || true)"

echo "--- 1.1 kube-proxy 基线 ---"
safe_run kubectl get pod -n kube-system -l k8s-app=kube-proxy -o wide
safe_run kubectl get configmap -n kube-system kube-proxy -o yaml
KPROXY_POD="$(kubectl get pod -n kube-system -l k8s-app=kube-proxy -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
if [[ -z "${KPROXY_POD}" ]]; then
  echo "[WARN] 未找到 kube-proxy PodK3s 某些模式可忽略)"
else
  echo "--- 1.2 kube-proxy 日志关键字error|conntrack|iptables|ipvs|nft---"
  kubectl logs -n kube-system "${KPROXY_POD}" --tail=200 \
    | grep -Ei "error|fail|conntrack|iptables|ipvs|nft|sync" \
    || true
fi
# Section 2: collect the backend Deployment / Service / Endpoints / Pods and
# derive the addresses used by the probes in the following sections.
print_title "2. 业务资源采集"
safe_run kubectl get deploy -n "${NS_BACKEND}" "${APP_NAME}" -o wide
safe_run kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o wide
safe_run kubectl get endpoints -n "${NS_BACKEND}" "${SVC_NAME}" -o wide
safe_run kubectl get endpointslice -n "${NS_BACKEND}" -l kubernetes.io/service-name="${SVC_NAME}" -o wide
safe_run kubectl get pod -n "${NS_BACKEND}" -l "${APP_LABEL}" -o wide
safe_run kubectl get pod -n "${NS_BACKEND}" -l "${APP_LABEL}" --show-labels
safe_run kubectl get ingress -n "${NS_BACKEND}"
safe_run kubectl get ingressroute -n "${NS_BACKEND}"
safe_run kubectl get networkpolicy -n "${NS_BACKEND}"
safe_run kubectl get networkpolicy -n "${NS_TRAEFIK}"
safe_run kubectl get ns "${NS_BACKEND}" "${NS_TRAEFIK}" --show-labels
# Each lookup yields an empty string when the object does not exist.
BACKEND_POD_IP="$(kubectl get pod -n "${NS_BACKEND}" -l "${APP_LABEL}" -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || true)"
SVC_IP="$(kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o jsonpath='{.spec.clusterIP}' 2>/dev/null || true)"
# Count the ready endpoint addresses. The kubectl call is allowed to fail: with
# `set -e -o pipefail` a failing `kubectl | awk` pipeline inside the assignment
# would terminate the whole script when the Service/Endpoints object is
# missing, which is exactly the situation this tool is meant to report.
EP_IPS="$(kubectl get endpoints -n "${NS_BACKEND}" "${SVC_NAME}" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null || true)"
EP_COUNT="$(awk '{ n += NF } END { print n + 0 }' <<<"${EP_IPS}")"
EP_COUNT="${EP_COUNT:-0}"
echo "TRAEFIK_POD=${TRAEFIK_POD:-<none>}"
echo "TRAEFIK_IP=${TRAEFIK_IP:-<none>}"
echo "BACKEND_POD_IP=${BACKEND_POD_IP:-<none>}"
echo "SVC_IP=${SVC_IP:-<none>}"
echo "ENDPOINTS_COUNT=${EP_COUNT}"
echo "SERVICE_SELECTOR=$(kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o jsonpath='{.spec.selector}' 2>/dev/null || echo '{}')"
echo "--- 2.1 EndpointSlice 条件ready/serving/terminating/node---"
# One "slice=" line per EndpointSlice, then one line per endpoint in it.
kubectl get endpointslice -n "${NS_BACKEND}" -l kubernetes.io/service-name="${SVC_NAME}" \
  -o jsonpath='{range .items[*]}{"slice="}{.metadata.name}{"\n"}{range .endpoints[*]}{" addr="}{.addresses[0]}{" ready="}{.conditions.ready}{" serving="}{.conditions.serving}{" terminating="}{.conditions.terminating}{" node="}{.nodeName}{"\n"}{end}{end}' \
  || true
# Section 3: probe every hop of the request path and record each outcome in the
# PROBE_* variables (OK_<code> / OK / FAIL; SKIP when prerequisites are missing).
print_title "3. 主链路连通性探测"
echo "--- 3.1 本机 -> 入口 (${ENTRY_IP}${PATH_PREFIX}) ---"
# Response body and curl's stderr go to a private mktemp directory instead of
# fixed /tmp names, so parallel runs and pre-created symlinks cannot interfere.
CLIENT_TMP="$(mktemp -d)"
if CODE="$(curl -sS -m "${TIMEOUT}" -o "${CLIENT_TMP}/body" -w "%{http_code}" "http://${ENTRY_IP}${PATH_PREFIX}" 2>"${CLIENT_TMP}/err")"; then
  echo "HTTP_CODE=${CODE}"
  echo "BODY_PREVIEW:"
  head -c 200 "${CLIENT_TMP}/body" || true
  echo
  PROBE_CLIENT="OK_${CODE}"
else
  cat "${CLIENT_TMP}/err"
  PROBE_CLIENT="FAIL"
fi
rm -rf -- "${CLIENT_TMP}"

# Probe the port the Service actually exposes (its first declared port) rather
# than assuming 80; 80 remains the fallback when the lookup yields nothing.
SVC_PORT="$(kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o jsonpath='{.spec.ports[0].port}' 2>/dev/null || true)"
SVC_PORT="${SVC_PORT:-80}"

if [[ -n "${TRAEFIK_POD}" && -n "${SVC_IP}" ]]; then
  echo
  echo "--- 3.2 Traefik -> ServiceIP (${SVC_IP}:${SVC_PORT}) ---"
  if probe_wget_from_traefik "http://${SVC_IP}:${SVC_PORT}"; then
    PROBE_TRAEFIK_TO_SVC="OK"
  else
    PROBE_TRAEFIK_TO_SVC="FAIL"
  fi
else
  echo "[SKIP] 缺少 Traefik Pod 或 ServiceIP。"
fi

if [[ -n "${TRAEFIK_POD}" ]]; then
  echo
  echo "--- 3.3 Traefik -> Service DNS (${SVC_NAME}.${NS_BACKEND}.svc.cluster.local:${SVC_PORT}) ---"
  if probe_wget_from_traefik "http://${SVC_NAME}.${NS_BACKEND}.svc.cluster.local:${SVC_PORT}"; then
    PROBE_TRAEFIK_DNS="OK"
  else
    PROBE_TRAEFIK_DNS="FAIL"
  fi
else
  echo "[SKIP] 未找到 Traefik Pod。"
fi

if [[ -n "${TRAEFIK_POD}" && -n "${BACKEND_POD_IP}" ]]; then
  echo
  echo "--- 3.4 Traefik -> PodIP (${BACKEND_POD_IP}:${POD_PORT}) ---"
  if probe_wget_from_traefik "http://${BACKEND_POD_IP}:${POD_PORT}"; then
    PROBE_TRAEFIK_TO_POD="OK"
  else
    PROBE_TRAEFIK_TO_POD="FAIL"
  fi
else
  echo "[SKIP] 缺少 Traefik Pod 或后端 PodIP。"
fi
# Section 4: full YAML dumps of the routing objects, the Service, the
# NetworkPolicies and recent events. Every call is best-effort (safe_run).
print_title "4. 路由与配置详情"

printf '%s\n' "--- 4.1 Ingress ---"
safe_run kubectl get ingress -n "${NS_BACKEND}" -o yaml

printf '%s\n' "--- 4.2 IngressRoute ---"
safe_run kubectl get ingressroute -n "${NS_BACKEND}" -o yaml

printf '%s\n' "--- 4.3 Service / Endpoints ---"
for DUMP_KIND in svc endpoints; do
  safe_run kubectl get "${DUMP_KIND}" -n "${NS_BACKEND}" "${SVC_NAME}" -o yaml
done
safe_run kubectl describe svc -n "${NS_BACKEND}" "${SVC_NAME}"

printf '%s\n' "--- 4.4 相关 NetworkPolicykube-system + backend---"
for DUMP_NS in "${NS_TRAEFIK}" "${NS_BACKEND}"; do
  safe_run kubectl get networkpolicy -n "${DUMP_NS}" -o yaml
done

printf '%s\n' "--- 4.5 近期事件backend + kube-system---"
for DUMP_NS in "${NS_BACKEND}" kube-system; do
  safe_run kubectl get events -n "${DUMP_NS}" --sort-by=.lastTimestamp
done
# Section 5: Traefik logs - the raw window first, then two filtered views of
# the same window, then the previous container's log.
print_title "5. Traefik 日志(最近 ${LOG_SINCE},最多 ${LOG_TAIL} 行)"
# Shared command prefix and window for all log queries below.
TRAEFIK_LOGS=(kubectl logs -n "${NS_TRAEFIK}" deploy/traefik)
TRAEFIK_LOG_WINDOW=(--since="${LOG_SINCE}" --tail="${LOG_TAIL}")

safe_run "${TRAEFIK_LOGS[@]}" "${TRAEFIK_LOG_WINDOW[@]}"

echo "--- 5.1 关键字过滤404|502|503|router|service|middleware|upstream|${SVC_NAME}|${PATH_PREFIX} ---"
"${TRAEFIK_LOGS[@]}" "${TRAEFIK_LOG_WINDOW[@]}" \
  | grep -Ei "404|502|503|router|service|middleware|upstream|endpoint|${SVC_NAME}|${PATH_PREFIX}" \
  || true

echo "--- 5.2 Traefik 访问日志候选status=404/502/503 ---"
# Matches a double quote followed by the status code surrounded by spaces.
"${TRAEFIK_LOGS[@]}" "${TRAEFIK_LOG_WINDOW[@]}" \
  | grep -E '" 404 |" 502 |" 503 ' \
  || true

echo "--- 5.3 Traefik 上一次容器日志(若重启过) ---"
safe_run "${TRAEFIK_LOGS[@]}" --previous --tail=100
# Run a command with the same elevation the kubectl/iptables wrappers use:
# through sudo when USE_SUDO is set, directly otherwise. Needed because the
# firewall, ipset and conntrack tools below generally require root as well.
# Arguments: $@ - command and its arguments
as_root() {
  if [[ -n "${USE_SUDO:-}" ]]; then
    sudo "$@"
  else
    "$@"
  fi
}

# Print the name of the first KUBE-POD-FW-* chain that a KUBE-ROUTER-FORWARD
# rule mentioning the given pod IP refers to; prints nothing when there is no
# such rule or the chain cannot be listed.
#
# Each rule line is scanned field by field instead of relying on a fixed column
# number, because the position of the target column depends on whether
# `-v` / `--line-numbers` are in effect. The IP is compared as an exact field
# (optionally with a /32 suffix), so 10.42.0.5 does not match 10.42.0.50 and
# the dots are not treated as regex wildcards.
#
# Arguments: $1 - pod IP
# Returns:   always 0, so a missing chain cannot abort the script under
#            `set -e -o pipefail`
find_pod_fw_chain() {
  local pod_ip="$1"
  iptables -L KUBE-ROUTER-FORWARD -n 2>/dev/null \
    | awk -v ip="${pod_ip}" '
        printed { next }
        {
          chain = ""; hit = 0
          for (i = 1; i <= NF; i++) {
            if (chain == "" && $i ~ /^KUBE-POD-FW-/) chain = $i
            if ($i == ip || $i == (ip "/32")) hit = 1
          }
          if (hit && chain != "") { print chain; printed = 1 }
        }' \
    || true
}

# Section 6: host firewall state and the kube-router / kube-proxy data plane.
print_title "6. 防火墙与数据平面"
echo "--- 6.1 防火墙状态 ---"
if command -v firewall-cmd >/dev/null 2>&1; then
  safe_run as_root firewall-cmd --state
  safe_run as_root firewall-cmd --list-all
else
  echo "firewall-cmd: not found"
fi
if command -v ufw >/dev/null 2>&1; then
  safe_run as_root ufw status verbose
else
  echo "ufw: not found"
fi
echo "--- 6.2 FORWARD 与 KUBE-ROUTER-FORWARD ---"
safe_run iptables -L FORWARD -n -v --line-numbers
safe_run iptables -L KUBE-ROUTER-FORWARD -n -v --line-numbers
echo "--- 6.2.1 NAT 链KUBE-SERVICES---"
safe_run iptables -t nat -L KUBE-SERVICES -n -v --line-numbers
if [[ -n "${SVC_IP}" ]]; then
  echo "--- 6.2.2 NAT 链中 ServiceIP 相关规则 (${SVC_IP}) ---"
  # -F: the IP is a literal string, not a regular expression.
  iptables -t nat -S | grep -F -- "${SVC_IP}" || true
fi
# Per-pod firewall chains for the Traefik pod and the backend pod, if any.
TRAEFIK_CHAIN=""
BACKEND_CHAIN=""
if [[ -n "${TRAEFIK_IP}" ]]; then
  TRAEFIK_CHAIN="$(find_pod_fw_chain "${TRAEFIK_IP}")"
fi
if [[ -n "${BACKEND_POD_IP}" ]]; then
  BACKEND_CHAIN="$(find_pod_fw_chain "${BACKEND_POD_IP}")"
fi
echo "TRAEFIK_CHAIN=${TRAEFIK_CHAIN:-<not found>}"
echo "BACKEND_CHAIN=${BACKEND_CHAIN:-<not found>}"
if [[ -n "${TRAEFIK_CHAIN}" ]]; then
  echo "--- 6.3 Traefik Pod 链 ${TRAEFIK_CHAIN} ---"
  safe_run iptables -L "${TRAEFIK_CHAIN}" -n -v -x
  echo "--- 6.4 Traefik Pod 链规则定义 ---"
  safe_run iptables -S "${TRAEFIK_CHAIN}"
fi
if [[ -n "${BACKEND_CHAIN}" ]]; then
  echo "--- 6.5 Backend Pod 链 ${BACKEND_CHAIN} ---"
  safe_run iptables -L "${BACKEND_CHAIN}" -n -v -x
  echo "--- 6.6 Backend Pod 链规则定义 ---"
  safe_run iptables -S "${BACKEND_CHAIN}"
fi
echo "--- 6.7 ipsetKUBE-SRC/KUBE-DST---"
if command -v ipset >/dev/null 2>&1; then
  as_root ipset list -n | grep -E '^KUBE-(SRC|DST)-' || true
else
  echo "ipset: not found"
fi
echo "--- 6.8 conntrack容量与关键连接---"
if command -v conntrack >/dev/null 2>&1; then
  safe_run as_root conntrack -S
  safe_run sysctl net.netfilter.nf_conntrack_count
  safe_run sysctl net.netfilter.nf_conntrack_max
  if [[ -n "${SVC_IP}" ]]; then
    echo "conntrack by service ip (${SVC_IP}):"
    as_root conntrack -L -d "${SVC_IP}" 2>/dev/null | head -n 100 || true
  fi
  if [[ -n "${BACKEND_POD_IP}" ]]; then
    echo "conntrack by backend pod ip (${BACKEND_POD_IP}):"
    as_root conntrack -L -d "${BACKEND_POD_IP}" 2>/dev/null | head -n 100 || true
  fi
else
  echo "conntrack: not found"
fi
# Section 7: turn the probe outcomes into human-readable hints. The hints are
# printed in a fixed order; several may apply to the same run.
print_title "7. 自动判读502/503/404"
printf '%s\n' \
  "探测结果:" \
  " CLIENT=${PROBE_CLIENT}" \
  " TRAEFIK_TO_SVC=${PROBE_TRAEFIK_TO_SVC}" \
  " TRAEFIK_DNS=${PROBE_TRAEFIK_DNS}" \
  " TRAEFIK_TO_POD=${PROBE_TRAEFIK_TO_POD}"

# No endpoint addresses behind the Service.
[[ "${EP_COUNT}" != "0" ]] \
  || echo "- [高概率 503] Service 无可用 Endpoints。检查 Deployment 是否 Ready、selector 是否匹配。"

# Status code returned to the local client (PROBE_CLIENT is "OK_<code>").
case "${PROBE_CLIENT}" in
  OK_404*)
    echo "- [高概率 404] 入口路由未命中。检查 Ingress/IngressRoute 的 path、host、middleware。"
    ;;
  OK_503*)
    echo "- [高概率 503] 入口已命中但后端不可用。优先看 Endpoints/EndpointSlice 条件与 Traefik 日志。"
    ;;
esac

# Hints that depend on whether Traefik could reach the Service IP.
case "${PROBE_TRAEFIK_TO_SVC}" in
  FAIL)
    echo "- [高概率 502/503] Traefik 到 Service 不通。优先检查 NetworkPolicy、kube-router 链、DNS 53 放行。"
    if [[ "${PROBE_TRAEFIK_DNS}" == "FAIL" ]]; then
      echo "- [可能 DNS/服务发现问题] Traefik 到 Service DNS 与 ServiceIP 都失败。检查 CoreDNS、kube-system egress 53。"
    fi
    ;;
  OK)
    if [[ "${PROBE_CLIENT}" == "FAIL" ]]; then
      echo "- [可能入口层问题] 集群内后端可达但入口访问失败。检查控制节点防火墙、Traefik Service 暴露端口、外部路由。"
    fi
    if [[ "${PROBE_TRAEFIK_TO_POD}" == "FAIL" ]]; then
      echo "- [已知行为候选] Service 可达但 PodIP 直连失败,常见于 kube-router 同节点桥接路径。"
    fi
    ;;
esac

echo
printf '%s\n' \
  "下一步建议:" \
  "1) 先修复 Endpoints=0 / 404 路由不匹配。" \
  "2) 再看 Traefik -> Service 探测与 NetworkPolicy 命中。" \
  "3) 最后结合 KUBE-ROUTER-FORWARD、Pod 链、ipset 判断是否为 kube-router 行为问题。"
echo
echo "日志已保存:${LOG_FILE}"