#!/usr/bin/env bash

# --- Worker packet capture (start_worker_capture) ---------------------------
CAPTURE_MODE="N"      # Y/y enables the worker ens18 capture
CAPTURE_SECONDS="12"  # passed to `timeout` in front of the remote tcpdump
CAPTURE_MODE_ARG=""   # not referenced in this section; presumably the raw CLI value (TODO confirm)
CAP_FILE_ENS18=""     # temp file receiving the capture output
CAP_PID_ENS18=""      # PID of the background ssh running the capture

# --- Worker nft trace (start_worker_nft_trace) ------------------------------
NFT_TRACE_MODE="N"     # Y/y enables the worker nft trace
NFT_TRACE_SECONDS="8"  # passed to `timeout` in front of `nft monitor trace`
NFT_TRACE_MODE_ARG=""  # not referenced in this section; presumably the raw CLI value (TODO confirm)
NFT_FILE=""            # temp file receiving the trace output
NFT_PID=""             # PID of the background ssh running the trace
NFT_TRACE_TABLE="diag_k3s_entrypath"          # scratch inet table created on the worker
LOCAL_NFT_TRACE_TABLE="diag61_k3s_entrypath"  # scratch inet table created on this host

# --- Return-path trace (start_return_path_trace) ----------------------------
RETURN_TRACE_MODE="N"      # Y/y enables the return-path trace
RETURN_TRACE_SECONDS="10"  # passed to `timeout` for every return-path capture
RETURN_TRACE_MODE_ARG=""   # not referenced in this section; presumably the raw CLI value (TODO confirm)
# Temp files and background PIDs, one pair per capture started by the trace.
RET_FILE_LOCAL_8000=""
RET_FILE_LOCAL_CNI0=""
RET_FILE_WORKER_ENS18=""
RET_FILE_WORKER_CONNTRACK=""
RET_PID_LOCAL_8000=""
RET_PID_LOCAL_CNI0=""
RET_PID_WORKER_ENS18=""
RET_PID_WORKER_CONNTRACK=""
RET_FILE_LOCAL_NFT_TRACE=""
RET_PID_LOCAL_NFT_TRACE=""

# --- Results filled in by flush_worker_capture ------------------------------
NFT_DNAT_HIT="no"  # becomes "yes" when the worker trace shows a DNAT to ${TRAEFIK_IP}:8000
RET_LOCAL_SYN_COUNT=0
RET_LOCAL_SYNACK_COUNT=0
RET_CNI0_SYN_COUNT=0
RET_CNI0_SYNACK_COUNT=0
RET_WORKER_SYN_COUNT=0
RET_WORKER_SYNACK_COUNT=0

# --- Traefik pod netns capture (start_pod_netns_trace) ----------------------
POD_NETNS_TRACE_MODE="N"        # Y/y enables the pod netns capture
POD_NETNS_TRACE_MODE_ARG=""     # not referenced in this section; presumably the raw CLI value (TODO confirm)
POD_NETNS_TRACE_SECONDS=""      # empty means "fall back to RETURN_TRACE_SECONDS"
POD_NETNS_TRACE_SECONDS_ARG=""  # not referenced in this section; presumably the raw CLI value (TODO confirm)
POD_NETNS_PID=""                # PID of the background capture job
POD_NETNS_FILE=""               # temp file receiving the capture output
POD_NETNS_SYN_COUNT=0
POD_NETNS_SYNACK_COUNT=0

#######################################
# Start a background tcpdump on the worker's ens18 interface over SSH.
# Does nothing unless CAPTURE_MODE is Y/y. Prints a warning and skips when
# DO_REMOTE is not Y/y or WORKER_HOST is empty.
# Globals:
#   read:    CAPTURE_MODE, DO_REMOTE, WORKER_HOST, SSH_OPTS, CAPTURE_SECONDS,
#            CLIENT_IP
#   written: CAP_FILE_ENS18 (temp file receiving the capture),
#            CAP_PID_ENS18 (PID of the background ssh)
# Returns: 0 on both skip paths.
#######################################
start_worker_capture() {
  if [[ ! "$CAPTURE_MODE" =~ ^[Yy]$ ]]; then
    return 0
  fi
  if [[ ! "$DO_REMOTE" =~ ^[Yy]$ ]] || [[ -z "${WORKER_HOST}" ]]; then
    echo "[WARN] 抓包模式已开启,但未启用远端检查或未提供 worker 主机,跳过抓包。"
    return 0
  fi

  CAP_FILE_ENS18="$(mktemp)"
  say "启动 worker 抓包(ens18, ${CAPTURE_SECONDS}s): host ${CLIENT_IP} and tcp port 80"
  # Remote tcpdump stderr is dropped on the worker; the local 2>&1 only
  # captures what ssh itself prints.
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" \
    "sudo timeout ${CAPTURE_SECONDS} tcpdump -ni ens18 'host ${CLIENT_IP} and tcp port 80' 2>/dev/null || true" \
    >"${CAP_FILE_ENS18}" 2>&1 &
  CAP_PID_ENS18="$!"
  # Pause briefly before returning so the remote capture has time to start.
  sleep 1
}

#######################################
# Install a scratch nftables trace rule on the worker and start
# `nft monitor trace` there in the background.
# Does nothing unless NFT_TRACE_MODE is Y/y. Prints a warning and skips when
# DO_REMOTE is not Y/y or WORKER_HOST is empty.
# Globals:
#   read:    NFT_TRACE_MODE, DO_REMOTE, WORKER_HOST, SSH_OPTS,
#            NFT_TRACE_TABLE, NFT_TRACE_SECONDS, CLIENT_IP, LB_IP
#   written: NFT_FILE (temp file receiving the trace),
#            NFT_PID (PID of the background ssh)
# Returns: 0 on both skip paths.
#######################################
start_worker_nft_trace() {
  if [[ ! "$NFT_TRACE_MODE" =~ ^[Yy]$ ]]; then
    return 0
  fi
  if [[ ! "$DO_REMOTE" =~ ^[Yy]$ ]] || [[ -z "${WORKER_HOST}" ]]; then
    echo "[WARN] nft trace 已开启,但未启用远端检查或未提供 worker 主机,跳过 nft trace。"
    return 0
  fi

  # Drop any table left behind by an interrupted run first: `add rule` always
  # appends, so a stale table would otherwise keep old rules alongside the new
  # one. Every step is best-effort, as is the ssh call itself.
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" \
    "sudo nft delete table inet ${NFT_TRACE_TABLE} 2>/dev/null || true; \
     sudo nft add table inet ${NFT_TRACE_TABLE} 2>/dev/null || true; \
     sudo nft 'add chain inet ${NFT_TRACE_TABLE} prerouting { type filter hook prerouting priority -301; policy accept; }' 2>/dev/null || true; \
     sudo nft add rule inet ${NFT_TRACE_TABLE} prerouting iif \"ens18\" ip saddr ${CLIENT_IP} ip daddr ${LB_IP} tcp dport 80 meta nftrace set 1 2>/dev/null || true" \
    || true

  NFT_FILE="$(mktemp)"
  say "启动 worker nft trace(${NFT_TRACE_SECONDS}s)"
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" \
    "sudo timeout ${NFT_TRACE_SECONDS} nft monitor trace 2>/dev/null || true" \
    >"${NFT_FILE}" 2>&1 &
  NFT_PID="$!"
  # Pause briefly before returning so the remote monitor has time to start.
  sleep 1
}

#######################################
# Start the return-path trace: a local nft forward trace, two local tcpdumps
# (any, cni0) for ${TRAEFIK_IP}:8000, a tcpdump on the worker's ens18 and a
# conntrack event stream on the worker. All five run in the background.
# Does nothing unless RETURN_TRACE_MODE is Y/y. Prints a warning and skips
# when DO_REMOTE is not Y/y or WORKER_HOST is empty.
# Globals:
#   read:    RETURN_TRACE_MODE, DO_REMOTE, WORKER_HOST, SSH_OPTS,
#            RETURN_TRACE_SECONDS, LOCAL_NFT_TRACE_TABLE, TRAEFIK_IP, CLIENT_IP
#   written: RET_FILE_LOCAL_8000, RET_FILE_LOCAL_CNI0, RET_FILE_LOCAL_NFT_TRACE,
#            RET_FILE_WORKER_ENS18, RET_FILE_WORKER_CONNTRACK and the matching
#            RET_PID_* variables
# Returns: 0 on both skip paths.
#######################################
start_return_path_trace() {
  if [[ ! "$RETURN_TRACE_MODE" =~ ^[Yy]$ ]]; then
    return 0
  fi
  if [[ ! "$DO_REMOTE" =~ ^[Yy]$ ]] || [[ -z "${WORKER_HOST}" ]]; then
    echo "[WARN] 回包链路跟踪已开启,但未启用远端检查或未提供 worker 主机,跳过。"
    return 0
  fi

  RET_FILE_LOCAL_8000="$(mktemp)"
  RET_FILE_LOCAL_CNI0="$(mktemp)"
  RET_FILE_LOCAL_NFT_TRACE="$(mktemp)"
  RET_FILE_WORKER_ENS18="$(mktemp)"
  RET_FILE_WORKER_CONNTRACK="$(mktemp)"

  say "启动回包链路跟踪(${RETURN_TRACE_SECONDS}s)"

  # Local scratch trace table. Remove a leftover from an interrupted run
  # first, because `add rule` appends and would duplicate the trace rule.
  sudo nft delete table inet "${LOCAL_NFT_TRACE_TABLE}" 2>/dev/null || true
  sudo nft add table inet "${LOCAL_NFT_TRACE_TABLE}" 2>/dev/null || true
  sudo nft "add chain inet ${LOCAL_NFT_TRACE_TABLE} forward { type filter hook forward priority -301; policy accept; }" 2>/dev/null || true
  sudo nft add rule inet "${LOCAL_NFT_TRACE_TABLE}" forward iif "flannel.1" ip daddr "${TRAEFIK_IP}" tcp dport 8000 meta nftrace set 1 2>/dev/null || true

  # For the local jobs both stdout and stderr go to the capture file. The
  # previous form also had a leading `2>/dev/null`, which the later `2>&1`
  # overrode; it was dead and has been removed.
  sudo timeout "${RETURN_TRACE_SECONDS}" nft monitor trace \
    >"${RET_FILE_LOCAL_NFT_TRACE}" 2>&1 &
  RET_PID_LOCAL_NFT_TRACE="$!"

  sudo timeout "${RETURN_TRACE_SECONDS}" tcpdump -ni any "host ${TRAEFIK_IP} and tcp port 8000" \
    >"${RET_FILE_LOCAL_8000}" 2>&1 &
  RET_PID_LOCAL_8000="$!"

  sudo timeout "${RETURN_TRACE_SECONDS}" tcpdump -ni cni0 "host ${TRAEFIK_IP} and tcp port 8000" \
    >"${RET_FILE_LOCAL_CNI0}" 2>&1 &
  RET_PID_LOCAL_CNI0="$!"

  # For the remote jobs the tool's stderr is dropped on the worker; the local
  # 2>&1 only captures what ssh itself prints.
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" \
    "sudo timeout ${RETURN_TRACE_SECONDS} tcpdump -ni ens18 'host ${CLIENT_IP} and tcp' 2>/dev/null || true" \
    >"${RET_FILE_WORKER_ENS18}" 2>&1 &
  RET_PID_WORKER_ENS18="$!"

  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" \
    "if command -v conntrack >/dev/null 2>&1; then sudo timeout ${RETURN_TRACE_SECONDS} conntrack -E -p tcp 2>/dev/null || true; else echo 'conntrack: not found'; fi" \
    >"${RET_FILE_WORKER_CONNTRACK}" 2>&1 &
  RET_PID_WORKER_CONNTRACK="$!"

  # Pause briefly before returning so the captures have time to start.
  sleep 1
}

#######################################
# Start a background tcpdump for TCP port 8000 inside the network namespace
# of the traefik container.
# Does nothing unless POD_NETNS_TRACE_MODE is Y/y. Prints a warning and skips
# when crictl or nsenter is missing, or when the container ID or PID cannot
# be resolved.
# Globals:
#   read:    POD_NETNS_TRACE_MODE, POD_NETNS_TRACE_SECONDS,
#            RETURN_TRACE_SECONDS, TRAEFIK_POD
#   written: POD_NETNS_FILE (temp file receiving the capture),
#            POD_NETNS_PID (PID of the background job)
# Returns: 0 on every skip path.
#######################################
start_pod_netns_trace() {
  if [[ ! "${POD_NETNS_TRACE_MODE}" =~ ^[Yy]$ ]]; then
    return 0
  fi
  if ! command -v crictl >/dev/null 2>&1; then
    echo "[WARN] 未找到 crictl,跳过 pod netns 抓包。"
    return 0
  fi
  if ! command -v nsenter >/dev/null 2>&1; then
    echo "[WARN] 未找到 nsenter,跳过 pod netns 抓包。"
    return 0
  fi

  # Capture duration: the dedicated setting, else the return-trace duration.
  local sec="${POD_NETNS_TRACE_SECONDS:-$RETURN_TRACE_SECONDS}"
  local cid
  local pid
  local runtime_id=""

  # Preferred source: the pod status, which reports "<runtime>://<id>".
  runtime_id="$(sudo kubectl -n kube-system get pod "${TRAEFIK_POD}" -o jsonpath='{.status.containerStatuses[?(@.name=="traefik")].containerID}' 2>/dev/null || true)"
  # Strip whatever runtime scheme prefixes the ID (containerd://, cri-o://,
  # or any other), not just a fixed list.
  runtime_id="${runtime_id#*://}"

  if [[ -n "${runtime_id}" ]]; then
    cid="${runtime_id}"
  else
    # Fallback: first container that crictl lists under the name "traefik".
    cid="$(sudo crictl ps --name traefik -q 2>/dev/null | awk 'NR==1{print; exit}' || true)"
  fi
  if [[ -z "${cid}" ]]; then
    echo "[WARN] 未解析到 traefik 容器ID,跳过 pod netns 抓包。"
    return 0
  fi

  # First `"pid": N` line of the inspect output, with the trailing comma removed.
  pid="$(sudo crictl inspect "${cid}" 2>/dev/null | awk -F': ' '/"pid":/ {gsub(/,/, "", $2); print $2; exit}' || true)"
  # The regex needs at least one digit, so it also rejects an empty value.
  if [[ ! "${pid}" =~ ^[0-9]+$ ]]; then
    echo "[WARN] 未解析到 traefik 容器 PID,跳过 pod netns 抓包。"
    return 0
  fi

  POD_NETNS_FILE="$(mktemp)"
  say "启动 Traefik Pod netns 抓包(${sec}s, pid=${pid})"
  # Both stdout and stderr go to the capture file. The previous form also had
  # a leading `2>/dev/null`, which the later `2>&1` overrode; it was dead and
  # has been removed.
  sudo timeout "${sec}" nsenter -t "${pid}" -n tcpdump -ni any "tcp port 8000" \
    >"${POD_NETNS_FILE}" 2>&1 &
  POD_NETNS_PID="$!"
  # Pause briefly before returning so the capture has time to start.
  sleep 1
}

# Wait for the background job whose PID is stored in the global named by $1,
# then clear that global. Does nothing when the global is empty.
_flush_reap_job() {
  local pid_var="$1"
  local job_pid="${!pid_var}"
  [[ -n "${job_pid}" ]] || return 0
  wait "${job_pid}" || true
  printf -v "${pid_var}" '%s' ''
}

# Print the capture file whose path is stored in the global named by $1 under
# the banner $2, delete the file and clear the global. When $3 and $4 name
# globals, store the "Flags [S]" / "Flags [S.]" counts there before printing.
# Does nothing when the global is empty or the file does not exist.
_flush_emit_file() {
  local file_var="$1"
  local banner="$2"
  local syn_var="${3:-}"
  local synack_var="${4:-}"
  local path="${!file_var}"
  local syn
  local synack

  [[ -n "${path}" && -f "${path}" ]] || return 0

  if [[ -n "${syn_var}" ]]; then
    syn="$(count_tcpdump_flag "${path}" "Flags [S]")"
    synack="$(count_tcpdump_flag "${path}" "Flags [S.]")"
    printf -v "${syn_var}" '%s' "${syn}"
    printf -v "${synack_var}" '%s' "${synack}"
  fi

  echo
  echo "${banner}"
  cat "${path}" || true
  rm -f "${path}" || true
  printf -v "${file_var}" '%s' ''
}

#######################################
# Wait for every capture/trace job started by the start_* functions, print
# each result file under its banner, delete the temp files, and remove the
# scratch nft tables.
# Globals:
#   read:    TRAEFIK_IP, CLIENT_IP, NFT_TRACE_MODE, DO_REMOTE, WORKER_HOST,
#            SSH_OPTS, NFT_TRACE_TABLE, LOCAL_NFT_TRACE_TABLE
#   written: all *_PID* / *_FILE* capture globals (cleared), NFT_DNAT_HIT,
#            RET_*_SYN_COUNT / RET_*_SYNACK_COUNT, POD_NETNS_SYN_COUNT,
#            POD_NETNS_SYNACK_COUNT
# Outputs:   the capture contents on stdout
#######################################
flush_worker_capture() {
  # Worker ens18 capture.
  _flush_reap_job CAP_PID_ENS18
  _flush_emit_file CAP_FILE_ENS18 "===== Worker 抓包结果(ens18) ====="

  # Worker nft trace. Record whether a DNAT to the Traefik endpoint was seen.
  # Fixed-string match: the dots in TRAEFIK_IP must not act as regex wildcards.
  _flush_reap_job NFT_PID
  if [[ -n "${NFT_FILE}" && -f "${NFT_FILE}" ]] \
    && grep -Fq -- "dnat to ${TRAEFIK_IP}:8000" "${NFT_FILE}" 2>/dev/null; then
    NFT_DNAT_HIT="yes"
  fi
  _flush_emit_file NFT_FILE "===== Worker nft trace 结果 ====="

  # Remove the worker scratch table under the same conditions it was created.
  if [[ "$NFT_TRACE_MODE" =~ ^[Yy]$ ]] && [[ "$DO_REMOTE" =~ ^[Yy]$ ]] && [[ -n "${WORKER_HOST}" ]]; then
    ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "sudo nft delete table inet ${NFT_TRACE_TABLE} 2>/dev/null || true" || true
  fi

  # Return-path trace: reap all jobs first, then print the files.
  _flush_reap_job RET_PID_LOCAL_8000
  _flush_reap_job RET_PID_LOCAL_NFT_TRACE
  _flush_reap_job RET_PID_LOCAL_CNI0
  _flush_reap_job RET_PID_WORKER_ENS18
  _flush_reap_job RET_PID_WORKER_CONNTRACK

  _flush_emit_file RET_FILE_LOCAL_8000 \
    "===== 回包链路抓包(ylc61 any -> ${TRAEFIK_IP}:8000) =====" \
    RET_LOCAL_SYN_COUNT RET_LOCAL_SYNACK_COUNT
  _flush_emit_file RET_FILE_LOCAL_NFT_TRACE \
    "===== 本机 nft trace 结果(ylc61 forward) ====="
  _flush_emit_file RET_FILE_LOCAL_CNI0 \
    "===== 回包链路抓包(ylc61 cni0 -> ${TRAEFIK_IP}:8000) =====" \
    RET_CNI0_SYN_COUNT RET_CNI0_SYNACK_COUNT
  _flush_emit_file RET_FILE_WORKER_ENS18 \
    "===== 回包链路抓包(ylc62 ens18 <-> ${CLIENT_IP}) =====" \
    RET_WORKER_SYN_COUNT RET_WORKER_SYNACK_COUNT
  _flush_emit_file RET_FILE_WORKER_CONNTRACK \
    "===== 回包链路 conntrack 事件(ylc62) ====="

  # Runs unconditionally, so a table left by an earlier run is removed too.
  sudo nft delete table inet "${LOCAL_NFT_TRACE_TABLE}" 2>/dev/null || true

  # Traefik pod netns capture.
  _flush_reap_job POD_NETNS_PID
  _flush_emit_file POD_NETNS_FILE \
    "===== Traefik Pod netns 抓包(ylc61) =====" \
    POD_NETNS_SYN_COUNT POD_NETNS_SYNACK_COUNT
}