Files
Deploy-Laboratory/scripts/diag/entrypath/lib/capture.sh
2026-03-21 04:36:06 +08:00

287 lines
10 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# ---------------------------------------------------------------------------
# Shared state for the capture / trace helpers in this file.
#   *_MODE     a start_* function only acts when its flag matches ^[Yy]$
#   *_SECONDS  duration handed to `timeout` for the matching capture
#   *_FILE     temp file receiving a capture's output ("" when none)
#   *_PID      PID of the background job filling that file ("" when none)
# flush_worker_capture resets the *_FILE / *_PID values to "" once consumed.
# The *_ARG variables are not read anywhere in this part of the file;
# presumably they are filled by argument parsing elsewhere -- TODO confirm.
# ---------------------------------------------------------------------------

# --- Worker-side tcpdump on ens18 (start_worker_capture) --------------------
CAPTURE_MODE='N'
CAPTURE_MODE_ARG=''
CAPTURE_SECONDS='12'
CAP_FILE_ENS18=''
CAP_PID_ENS18=''

# --- Worker-side nft trace (start_worker_nft_trace) -------------------------
NFT_TRACE_MODE='N'
NFT_TRACE_MODE_ARG=''
NFT_TRACE_SECONDS='8'
NFT_TRACE_TABLE='diag_k3s_entrypath'         # temporary nft table on the worker
LOCAL_NFT_TRACE_TABLE='diag61_k3s_entrypath' # temporary nft table on this host
NFT_FILE=''
NFT_PID=''
NFT_DNAT_HIT='no' # flush_worker_capture sets "yes" when the trace shows the DNAT

# --- Return-path trace (start_return_path_trace) ----------------------------
RETURN_TRACE_MODE='N'
RETURN_TRACE_MODE_ARG=''
RETURN_TRACE_SECONDS='10'
RET_FILE_LOCAL_8000=''
RET_PID_LOCAL_8000=''
RET_FILE_LOCAL_CNI0=''
RET_PID_LOCAL_CNI0=''
RET_FILE_LOCAL_NFT_TRACE=''
RET_PID_LOCAL_NFT_TRACE=''
RET_FILE_WORKER_ENS18=''
RET_PID_WORKER_ENS18=''
RET_FILE_WORKER_CONNTRACK=''
RET_PID_WORKER_CONNTRACK=''

# SYN / SYN-ACK counters, filled by flush_worker_capture via count_tcpdump_flag.
RET_LOCAL_SYN_COUNT=0
RET_LOCAL_SYNACK_COUNT=0
RET_CNI0_SYN_COUNT=0
RET_CNI0_SYNACK_COUNT=0
RET_WORKER_SYN_COUNT=0
RET_WORKER_SYNACK_COUNT=0

# --- Traefik pod network-namespace capture (start_pod_netns_trace) ----------
POD_NETNS_TRACE_MODE='N'
POD_NETNS_TRACE_MODE_ARG=''
POD_NETNS_TRACE_SECONDS='' # empty: start_pod_netns_trace uses RETURN_TRACE_SECONDS
POD_NETNS_TRACE_SECONDS_ARG=''
POD_NETNS_FILE=''
POD_NETNS_PID=''
POD_NETNS_SYN_COUNT=0
POD_NETNS_SYNACK_COUNT=0
#######################################
# Start a time-boxed tcpdump on the worker's ens18 interface over ssh, in the
# background, collecting its output in a fresh temp file.
# Globals:
#   CAPTURE_MODE, DO_REMOTE, WORKER_HOST, SSH_OPTS, CLIENT_IP,
#   CAPTURE_SECONDS (read); CAP_FILE_ENS18, CAP_PID_ENS18 (written)
# Outputs:
#   A [WARN] line on stdout when capture is enabled but no remote is usable.
# Returns:
#   0 in every handled case (disabled, skipped, or started).
#######################################
start_worker_capture() {
  # Capture not requested: nothing to do.
  [[ "${CAPTURE_MODE}" =~ ^[Yy]$ ]] || return 0

  # A remote capture needs both the remote switch and a worker host.
  if ! [[ "${DO_REMOTE}" =~ ^[Yy]$ && -n "${WORKER_HOST}" ]]; then
    echo "[WARN] 抓包模式已开启,但未启用远端检查或未提供 worker 主机,跳过抓包。"
    return 0
  fi

  CAP_FILE_ENS18="$(mktemp)"

  local bpf_filter="host ${CLIENT_IP} and tcp port 80"
  say "启动 worker 抓包ens18, ${CAPTURE_SECONDS}s: ${bpf_filter}"

  # The remote side discards tcpdump's stderr and never fails; whatever ssh
  # itself prints (stdout and stderr) ends up in the capture file.
  local remote_cmd="sudo timeout ${CAPTURE_SECONDS} tcpdump -ni ens18 '${bpf_filter}' 2>/dev/null || true"
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "${remote_cmd}" >"${CAP_FILE_ENS18}" 2>&1 &
  CAP_PID_ENS18="$!"

  # Brief pause before the caller proceeds, so the capture has time to start.
  sleep 1
}
#######################################
# Install a temporary nft table on the worker that flags matching packets for
# tracing, then start a time-boxed `nft monitor trace` over ssh in the
# background, collecting its output in a fresh temp file.
# Globals:
#   NFT_TRACE_MODE, DO_REMOTE, WORKER_HOST, SSH_OPTS, NFT_TRACE_TABLE,
#   CLIENT_IP, LB_IP, NFT_TRACE_SECONDS (read); NFT_FILE, NFT_PID (written)
# Outputs:
#   A [WARN] line on stdout when tracing is enabled but no remote is usable.
# Returns:
#   0 in every handled case.
#######################################
start_worker_nft_trace() {
  # Tracing not requested: nothing to do.
  [[ "${NFT_TRACE_MODE}" =~ ^[Yy]$ ]] || return 0

  # A remote trace needs both the remote switch and a worker host.
  if ! [[ "${DO_REMOTE}" =~ ^[Yy]$ && -n "${WORKER_HOST}" ]]; then
    echo "[WARN] nft trace 已开启,但未启用远端检查或未提供 worker 主机,跳过 nft trace。"
    return 0
  fi

  # Best-effort setup: table, prerouting chain (priority -301), and one rule
  # that sets nftrace on client -> LB traffic to tcp/80 arriving on ens18.
  # Each step swallows its own failure on the remote side.
  local setup_cmd
  setup_cmd="sudo nft add table inet ${NFT_TRACE_TABLE} 2>/dev/null || true; "
  setup_cmd+="sudo nft 'add chain inet ${NFT_TRACE_TABLE} prerouting { type filter hook prerouting priority -301; policy accept; }' 2>/dev/null || true; "
  setup_cmd+="sudo nft add rule inet ${NFT_TRACE_TABLE} prerouting iif \"ens18\" ip saddr ${CLIENT_IP} ip daddr ${LB_IP} tcp dport 80 meta nftrace set 1 2>/dev/null || true"
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "${setup_cmd}" || true

  NFT_FILE="$(mktemp)"
  say "启动 worker nft trace${NFT_TRACE_SECONDS}s"

  local monitor_cmd="sudo timeout ${NFT_TRACE_SECONDS} nft monitor trace 2>/dev/null || true"
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "${monitor_cmd}" >"${NFT_FILE}" 2>&1 &
  NFT_PID="$!"

  # Brief pause before the caller proceeds, so the monitor has time to start.
  sleep 1
}
#######################################
# Start the return-path trace: a local nft trace on the forward hook, two
# local tcpdumps (any / cni0) for Traefik traffic on tcp/8000, and on the
# worker a tcpdump on ens18 plus a conntrack event stream. All five run in
# the background, each writing to its own temp file.
# Globals:
#   RETURN_TRACE_MODE, DO_REMOTE, WORKER_HOST, SSH_OPTS, RETURN_TRACE_SECONDS,
#   LOCAL_NFT_TRACE_TABLE, TRAEFIK_IP, CLIENT_IP (read)
#   RET_FILE_LOCAL_8000, RET_FILE_LOCAL_CNI0, RET_FILE_LOCAL_NFT_TRACE,
#   RET_FILE_WORKER_ENS18, RET_FILE_WORKER_CONNTRACK and the matching
#   RET_PID_* variables (written)
# Outputs:
#   A [WARN] line on stdout when tracing is enabled but no remote is usable.
# Returns:
#   0 in every handled case.
#######################################
start_return_path_trace() {
  if [[ ! "${RETURN_TRACE_MODE}" =~ ^[Yy]$ ]]; then
    return 0
  fi
  if [[ ! "${DO_REMOTE}" =~ ^[Yy]$ ]] || [[ -z "${WORKER_HOST}" ]]; then
    echo "[WARN] 回包链路跟踪已开启,但未启用远端检查或未提供 worker 主机,跳过。"
    return 0
  fi

  RET_FILE_LOCAL_8000="$(mktemp)"
  RET_FILE_LOCAL_CNI0="$(mktemp)"
  RET_FILE_LOCAL_NFT_TRACE="$(mktemp)"
  RET_FILE_WORKER_ENS18="$(mktemp)"
  RET_FILE_WORKER_CONNTRACK="$(mktemp)"
  say "启动回包链路跟踪(${RETURN_TRACE_SECONDS}s"

  # Best-effort local nft setup: a temporary table with a forward-hook chain
  # (priority -301) and one rule that sets nftrace on packets entering via
  # flannel.1 towards the Traefik address on tcp/8000.
  sudo nft add table inet "${LOCAL_NFT_TRACE_TABLE}" 2>/dev/null || true
  sudo nft "add chain inet ${LOCAL_NFT_TRACE_TABLE} forward { type filter hook forward priority -301; policy accept; }" 2>/dev/null || true
  sudo nft add rule inet "${LOCAL_NFT_TRACE_TABLE}" forward iif "flannel.1" ip daddr "${TRAEFIK_IP}" tcp dport 8000 meta nftrace set 1 2>/dev/null || true

  # Local collectors. stdout AND stderr go to the capture file, so tool
  # errors (missing interface, sudo failure, ...) show up in the report.
  # Only one stderr redirection is given per command: redirections apply
  # left to right, so an extra `2>/dev/null` before `2>&1` has no effect.
  sudo timeout "${RETURN_TRACE_SECONDS}" nft monitor trace \
    >"${RET_FILE_LOCAL_NFT_TRACE}" 2>&1 &
  RET_PID_LOCAL_NFT_TRACE="$!"

  sudo timeout "${RETURN_TRACE_SECONDS}" tcpdump -ni any "host ${TRAEFIK_IP} and tcp port 8000" \
    >"${RET_FILE_LOCAL_8000}" 2>&1 &
  RET_PID_LOCAL_8000="$!"

  sudo timeout "${RETURN_TRACE_SECONDS}" tcpdump -ni cni0 "host ${TRAEFIK_IP} and tcp port 8000" \
    >"${RET_FILE_LOCAL_CNI0}" 2>&1 &
  RET_PID_LOCAL_CNI0="$!"

  # Worker collectors. The remote commands drop the tools' stderr themselves;
  # the outer redirection captures ssh's own output and errors.
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" \
    "sudo timeout ${RETURN_TRACE_SECONDS} tcpdump -ni ens18 'host ${CLIENT_IP} and tcp' 2>/dev/null || true" \
    >"${RET_FILE_WORKER_ENS18}" 2>&1 &
  RET_PID_WORKER_ENS18="$!"

  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" \
    "if command -v conntrack >/dev/null 2>&1; then sudo timeout ${RETURN_TRACE_SECONDS} conntrack -E -p tcp 2>/dev/null || true; else echo 'conntrack: not found'; fi" \
    >"${RET_FILE_WORKER_CONNTRACK}" 2>&1 &
  RET_PID_WORKER_CONNTRACK="$!"

  # Brief pause before the caller proceeds, so the collectors have time to start.
  sleep 1
}
#######################################
# Start a time-boxed tcpdump (tcp port 8000) inside the network namespace of
# the local Traefik container, in the background, writing to a temp file.
# The container ID comes from the pod status via kubectl, falling back to the
# first `crictl ps --name traefik` hit; its PID is read from `crictl inspect`.
# Globals:
#   POD_NETNS_TRACE_MODE, POD_NETNS_TRACE_SECONDS, RETURN_TRACE_SECONDS,
#   TRAEFIK_POD (read); POD_NETNS_FILE, POD_NETNS_PID (written)
# Outputs:
#   A [WARN] line on stdout whenever a prerequisite is missing.
# Returns:
#   0 in every handled case.
#######################################
start_pod_netns_trace() {
  if [[ ! "${POD_NETNS_TRACE_MODE}" =~ ^[Yy]$ ]]; then
    return 0
  fi
  if ! command -v crictl >/dev/null 2>&1; then
    echo "[WARN] 未找到 crictl跳过 pod netns 抓包。"
    return 0
  fi
  if ! command -v nsenter >/dev/null 2>&1; then
    echo "[WARN] 未找到 nsenter跳过 pod netns 抓包。"
    return 0
  fi

  # Duration: the pod-specific value when set, else the return-trace value.
  local sec="${POD_NETNS_TRACE_SECONDS:-$RETURN_TRACE_SECONDS}"
  local cid
  local pid
  local runtime_id=""

  # Preferred source: the containerID recorded for the "traefik" container in
  # the pod status, with the runtime scheme prefix removed.
  runtime_id="$(sudo kubectl -n kube-system get pod "${TRAEFIK_POD}" -o jsonpath='{.status.containerStatuses[?(@.name=="traefik")].containerID}' 2>/dev/null || true)"
  runtime_id="${runtime_id#containerd://}"
  runtime_id="${runtime_id#cri-o://}"
  if [[ -n "${runtime_id}" ]]; then
    cid="${runtime_id}"
  else
    # Fallback: first container crictl lists under the name "traefik".
    cid="$(sudo crictl ps --name traefik -q 2>/dev/null | awk 'NR==1{print; exit}' || true)"
  fi
  if [[ -z "${cid}" ]]; then
    echo "[WARN] 未解析到 traefik 容器ID跳过 pod netns 抓包。"
    return 0
  fi

  # Take the first `"pid": N` line of the inspect output and strip the comma.
  pid="$(sudo crictl inspect "${cid}" 2>/dev/null | awk -F': ' '/"pid":/ {gsub(/,/, "", $2); print $2; exit}' || true)"
  # The digit-only check also rejects an empty value.
  if [[ ! "${pid}" =~ ^[0-9]+$ ]]; then
    echo "[WARN] 未解析到 traefik 容器 PID跳过 pod netns 抓包。"
    return 0
  fi

  POD_NETNS_FILE="$(mktemp)"
  say "启动 Traefik Pod netns 抓包(${sec}s, pid=${pid}"

  # stdout AND stderr go to the capture file. Only one stderr redirection is
  # given: redirections apply left to right, so an extra `2>/dev/null` placed
  # before `2>&1` has no effect.
  sudo timeout "${sec}" nsenter -t "${pid}" -n tcpdump -ni any "tcp port 8000" \
    >"${POD_NETNS_FILE}" 2>&1 &
  POD_NETNS_PID="$!"

  # Brief pause before the caller proceeds, so the capture has time to start.
  sleep 1
}
# Wait for the background job whose PID is stored in the global variable named
# by $1 (if it is non-empty), then reset that variable to "".
_capture_reap_pid() {
  local pid_var="$1"
  local pid="${!pid_var}"
  if [[ -n "${pid}" ]]; then
    wait "${pid}" || true
    printf -v "${pid_var}" '%s' ""
  fi
}

# Succeed when the global variable named by $1 holds the path of an existing
# regular file.
_capture_has_file() {
  local file_var="$1"
  local path="${!file_var}"
  [[ -n "${path}" && -f "${path}" ]]
}

# Print a blank line, the banner $2 and the contents of the file whose path is
# stored in the global variable named by $1; then delete that file and reset
# the variable to "".
_capture_dump_file() {
  local file_var="$1"
  local banner="$2"
  local path="${!file_var}"
  echo
  echo "${banner}"
  cat "${path}" || true
  rm -f "${path}" || true
  printf -v "${file_var}" '%s' ""
}

#######################################
# Collect everything the start_* functions launched: wait for each background
# job, print each capture file under a banner, delete the temp files, derive
# the DNAT-hit flag and SYN / SYN-ACK counters, and remove the temporary nft
# tables.
# Globals:
#   Read: NFT_TRACE_MODE, DO_REMOTE, WORKER_HOST, SSH_OPTS, NFT_TRACE_TABLE,
#         LOCAL_NFT_TRACE_TABLE, TRAEFIK_IP, CLIENT_IP
#   Reset to "": CAP_*, NFT_FILE, NFT_PID, RET_FILE_*, RET_PID_*,
#         POD_NETNS_FILE, POD_NETNS_PID
#   Written: NFT_DNAT_HIT, RET_*_SYN_COUNT, RET_*_SYNACK_COUNT,
#         POD_NETNS_SYN_COUNT, POD_NETNS_SYNACK_COUNT
# Outputs:
#   Capture contents with banners on stdout.
#######################################
flush_worker_capture() {
  # --- worker tcpdump ------------------------------------------------------
  _capture_reap_pid CAP_PID_ENS18
  if _capture_has_file CAP_FILE_ENS18; then
    _capture_dump_file CAP_FILE_ENS18 "===== Worker 抓包结果ens18 ====="
  fi

  # --- worker nft trace ----------------------------------------------------
  _capture_reap_pid NFT_PID
  if _capture_has_file NFT_FILE; then
    # Fixed-string match: the dots of the address must match literally, and
    # any KUBE-SEP-* line containing this text is covered by the same search.
    if grep -Fq -- "dnat to ${TRAEFIK_IP}:8000" "${NFT_FILE}" 2>/dev/null; then
      NFT_DNAT_HIT="yes"
    fi
    _capture_dump_file NFT_FILE "===== Worker nft trace 结果 ====="
  fi
  # Remove the worker-side trace table under the same conditions it was
  # created in start_worker_nft_trace.
  if [[ "${NFT_TRACE_MODE}" =~ ^[Yy]$ ]] && [[ "${DO_REMOTE}" =~ ^[Yy]$ ]] && [[ -n "${WORKER_HOST}" ]]; then
    ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "sudo nft delete table inet ${NFT_TRACE_TABLE} 2>/dev/null || true" || true
  fi

  # --- return-path trace: wait for every collector before printing ---------
  _capture_reap_pid RET_PID_LOCAL_8000
  _capture_reap_pid RET_PID_LOCAL_NFT_TRACE
  _capture_reap_pid RET_PID_LOCAL_CNI0
  _capture_reap_pid RET_PID_WORKER_ENS18
  _capture_reap_pid RET_PID_WORKER_CONNTRACK

  if _capture_has_file RET_FILE_LOCAL_8000; then
    RET_LOCAL_SYN_COUNT="$(count_tcpdump_flag "${RET_FILE_LOCAL_8000}" "Flags [S]")"
    RET_LOCAL_SYNACK_COUNT="$(count_tcpdump_flag "${RET_FILE_LOCAL_8000}" "Flags [S.]")"
    _capture_dump_file RET_FILE_LOCAL_8000 "===== 回包链路抓包ylc61 any -> ${TRAEFIK_IP}:8000 ====="
  fi
  if _capture_has_file RET_FILE_LOCAL_NFT_TRACE; then
    _capture_dump_file RET_FILE_LOCAL_NFT_TRACE "===== 本机 nft trace 结果ylc61 forward ====="
  fi
  if _capture_has_file RET_FILE_LOCAL_CNI0; then
    RET_CNI0_SYN_COUNT="$(count_tcpdump_flag "${RET_FILE_LOCAL_CNI0}" "Flags [S]")"
    RET_CNI0_SYNACK_COUNT="$(count_tcpdump_flag "${RET_FILE_LOCAL_CNI0}" "Flags [S.]")"
    _capture_dump_file RET_FILE_LOCAL_CNI0 "===== 回包链路抓包ylc61 cni0 -> ${TRAEFIK_IP}:8000 ====="
  fi
  if _capture_has_file RET_FILE_WORKER_ENS18; then
    RET_WORKER_SYN_COUNT="$(count_tcpdump_flag "${RET_FILE_WORKER_ENS18}" "Flags [S]")"
    RET_WORKER_SYNACK_COUNT="$(count_tcpdump_flag "${RET_FILE_WORKER_ENS18}" "Flags [S.]")"
    _capture_dump_file RET_FILE_WORKER_ENS18 "===== 回包链路抓包ylc62 ens18 <-> ${CLIENT_IP} ====="
  fi
  if _capture_has_file RET_FILE_WORKER_CONNTRACK; then
    _capture_dump_file RET_FILE_WORKER_CONNTRACK "===== 回包链路 conntrack 事件ylc62 ====="
  fi

  # NOTE(review): this delete runs on every flush, even when the return-path
  # trace was never started; presumably that is intended so a table left by
  # an interrupted run gets removed -- confirm before adding a guard.
  sudo nft delete table inet "${LOCAL_NFT_TRACE_TABLE}" 2>/dev/null || true

  # --- Traefik pod netns capture -------------------------------------------
  _capture_reap_pid POD_NETNS_PID
  if _capture_has_file POD_NETNS_FILE; then
    POD_NETNS_SYN_COUNT="$(count_tcpdump_flag "${POD_NETNS_FILE}" "Flags [S]")"
    POD_NETNS_SYNACK_COUNT="$(count_tcpdump_flag "${POD_NETNS_FILE}" "Flags [S.]")"
    _capture_dump_file POD_NETNS_FILE "===== Traefik Pod netns 抓包ylc61 ====="
  fi
}