基本框架

This commit is contained in:
2026-03-21 04:36:06 +08:00
commit de1be1dbe5
125 changed files with 10302 additions and 0 deletions

View File

@@ -0,0 +1,286 @@
#!/usr/bin/env bash
# ---------------------------------------------------------------------------
# Global state for the optional diagnostics driven by the start_* functions
# and flush_worker_capture below.
#
#   *_MODE     toggles; the functions enable a feature when it matches ^[Yy]$
#   *_SECONDS  duration handed to `timeout` for the corresponding capture
#   *_FILE_*   temp file receiving a background job's output ("" = none)
#   *_PID_*    PID of that background job ("" = not running)
#   *_COUNT    counters filled in by flush_worker_capture
#
# The *_ARG variables are not read anywhere in this section; presumably they
# carry raw command-line overrides — TODO confirm against the argument parser.
# ---------------------------------------------------------------------------

# --- Worker-side packet capture on ens18 (start_worker_capture) ------------
CAPTURE_MODE="N"
CAPTURE_MODE_ARG=""
CAPTURE_SECONDS="12"
CAP_FILE_ENS18=""
CAP_PID_ENS18=""

# --- Worker-side nft trace (start_worker_nft_trace) ------------------------
NFT_TRACE_MODE="N"
NFT_TRACE_MODE_ARG=""
NFT_TRACE_SECONDS="8"
NFT_TRACE_TABLE="diag_k3s_entrypath"
NFT_FILE=""
NFT_PID=""
# Flipped to "yes" by flush_worker_capture when the trace shows the DNAT.
NFT_DNAT_HIT="no"

# --- Return-path trace, local host + worker (start_return_path_trace) ------
RETURN_TRACE_MODE="N"
RETURN_TRACE_MODE_ARG=""
RETURN_TRACE_SECONDS="10"
LOCAL_NFT_TRACE_TABLE="diag61_k3s_entrypath"
RET_FILE_LOCAL_8000=""
RET_PID_LOCAL_8000=""
RET_FILE_LOCAL_CNI0=""
RET_PID_LOCAL_CNI0=""
RET_FILE_LOCAL_NFT_TRACE=""
RET_PID_LOCAL_NFT_TRACE=""
RET_FILE_WORKER_ENS18=""
RET_PID_WORKER_ENS18=""
RET_FILE_WORKER_CONNTRACK=""
RET_PID_WORKER_CONNTRACK=""
RET_LOCAL_SYN_COUNT=0
RET_LOCAL_SYNACK_COUNT=0
RET_CNI0_SYN_COUNT=0
RET_CNI0_SYNACK_COUNT=0
RET_WORKER_SYN_COUNT=0
RET_WORKER_SYNACK_COUNT=0

# --- Capture inside the Traefik pod network namespace ----------------------
POD_NETNS_TRACE_MODE="N"
POD_NETNS_TRACE_MODE_ARG=""
# Empty means start_pod_netns_trace falls back to RETURN_TRACE_SECONDS.
POD_NETNS_TRACE_SECONDS=""
POD_NETNS_TRACE_SECONDS_ARG=""
POD_NETNS_FILE=""
POD_NETNS_PID=""
POD_NETNS_SYN_COUNT=0
POD_NETNS_SYNACK_COUNT=0
#######################################
# Start a time-limited tcpdump on the worker's ens18 interface over SSH,
# running in the background with its output collected in a temp file.
# Globals:
#   read:    CAPTURE_MODE, DO_REMOTE, WORKER_HOST, CAPTURE_SECONDS,
#            CLIENT_IP, SSH_OPTS
#   written: CAP_FILE_ENS18 (temp file), CAP_PID_ENS18 (background ssh PID)
# Outputs:
#   A [WARN] line on stdout when capture is requested but the remote side is
#   not usable; otherwise a progress line via say (defined elsewhere).
# Returns:
#   0 on the skip paths; otherwise the status of the final sleep.
#######################################
start_worker_capture() {
  # Capture is opt-in: anything other than Y/y means "do nothing".
  [[ "$CAPTURE_MODE" =~ ^[Yy]$ ]] || return 0

  # The capture runs on the worker, so remote checks and a host are required.
  if ! [[ "$DO_REMOTE" =~ ^[Yy]$ && -n "${WORKER_HOST}" ]]; then
    echo "[WARN] 抓包模式已开启,但未启用远端检查或未提供 worker 主机,跳过抓包。"
    return 0
  fi

  CAP_FILE_ENS18="$(mktemp)"
  say "启动 worker 抓包ens18, ${CAPTURE_SECONDS}s: host ${CLIENT_IP} and tcp port 80"

  # Remote tcpdump stderr is dropped on the worker; `|| true` hides the
  # non-zero status that `timeout` produces when it stops the capture.
  local remote_cmd
  remote_cmd="sudo timeout ${CAPTURE_SECONDS} tcpdump -ni ens18 'host ${CLIENT_IP} and tcp port 80' 2>/dev/null || true"

  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "${remote_cmd}" >"${CAP_FILE_ENS18}" 2>&1 &
  CAP_PID_ENS18="$!"

  # Brief pause before the caller proceeds, so the capture has time to start.
  sleep 1
}
#######################################
# Install a temporary nftables trace rule on the worker and start
# `nft monitor trace` there in the background, collecting output in a temp file.
# Globals:
#   read:    NFT_TRACE_MODE, DO_REMOTE, WORKER_HOST, NFT_TRACE_TABLE,
#            NFT_TRACE_SECONDS, CLIENT_IP, LB_IP, SSH_OPTS
#   written: NFT_FILE (temp file), NFT_PID (background ssh PID)
# Outputs:
#   A [WARN] line on stdout when tracing is requested but the remote side is
#   not usable; otherwise a progress line via say (defined elsewhere).
# Returns:
#   0 on the skip paths; otherwise the status of the final sleep.
#######################################
start_worker_nft_trace() {
  [[ "$NFT_TRACE_MODE" =~ ^[Yy]$ ]] || return 0

  if ! [[ "$DO_REMOTE" =~ ^[Yy]$ && -n "${WORKER_HOST}" ]]; then
    echo "[WARN] nft trace 已开启,但未启用远端检查或未提供 worker 主机,跳过 nft trace。"
    return 0
  fi

  # One remote command line: create the table, a prerouting chain at priority
  # -301, and a rule that sets nftrace on client -> LB port-80 traffic arriving
  # on ens18. Every step is best-effort (errors hidden, `|| true`).
  local setup_cmd=""
  setup_cmd+="sudo nft add table inet ${NFT_TRACE_TABLE} 2>/dev/null || true; "
  setup_cmd+="sudo nft 'add chain inet ${NFT_TRACE_TABLE} prerouting { type filter hook prerouting priority -301; policy accept; }' 2>/dev/null || true; "
  setup_cmd+="sudo nft add rule inet ${NFT_TRACE_TABLE} prerouting iif \"ens18\" ip saddr ${CLIENT_IP} ip daddr ${LB_IP} tcp dport 80 meta nftrace set 1 2>/dev/null || true"

  # A failing ssh here must not abort the caller.
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "${setup_cmd}" || true

  NFT_FILE="$(mktemp)"
  say "启动 worker nft trace${NFT_TRACE_SECONDS}s"

  local monitor_cmd
  monitor_cmd="sudo timeout ${NFT_TRACE_SECONDS} nft monitor trace 2>/dev/null || true"

  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "${monitor_cmd}" >"${NFT_FILE}" 2>&1 &
  NFT_PID="$!"

  # Brief pause before the caller proceeds, so the monitor has time to start.
  sleep 1
}
#######################################
# Start the return-path diagnostics: a local nft trace plus two local tcpdump
# captures, and on the worker a tcpdump and a conntrack event stream over SSH.
# All five run in the background, each writing to its own temp file.
# Globals:
#   read:    RETURN_TRACE_MODE, DO_REMOTE, WORKER_HOST, RETURN_TRACE_SECONDS,
#            LOCAL_NFT_TRACE_TABLE, TRAEFIK_IP, CLIENT_IP, SSH_OPTS
#   written: RET_FILE_LOCAL_8000, RET_FILE_LOCAL_CNI0, RET_FILE_LOCAL_NFT_TRACE,
#            RET_FILE_WORKER_ENS18, RET_FILE_WORKER_CONNTRACK and the matching
#            RET_PID_* variables
# Outputs:
#   A [WARN] line on stdout when tracing is requested but the remote side is
#   not usable; otherwise a progress line via say (defined elsewhere).
# Returns:
#   0 on the skip paths; otherwise the status of the final sleep.
#######################################
start_return_path_trace() {
  [[ "$RETURN_TRACE_MODE" =~ ^[Yy]$ ]] || return 0

  if ! [[ "$DO_REMOTE" =~ ^[Yy]$ && -n "${WORKER_HOST}" ]]; then
    echo "[WARN] 回包链路跟踪已开启,但未启用远端检查或未提供 worker 主机,跳过。"
    return 0
  fi

  RET_FILE_LOCAL_8000="$(mktemp)"
  RET_FILE_LOCAL_CNI0="$(mktemp)"
  RET_FILE_LOCAL_NFT_TRACE="$(mktemp)"
  RET_FILE_WORKER_ENS18="$(mktemp)"
  RET_FILE_WORKER_CONNTRACK="$(mktemp)"
  say "启动回包链路跟踪(${RETURN_TRACE_SECONDS}s"

  local secs="${RETURN_TRACE_SECONDS}"
  local tbl="${LOCAL_NFT_TRACE_TABLE}"
  local pod_filter="host ${TRAEFIK_IP} and tcp port 8000"

  # Local trace rule: mark traffic forwarded from flannel.1 to the Traefik
  # pod's port 8000 for nft tracing. Each step is best-effort.
  sudo nft add table inet "${tbl}" 2>/dev/null || true
  sudo nft "add chain inet ${tbl} forward { type filter hook forward priority -301; policy accept; }" 2>/dev/null || true
  sudo nft add rule inet "${tbl}" forward iif "flannel.1" ip daddr "${TRAEFIK_IP}" tcp dport 8000 meta nftrace set 1 2>/dev/null || true

  # Local background jobs. Both stdout and stderr of each job end up in its
  # result file (the final `2>&1` is what determines where stderr goes).
  sudo timeout "${secs}" nft monitor trace >"${RET_FILE_LOCAL_NFT_TRACE}" 2>&1 &
  RET_PID_LOCAL_NFT_TRACE="$!"

  sudo timeout "${secs}" tcpdump -ni any "${pod_filter}" >"${RET_FILE_LOCAL_8000}" 2>&1 &
  RET_PID_LOCAL_8000="$!"

  sudo timeout "${secs}" tcpdump -ni cni0 "${pod_filter}" >"${RET_FILE_LOCAL_CNI0}" 2>&1 &
  RET_PID_LOCAL_CNI0="$!"

  # Worker background jobs. Remote stderr is discarded on the worker itself;
  # the local `2>&1` only captures what ssh prints.
  local worker_dump_cmd worker_ct_cmd
  worker_dump_cmd="sudo timeout ${secs} tcpdump -ni ens18 'host ${CLIENT_IP} and tcp' 2>/dev/null || true"
  worker_ct_cmd="if command -v conntrack >/dev/null 2>&1; then sudo timeout ${secs} conntrack -E -p tcp 2>/dev/null || true; else echo 'conntrack: not found'; fi"

  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "${worker_dump_cmd}" >"${RET_FILE_WORKER_ENS18}" 2>&1 &
  RET_PID_WORKER_ENS18="$!"

  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "${worker_ct_cmd}" >"${RET_FILE_WORKER_CONNTRACK}" 2>&1 &
  RET_PID_WORKER_CONNTRACK="$!"

  # Brief pause before the caller proceeds, so the jobs have time to start.
  sleep 1
}
#######################################
# Start a time-limited tcpdump (tcp port 8000, all interfaces) inside the
# network namespace of the traefik container, in the background.
# The container ID comes from kubectl (pod TRAEFIK_POD in kube-system) with a
# fallback to `crictl ps`; the PID is parsed from `crictl inspect`.
# Globals:
#   read:    POD_NETNS_TRACE_MODE, POD_NETNS_TRACE_SECONDS,
#            RETURN_TRACE_SECONDS, TRAEFIK_POD
#   written: POD_NETNS_FILE (temp file), POD_NETNS_PID (background job PID)
# Outputs:
#   A [WARN] line on stdout for every skip reason (missing tool, unresolved
#   container ID or PID); otherwise a progress line via say (defined elsewhere).
# Returns:
#   0 on the skip paths; otherwise the status of the final sleep.
#######################################
start_pod_netns_trace() {
  [[ "${POD_NETNS_TRACE_MODE}" =~ ^[Yy]$ ]] || return 0

  # Both tools are needed locally; bail out at the first one that is missing.
  local tool
  for tool in crictl nsenter; do
    if ! command -v "${tool}" >/dev/null 2>&1; then
      echo "[WARN] 未找到 ${tool}跳过 pod netns 抓包。"
      return 0
    fi
  done

  # Own duration if configured, otherwise reuse the return-path duration.
  local duration="${POD_NETNS_TRACE_SECONDS:-$RETURN_TRACE_SECONDS}"
  local container_id
  local container_pid

  # Preferred source: the containerID reported in the pod status, with the
  # runtime scheme prefix removed.
  container_id="$(sudo kubectl -n kube-system get pod "${TRAEFIK_POD}" -o jsonpath='{.status.containerStatuses[?(@.name=="traefik")].containerID}' 2>/dev/null || true)"
  container_id="${container_id#containerd://}"
  container_id="${container_id#cri-o://}"

  # Fallback: first container that crictl lists under the name "traefik".
  if [[ -z "${container_id}" ]]; then
    container_id="$(sudo crictl ps --name traefik -q 2>/dev/null | awk 'NR==1{print; exit}' || true)"
  fi
  if [[ -z "${container_id}" ]]; then
    echo "[WARN] 未解析到 traefik 容器ID跳过 pod netns 抓包。"
    return 0
  fi

  # First `"pid": N` line of the inspect output, trailing comma stripped.
  container_pid="$(sudo crictl inspect "${container_id}" 2>/dev/null | awk -F': ' '/"pid":/ {gsub(/,/, "", $2); print $2; exit}' || true)"
  if ! [[ "${container_pid}" =~ ^[0-9]+$ ]]; then
    echo "[WARN] 未解析到 traefik 容器 PID跳过 pod netns 抓包。"
    return 0
  fi

  POD_NETNS_FILE="$(mktemp)"
  say "启动 Traefik Pod netns 抓包(${duration}s, pid=${container_pid}"

  # stdout and stderr of the capture both end up in POD_NETNS_FILE.
  sudo timeout "${duration}" nsenter -t "${container_pid}" -n tcpdump -ni any "tcp port 8000" >"${POD_NETNS_FILE}" 2>&1 &
  POD_NETNS_PID="$!"

  # Brief pause before the caller proceeds, so the capture has time to start.
  sleep 1
}
# Wait for the background job whose PID is stored in the variable named $1,
# then clear that variable. Does nothing when the variable is empty.
_flush_wait_pid() {
  local _pid_var="$1"
  local _pid="${!_pid_var}"
  if [[ -n "${_pid}" ]]; then
    wait "${_pid}" || true
    printf -v "${_pid_var}" '%s' ""
  fi
}

# Succeed when the variable named $1 holds the path of an existing file.
_flush_file_ready() {
  local _path="${!1}"
  [[ -n "${_path}" && -f "${_path}" ]]
}

# Print a blank line, the title $2 and the contents of the file whose path is
# stored in the variable named $1; then delete the file and clear the variable.
_flush_print_file() {
  local _file_var="$1"
  local _title="$2"
  local _path="${!_file_var}"
  echo
  echo "${_title}"
  cat "${_path}" || true
  rm -f "${_path}" || true
  printf -v "${_file_var}" '%s' ""
}

#######################################
# Collect every diagnostic started by the start_* functions: wait for the
# background jobs, print each result file under its own header, remove the
# temp files and tear down the temporary nft trace tables.
# Globals:
#   read:    NFT_TRACE_MODE, RETURN_TRACE_MODE, DO_REMOTE, WORKER_HOST,
#            NFT_TRACE_TABLE, LOCAL_NFT_TRACE_TABLE, TRAEFIK_IP, CLIENT_IP,
#            SSH_OPTS
#   written: all CAP_*/NFT_*/RET_*/POD_NETNS_* PID and FILE variables (reset
#            to ""), NFT_DNAT_HIT, and the *_SYN_COUNT / *_SYNACK_COUNT values
# Outputs:
#   The captured results on stdout, in a fixed order.
# Returns:
#   0 (every step is best-effort).
# Notes:
#   - count_tcpdump_flag is defined elsewhere; it is called with a file path
#     and a literal flag marker and its stdout is stored as the count.
#   - The local nft table is only deleted when RETURN_TRACE_MODE is on, which
#     is the only case in which start_return_path_trace creates it; this
#     avoids invoking sudo when that trace was never requested.
#######################################
flush_worker_capture() {
  # --- Worker ens18 capture ------------------------------------------------
  _flush_wait_pid CAP_PID_ENS18
  if _flush_file_ready CAP_FILE_ENS18; then
    _flush_print_file CAP_FILE_ENS18 "===== Worker 抓包结果ens18 ====="
  fi

  # --- Worker nft trace ----------------------------------------------------
  _flush_wait_pid NFT_PID
  if _flush_file_ready NFT_FILE; then
    # Fixed-string match: the dots of the IP must not act as regex wildcards.
    if grep -Fq -- "dnat to ${TRAEFIK_IP}:8000" "${NFT_FILE}" 2>/dev/null; then
      NFT_DNAT_HIT="yes"
    fi
    _flush_print_file NFT_FILE "===== Worker nft trace 结果 ====="
  fi
  if [[ "$NFT_TRACE_MODE" =~ ^[Yy]$ && "$DO_REMOTE" =~ ^[Yy]$ && -n "${WORKER_HOST}" ]]; then
    ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "sudo nft delete table inet ${NFT_TRACE_TABLE} 2>/dev/null || true" || true
  fi

  # --- Return-path trace: wait for all jobs before printing anything --------
  local _job
  for _job in \
    RET_PID_LOCAL_8000 \
    RET_PID_LOCAL_NFT_TRACE \
    RET_PID_LOCAL_CNI0 \
    RET_PID_WORKER_ENS18 \
    RET_PID_WORKER_CONNTRACK; do
    _flush_wait_pid "${_job}"
  done

  if _flush_file_ready RET_FILE_LOCAL_8000; then
    RET_LOCAL_SYN_COUNT="$(count_tcpdump_flag "${RET_FILE_LOCAL_8000}" "Flags [S]")"
    RET_LOCAL_SYNACK_COUNT="$(count_tcpdump_flag "${RET_FILE_LOCAL_8000}" "Flags [S.]")"
    _flush_print_file RET_FILE_LOCAL_8000 "===== 回包链路抓包ylc61 any -> ${TRAEFIK_IP}:8000 ====="
  fi
  if _flush_file_ready RET_FILE_LOCAL_NFT_TRACE; then
    _flush_print_file RET_FILE_LOCAL_NFT_TRACE "===== 本机 nft trace 结果ylc61 forward ====="
  fi
  if _flush_file_ready RET_FILE_LOCAL_CNI0; then
    RET_CNI0_SYN_COUNT="$(count_tcpdump_flag "${RET_FILE_LOCAL_CNI0}" "Flags [S]")"
    RET_CNI0_SYNACK_COUNT="$(count_tcpdump_flag "${RET_FILE_LOCAL_CNI0}" "Flags [S.]")"
    _flush_print_file RET_FILE_LOCAL_CNI0 "===== 回包链路抓包ylc61 cni0 -> ${TRAEFIK_IP}:8000 ====="
  fi
  if _flush_file_ready RET_FILE_WORKER_ENS18; then
    RET_WORKER_SYN_COUNT="$(count_tcpdump_flag "${RET_FILE_WORKER_ENS18}" "Flags [S]")"
    RET_WORKER_SYNACK_COUNT="$(count_tcpdump_flag "${RET_FILE_WORKER_ENS18}" "Flags [S.]")"
    _flush_print_file RET_FILE_WORKER_ENS18 "===== 回包链路抓包ylc62 ens18 <-> ${CLIENT_IP} ====="
  fi
  if _flush_file_ready RET_FILE_WORKER_CONNTRACK; then
    _flush_print_file RET_FILE_WORKER_CONNTRACK "===== 回包链路 conntrack 事件ylc62 ====="
  fi

  # Remove the local trace table only if return-path tracing was requested.
  if [[ "$RETURN_TRACE_MODE" =~ ^[Yy]$ ]]; then
    sudo nft delete table inet "${LOCAL_NFT_TRACE_TABLE}" 2>/dev/null || true
  fi

  # --- Traefik pod netns capture -------------------------------------------
  _flush_wait_pid POD_NETNS_PID
  if _flush_file_ready POD_NETNS_FILE; then
    POD_NETNS_SYN_COUNT="$(count_tcpdump_flag "${POD_NETNS_FILE}" "Flags [S]")"
    POD_NETNS_SYNACK_COUNT="$(count_tcpdump_flag "${POD_NETNS_FILE}" "Flags [S.]")"
    _flush_print_file POD_NETNS_FILE "===== Traefik Pod netns 抓包ylc61 ====="
  fi
}