81 lines
4.3 KiB
Bash
81 lines
4.3 KiB
Bash
#!/usr/bin/env bash
|
||
|
||
print_diag_summary() {
|
||
echo
|
||
echo "===== 自动判读(基于当前计数) ====="
|
||
echo "- Traefik Pod FW 链: ${TRAEFIK_CHAIN:-N/A}"
|
||
echo "- Traefik REJECT 命中: ${REJECT_PKTS:-0}"
|
||
echo "- Traefik NFLOG 命中: ${NFLOG_PKTS:-0}"
|
||
echo "- Service web 链: ${TRAEFIK_WEB_SVC_CHAIN:-N/A}"
|
||
echo "- Service web endpoint 链: ${TRAEFIK_WEB_SEP_CHAIN:-N/A}"
|
||
echo "- Worker CNI hostport链: ${WORKER_CNI_DNAT_CHAIN:-N/A}"
|
||
echo "- nft 观测到 KUBE-EXT DNAT: ${NFT_DNAT_HIT:-no}"
|
||
echo "- ylc61(any) SYN/SYN-ACK: ${RET_LOCAL_SYN_COUNT:-0}/${RET_LOCAL_SYNACK_COUNT:-0}"
|
||
echo "- ylc61(cni0) SYN/SYN-ACK: ${RET_CNI0_SYN_COUNT:-0}/${RET_CNI0_SYNACK_COUNT:-0}"
|
||
echo "- ylc62(ens18) SYN/SYN-ACK: ${RET_WORKER_SYN_COUNT:-0}/${RET_WORKER_SYNACK_COUNT:-0}"
|
||
echo "- Traefik pod netns SYN/SYN-ACK: ${POD_NETNS_SYN_COUNT:-0}/${POD_NETNS_SYNACK_COUNT:-0}"
|
||
|
||
if [[ "${REJECT_PKTS:-0}" =~ ^[0-9]+$ ]] && [[ "${REJECT_PKTS:-0}" -gt 0 ]]; then
|
||
echo "[结论] Traefik Pod 防火墙链出现 REJECT 命中,优先检查 kube-system 下 Traefik 相关 Ingress NetworkPolicy。"
|
||
elif [[ "${RET_WORKER_SYNACK_COUNT:-0}" -gt 0 ]] && [[ "${RET_LOCAL_SYNACK_COUNT:-0}" -gt 0 ]] && [[ "${RET_CNI0_SYNACK_COUNT:-0}" -gt 0 ]]; then
|
||
echo "[结论] 链路已恢复:ylc62/ylc61/cni0 均观测到 SYN-ACK,62:80 已可达 Traefik。"
|
||
elif [[ "${NFT_DNAT_HIT:-no}" == "yes" ]] && [[ "${RET_LOCAL_SYN_COUNT:-0}" -gt 0 ]] && [[ "${RET_LOCAL_SYNACK_COUNT:-0}" -eq 0 ]]; then
|
||
echo "[结论] 流量已经在 worker 被 KUBE-EXT/KUBE-SVC DNAT 到 Traefik(10.42.0.12:8000),但 ylc61 未观察到 SYN-ACK,优先排查 Traefik Pod/宿主转发回包路径。"
|
||
elif [[ -n "${WORKER_CNI_HIT_AFTER:-}" && -n "${WORKER_CNI_HIT_BEFORE:-}" ]] && \
|
||
[[ "${WORKER_CNI_HIT_AFTER}" == "${WORKER_CNI_HIT_BEFORE}" ]]; then
|
||
echo "[结论] Worker CNI hostport DNAT 计数未增长。若 nft trace 显示走 KUBE-EXT/KUBE-SVC,这是正常路径提示,不构成故障根因。"
|
||
else
|
||
echo "[结论] 未观察到 Traefik REJECT 明确命中,优先检查回包链路(ylc61<->ylc62 flannel / ylc62 ens18 出口)。"
|
||
fi
|
||
}
|
||
|
||
#######################################
# Offline analysis of a previously captured diagnostic log.
# Scans the log once for known markers and prints a summary plus a single
# "[结论]" verdict line.
# Arguments: $1 - path to the log file
# Outputs:   summary and verdict on stdout; an "[ERR]" line when the path
#            is missing, not a regular file, or not readable
# Returns:   0 after printing a verdict, 1 when the log file is unusable
#######################################
analyze_log_file() {
  # ${1:-} keeps the explicit check below reachable under `set -u`.
  local log_file="${1:-}"
  if [[ -z "${log_file}" || ! -f "${log_file}" || ! -r "${log_file}" ]]; then
    echo "[ERR] analyze 模式需要有效日志文件: --log <path>"
    return 1
  fi

  local has_worker_dnat="no"
  local has_firewalld_reject="no"
  local has_traefik_reject="no"
  local has_syn_no_synack="no"
  local has_synack_recovered="no"

  # One awk pass collects every marker. The log is fed through stdin rather
  # than as an operand so that a file name containing '=' is not mistaken
  # by awk for a variable assignment. LC_ALL=C makes matching byte-wise, so
  # stray non-UTF-8 bytes in captured output cannot upset the regex engine.
  local flags=""
  if flags="$(
    LC_ALL=C awk '
      function yn(v) { return v ? "yes" : "no" }
      /KUBE-EXT-.*KUBE-SVC|dnat to 10\.42\./ { dnat = 1 }
      /filter_FORWARD_POLICIES.*admin-prohibited/ { fwd = 1 }
      /Traefik REJECT 命中: [1-9]/ { rej = 1 }
      /ylc61\(any\) SYN\/SYN-ACK: [1-9][0-9]*\/0([^0-9]|$)/ { lost = 1 }
      /ylc61\(any\) SYN\/SYN-ACK: [1-9][0-9]*\/[1-9][0-9]*/ { a = 1 }
      /ylc62\(ens18\) SYN\/SYN-ACK: [1-9][0-9]*\/[1-9][0-9]*/ { b = 1 }
      END { print yn(dnat), yn(fwd), yn(rej), yn(lost), yn(a && b) }
    ' < "${log_file}"
  )" && [[ -n "${flags}" ]]; then
    # Explicit IFS: the caller may have narrowed IFS to newline/tab.
    IFS=' ' read -r has_worker_dnat has_firewalld_reject has_traefik_reject \
      has_syn_no_synack has_synack_recovered <<< "${flags}"
  fi
  # If awk itself failed, every flag stays "no" and the fallback verdict
  # below is printed.

  echo "===== 日志离线判读 ====="
  echo "- 日志文件: ${log_file}"
  echo "- 观测到 worker DNAT: ${has_worker_dnat}"
  echo "- 观测到 firewalld forward reject: ${has_firewalld_reject}"
  echo "- 观测到 Traefik Pod REJECT 命中: ${has_traefik_reject}"
  echo "- 观测到 ylc61 SYN 无 SYN-ACK: ${has_syn_no_synack}"
  echo "- 观测到链路恢复(有 SYN-ACK): ${has_synack_recovered}"

  # Branch order is significant: the first matching rule wins.
  if [[ "${has_firewalld_reject}" == "yes" ]]; then
    echo "[结论] 高概率为 ylc61 firewalld FORWARD 策略阻断 flannel.1 -> cni0。"
  elif [[ "${has_synack_recovered}" == "yes" ]]; then
    echo "[结论] 链路已恢复,入口到 Traefik 回包路径正常。"
  elif [[ "${has_worker_dnat}" == "yes" && "${has_syn_no_synack}" == "yes" ]]; then
    echo "[结论] worker 入站与 DNAT 正常,需优先排查 ylc61 到 Traefik Pod 的转发/回包链路。"
  elif [[ "${has_traefik_reject}" == "yes" ]]; then
    echo "[结论] Traefik Pod NetworkPolicy 命中拒绝,优先检查 kube-system netpol。"
  else
    echo "[结论] 日志未出现单一确定根因,建议执行 run/capture 模式重新采样。"
  fi
}
|