Files
Deploy-Laboratory/scripts/diag/entrypath/lib/analyze.sh
2026-03-21 04:36:06 +08:00

81 lines
4.3 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
#######################################
# Normalize a captured counter to a plain non-negative decimal integer.
# Surrounding whitespace is trimmed; leading zeros are read as base 10;
# anything that is not made only of digits (empty, "N/A", "-1", ...) yields 0.
# Arguments: $1 - raw counter text (may be empty or omitted)
# Outputs:   the normalized integer on stdout
#######################################
_diag_uint() {
  local raw="${1:-}"
  # Trim leading, then trailing, whitespace (e.g. padding or a stray newline).
  raw="${raw#"${raw%%[![:space:]]*}"}"
  raw="${raw%"${raw##*[![:space:]]}"}"
  if [[ "${raw}" =~ ^[0-9]+$ ]]; then
    # 10# forces decimal so values such as "08" are not parsed as octal.
    printf '%s\n' "$((10#${raw}))"
  else
    printf '0\n'
  fi
}

#######################################
# Print the counters gathered by the live diagnostic run, followed by one
# "[结论]" verdict line chosen from them.
#
# Verdict priority (first match wins):
#   1. Traefik REJECT counter > 0
#   2. SYN-ACK seen on ylc62, ylc61(any) and ylc61(cni0)
#   3. NFT_DNAT_HIT is "yes", ylc61 saw SYN but no SYN-ACK
#   4. worker CNI hit counter is set and unchanged between before/after
#   5. fallback
#
# Counters are normalized with _diag_uint before any numeric comparison.
# Inside [[ ]] the -gt/-eq operands are evaluated as arithmetic expressions,
# so a raw value such as "N/A" or "08" would otherwise raise an arithmetic
# error. The summary lines still print the raw values unchanged.
#
# Globals (read): TRAEFIK_CHAIN, REJECT_PKTS, NFLOG_PKTS,
#   TRAEFIK_WEB_SVC_CHAIN, TRAEFIK_WEB_SEP_CHAIN, WORKER_CNI_DNAT_CHAIN,
#   NFT_DNAT_HIT, RET_LOCAL_SYN_COUNT, RET_LOCAL_SYNACK_COUNT,
#   RET_CNI0_SYN_COUNT, RET_CNI0_SYNACK_COUNT, RET_WORKER_SYN_COUNT,
#   RET_WORKER_SYNACK_COUNT, POD_NETNS_SYN_COUNT, POD_NETNS_SYNACK_COUNT,
#   WORKER_CNI_HIT_BEFORE, WORKER_CNI_HIT_AFTER
# Arguments: none
# Outputs:   summary and verdict on stdout
#######################################
print_diag_summary() {
  local reject_pkts local_syn local_synack cni0_synack worker_synack
  reject_pkts="$(_diag_uint "${REJECT_PKTS:-0}")"
  local_syn="$(_diag_uint "${RET_LOCAL_SYN_COUNT:-0}")"
  local_synack="$(_diag_uint "${RET_LOCAL_SYNACK_COUNT:-0}")"
  cni0_synack="$(_diag_uint "${RET_CNI0_SYNACK_COUNT:-0}")"
  worker_synack="$(_diag_uint "${RET_WORKER_SYNACK_COUNT:-0}")"

  echo
  echo "===== 自动判读(基于当前计数) ====="
  echo "- Traefik Pod FW 链: ${TRAEFIK_CHAIN:-N/A}"
  echo "- Traefik REJECT 命中: ${REJECT_PKTS:-0}"
  echo "- Traefik NFLOG 命中: ${NFLOG_PKTS:-0}"
  echo "- Service web 链: ${TRAEFIK_WEB_SVC_CHAIN:-N/A}"
  echo "- Service web endpoint 链: ${TRAEFIK_WEB_SEP_CHAIN:-N/A}"
  echo "- Worker CNI hostport链: ${WORKER_CNI_DNAT_CHAIN:-N/A}"
  echo "- nft 观测到 KUBE-EXT DNAT: ${NFT_DNAT_HIT:-no}"
  echo "- ylc61(any) SYN/SYN-ACK: ${RET_LOCAL_SYN_COUNT:-0}/${RET_LOCAL_SYNACK_COUNT:-0}"
  echo "- ylc61(cni0) SYN/SYN-ACK: ${RET_CNI0_SYN_COUNT:-0}/${RET_CNI0_SYNACK_COUNT:-0}"
  echo "- ylc62(ens18) SYN/SYN-ACK: ${RET_WORKER_SYN_COUNT:-0}/${RET_WORKER_SYNACK_COUNT:-0}"
  echo "- Traefik pod netns SYN/SYN-ACK: ${POD_NETNS_SYN_COUNT:-0}/${POD_NETNS_SYNACK_COUNT:-0}"

  if (( reject_pkts > 0 )); then
    echo "[结论] Traefik Pod 防火墙链出现 REJECT 命中,优先检查 kube-system 下 Traefik 相关 Ingress NetworkPolicy。"
  elif (( worker_synack > 0 )) && (( local_synack > 0 )) && (( cni0_synack > 0 )); then
    echo "[结论] 链路已恢复ylc62/ylc61/cni0 均观测到 SYN-ACK62:80 已可达 Traefik。"
  elif [[ "${NFT_DNAT_HIT:-no}" == "yes" ]] && (( local_syn > 0 )) && (( local_synack == 0 )); then
    echo "[结论] 流量已经在 worker 被 KUBE-EXT/KUBE-SVC DNAT 到 Traefik(10.42.0.12:8000),但 ylc61 未观察到 SYN-ACK优先排查 Traefik Pod/宿主转发回包路径。"
  elif [[ -n "${WORKER_CNI_HIT_AFTER:-}" && -n "${WORKER_CNI_HIT_BEFORE:-}" ]] \
    && [[ "${WORKER_CNI_HIT_AFTER}" == "${WORKER_CNI_HIT_BEFORE}" ]]; then
    # Compared as strings: only "did the counter change" matters here.
    echo "[结论] Worker CNI hostport DNAT 计数未增长。若 nft trace 显示走 KUBE-EXT/KUBE-SVC这是正常路径提示不构成故障根因。"
  else
    echo "[结论] 未观察到 Traefik REJECT 明确命中优先检查回包链路ylc61<->ylc62 flannel / ylc62 ens18 出口)。"
  fi
}
#######################################
# Offline triage of a previously captured diagnostic log.
#
# Scans the log for five independent signals, prints each as yes/no, then
# prints one "[结论]" verdict line. Verdict priority (first match wins):
#   1. firewalld forward reject seen
#   2. SYN-ACK seen on both ylc61(any) and ylc62(ens18)
#   3. worker DNAT seen and ylc61 saw SYN with zero SYN-ACK
#   4. Traefik REJECT counter line with a non-zero value
#   5. fallback: no single root cause
#
# Arguments: $1 - path to the log file
# Outputs:   report on stdout (the error message also goes to stdout)
# Returns:   1 when $1 is empty or not a regular file; 0 otherwise
#######################################
analyze_log_file() {
  local capture="$1"

  # Guard clause: bail out unless we were given an existing regular file.
  [[ -n "${capture}" && -f "${capture}" ]] || {
    echo "[ERR] analyze 模式需要有效日志文件: --log <path>"
    return 1
  }

  # Every signal starts as "no" and flips only when its awk scan exits 0.
  local dnat_seen="no"
  local fwd_reject="no"
  local pod_reject="no"
  local syn_unanswered="no"
  local path_recovered="no"

  awk '/KUBE-EXT-.*KUBE-SVC|dnat to 10\.42\./ {hit=1} END{exit !hit}' "${capture}" \
    && dnat_seen="yes"
  awk '/filter_FORWARD_POLICIES.*admin-prohibited/ {hit=1} END{exit !hit}' "${capture}" \
    && fwd_reject="yes"
  awk '/Traefik REJECT 命中: [1-9]/ {hit=1} END{exit !hit}' "${capture}" \
    && pod_reject="yes"
  awk '/ylc61\(any\) SYN\/SYN-ACK: [1-9][0-9]*\/0/ {hit=1} END{exit !hit}' "${capture}" \
    && syn_unanswered="yes"
  # Recovery needs a non-zero SYN-ACK line for both hosts somewhere in the log.
  awk '/ylc61\(any\) SYN\/SYN-ACK: [1-9][0-9]*\/[1-9][0-9]*/ {a=1} /ylc62\(ens18\) SYN\/SYN-ACK: [1-9][0-9]*\/[1-9][0-9]*/ {b=1} END{exit !(a&&b)}' "${capture}" \
    && path_recovered="yes"

  printf '%s\n' \
    "===== 日志离线判读 =====" \
    "- 日志文件: ${capture}" \
    "- 观测到 worker DNAT: ${dnat_seen}" \
    "- 观测到 firewalld forward reject: ${fwd_reject}" \
    "- 观测到 Traefik Pod REJECT 命中: ${pod_reject}" \
    "- 观测到 ylc61 SYN 无 SYN-ACK: ${syn_unanswered}" \
    "- 观测到链路恢复(有 SYN-ACK: ${path_recovered}"

  # Key layout: fwd_reject:path_recovered:dnat_seen:syn_unanswered:pod_reject
  # Arms are ordered by priority; each later arm assumes the earlier ones
  # did not match.
  local verdict
  case "${fwd_reject}:${path_recovered}:${dnat_seen}:${syn_unanswered}:${pod_reject}" in
    yes:*)
      verdict="[结论] 高概率为 ylc61 firewalld FORWARD 策略阻断 flannel.1 -> cni0。"
      ;;
    no:yes:*)
      verdict="[结论] 链路已恢复,入口到 Traefik 回包路径正常。"
      ;;
    no:no:yes:yes:*)
      verdict="[结论] worker 入站与 DNAT 正常,需优先排查 ylc61 到 Traefik Pod 的转发/回包链路。"
      ;;
    no:no:*:*:yes)
      verdict="[结论] Traefik Pod NetworkPolicy 命中拒绝,优先检查 kube-system netpol。"
      ;;
    *)
      verdict="[结论] 日志未出现单一确定根因,建议执行 run/capture 模式重新采样。"
      ;;
  esac
  printf '%s\n' "${verdict}"
}