基本框架
This commit is contained in:
113
scripts/diag/entrypath/README.md
Normal file
113
scripts/diag/entrypath/README.md
Normal file
@@ -0,0 +1,113 @@
|
||||
# entrypath 诊断脚本说明
|
||||
|
||||
`entrypath.sh` 用于排查 `client -> worker:80 -> kube-proxy DNAT -> Traefik Pod` 全链路问题。
|
||||
|
||||
## 命令
|
||||
|
||||
```bash
|
||||
./scripts/diag/entrypath/entrypath.sh <command> [options]
|
||||
```
|
||||
|
||||
- `run`:完整检查(默认)
|
||||
- `preflight`:仅检查依赖与参数环境
|
||||
- `capture`:强制开启远端检查与全部抓包/trace 能力(tcpdump、nft trace、回包链路、Pod netns),并以非交互模式执行 run
|
||||
- `analyze --log <path>`:离线分析日志
|
||||
|
||||
## 关键参数
|
||||
|
||||
- `--worker-host` / `--client-host`
|
||||
- `--worker-ssh-key` / `--client-ssh-key`
|
||||
- `--client-ip` / `--lb-ip`
|
||||
- `--remote-check y|n`
|
||||
- `--capture-mode y|n`
|
||||
- `--nft-trace-mode y|n`
|
||||
- `--return-trace-mode y|n`
|
||||
- `--pod-netns-trace-mode y|n`
|
||||
- `--non-interactive`
|
||||
|
||||
## 日志
|
||||
|
||||
- root 运行:`/root/netpol-diag-logs/entrypath-*.log`
|
||||
- 非 root:`~/netpol-diag-logs/entrypath-*.log`
|
||||
|
||||
## 典型用法
|
||||
|
||||
### 1) 预检查
|
||||
|
||||
```bash
|
||||
./scripts/diag/entrypath/entrypath.sh preflight --non-interactive
|
||||
```
|
||||
|
||||
### 2) 全功能在线诊断(默认值示例)
|
||||
|
||||
```bash
|
||||
./scripts/diag/entrypath/entrypath.sh run \
|
||||
--worker-host root@192.168.2.62 \
|
||||
--client-host root@192.168.2.63 \
|
||||
--worker-ssh-key ~/.ssh/id_ed25519_k3s_diag_worker \
|
||||
--client-ssh-key ~/.ssh/id_ed25519_k3s_diag_client \
|
||||
--client-ip 192.168.2.63 \
|
||||
--lb-ip 192.168.2.62 \
|
||||
--remote-check y \
|
||||
--capture-mode y \
|
||||
--capture-seconds 15 \
|
||||
--nft-trace-mode y \
|
||||
--nft-trace-seconds 10 \
|
||||
--return-trace-mode y \
|
||||
--return-trace-seconds 12 \
|
||||
--pod-netns-trace-mode y \
|
||||
--pod-netns-trace-seconds 12 \
|
||||
--non-interactive
|
||||
```
|
||||
|
||||
### 3) 离线日志判读
|
||||
|
||||
```bash
|
||||
./scripts/diag/entrypath/entrypath.sh analyze \
|
||||
--log ~/netpol-diag-logs/entrypath-20260310-195812.log
|
||||
```
|
||||
|
||||
## 常见陷阱与修复
|
||||
|
||||
### 1) `<lb-ip>:80`(示例环境为 `192.168.2.62:80`,下文简称 `62:80`)不通,但 worker 已 DNAT 到 Traefik
|
||||
|
||||
若日志同时出现:
|
||||
|
||||
- `nft 观测到 KUBE-EXT DNAT: yes`
|
||||
- `ylc61(any) SYN/SYN-ACK: N/0`
|
||||
- `filter_FORWARD_POLICIES ... reject with icmpx admin-prohibited`
|
||||
|
||||
通常是 `ylc61` 的 firewalld 转发策略阻断 `flannel.1 -> cni0`。
|
||||
|
||||
修复(推荐):
|
||||
|
||||
```bash
|
||||
sudo firewall-cmd --zone=trusted --add-interface=flannel.1
|
||||
sudo firewall-cmd --zone=trusted --add-interface=cni0
|
||||
|
||||
sudo firewall-cmd --permanent --zone=trusted --add-interface=flannel.1
|
||||
sudo firewall-cmd --permanent --zone=trusted --add-interface=cni0
|
||||
sudo firewall-cmd --reload
|
||||
```
|
||||
|
||||
### 2) `Worker CNI hostport DNAT 计数未增长` 是否异常
|
||||
|
||||
不一定。若 nft trace 明确显示走的是 `KUBE-EXT -> KUBE-SVC -> KUBE-SEP`,则 CNI hostport 计数不增长属于正常路径差异,不应作为故障根因。
|
||||
|
||||
### 3) 成功判据
|
||||
|
||||
至少满足以下任一组:
|
||||
|
||||
- 客户端对 `http://<lb-ip>:80` 返回 `404/200/...`(非连接失败)
|
||||
- 自动判读中:
|
||||
- `ylc62(ens18) SYN/SYN-ACK` 为 `N/N`
|
||||
- `ylc61(any) SYN/SYN-ACK` 为 `N/N`
|
||||
- `ylc61(cni0) SYN/SYN-ACK` 为 `N/N`
|
||||
|
||||
## 模块划分
|
||||
|
||||
- `lib/common.sh`:通用工具、参数默认值
|
||||
- `lib/k8s_checks.sh`:本地 K8s 基线采样
|
||||
- `lib/remote_checks.sh`:远端 worker 采样与复测
|
||||
- `lib/capture.sh`:tcpdump / nft / conntrack / pod netns
|
||||
- `lib/analyze.sh`:实时/离线判读
|
||||
144
scripts/diag/entrypath/entrypath.sh
Normal file
144
scripts/diag/entrypath/entrypath.sh
Normal file
@@ -0,0 +1,144 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
LIB_DIR="${SCRIPT_DIR}/lib"
|
||||
|
||||
source "${LIB_DIR}/common.sh"
|
||||
source "${LIB_DIR}/k8s_checks.sh"
|
||||
source "${LIB_DIR}/remote_checks.sh"
|
||||
source "${LIB_DIR}/capture.sh"
|
||||
source "${LIB_DIR}/analyze.sh"
|
||||
|
||||
#######################################
# Parse the command line into the global settings used by the sub-commands.
# An optional leading word (run|preflight|capture|analyze) selects the
# sub-command; everything after it is parsed as options.
# Globals:   COMMAND, WORKER_HOST, CLIENT_HOST, CLIENT_IP, LB_IP,
#            WORKER_SSH_KEY, CLIENT_SSH_KEY, DO_REMOTE_ARG, CAPTURE_MODE_ARG,
#            CAPTURE_SECONDS, NFT_TRACE_MODE_ARG, NFT_TRACE_SECONDS,
#            RETURN_TRACE_MODE_ARG, RETURN_TRACE_SECONDS,
#            POD_NETNS_TRACE_MODE_ARG, POD_NETNS_TRACE_SECONDS_ARG,
#            NON_INTERACTIVE, ANALYZE_LOG (written)
# Arguments: the script's positional parameters
# Outputs:   an error line plus the usage text on a bad option
# Returns:   exits 0 after -h/--help; exits 1 on an unknown option or on a
#            value-taking option that has no value
#######################################
parse_args() {
  init_defaults

  if [[ $# -gt 0 ]]; then
    case "$1" in
      run|preflight|capture|analyze)
        COMMAND="$1"
        shift
        ;;
    esac
  fi

  while [[ $# -gt 0 ]]; do
    # Value-taking options must be followed by a value. Without this guard
    # `shift 2` fails when only one argument is left and does not shift at
    # all: the script then dies silently under `set -e`, or spins forever
    # when errexit is off.
    case "$1" in
      --worker-host|--client-host|--client-ip|--lb-ip \
        |--worker-ssh-key|--ssh-key|--client-ssh-key \
        |--remote-check|--capture-mode|--capture-seconds \
        |--nft-trace-mode|--nft-trace-seconds \
        |--return-trace-mode|--return-trace-seconds \
        |--pod-netns-trace-mode|--pod-netns-trace-seconds \
        |--log)
        if [[ $# -lt 2 ]]; then
          echo "[ERR] 参数 $1 缺少取值"
          usage
          exit 1
        fi
        ;;
    esac

    case "$1" in
      --worker-host) WORKER_HOST="${2:-}"; shift 2 ;;
      --client-host) CLIENT_HOST="${2:-}"; shift 2 ;;
      --client-ip) CLIENT_IP="${2:-}"; shift 2 ;;
      --lb-ip) LB_IP="${2:-}"; shift 2 ;;
      --worker-ssh-key) WORKER_SSH_KEY="${2:-}"; shift 2 ;;
      # Compatibility alias for --worker-ssh-key.
      --ssh-key) WORKER_SSH_KEY="${2:-}"; shift 2 ;;
      --client-ssh-key) CLIENT_SSH_KEY="${2:-}"; shift 2 ;;
      --remote-check) DO_REMOTE_ARG="${2:-}"; shift 2 ;;
      --capture-mode) CAPTURE_MODE_ARG="${2:-}"; shift 2 ;;
      # An empty value falls back to the documented default duration.
      --capture-seconds) CAPTURE_SECONDS="${2:-12}"; shift 2 ;;
      --nft-trace-mode) NFT_TRACE_MODE_ARG="${2:-}"; shift 2 ;;
      --nft-trace-seconds) NFT_TRACE_SECONDS="${2:-8}"; shift 2 ;;
      --return-trace-mode) RETURN_TRACE_MODE_ARG="${2:-}"; shift 2 ;;
      --return-trace-seconds) RETURN_TRACE_SECONDS="${2:-10}"; shift 2 ;;
      --pod-netns-trace-mode) POD_NETNS_TRACE_MODE_ARG="${2:-}"; shift 2 ;;
      --pod-netns-trace-seconds) POD_NETNS_TRACE_SECONDS_ARG="${2:-}"; shift 2 ;;
      --non-interactive) NON_INTERACTIVE="1"; shift ;;
      --log) ANALYZE_LOG="${2:-}"; shift 2 ;;
      -h|--help) usage; exit 0 ;;
      *)
        echo "[ERR] 未知参数: $1"
        usage
        exit 1
        ;;
    esac
  done
}
|
||||
|
||||
#######################################
# `preflight` sub-command: check local tooling, resolve the runtime context
# and print the effective connection settings.
# Globals:   WORKER_HOST, CLIENT_HOST, CLIENT_IP, LB_IP, WORKER_SSH_KEY,
#            CLIENT_SSH_KEY (read after prepare_runtime_context ran)
# Outputs:   one "key=value" line per setting on stdout
#######################################
cmd_preflight() {
  local_preflight_checks
  prepare_runtime_context

  printf '%s\n' \
    "=== preflight ===" \
    "[OK] 依赖检查通过" \
    "worker_host=${WORKER_HOST}" \
    "client_host=${CLIENT_HOST:-<none>}" \
    "client_ip=${CLIENT_IP}" \
    "lb_ip=${LB_IP}" \
    "worker_ssh_key=${WORKER_SSH_KEY:-<ssh默认>}" \
    "client_ssh_key=${CLIENT_SSH_KEY:-<ssh默认>}"
}
|
||||
|
||||
#######################################
# `run` sub-command: full entry-path check. Samples local cluster state,
# optionally samples the worker over SSH, starts the enabled captures and
# traces, triggers (or waits for) client traffic, then prints the captures
# and the automatic verdict.
# Globals:   LB_IP, CLIENT_IP, CLIENT_HOST, CLIENT_SSH_OPTS, NON_INTERACTIVE,
#            WORKER_SSH_KEY, CLIENT_SSH_KEY, TRAEFIK_CHAIN, LOG_FILE,
#            POD_NETNS_SYN_COUNT, POD_NETNS_SYNACK_COUNT (read)
# Outputs:   progress and results on stdout
#######################################
cmd_run() {
  local_preflight_checks
  echo "K3s 全链路一键检查(入口 -> DNAT -> Service -> Endpoint -> NetPol -> 回包)"
  echo "建议在 server 节点执行(例如 ylc61)。"
  echo

  prepare_runtime_context

  # Build the probe only after prepare_runtime_context: in interactive mode
  # that step may replace LB_IP, and the probe must target the final value.
  local curl_probe
  curl_probe="curl -I --max-time 3 http://${LB_IP}:80"

  setup_log_file

  say "日志文件: $LOG_FILE"
  say "worker SSH key: ${WORKER_SSH_KEY:-<ssh默认>}"
  say "client SSH key: ${CLIENT_SSH_KEY:-<ssh默认>}"

  collect_local_k8s_state
  echo
  resolve_runtime_modes
  collect_remote_worker_state

  echo
  echo ">>> 请在第三方客户端(${CLIENT_IP})执行 3 次:${curl_probe}"
  start_worker_capture
  start_worker_nft_trace
  start_return_path_trace
  start_pod_netns_trace

  if [[ -n "${CLIENT_HOST}" ]]; then
    # A client SSH host is known: fire the three probes ourselves.
    say "通过 SSH 自动触发客户端探测: ${CLIENT_HOST}"
    run_cmd "Client 自动探测(3次)" ssh "${CLIENT_SSH_OPTS[@]}" "${CLIENT_HOST}" \
      "for i in 1 2 3; do ${curl_probe} || true; sleep 1; done"
  elif [[ "${NON_INTERACTIVE}" == "0" ]]; then
    # Otherwise let the operator generate traffic by hand.
    read -r -p "完成后按回车继续采样..."
  else
    echo "[WARN] non-interactive 模式且未提供 --client-host:跳过等待直接采样,可能没有新流量。"
  fi
  flush_worker_capture

  post_remote_worker_state
  run_cmd "Traefik Pod FW 链复测" sudo iptables -L "${TRAEFIK_CHAIN:-KUBE-ROUTER-FORWARD}" -n -v --line-numbers
  run_cmd "本机访问目标LB_IP:80(仅供参考,可能本机被kube-proxy劫持)" bash -lc "${curl_probe}"

  print_diag_summary
  echo
  echo "Traefik pod netns SYN/SYN-ACK: ${POD_NETNS_SYN_COUNT:-0}/${POD_NETNS_SYNACK_COUNT:-0}"
  echo
  echo "完成。完整日志: ${LOG_FILE}"
}
|
||||
|
||||
#######################################
# `capture` sub-command: force remote checks and every capture/trace switch
# on, force non-interactive mode, then delegate to cmd_run.
# Globals:   DO_REMOTE_ARG, CAPTURE_MODE_ARG, NFT_TRACE_MODE_ARG,
#            RETURN_TRACE_MODE_ARG, POD_NETNS_TRACE_MODE_ARG,
#            NON_INTERACTIVE (written)
#######################################
cmd_capture() {
  local forced_switch
  for forced_switch in \
    DO_REMOTE_ARG \
    CAPTURE_MODE_ARG \
    NFT_TRACE_MODE_ARG \
    RETURN_TRACE_MODE_ARG \
    POD_NETNS_TRACE_MODE_ARG; do
    # printf -v assigns to the global named by forced_switch.
    printf -v "${forced_switch}" '%s' "y"
  done
  NON_INTERACTIVE="1"

  cmd_run
}
|
||||
|
||||
#######################################
# `analyze` sub-command: run the offline log analysis on the file given
# with --log.
# Globals:   ANALYZE_LOG (read)
# Returns:   the status of analyze_log_file
#######################################
cmd_analyze() {
  local target_log="${ANALYZE_LOG}"
  analyze_log_file "${target_log}"
}
|
||||
|
||||
#######################################
# Entry point: parse the command line and dispatch to the cmd_<name>
# function of the selected sub-command.
# Globals:   COMMAND (read; set by parse_args)
# Arguments: the script's positional parameters
# Returns:   exits 1 with the usage text when COMMAND is not recognised
#######################################
main() {
  parse_args "$@"

  case "${COMMAND}" in
    run|preflight|capture|analyze)
      # Each known sub-command is implemented by a function named cmd_<name>.
      "cmd_${COMMAND}"
      ;;
    *)
      echo "[ERR] 未知命令: ${COMMAND}"
      usage
      exit 1
      ;;
  esac
}
|
||||
|
||||
main "$@"
|
||||
80
scripts/diag/entrypath/lib/analyze.sh
Normal file
80
scripts/diag/entrypath/lib/analyze.sh
Normal file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
#######################################
# Print the counters gathered during a live run, followed by one
# "[结论]" verdict line chosen from those counters.
# Verdict priority: Traefik REJECT hits > SYN-ACK seen at all three capture
# points > DNAT seen but no SYN-ACK on the local "any" capture > unchanged
# worker CNI hostport counter > generic fallback.
# Globals:   TRAEFIK_CHAIN, TRAEFIK_IP, REJECT_PKTS, NFLOG_PKTS,
#            TRAEFIK_WEB_SVC_CHAIN, TRAEFIK_WEB_SEP_CHAIN,
#            WORKER_CNI_DNAT_CHAIN, WORKER_CNI_HIT_BEFORE,
#            WORKER_CNI_HIT_AFTER, NFT_DNAT_HIT, RET_*_COUNT,
#            POD_NETNS_*_COUNT (read; unset values print as 0 / N/A / no)
# Outputs:   the summary on stdout
#######################################
print_diag_summary() {
  echo
  echo "===== 自动判读(基于当前计数) ====="
  echo "- Traefik Pod FW 链: ${TRAEFIK_CHAIN:-N/A}"
  echo "- Traefik REJECT 命中: ${REJECT_PKTS:-0}"
  echo "- Traefik NFLOG 命中: ${NFLOG_PKTS:-0}"
  echo "- Service web 链: ${TRAEFIK_WEB_SVC_CHAIN:-N/A}"
  echo "- Service web endpoint 链: ${TRAEFIK_WEB_SEP_CHAIN:-N/A}"
  echo "- Worker CNI hostport链: ${WORKER_CNI_DNAT_CHAIN:-N/A}"
  echo "- nft 观测到 KUBE-EXT DNAT: ${NFT_DNAT_HIT:-no}"
  echo "- ylc61(any) SYN/SYN-ACK: ${RET_LOCAL_SYN_COUNT:-0}/${RET_LOCAL_SYNACK_COUNT:-0}"
  echo "- ylc61(cni0) SYN/SYN-ACK: ${RET_CNI0_SYN_COUNT:-0}/${RET_CNI0_SYNACK_COUNT:-0}"
  echo "- ylc62(ens18) SYN/SYN-ACK: ${RET_WORKER_SYN_COUNT:-0}/${RET_WORKER_SYNACK_COUNT:-0}"
  echo "- Traefik pod netns SYN/SYN-ACK: ${POD_NETNS_SYN_COUNT:-0}/${POD_NETNS_SYNACK_COUNT:-0}"

  # REJECT_PKTS comes from parsed iptables output, so confirm it is numeric
  # before comparing it.
  if [[ "${REJECT_PKTS:-0}" =~ ^[0-9]+$ ]] && [[ "${REJECT_PKTS:-0}" -gt 0 ]]; then
    echo "[结论] Traefik Pod 防火墙链出现 REJECT 命中,优先检查 kube-system 下 Traefik 相关 Ingress NetworkPolicy。"
  elif [[ "${RET_WORKER_SYNACK_COUNT:-0}" -gt 0 ]] && [[ "${RET_LOCAL_SYNACK_COUNT:-0}" -gt 0 ]] && [[ "${RET_CNI0_SYNACK_COUNT:-0}" -gt 0 ]]; then
    echo "[结论] 链路已恢复:ylc62/ylc61/cni0 均观测到 SYN-ACK,62:80 已可达 Traefik。"
  elif [[ "${NFT_DNAT_HIT:-no}" == "yes" ]] && [[ "${RET_LOCAL_SYN_COUNT:-0}" -gt 0 ]] && [[ "${RET_LOCAL_SYNACK_COUNT:-0}" -eq 0 ]]; then
    # Report the pod address resolved for this run rather than a fixed one.
    echo "[结论] 流量已经在 worker 被 KUBE-EXT/KUBE-SVC DNAT 到 Traefik(${TRAEFIK_IP:-N/A}:8000),但 ylc61 未观察到 SYN-ACK,优先排查 Traefik Pod/宿主转发回包路径。"
  elif [[ -n "${WORKER_CNI_HIT_AFTER:-}" && -n "${WORKER_CNI_HIT_BEFORE:-}" ]] \
    && [[ "${WORKER_CNI_HIT_AFTER}" == "${WORKER_CNI_HIT_BEFORE}" ]]; then
    echo "[结论] Worker CNI hostport DNAT 计数未增长。若 nft trace 显示走 KUBE-EXT/KUBE-SVC,这是正常路径提示,不构成故障根因。"
  else
    echo "[结论] 未观察到 Traefik REJECT 明确命中,优先检查回包链路(ylc61<->ylc62 flannel / ylc62 ens18 出口)。"
  fi
}
|
||||
|
||||
#######################################
# Offline analysis of a log written by a previous run: scan for five
# textual signatures, print which were found and one "[结论]" verdict.
# Arguments: $1 - path to the log file
# Outputs:   the report on stdout
# Returns:   1 when the path is empty or not a regular file, 0 otherwise
#######################################
analyze_log_file() {
  local log_file="$1"
  if [[ -z "${log_file}" || ! -f "${log_file}" ]]; then
    echo "[ERR] analyze 模式需要有效日志文件: --log <path>"
    return 1
  fi

  # One awk program per signature; each exits 0 only when it matched.
  #   0: worker DNAT            1: firewalld forward reject
  #   2: Traefik pod REJECT     3: SYN without SYN-ACK on ylc61(any)
  #   4: SYN-ACK seen on both ylc61(any) and ylc62(ens18)
  local -a probes=(
    '/KUBE-EXT-.*KUBE-SVC|dnat to 10\.42\./ {hit=1} END{exit !hit}'
    '/filter_FORWARD_POLICIES.*admin-prohibited/ {hit=1} END{exit !hit}'
    '/Traefik REJECT 命中: [1-9]/ {hit=1} END{exit !hit}'
    '/ylc61\(any\) SYN\/SYN-ACK: [1-9][0-9]*\/0/ {hit=1} END{exit !hit}'
    '/ylc61\(any\) SYN\/SYN-ACK: [1-9][0-9]*\/[1-9][0-9]*/ {a=1} /ylc62\(ens18\) SYN\/SYN-ACK: [1-9][0-9]*\/[1-9][0-9]*/ {b=1} END{exit !(a&&b)}'
  )
  local -a seen=(no no no no no)
  local idx
  for idx in "${!probes[@]}"; do
    if awk "${probes[idx]}" "${log_file}"; then
      seen[idx]="yes"
    fi
  done

  echo "===== 日志离线判读 ====="
  echo "- 日志文件: ${log_file}"
  echo "- 观测到 worker DNAT: ${seen[0]}"
  echo "- 观测到 firewalld forward reject: ${seen[1]}"
  echo "- 观测到 Traefik Pod REJECT 命中: ${seen[2]}"
  echo "- 观测到 ylc61 SYN 无 SYN-ACK: ${seen[3]}"
  echo "- 观测到链路恢复(有 SYN-ACK): ${seen[4]}"

  # First matching rule wins.
  local verdict
  if [[ "${seen[1]}" == "yes" ]]; then
    verdict="[结论] 高概率为 ylc61 firewalld FORWARD 策略阻断 flannel.1 -> cni0。"
  elif [[ "${seen[4]}" == "yes" ]]; then
    verdict="[结论] 链路已恢复,入口到 Traefik 回包路径正常。"
  elif [[ "${seen[0]}" == "yes" && "${seen[3]}" == "yes" ]]; then
    verdict="[结论] worker 入站与 DNAT 正常,需优先排查 ylc61 到 Traefik Pod 的转发/回包链路。"
  elif [[ "${seen[2]}" == "yes" ]]; then
    verdict="[结论] Traefik Pod NetworkPolicy 命中拒绝,优先检查 kube-system netpol。"
  else
    verdict="[结论] 日志未出现单一确定根因,建议执行 run/capture 模式重新采样。"
  fi
  echo "${verdict}"
}
|
||||
286
scripts/diag/entrypath/lib/capture.sh
Normal file
286
scripts/diag/entrypath/lib/capture.sh
Normal file
@@ -0,0 +1,286 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Module state for the capture/trace helpers. Everything here is assigned
# when the file is sourced; the *_ARG variables hold command-line overrides,
# the *_FILE / *_PID pairs track a temp file and the background job that
# fills it.

# Worker packet capture (tcpdump on ens18).
CAPTURE_MODE='N'
CAPTURE_SECONDS='12'
CAPTURE_MODE_ARG=''
CAP_FILE_ENS18=''
CAP_PID_ENS18=''

# Worker nft trace.
NFT_TRACE_MODE='N'
NFT_TRACE_SECONDS='8'
NFT_TRACE_MODE_ARG=''
NFT_FILE=''
NFT_PID=''
# Names of the temporary nft tables created on the worker and locally.
NFT_TRACE_TABLE='diag_k3s_entrypath'
LOCAL_NFT_TRACE_TABLE='diag61_k3s_entrypath'

# Return-path trace (local captures plus worker ens18 and conntrack).
RETURN_TRACE_MODE='N'
RETURN_TRACE_SECONDS='10'
RETURN_TRACE_MODE_ARG=''
RET_FILE_LOCAL_8000=''
RET_FILE_LOCAL_CNI0=''
RET_FILE_WORKER_ENS18=''
RET_FILE_WORKER_CONNTRACK=''
RET_PID_LOCAL_8000=''
RET_PID_LOCAL_CNI0=''
RET_PID_WORKER_ENS18=''
RET_PID_WORKER_CONNTRACK=''
RET_FILE_LOCAL_NFT_TRACE=''
RET_PID_LOCAL_NFT_TRACE=''

# Results derived from the captures.
NFT_DNAT_HIT='no'
RET_LOCAL_SYN_COUNT=0
RET_LOCAL_SYNACK_COUNT=0
RET_CNI0_SYN_COUNT=0
RET_CNI0_SYNACK_COUNT=0
RET_WORKER_SYN_COUNT=0
RET_WORKER_SYNACK_COUNT=0

# Traefik pod network-namespace capture.
POD_NETNS_TRACE_MODE='N'
POD_NETNS_TRACE_MODE_ARG=''
POD_NETNS_TRACE_SECONDS=''
POD_NETNS_TRACE_SECONDS_ARG=''
POD_NETNS_PID=''
POD_NETNS_FILE=''
POD_NETNS_SYN_COUNT=0
POD_NETNS_SYNACK_COUNT=0
|
||||
|
||||
#######################################
# Start a time-limited tcpdump on the worker's ens18 over SSH, in the
# background, writing its output to a fresh temp file.
# Does nothing unless CAPTURE_MODE is y/Y; warns and returns when remote
# checks are off or no worker host is set.
# Globals:   CAPTURE_MODE, DO_REMOTE, WORKER_HOST, CLIENT_IP,
#            CAPTURE_SECONDS, SSH_OPTS (read);
#            CAP_FILE_ENS18, CAP_PID_ENS18 (written)
#######################################
start_worker_capture() {
  [[ "$CAPTURE_MODE" =~ ^[Yy]$ ]] || return 0

  if ! [[ "$DO_REMOTE" =~ ^[Yy]$ && -n "${WORKER_HOST}" ]]; then
    echo "[WARN] 抓包模式已开启,但未启用远端检查或未提供 worker 主机,跳过抓包。"
    return 0
  fi

  CAP_FILE_ENS18="$(mktemp)"

  local bpf_filter="host ${CLIENT_IP} and tcp port 80"
  say "启动 worker 抓包(ens18, ${CAPTURE_SECONDS}s): ${bpf_filter}"

  # Remote stderr is discarded on the worker; anything ssh itself prints
  # ends up in the capture file.
  local remote_cmd="sudo timeout ${CAPTURE_SECONDS} tcpdump -ni ens18 '${bpf_filter}' 2>/dev/null || true"
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "${remote_cmd}" >"${CAP_FILE_ENS18}" 2>&1 &
  CAP_PID_ENS18="$!"

  # Give the remote capture a moment to start before traffic is generated.
  sleep 1
}
|
||||
|
||||
#######################################
# Install a temporary nft trace rule on the worker (client -> LB_IP:80
# arriving on ens18), then start a time-limited `nft monitor trace` over
# SSH in the background, writing to a fresh temp file.
# Does nothing unless NFT_TRACE_MODE is y/Y; warns and returns when remote
# checks are off or no worker host is set.
# Globals:   NFT_TRACE_MODE, DO_REMOTE, WORKER_HOST, NFT_TRACE_TABLE,
#            CLIENT_IP, LB_IP, NFT_TRACE_SECONDS, SSH_OPTS (read);
#            NFT_FILE, NFT_PID (written)
#######################################
start_worker_nft_trace() {
  [[ "$NFT_TRACE_MODE" =~ ^[Yy]$ ]] || return 0

  if ! [[ "$DO_REMOTE" =~ ^[Yy]$ && -n "${WORKER_HOST}" ]]; then
    echo "[WARN] nft trace 已开启,但未启用远端检查或未提供 worker 主机,跳过 nft trace。"
    return 0
  fi

  # Table, chain and rule are created best-effort; each step tolerates
  # failure on the remote side.
  local setup_cmd
  setup_cmd="sudo nft add table inet ${NFT_TRACE_TABLE} 2>/dev/null || true; "
  setup_cmd+="sudo nft 'add chain inet ${NFT_TRACE_TABLE} prerouting { type filter hook prerouting priority -301; policy accept; }' 2>/dev/null || true; "
  setup_cmd+="sudo nft add rule inet ${NFT_TRACE_TABLE} prerouting iif \"ens18\" ip saddr ${CLIENT_IP} ip daddr ${LB_IP} tcp dport 80 meta nftrace set 1 2>/dev/null || true"
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "${setup_cmd}" || true

  NFT_FILE="$(mktemp)"
  say "启动 worker nft trace(${NFT_TRACE_SECONDS}s)"

  local monitor_cmd="sudo timeout ${NFT_TRACE_SECONDS} nft monitor trace 2>/dev/null || true"
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "${monitor_cmd}" >"${NFT_FILE}" 2>&1 &
  NFT_PID="$!"

  # Let the monitor attach before traffic is generated.
  sleep 1
}
|
||||
|
||||
#######################################
# Start the return-path observers in the background, all time-limited:
# a local nft trace on the forward hook, local tcpdump on "any" and on
# cni0 for the Traefik pod port, plus tcpdump on ens18 and a conntrack
# event stream on the worker over SSH. Each writes to its own temp file.
# Does nothing unless RETURN_TRACE_MODE is y/Y; warns and returns when
# remote checks are off or no worker host is set.
# Globals:   RETURN_TRACE_MODE, DO_REMOTE, WORKER_HOST, TRAEFIK_IP,
#            CLIENT_IP, RETURN_TRACE_SECONDS, LOCAL_NFT_TRACE_TABLE,
#            SSH_OPTS (read); RET_FILE_*, RET_PID_* (written)
#######################################
start_return_path_trace() {
  [[ "$RETURN_TRACE_MODE" =~ ^[Yy]$ ]] || return 0

  if ! [[ "$DO_REMOTE" =~ ^[Yy]$ && -n "${WORKER_HOST}" ]]; then
    echo "[WARN] 回包链路跟踪已开启,但未启用远端检查或未提供 worker 主机,跳过。"
    return 0
  fi

  RET_FILE_LOCAL_8000="$(mktemp)"
  RET_FILE_LOCAL_CNI0="$(mktemp)"
  RET_FILE_LOCAL_NFT_TRACE="$(mktemp)"
  RET_FILE_WORKER_ENS18="$(mktemp)"
  RET_FILE_WORKER_CONNTRACK="$(mktemp)"

  say "启动回包链路跟踪(${RETURN_TRACE_SECONDS}s)"

  # Local nft trace: mark packets forwarded from flannel.1 to the pod.
  sudo nft add table inet "${LOCAL_NFT_TRACE_TABLE}" 2>/dev/null || true
  sudo nft "add chain inet ${LOCAL_NFT_TRACE_TABLE} forward { type filter hook forward priority -301; policy accept; }" 2>/dev/null || true
  sudo nft add rule inet "${LOCAL_NFT_TRACE_TABLE}" forward iif "flannel.1" ip daddr "${TRAEFIK_IP}" tcp dport 8000 meta nftrace set 1 2>/dev/null || true

  # For the three local jobs both stdout and stderr land in the temp file.
  sudo timeout "${RETURN_TRACE_SECONDS}" nft monitor trace \
    >"${RET_FILE_LOCAL_NFT_TRACE}" 2>&1 &
  RET_PID_LOCAL_NFT_TRACE="$!"

  local pod_filter="host ${TRAEFIK_IP} and tcp port 8000"

  sudo timeout "${RETURN_TRACE_SECONDS}" tcpdump -ni any "${pod_filter}" \
    >"${RET_FILE_LOCAL_8000}" 2>&1 &
  RET_PID_LOCAL_8000="$!"

  sudo timeout "${RETURN_TRACE_SECONDS}" tcpdump -ni cni0 "${pod_filter}" \
    >"${RET_FILE_LOCAL_CNI0}" 2>&1 &
  RET_PID_LOCAL_CNI0="$!"

  # Worker side: remote stderr is dropped on the worker itself.
  local worker_dump="sudo timeout ${RETURN_TRACE_SECONDS} tcpdump -ni ens18 'host ${CLIENT_IP} and tcp' 2>/dev/null || true"
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "${worker_dump}" \
    >"${RET_FILE_WORKER_ENS18}" 2>&1 &
  RET_PID_WORKER_ENS18="$!"

  local worker_ct="if command -v conntrack >/dev/null 2>&1; then sudo timeout ${RETURN_TRACE_SECONDS} conntrack -E -p tcp 2>/dev/null || true; else echo 'conntrack: not found'; fi"
  ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "${worker_ct}" \
    >"${RET_FILE_WORKER_CONNTRACK}" 2>&1 &
  RET_PID_WORKER_CONNTRACK="$!"

  # Let the observers start before traffic is generated.
  sleep 1
}
|
||||
|
||||
#######################################
# Start a time-limited tcpdump for TCP port 8000 inside the Traefik
# container's network namespace (via nsenter), in the background, writing
# to a fresh temp file.
# Does nothing unless POD_NETNS_TRACE_MODE is y/Y. Warns and returns 0 when
# crictl or nsenter is missing, or when the container id or its PID cannot
# be resolved.
# Globals:   POD_NETNS_TRACE_MODE, POD_NETNS_TRACE_SECONDS,
#            RETURN_TRACE_SECONDS, TRAEFIK_POD (read);
#            POD_NETNS_FILE, POD_NETNS_PID (written)
#######################################
start_pod_netns_trace() {
  [[ "${POD_NETNS_TRACE_MODE}" =~ ^[Yy]$ ]] || return 0

  local tool
  for tool in crictl nsenter; do
    if ! command -v "${tool}" >/dev/null 2>&1; then
      echo "[WARN] 未找到 ${tool},跳过 pod netns 抓包。"
      return 0
    fi
  done

  # Duration falls back to the return-trace duration when not set.
  local duration="${POD_NETNS_TRACE_SECONDS:-$RETURN_TRACE_SECONDS}"
  local container_id
  local container_pid

  # Preferred source: the container id reported in the pod status, with the
  # runtime scheme prefix removed.
  container_id="$(sudo kubectl -n kube-system get pod "${TRAEFIK_POD}" -o jsonpath='{.status.containerStatuses[?(@.name=="traefik")].containerID}' 2>/dev/null || true)"
  container_id="${container_id#containerd://}"
  container_id="${container_id#cri-o://}"

  # Fallback: first container crictl lists under the name "traefik".
  if [[ -z "${container_id}" ]]; then
    container_id="$(sudo crictl ps --name traefik -q 2>/dev/null | awk 'NR==1{print; exit}' || true)"
  fi
  if [[ -z "${container_id}" ]]; then
    echo "[WARN] 未解析到 traefik 容器ID,跳过 pod netns 抓包。"
    return 0
  fi

  # First "pid": entry of the inspect output, trailing comma removed.
  container_pid="$(sudo crictl inspect "${container_id}" 2>/dev/null | awk -F': ' '/"pid":/ {gsub(/,/, "", $2); print $2; exit}' || true)"
  if ! [[ -n "${container_pid}" && "${container_pid}" =~ ^[0-9]+$ ]]; then
    echo "[WARN] 未解析到 traefik 容器 PID,跳过 pod netns 抓包。"
    return 0
  fi

  POD_NETNS_FILE="$(mktemp)"
  say "启动 Traefik Pod netns 抓包(${duration}s, pid=${container_pid})"
  # Both stdout and stderr of the capture land in the temp file.
  sudo timeout "${duration}" nsenter -t "${container_pid}" -n tcpdump -ni any "tcp port 8000" \
    >"${POD_NETNS_FILE}" 2>&1 &
  POD_NETNS_PID="$!"

  sleep 1
}
|
||||
|
||||
# Wait for the background job whose PID is stored in the global variable
# named by $1, then clear that variable. No-op when it is empty.
_flush_reap_job() {
  local pid_var="$1"
  if [[ -n "${!pid_var}" ]]; then
    wait "${!pid_var}" || true
    printf -v "${pid_var}" '%s' ""
  fi
}

# Succeed when the global variable named by $1 holds the path of an
# existing regular file.
_flush_has_file() {
  local file_var="$1"
  [[ -n "${!file_var}" && -f "${!file_var}" ]]
}

# Print a blank line, a "===== <title> =====" header ($2) and the contents
# of the temp file whose path is stored in the global variable named by $1;
# then delete the file and clear the variable.
_flush_dump_file() {
  local file_var="$1"
  local title="$2"
  echo
  echo "===== ${title} ====="
  cat "${!file_var}" || true
  rm -f "${!file_var}" || true
  printf -v "${file_var}" '%s' ""
}

#######################################
# Collect every capture/trace started earlier: wait for the background
# jobs, derive the SYN / SYN-ACK counters and the DNAT flag, print each
# capture under its own header, delete the temp files and remove the
# temporary nft tables (worker and local).
# Globals:   CAP_*, NFT_*, RET_FILE_*, RET_PID_*, POD_NETNS_FILE,
#            POD_NETNS_PID (read and cleared); NFT_DNAT_HIT, RET_*_COUNT,
#            POD_NETNS_*_COUNT (written); TRAEFIK_IP, CLIENT_IP,
#            NFT_TRACE_MODE, DO_REMOTE, WORKER_HOST, SSH_OPTS,
#            NFT_TRACE_TABLE, LOCAL_NFT_TRACE_TABLE (read)
# Outputs:   the captured data on stdout
#######################################
flush_worker_capture() {
  # Worker tcpdump on ens18.
  _flush_reap_job CAP_PID_ENS18
  if _flush_has_file CAP_FILE_ENS18; then
    _flush_dump_file CAP_FILE_ENS18 "Worker 抓包结果(ens18)"
  fi

  # Worker nft trace: record whether a DNAT to the pod was observed.
  _flush_reap_job NFT_PID
  if _flush_has_file NFT_FILE; then
    if grep -Eq "KUBE-SEP-.*dnat to ${TRAEFIK_IP}:8000|dnat to ${TRAEFIK_IP}:8000" "${NFT_FILE}" >/dev/null 2>&1; then
      NFT_DNAT_HIT="yes"
    fi
    _flush_dump_file NFT_FILE "Worker nft trace 结果"
  fi

  # Remove the temporary trace table from the worker.
  if [[ "$NFT_TRACE_MODE" =~ ^[Yy]$ && "$DO_REMOTE" =~ ^[Yy]$ && -n "${WORKER_HOST}" ]]; then
    ssh "${SSH_OPTS[@]}" "${WORKER_HOST}" "sudo nft delete table inet ${NFT_TRACE_TABLE} 2>/dev/null || true" || true
  fi

  # Return-path jobs: wait for all of them before printing anything.
  local job_var
  for job_var in \
    RET_PID_LOCAL_8000 \
    RET_PID_LOCAL_NFT_TRACE \
    RET_PID_LOCAL_CNI0 \
    RET_PID_WORKER_ENS18 \
    RET_PID_WORKER_CONNTRACK; do
    _flush_reap_job "${job_var}"
  done

  if _flush_has_file RET_FILE_LOCAL_8000; then
    RET_LOCAL_SYN_COUNT="$(count_tcpdump_flag "${RET_FILE_LOCAL_8000}" "Flags [S]")"
    RET_LOCAL_SYNACK_COUNT="$(count_tcpdump_flag "${RET_FILE_LOCAL_8000}" "Flags [S.]")"
    _flush_dump_file RET_FILE_LOCAL_8000 "回包链路抓包(ylc61 any -> ${TRAEFIK_IP}:8000)"
  fi
  if _flush_has_file RET_FILE_LOCAL_NFT_TRACE; then
    _flush_dump_file RET_FILE_LOCAL_NFT_TRACE "本机 nft trace 结果(ylc61 forward)"
  fi
  if _flush_has_file RET_FILE_LOCAL_CNI0; then
    RET_CNI0_SYN_COUNT="$(count_tcpdump_flag "${RET_FILE_LOCAL_CNI0}" "Flags [S]")"
    RET_CNI0_SYNACK_COUNT="$(count_tcpdump_flag "${RET_FILE_LOCAL_CNI0}" "Flags [S.]")"
    _flush_dump_file RET_FILE_LOCAL_CNI0 "回包链路抓包(ylc61 cni0 -> ${TRAEFIK_IP}:8000)"
  fi
  if _flush_has_file RET_FILE_WORKER_ENS18; then
    RET_WORKER_SYN_COUNT="$(count_tcpdump_flag "${RET_FILE_WORKER_ENS18}" "Flags [S]")"
    RET_WORKER_SYNACK_COUNT="$(count_tcpdump_flag "${RET_FILE_WORKER_ENS18}" "Flags [S.]")"
    _flush_dump_file RET_FILE_WORKER_ENS18 "回包链路抓包(ylc62 ens18 <-> ${CLIENT_IP})"
  fi
  if _flush_has_file RET_FILE_WORKER_CONNTRACK; then
    _flush_dump_file RET_FILE_WORKER_CONNTRACK "回包链路 conntrack 事件(ylc62)"
  fi

  # Remove the local trace table; attempted regardless of the trace mode.
  sudo nft delete table inet "${LOCAL_NFT_TRACE_TABLE}" 2>/dev/null || true

  # Traefik pod network-namespace capture.
  _flush_reap_job POD_NETNS_PID
  if _flush_has_file POD_NETNS_FILE; then
    POD_NETNS_SYN_COUNT="$(count_tcpdump_flag "${POD_NETNS_FILE}" "Flags [S]")"
    POD_NETNS_SYNACK_COUNT="$(count_tcpdump_flag "${POD_NETNS_FILE}" "Flags [S.]")"
    _flush_dump_file POD_NETNS_FILE "Traefik Pod netns 抓包(ylc61)"
  fi
}
|
||||
104
scripts/diag/entrypath/lib/common.sh
Normal file
104
scripts/diag/entrypath/lib/common.sh
Normal file
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
now() {
  date +'%F %T'
}
|
||||
# Print all arguments as one line prefixed with the current timestamp in
# square brackets.
say() {
  printf '[%s] %s\n' "$(now)" "$*"
}
|
||||
|
||||
# Print the command-line help text (sub-commands and options) to stdout.
usage() {
  printf '%s\n' \
    '用法:' \
    'entrypath.sh <command> [选项]' \
    'entrypath.sh [选项] # 等价于 run' \
    '' \
    '命令:' \
    'run 完整检查(默认)' \
    'preflight 仅检查本地依赖与参数环境' \
    'capture 强制开启所有抓包/trace能力后执行 run' \
    'analyze --log <path> 离线分析日志文件' \
    '' \
    '通用选项:' \
    '--worker-host <user@host> 远端 worker SSH 主机(默认 jack@192.168.2.62)' \
    '--client-host <user@host> 远端客户端 SSH 主机(可选,用于自动发起 curl)' \
    '--client-ip <ip> 第三方客户端 IP(默认 192.168.2.63)' \
    '--lb-ip <ip> 待排查 LB 节点 IP(默认 192.168.2.62)' \
    '--worker-ssh-key <path> worker SSH 私钥路径(默认 ~/.ssh/id_ed25519_k3s_diag_worker)' \
    '--client-ssh-key <path> 客户端 SSH 私钥路径(默认 ~/.ssh/id_ed25519_k3s_diag_client)' \
    '--ssh-key <path> 兼容别名,等同 --worker-ssh-key' \
    '--remote-check <y|n> 是否启用远端检查(默认 n,交互可覆盖)' \
    '--capture-mode <y|n> 抓包模式(worker ens18,默认 n)' \
    '--capture-seconds <n> 抓包持续秒数(默认 12)' \
    '--nft-trace-mode <y|n> nft trace 模式(worker,默认 n)' \
    '--nft-trace-seconds <n> nft trace 持续秒数(默认 8)' \
    '--return-trace-mode <y|n> 回包链路跟踪(ylc61/ylc62,默认 n)' \
    '--return-trace-seconds <n> 回包链路跟踪持续秒数(默认 10)' \
    '--pod-netns-trace-mode <y|n> Traefik Pod netns 抓包(ylc61,默认 n)' \
    '--pod-netns-trace-seconds <n> Traefik Pod netns 抓包持续秒数(默认同 return-trace-seconds)' \
    '--non-interactive 非交互模式(需配合上面参数)' \
    '--log <path> 仅 analyze 子命令使用' \
    '-h, --help 显示帮助'
}
|
||||
|
||||
#######################################
# Print a section header, then run a command whose failure is tolerated.
# Arguments: $1 - section title; remaining arguments - the command to run
# Outputs:   a blank line, "===== <title> =====", then the command's output
# Returns:   always 0
#######################################
run_cmd() {
  local title="$1"
  shift
  printf '\n===== %s =====\n' "${title}"
  # Best-effort: a failing diagnostic command must not stop the run.
  "$@" || true
}
|
||||
|
||||
#######################################
# Abort the script when a required command is not available.
# Arguments: $1 - command name
# Outputs:   "[ERR] missing command: <name>" on stdout when it is missing
# Returns:   0 when the command exists; otherwise exits with status 1
#######################################
require_cmd() {
  local tool="$1"
  command -v "${tool}" >/dev/null 2>&1 && return 0
  echo "[ERR] missing command: ${tool}"
  exit 1
}
|
||||
|
||||
#######################################
# Prompt for one line of input and fall back to a default.
# Arguments: $1 - prompt text; $2 - default value
# Outputs:   the prompt "<text> [<default>]: " on stderr; the answer, or the
#            default when the answer is empty, on stdout
# Returns:   0
#######################################
read_default() {
  local prompt="$1"
  local def="$2"
  local out=""
  printf "%s [%s]: " "$prompt" "$def" >&2
  # End-of-input counts as an empty answer instead of a failure, so the
  # default still applies when stdin is closed.
  read -r out || true
  # printf rather than echo: echo would treat a value such as "-n" or "-e"
  # as an option and print nothing.
  printf '%s\n' "${out:-$def}"
}
|
||||
|
||||
#######################################
# Print the packet counter of the first rule in an iptables chain whose
# target column equals the requested target.
# Arguments: $1 - table name (empty string: no -t option is passed)
#            $2 - chain name
#            $3 - target to look for
# Outputs:   the first column of the matching listing line; nothing when no
#            rule matches
#######################################
extract_pkts_for_target() {
  local tbl="$1"
  local chain_name="$2"
  local wanted="$3"

  # In the `-L -n -v -x` listing the third column is the rule target and the
  # first column is the packet counter.
  sudo iptables ${tbl:+-t "$tbl"} -L "$chain_name" -n -v -x 2>/dev/null \
    | awk -v wanted="$wanted" '$3 == wanted { print $1; exit }'
}
|
||||
|
||||
#######################################
# Print the jump target of the first rule in a chain that jumps to a
# KUBE-SEP-* chain.
# Arguments: $1 - table name (empty string: no -t option is passed)
#            $2 - chain name
# Outputs:   the word following the first "-j" on the first matching rule;
#            nothing when no rule matches
#######################################
extract_first_jump_target() {
  local tbl="$1"
  local chain_name="$2"

  sudo iptables ${tbl:+-t "$tbl"} -S "$chain_name" 2>/dev/null \
    | awk '
        /-j KUBE-SEP-/ {
          for (pos = 1; pos <= NF; pos++) {
            if ($pos == "-j") {
              print $(pos + 1)
              exit
            }
          }
        }'
}
|
||||
|
||||
#######################################
# Count the lines of a capture file that contain a literal string.
# Arguments: $1 - capture file path; $2 - literal text to look for
#            (for example "Flags [S]")
# Outputs:   the number of matching lines; 0 when the file does not exist
#######################################
count_tcpdump_flag() {
  local capture="$1"
  local needle="$2"

  if [[ -f "$capture" ]]; then
    # index() is a plain substring test, so brackets and dots in the needle
    # are matched literally.
    awk -v needle="$needle" 'index($0, needle) { hits++ } END { print hits + 0 }' "$capture"
  else
    echo 0
  fi
}
|
||||
|
||||
#######################################
# Reset the command-line driven globals to their defaults.
# Globals:   COMMAND, ANALYZE_LOG, WORKER_HOST, CLIENT_HOST, CLIENT_IP,
#            LB_IP, WORKER_SSH_KEY, CLIENT_SSH_KEY, DEFAULT_WORKER_SSH_KEY,
#            DEFAULT_CLIENT_SSH_KEY, DO_REMOTE_ARG, NON_INTERACTIVE (written);
#            HOME (read)
#######################################
init_defaults() {
  # Sub-command selection and analyze input.
  COMMAND='run'
  ANALYZE_LOG=''

  # Hosts and addresses.
  WORKER_HOST='jack@192.168.2.62'
  CLIENT_HOST=''
  CLIENT_IP='192.168.2.63'
  LB_IP='192.168.2.62'

  # Explicit SSH keys start empty; the DEFAULT_* paths are the candidate
  # key locations under the user's ~/.ssh directory.
  WORKER_SSH_KEY=''
  CLIENT_SSH_KEY=''
  DEFAULT_WORKER_SSH_KEY="${HOME}/.ssh/id_ed25519_k3s_diag_worker"
  DEFAULT_CLIENT_SSH_KEY="${HOME}/.ssh/id_ed25519_k3s_diag_client"

  # Mode switches.
  DO_REMOTE_ARG=''
  NON_INTERACTIVE='0'
}
|
||||
95
scripts/diag/entrypath/lib/k8s_checks.sh
Normal file
95
scripts/diag/entrypath/lib/k8s_checks.sh
Normal file
@@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
#######################################
# Verify that the local tools this script depends on are installed.
# require_cmd terminates the script on the first missing tool.
#######################################
local_preflight_checks() {
  local tool
  for tool in bash sudo kubectl awk sed grep; do
    require_cmd "${tool}"
  done
}
|
||||
|
||||
#######################################
# Finalise the connection settings: prompt for hosts/addresses in
# interactive mode, choose the SSH keys, and build the SSH option arrays.
# Key selection: an explicitly given key wins; otherwise the default key
# file is used when it exists; the client key finally falls back to the
# worker key.
# Globals:   NON_INTERACTIVE, DEFAULT_WORKER_SSH_KEY,
#            DEFAULT_CLIENT_SSH_KEY (read); WORKER_HOST, CLIENT_IP, LB_IP,
#            WORKER_SSH_KEY, CLIENT_SSH_KEY (read/written);
#            SSH_OPTS, CLIENT_SSH_OPTS (written, arrays)
#######################################
prepare_runtime_context() {
  if [[ "${NON_INTERACTIVE}" == "0" ]]; then
    WORKER_HOST="$(read_default "Worker SSH 主机(user@host,留空跳过远端检查)" "${WORKER_HOST}")"
    CLIENT_IP="$(read_default "第三方客户端 IP(用于人工发流量)" "${CLIENT_IP}")"
    LB_IP="$(read_default "待排查节点对外 IP(如 ylc62)" "${LB_IP}")"
  fi

  # Worker key: fall back to the default key file when none was given.
  if [[ -z "${WORKER_SSH_KEY}" ]]; then
    if [[ -f "${DEFAULT_WORKER_SSH_KEY}" ]]; then
      WORKER_SSH_KEY="${DEFAULT_WORKER_SSH_KEY}"
    fi
  fi

  # Client key: default key file first, then whatever the worker uses.
  if [[ -z "${CLIENT_SSH_KEY}" ]]; then
    if [[ -f "${DEFAULT_CLIENT_SSH_KEY}" ]]; then
      CLIENT_SSH_KEY="${DEFAULT_CLIENT_SSH_KEY}"
    elif [[ -n "${WORKER_SSH_KEY}" ]]; then
      CLIENT_SSH_KEY="${WORKER_SSH_KEY}"
    fi
  fi

  # With no key the arrays stay empty and ssh uses its own defaults.
  SSH_OPTS=()
  [[ -z "${WORKER_SSH_KEY}" ]] || SSH_OPTS=(-i "${WORKER_SSH_KEY}" -o IdentitiesOnly=yes)

  CLIENT_SSH_OPTS=()
  [[ -z "${CLIENT_SSH_KEY}" ]] || CLIENT_SSH_OPTS=(-i "${CLIENT_SSH_KEY}" -o IdentitiesOnly=yes)
}
|
||||
|
||||
#######################################
# Create the log directory, pick a timestamped log file name and duplicate
# all further stdout/stderr of the script into that file.
# Globals:   EUID, HOME (read); LOG_DIR, LOG_FILE (written)
#######################################
setup_log_file() {
  case "${EUID}" in
    0) LOG_DIR="/root/netpol-diag-logs" ;;
    *) LOG_DIR="${HOME}/netpol-diag-logs" ;;
  esac
  mkdir -p "$LOG_DIR"

  local stamp
  stamp="$(date '+%Y%m%d-%H%M%S')"
  LOG_FILE="${LOG_DIR}/entrypath-${stamp}.log"

  # From here on everything written to stdout or stderr is also appended to
  # the log through tee.
  exec > >(tee -a "$LOG_FILE") 2>&1
}
|
||||
|
||||
#######################################
# Sample the local cluster state relevant to the Traefik entry path and
# resolve the identifiers the later steps rely on.
# Globals:   LB_IP (read); TRAEFIK_POD, TRAEFIK_IP, TRAEFIK_CHAIN,
#            REJECT_PKTS, NFLOG_PKTS, TRAEFIK_WEB_SVC_CHAIN,
#            TRAEFIK_WEB_SEP_CHAIN (written)
# Outputs:   the sampled state on stdout
# Returns:   exits 1 when the Traefik pod IP cannot be resolved
#######################################
collect_local_k8s_state() {
  run_cmd "节点状态" sudo kubectl get nodes -o wide
  run_cmd "kube-system 关键组件" sh -c "sudo kubectl -n kube-system get pods -o wide | grep -E 'traefik|svclb|flannel|kube-proxy' || true"
  run_cmd "Traefik Service" sudo kubectl -n kube-system get svc traefik -o wide
  run_cmd "Traefik Service 关键字段" sh -c "sudo kubectl -n kube-system get svc traefik -o yaml | grep -E 'type:|externalTrafficPolicy|loadBalancerSourceRanges|svccontroller.k3s.cattle.io' || true"

  TRAEFIK_POD="$(sudo kubectl -n kube-system get pod -l app.kubernetes.io/name=traefik -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
  TRAEFIK_IP="$(sudo kubectl -n kube-system get pod -l app.kubernetes.io/name=traefik -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || true)"

  if [[ -z "${TRAEFIK_IP}" ]]; then
    echo "[ERR] 无法解析 Traefik Pod IP,终止。"
    exit 1
  fi

  # Find the rule in KUBE-ROUTER-FORWARD that refers to the pod and take its
  # target (4th column of the numbered listing).
  # - The address is compared against whole fields: used as a regex it
  #   would also match longer addresses (10.42.0.1 inside 10.42.0.12) and
  #   could select another pod's chain.
  # - A missing chain must not abort the run under `set -e -o pipefail`;
  #   the empty result is reported as N/A below.
  TRAEFIK_CHAIN="$(sudo iptables -L KUBE-ROUTER-FORWARD -n -v --line-numbers \
    | awk -v ip="${TRAEFIK_IP}" '
        {
          for (i = 1; i <= NF; i++) {
            if ($i == ip || $i == ip "/32") {
              print $4
              exit
            }
          }
        }' || true)"

  echo
  echo "Traefik pod: ${TRAEFIK_POD}"
  echo "Traefik ip : ${TRAEFIK_IP}"
  echo "Traefik fw : ${TRAEFIK_CHAIN:-N/A}"

  if [[ -n "${TRAEFIK_CHAIN}" ]]; then
    run_cmd "Traefik Pod FW 链详情" sudo iptables -L "$TRAEFIK_CHAIN" -n -v -x
    run_cmd "Traefik Pod FW 链规则" sudo iptables -S "$TRAEFIK_CHAIN"
    REJECT_PKTS="$(extract_pkts_for_target "" "$TRAEFIK_CHAIN" REJECT || echo 0)"
    NFLOG_PKTS="$(extract_pkts_for_target "" "$TRAEFIK_CHAIN" NFLOG || echo 0)"
  else
    REJECT_PKTS=0
    NFLOG_PKTS=0
  fi

  # Service chain for traefik:web on port 80; tolerated when it cannot be
  # listed, for the same reason as above.
  TRAEFIK_WEB_SVC_CHAIN="$(sudo iptables -t nat -S KUBE-SERVICES \
    | awk '/kube-system\/traefik:web cluster IP/ && /--dport 80/ {for(i=1;i<=NF;i++) if($i=="-j"){print $(i+1); exit}}' || true)"
  TRAEFIK_WEB_SEP_CHAIN=""
  if [[ -n "${TRAEFIK_WEB_SVC_CHAIN}" ]]; then
    run_cmd "Traefik web Service 链" sudo iptables -t nat -L "$TRAEFIK_WEB_SVC_CHAIN" -n -v -x
    TRAEFIK_WEB_SEP_CHAIN="$(extract_first_jump_target nat "$TRAEFIK_WEB_SVC_CHAIN" || true)"
  fi
  if [[ -n "${TRAEFIK_WEB_SEP_CHAIN}" ]]; then
    run_cmd "Traefik web Endpoint 链" sudo iptables -t nat -L "$TRAEFIK_WEB_SEP_CHAIN" -n -v -x
  fi

  # LB_IP is handed to the inner shell as a positional parameter and matched
  # as a fixed string, so it is neither re-parsed as shell code nor treated
  # as a regex.
  run_cmd "KUBE-SERVICES 中目标LB_IP命中" sh -c 'sudo iptables -t nat -L KUBE-SERVICES -n -v --line-numbers | grep -F -- "$1" || true' sh "${LB_IP}"
}
|
||||
59
scripts/diag/entrypath/lib/remote_checks.sh
Normal file
59
scripts/diag/entrypath/lib/remote_checks.sh
Normal file
@@ -0,0 +1,59 @@
|
||||
#!/usr/bin/env bash
# Worker-side (remote, over SSH) checks for the entrypath diagnostics.

# Name of the per-pod CNI-DN-* chain found in the worker's CNI-HOSTPORT-DNAT chain.
WORKER_CNI_DNAT_CHAIN=""
# Packet counter of the port-80 DNAT rule in that chain, sampled by
# collect_remote_worker_state (before) and post_remote_worker_state (after).
WORKER_CNI_HIT_BEFORE=""
WORKER_CNI_HIT_AFTER=""

# Resolve the effective run-time switches from the CLI overrides.
#
# Globals read:    DO_REMOTE_ARG, NON_INTERACTIVE, CAPTURE_MODE_ARG,
#                  NFT_TRACE_MODE_ARG, RETURN_TRACE_MODE_ARG,
#                  POD_NETNS_TRACE_MODE_ARG, POD_NETNS_TRACE_SECONDS_ARG
# Globals written: DO_REMOTE, CAPTURE_MODE, NFT_TRACE_MODE, RETURN_TRACE_MODE,
#                  POD_NETNS_TRACE_MODE, POD_NETNS_TRACE_SECONDS
#
# An empty or unset *_ARG leaves the corresponding setting untouched.
# DO_REMOTE is the exception: without an override it becomes "N" in
# non-interactive mode, otherwise the user is asked (default "N").
resolve_runtime_modes() {
  if [[ -n "${DO_REMOTE_ARG:-}" ]]; then
    DO_REMOTE="${DO_REMOTE_ARG}"
  elif [[ "${NON_INTERACTIVE:-0}" == "1" ]]; then
    DO_REMOTE="N"
  else
    # A closed stdin must not abort the caller under `set -e`; fall back to N.
    read -r -p "是否通过 SSH 拉取 worker 计数(需要可免交互 sudo)? [y/N]: " DO_REMOTE || true
    DO_REMOTE="${DO_REMOTE:-N}"
  fi

  if [[ -n "${CAPTURE_MODE_ARG:-}" ]]; then
    CAPTURE_MODE="${CAPTURE_MODE_ARG}"
  fi
  if [[ -n "${NFT_TRACE_MODE_ARG:-}" ]]; then
    NFT_TRACE_MODE="${NFT_TRACE_MODE_ARG}"
  fi
  if [[ -n "${RETURN_TRACE_MODE_ARG:-}" ]]; then
    RETURN_TRACE_MODE="${RETURN_TRACE_MODE_ARG}"
  fi
  if [[ -n "${POD_NETNS_TRACE_MODE_ARG:-}" ]]; then
    POD_NETNS_TRACE_MODE="${POD_NETNS_TRACE_MODE_ARG}"
  fi
  if [[ -n "${POD_NETNS_TRACE_SECONDS_ARG:-}" ]]; then
    POD_NETNS_TRACE_SECONDS="${POD_NETNS_TRACE_SECONDS_ARG}"
  fi
}

# Collect a baseline of the worker node's state over SSH.
#
# Returns 0 without doing anything unless DO_REMOTE is y/Y and WORKER_HOST is
# non-empty.
# Globals read:    DO_REMOTE, WORKER_HOST, SSH_OPTS
# Globals written: WORKER_CNI_DNAT_CHAIN (first CNI-DN-* jump target found in
#                  CNI-HOSTPORT-DNAT), WORKER_CNI_HIT_BEFORE (packet counter of
#                  the port-80 DNAT rule in that chain)
# Uses the say and run_cmd helpers, which are defined outside this file.
collect_remote_worker_state() {
  if [[ ! "${DO_REMOTE:-}" =~ ^[Yy]$ || -z "${WORKER_HOST:-}" ]]; then
    return 0
  fi

  say "开始远端检查: ${WORKER_HOST}"
  run_cmd "Worker 基础网络状态" ssh "${SSH_OPTS[@]}" "$WORKER_HOST" "ip -br a; ip route"
  run_cmd "Worker k3s-agent 状态" ssh "${SSH_OPTS[@]}" "$WORKER_HOST" "sudo systemctl is-active k3s-agent; sudo journalctl -u k3s-agent -n 40 --no-pager"
  run_cmd "Worker PREROUTING 关键计数" ssh "${SSH_OPTS[@]}" "$WORKER_HOST" "sudo iptables -t nat -L PREROUTING -n -v --line-numbers | grep -E 'CNI-HOSTPORT-DNAT|KUBE-SERVICES|dpt:80' || true"
  run_cmd "Worker CNI-HOSTPORT-DNAT" ssh "${SSH_OPTS[@]}" "$WORKER_HOST" "sudo iptables -t nat -L CNI-HOSTPORT-DNAT -n -v --line-numbers || true"

  # An unreachable worker must not abort the whole diagnosis; an empty result
  # simply skips the chain-specific checks below.
  WORKER_CNI_DNAT_CHAIN="$(ssh "${SSH_OPTS[@]}" "$WORKER_HOST" "sudo iptables -t nat -S CNI-HOSTPORT-DNAT 2>/dev/null | awk '/-j CNI-DN-/{for(i=1;i<=NF;i++) if(\$i==\"-j\"){print \$(i+1); exit}}'")" || true

  # The chain name comes from the remote host and is interpolated into further
  # remote shell commands, so accept only a plain chain identifier.
  if [[ ! "${WORKER_CNI_DNAT_CHAIN}" =~ ^CNI-DN-[A-Za-z0-9_.-]+$ ]]; then
    WORKER_CNI_DNAT_CHAIN=""
    return 0
  fi

  run_cmd "Worker 具体 CNI-DNAT 链" ssh "${SSH_OPTS[@]}" "$WORKER_HOST" "sudo iptables -t nat -L ${WORKER_CNI_DNAT_CHAIN} -n -v --line-numbers"
  # `dpt:80` must not also match dpt:8080 / dpt:800, hence the boundary check.
  WORKER_CNI_HIT_BEFORE="$(ssh "${SSH_OPTS[@]}" "$WORKER_HOST" "sudo iptables -t nat -L ${WORKER_CNI_DNAT_CHAIN} -n -v -x | awk 'BEGIN{v=0} /DNAT/ && /dpt:80([^0-9]|\$)/ {v=\$1} END{print v}'")" || true
}

# Re-sample the worker's CNI DNAT packet counter after the probes have run.
#
# Does nothing unless DO_REMOTE is y/Y and WORKER_CNI_DNAT_CHAIN is non-empty.
# Globals read:    DO_REMOTE, WORKER_CNI_DNAT_CHAIN, WORKER_HOST, SSH_OPTS
# Globals written: WORKER_CNI_HIT_AFTER
# Uses the run_cmd helper, which is defined outside this file.
post_remote_worker_state() {
  if [[ ! "${DO_REMOTE:-}" =~ ^[Yy]$ || -z "${WORKER_CNI_DNAT_CHAIN:-}" ]]; then
    return 0
  fi

  # `dpt:80` must not also match dpt:8080 / dpt:800, hence the boundary check.
  # `|| true`: a failed ssh leaves the counter empty instead of aborting.
  WORKER_CNI_HIT_AFTER="$(ssh "${SSH_OPTS[@]}" "$WORKER_HOST" "sudo iptables -t nat -L ${WORKER_CNI_DNAT_CHAIN} -n -v -x | awk 'BEGIN{v=0} /DNAT/ && /dpt:80([^0-9]|\$)/ {v=\$1} END{print v}'")" || true
  run_cmd "Worker CNI-DNAT 链复测" ssh "${SSH_OPTS[@]}" "$WORKER_HOST" "sudo iptables -t nat -L ${WORKER_CNI_DNAT_CHAIN} -n -v --line-numbers"
}

91
scripts/diag/firewalld/setup-k3s-firewalld-interfaces.sh
Normal file
91
scripts/diag/firewalld/setup-k3s-firewalld-interfaces.sh
Normal file
@@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env bash
# Bind the K3s CNI interfaces to a firewalld zone, both at runtime and
# permanently, then reload firewalld and print the result.
set -euo pipefail

# firewalld zone the interfaces are added to.
TRUSTED_ZONE="trusted"
# Interfaces that must exist before they can be bound.
IFACES=("flannel.1" "cni0")
# How long to wait for the interfaces to appear (overridable via --wait-seconds).
WAIT_SECONDS_DEFAULT=30
WAIT_SECONDS="${WAIT_SECONDS_DEFAULT}"
# "1" skips the confirmation prompt.
NON_INTERACTIVE="0"

# Print the command-line help to stdout.
usage() {
  cat <<'EOF'
用法:
setup-k3s-firewalld-interfaces.sh [选项]

选项:
--wait-seconds <n> 等待接口出现的秒数(默认 30)
--non-interactive 非交互模式
-h, --help 显示帮助
EOF
}

# Command-line parsing.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --wait-seconds)
      # Require an explicit non-negative integer: a missing value used to make
      # `shift 2` fail silently under `set -e`, and the value is later used
      # in an arithmetic expansion.
      if [[ $# -lt 2 || ! "$2" =~ ^[0-9]+$ ]]; then
        echo "[ERR] --wait-seconds 需要一个非负整数参数"
        usage
        exit 1
      fi
      # 10# forces base 10 so values such as "08" are not read as octal.
      WAIT_SECONDS="$((10#$2))"
      shift 2
      ;;
    --non-interactive)
      NON_INTERACTIVE="1"
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "[ERR] 未知参数: $1"
      usage
      exit 1
      ;;
  esac
done

# Exit with status 1 and an error message unless the given command is
# available.
# Arguments: $1 - command name to look up with `command -v`
require_cmd() {
  local c="$1"
  if command -v "$c" >/dev/null 2>&1; then
    return 0
  fi
  echo "[ERR] 缺少命令: $c"
  exit 1
}

require_cmd firewall-cmd
require_cmd ip
if [[ ${EUID} -ne 0 ]]; then
  require_cmd sudo
fi

# Run a command with root privileges: directly when already root, through
# sudo otherwise.
as_root() {
  if [[ ${EUID} -ne 0 ]]; then
    sudo "$@"
  else
    "$@"
  fi
}

# WAIT_SECONDS ends up inside $(( )), which evaluates arbitrary expressions,
# so only plain digits are accepted.
if [[ ! "${WAIT_SECONDS}" =~ ^[0-9]+$ ]]; then
  echo "[ERR] --wait-seconds 必须为非负整数: ${WAIT_SECONDS}"
  exit 1
fi
WAIT_SECONDS=$((10#${WAIT_SECONDS}))

echo "=== K3s firewalld 接口基线配置 ==="
echo "目标 zone: ${TRUSTED_ZONE}"
echo "目标接口: ${IFACES[*]}"
echo "等待接口出现: ${WAIT_SECONDS}s"

if [[ "${NON_INTERACTIVE}" == "0" ]]; then
  ans=""
  # A closed stdin would otherwise end the script silently under `set -e`.
  if ! read -r -p "继续执行?[Y/n]: " ans && [[ -z "${ans}" ]]; then
    echo
    echo "[ERR] 无法读取确认输入;如需无人值守执行请使用 --non-interactive。"
    exit 1
  fi
  ans="${ans:-Y}"
  if [[ ! "${ans}" =~ ^[Yy]$ ]]; then
    echo "已取消。"
    exit 0
  fi
fi

# One shared deadline: WAIT_SECONDS bounds the total wait for all interfaces.
deadline=$((SECONDS + WAIT_SECONDS))
for iface in "${IFACES[@]}"; do
  until ip link show "${iface}" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      echo "[ERR] 接口未出现: ${iface}(等待 ${WAIT_SECONDS}s 仍未出现)"
      echo "请确认 k3s 已启动并生成 CNI 接口后重试。"
      exit 1
    fi
    sleep 1
  done
done

for iface in "${IFACES[@]}"; do
  echo "[RUN ] runtime add-interface ${iface} -> ${TRUSTED_ZONE}"
  as_root firewall-cmd --zone="${TRUSTED_ZONE}" --add-interface="${iface}" >/dev/null

  echo "[RUN ] permanent add-interface ${iface} -> ${TRUSTED_ZONE}"
  as_root firewall-cmd --permanent --zone="${TRUSTED_ZONE}" --add-interface="${iface}" >/dev/null
done

echo "[RUN ] firewall-cmd --reload"
as_root firewall-cmd --reload >/dev/null

echo
echo "=== 验证输出 ==="
as_root firewall-cmd --zone="${TRUSTED_ZONE}" --list-interfaces
as_root firewall-cmd --get-active-zones

echo
echo "[OK] 已完成 firewalld 接口基线配置。"
419
scripts/diag/netpol/check-net.sh
Normal file
419
scripts/diag/netpol/check-net.sh
Normal file
@@ -0,0 +1,419 @@
|
||||
#!/usr/bin/env bash
# Interactive diagnosis of the client -> Traefik -> Service -> Pod path,
# including NetworkPolicy / iptables state. All output is also written to a
# log file.
set -euo pipefail

# Namespace and label selector used to find Traefik.
NS_TRAEFIK="kube-system"
APP_TRAEFIK_LABEL="app.kubernetes.io/name=traefik"
# Timeout (seconds) for the curl / wget probes.
TIMEOUT=3
# Traefik log window.
LOG_TAIL=200
LOG_SINCE="20m"
# Filled in by init_runtime and the logging setup.
LOG_DIR=""
LOG_FILE=""

# Probe outcomes; stay "SKIP" when a probe cannot be run.
PROBE_CLIENT="SKIP"
PROBE_TRAEFIK_TO_SVC="SKIP"
PROBE_TRAEFIK_TO_POD="SKIP"
PROBE_TRAEFIK_DNS="SKIP"

# Print a section heading: an empty line followed by "=== <title> ===".
# Arguments: $1 - heading text
print_title() {
  printf '\n=== %s ===\n' "$1"
}

# Run a command (or shell function) best-effort: its output is kept, but a
# non-zero exit status is swallowed so `set -e` does not end the diagnosis.
# Arguments: the command and its arguments
safe_run() {
  if ! "$@"; then
    return 0
  fi
}

# Exit with status 1 and an error message unless the given command is
# available.
# Arguments: $1 - command name to look up with `command -v`
require_cmd() {
  local c="$1"
  if command -v "$c" >/dev/null 2>&1; then
    return 0
  fi
  echo "[ERR] 缺少命令: $c"
  exit 1
}

# Paths of the real binaries, resolved here because the kubectl/iptables
# wrapper functions defined further down use the same names.
KUBECTL_PATH="$(command -v kubectl || true)"
IPTABLES_PATH="$(command -v iptables || true)"
# Non-empty means the wrappers run the binaries through sudo.
USE_SUDO=""

# Check the required tools, obtain sudo when not running as root, and choose
# the log directory.
#
# Globals written: USE_SUDO ("1" when not root and sudo is available),
#                  LOG_DIR (/root/netpol-diag-logs for root,
#                  ${HOME}/netpol-diag-logs otherwise)
# Exits (through require_cmd) when a required tool is missing, and when sudo
# credentials cannot be obtained.
init_runtime() {
  local tool
  for tool in kubectl iptables awk grep curl; do
    require_cmd "${tool}"
  done

  if [[ "${EUID}" -ne 0 ]] && command -v sudo >/dev/null 2>&1; then
    # Try without a prompt first; ask for the password once if that fails.
    if ! sudo -n true 2>/dev/null; then
      echo "[INFO] 需要 sudo 权限以读取 iptables / kubectl 配置。"
      if ! sudo -v; then
        # Without this the script would end here under `set -e` with no hint.
        echo "[ERR] 无法获取 sudo 权限。"
        exit 1
      fi
    fi
    USE_SUDO="1"
  fi

  # Non-root runs log under HOME to avoid /root permission problems.
  if [[ "${EUID}" -eq 0 ]]; then
    LOG_DIR="/root/netpol-diag-logs"
  else
    LOG_DIR="${HOME}/netpol-diag-logs"
  fi
}

# Single wrapper so the rest of the script never has to decide about sudo.
# Runs the binary at KUBECTL_PATH, through sudo when USE_SUDO is non-empty.
# `command` plus the fallback name keep the wrapper from calling itself when
# KUBECTL_PATH is empty.
kubectl() {
  local bin="${KUBECTL_PATH:-kubectl}"
  if [[ -n "${USE_SUDO}" ]]; then
    sudo "${bin}" "$@"
  else
    command "${bin}" "$@"
  fi
}

# Wrapper for the iptables binary at IPTABLES_PATH; goes through sudo when
# USE_SUDO is non-empty. `command` plus the fallback name keep the wrapper
# from calling itself when IPTABLES_PATH is empty.
iptables() {
  local bin="${IPTABLES_PATH:-iptables}"
  if [[ -n "${USE_SUDO}" ]]; then
    sudo "${bin}" "$@"
  else
    command "${bin}" "$@"
  fi
}

# Fetch a URL from inside the Traefik deployment with wget.
#
# Arguments: $1 - URL to fetch
# Globals:   NS_TRAEFIK, TIMEOUT (read)
# Outputs:   the response body on success; wget/kubectl's stderr on failure
#            (both on stdout, so they land in the diagnosis log)
# Returns:   0 when the fetch succeeded, 1 otherwise
probe_wget_from_traefik() {
  local url="$1"
  local out_file err_file rc=0

  # mktemp instead of fixed /tmp names: the script is typically run as root,
  # and predictable names in a world-writable directory can be pre-created
  # or symlinked by other users.
  out_file="$(mktemp)" || return 1
  err_file="$(mktemp)" || { rm -f -- "${out_file}"; return 1; }

  if kubectl exec -n "${NS_TRAEFIK}" deploy/traefik -- \
      wget -qO- --timeout="${TIMEOUT}" "${url}" >"${out_file}" 2>"${err_file}"; then
    cat -- "${out_file}"
  else
    rc=1
    cat -- "${err_file}"
  fi

  rm -f -- "${out_file}" "${err_file}"
  return "${rc}"
}

# Prompt on stdout, read one line from stdin and store it (or the default
# when the reply is empty or stdin is closed) in the named global variable.
# Arguments: $1 - variable name, $2 - prompt text, $3 - default value
_select_scene_ask() {
  local __var="$1" __prompt="$2" __default="$3" __reply=""
  printf '%s' "${__prompt}"
  # EOF must not abort the script under `set -e`; it just selects the default.
  read -r __reply || true
  printf -v "${__var}" '%s' "${__reply:-${__default}}"
}

# Fill the scene variables for one of the built-in demo applications.
# Arguments: $1 - app/service name, $2 - path prefix, $3 - pod port
_select_scene_preset() {
  NS_BACKEND="default"
  APP_NAME="$1"
  APP_LABEL="app=$1"
  SVC_NAME="$1"
  PATH_PREFIX="$2"
  POD_PORT="$3"
}

# Ask which scenario to diagnose.
#
# Globals written: CHOICE, NS_BACKEND, APP_NAME, APP_LABEL, SVC_NAME,
#                  PATH_PREFIX, POD_PORT, ENTRY_IP
# Choice 1 and 2 are presets, 3 asks for every value, anything else falls
# back to the nodejs-demo preset with a warning.
select_scene() {
  echo "请选择诊断场景:"
  echo " 1) nginx-demo (/demo, 80)"
  echo " 2) nodejs-demo (/node, 3000)"
  echo " 3) 自定义"
  _select_scene_ask CHOICE "输入序号 [1/2/3](默认 2): " "2"

  case "${CHOICE}" in
    1)
      _select_scene_preset "nginx-demo" "/demo/" "80"
      ;;
    2)
      _select_scene_preset "nodejs-demo" "/node/" "3000"
      ;;
    3)
      _select_scene_ask NS_BACKEND "后端命名空间(默认 default): " "default"
      _select_scene_ask APP_NAME "应用名(Deployment/Service 名,示例 nodejs-demo): " "nodejs-demo"
      _select_scene_ask APP_LABEL "Pod 标签选择器(默认 app=<应用名>): " "app=${APP_NAME}"
      _select_scene_ask SVC_NAME "Service 名(默认与应用名一致): " "${APP_NAME}"
      _select_scene_ask PATH_PREFIX "入口路径前缀(默认 /): " "/"
      _select_scene_ask POD_PORT "后端 Pod 端口(默认 80): " "80"
      # The prefix is appended directly to http://<ip>, so it must start
      # with a slash.
      if [[ "${PATH_PREFIX}" != /* ]]; then
        PATH_PREFIX="/${PATH_PREFIX}"
      fi
      ;;
    *)
      echo "[WARN] 无效选择,使用 nodejs-demo 默认场景。"
      _select_scene_preset "nodejs-demo" "/node/" "3000"
      ;;
  esac

  _select_scene_ask ENTRY_IP "入口 IP(用于本机 curl,默认 192.168.2.61): " "192.168.2.61"
}

init_runtime
select_scene

mkdir -p "${LOG_DIR}"
# APP_NAME can be free-form user input; keep only characters that are safe in
# a file name so a "/" or space cannot break the log path.
LOG_FILE="${LOG_DIR}/diag-$(date '+%Y%m%d-%H%M%S')-${APP_NAME//[^A-Za-z0-9._-]/_}.log"
# From here on everything printed (stdout and stderr) is also appended to the
# log file.
exec > >(tee -a "${LOG_FILE}") 2>&1

print_title "0. 诊断上下文"
echo "TIME: $(date '+%F %T %Z')"
echo "LOG_FILE=${LOG_FILE}"
echo "SCENE_APP=${APP_NAME}"
echo "SCENE_NS=${NS_BACKEND}"
echo "SCENE_LABEL=${APP_LABEL}"
echo "SCENE_SVC=${SVC_NAME}"
echo "SCENE_PATH=${PATH_PREFIX}"
echo "SCENE_POD_PORT=${POD_PORT}"
echo "ENTRY_IP=${ENTRY_IP}"
echo "HOSTNAME=$(hostname)"
# `--short` was removed from newer kubectl releases; fall back to the plain
# form so the version still shows up in the log.
kubectl version --short 2>/dev/null || safe_run kubectl version

print_title "1. 集群与 Traefik 基线"
safe_run kubectl get nodes -o wide
safe_run kubectl get deploy -n "${NS_TRAEFIK}" traefik -o wide
safe_run kubectl get svc -n "${NS_TRAEFIK}" traefik -o wide
safe_run kubectl get pod -n "${NS_TRAEFIK}" -l "${APP_TRAEFIK_LABEL}" -o wide
kubectl get pods -n kube-system -o wide | grep -E 'kube-router|flannel|traefik|svclb-traefik' || true

# Name and IP of the first Traefik pod; empty when none is found.
TRAEFIK_POD="$(kubectl get pod -n "${NS_TRAEFIK}" -l "${APP_TRAEFIK_LABEL}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
TRAEFIK_IP="$(kubectl get pod -n "${NS_TRAEFIK}" -l "${APP_TRAEFIK_LABEL}" -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || true)"

echo "--- 1.1 kube-proxy 基线 ---"
safe_run kubectl get pod -n kube-system -l k8s-app=kube-proxy -o wide
safe_run kubectl get configmap -n kube-system kube-proxy -o yaml

KPROXY_POD="$(kubectl get pod -n kube-system -l k8s-app=kube-proxy -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)"
if [[ -n "${KPROXY_POD}" ]]; then
  echo "--- 1.2 kube-proxy 日志关键字(error|conntrack|iptables|ipvs|nft)---"
  # Same tail length as the Traefik logs (LOG_TAIL) instead of a second
  # hard-coded 200.
  kubectl logs -n kube-system "${KPROXY_POD}" --tail="${LOG_TAIL}" | grep -Ei "error|fail|conntrack|iptables|ipvs|nft|sync" || true
else
  echo "[WARN] 未找到 kube-proxy Pod(K3s 某些模式可忽略)"
fi

print_title "2. 业务资源采集"
safe_run kubectl get deploy -n "${NS_BACKEND}" "${APP_NAME}" -o wide
safe_run kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o wide
safe_run kubectl get endpoints -n "${NS_BACKEND}" "${SVC_NAME}" -o wide
safe_run kubectl get endpointslice -n "${NS_BACKEND}" -l kubernetes.io/service-name="${SVC_NAME}" -o wide
safe_run kubectl get pod -n "${NS_BACKEND}" -l "${APP_LABEL}" -o wide
safe_run kubectl get pod -n "${NS_BACKEND}" -l "${APP_LABEL}" --show-labels
safe_run kubectl get ingress -n "${NS_BACKEND}"
safe_run kubectl get ingressroute -n "${NS_BACKEND}"
safe_run kubectl get networkpolicy -n "${NS_BACKEND}"
safe_run kubectl get networkpolicy -n "${NS_TRAEFIK}"
safe_run kubectl get ns "${NS_BACKEND}" "${NS_TRAEFIK}" --show-labels

BACKEND_POD_IP="$(kubectl get pod -n "${NS_BACKEND}" -l "${APP_LABEL}" -o jsonpath='{.items[0].status.podIP}' 2>/dev/null || true)"
SVC_IP="$(kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o jsonpath='{.spec.clusterIP}' 2>/dev/null || true)"
# `|| true`: with pipefail + errexit a missing Service/Endpoints object would
# otherwise end the script right here, before the report is produced.
EP_COUNT="$(kubectl get endpoints -n "${NS_BACKEND}" "${SVC_NAME}" -o jsonpath='{.subsets[*].addresses[*].ip}' 2>/dev/null | awk '{print NF}' || true)"
EP_COUNT="${EP_COUNT:-0}"

echo "TRAEFIK_POD=${TRAEFIK_POD:-<none>}"
echo "TRAEFIK_IP=${TRAEFIK_IP:-<none>}"
echo "BACKEND_POD_IP=${BACKEND_POD_IP:-<none>}"
echo "SVC_IP=${SVC_IP:-<none>}"
echo "ENDPOINTS_COUNT=${EP_COUNT}"
echo "SERVICE_SELECTOR=$(kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o jsonpath='{.spec.selector}' 2>/dev/null || echo '{}')"

echo "--- 2.1 EndpointSlice 条件(ready/serving/terminating/node)---"
kubectl get endpointslice -n "${NS_BACKEND}" -l kubernetes.io/service-name="${SVC_NAME}" \
  -o jsonpath='{range .items[*]}{"slice="}{.metadata.name}{"\n"}{range .endpoints[*]}{" addr="}{.addresses[0]}{" ready="}{.conditions.ready}{" serving="}{.conditions.serving}{" terminating="}{.conditions.terminating}{" node="}{.nodeName}{"\n"}{end}{end}' \
  || true

print_title "3. 主链路连通性探测"
echo "--- 3.1 本机 -> 入口 (${ENTRY_IP}${PATH_PREFIX}) ---"
# mktemp instead of fixed /tmp/netpol_client.* names: predictable files in a
# world-writable directory are unsafe when the script runs as root.
client_body="$(mktemp)"
client_err="$(mktemp)"
if CODE="$(curl -sS -m "${TIMEOUT}" -o "${client_body}" -w "%{http_code}" "http://${ENTRY_IP}${PATH_PREFIX}" 2>"${client_err}")"; then
  echo "HTTP_CODE=${CODE}"
  echo "BODY_PREVIEW:"
  head -c 200 "${client_body}" || true
  echo
  PROBE_CLIENT="OK_${CODE}"
else
  cat "${client_err}"
  PROBE_CLIENT="FAIL"
fi
rm -f -- "${client_body}" "${client_err}"

# Probe the port the Service actually exposes; fall back to 80 when it cannot
# be read.
SVC_PORT="$(kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o jsonpath='{.spec.ports[0].port}' 2>/dev/null || true)"
SVC_PORT="${SVC_PORT:-80}"

if [[ -n "${TRAEFIK_POD}" && -n "${SVC_IP}" ]]; then
  echo
  echo "--- 3.2 Traefik -> ServiceIP (${SVC_IP}:${SVC_PORT}) ---"
  if probe_wget_from_traefik "http://${SVC_IP}:${SVC_PORT}"; then
    PROBE_TRAEFIK_TO_SVC="OK"
  else
    PROBE_TRAEFIK_TO_SVC="FAIL"
  fi
else
  echo "[SKIP] 缺少 Traefik Pod 或 ServiceIP。"
fi

if [[ -n "${TRAEFIK_POD}" ]]; then
  echo
  echo "--- 3.3 Traefik -> Service DNS (${SVC_NAME}.${NS_BACKEND}.svc.cluster.local:${SVC_PORT}) ---"
  if probe_wget_from_traefik "http://${SVC_NAME}.${NS_BACKEND}.svc.cluster.local:${SVC_PORT}"; then
    PROBE_TRAEFIK_DNS="OK"
  else
    PROBE_TRAEFIK_DNS="FAIL"
  fi
else
  echo "[SKIP] 未找到 Traefik Pod。"
fi

if [[ -n "${TRAEFIK_POD}" && -n "${BACKEND_POD_IP}" ]]; then
  echo
  echo "--- 3.4 Traefik -> PodIP (${BACKEND_POD_IP}:${POD_PORT}) ---"
  if probe_wget_from_traefik "http://${BACKEND_POD_IP}:${POD_PORT}"; then
    PROBE_TRAEFIK_TO_POD="OK"
  else
    PROBE_TRAEFIK_TO_POD="FAIL"
  fi
else
  echo "[SKIP] 缺少 Traefik Pod 或后端 PodIP。"
fi

print_title "4. 路由与配置详情"
echo "--- 4.1 Ingress ---"
safe_run kubectl get ingress -n "${NS_BACKEND}" -o yaml
echo "--- 4.2 IngressRoute ---"
safe_run kubectl get ingressroute -n "${NS_BACKEND}" -o yaml
echo "--- 4.3 Service / Endpoints ---"
safe_run kubectl get svc -n "${NS_BACKEND}" "${SVC_NAME}" -o yaml
safe_run kubectl get endpoints -n "${NS_BACKEND}" "${SVC_NAME}" -o yaml
safe_run kubectl describe svc -n "${NS_BACKEND}" "${SVC_NAME}"
echo "--- 4.4 相关 NetworkPolicy(kube-system + backend)---"
safe_run kubectl get networkpolicy -n "${NS_TRAEFIK}" -o yaml
safe_run kubectl get networkpolicy -n "${NS_BACKEND}" -o yaml
echo "--- 4.5 近期事件(backend + kube-system)---"
safe_run kubectl get events -n "${NS_BACKEND}" --sort-by=.lastTimestamp
safe_run kubectl get events -n kube-system --sort-by=.lastTimestamp

print_title "5. Traefik 日志(最近 ${LOG_SINCE},最多 ${LOG_TAIL} 行)"
# Fetch the log window once and reuse it, so the full dump and both filtered
# views show the same snapshot and the API is queried a single time.
TRAEFIK_LOGS="$(kubectl logs -n "${NS_TRAEFIK}" deploy/traefik --since="${LOG_SINCE}" --tail="${LOG_TAIL}" 2>&1 || true)"
printf '%s\n' "${TRAEFIK_LOGS}"
echo "--- 5.1 关键字过滤(404|502|503|router|service|middleware|upstream|${SVC_NAME}|${PATH_PREFIX}) ---"
grep -Ei "404|502|503|router|service|middleware|upstream|endpoint|${SVC_NAME}|${PATH_PREFIX}" <<<"${TRAEFIK_LOGS}" || true
echo "--- 5.2 Traefik 访问日志候选(status=404/502/503) ---"
grep -E "\" 404 |\" 502 |\" 503 " <<<"${TRAEFIK_LOGS}" || true
echo "--- 5.3 Traefik 上一次容器日志(若重启过) ---"
safe_run kubectl logs -n "${NS_TRAEFIK}" deploy/traefik --previous --tail=100

print_title "6. 防火墙与数据平面"

# Run a command as root: through sudo when USE_SUDO is set, directly
# otherwise. Used here for firewall-cmd, ufw, ipset and conntrack, which the
# sudo-aware kubectl/iptables wrappers do not cover.
as_root() {
  if [[ -n "${USE_SUDO}" ]]; then
    sudo "$@"
  else
    "$@"
  fi
}

# Print the KUBE-POD-FW-* chain referenced by the first KUBE-ROUTER-FORWARD
# rule that mentions the given pod IP; prints nothing when there is none.
# Arguments: $1 - pod IP
# The line is scanned field by field, so the result does not depend on which
# column the target is printed in, and the IP has to match a whole field
# (10.42.0.1 no longer matches 10.42.0.10). `|| true` keeps a missing chain
# from ending the script under pipefail + errexit.
find_pod_fw_chain() {
  local pod_ip="$1"
  iptables -L KUBE-ROUTER-FORWARD -n -v --line-numbers 2>/dev/null \
    | awk -v ip="${pod_ip}" '
        {
          hit = 0
          chain = ""
          for (i = 1; i <= NF; i++) {
            if ($i == ip || $i == ip "/32") hit = 1
            if (chain == "" && $i ~ /^KUBE-POD-FW-/) chain = $i
          }
          if (hit && chain != "") { print chain; exit }
        }' || true
}

echo "--- 6.1 防火墙状态 ---"
if command -v firewall-cmd >/dev/null 2>&1; then
  safe_run as_root firewall-cmd --state
  safe_run as_root firewall-cmd --list-all
else
  echo "firewall-cmd: not found"
fi

if command -v ufw >/dev/null 2>&1; then
  safe_run as_root ufw status verbose
else
  echo "ufw: not found"
fi

echo "--- 6.2 FORWARD 与 KUBE-ROUTER-FORWARD ---"
safe_run iptables -L FORWARD -n -v --line-numbers
safe_run iptables -L KUBE-ROUTER-FORWARD -n -v --line-numbers
echo "--- 6.2.1 NAT 链(KUBE-SERVICES)---"
safe_run iptables -t nat -L KUBE-SERVICES -n -v --line-numbers
if [[ -n "${SVC_IP}" ]]; then
  echo "--- 6.2.2 NAT 链中 ServiceIP 相关规则 (${SVC_IP}) ---"
  # -F: the IP is a literal string, not a regular expression.
  iptables -t nat -S | grep -F -- "${SVC_IP}" || true
fi

TRAEFIK_CHAIN=""
BACKEND_CHAIN=""
if [[ -n "${TRAEFIK_IP}" ]]; then
  TRAEFIK_CHAIN="$(find_pod_fw_chain "${TRAEFIK_IP}")"
fi
if [[ -n "${BACKEND_POD_IP}" ]]; then
  BACKEND_CHAIN="$(find_pod_fw_chain "${BACKEND_POD_IP}")"
fi

echo "TRAEFIK_CHAIN=${TRAEFIK_CHAIN:-<not found>}"
echo "BACKEND_CHAIN=${BACKEND_CHAIN:-<not found>}"

if [[ -n "${TRAEFIK_CHAIN}" ]]; then
  echo "--- 6.3 Traefik Pod 链 ${TRAEFIK_CHAIN} ---"
  safe_run iptables -L "${TRAEFIK_CHAIN}" -n -v -x
  echo "--- 6.4 Traefik Pod 链规则定义 ---"
  safe_run iptables -S "${TRAEFIK_CHAIN}"
fi

if [[ -n "${BACKEND_CHAIN}" ]]; then
  echo "--- 6.5 Backend Pod 链 ${BACKEND_CHAIN} ---"
  safe_run iptables -L "${BACKEND_CHAIN}" -n -v -x
  echo "--- 6.6 Backend Pod 链规则定义 ---"
  safe_run iptables -S "${BACKEND_CHAIN}"
fi

echo "--- 6.7 ipset(KUBE-SRC/KUBE-DST)---"
if command -v ipset >/dev/null 2>&1; then
  as_root ipset list -n | grep -E '^KUBE-(SRC|DST)-' || true
else
  echo "ipset: not found"
fi

echo "--- 6.8 conntrack(容量与关键连接)---"
if command -v conntrack >/dev/null 2>&1; then
  safe_run as_root conntrack -S
  safe_run sysctl net.netfilter.nf_conntrack_count
  safe_run sysctl net.netfilter.nf_conntrack_max
  if [[ -n "${SVC_IP}" ]]; then
    echo "conntrack by service ip (${SVC_IP}):"
    as_root conntrack -L -d "${SVC_IP}" 2>/dev/null | head -n 100 || true
  fi
  if [[ -n "${BACKEND_POD_IP}" ]]; then
    echo "conntrack by backend pod ip (${BACKEND_POD_IP}):"
    as_root conntrack -L -d "${BACKEND_POD_IP}" 2>/dev/null | head -n 100 || true
  fi
else
  echo "conntrack: not found"
fi

print_title "7. 自动判读(502/503/404)"
echo "探测结果:"
echo " CLIENT=${PROBE_CLIENT}"
echo " TRAEFIK_TO_SVC=${PROBE_TRAEFIK_TO_SVC}"
echo " TRAEFIK_DNS=${PROBE_TRAEFIK_DNS}"
echo " TRAEFIK_TO_POD=${PROBE_TRAEFIK_TO_POD}"

if [[ "${EP_COUNT}" == "0" ]]; then
  echo "- [高概率 503] Service 无可用 Endpoints。检查 Deployment 是否 Ready、selector 是否匹配。"
fi

if [[ "${PROBE_CLIENT}" == OK_404* ]]; then
  echo "- [高概率 404] 入口路由未命中。检查 Ingress/IngressRoute 的 path、host、middleware。"
fi

if [[ "${PROBE_CLIENT}" == OK_503* ]]; then
  echo "- [高概率 503] 入口已命中但后端不可用。优先看 Endpoints/EndpointSlice 条件与 Traefik 日志。"
fi

if [[ "${PROBE_TRAEFIK_TO_SVC}" == "FAIL" ]]; then
  echo "- [高概率 502/503] Traefik 到 Service 不通。优先检查 NetworkPolicy、kube-router 链、DNS 53 放行。"
fi

if [[ "${PROBE_TRAEFIK_DNS}" == "FAIL" && "${PROBE_TRAEFIK_TO_SVC}" == "FAIL" ]]; then
  echo "- [可能 DNS/服务发现问题] Traefik 到 Service DNS 与 ServiceIP 都失败。检查 CoreDNS、kube-system egress 53。"
fi

# The clearest DNS signal: the same Service answers by IP but not by name.
if [[ "${PROBE_TRAEFIK_DNS}" == "FAIL" && "${PROBE_TRAEFIK_TO_SVC}" == "OK" ]]; then
  echo "- [高概率 DNS 问题] ServiceIP 可达但 Service DNS 访问失败。检查 CoreDNS、kube-system egress 53。"
fi

if [[ "${PROBE_TRAEFIK_TO_SVC}" == "OK" && "${PROBE_CLIENT}" == "FAIL" ]]; then
  echo "- [可能入口层问题] 集群内后端可达,但入口访问失败。检查控制节点防火墙、Traefik Service 暴露端口、外部路由。"
fi

if [[ "${PROBE_TRAEFIK_TO_SVC}" == "OK" && "${PROBE_TRAEFIK_TO_POD}" == "FAIL" ]]; then
  echo "- [已知行为候选] Service 可达但 PodIP 直连失败,常见于 kube-router 同节点桥接路径。"
fi

echo
echo "下一步建议:"
echo "1) 先修复 Endpoints=0 / 404 路由不匹配。"
echo "2) 再看 Traefik -> Service 探测与 NetworkPolicy 命中。"
echo "3) 最后结合 KUBE-ROUTER-FORWARD、Pod 链、ipset 判断是否为 kube-router 行为问题。"
echo
echo "日志已保存:${LOG_FILE}"
148
scripts/diag/recovery/k3s-recovery-reset.sh
Normal file
148
scripts/diag/recovery/k3s-recovery-reset.sh
Normal file
@@ -0,0 +1,148 @@
|
||||
#!/bin/bash
# Interactive recovery helper, separate from the diagnosis scripts. Everything
# printed is also appended to a timestamped log file.
set -euo pipefail

# Root logs under /root; other users log under HOME, because they cannot
# create /root/netpol-diag-logs and mkdir would end the script at once.
if [[ "${EUID}" -eq 0 ]]; then
  LOG_DIR="/root/netpol-diag-logs"
else
  LOG_DIR="${HOME}/netpol-diag-logs"
fi
mkdir -p "${LOG_DIR}"
LOG_FILE="${LOG_DIR}/recovery-$(date '+%Y%m%d-%H%M%S').log"
exec > >(tee -a "${LOG_FILE}") 2>&1

# Log helpers: print the arguments prefixed with a severity tag.
info() {
  echo "[INFO] $*"
}

warn() {
  echo "[WARN] $*"
}

# Ask for a single confirmation.
# Arguments: $1 - question shown to the user
# Returns:   0 only when the user typed exactly YES, non-zero otherwise
#            (including when stdin is closed)
confirm_once() {
  local prompt="$1"
  local answer=""
  printf "%s (输入 YES 继续): " "${prompt}"
  read -r answer
  [[ "${answer}" == "YES" ]]
}

# Two-step confirmation for high-risk actions.
# Both questions are always asked; the result is true only when the first
# answer is exactly YES and the second exactly RESET.
confirm_twice_high_risk() {
  local answer1=""
  local answer2=""
  printf "高风险操作,第一次确认:输入 YES 执行: "
  read -r answer1
  printf "第二次确认:输入 RESET 执行: "
  read -r answer2
  [[ "${answer1}" == "YES" && "${answer2}" == "RESET" ]]
}

# Delete the demo objects in the default and kube-system namespaces.
# Every delete is best-effort (--ignore-not-found plus `|| true`), so a
# missing object or resource kind does not stop the remaining deletes.
# Uses the info helper defined elsewhere in this script.
cleanup_demo_resources() {
  info "清理 demo 资源(default + kube-system)"

  kubectl delete ingress -n default nginx-demo nodejs-demo --ignore-not-found || true
  kubectl delete ingressroute -n default nginx-demo nodejs-demo --ignore-not-found || true
  kubectl delete middleware -n default nginx-demo-stripprefix nodejs-demo-stripprefix --ignore-not-found || true
  kubectl delete service -n default nginx-demo nodejs-demo --ignore-not-found || true
  kubectl delete deployment -n default nginx-demo nodejs-demo --ignore-not-found || true
  kubectl delete networkpolicy -n default allow-traefik-to-nginx allow-traefik-to-nodejs --ignore-not-found || true
  kubectl delete networkpolicy -n kube-system allow-traefik-egress-to-services --ignore-not-found || true

  info "demo 资源清理完成"
}

# Restart traefik, coredns and any kube-proxy DaemonSet in kube-system, then
# wait up to 180s each for traefik and coredns to roll out.
# Every step is best-effort. The DaemonSet listing is read through process
# substitution so that its failure cannot end the script under
# pipefail + errexit, which the former `kubectl get ... | while` pipeline did.
# Uses the info helper defined elsewhere in this script.
restart_key_components() {
  local ds=""

  info "重启关键组件"
  kubectl rollout restart deployment -n kube-system traefik || true
  kubectl rollout restart deployment -n kube-system coredns || true
  while IFS= read -r ds; do
    [[ -n "${ds}" ]] || continue
    # </dev/null: keep the command from consuming the list being iterated.
    kubectl rollout restart -n kube-system "${ds}" </dev/null || true
  done < <(kubectl get ds -n kube-system -l k8s-app=kube-proxy -o name || true)

  info "等待关键组件状态"
  kubectl rollout status deployment/traefik -n kube-system --timeout=180s || true
  kubectl rollout status deployment/coredns -n kube-system --timeout=180s || true
}

# Print (never execute) the suggested manual commands for cleaning up
# kube-router iptables rules and ipsets. The here-document delimiter is
# quoted, so $(date ...) is printed literally for the operator to run.
# Uses the warn helper defined elsewhere in this script.
network_rules_guidance() {
  warn "该步骤仅打印建议命令,不自动执行。"
  cat <<'EOF'
建议在控制节点人工执行并逐条确认:

# 1) 备份当前规则
iptables-save > /root/iptables-backup-$(date +%F-%H%M%S).txt

# 2) 查看 KUBE-ROUTER 相关链(确认后再清理)
iptables-save | grep KUBE-ROUTER || true

# 3) 若你明确要清理 kube-router 规则(高风险)
# iptables-save | grep -v KUBE-ROUTER | iptables-restore

# 4) 查看并清理相关 ipset(高风险,按需逐个)
# ipset list -n | grep '^KUBE-'
# ipset destroy <set-name>

EOF
}

# Print the manual K3s rebuild steps. Nothing is executed.
print_rebuild_runbook() {
  cat <<'EOF'
K3s 重建步骤(只输出,不自动执行):

1) 在 server 节点卸载:
/usr/local/bin/k3s-uninstall.sh

2) 在 agent 节点卸载:
/usr/local/bin/k3s-agent-uninstall.sh

3) 清理残留目录(确认后):
rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/cni /opt/cni

4) 重新安装 server(带你当前需要的参数)
5) 重新 join agent
6) 先部署 04-1 / 04-2 / 04-3,再到 04-4 / 04-5
7) 最后用 /root/check-nodejs-netpol.sh 复测
EOF
}

# Print the main menu followed by the selection prompt (no trailing newline).
show_menu() {
  echo
  echo "===== K3s 恢复脚本(独立于诊断)====="
  echo "1) 仅清理 demo 资源(低风险)"
  echo "2) 清理 demo + 重启关键组件(中风险)"
  echo "3) 高风险网络规则清理(双重确认,默认仅打印建议)"
  echo "4) 输出完整重建步骤(不自动执行)"
  echo "0) 退出"
  printf "请选择: "
}

# Menu loop: show the menu, read a choice and run the matching action until
# the user picks 0 or stdin ends.
# Globals:  LOG_FILE (read)
# Uses show_menu, the confirm_* helpers and the action functions defined
# elsewhere in this script.
main() {
  local choice=""

  info "日志文件: ${LOG_FILE}"
  while true; do
    show_menu
    # End of input (closed stdin, Ctrl-D) leaves the loop the same way as
    # option 0 instead of ending the script silently under `set -e`.
    if ! read -r choice && [[ -z "${choice}" ]]; then
      info "退出。日志已保存:${LOG_FILE}"
      break
    fi

    case "${choice}" in
      1)
        if confirm_once "确认执行“仅清理 demo 资源”吗?"; then
          cleanup_demo_resources
        else
          warn "已取消"
        fi
        ;;
      2)
        if confirm_once "确认执行“清理 demo + 重启关键组件”吗?"; then
          cleanup_demo_resources
          restart_key_components
        else
          warn "已取消"
        fi
        ;;
      3)
        if confirm_twice_high_risk; then
          network_rules_guidance
        else
          warn "高风险操作已取消"
        fi
        ;;
      4)
        print_rebuild_runbook
        ;;
      0)
        info "退出。日志已保存:${LOG_FILE}"
        break
        ;;
      *)
        warn "无效选项"
        ;;
    esac
  done
}

# Script entry point.
main "$@"
Reference in New Issue
Block a user