Why an SNAT failure shows up as a timeout
intro
In an earlier test, the SNAT source rewrite was strictly pinned to one specific IP+PORT: all TCP traffic destined for the local address 127.0.0.1:8080 was SNATed to source 127.0.0.1:40000.
tsecer@harry: sudo iptables -t nat -A POSTROUTING -p tcp -d 127.0.0.1 --dport 8080 -j SNAT --to-source 127.0.0.1:40000
Intuitively, this configuration is obviously broken: because the choice of SNAT source is restricted to a single tuple, at most one connection can be NATed.
Unsurprisingly, testing confirms exactly that: only one connection succeeds; every later attempt sends its SYN, sits in SYN_SENT, and eventually times out because no reply ever arrives.
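For reference, a minimal way to reproduce this (a sketch, assuming an OpenBSD-style nc; any client/server pair on 127.0.0.1:8080 would do):

tsecer@harry: nc -lk 127.0.0.1 8080 &
tsecer@harry: nc 127.0.0.1 8080 &     # first connection: gets the one available mapping
tsecer@harry: nc 127.0.0.1 8080       # second connection: hangs, then times out
tsecer@harry: ss -tan | grep SYN-SENT # the stuck socket stays in SYN-SENT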
Since the connecting socket is in SYN_SENT, a socket was allocated and the handshake packet was handed off for transmission; yet tcpdump never captures this packet leaving the host, which means it was dropped somewhere inside the machine.
The natural question, then: where did that SYN go?
allocation
The implementation has one noteworthy detail: it first writes the candidate into the tuple, and only then checks whether it is in use, returning if it is free. Put the other way around: if every candidate is occupied, the function returns with the last tried (already occupied) address (IP + PORT) left in the tuple.
another_round:
	for (i = 0; i < attempts; i++, off++) {
		*keyptr = htons(min + off % range_size);
		if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
			return;
	}
xt_snat_target_v2>>nf_nat_setup_info>>get_unique_tuple>>nf_nat_l4proto_unique_tuple
/* Alter the per-proto part of the tuple (depending on maniptype), to
 * give a unique tuple in the given range if possible.
 *
 * Per-protocol part of tuple is initialized to the incoming packet.
 */
static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
					const struct nf_nat_range2 *range,
					enum nf_nat_manip_type maniptype,
					const struct nf_conn *ct)
{
	unsigned int range_size, min, max, i, attempts;
	__be16 *keyptr;
	u16 off;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		/* id is same for either direction... */
		keyptr = &tuple->src.u.icmp.id;
		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 0;
			range_size = 65536;
		} else {
			min = ntohs(range->min_proto.icmp.id);
			range_size = ntohs(range->max_proto.icmp.id) -
				     ntohs(range->min_proto.icmp.id) + 1;
		}
		goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
	case IPPROTO_GRE:
		/* If there is no master conntrack we are not PPTP,
		   do not change tuples */
		if (!ct->master)
			return;

		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.gre.key;
		else
			keyptr = &tuple->dst.u.gre.key;

		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 1;
			range_size = 65535;
		} else {
			min = ntohs(range->min_proto.gre.key);
			range_size = ntohs(range->max_proto.gre.key) - min + 1;
		}
		goto find_free_id;
#endif
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
	case IPPROTO_DCCP:
		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.all;
		else
			keyptr = &tuple->dst.u.all;
		break;
	default:
		return;
	}

	/* If no range specified... */
	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
		/* If it's dst rewrite, can't change port */
		if (maniptype == NF_NAT_MANIP_DST)
			return;

		if (ntohs(*keyptr) < 1024) {
			/* Loose convention: >> 512 is credential passing */
			if (ntohs(*keyptr) < 512) {
				min = 1;
				range_size = 511 - min + 1;
			} else {
				min = 600;
				range_size = 1023 - min + 1;
			}
		} else {
			min = 1024;
			range_size = 65535 - 1024 + 1;
		}
	} else {
		min = ntohs(range->min_proto.all);
		max = ntohs(range->max_proto.all);
		if (unlikely(max < min))
			swap(max, min);
		range_size = max - min + 1;
	}

find_free_id:
	if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
		off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
	else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) ||
		 maniptype != NF_NAT_MANIP_DST)
		off = get_random_u16();
	else
		off = 0;

	attempts = range_size;
	if (attempts > NF_NAT_MAX_ATTEMPTS)
		attempts = NF_NAT_MAX_ATTEMPTS;

	/* We are in softirq; doing a search of the entire range risks
	 * soft lockup when all tuples are already used.
	 *
	 * If we can't find any free port from first offset, pick a new
	 * one and try again, with ever smaller search window.
	 */
another_round:
	for (i = 0; i < attempts; i++, off++) {
		*keyptr = htons(min + off % range_size);
		if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
			return;
	}

	if (attempts >= range_size || attempts < 16)
		return;
	attempts /= 2;
	off = get_random_u16();
	goto another_round;
}
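To make the exhaustion behavior concrete, here is a toy user-space model of the search loop (hypothetical names, not kernel code; tuple_used() stands in for nf_nat_used_tuple_harder()). With the range collapsed to the single port 40000 and that port already taken, the function returns with the occupied port still written into the tuple:

#include <stdbool.h>
#include <stdio.h>

/* Every port in the range is already "used", mimicking the effect of
 * the single-tuple SNAT rule once one connection is established. */
static bool tuple_used(unsigned short port)
{
	return true;
}

static unsigned short pick_source_port(unsigned short min,
				       unsigned int range_size,
				       unsigned int attempts,
				       unsigned int off)
{
	unsigned short port = 0;
	unsigned int i;

	for (i = 0; i < attempts; i++, off++) {
		port = min + off % range_size; /* candidate written first */
		if (!tuple_used(port))
			return port;           /* found a free one */
	}
	/* Exhausted: return anyway -- the caller is left holding the
	 * last tried, already-occupied port, i.e. a duplicate tuple. */
	return port;
}

int main(void)
{
	/* range collapsed to one port, like --to-source 127.0.0.1:40000 */
	printf("selected (duplicate) port: %u\n",
	       pick_source_port(40000, 1, 1, 0)); /* prints 40000 */
	return 0;
}

There is no error signal for the caller; the duplicate tuple simply flows on toward confirmation, which is where the story continues below.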
confirmation
Does allocating an IP+PORT above mean conntrack has actually completed the SNAT? Clearly not, because letting such a connection establish would itself be a logic error. Take the test SNAT rule at the beginning as an example: if a second connection were allowed, then when a reply packet arrived its five-tuple (proto, srcip, srcport, dstip, dstport) would be identical for both connections, namely (tcp, 127.0.0.1, 8080, 127.0.0.1, 40000), and there would be no way to tell which local socket (port) on this host it belongs to.
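A quick sketch of that ambiguity with concrete values (illustrative ephemeral ports; a simplified tuple type, not the kernel's):

#include <stdio.h>

/* Simplified five-tuple, for illustration only */
struct five_tuple {
	const char *proto, *saddr, *daddr;
	unsigned short sport, dport;
};

/* The reply direction is the post-SNAT tuple with src/dst swapped */
static struct five_tuple reply_of(struct five_tuple t)
{
	struct five_tuple r = { t.proto, t.daddr, t.saddr, t.dport, t.sport };
	return r;
}

int main(void)
{
	/* two distinct client sockets (arbitrary ephemeral ports)... */
	struct five_tuple c1 = { "tcp", "127.0.0.1", "127.0.0.1", 51000, 8080 };
	struct five_tuple c2 = { "tcp", "127.0.0.1", "127.0.0.1", 51001, 8080 };

	/* ...both forced to source port 40000 by the SNAT rule */
	c1.sport = c2.sport = 40000;

	/* the reply tuples collapse to the same value, so a reply can
	 * no longer be attributed to one of the two sockets */
	struct five_tuple r1 = reply_of(c1), r2 = reply_of(c2);
	printf("%s %s:%u->%s:%u vs %s %s:%u->%s:%u\n",
	       r1.proto, r1.saddr, r1.sport, r1.daddr, r1.dport,
	       r2.proto, r2.saddr, r2.sport, r2.daddr, r2.dport);
	return 0;
}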
The nf_conntrack module (nf_conntrack_proto.c) registers the nf_confirm hook on the NF_INET_POST_ROUTING chain (and on NF_INET_LOCAL_IN, as the table below shows).
///@file: nf_conntrack_proto.c
/* Connection tracking may drop packets, but never alters them, so
 * make it the first hook.
 */
static const struct nf_hook_ops ipv4_conntrack_ops[] = {
	{
		.hook		= ipv4_conntrack_in,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_PRE_ROUTING,
		.priority	= NF_IP_PRI_CONNTRACK,
	},
	{
		.hook		= ipv4_conntrack_local,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP_PRI_CONNTRACK,
	},
	{
		.hook		= nf_confirm,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_POST_ROUTING,
		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
	},
	{
		.hook		= nf_confirm,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
	},
};
NF_IP_PRI_CONNTRACK_CONFIRM is the lowest priority (the largest numeric value, INT_MAX), so nf_confirm runs only after every other target on the chain has executed.
enum nf_ip_hook_priorities {
	NF_IP_PRI_FIRST = INT_MIN,
	NF_IP_PRI_RAW_BEFORE_DEFRAG = -450,
	NF_IP_PRI_CONNTRACK_DEFRAG = -400,
	NF_IP_PRI_RAW = -300,
	NF_IP_PRI_SELINUX_FIRST = -225,
	NF_IP_PRI_CONNTRACK = -200,
	NF_IP_PRI_MANGLE = -150,
	NF_IP_PRI_NAT_DST = -100,
	NF_IP_PRI_FILTER = 0,
	NF_IP_PRI_SECURITY = 50,
	NF_IP_PRI_NAT_SRC = 100,
	NF_IP_PRI_SELINUX_LAST = 225,
	NF_IP_PRI_CONNTRACK_HELPER = 300,
	NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
	NF_IP_PRI_LAST = INT_MAX,
};
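The hooks of a chain run in ascending numeric priority, so the SNAT target (NF_IP_PRI_NAT_SRC = 100) has already rewritten the tuple by the time nf_confirm (INT_MAX) runs. A stand-alone model of that ordering (a sketch, not the kernel's actual hook registration):

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy model: netfilter keeps each chain's hooks sorted by ascending
 * priority and traverses them in that order. */
struct hook {
	const char *name;
	int priority;
};

static int by_priority(const void *a, const void *b)
{
	int pa = ((const struct hook *)a)->priority;
	int pb = ((const struct hook *)b)->priority;

	return (pa > pb) - (pa < pb);
}

int main(void)
{
	/* the POST_ROUTING chain as configured above */
	struct hook chain[] = {
		{ "nf_confirm (CONNTRACK_CONFIRM)", INT_MAX },
		{ "SNAT target (NAT_SRC)",          100     },
		{ "mangle table (MANGLE)",          -150    },
	};
	size_t i, n = sizeof(chain) / sizeof(chain[0]);

	qsort(chain, n, sizeof(chain[0]), by_priority);
	for (i = 0; i < n; i++)	/* mangle, then SNAT, then nf_confirm */
		printf("%11d  %s\n", chain[i].priority, chain[i].name);
	return 0;
}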
When a clashing entry already exists, nf_ct_resolve_clash is called to try to resolve the clash.
nf_confirm>>nf_conntrack_confirm>>__nf_conntrack_confirm
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	///...
	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen) {
chaintoolong:
			NF_CT_STAT_INC(net, chaintoolong);
			NF_CT_STAT_INC(net, insert_failed);
			ret = NF_DROP;
			goto dying;
		}
	}
	///...
out:
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}
In practice nf_ct_resolve_clash usually cannot resolve the clash (for TCP it cannot even try, as shown below), so it returns NF_DROP and increments the drop and insert_failed counters along the way.
drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
In other words, the packet (struct sk_buff *skb) is dropped and no conntrack entry is created.
/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the assocated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry.  The new entry will also have a fixed timeout so it expires --
 * due to the collision, it will only see reply traffic.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}
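The early exit on !l4proto->allow_clash is the decisive branch for TCP: as far as I know only datagram-style trackers opt into clash resolution. For instance, the UDP tracker sets the flag in its l4proto definition (a sketch from memory; surrounding fields omitted and may differ across kernel versions):

///@file: nf_conntrack_proto_udp.c
const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = {
	.l4proto	= IPPROTO_UDP,
	.allow_clash	= true,
	/* ... */
};

The TCP tracker never sets allow_clash, so for our duplicated SYN the function falls through to the drop label immediately.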
verification
Watching the output of conntrack -S, the insert_failed/drop counters on cpu=15 keep climbing, from 1 all the way to 7, and the interval between increments keeps growing. This matches the doubling (exponential backoff) retransmission policy for the SYN packets of a TCP connection attempt.
tsecer@harry: watch conntrack -S
tsecer@harry:
cpu=0 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=1 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=2 found=0 invalid=0 ignore=2 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=3 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=4 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=2
cpu=5 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=6 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=7 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=8 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=9 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=10 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=11 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=12 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=13 found=0 invalid=6 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=14 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=15 found=7 invalid=0 ignore=0 insert=0 insert_failed=7 drop=7 early_drop=0 error=0 search_restart=0
cpu=16 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=1
cpu=17 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=18 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=19 found=0 invalid=0 ignore=2 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=20 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=21 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=22 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=23 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=2
cpu=24 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=25 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=26 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=27 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=28 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=29 found=0 invalid=0 ignore=3 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=30 found=0 invalid=6 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=31 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
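The growing intervals line up with TCP's RTO doubling. Assuming the default net.ipv4.tcp_syn_retries = 6 and an initial RTO of 1 second, the SYNs leave at roughly t = 0, 1, 3, 7, 15, 31 and 63 seconds, and the connect() attempt aborts at about t = 127 seconds; seven dropped SYNs in total, which is exactly the counter climbing from 1 to 7.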
netstat -s shows TCPTimeouts: 7, matching the seven SYN transmissions during connection setup; together these seven retransmission timeouts account for the single connection that failed due to timeout (1 connections aborted due to timeout).
tsecer@harry: netstat -s
Ip:
Forwarding: 2
11631 total packets received
......
TcpExt:
390 TCP sockets finished time wait in fast timer
10 delayed acks sent
1 delayed acks further delayed because of locked socket
Quick ack mode was activated 45 times
1375 packet headers predicted
2695 acknowledgments not containing data payload received
845 predicted acknowledgments
TCPTimeouts: 7
TCPLossProbes: 3
TCPDSACKOldSent: 45
TCPDSACKRecv: 1
8 connections reset due to unexpected data
23 connections reset due to early user close
1 connections aborted due to timeout
references
The referenced article points out that the "insert_failed" counter in the conntrack -S output can be used to detect this condition. It also recommends the --random-fully flag, which is used in the author's test code and is the same flag added to the MASQUERADE entries in k8s.
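For comparison, a masquerade rule carrying the flag would look roughly like this (illustrative subnet; --random-fully requires a reasonably recent kernel and iptables):

tsecer@harry: sudo iptables -t nat -A POSTROUTING -s 10.0.0.0/8 ! -d 10.0.0.0/8 -j MASQUERADE --random-fully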
outro
The test case looks like a "Nobody does that, do they?" kind of misuse. Yet the same symptom can appear in real environments (not confirmed, but suspected to share the root cause): even though the immediate trigger differs, the underlying mechanism is the same, namely that a duplicate SNAT tuple is chosen, the packet is dropped at confirm time, and the result is retransmission delay.
Besides, k8s traffic to IPs outside the cluster also goes through MASQUERADE, another form of SNAT, and that is what motivated this analysis in the first place.