Why an SNAT failure shows up as a timeout
intro
In an earlier test, the SNAT source rewrite was strictly pinned to one specific IP+PORT: all TCP traffic destined for the local address 127.0.0.1:8080 was SNATed to source 127.0.0.1:40000.
tsecer@harry: sudo iptables -t nat -A POSTROUTING -p tcp -d 127.0.0.1 --dport 8080 -j SNAT --to-source 127.0.0.1:40000
Intuitively, this configuration is obviously broken: because the choice of SNAT source is restricted to a single tuple, at most one connection can be NATed.
Unsurprisingly, testing confirms exactly that: only one connection succeeds; every later attempt sends its SYN, sits in SYN_SENT, and eventually times out because no reply ever arrives.
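For reference, a minimal way to reproduce this (a sketch, assuming an OpenBSD-style nc; any client/server pair on 127.0.0.1:8080 would do):

tsecer@harry: nc -lk 127.0.0.1 8080 &
tsecer@harry: nc 127.0.0.1 8080 &     # first connection: gets the one available mapping
tsecer@harry: nc 127.0.0.1 8080       # second connection: hangs, then times out
tsecer@harry: ss -tan | grep SYN-SENT # the stuck socket stays in SYN-SENT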
Since the connecting socket is in SYN_SENT, a socket was allocated and the handshake packet was handed off for transmission; yet tcpdump never captures this packet leaving the host, which means it was dropped somewhere inside the machine.
The natural question, then: where did that SYN go?
allocation
The implementation has one noteworthy detail: it first writes the candidate into the tuple, and only then checks whether it is in use, returning if it is free. Put the other way around: if every candidate is occupied, the function returns with the last tried (already occupied) address (IP + PORT) left in the tuple.
another_round:
	for (i = 0; i < attempts; i++, off++) {
		*keyptr = htons(min + off % range_size);
		if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
			return;
	}
xt_snat_target_v2>>nf_nat_setup_info>>get_unique_tuple>>nf_nat_l4proto_unique_tuple
/* Alter the per-proto part of the tuple (depending on maniptype), to
 * give a unique tuple in the given range if possible.
 *
 * Per-protocol part of tuple is initialized to the incoming packet.
 */
static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
					const struct nf_nat_range2 *range,
					enum nf_nat_manip_type maniptype,
					const struct nf_conn *ct)
{
	unsigned int range_size, min, max, i, attempts;
	__be16 *keyptr;
	u16 off;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		/* id is same for either direction... */
		keyptr = &tuple->src.u.icmp.id;
		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 0;
			range_size = 65536;
		} else {
			min = ntohs(range->min_proto.icmp.id);
			range_size = ntohs(range->max_proto.icmp.id) -
				     ntohs(range->min_proto.icmp.id) + 1;
		}
		goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
	case IPPROTO_GRE:
		/* If there is no master conntrack we are not PPTP,
		   do not change tuples */
		if (!ct->master)
			return;

		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.gre.key;
		else
			keyptr = &tuple->dst.u.gre.key;

		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 1;
			range_size = 65535;
		} else {
			min = ntohs(range->min_proto.gre.key);
			range_size = ntohs(range->max_proto.gre.key) - min + 1;
		}
		goto find_free_id;
#endif
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
	case IPPROTO_DCCP:
		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.all;
		else
			keyptr = &tuple->dst.u.all;
		break;
	default:
		return;
	}

	/* If no range specified... */
	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
		/* If it's dst rewrite, can't change port */
		if (maniptype == NF_NAT_MANIP_DST)
			return;

		if (ntohs(*keyptr) < 1024) {
			/* Loose convention: >> 512 is credential passing */
			if (ntohs(*keyptr) < 512) {
				min = 1;
				range_size = 511 - min + 1;
			} else {
				min = 600;
				range_size = 1023 - min + 1;
			}
		} else {
			min = 1024;
			range_size = 65535 - 1024 + 1;
		}
	} else {
		min = ntohs(range->min_proto.all);
		max = ntohs(range->max_proto.all);
		if (unlikely(max < min))
			swap(max, min);
		range_size = max - min + 1;
	}

find_free_id:
	if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
		off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
	else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) ||
		 maniptype != NF_NAT_MANIP_DST)
		off = get_random_u16();
	else
		off = 0;

	attempts = range_size;
	if (attempts > NF_NAT_MAX_ATTEMPTS)
		attempts = NF_NAT_MAX_ATTEMPTS;

	/* We are in softirq; doing a search of the entire range risks
	 * soft lockup when all tuples are already used.
	 *
	 * If we can't find any free port from first offset, pick a new
	 * one and try again, with ever smaller search window.
	 */
another_round:
	for (i = 0; i < attempts; i++, off++) {
		*keyptr = htons(min + off % range_size);
		if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
			return;
	}

	if (attempts >= range_size || attempts < 16)
		return;
	attempts /= 2;
	off = get_random_u16();
	goto another_round;
}
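To make the exhaustion behavior concrete, here is a toy user-space model of the search loop (hypothetical names, not kernel code; tuple_used() stands in for nf_nat_used_tuple_harder()). With the range collapsed to the single port 40000 and that port already taken, the function returns with the occupied port still written into the tuple:

#include <stdbool.h>
#include <stdio.h>

/* Every port in the range is already "used", mimicking the effect of
 * the single-tuple SNAT rule once one connection is established. */
static bool tuple_used(unsigned short port)
{
	return true;
}

static unsigned short pick_source_port(unsigned short min,
				       unsigned int range_size,
				       unsigned int attempts,
				       unsigned int off)
{
	unsigned short port = 0;
	unsigned int i;

	for (i = 0; i < attempts; i++, off++) {
		port = min + off % range_size; /* candidate written first */
		if (!tuple_used(port))
			return port;           /* found a free one */
	}
	/* Exhausted: return anyway -- the caller is left holding the
	 * last tried, already-occupied port, i.e. a duplicate tuple. */
	return port;
}

int main(void)
{
	/* range collapsed to one port, like --to-source 127.0.0.1:40000 */
	printf("selected (duplicate) port: %u\n",
	       pick_source_port(40000, 1, 1, 0)); /* prints 40000 */
	return 0;
}

There is no error signal for the caller; the duplicate tuple simply flows on toward confirmation, which is where the story continues below.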
confirmation
Does allocating an IP+PORT above mean conntrack has actually completed the SNAT? Clearly not, because letting such a connection establish would itself be a logic error. Take the test SNAT rule at the beginning as an example: if a second connection were allowed, then when a reply packet arrived its five-tuple (proto, srcip, srcport, dstip, dstport) would be identical for both connections, namely (tcp, 127.0.0.1, 8080, 127.0.0.1, 40000), and there would be no way to tell which local socket (port) on this host it belongs to.
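A quick sketch of that ambiguity with concrete values (illustrative ephemeral ports; a simplified tuple type, not the kernel's):

#include <stdio.h>

/* Simplified five-tuple, for illustration only */
struct five_tuple {
	const char *proto, *saddr, *daddr;
	unsigned short sport, dport;
};

/* The reply direction is the post-SNAT tuple with src/dst swapped */
static struct five_tuple reply_of(struct five_tuple t)
{
	struct five_tuple r = { t.proto, t.daddr, t.saddr, t.dport, t.sport };
	return r;
}

int main(void)
{
	/* two distinct client sockets (arbitrary ephemeral ports)... */
	struct five_tuple c1 = { "tcp", "127.0.0.1", "127.0.0.1", 51000, 8080 };
	struct five_tuple c2 = { "tcp", "127.0.0.1", "127.0.0.1", 51001, 8080 };

	/* ...both forced to source port 40000 by the SNAT rule */
	c1.sport = c2.sport = 40000;

	/* the reply tuples collapse to the same value, so a reply can
	 * no longer be attributed to one of the two sockets */
	struct five_tuple r1 = reply_of(c1), r2 = reply_of(c2);
	printf("%s %s:%u->%s:%u vs %s %s:%u->%s:%u\n",
	       r1.proto, r1.saddr, r1.sport, r1.daddr, r1.dport,
	       r2.proto, r2.saddr, r2.sport, r2.daddr, r2.dport);
	return 0;
}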
The nf_conntrack module (nf_conntrack_proto.c) registers the nf_confirm hook on the NF_INET_POST_ROUTING chain (and on NF_INET_LOCAL_IN, as the table below shows).
///@file: nf_conntrack_proto.c
/* Connection tracking may drop packets, but never alters them, so
 * make it the first hook.
 */
static const struct nf_hook_ops ipv4_conntrack_ops[] = {
	{
		.hook		= ipv4_conntrack_in,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_PRE_ROUTING,
		.priority	= NF_IP_PRI_CONNTRACK,
	},
	{
		.hook		= ipv4_conntrack_local,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP_PRI_CONNTRACK,
	},
	{
		.hook		= nf_confirm,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_POST_ROUTING,
		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
	},
	{
		.hook		= nf_confirm,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
	},
};
NF_IP_PRI_CONNTRACK_CONFIRM is the lowest priority (the largest numeric value, INT_MAX), so nf_confirm runs only after every other target on the chain has executed.
enum nf_ip_hook_priorities {
	NF_IP_PRI_FIRST = INT_MIN,
	NF_IP_PRI_RAW_BEFORE_DEFRAG = -450,
	NF_IP_PRI_CONNTRACK_DEFRAG = -400,
	NF_IP_PRI_RAW = -300,
	NF_IP_PRI_SELINUX_FIRST = -225,
	NF_IP_PRI_CONNTRACK = -200,
	NF_IP_PRI_MANGLE = -150,
	NF_IP_PRI_NAT_DST = -100,
	NF_IP_PRI_FILTER = 0,
	NF_IP_PRI_SECURITY = 50,
	NF_IP_PRI_NAT_SRC = 100,
	NF_IP_PRI_SELINUX_LAST = 225,
	NF_IP_PRI_CONNTRACK_HELPER = 300,
	NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
	NF_IP_PRI_LAST = INT_MAX,
};
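The hooks of a chain run in ascending numeric priority, so the SNAT target (NF_IP_PRI_NAT_SRC = 100) has already rewritten the tuple by the time nf_confirm (INT_MAX) runs. A stand-alone model of that ordering (a sketch, not the kernel's actual hook registration):

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy model: netfilter keeps each chain's hooks sorted by ascending
 * priority and traverses them in that order. */
struct hook {
	const char *name;
	int priority;
};

static int by_priority(const void *a, const void *b)
{
	int pa = ((const struct hook *)a)->priority;
	int pb = ((const struct hook *)b)->priority;

	return (pa > pb) - (pa < pb);
}

int main(void)
{
	/* the POST_ROUTING chain as configured above */
	struct hook chain[] = {
		{ "nf_confirm (CONNTRACK_CONFIRM)", INT_MAX },
		{ "SNAT target (NAT_SRC)",          100     },
		{ "mangle table (MANGLE)",          -150    },
	};
	size_t i, n = sizeof(chain) / sizeof(chain[0]);

	qsort(chain, n, sizeof(chain[0]), by_priority);
	for (i = 0; i < n; i++)	/* mangle, then SNAT, then nf_confirm */
		printf("%11d  %s\n", chain[i].priority, chain[i].name);
	return 0;
}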
When a clashing entry already exists, nf_ct_resolve_clash is called to try to resolve the clash.
nf_confirm>>nf_conntrack_confirm>>__nf_conntrack_confirm
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	///...
	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen) {
chaintoolong:
			NF_CT_STAT_INC(net, chaintoolong);
			NF_CT_STAT_INC(net, insert_failed);
			ret = NF_DROP;
			goto dying;
		}
	}
	///...
out:
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}
In practice nf_ct_resolve_clash usually cannot resolve the clash (for TCP it cannot even try, as shown below), so it returns NF_DROP and increments the drop and insert_failed counters along the way.
drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
In other words, the packet (struct sk_buff *skb) is dropped and no conntrack entry is created.
/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the assocated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry.  The new entry will also have a fixed timeout so it expires --
 * due to the collision, it will only see reply traffic.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}
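The early exit on !l4proto->allow_clash is the decisive branch for TCP: as far as I know only datagram-style trackers opt into clash resolution. For instance, the UDP tracker sets the flag in its l4proto definition (a sketch from memory; surrounding fields omitted and may differ across kernel versions):

///@file: nf_conntrack_proto_udp.c
const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = {
	.l4proto	= IPPROTO_UDP,
	.allow_clash	= true,
	/* ... */
};

The TCP tracker never sets allow_clash, so for our duplicated SYN the function falls through to the drop label immediately.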
verification
Watching the output of conntrack -S, the insert_failed/drop counters on cpu=15 keep climbing, from 1 all the way to 7, and the interval between increments keeps growing. This matches the doubling (exponential backoff) retransmission policy for the SYN packets of a TCP connection attempt.
tsecer@harry: watch conntrack -S
tsecer@harry:
cpu=0 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=1 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=2 found=0 invalid=0 ignore=2 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=3 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=4 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=2
cpu=5 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=6 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=7 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=8 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=9 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=10 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=11 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=12 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=13 found=0 invalid=6 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=14 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=15 found=7 invalid=0 ignore=0 insert=0 insert_failed=7 drop=7 early_drop=0 error=0 search_restart=0
cpu=16 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=1
cpu=17 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=18 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=19 found=0 invalid=0 ignore=2 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=20 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=21 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=22 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=23 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=2
cpu=24 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=25 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=26 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=27 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=28 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=29 found=0 invalid=0 ignore=3 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=30 found=0 invalid=6 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
cpu=31 found=0 invalid=0 ignore=0 insert=0 insert_failed=0 drop=0 early_drop=0 error=0 search_restart=0
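The growing intervals line up with TCP's RTO doubling. Assuming the default net.ipv4.tcp_syn_retries = 6 and an initial RTO of 1 second, the SYNs leave at roughly t = 0, 1, 3, 7, 15, 31 and 63 seconds, and the connect() attempt aborts at about t = 127 seconds; seven dropped SYNs in total, which is exactly the counter climbing from 1 to 7.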
netstat -s shows TCPTimeouts: 7, matching the seven SYN transmissions during connection setup; together these seven retransmission timeouts account for the single connection that failed due to timeout (1 connections aborted due to timeout).
tsecer@harry: netstat -s
Ip:
Forwarding: 2
11631 total packets received
......
TcpExt:
390 TCP sockets finished time wait in fast timer
10 delayed acks sent
1 delayed acks further delayed because of locked socket
Quick ack mode was activated 45 times
1375 packet headers predicted
2695 acknowledgments not containing data payload received
845 predicted acknowledgments
TCPTimeouts: 7
TCPLossProbes: 3
TCPDSACKOldSent: 45
TCPDSACKRecv: 1
8 connections reset due to unexpected data
23 connections reset due to early user close
1 connections aborted due to timeout
references
The referenced article points out that the "insert_failed" counter in the conntrack -S output can be used to detect this condition. It also recommends the --random-fully flag, which is used in the author's test code and is the same flag added to the MASQUERADE entries in k8s.
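For comparison, a masquerade rule carrying the flag would look roughly like this (illustrative subnet; --random-fully requires a reasonably recent kernel and iptables):

tsecer@harry: sudo iptables -t nat -A POSTROUTING -s 10.0.0.0/8 ! -d 10.0.0.0/8 -j MASQUERADE --random-fully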
outro
The test case looks like a "Nobody does that, do they?" kind of misuse. Yet the same symptom can appear in real environments (not confirmed, but suspected to share the root cause): even though the immediate trigger differs, the underlying mechanism is the same, namely that a duplicate SNAT tuple is chosen, the packet is dropped at confirm time, and the result is retransmission delay.
Besides, k8s traffic to IPs outside the cluster also goes through MASQUERADE, another form of SNAT, and that is what motivated this analysis in the first place.