socket bind2vrf
Add l3mdev index to flow struct and avoid oif reset for port devices
VRF和l3mdev核心代码的基本前提是,将套接字绑定到一个设备(l3mdev或带有三层域的网络设备)以指示三层网络的作用域。
旧的代码会将flowi_oif(输出接口)重置为l3mdev设备,从而丢失了原始的端口设备绑定。
Ben展示了需要保留原始端口设备绑定的重要使用场景。该补丁通过在通用流结构(flow struct)中添加一个新条目来解决这个问题,
该条目可以指示l3mdev索引,用于后续的规则和路由表匹配,从而避免了重置flowi_oif的需求。
-
解决方案:
- 这个补丁通过向通用流结构(flow struct)中添加一个新条目来保留l3mdev索引。这样可以在后续的规则和路由表匹配中使用该索引,而不需要重置flowi_oif。
- 这种改进允许更多需要端口设备绑定的使用场景,并且简化了部分数据路径。
-
改进细节:
- l3mdev_fib_rule_match:这个函数只在遍历FIB(转发信息库)规则时调用,并且总是在调用l3mdev_update_flow之后。这个改进允许对非VRF类型的使用场景进行优化,如果flowi_l3mdev没有设置,可以提前跳过无关的操作。此外,FIB表ID只需要检查这个新加入的索引。
- l3mdev_update_flow:该函数可以在flowi_oif设置为l3mdev设备(如VRF设备)的情况下调用。通过仅在这种情况下重置flowi_oif,可以不再需要FLOWI_FLAG_SKIP_NH_OIF标志,从而简化多个数据路径中的检查。flowi_iif路径也可以简化,只有在不是环回接口(loopback)且l3mdev索引未设置的情况下才调用。
- 输出路径中的优化:当FIB查找返回拒绝失败时,避免另一个设备查找。
此次修改见:https://lore.kernel.org/netdev/20220314204551.16369-1-dsahern@kernel.org/
> Before the patch, if the original output interface was enslaved to a
> VRF, the output interface in the flow struct would be updated to the VRF
> and the 'FLOWI_FLAG_SKIP_NH_OIF' flag would be set, causing the above
> check to be skipped.
>
> After the patch, the check is no longer skipped, as original output
> interface is retained and the flag was removed.
>
> This breaks scenarios where a GRE tunnel specifies a dummy device
> enslaved to a VRF as its physical device. The purpose of this
> configuration is to redirect the underlay lookup to the table associated
> with the VRF to which the dummy device is enslaved to. The check fails
> because 'flp->flowi4_oif' points to the dummy device, whereas
> 'nhc->nhc_oif' points to the interface via which the encapsulated packet
> should egress.
如果原始输出接口被绑定到一个VRF,流结构中的输出接口会更新为该VRF,并且会设置FLOWI_FLAG_SKIP_NH_OIF标志,导致跳过上述检查。
修改后,不再跳过该检查,因为原始输出接口得以保留,且该标志被移除。
这破坏了某些场景,其中GRE隧道指定了一个绑定到VRF的虚拟设备作为其物理设备。这种配置的目的是将底层查找重定向到与虚拟设备所属VRF相关的路由表。
检查失败的原因是flp->flowi4_oif指向虚拟设备,而nhc->nhc_oif指向封装包应通过的接口。

也就是 fib_table_lookup -> fib_lookup_good_nhc 这条调用路径中的oif检查。
> What's the callchain for this failure? Perhaps the
> FLOWI_FLAG_SKIP_NH_OIF needs to be kept for this particular use case.

This is the stack trace for the failure:

    fib_lookup_good_nhc+5
    fib_table_lookup+3281
    fib4_rule_action+501
    fib_rules_lookup+858
    __fib_lookup+233
    fib_lookup.constprop.0+926
    ip_route_output_key_hash_rcu+3707
    ip_route_output_key_hash+392
    ip_route_output_flow+33
    ip_tunnel_xmit+1794
    gre_tap_xmit+1312
    dev_hard_start_xmit+448
    sch_direct_xmit+615
    __dev_queue_xmit+4841

The GRE tap is using a dummy device enslaved to a VRF as its physical device.
这是在使用VRF(虚拟路由和转发)时遇到的一个需求,即将一个套接字(socket)绑定到一个端口设备,并希望网络查找时强制使用这个出站端口(例如用于多路径场景)。
原来的机制将输出接口(oif)切换到VRF设备,然后忽略oif检查,这种做法让检查变得过于宽松,不符合这种用例的要求。
-
需求背景:
- 某些使用VRF的场景中,用户希望套接字能绑定到特定的物理端口设备,期望在进行路由查找时,系统能够强制使用这个特定的出站端口。比如在多路径(multipath)场景下,需要确保数据流量走某个特定的路径。
-
原始机制的问题:
- 在原有实现中,当绑定套接字时,系统会将输出接口(oif)切换为VRF设备,而不再是原始的端口设备。同时,系统会忽略oif检查。虽然这种做法让检查变得更加灵活,但对某些用例来说,灵活性过头了,导致无法强制执行对特定端口的使用。
patch:https://lore.kernel.org/netdev/Yjnrz7vL9HqE5UBz@shredder/
https://lore.kernel.org/netdev/20220314204551.16369-1-dsahern@kernel.org/
所以改动为:
@@ -551,10 +550,10 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
 	memset(&fl4, 0, sizeof(fl4));
 	/* needed to match OIF rule */
-	fl4.flowi4_oif = vrf_dev->ifindex;
+	fl4.flowi4_l3mdev = vrf_dev->ifindex;
 	fl4.flowi4_iif = LOOPBACK_IFINDEX;
 	fl4.flowi4_tos = RT_TOS(ip4h->tos);
-	fl4.flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_SKIP_NH_OIF;
+	fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
 	fl4.flowi4_proto = ip4h->protocol;
 	fl4.daddr = ip4h->daddr;
 	fl4.saddr = ip4h->saddr;
diff --git a/include/net/flow.h b/include/net/flow.h
index 58beb16a49b8..987bd511d652 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -29,6 +29,7 @@ struct flowi_tunnel {
 struct flowi_common {
 	int	flowic_oif;
 	int	flowic_iif;
+	int	flowic_l3mdev;
 	__u32	flowic_mark;
 	__u8	flowic_tos;
 	__u8	flowic_scope;
@@ -36,7 +37,6 @@ struct flowi_common {
 	__u8	flowic_flags;
 #define FLOWI_FLAG_ANYSRC		0x01
 #define FLOWI_FLAG_KNOWN_NH		0x02
-#define FLOWI_FLAG_SKIP_NH_OIF		0x04
 	__u32	flowic_secid;
 	kuid_t	flowic_uid;
 	struct flowi_tunnel flowic_tun_key;
@@ -70,6 +70,7 @@ struct flowi4 {
 	struct flowi_common	__fl_common;
 #define flowi4_oif		__fl_common.flowic_oif
 #define flowi4_iif		__fl_common.flowic_iif
+#define flowi4_l3mdev		__fl_common.flowic_l3mdev
 #define flowi4_mark		__fl_common.flowic_mark
 #define flowi4_tos		__fl_common.flowic_tos
 #define flowi4_scope		__fl_common.flowic_scope
@@ -102,6 +103,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
 {
 	fl4->flowi4_oif = oif;
 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
+	fl4->flowi4_l3mdev = 0;
 	fl4->flowi4_mark = mark;
 	fl4->flowi4_tos = tos;
 	fl4->flowi4_scope = scope;
@@ -132,6 +134,7 @@ struct flowi6 {
 	struct flowi_common	__fl_common;
 #define flowi6_oif		__fl_common.flowic_oif
 #define flowi6_iif		__fl_common.flowic_iif
+#define flowi6_l3mdev		__fl_common.flowic_l3mdev
 #define flowi6_mark		__fl_common.flowic_mark
 #define flowi6_scope		__fl_common.flowic_scope
 #define flowi6_proto		__fl_common.flowic_proto
@@ -177,6 +180,7 @@ struct flowi {
 	} u;
 #define flowi_oif	u.__fl_common.flowic_oif
 #define flowi_iif	u.__fl_common.flowic_iif
+#define flowi_l3mdev	u.__fl_common.flowic_l3mdev
 #define flowi_mark	u.__fl_common.flowic_mark
 #define flowi_tos	u.__fl_common.flowic_tos
 #define flowi_scope	u.__fl_common.flowic_scope
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7408051632ac..af8209f912ab 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -291,7 +291,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 		bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
 		struct flowi4 fl4 = {
 			.flowi4_iif = LOOPBACK_IFINDEX,
-			.flowi4_oif = l3mdev_master_ifindex_rcu(dev),
+			.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev),
 			.daddr = ip_hdr(skb)->saddr,
 			.flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK,
 			.flowi4_scope = scope,
@@ -353,9 +353,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	bool dev_match;
 
 	fl4.flowi4_oif = 0;
-	fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev);
-	if (!fl4.flowi4_iif)
-		fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+	fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev);
+	fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
 	fl4.daddr = src;
 	fl4.saddr = dst;
 	fl4.flowi4_tos = tos;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c5a29703185a..cc8e84ef2ae4 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -2234,7 +2234,7 @@ void fib_select_multipath(struct fib_result *res, int hash)
 void fib_select_path(struct net *net, struct fib_result *res,
 		     struct flowi4 *fl4, const struct sk_buff *skb)
 {
-	if (fl4->flowi4_oif && !(fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF))
+	if (fl4->flowi4_oif)
 		goto check_saddr;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2af2b99e0bea..fb0e49c36c2e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1429,11 +1429,8 @@ bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
 	    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
 		return false;
 
-	if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
-		if (flp->flowi4_oif &&
-		    flp->flowi4_oif != nhc->nhc_oif)
-			return false;
-	}
+	if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif)
+		return false;
 
 	return true;
 }
ipv4: fix source address selection with route leak
By default, an address assigned to the output interface is selected when the source address is not specified.
This is problematic when a route, configured in a vrf, uses an interface from another vrf (aka route leak).
The original vrf does not own the selected source address.
Let's add a check against the output interface and call the appropriate function to select the source address.
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f669da98d11d8f..8956026bc0a2c3 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -2270,6 +2270,15 @@ void fib_select_path(struct net *net, struct fib_result *res,
 		fib_select_default(fl4, res);
 
 check_saddr:
-	if (!fl4->saddr)
-		fl4->saddr = fib_result_prefsrc(net, res);
+	if (!fl4->saddr) {
+		struct net_device *l3mdev;
+
+		l3mdev = dev_get_by_index_rcu(net, fl4->flowi4_l3mdev);
+
+		if (!l3mdev ||
+		    l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev)
+			fl4->saddr = fib_result_prefsrc(net, res);
+		else
+			fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK);
+	}
 }

http代理服务器(3-4-7层代理)-网络事件库公共组件、内核kernel驱动 摄像头驱动 tcpip网络协议栈、netfilter、bridge 好像看过!!!!
但行好事 莫问前程
--身高体重180的胖子

浙公网安备 33010602011771号