SKB路由缓存与SOCK路由缓存2
skb结构体中的成员_skb_refdst用于暂时缓存出口/入口路由,避免在skb生存期中反复查找路由
sock结构体中有两个成员缓存路由:sk_rx_dst缓存入口路由,sk_dst_cache缓存出口路由
SKB路由缓存
调用skb_dst_set之前,调用者需要先增加dst的引用计数(例如通过dst_clone);skb_dst_set_noref则不需要,它通过标志SKB_DST_NOREF标识此缓存没有持有引用计数,因此在skb_dst_drop函数释放路由缓存时,不会对该dst进行释放操作。
/**
 * skb_dst_set - sets skb dst
 * @skb: buffer
 * @dst: dst entry
 *
 * Sets skb dst, assuming a reference was taken on dst and should
 * be released by skb_dst_drop()
 */
static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
{
	/* Store the bare pointer value; no SKB_DST_NOREF flag is OR-ed in. */
	skb->_skb_refdst = (unsigned long)dst;
}

/**
 * skb_dst_set_noref - sets skb dst, hopefully, without taking reference
 * @skb: buffer
 * @dst: dst entry
 *
 * Sets skb dst, assuming a reference was not taken on dst.
 * If dst entry is cached, we do not take reference and dst_release
 * will be avoided by refdst_drop. If dst entry is not cached, we take
 * reference, so that last dst_release can destroy the dst immediately.
 *
 * NOTE(review): the body shown here never takes a reference; it always
 * stores @dst with SKB_DST_NOREF set. The "if dst entry is not cached,
 * we take reference" sentence above does not match this code and looks
 * stale - confirm against the kernel version being described.
 */
static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
{
	/*
	 * Warn when neither rcu_read_lock_held() nor
	 * rcu_read_lock_bh_held() reports a held read-side lock.
	 */
	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
	/* Tag the stored pointer value with SKB_DST_NOREF. */
	skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
}
SKB出口路由缓存
1、TCP服务端,在回复客户端SYN+ACK时,新建一个skb结构体,根据路由查询结果(inet_csk_route_req查询出口路由),设置skb路由缓存,此时缓存的为出口路由
/*
 * Abridged excerpt of tcp_make_synack(): the return type, the parameter
 * list and most of the body are omitted in this listing. Only the skb
 * allocation and the step that attaches the route are shown.
 *
 * NOTE(review): sk, req, dst and skb are not declared in the excerpt;
 * presumably they are parameters/locals of the full function - confirm
 * against the kernel source.
 */
tcp_make_synack()
{
	struct inet_request_sock *ireq = inet_rsk(req);
	const struct tcp_sock *tp = tcp_sk(sk);

	/* Allocate a new skb of MAX_TCP_HEADER bytes with GFP_ATOMIC. */
	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	/* Attach dst to the freshly allocated skb. */
	skb_dst_set(skb, dst);
}
2、TCP服务端发送数据时,最终会调用ip_queue_xmit;IP层在该函数中先检测sock结构中的出口路由缓存,如果有效,则设置到skb结构体中;否则重新进行出口路由查找,然后同时设置到sock以及skb中
也就是:
- skb有出口路由缓存,则使用
- sock有出口缓存则copy到skb,使用
- 否则查找路由;然后设置到sk 以及skb,使用
/**
 * skb_rtable - Returns the skb &rtable
 * @skb: buffer
 *
 * Return: the value of skb_dst(@skb) cast to struct rtable *.
 */
static inline struct rtable *skb_rtable(const struct sk_buff *skb)
{
	return (struct rtable *)skb_dst(skb);
}
/*
 * Abridged excerpt of __ip_queue_xmit(): local declarations, error
 * handling and everything after the packet_routed label are omitted
 * (runs of dashes mark elided source).
 *
 * Route selection order visible in this excerpt:
 *   1. the route already attached to @skb, if any;
 *   2. otherwise the result of __sk_dst_check(sk, 0);
 *   3. otherwise a fresh ip_route_output_ports() lookup, whose result is
 *      passed to sk_setup_caps() before being attached to @skb.
 */
int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, __u8 tos)
{
	/* A route is already attached to the skb: skip all lookups. */
	rt = skb_rtable(skb);
	if (rt)
		goto packet_routed;

	/* No route on the skb: ask __sk_dst_check() for one from the socket. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (!rt) {
		/* Nothing usable from the socket: do a new output route lookup. */
		rt = ip_route_output_ports(net, fl4, sk, daddr, inet->inet_saddr, inet->inet_dport,inet->inet_sport,sk->sk_protocol,---);
		/* Hand the new route to sk_setup_caps() for the socket. */
		sk_setup_caps(sk, &rt->dst);
	}
	/* Attach the route to the skb through the noref setter. */
	skb_dst_set_noref(skb, &rt->dst);
packet_routed:
------------------------
}
sk_setup_caps(sk, &rt->dst);---->sk_dst_set(sk, dst);
涉及到sk的出口路由缓存
/*
 * sk_dst_set - install @dst as the socket's sk_dst_cache
 * @sk: socket whose cached entry is replaced
 * @dst: new entry to store
 *
 * Calls sk_tx_queue_clear(), resets sk_dst_pending_confirm to 0, swaps
 * @dst into sk->sk_dst_cache with xchg() and then calls dst_release()
 * on the entry that was stored before.
 *
 * NOTE(review): no reference is taken on @dst here, so presumably the
 * caller hands over a reference it already holds - confirm.
 */
static inline void sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	sk_tx_queue_clear(sk);
	sk->sk_dst_pending_confirm = 0;
	/* Swap in the new pointer and keep the previous one for release. */
	old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst);
	/* Release the entry that was cached before the swap. */
	dst_release(old_dst);
}
入口路由缓存
对于接收到的数据包,一种情况是通过early_demux获取缓存路由。例如,在函数tcp_v4_early_demux中,通过sock结构体成员sk_rx_dst中的路由缓存初始化skb的dst,也就是说,此时缓存的为入口路由。这里使用设置函数skb_dst_set_noref,不增加dst的引用计数,而是依赖关联的sock成员sk_rx_dst所持有的引用计数:在sock存续期间,skb中的dst始终有效,skb释放时也无需再对dst做释放操作;当sock释放时,关联的skb会一并释放。
如果early_demux中找不到路由----则直接查询入口路由(ip_route_input_noref),缓存到skb中
- 也就是sock中有sk_rx_dst;则将sock结构体成员sk_rx_dst的路由缓存赋值给skb成员_skb_refdst
- 否则查找fib_lookup
/*
 * Excerpt from the IPv4 receive path (the enclosing function and its
 * declarations are not shown in this listing).
 *
 * Early demux is attempted only when the sysctl_ip_early_demux setting
 * is non-zero, the skb has no dst and no socket attached yet, and the
 * packet is not a fragment.
 */
if (net->ipv4.sysctl_ip_early_demux &&
    !skb_dst(skb) &&
    !skb->sk &&
    !ip_is_fragment(iph)) {
	const struct net_protocol *ipprot;
	int protocol = iph->protocol;

	/* Look up the handler table entry for this IP protocol number. */
	ipprot = rcu_dereference(inet_protos[protocol]);
	if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
		/*
		 * Invoke the protocol's early_demux hook on the skb.
		 * NOTE(review): INDIRECT_CALL_2 presumably turns this into a
		 * direct call when edemux is one of the two listed handlers -
		 * confirm. err is not checked within this excerpt.
		 */
		err = INDIRECT_CALL_2(edemux, tcp_v4_early_demux,
				      udp_v4_early_demux, skb);
	}
}

/*
 *	Initialise the virtual path cache for the packet. It describes
 *	how the packet travels inside Linux networking.
 */
if (!skb_valid_dst(skb)) {
	/* No valid dst on the skb yet: do the input route lookup. */
	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
				   iph->tos, dev);
	if (unlikely(err))
		goto drop_error;
}
forward路由缓存
在查找入口路由时,如果FIB(fib_lookup)查询出的路由类型不是前面提到的本地路由RTN_LOCAL(即数据包需要转发),则路由dst的input函数指针设置为ip_forward,output函数指针设置为ip_output。该路由同样缓存到skb中,在转发过程中避免重复查找路由
此过程还涉及按目的IP的路由缓存处理,这里不再展开
/*
 * rt_dst_alloc - allocate an rtable and fill in its default handlers
 * @dev: device passed to dst_alloc() and used to pick the genid namespace
 * @flags: only RTCF_LOCAL is examined in this listing
 * @type: not used in the body shown here
 * @nopolicy: when true, DST_NOPOLICY is added to the dst flags
 * @noxfrm: when true, DST_NOXFRM is added to the dst flags
 *
 * On successful allocation sets rt_genid, sets dst.output to ip_output
 * and, if RTCF_LOCAL is set in @flags, sets dst.input to
 * ip_local_deliver.
 *
 * Return: the pointer obtained from dst_alloc(); the `if (rt)` guard
 * indicates it may be NULL.
 */
struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, bool nopolicy, bool noxfrm)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));
	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		/* Every route allocated here outputs through ip_output. */
		rt->dst.output = ip_output;
		/* Only local routes get ip_local_deliver as input handler. */
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}
	return rt;
}

/*
 * Abridged excerpt of __mkroute_input(): declarations, error handling
 * and the return statement are omitted.
 *
 * NOTE(review): the parameter list is truncated in this listing (the
 * trailing `,s)` is residue of the omitted parameters), and rth is
 * dereferenced without a NULL check; the full kernel source presumably
 * handles allocation failure - confirm.
 */
static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res,s)
{
	/* flags == 0, so rt_dst_alloc() does not set a local input handler. */
	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	rth->rt_is_input = 1;
	/* Packets using this route are handed to ip_forward on input. */
	rth->dst.input = ip_forward;
	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, do_cache);
	skb_dst_set(skb, &rth->dst); // cache the route on the skb
}