skb相关路由信息
sock结构体中有两个成员缓存路由:sk_rx_dst缓存入口路由,sk_dst_cache缓存出口路由
skb结构体中的_skb_refdst在特定时刻仅缓存一种路由,防止反复查找
skb_dst_set需要在调用前增加引用计数(dst_clone);而skb_dst_set_noref不需要,其通过标志SKB_DST_NOREF用来标识此缓存没有引用计数,并且在skb_dst_drop函数释放路由缓存时,不进行释放操作
/** * skb_dst_set - sets skb dst * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was taken on dst and should * be released by skb_dst_drop() */ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) { skb->_skb_refdst = (unsigned long)dst; } /** * skb_dst_set_noref - sets skb dst, hopefully, without taking reference * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was not taken on dst. * If dst entry is cached, we do not take reference and dst_release * will be avoided by refdst_drop. If dst entry is not cached, we take * reference, so that last dst_release can destroy the dst immediately. */ static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; } /** * skb_dst_is_noref - Test if skb dst isn't refcounted * @skb: buffer */ static inline bool skb_dst_is_noref(const struct sk_buff *skb) { return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb); } static inline struct rtable *skb_rtable(const struct sk_buff *skb) { return (struct rtable *)skb_dst(skb); }
出口路由缓存
对于本地发出的数据包(本地创建分配的skb),其缓存的为出口路由。例如,作为TCP服务端,在回复客户端SYN+ACK时,新建一个skb结构体,根据路由查询结果(inet_csk_route_req查询出口路由),设置skb路由缓存,此时缓存的为出口路由,之后在发送过程中就不需要再次查找路由了。
/* * Send a SYN-ACK after having received a SYN. * This still operates on a request_sock only, not on a big * socket. */ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, bool attach_req) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4;/* First, grab a route. */ if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; skb = tcp_make_synack(sk, dst, req, foc, attach_req); ----------------------------- } struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, bool attach_req) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); ------------------------------- skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); ------------------------ /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); ---------------------------------------- skb_dst_set(skb, dst); ------------------------------------ }
对于UDP协议客户端,其在connect时(UDP客户端connect不同于TCP,仅绑定通信端地址),查询路由,缓存到sock结构体的sk_dst_cache中。
int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { rt = ip_route_connect(...); sk_dst_set(sk, &rt->dst); }
之后,发送UDP数据包时,检查sock结构体中的出口路由是否有效,有效的话可不用再次查询路由表,在函数ip_make_skb中直接使用rt,并且调用skb_dst_set赋值给skb的_skb_refdst结构体,以便在发送过程中使用。
对于UDP服务端,在首次发包检测到rt为空时,查询路由表得到出口路由,缓存在sock结构中,之后发包时rt有效,省去再次查询
struct sk_buff *__ip_make_skb(...) { skb_dst_set(skb, &rt->dst); } int udp_sendmsg(...) { if (connected) rt = (struct rtable *)sk_dst_check(sk, 0); if (rt == NULL) { rt = ip_route_output_flow(net, fl4, sk); if (connected) sk_dst_set(sk, dst_clone(&rt->dst)); } skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen, sizeof(struct udphdr), &ipc, &rt, msg->msg_flags); }
对于TCP调用IP层发送数据包时(调用ip_queue_xmit),检测sock结构中出口路由缓存,如果有效,设置到skb结构体中。否则重新进行出口路由查找。
/* Note: skb->sk can be different from sk, in case of tunnels */ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); -------------------------------------- /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. */ rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); fl4 = &fl->u.ip4; rt = skb_rtable(skb); if (rt) goto packet_routed; /* Make sure we can route this packet. */ rt = (struct rtable *)__sk_dst_check(sk, 0); if (!rt) {// sk is tproxy no needed routing __be32 daddr; /* Use correct destination address if we have options. */ daddr = inet->inet_daddr; if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ rt = ip_route_output_ports(net, fl4, sk, daddr, inet->inet_saddr,inet->inet_dport,inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk),sk->sk_bound_dev_if); ----------------------------------------- sk_setup_caps(sk, &rt->dst); } skb_dst_set_noref(skb, &rt->dst); }
入口路由缓存
对于接收到的数据包,一种情况是通过early_demux获取缓存路由,例如,在函数tcp_v4_early_demux中,通过sock结构体成员sk_rx_dst中的路由缓存初始化skb的dst,顾名思义,此时缓存的为入口路由。使用设置函数skb_dst_set_noref,不增加dst的引用计数。使用关联的sock成员sk_rx_dst的引用计数,可保障在sock存续期间,skb的dst可安全释放;当sock释放时,关联的skb会一并释放。另一种情况直接查询入口路由(ip_route_input_noref),缓存到skb中。
static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) { if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { ipprot->early_demux(skb); } if (!skb_valid_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, skb->dev); } }
对于:sysctl_ip_early_demux
当内核接收到一个TCP数据包来说,首先需要查找skb对应的路由,然后查找skb对应的socket。David Miller 发现这样做是一种浪费,对于属于同一个socket(只考虑ESTABLISHED情况)的路由是相同的,那么如果能将skb的路由缓存到socket(skb->sk)中,就可以只查找查找一次skb所属的socket,就可以顺便把路由找到了,于是David Miller提交了一个patch ipv4: Early TCP socket demux;然而Davem添加的这个patch是有局限的,因为这个处理对于转发的数据包,增加了一个在查找路由之前查找socket的逻辑,可能导致转发效率的降低。
Alexander Duyck提出增加一个ip_early_demux参数来控制是否启动这个特性。
SOCK入口路由与SKB路由缓存
内核在接收流程中,调用early_demux函数提前在IP层做established状态的sock查找,并负责将sock结构体成员sk_rx_dst的路由缓存赋值给skb成员_skb_refdst,
对于UDP协议,对于先查找 对应目标的 sk,先关联skb 和sk,在判断DST_NOCACHE标志,如果成立,增加dst引用计数,设置skb的dst;否则,调用skb_dst_set_noref直接进行设置。
void tcp_v4_early_demux(struct sk_buff *skb) { const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) return; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) return; iph = ip_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) return; sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), skb->skb_iif); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); if (dst && inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } }
void udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct udphdr *uh; struct sock *sk; struct dst_entry *dst; int dif = skb->dev->ifindex; int ours; /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) return; iph = ip_hdr(skb); uh = udp_hdr(skb); if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) return; /* we are supposed to accept bcast packets */ if (skb->pkt_type == PACKET_MULTICAST) { ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) return; } sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif); } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif); } else { return; } if (!sk) return; skb->sk = sk; skb->destructor = sock_efree; dst = READ_ONCE(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); if (dst) { /* DST_NOCACHE can not be used without taking a reference */ if (dst->flags & DST_NOCACHE) { if (likely(atomic_inc_not_zero(&dst->__refcnt))) skb_dst_set(skb, dst); } else { skb_dst_set_noref(skb, dst); } } }
入口路由缓存之TCP
a)作为服务端,三次握手完成时,在函数tcp_v4_syn_recv_sock中创建子sock时赋值;
b)作为客户端在函数tcp_finish_connect;
c)在函数tcp_rcv_established中sock处于established状态时,更新其值;
struct sock *tcp_v4_syn_recv_sock() { newsk = tcp_create_openreq_child(sk, req, skb); inet_sk_rx_dst_set(newsk, skb); } void tcp_finish_connect() { //IPv4v6两个回调函数inet_sk_rx_dst_set与inet6_sk_rx_dst_set if (skb) icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); } void tcp_rcv_established() { if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); }
sk_rx_dst合法判断
在函数tcp_v4_early_demux与tcp_v4_do_rcv中判断sk_rx_dst的合法性,此判断仅在sock状态为TCP_ESTABLISHED时进行。由于作为服务端,会接收到来自于多个接口的客户端请求,所以除需判断缓存路由是否过期外(dst->ops->check(dst, 0)),还需要判断其接口索引(rx_dst_ifindex)是否与此时报文的入接口相同(skb_iif)。两个条件有一个不满足,就释放缓存的路由项。其后会在tcp_rcv_established函数中更新。
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { struct sock *rsk; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst = sk->sk_rx_dst; sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || !dst->ops->check(dst, 0)) { dst_release(dst); sk->sk_rx_dst = NULL; } } tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); return 0; } }
UDP不区分客户端或服务端,都于接收报文时,初始化sk_rx_dst缓存,与TCP不同,udp不记录此路由缓存的入接口索引(rx_dst_ifindex),在合法性检查时仅看是否过期(dst_check)
static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { old = xchg(&sk->sk_rx_dst, dst); } void udp_v4_early_demux(struct sk_buff *skb) { dst = READ_ONCE(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); } int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { sk = skb_steal_sock(skb); if (sk) { struct dst_entry *dst = skb_dst(skb); //unlikely定义,表明udp路由缓存几乎不更新, 使用优化选项 if (unlikely(sk->sk_rx_dst != dst)) udp_sk_rx_dst_set(sk, dst); } }
在用户层应用关闭socket时,于inet层释放缓存的入口路由缓存:
void inet_sock_destruct(struct sock *sk) { ------------------------------ dst_release(sk->sk_rx_dst); ------------------------ }
转发路由缓存
转发路由缓存与入口路由缓存查找方法相同,同是ip_rcv_finish函数中获得转发路由缓存。此时,FIB(fib_lookup)查询出的路由类型不是之前的RTN_LOCAL,路由dst的input函数指针设置为ip_forward; output函数指针设置为ip_output。在转发过程中避免重复查找路由
static int __mkroute_input(...) { rth = rt_dst_alloc(out_dev->dev, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); rth->dst.input = ip_forward; rth->dst.output = ip_output; } static int ip_route_input_slow(...) { if (!IN_DEV_FORWARD(in_dev)) goto no_route; // 最终调用__mkroute_input。 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); out: return err; }

浙公网安备 33010602011771号