skb相关路由信息

sock结构体中有两个成员缓存路由:sk_rx_dst缓存入口路由,sk_dst_cache缓存出口路由

skb结构体中的_skb_refdst在特定时刻仅缓存一种路由,防止反复查找

skb_dst_set要求调用者在调用前已经持有dst的引用(例如通过dst_clone增加引用计数);而skb_dst_set_noref不需要,它通过SKB_DST_NOREF标志标识此缓存未持有引用计数,因此skb_dst_drop函数释放路由缓存时,不会对其进行释放操作

/**
 * skb_dst_set - attach a refcounted dst to an skb
 * @skb: buffer
 * @dst: dst entry
 *
 * Stores @dst in skb->_skb_refdst as a plain pointer value, i.e. without
 * the SKB_DST_NOREF flag. The caller is expected to already own a
 * reference on @dst; that reference is meant to be released by
 * skb_dst_drop().
 */
static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
{
    unsigned long refdst = (unsigned long)dst;

    skb->_skb_refdst = refdst;
}

/**
 * skb_dst_set_noref - attach a dst to an skb without holding a reference
 * @skb: buffer
 * @dst: dst entry
 *
 * Stores @dst in skb->_skb_refdst with the SKB_DST_NOREF flag set, which
 * marks the pointer as not refcounted. This helper never takes a reference
 * on @dst itself.
 *
 * Emits a warning (WARN_ON) when neither rcu_read_lock_held() nor
 * rcu_read_lock_bh_held() reports an active read-side section.
 */
static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
{
    unsigned long refdst = (unsigned long)dst | SKB_DST_NOREF;

    WARN_ON(!(rcu_read_lock_held() || rcu_read_lock_bh_held()));
    skb->_skb_refdst = refdst;
}

/**
 * skb_dst_is_noref - check whether the skb carries a non-refcounted dst
 * @skb: buffer
 *
 * Returns true only when the SKB_DST_NOREF flag is set in
 * skb->_skb_refdst and skb_dst() yields a non-NULL entry.
 */
static inline bool skb_dst_is_noref(const struct sk_buff *skb)
{
    /* Flag test first, so skb_dst() is only consulted for noref entries. */
    if (!(skb->_skb_refdst & SKB_DST_NOREF))
        return false;

    return skb_dst(skb) != NULL;
}

/*
 * skb_rtable - return the skb's dst entry viewed as an IPv4 struct rtable.
 * The result is simply skb_dst() cast to struct rtable *.
 */
static inline struct rtable *skb_rtable(const struct sk_buff *skb)
{
    struct dst_entry *dst = skb_dst(skb);

    return (struct rtable *)dst;
}
View Code

 

出口路由缓存

  对于本地发出的数据包(本地创建分配的skb),其缓存的为出口路由。例如,作为TCP服务端,在回复客户端SYN+ACK时,新建一个skb结构体,根据路由查询结果(inet_csk_route_req查询出口路由),设置skb路由缓存,此时缓存的为出口路由,之后在发送过程中就不需要再次查找路由了。

/*
 *    Send a SYN-ACK after having received a SYN.
 *    This still operates on a request_sock only, not on a big
 *    socket.
 *
 *    Abridged excerpt: only the route lookup and the skb construction
 *    are shown; the remainder of the function is elided.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                  struct flowi *fl,
                  struct request_sock *req,
                  struct tcp_fastopen_cookie *foc,
                  bool attach_req)
{
    const struct inet_request_sock *ireq = inet_rsk(req);
    struct flowi4 fl4;/* First, grab a route. */
    /* Caller supplied no dst: look up the output route for this request,
     * and give up with -1 when the lookup returns NULL.
     */
    if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
        return -1;
    /* Build the SYN-ACK; the output route is passed along so that
     * tcp_make_synack() can attach it to the new skb.
     */
    skb = tcp_make_synack(sk, dst, req, foc, attach_req);
-----------------------------
}
/*
 * Allocate and prepare the SYN-ACK skb for @req.
 * Abridged excerpt: only the allocation and the attachment of the
 * output route @dst are shown.
 */
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
                struct request_sock *req,
                struct tcp_fastopen_cookie *foc,
                bool attach_req)
{
    struct inet_request_sock *ireq = inet_rsk(req);
    const struct tcp_sock *tp = tcp_sk(sk);
-------------------------------

    /* Freshly allocated skb: it has no route attached yet. */
    skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
------------------------
    /* Reserve space for headers. */
    skb_reserve(skb, MAX_TCP_HEADER);

----------------------------------------
    /* Cache the output route in the skb. skb_dst_set() stores the pointer
     * without taking a reference, so the reference on dst is presumably
     * the one handed in by the caller - confirm against the full source.
     */
    skb_dst_set(skb, dst);
------------------------------------
}

 对于UDP协议客户端,其在connect时(UDP客户端的connect不同于TCP,仅绑定对端地址),查询路由,并将结果缓存到sock结构体的sk_dst_cache中。

/*
 * Abridged excerpt of the datagram connect path: resolve the route to the
 * peer and store it on the socket.
 */
int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
    /* Route lookup for the connect() destination (arguments elided). */
    rt = ip_route_connect(...);
    /* Store the resulting dst on the socket via sk_dst_set(). */
    sk_dst_set(sk, &rt->dst);
}

 

  之后,发送UDP数据包时,检查sock结构体中的出口路由是否有效,有效的话可不用再次查询路由表,在函数ip_make_skb中直接使用rt,并且调用skb_dst_set赋值给skb的_skb_refdst成员,以便在发送过程中使用。
对于UDP服务端,在首次发包检测到rt为空时,查询路由表得到出口路由;如果该socket已经connect(connected为真),则将其缓存在sock结构中,之后发包时rt有效,省去再次查询

/*
 * Abridged excerpt: shows only the step where the output route is
 * attached to the skb being built.
 */
struct sk_buff *__ip_make_skb(...)
{
    /* Attach rt's dst to the skb so later transmit stages can read the
     * route from the skb.
     */
    skb_dst_set(skb, &rt->dst);
}
 
/*
 * Abridged excerpt of the UDP send path, showing how the output route is
 * obtained before the skb is built.
 */
int udp_sendmsg(...)
{
    /* Connected socket: try the route cached on the socket first. */
    if (connected)
        rt = (struct rtable *)sk_dst_check(sk, 0);
    /* No usable cached route: do a full output route lookup. */
    if (rt == NULL) {
        rt = ip_route_output_flow(net, fl4, sk);
        /* Only connected sockets cache the result; dst_clone() supplies
         * the reference that is stored on the socket.
         */
        if (connected)
            sk_dst_set(sk, dst_clone(&rt->dst));
    }
 
    /* Build the skb; rt is passed by address to ip_make_skb(). */
    skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen,
            sizeof(struct udphdr), &ipc, &rt,
            msg->msg_flags);
}

对于TCP调用IP层发送数据包时(调用ip_queue_xmit),检测sock结构中出口路由缓存,如果有效,设置到skb结构体中。否则重新进行出口路由查找。

/* Note: skb->sk can be different from sk, in case of tunnels */
/*
 * Abridged excerpt: shows how the output route for @skb is chosen - a route
 * already on the skb, else the socket's cached route, else a fresh lookup.
 */
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
    struct inet_sock *inet = inet_sk(sk);
    struct net *net = sock_net(sk);
--------------------------------------
    /* Skip all of this if the packet is already routed,
     * f.e. by something like SCTP.
     */
    rcu_read_lock();
    inet_opt = rcu_dereference(inet->inet_opt);
    fl4 = &fl->u.ip4;
    rt = skb_rtable(skb);
    if (rt)
        goto packet_routed;
    /* Make sure we can route this packet. */
    rt = (struct rtable *)__sk_dst_check(sk, 0);
    if (!rt) {// no usable route cached on the socket: look one up
        __be32 daddr;
        /* Use correct destination address if we have options. */
        daddr = inet->inet_daddr;
        if (inet_opt && inet_opt->opt.srr)
            daddr = inet_opt->opt.faddr;

        /* If this fails, retransmit mechanism of transport layer will
         * keep trying until route appears or the connection times
         * itself out.
         */
        rt = ip_route_output_ports(net, fl4, sk, daddr, inet->inet_saddr,inet->inet_dport,inet->inet_sport,
                       sk->sk_protocol, RT_CONN_FLAGS(sk),sk->sk_bound_dev_if);
-----------------------------------------
        /* Hand the new route to the socket (presumably this is where it
         * gets cached in sk_dst_cache - confirm in sk_setup_caps()).
         */
        sk_setup_caps(sk, &rt->dst);
    }
    /* Attach the route without taking a reference; skb_dst_set_noref()
     * warns unless an RCU read-side section is active, which the
     * rcu_read_lock() above provides.
     */
    skb_dst_set_noref(skb, &rt->dst);
}

 

入口路由缓存

  对于接收到的数据包,一种情况是通过early_demux获取缓存路由,例如,在函数tcp_v4_early_demux中,通过sock结构体成员sk_rx_dst中的路由缓存初始化skb的dst,顾名思义,此时缓存的为入口路由。这里使用设置函数skb_dst_set_noref,不增加dst的引用计数,而是依赖关联的sock成员sk_rx_dst所持有的引用计数,可保障在sock存续期间,skb的dst可安全使用;当sock释放时,关联的skb会一并释放。另一种情况是直接查询入口路由(ip_route_input_noref),并缓存到skb中。

/*
 * Abridged excerpt of the IPv4 receive path, showing the two ways an
 * incoming skb obtains its input route.
 */
static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb)
{
    /* Early demux is attempted only when the sysctl is enabled and the skb
     * has neither a dst nor an owning socket yet; the protocol handler may
     * then attach a socket and its cached route to the skb.
     */
    if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
        ipprot->early_demux(skb);
    }
 
    /* Still no valid dst on the skb: do a regular input route lookup. */
    if (!skb_valid_dst(skb)) {
        int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                           iph->tos, skb->dev);
    }
}

对于:sysctl_ip_early_demux

  当内核接收到一个TCP数据包时,首先需要查找skb对应的路由,然后查找skb对应的socket。David Miller发现这样做是一种浪费:属于同一个socket(只考虑ESTABLISHED情况)的数据包,其路由是相同的。如果能将skb的路由缓存到socket(skb->sk)中,那么只需查找一次skb所属的socket,就可以顺便把路由找到了。于是David Miller提交了一个patch“ipv4: Early TCP socket demux”。然而Davem添加的这个patch是有局限的:对于转发的数据包,它在查找路由之前增加了一次查找socket的逻辑,可能导致转发效率降低。
Alexander Duyck提出增加一个ip_early_demux参数来控制是否启动这个特性。

SOCK入口路由与SKB路由缓存

  内核在接收流程中,调用early_demux函数提前在IP层做established状态的sock查找,并负责将sock结构体成员sk_rx_dst的路由缓存赋值给skb成员_skb_refdst。

对于UDP协议,先查找目标对应的sk,并关联skb和sk;再判断dst的DST_NOCACHE标志:如果置位,则先增加dst引用计数,再调用skb_dst_set设置skb的dst;否则,调用skb_dst_set_noref直接进行设置。

/*
 * Early demux for IPv4 TCP: look up an established socket for the incoming
 * segment and, if that socket holds a still-valid cached input route for
 * the same incoming interface, attach the route to the skb without taking
 * a reference.
 */
void tcp_v4_early_demux(struct sk_buff *skb)
{
    const struct iphdr *ip4h;
    const struct tcphdr *tcph;
    struct dst_entry *rx_dst;
    struct sock *sk;

    if (skb->pkt_type != PACKET_HOST)
        return;

    /* Bail out unless a full struct tcphdr can be pulled. */
    if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
        return;

    ip4h = ip_hdr(skb);
    tcph = tcp_hdr(skb);

    /* Data offset smaller than the basic header length: ignore. */
    if (tcph->doff < sizeof(struct tcphdr) / 4)
        return;

    sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
                       ip4h->saddr, tcph->source,
                       ip4h->daddr, ntohs(tcph->dest),
                       skb->skb_iif);
    if (!sk)
        return;

    /* Associate the skb with the socket that was found. */
    skb->sk = sk;
    skb->destructor = sock_edemux;

    /* The cached rx route is only read from full sockets. */
    if (!sk_fullsock(sk))
        return;

    rx_dst = READ_ONCE(sk->sk_rx_dst);
    if (!rx_dst)
        return;

    /* Drop the candidate if dst_check() no longer validates it. */
    rx_dst = dst_check(rx_dst, 0);
    if (!rx_dst)
        return;

    /* The cached route must belong to the interface the packet came in on. */
    if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif)
        return;

    skb_dst_set_noref(skb, rx_dst);
}
void udp_v4_early_demux(struct sk_buff *skb)
{
    struct net *net = dev_net(skb->dev);
    const struct iphdr *iph;
    const struct udphdr *uh;
    struct sock *sk;
    struct dst_entry *dst;
    int dif = skb->dev->ifindex;
    int ours;

    /* validate the packet */
    if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
        return;

    iph = ip_hdr(skb);
    uh = udp_hdr(skb);

    if (skb->pkt_type == PACKET_BROADCAST ||
        skb->pkt_type == PACKET_MULTICAST) {
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);

        if (!in_dev)
            return;

        /* we are supposed to accept bcast packets */
        if (skb->pkt_type == PACKET_MULTICAST) {
            ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
                           iph->protocol);
            if (!ours)
                return;
        }

        sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
                           uh->source, iph->saddr, dif);
    } else if (skb->pkt_type == PACKET_HOST) {
        sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
                         uh->source, iph->saddr, dif);
    } else {
        return;
    }

    if (!sk)
        return;

    skb->sk = sk;
    skb->destructor = sock_efree;
    dst = READ_ONCE(sk->sk_rx_dst);

    if (dst)
        dst = dst_check(dst, 0);
    if (dst) {
        /* DST_NOCACHE can not be used without taking a reference */
        if (dst->flags & DST_NOCACHE) {
            if (likely(atomic_inc_not_zero(&dst->__refcnt)))
                skb_dst_set(skb, dst);
        } else {
            skb_dst_set_noref(skb, dst);
        }
    }
}
View Code

入口路由缓存之TCP

a)作为服务端,三次握手完成时,在函数tcp_v4_syn_recv_sock中创建子sock时赋值;
b)作为客户端,在函数tcp_finish_connect中赋值;
c)在函数tcp_rcv_established中sock处于established状态时,更新其值;

/*
 * Abridged excerpt (server side): after the child socket is created, its
 * rx dst is set from the skb via inet_sk_rx_dst_set().
 */
struct sock *tcp_v4_syn_recv_sock()
{
    newsk = tcp_create_openreq_child(sk, req, skb);
    /* Initialise the child's cached input route from this skb. */
    inet_sk_rx_dst_set(newsk, skb);
}
/*
 * Abridged excerpt (client side): when an skb is available, the socket's
 * rx dst is set through the address-family callback.
 */
void tcp_finish_connect()
{
    // sk_rx_dst_set is a per-address-family callback: inet_sk_rx_dst_set for IPv4, inet6_sk_rx_dst_set for IPv6
    if (skb)
        icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
}
/*
 * Abridged excerpt: in the established receive path the rx dst is
 * (re)populated only when the socket currently has none.
 */
void tcp_rcv_established()
{
    /* Hinted as the unlikely case: sk_rx_dst is NULL, so set it from skb. */
    if (unlikely(!sk->sk_rx_dst))
        inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
}

 

sk_rx_dst合法判断

  在函数tcp_v4_early_demux与tcp_v4_do_rcv中判断sk_rx_dst的合法性,此判断仅在sock状态为TCP_ESTABLISHED时进行。由于作为服务端,会接收到来自于多个接口的客户端请求,所以除需判断缓存路由是否过期外(dst->ops->check(dst, 0)),还需要判断其接口索引(rx_dst_ifindex)是否与此时报文的入接口相同(skb_iif)。两个条件有一个不满足,就释放缓存的路由项。其后会在tcp_rcv_established函数中更新。

/*
 * Abridged excerpt: only the ESTABLISHED fast path is shown. It validates
 * the socket's cached input route before handing the segment to
 * tcp_rcv_established().
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
    struct sock *rsk;

    if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
        struct dst_entry *dst = sk->sk_rx_dst;

        sock_rps_save_rxhash(sk, skb);
        sk_mark_napi_id(sk, skb);
        if (dst) {
            /* Drop the cached route when the packet arrived on a
             * different interface than the one recorded, or when the
             * dst's check() callback rejects it.
             */
            if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
                !dst->ops->check(dst, 0)) {
                dst_release(dst);
                /* Clear the pointer so the released dst is not reused. */
                sk->sk_rx_dst = NULL;
            }
        }
        tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
        return 0;
    }

}

 

  UDP不区分客户端或服务端,都于接收报文时,初始化sk_rx_dst缓存,与TCP不同,udp不记录此路由缓存的入接口索引(rx_dst_ifindex),在合法性检查时仅看是否过期(dst_check)

/*
 * Abridged excerpt: replace the socket's cached input route with @dst.
 */
static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
{
    /* xchg() swaps in the new pointer and returns the previous one in old;
     * what happens to old is elided from this excerpt.
     */
    old = xchg(&sk->sk_rx_dst, dst);
}
 
/*
 * Abridged excerpt: the only validity test applied to the UDP socket's
 * cached input route here is dst_check(); no interface index is compared.
 */
void udp_v4_early_demux(struct sk_buff *skb)
{
    dst = READ_ONCE(sk->sk_rx_dst);
    /* dst_check()'s result replaces dst, so a rejected route becomes NULL
     * (assuming dst_check() returns NULL on failure - confirm).
     */
    if (dst)
        dst = dst_check(dst, 0);
}
 
/*
 * Abridged excerpt of the UDP receive path: when the skb already carries a
 * socket, refresh that socket's cached input route if it differs from the
 * skb's dst.
 */
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
           int proto)
{
    /* Take over the socket already attached to the skb, if any. */
    sk = skb_steal_sock(skb);
    if (sk) {
        struct dst_entry *dst = skb_dst(skb);
 // unlikely(): the cached route is expected to change rarely, so the update is hinted as the cold branch
        if (unlikely(sk->sk_rx_dst != dst))
            udp_sk_rx_dst_set(sk, dst);
    }
}

 

在用户层应用关闭socket时,于inet层释放缓存的入口路由缓存:

/*
 * Abridged excerpt of the inet socket destructor: shows only the release
 * of the cached input route.
 */
void inet_sock_destruct(struct sock *sk)
{
------------------------------
    /* Release the socket's cached input route. */
    dst_release(sk->sk_rx_dst);
------------------------
}

 

转发路由缓存

  转发路由缓存与入口路由缓存的查找方法相同,同样是在ip_rcv_finish函数中获得。此时,FIB(fib_lookup)查询出的路由类型不是之前的RTN_LOCAL,路由dst的input函数指针设置为ip_forward,output函数指针设置为ip_output。这样在转发过程中可避免重复查找路由

/*
 * Abridged excerpt: allocate the route entry for a forwarded packet and
 * install the forwarding handlers on its dst.
 */
static int __mkroute_input(...)
{
    /* Allocate the route on the output device; the policy/xfrm settings
     * are read from the input and output devices' configuration.
     */
    rth = rt_dst_alloc(out_dev->dev,
               IN_DEV_CONF_GET(in_dev, NOPOLICY),
               IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
 
    /* Input handler forwards the packet; output handler is ip_output. */
    rth->dst.input = ip_forward;
    rth->dst.output = ip_output;
}
 
/*
 * Abridged excerpt of the slow-path input route lookup, showing only the
 * forwarding branch.
 */
static int ip_route_input_slow(...)
{
    /* Forwarding disabled on the input device: jump to no_route. */
    if (!IN_DEV_FORWARD(in_dev))
        goto no_route;
 
    // Eventually ends up calling __mkroute_input.
    err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:    return err;
}

 

 

 

posted @ 2022-03-05 12:58  codestacklinuxer  阅读(211)  评论(0)    收藏  举报