TSO-GSO reading

对 TCP,在网卡不支持 TSO 时,使用和不使用 GSO 的情形

 

TSO  :

分析:IP 层发包时,如果是 GSO 报文,会调用 ip_finish_output_gso 来处理:

/*
 * Finish IPv4 output for one packet.
 *
 * With netfilter and xfrm both configured, a packet whose dst carries an
 * xfrm entry is flagged IPSKB_REROUTED and sent through dst_output_sk()
 * again.  Otherwise GSO packets go to ip_finish_output_gso(), non-GSO
 * packets longer than the destination MTU are fragmented, and everything
 * else is passed straight to ip_finish_output2().
 */
static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
    /* Policy lookup after SNAT yielded a new policy */
    if (skb_dst(skb)->xfrm) {
        /* Mark as re-routed before handing the packet back to the dst output. */
        IPCB(skb)->flags |= IPSKB_REROUTED;
        return dst_output_sk(sk, skb);
    }
#endif
    if (!skb_is_gso(skb)) {
        /* Non-GSO: send as is when it fits the MTU, otherwise fragment. */
        if (skb->len <= ip_skb_dst_mtu(skb))
            return ip_finish_output2(sk, skb);

        return ip_fragment(sk, skb, ip_finish_output2);
    }

    /* GSO packets have their own output helper. */
    return ip_finish_output_gso(sk, skb);
}

 

 

/*
 * Output a GSO packet at the IP layer.
 *
 * A packet without IPSKB_FORWARDED set, or whose per-segment network length
 * fits the destination MTU, is sent unchanged through ip_finish_output2().
 * Otherwise it is segmented in software and every resulting segment is run
 * through ip_fragment().
 *
 * Returns 0 on success, -ENOMEM when segmentation yields an error or NULL,
 * or the first non-zero error returned while sending the segments.
 */
static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb)
{
    struct sk_buff *seg, *next;
    netdev_features_t features;
    int ret = 0;

    /* common case: locally created skb or seglen is <= mtu */
    if (!(IPCB(skb)->flags & IPSKB_FORWARDED))
        return ip_finish_output2(sk, skb);
    if (skb_gso_network_seglen(skb) <= ip_skb_dst_mtu(skb))
        return ip_finish_output2(sk, skb);

    /* Slowpath -  GSO segment length is exceeding the dst MTU.
     *
     * This can happen in two cases:
     * 1) TCP GRO packet, DF bit not set
     * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly
     * from host network stack.
     */
    features = netif_skb_features(skb);
    /* Segment with every GSO feature bit masked out of the device features. */
    seg = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
    if (IS_ERR_OR_NULL(seg)) {
        kfree_skb(skb);
        return -ENOMEM;
    }

    /* The original skb is no longer needed once the segment list exists. */
    consume_skb(skb);

    for (; seg; seg = next) {
        int err;

        /* Detach the segment from the list before sending it. */
        next = seg->next;
        seg->next = NULL;
        err = ip_fragment(sk, seg, ip_finish_output2);

        /* Keep only the first error; later segments are still sent. */
        if (ret == 0)
            ret = err;
    }

    return ret;
}

所以:可知正常情况下,本地发包在 IP 层是不需要进行 GSO 分段处理的(未置位 IPSKB_FORWARDED 时直接走 ip_finish_output2);

实际上本地发包的 GSO 都是延迟到网络设备发包时再处理:一般不会在 IP 层处理,而是在网络设备层处理——当硬件不支持时进行软件 GSO。

 

  1. 如果当前报文是 GSO 数据包,并且物理设备不支持此种 GSO 类型(或报文带有 frag_list 而设备不支持 NETIF_F_FRAGLIST),或者报文的 ip_summed 既不是 CHECKSUM_PARTIAL 也不是 CHECKSUM_UNNECESSARY,则直接进入软件 GSO 逻辑处理
    /*
     * Report whether a GSO skb can be handled with the given feature set:
     * net_gso_ok() must accept the skb's gso_type, and an skb that has a
     * frag_list additionally requires NETIF_F_FRAGLIST.
     */
    static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
    {
        if (!net_gso_ok(features, skb_shinfo(skb)->gso_type))
            return false;

        if (!skb_has_frag_list(skb))
            return true;

        return (features & NETIF_F_FRAGLIST) != 0;
    }
    /*
     * Decide whether a packet must be segmented in software before it is
     * given to the device.
     *
     * Only GSO packets (skb_is_gso()) qualify.  Software GSO is needed when
     * skb_gso_ok() rejects the packet for this feature set, or when
     * ip_summed is neither CHECKSUM_PARTIAL nor CHECKSUM_UNNECESSARY.
     */
    static inline bool netif_needs_gso(struct sk_buff *skb,
                       netdev_features_t features)
    {
        if (!skb_is_gso(skb))
            return false;

        if (!skb_gso_ok(skb, features))
            return true;

        /* Rare case: the checksum state is neither PARTIAL nor UNNECESSARY. */
        return unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
                (skb->ip_summed != CHECKSUM_UNNECESSARY));
    }

     

 

 

 

 

/*
 * Validate one skb against the device features before transmission.
 *
 * After VLAN validation, a packet that needs software GSO is segmented and
 * replaced by its segment list.  Any other packet is linearized when
 * required, and a CHECKSUM_PARTIAL packet has its checksum completed in
 * software when the features contain no NETIF_F_CSUM_MASK bit.
 *
 * Returns the skb (or segment list) to transmit, or NULL after counting a
 * drop in dev->tx_dropped.
 */
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
    netdev_features_t features = netif_skb_features(skb);

    skb = validate_xmit_vlan(skb, features);
    if (unlikely(!skb))
        goto out_null;

    if (netif_needs_gso(skb, features)) {
        /* Software GSO path. */
        struct sk_buff *segs = skb_gso_segment(skb, features);

        if (IS_ERR(segs))
            goto out_kfree_skb;

        /* A NULL result means the original skb is sent unchanged. */
        if (segs) {
            consume_skb(skb);
            skb = segs;
        }
        return skb;
    }

    /* Linearize when the features require it; a failure drops the packet. */
    if (skb_needs_linearize(skb, features) && __skb_linearize(skb))
        goto out_kfree_skb;

    /* Nothing more to do unless a checksum is still pending. */
    if (skb->ip_summed != CHECKSUM_PARTIAL)
        return skb;

    /* If packet is not checksummed and device does not
     * support checksumming for this protocol, complete
     * checksumming here.
     */
    if (skb->encapsulation)
        skb_set_inner_transport_header(skb,
                           skb_checksum_start_offset(skb));
    else
        skb_set_transport_header(skb,
                     skb_checksum_start_offset(skb));

    if (!(features & NETIF_F_CSUM_MASK) && skb_checksum_help(skb))
        goto out_kfree_skb;

    return skb;

out_kfree_skb:
    kfree_skb(skb);
out_null:
    atomic_long_inc(&dev->tx_dropped);
    return NULL;
}

/*
 * Excerpt of sch_direct_xmit().  The dashed lines mark code that the author
 * elided, so this listing is incomplete and not compilable as shown.
 *
 * Visible behaviour: optionally validates the skb list, then, if an skb
 * remains, takes the TX lock and calls dev_hard_start_xmit().
 */
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
            struct net_device *dev, struct netdev_queue *txq,
            spinlock_t *root_lock, bool validate)
{
    int ret = NETDEV_TX_BUSY;
---------------------------------------------

    /* Note that we validate skb (GSO, checksum, ...) outside of locks */
    if (validate) /* packet validation: GSO segmentation, checksum computation */
        skb = validate_xmit_skb_list(skb, dev);

    if (likely(skb)) {
        HARD_TX_LOCK(dev, txq, smp_processor_id());
        /* NOTE(review): per the original author, when txq is stopped
         * (QUEUE_STATE_ANY_XOFF_OR_FROZEN set) ret stays NETDEV_TX_BUSY, and
         * when txq is running dev_hard_start_xmit() is called.  The guarding
         * check itself is not shown in this excerpt - confirm against the
         * full function. */
            skb = dev_hard_start_xmit(skb, dev, txq, &ret); /* hand the packet(s) to the driver */

-----------------------------------

}

 

 

 

 

 

 

 

看下 gso 的处理方式:入口函数skb_gso_segment

这个函数将skb分片,并返回一个skb list。如果skb不需要分片则返回NULL。

 

/**
 *    __skb_gso_segment - Perform segmentation on skb.
 *    @skb: buffer to segment
 *    @features: features for the output path (see dev->features)
 *    @tx_path: whether it is called in TX path
 *
 *    Segments @skb and returns the resulting list of segments, or an
 *    ERR_PTR() on failure.
 *
 *    The result may be NULL when @skb requires no segmentation.  This is
 *    only possible when GSO is used for verifying header integrity.
 *
 *    Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
                  netdev_features_t features, bool tx_path)
{
    struct sk_buff *segs;

    if (unlikely(skb_needs_check(skb, tx_path))) {
        /* We're going to init ->check field in TCP or UDP header,
         * so make the header writable first; bail out if that fails.
         */
        int err = skb_cow_head(skb, 0);

        if (err < 0)
            return ERR_PTR(err);
    }

    /* Only report GSO partial support if it will enable us to
     * support segmentation on this frame without needing additional
     * work.
     */
    if (features & NETIF_F_GSO_PARTIAL) {
        struct net_device *dev = skb->dev;
        netdev_features_t partial_features;

        partial_features = NETIF_F_GSO_ROBUST |
                   (dev->features & dev->gso_partial_features);
        if (!skb_gso_ok(skb, features | partial_features))
            features &= ~NETIF_F_GSO_PARTIAL;
    }

    /* The GSO control block must fit into skb->cb behind the preserved bytes. */
    BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
             sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));

    /* Start at encapsulation level 0 and record the current headroom as
     * the MAC offset in the GSO control block.
     */
    SKB_GSO_CB(skb)->encap_level = 0;
    SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);

    /* The MAC header starts at the current data pointer. */
    skb_reset_mac_header(skb);
    skb_reset_mac_len(skb);

    segs = skb_mac_gso_segment(skb, features);

    /* Warn about a bad offload only when segmentation did not fail. */
    if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
        skb_warn_bad_offload(skb);

    return segs;
}

 

/**
 *    skb_mac_gso_segment - mac layer segmentation handler.
 *    @skb: buffer to segment
 *    @features: features for the output path (see dev->features)
 *
 *    Determines the network protocol of @skb, pulls the bytes reported in
 *    front of the network header and calls the gso_segment callback of the
 *    first matching entry in offload_base.  The data pointer is put back on
 *    the MAC header before returning.
 *
 *    Returns the callback's result, ERR_PTR(-EINVAL) when the protocol
 *    cannot be determined, or ERR_PTR(-EPROTONOSUPPORT) when no matching
 *    offload with a gso_segment callback is registered.
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
                    netdev_features_t features)
{
    struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
    struct packet_offload *po;
    int depth = skb->mac_len;    /* updated by skb_network_protocol() */
    __be16 proto = skb_network_protocol(skb, &depth);

    if (unlikely(!proto))
        return ERR_PTR(-EINVAL);

    /* Skip the link-layer part so data points at the network header. */
    __skb_pull(skb, depth);

    rcu_read_lock();
    list_for_each_entry_rcu(po, &offload_base, list) {
        if (po->type != proto || !po->callbacks.gso_segment)
            continue;

        /* First match wins. */
        segs = po->callbacks.gso_segment(skb, features);
        break;
    }
    rcu_read_unlock();

    /* Restore the data pointer to the MAC header. */
    __skb_push(skb, skb->data - skb_mac_header(skb));

    return segs;
}

IP层对GSO的支持  

如果需要做 GSO 分段,则先进入 IP 层的分段处理函数;该函数的主要工作是调用传输层(TCP/UDP)的分段处理函数,等传输层分段完成后,再重新设置每个分段 skb 的 IP 头并计算 IP 头校验和。

/*
 * IPv4 GSO segmentation handler.
 *
 * Validates the IPv4 header, pulls it, calls the gso_segment callback
 * registered in inet_offloads[] for the header's protocol, then walks the
 * returned segment list and rewrites each segment's IPv4 header (id,
 * frag_off, tot_len, header checksum).
 *
 * Returns whatever the L4 callback returned (segment list, NULL or
 * ERR_PTR), ERR_PTR(-EINVAL) for an unexpected gso_type or a malformed
 * header, or ERR_PTR(-EPROTONOSUPPORT) when no L4 callback is available.
 */
static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                    netdev_features_t features)
{
    struct sk_buff *segs = ERR_PTR(-EINVAL);
    const struct net_offload *ops;
    unsigned int offset = 0;
    bool udpfrag, encap;
    struct iphdr *iph;
    int proto;
    int nhoff;
    int ihl;
    int id;
    /* Reject the skb if gso_type has any bit set outside the list below. */
    if (unlikely(skb_shinfo(skb)->gso_type &
             ~(SKB_GSO_TCPV4 |
               SKB_GSO_UDP |
               SKB_GSO_DODGY |
               SKB_GSO_TCP_ECN |
               SKB_GSO_GRE |
               SKB_GSO_GRE_CSUM |
               SKB_GSO_IPIP |
               SKB_GSO_SIT |
               SKB_GSO_TCPV6 |
               SKB_GSO_UDP_TUNNEL |
               SKB_GSO_UDP_TUNNEL_CSUM |
               SKB_GSO_TUNNEL_REMCSUM |
               0)))
        goto out;
 
    skb_reset_network_header(skb);
    nhoff = skb_network_header(skb) - skb_mac_header(skb);    /* offset of the IP header from the MAC header */
    if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))    /* need at least a basic IP header */
        goto out;
 
    iph = ip_hdr(skb);
    /* Sanity-check the header length field. */
    ihl = iph->ihl * 4;        /* IP header length in bytes; the L4 header starts right after it */
    if (ihl < sizeof(*iph))
        goto out;
 
    id = ntohs(iph->id);        /* IP id, host byte order */
    proto = iph->protocol;        /* L4 protocol number, used to look up the L4 offload below */
 
    /* Warning: after this point, iph might be no longer valid */
    /* Check again, this time against the full header length from ihl. */
    if (unlikely(!pskb_may_pull(skb, ihl)))    /* can data be advanced to the L4 header? */
        goto out;
    __skb_pull(skb, ihl);        /* data now points at the transport header */
 
    encap = SKB_GSO_CB(skb)->encap_level > 0;
    if (encap)
        features &= skb->dev->hw_enc_features;        /* inner header: restrict to hw_enc_features */
    SKB_GSO_CB(skb)->encap_level += ihl;    /* a non-zero level is what the encap test above detects on a later pass */
 
    skb_reset_transport_header(skb);    /* transport header = current data */
 
    segs = ERR_PTR(-EPROTONOSUPPORT);
 
    if (skb->encapsulation &&
        skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
        udpfrag = proto == IPPROTO_UDP && encap;
    else
        udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;        /* encapsulated skbs of other tunnel types get udpfrag == false here */
 
    ops = rcu_dereference(inet_offloads[proto]);    /* offload ops of the upper-layer protocol */
    if (likely(ops && ops->callbacks.gso_segment))
        segs = ops->callbacks.gso_segment(skb, features);    /* L4 (e.g. TCP or UDP) segmentation callback */
 
    if (IS_ERR_OR_NULL(segs))
        goto out;
 
    skb = segs;    /* walk the segments and fix up each IP header */
    do {
        iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);    /* IP header of this segment: MAC header + saved offset */
        if (udpfrag) {                /* segments are emitted as IP fragments of one datagram */
            iph->id = htons(id);    /* same id on every fragment */
            iph->frag_off = htons(offset >> 3);    /* fragment offset in 8-byte units */
            if (skb->next)
                iph->frag_off |= htons(IP_MF);    /* more fragments follow */
            offset += skb->len - nhoff - ihl;    /* payload bytes consumed; used by the next fragment */
        } else {
            iph->id = htons(id++);        /* each segment is a complete IP packet with its own id */
        }
        iph->tot_len = htons(skb->len - nhoff);
        ip_send_check(iph);                /* recompute the IP header checksum */
        if (encap)        /* inner header pass: record the inner header offsets */
            skb_reset_inner_headers(skb);
        skb->network_header = (u8 *)iph - skb->head;    /* network header offset relative to skb->head */
    } while ((skb = skb->next));
 
out:
    return segs;
}

 TCP层对GSO的支持 

  UDP经过GSO分片后每个分片的IP头部id是一样的,这个符合IP分片的逻辑,但是为什么TCP的GSO分片,IP头部的id会依次加1呢?原因是: tcp建立三次握手的过程中产生合适的mss,这个mss肯定是<=网络层的最大路径MTU,然后tcp数据封装成ip数据包通过网络层发送,当服务器端传输层接收到tcp数据之后进行tcp重组。所以正常情况下tcp产生的ip数据包在传输过程中是不会发生分片的!由于GSO应该保证对外透明,所以其效果应该也和在TCP层直接分片的效果是一样的,所以这里对UDP的处理是IP分片逻辑,但对TCP的处理是构造新的skb逻辑

- 对于 GSO:

    UDP:所有分片 IP 头部的 id 都相同,除最后一片外都设置 IP_MF 分片标志(等同于 IP 分片)

    TCP:分片后,每个分片 IP 头部中的 id 依次加 1(等同于 TCP 分段)

/*
 * GSO segmentation callback for TCP over IPv4.
 *
 * Requires a full basic TCP header to be pullable.  When the skb is not in
 * CHECKSUM_PARTIAL state, the TCP check field is cleared, the skb is
 * switched to CHECKSUM_PARTIAL and __tcp_v4_send_check() is called with the
 * IPv4 source/destination addresses.  Segmentation itself is done by
 * tcp_gso_segment().
 *
 * Returns tcp_gso_segment()'s result, or ERR_PTR(-EINVAL) when the TCP
 * header cannot be pulled.
 */
static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
                    netdev_features_t features)
{
    if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
        return ERR_PTR(-EINVAL);

    if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
        const struct iphdr *ip4h = ip_hdr(skb);

        /* Set up checksum pseudo header, usually expect stack to
         * have done this already.
         */
        tcp_hdr(skb)->check = 0;
        skb->ip_summed = CHECKSUM_PARTIAL;
        __tcp_v4_send_check(skb, ip4h->saddr, ip4h->daddr);
    }

    return tcp_gso_segment(skb, features);
}
/*
 * Segment a TCP GSO skb into a list of MSS-sized TCP segments.
 *
 * Validates the TCP header length, pulls the TCP header, and either
 * (a) only recomputes gso_segs and returns NULL when skb_gso_ok() accepts
 *     the skb for features | NETIF_F_GSO_ROBUST, or
 * (b) calls skb_segment() and then fixes up each segment's TCP header
 *     (fin/psh/cwr flags, sequence number, checksum).
 *
 * Returns the segment list, NULL for case (a), or an ERR_PTR():
 * ERR_PTR(-EINVAL) for a bad header length, a payload not larger than one
 * MSS or an unexpected gso_type, otherwise the error from skb_segment().
 */
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
                netdev_features_t features)
{
    struct sk_buff *segs = ERR_PTR(-EINVAL);
    unsigned int sum_truesize = 0;
    struct tcphdr *th;
    unsigned int thlen;
    unsigned int seq;
    __be32 delta;
    unsigned int oldlen;
    unsigned int mss;
    struct sk_buff *gso_skb = skb;
    __sum16 newcheck;
    bool ooo_okay, copy_destructor;
 
    th = tcp_hdr(skb);
    thlen = th->doff * 4;        /* TCP header length in bytes */
    if (thlen < sizeof(*th))
        goto out;
 
    if (!pskb_may_pull(skb, thlen)) /* re-check against the length taken from the header */
        goto out;
    /* Remember the one's complement of the current length (data still starts
     * at the TCP header) for the checksum deltas computed below. */
    oldlen = (u16)~skb->len;
    __skb_pull(skb, thlen);        /* data now points at the TCP payload */
 
    mss = tcp_skb_mss(skb);        /* segment payload size */
    if (unlikely(skb->len <= mss))
        goto out;
 
    if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
        /* Packet is from an untrusted source, reset gso_segs. */
        int type = skb_shinfo(skb)->gso_type;
 
        if (unlikely(type &
                 ~(SKB_GSO_TCPV4 |
                   SKB_GSO_DODGY |
                   SKB_GSO_TCP_ECN |
                   SKB_GSO_TCPV6 |
                   SKB_GSO_GRE |
                   SKB_GSO_GRE_CSUM |
                   SKB_GSO_IPIP |
                   SKB_GSO_SIT |
                   SKB_GSO_UDP_TUNNEL |
                   SKB_GSO_UDP_TUNNEL_CSUM |
                   SKB_GSO_TUNNEL_REMCSUM |
                   0) ||
                 !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
            goto out; /* gso_type has a bit outside the accepted set, or no TCP bit */
 
        skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); /* recompute the segment count, then return without segmenting */
 
        segs = NULL;
        goto out;
    }
 
    copy_destructor = gso_skb->destructor == tcp_wfree;
    ooo_okay = gso_skb->ooo_okay;
    /* All segments but the first should have ooo_okay cleared */
    skb->ooo_okay = 0;
 
    segs = skb_segment(skb, features);    /* split the payload according to the MSS */
    if (IS_ERR(segs))
        goto out;
 
    /* Only first segment might have ooo_okay set */
    segs->ooo_okay = ooo_okay;
 
    delta = htonl(oldlen + (thlen + mss));    /* checksum adjustment: new length (TCP header + mss) plus the complemented old length */
 
    skb = segs;
    th = tcp_hdr(skb);    /* NOTE(review): relies on the segmentation code leaving a valid transport header on each segment - confirm */
    seq = ntohl(th->seq);
 
    if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
        tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
 
    newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +    /* derived from the first segment's check value by applying delta */
                           (__force u32)delta));
 
    do {    /* update the TCP header of every segment except the last */
        th->fin = th->psh = 0;
        th->check = newcheck;
        /* Without CHECKSUM_PARTIAL the checksum is finalised in software. */
        if (skb->ip_summed != CHECKSUM_PARTIAL)       
            th->check = gso_make_checksum(skb, ~th->check);     /* recompute the check value */
 
        seq += mss; /* sequence number of the next segment */
        if (copy_destructor) {
            skb->destructor = gso_skb->destructor;
            skb->sk = gso_skb->sk;
            sum_truesize += skb->truesize;
        }
        /* Advance to the next segment; it is dereferenced unconditionally,
         * so this presumably relies on skb_segment() having produced at
         * least two segments (skb->len > mss was checked above). */
        skb = skb->next;
        th = tcp_hdr(skb);
 
        th->seq = htonl(seq);
        th->cwr = 0;
    } while (skb->next);
 
    /* Following permits TCP Small Queues to work well with GSO :
     * The callback to TCP stack will be called at the time last frag
     * is freed at TX completion, and not right now when gso_skb
     * is freed by GSO engine
     */
    if (copy_destructor) {
        swap(gso_skb->sk, skb->sk);
        swap(gso_skb->destructor, skb->destructor);
        sum_truesize += skb->truesize;
        atomic_add(sum_truesize - gso_skb->truesize,
               &skb->sk->sk_wmem_alloc);
    }
 
    delta = htonl(oldlen + (skb_tail_pointer(skb) -
                skb_transport_header(skb)) +    /* the last segment gets its own delta, based on its actual length */
              skb->data_len);
    th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
                (__force u32)delta));
    if (skb->ip_summed != CHECKSUM_PARTIAL)
        th->check = gso_make_checksum(skb, ~th->check);    /* recompute the check value */
out:
    return segs;
}

 

skb_segment 是实现封装报文 GSO 分段的基础

 

/**
 *    skb_segment - Perform protocol segmentation on skb.
 *    @head_skb: buffer to segment
 *    @features: features for the output path (see dev->features)
 *
 *    This function performs segmentation on the given skb.  It returns
 *    a pointer to the first in a list of new skbs for the segments.
 *    In case of error it returns ERR_PTR(err).
 *
 *    Each segment carries a copy of the headers in front of the payload
 *    (doffset bytes, plus any tunnel header) and up to gso_size bytes of
 *    payload.  Payload comes from the linear area, the frags array or the
 *    frag_list of @head_skb.
 */
struct sk_buff *skb_segment(struct sk_buff *head_skb,
                netdev_features_t features)
{
    struct sk_buff *segs = NULL;
    struct sk_buff *tail = NULL;
    struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
    skb_frag_t *frag = skb_shinfo(head_skb)->frags;    
    unsigned int mss = skb_shinfo(head_skb)->gso_size;
    /* MAC + IP + TCP headers, or MAC + IP headers (presumably the UDP caller
     * does not pull its L4 header before calling - confirm). */
    unsigned int doffset = head_skb->data - skb_mac_header(head_skb);  /* bytes of headers between the MAC header and the current data pointer */
    struct sk_buff *frag_skb = head_skb;
    unsigned int offset = doffset;
    unsigned int tnl_hlen = skb_tnl_header_len(head_skb);    /* tunnel (outer) header length; per the original author 0 for non-encapsulated packets - confirm */
    unsigned int headroom;
    unsigned int len;
    __be16 proto;
    bool csum;
    int sg = !!(features & NETIF_F_SG);    /* scatter-gather supported? */
    int nfrags = skb_shinfo(head_skb)->nr_frags;
    int err = -ENOMEM;
    int i = 0;
    int pos;
    int dummy;
 
    __skb_push(head_skb, doffset);        /* move data back to the MAC header */
    proto = skb_network_protocol(head_skb, &dummy);    /* network protocol of the packet */
    if (unlikely(!proto))
        return ERR_PTR(-EINVAL);
 
    csum = !head_skb->encap_hdr_csum &&
        !!can_checksum_protocol(features, proto);
 
    headroom = skb_headroom(head_skb);    /* headroom of the original skb */
    pos = skb_headlen(head_skb);        /* length of the linear area */
 
    do {
        struct sk_buff *nskb;
        skb_frag_t *nskb_frag;
        int hsize;
        int size;
        /* offset is the number of bytes already consumed; it starts at the
         * header length (doffset) and grows by len per segment.  len is the
         * payload length of the segment being built: the remaining bytes,
         * capped at mss (so only the last segment can be shorter).
         */
        len = head_skb->len - offset;    /* bytes still to be copied, headers excluded */
        if (len > mss)   /* len becomes the payload length of this segment */
            len = mss;        /* never more than one mss per segment */
        /* hsize is the part of this segment's payload that still lies in
         * the linear area of head_skb.  A negative value means the linear
         * area is exhausted and the payload is in frags or frag_list. */
        hsize = skb_headlen(head_skb) - offset;    /* linear bytes left to copy */
        if (hsize < 0)
            hsize = 0;    /* no payload left in the linear area */
        if (hsize > len || !sg)
            hsize = len;    /* without NETIF_F_SG, or when enough linear data remains, the whole segment goes into the new linear area */
        /* Linear area and frags array exhausted: take the payload from the
         * next skb on frag_list, provided its linear part is usable. */
        if (!hsize && i >= nfrags && skb_headlen(list_skb) && 
            (skb_headlen(list_skb) == len || sg)) {
            BUG_ON(skb_headlen(list_skb) > len);    /* linear part of a frag_list skb must not exceed len */
 
            i = 0;
            nfrags = skb_shinfo(list_skb)->nr_frags;
            frag = skb_shinfo(list_skb)->frags;
            frag_skb = list_skb;
            pos += skb_headlen(list_skb);    /* account for the linear part */
 
            while (pos < offset + len) {    /* consume whole frags up to len bytes */
                BUG_ON(i >= nfrags);
 
                size = skb_frag_size(frag);
                if (pos + size > offset + len)
                    break;
 
                i++;
                pos += size;        /* account for this frag */
                frag++;
            }
            /* The frag_list data is not copied: the skb is cloned and its
             * data reused. */
            nskb = skb_clone(list_skb, GFP_ATOMIC);    /* the clone holds all the data and is trimmed below */
            list_skb = list_skb->next;
 
            if (unlikely(!nskb))
                goto err;
 
            if (unlikely(pskb_trim(nskb, len))) {    /* trim the clone to len bytes */
                kfree_skb(nskb);
                goto err;
            }
 
            hsize = skb_end_offset(nskb); /* remember the end offset so the truesize change can be computed after skb_cow_head() */
            if (skb_cow_head(nskb, doffset + headroom)) {    /* make room in front for the headers */
                kfree_skb(nskb);
                goto err;
            }
            /* Adjust truesize by the change in end offset. */
            nskb->truesize += skb_end_offset(nskb) - hsize;
            skb_release_head_state(nskb);
            __skb_push(nskb, doffset);    /* data now points at the MAC header position */
        } else {
            /* len bytes are needed for this segment, hsize of them from the
             * linear area: allocate a fresh skb. */
            nskb = __alloc_skb(hsize + doffset + headroom,
                       GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
                       NUMA_NO_NODE);
 
            if (unlikely(!nskb))
                goto err;
 
            skb_reserve(nskb, headroom);    /* reserve the same headroom as the original */
            __skb_put(nskb, doffset);    /* extend the linear area by the header length */
        }
 
        if (segs)
            tail->next = nskb;
        else
            segs = nskb;
        tail = nskb;
 
        __copy_skb_header(nskb, head_skb);    /* copy skb metadata, including the header offsets */
 
        skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);    /* shift header offsets by the headroom difference */
        skb_reset_mac_len(nskb);    /* recompute mac_len */
        /* Copy doffset bytes of headers, plus tnl_hlen bytes that sit in
         * front of head_skb->data, into the new skb. */
        skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
                         nskb->data - tnl_hlen,
                         doffset + tnl_hlen);
 
        if (nskb->len == len + doffset)        /* already holds its payload (cloned frag_list case); freshly allocated skbs do not */
            goto perform_csum_check;
 
        if (!sg && !nskb->remcsum_offload) {    /* no scatter-gather: copy the payload linearly and checksum it in one pass */
            nskb->ip_summed = CHECKSUM_NONE;
            nskb->csum = skb_copy_and_csum_bits(head_skb, offset,    /* copy and compute the checksum */
                                skb_put(nskb, len),
                                len, 0);
            SKB_GSO_CB(nskb)->csum_start =
                skb_headroom(nskb) + doffset;
            continue;
        }
 
        nskb_frag = skb_shinfo(nskb)->frags;
        /* Copy the hsize bytes (possibly zero) that come from the linear area. */
        skb_copy_from_linear_data_offset(head_skb, offset,
                         skb_put(nskb, hsize), hsize);
 
        skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
            SKBTX_SHARED_FRAG;
            
        /* The remaining (len - hsize) bytes are attached from the frags
         * array (of head_skb or of frag_list skbs) by reference. */
        while (pos < offset + len) {    
            if (i >= nfrags) {
                BUG_ON(skb_headlen(list_skb));
 
                i = 0;
                nfrags = skb_shinfo(list_skb)->nr_frags;
                frag = skb_shinfo(list_skb)->frags;
                frag_skb = list_skb;
 
                BUG_ON(!nfrags);
 
                list_skb = list_skb->next;    /* frag_list case: move on to the next skb */
            }
 
            if (unlikely(skb_shinfo(nskb)->nr_frags >=
                     MAX_SKB_FRAGS)) {
                net_warn_ratelimited(
                    "skb_segment: too many frags: %u %u\n",
                    pos, mss);
                goto err;
            }
 
            if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
                goto err;
 
            *nskb_frag = *frag;    /* copy the frag descriptor; frags of head_skb and of frag_list skbs share this loop */
            __skb_frag_ref(nskb_frag);
            size = skb_frag_size(nskb_frag);
 
            if (pos < offset) {    /* this frag was partly consumed by the previous segment */
                nskb_frag->page_offset += offset - pos;
                skb_frag_size_sub(nskb_frag, offset - pos);  /* drop the already-consumed front part */
            }
 
            skb_shinfo(nskb)->nr_frags++;
 
            if (pos + size <= offset + len) {
                i++;
                frag++;
                pos += size;
            } else {
            
                skb_frag_size_sub(nskb_frag, pos + size - (offset + len));    /* frag extends past this segment: cut off the tail */
                goto skip_fraglist;
            }
 
            nskb_frag++;
        }
 
skip_fraglist:
        nskb->data_len = len - hsize;
        nskb->len += nskb->data_len;
        nskb->truesize += nskb->data_len;
 
perform_csum_check:
        if (!csum && !nskb->remcsum_offload) {
            nskb->csum = skb_checksum(nskb, doffset,
                          nskb->len - doffset, 0);    /* software checksum over the payload */
            nskb->ip_summed = CHECKSUM_NONE;
            SKB_GSO_CB(nskb)->csum_start =
                skb_headroom(nskb) + doffset;
        }
    } while ((offset += len) < head_skb->len);
 
    /* Some callers want to get the end of the list.
     * Put it in segs->prev to avoid walking the list.
     * (see validate_xmit_skb_list() for example)
     */
    segs->prev = tail;
 
    /* Following permits correct backpressure, for protocols
     * using skb_set_owner_w().
     * Idea is to tranfert ownership from head_skb to last segment.
     */
    if (head_skb->destructor == sock_wfree) {
        swap(tail->truesize, head_skb->truesize);
        swap(tail->destructor, head_skb->destructor);
        swap(tail->sk, head_skb->sk);
    }
    return segs;
 
err:
    kfree_skb_list(segs);
    return ERR_PTR(err);
}

 

输出报文 分片:

/*
 * Fragment an IPv4 packet so that every fragment fits the destination MTU,
 * and pass each fragment to @output.
 *
 * @net:    network namespace, passed to @output and used for MIB counters
 * @sk:     socket, passed to @output and used for the MTU lookup
 * @skb:    packet to fragment; consumed or freed on every path
 * @output: callback that transmits one fragment
 *
 * Fast path: when the skb already carries a well-formed frag_list, each list
 * member is turned into a fragment in place.  Slow path: new skbs are
 * allocated and the data is copied into them fragment by fragment.
 *
 * Returns 0 on success, or the first error from skb_checksum_help(),
 * allocation (-ENOMEM) or @output.
 */
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
           int (*output)(struct net *, struct sock *, struct sk_buff *))
{
    struct iphdr *iph;
    int ptr;
    struct sk_buff *skb2;
    unsigned int mtu, hlen, left, len, ll_rs;
    int offset;
    __be16 not_last_frag;
    struct rtable *rt = skb_rtable(skb);
    int err = 0;

    /* for offloaded checksums cleanup checksum before fragmentation */
    /* CHECKSUM_PARTIAL packets get their checksum completed in software first. */
    if (skb->ip_summed == CHECKSUM_PARTIAL &&
        (err = skb_checksum_help(skb)))
        goto fail;

    /*
     *    Point into the IP datagram header.
     */

    iph = ip_hdr(skb);

    /* Look up the MTU. */
    mtu = ip_skb_dst_mtu(sk, skb);

    /* A recorded frag_max_size smaller than the MTU takes precedence. */
    if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
        mtu = IPCB(skb)->frag_max_size;

    /*
     *    Setup starting values.
     */

    hlen = iph->ihl * 4;
    mtu = mtu - hlen;    /* Size of data space */
    IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

    /* When frag_list is given, use it. First, check its validity:
     * some transformers could create wrong frag_list or break existing
     * one, it is not prohibited. In this case fall back to copying.
     *
     * LATER: this step can be merged to real generation of fragments,
     * we can switch to copy when see the first bad fragment.
     */
    /* The skb has a frag_list. */
    if (skb_has_frag_list(skb)) {
        struct sk_buff *frag, *frag2;

        /* Length of the linear area plus the paged data. */
        unsigned int first_len = skb_pagelen(skb);

        /* Any of the following forces the slow path. */
        if (first_len - hlen > mtu || /* first fragment larger than the MTU */
            ((first_len - hlen) & 7) || /* payload not a multiple of 8 bytes */
            ip_is_fragment(iph) || /* already a fragment */
            skb_cloned(skb)) /* cloned skb */
            goto slow_path;

        /* Walk the frag_list. */
        skb_walk_frags(skb, frag) {
            /* Correct geometry. */
            /* Otherwise undo the changes made so far and take the slow path. */
            if (frag->len > mtu || /* fragment larger than the MTU */
                ((frag->len & 7) && frag->next) || /* a non-final fragment is not a multiple of 8 bytes */
                skb_headroom(frag) < hlen) /* not enough headroom for the IP header */
                goto slow_path_clean;

            /* Partially cloned skb? */
            /* Shared fragment: undo and take the slow path. */
            if (skb_shared(frag))
                goto slow_path_clean;

            BUG_ON(frag->sk);

            /* Attach the fragment to the owning socket. */
            if (skb->sk) {
                frag->sk = skb->sk;
                frag->destructor = sock_wfree;
            }

            /* The head skb no longer accounts for this fragment's truesize. */
            skb->truesize -= frag->truesize;
        }

        /* Everything is OK. Generate! */

        /* The list is usable: set up the first fragment. */
        err = 0;
        offset = 0;
        frag = skb_shinfo(skb)->frag_list;
        skb_frag_list_init(skb);
        skb->data_len = first_len - skb_headlen(skb);
        skb->len = first_len;
        iph->tot_len = htons(first_len);
        iph->frag_off = htons(IP_MF);
        ip_send_check(iph);

        /* Prepare and send the fragments one by one. */
        for (;;) {
            /* Prepare header of the next frame,
             * before previous one went down. */
            /* Copy the IP header into the next fragment and set its offset. */
            if (frag) {
                frag->ip_summed = CHECKSUM_NONE;
                skb_reset_transport_header(frag);
                __skb_push(frag, hlen);
                skb_reset_network_header(frag);
                memcpy(skb_network_header(frag), iph, hlen);
                iph = ip_hdr(frag);
                iph->tot_len = htons(frag->len);
                ip_copy_metadata(frag, skb);
                if (offset == 0)
                    ip_options_fragment(frag);
                offset += skb->len - hlen;
                iph->frag_off = htons(offset>>3);
                if (frag->next)
                    iph->frag_off |= htons(IP_MF);
                /* Ready, complete checksum */
                ip_send_check(iph);
            }

            /* Hand the current fragment to the output callback. */
            err = output(net, sk, skb);

            if (!err)
                IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
            if (err || !frag)
                break;

            skb = frag;
            frag = skb->next;
            skb->next = NULL;
        }

        if (err == 0) {
            IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
            return 0;
        }

        /* Error: free the fragments that were not sent. */
        while (frag) {
            skb = frag->next;
            kfree_skb(frag);
            frag = skb;
        }
        IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
        return err;

slow_path_clean:
        /* Restore the fragments visited before the offending one. */
        skb_walk_frags(skb, frag2) {
            if (frag2 == frag)
                break;
            frag2->sk = NULL;
            frag2->destructor = NULL;
            skb->truesize += frag2->truesize;
        }
    }

slow_path:
    /* Slow path: allocate and copy. */


    iph = ip_hdr(skb);

    /* Payload bytes still to be sent (everything after the IP header). */
    left = skb->len - hlen;        /* Space per frame */
    ptr = hlen;        /* Where to start from */

    /* Room to reserve for the link-layer header. */
    ll_rs = LL_RESERVED_SPACE(rt->dst.dev);

    /*
     *    Fragment the datagram.
     */

    /* Initial offset and MF state, taken from the original header. */
    offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
    not_last_frag = iph->frag_off & htons(IP_MF);

    /*
     *    Keep copying data until we run out.
     */

    /* Fragmentation loop. */
    while (left > 0) {
        /* Start with everything that is left. */
        len = left;
        /* IF: it doesn't fit, use 'mtu' - the data space left */
        /* Cap at the available data space. */
        if (len > mtu)
            len = mtu;
        /* IF: we are not sending up to and including the packet end
           then align the next start on an eight byte boundary */
        /* Every fragment but the last is a multiple of 8 bytes. */
        if (len < left)    {
            len &= ~7;
        }

        /* Allocate buffer */
        skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
        if (!skb2) {
            err = -ENOMEM;
            goto fail;
        }

        /*
         *    Set up data on packet
         */

        /* Copy the metadata. */
        ip_copy_metadata(skb2, skb);

        /* Reserve link-layer room and set the header offsets. */
        skb_reserve(skb2, ll_rs);
        skb_put(skb2, len + hlen);
        skb_reset_network_header(skb2);
        skb2->transport_header = skb2->network_header + hlen;

        /*
         *    Charge the memory for the fragment to any owner
         *    it might possess
         */
        if (skb->sk)
            skb_set_owner_w(skb2, skb->sk);

        /*
         *    Copy the packet header into the new buffer.
         */

        skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

        /*
         *    Copy a block of the IP datagram.
         */
        if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
            BUG();
        left -= len;

        /*
         *    Fill in the new header fields.
         */
        iph = ip_hdr(skb2);

        /* Fragment offset, in 8-byte units. */
        iph->frag_off = htons((offset >> 3));

        /* Packets flagged IPSKB_FRAG_PMTU get DF set on every fragment. */
        if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
            iph->frag_off |= htons(IP_DF);

        /* ANK: dirty, but effective trick. Upgrade options only if
         * the segment to be fragmented was THE FIRST (otherwise,
         * options are already fixed) and make it ONCE
         * on the initial skb, so that all the following fragments
         * will inherit fixed options.
         */
        if (offset == 0)
            ip_options_fragment(skb);

        /*
         *    Added AC : If we are fragmenting a fragment that's not the
         *           last fragment then keep MF on each bit
         */
        /* MF is set on everything except the very last fragment. */
        if (left > 0 || not_last_frag)
            iph->frag_off |= htons(IP_MF);

        /* Advance the read position and the fragment offset. */
        ptr += len;
        offset += len;

        /*
         *    Put this fragment into the sending queue.
         */
        /* Total length of this fragment. */
        iph->tot_len = htons(len + hlen);

        /* IP header checksum. */
        ip_send_check(iph);

        /* Send the fragment. */
        err = output(net, sk, skb2);
        if (err)
            goto fail;

        IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
    }

    /* All fragments sent: release the original skb. */
    consume_skb(skb);
    IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
    return err;

fail:

    /* Error: free the original skb. */
    kfree_skb(skb);
    IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
    return err;
}

 

posted @ 2021-06-16 19:43  codestacklinuxer  阅读(524)  评论(0编辑  收藏  举报