Kernel - 帧的接收（v3.7相对v2.6.12的改变）

目前的Kernel（“目前”是指3.7，还处于rc）相对于ULNI（《Understanding Linux Network Internals》，大约是2.6.12时期）的描述，更新了帧的接收（NAPI和netif_rx）相关内容。中心思想并没有太多变化，只是将有些NAPI相关字段从net_device中挪到了单独的napi_struct，启用了新的接口名（napi_xxx）。不过这也许会引起代码阅读的困难（至少对于我这样水平有限的人而言），和相关驱动代码的修改。故写此Note作为补充。既然是补充，有些未变的部分就不再赘述，还请参考ULNI和Source Code。

1. 数据结构

现在的Per-CPU变量softnet_data结构定义如下。暂时忽略RPS（Receive Packet Steering）和netpoll部分。RPS可以将接收到的包分发到不同的CPU队列并发处理，从而提高PPS，具体可见http://lwn.net/Articles/328339/。NETPOLL则提供了在紧急情况下通过轮询（polling）硬件的方式使用网络设备的能力，可以用它实现netconsole、kgdb-over-ethernet等。

/*
 * Per-CPU softnet state used by the receive path described in this note.
 * The RPS-specific members are elided in this excerpt.
 */
struct softnet_data {
    struct Qdisc        *output_queue;
    struct Qdisc        **output_queue_tailp;
    struct list_head    poll_list;           // ingress poll list; each "device" on it is represented by its napi_struct
    struct sk_buff      *completion_queue;
    struct sk_buff_head process_queue;   // legacy backlog queue (used by process_backlog, the poll routine behind netif_rx);
                                         // skbs are moved here from input_pkt_queue before being received

    /* stats */
    unsigned int        processed;
    unsigned int        time_squeeze;
    unsigned int        cpu_collision;
    unsigned int        received_rps;

#ifdef CONFIG_RPS
    ... ...
#endif

    unsigned int        dropped;
    struct sk_buff_head input_pkt_queue;  // non-NAPI interfaces (those calling netif_rx) put skbs directly on this queue
    struct napi_struct  backlog;          // stand-in napi_struct that is put on poll_list on behalf of non-NAPI devices
};

新内核抽象了另一个之前没有的NAPI结构，定义如下，

/*
 * NAPI context. It carries the NAPI-related fields (poll, poll_list, ...)
 * that used to live in net_device. The netpoll-specific members are
 * elided in this excerpt.
 */
struct napi_struct {
    /* The poll_list must only be managed by the entity which
     * changes the state of the NAPI_STATE_SCHED bit.  This means
     * whoever atomically sets that bit can add this napi_struct
     * to the per-cpu poll_list, and whoever clears that bit
     * can remove from the list right before clearing the bit.
     */
    struct list_head    poll_list;  // list node linking this napi_struct into sd->poll_list
    unsigned long       state;
    int         weight;
    unsigned int        gro_count;
    int         (*poll)(struct napi_struct *, int); // callback used to poll the device and read frames
#ifdef CONFIG_NETPOLL
    ... ...
#endif
    struct net_device   *dev;
    struct sk_buff      *gro_list;
    struct sk_buff      *skb;
    struct list_head    dev_list;
};

softnet_data（简称sd）的poll_list是输入“设备”列表，“设备”用相关的napi_struct标识。对应的，napi_struct的poll_list字段，作为sd->poll_list的元素。可见ULNI提到的net_device结构中和NAPI相关的字段poll（函数指针）、poll_list被去除了，而放到了napi_struct结构中。可见将NAPI概念（和其他与device无关的概念）从net_device中剥离是一个趋势。之前排入sd->poll_list的是net_device，而现在自然也变成了napi_struct。而原来quota，weight这些用于netif_rx流控的字段，直接随着netif_rx的流控部分的删除而删除了。

2. 软、硬件中断处理

2.1 NAPI-aware设备

do_IRQ硬件中断，调用驱动之前注册的中断处理函数。以e100为例，就是e100_intr。e100在ULNI的时期使用的是netif_rx，现在则使用NAPI。所有支持中断共享的设备需要通过其他方式（通常是寄存器值）检查中断是否是自己的。如果是的话继续：先用napi_schedule_prep查看napi能否调度（是否被disable或已经被调度），可以的话，关闭设备中断，然后使用__napi_schedule调度NAPI。以下代码取自drivers/net/ethernet/intel/e100.c。

/*
 * e100 interrupt handler (excerpt; elided parts are marked "... ...").
 * Returns IRQ_NONE when the interrupt is not ours or the hardware is
 * gone; otherwise tries to schedule NAPI polling and returns IRQ_HANDLED.
 */
static irqreturn_t e100_intr(int irq, void *dev_id)
{
    ... ...

    if (stat_ack == stat_ack_not_ours ||    /* Not our interrupt */
       stat_ack == stat_ack_not_present)    /* Hardware is ejected */
        return IRQ_NONE;
    ... ...

    /* Only when NAPI can be scheduled: disable the device interrupt first,
     * then queue nic->napi for polling.
     */
    if (likely(napi_schedule_prep(&nic->napi))) {
        e100_disable_irq(nic);
        __napi_schedule(&nic->napi);  
    } 

    return IRQ_HANDLED;
}

__napi_schedule负责找出对应CPU的softnet_data，然后调用____napi_schedule。后者将接收设备（对应的napi->poll_list）加到sd的poll_list中，然后调度softirq。

/*
 * Schedule n for polling on the current CPU: look up this CPU's
 * softnet_data and hand both to ____napi_schedule(), with local
 * interrupts disabled for the duration of the call.
 */
void __napi_schedule(struct napi_struct *n)
{
    unsigned long flags;

    local_irq_save(flags);
    ____napi_schedule(&__get_cpu_var(softnet_data), n);
    local_irq_restore(flags);
}

/*
 * Append napi to the tail of sd->poll_list and raise NET_RX_SOFTIRQ.
 * Uses the _irqoff softirq variant; both callers shown in this note run
 * it with interrupts already disabled or under the queue lock.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
                     struct napi_struct *napi)
{
    list_add_tail(&napi->poll_list, &sd->poll_list);
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

当软件中断执行的时候，即函数do_softirq执行的时候（它在很多地方被调用，如do_IRQ、syscall返回、local_bh_enable，或者ksoftirqd）。由于注册了net_rx_action为NET_RX_SOFTIRQ的Handler，net_rx_action会查询sd的poll_list中每个设备（不一定只是这次注册的设备），调用设备对应的poll函数。和旧的NAPI相比，原先会使用的netif_rx_schedule或__netif_rx_schedule转而使用napi_xxx函数（虽然它们的原理没有变化）。例如，原本把设备（相关napi_struct）从轮询列表poll_list移除的函数是netif_rx_complete或__netif_rx_complete。现在则是对应的napi_complete和__napi_complete。

注：过去的说法是“把设备net_device”放入poll_list，因为原来poll及相关字段都是在net_device里的，现在把它们挪到napi_struct后，其实就是把dev相关的napi_struct（而非net_device）放入poll_list。这里沿用过去的说法“把"dev"放入轮询列表（poll_list）”，但要理解放入的是napi_struct。

2.2 使用netif_rx（旧backlog队列）的设备

单从netif_rx的使用角度看，和ULNI时期没有变化。先复习一下。netif_rx是被广泛使用的旧函数，所以还是要看一下，以drivers/net/ethernet/3com/3c59x.c为例。我们只关注接收部分，而vortex_interrupt会调用vortex_rx来处理帧的接收。分配skb，使用RX寄存器或者DMA读取数据到skb后，调用netif_rx。

/*
 * 3c59x receive routine (excerpt; elided parts are marked "... ...").
 * While RxStatus reads back positive, allocates an skb per good packet,
 * fills it, and hands it to the stack through netif_rx().
 */
static int vortex_rx(struct net_device *dev)
{
    ... ...
    while ((rx_status = ioread16(ioaddr + RxStatus)) > 0) {
        if (rx_status & 0x4000) { /* Error, update stats. */
            ... ...
        } else {
            /* The packet length: up to 4.5K!. */
            int pkt_len = rx_status & 0x1fff;
            struct sk_buff *skb;          

            skb = netdev_alloc_skb(dev, pkt_len + 5);
            ... ...       
            if (skb != NULL) {
                skb_reserve(skb, 2);    /* Align IP on 16 byte boundaries */
                /* 'skb_put()' points to the start of sk_buff data area. */
                if (vp->bus_master &&         
                    ... ... // read the packet via DMA
                } else {
                    ... ... // read the packet directly from the registers
                }
                iowrite16(RxDiscard, ioaddr + EL3_CMD); /* Pop top Rx packet. */
                skb->protocol = eth_type_trans(skb, dev);
                /* Non-NAPI path: queue the skb on the per-CPU backlog. */
                netif_rx(skb);
                dev->stats.rx_packets++;
                /* Wait a limited time to go to next packet. */
                ... ... // busy-wait
                continue;
            } else if (vortex_debug > 0)
            ... ...
        }
        ... ...
    }

    return 0;
}

自从有了NAPI之后的某个时刻，netif_rx也会利用NAPI的infrastructure。这点在ULNI里面就已经有描述。更旧（2.4版本）的netif_rx未利用NAPI，知道有这件事就行，就不过多讨论了。这里的netif_rx都是使用了NAPI框架的netif_rx。

netif_rx的实现也根据新的NAPI接口稍作修改（由netif_rx_xxx转用napi_xxx）。再次强调：NAPI的接口接收数据的方式是，在中断处理阶段把自己相关的napi_struct排入轮询列表sd->poll_list，在BH阶段由net_rx_action调用napi->poll轮询数据帧。而使用netif_rx的接口接收数据的方式是直接把skb放入sd->input_pkt_queue。

enqueue_to_backlog，首先取出sd。接下来的处理流程如下：如果输入队列sd->input_pkt_queue的长度超过了系统配置的上限（netdev_max_backlog），将skb丢弃。如果队列未满，那么，如果队列不为空，则简单地将skb放到队列尾端。

如果队列为空，则“将skb放入队列”前，要先把“设备（对应的napi_struct）”放到sd->poll_list中，不过NAPI-unaware的设备根本没有这个结构，因此实际上是用“sd->backlog”（也是napi_struct）作为代替，放入sd->poll_list，再调度软中断（即____napi_schedule）。将sd->backlog放入sd->poll_list的好处是net_rx_action可以像对待NAPI设备一样对待Non-NAPI设备，只不过使用的poll不是设备提供的poll，而是为所有Non-NAPI设备准备的sd->backlog的poll（专门为Non-NAPI设备准备的轮询虚拟函数：process_backlog）。

最后再把skb放入sd->input_pkt_queue（goto enqueue）。

/*
 * Entry point for non-NAPI drivers (excerpt; elided parts are marked
 * "... ..."). The part shown queues skb on the current CPU's backlog
 * via enqueue_to_backlog().
 */
int netif_rx(struct sk_buff *skb)
{
    ... ...
        unsigned int qtail;
        /* get_cpu()/put_cpu() bracket the enqueue on the current CPU. */
        ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
        put_cpu();
    ... ...
}

/*
 * Queue skb on the input_pkt_queue of the given cpu's softnet_data
 * (excerpt; elided parts are marked "... ...").
 *
 * Returns NET_RX_SUCCESS when the skb was queued, NET_RX_DROP when the
 * queue length already exceeds netdev_max_backlog.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
                  unsigned int *qtail)
{  
    struct softnet_data *sd;
    unsigned long flags;

    sd = &per_cpu(softnet_data, cpu);

    ... ...
    if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
        /* Non-empty queue: just append the skb. */
        if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
            __skb_queue_tail(&sd->input_pkt_queue, skb);
            ... ...
            return NET_RX_SUCCESS;
        }

        /* Schedule NAPI for backlog device
         * We can use non atomic operation since we own the queue lock
         */
        /* Empty queue: put sd->backlog on the poll list (unless it is
         * already marked scheduled) before queueing the skb.
         */
        if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
            if (!rps_ipi_queued(sd))
                ____napi_schedule(sd, &sd->backlog);
        }
        goto enqueue;
    }

    ... ...
    return NET_RX_DROP;
}

有没有发现新版的netif_rx比原来“清爽”不少？主要是去掉了netif_rx的拥塞管理（CPU窒息状态处理）的部分。其原因（未加求证）貌似是“吃力而不讨好”，

commit 34008d8c631d067caffa136313260525f3ae48a2
Author: Stephen Hemminger <shemminger@osdl.org>
Date:   Thu Jun 23 20:10:00 2005 -0700

    [NET]: Remove obsolete netif_rx congestion sensing mechanism.
   
    Remove the congestion sensing mechanism from netif_rx, and always
    return either full or empty.  Almost no driver checks the return value
    from netif_rx, and those that do only use it for debug messages.
   
    The original design of netif_rx was to do flow control based on the
    receive queue, but NAPI has supplanted this and no driver uses the
    feedback.

2.3 net_rx_action

net_rx_action本身倒是变化不大，只是本来从net_device中读取的字段改由napi_struct读取。因为之前入队的也不再是net_device结构，而是napi_struct结构。并调用修改后的NAPI接口。

根据NIC驱动使用netif_rx还是NAPI，帧可以在两处等待net_rx_action处理：共享CPU的专用队列sd->input_pkt_queue（netif_rx），设备内存（Driver提供poll）。不过因为netif_rx使用了sd->backlog代替dev（对应的napi）放入sd->poll_list队列，并且提供了虚拟轮询函数process_backlog，所以在net_rx_action看来，处理上没有区别。net_rx_action执行的时候，会尝试遍历poll_list中所有的设备，尽力一次性多收取一些帧。同时根据全局的budget、持续执行时间和每个设备的工作量，进行有节制的收取。具体内容参考ULNI。

2.3.1 process_backlog

process_backlog（netif_rx）的实现相对原来有所变化。process_backlog不再从sd->input_pkt_queue直接读取skb，而是从sd->process_queue中读取。工作量work和配额quota的比较有两处，一处位于外部的while循环，另一处在每次真正从队列读取skb之后。这是因为除了从sd->process_queue收取之外，skb还有一个转移过程。如果sd->process_queue中没有数据，需要从input_pkt_queue中把skb转移过来，转移工作是一次性整队列的，而不是按单独skb进行。转移时，如果转移过来的skb数量（qlen）小于剩余配额（quota - work），说明下一轮__skb_dequeue就能把数据全部收完，就可以把设备从poll_list移除。而“转移”过程其实也是一个统计工作量的过程：如果没有太多的数据可以消费，就把quota缩小为work + qlen，这样在下一轮dequeue完成后，马上退出。

/*
 * Poll routine for the per-CPU backlog napi_struct, i.e. for skbs queued
 * by netif_rx() (excerpt; elided parts are marked "... ...").
 *
 * Delivers skbs from sd->process_queue, refilling it from
 * sd->input_pkt_queue when it runs dry, until quota is reached or no
 * more work is expected. Returns the number of skbs processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
    int work = 0;
    struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

    ... ...

    napi->weight = weight_p;
    local_irq_disable();
    while (work < quota) {     // keep receiving while the work done is below the quota
        struct sk_buff *skb;
        unsigned int qlen;

        while ((skb = __skb_dequeue(&sd->process_queue))) { // read from process_queue, not from input_pkt_queue
            /* Interrupts are re-enabled only around the delivery of the skb. */
            local_irq_enable();
            __netif_receive_skb(skb);
            local_irq_disable();
            input_queue_head_incr(sd);
            if (++work >= quota) { // quota reached: stop receiving
                local_irq_enable();
                return work;
            }   
        }   

        rps_lock(sd);
        qlen = skb_queue_len(&sd->input_pkt_queue);
        if (qlen) // move all skbs from input_pkt_queue to process_queue in one splice
            skb_queue_splice_tail_init(&sd->input_pkt_queue,
                           &sd->process_queue);

        if (qlen < quota - work) { // fewer skbs were moved than the quota left, so the next dequeue pass drains them all; take the backlog off poll_list now
            /*  
             * Inline a custom version of __napi_complete().
             * only current cpu owns and manipulates this napi,
             * and NAPI_STATE_SCHED is the only possible flag set on backlog.
             * we can use a plain write instead of clear_bit(),
             * and we dont need an smp_mb() memory barrier.
             */
            list_del(&napi->poll_list);
            napi->state = 0;

            quota = work + qlen; // shrink quota to exactly what the next __skb_dequeue pass will complete, so the loop exits right after it
        }   
        rps_unlock(sd);
    }
    local_irq_enable();

    return work;
}

不管是使用NAPI-aware的NIC所提供的Poll，还是使用netif_rx（旧backlog）对应的处理函数process_backlog，最终都会调用__netif_receive_skb或其包裹函数netif_receive_skb收取数据。netif_receive_skb只是处理了一下时间戳，然后交由__netif_receive_skb继续处理。

posted @ 2012-12-09 12:42 beacer 阅读(924) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

beacer