邻居子系统1.5 neigh output

1.5.1

当邻居项不处于NUD_CONNECTED状态时,不允许使用快速路径发送报文。函数neigh_resolve_output 用于慢而安全的输出,通常在初始化neigh_ops结构时

用它来实例化output函数。当邻居项从NUD_CONNECTED转到非NUD_CONNECTED状态时,使用neigh_suspect 将output设置为neigh->ops->output,即neigh_resolve_output()

/* Neighbour state is suspicious;
   disable fast path.

   Called with write_locked neigh.
 */
static void neigh_suspect(struct neighbour *neigh)
{
    NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);

    neigh->output = neigh->ops->output;
}

/* Neighbour state is OK;
   enable fast path.

   Called with write_locked neigh.
 */
static void neigh_connect(struct neighbour *neigh)
{
    NEIGH_PRINTK2("neigh %p is connected.\n", neigh);

    neigh->output = neigh->ops->connected_output;
}
/*
 * Generic neighbour operations for AF_INET: neigh_resolve_output is the
 * default (state-checking) output handler and neigh_connected_output is
 * the handler installed while the entry is connected.
 */
static const struct neigh_ops arp_generic_ops = {
    .family           = AF_INET,
    /* Output handlers selected by neigh_suspect() / neigh_connect(). */
    .output           = neigh_resolve_output,
    .connected_output = neigh_connected_output,
    /* Resolution and failure callbacks. */
    .solicit          = arp_solicit,
    .error_report     = arp_error_report,
};

neigh_resolve_output:大概含义为:先检查邻居项状态;若邻居项的输出设备支持二层首部缓存(header_ops->cache)且该邻居项的二层首部缓存尚未建立(hh.hh_len为0),

则先为该邻居项建立硬件首部缓存;随后通过dev_hard_header在报文首部添加二层硬件首部,成功后调用dev_queue_xmit输出报文

/*
 * Slow and careful output path.
 *
 * Runs the neighbour state check first; only when neigh_event_send()
 * reports that the packet may go out now does it build the link-layer
 * header and hand the skb to dev_queue_xmit().
 *
 * Returns 0 when neigh_event_send() took over the skb (non-zero result),
 * the dev_queue_xmit() result when the header was built, and -EINVAL
 * (after freeing the skb) when there is no dst or the header build fails.
 */
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
    struct dst_entry *dst = skb_dst(skb);
    struct net_device *dev;
    unsigned int seq;
    int err;

    if (!dst) {
        NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n",
                  dst, neigh);
        goto drop;
    }

    /* Validate the neighbour state; non-zero means "do not send now". */
    if (neigh_event_send(neigh, skb))
        return 0;

    dev = neigh->dev;

    /* Device can cache L2 headers and none is cached yet: set it up. */
    if (dev->header_ops->cache && !neigh->hh.hh_len)
        neigh_hh_init(neigh, dst);

    /*
     * Build the L2 header; retry if neigh->ha changed underneath us
     * (ha_lock is a seqlock read side).
     */
    do {
        __skb_pull(skb, skb_network_offset(skb));
        seq = read_seqbegin(&neigh->ha_lock);
        err = dev_hard_header(skb, dev, ntohs(skb->protocol),
                      neigh->ha, NULL, skb->len);
    } while (read_seqretry(&neigh->ha_lock, seq));

    /* Header in place: pass the packet to the device queue. */
    if (err >= 0)
        return dev_queue_xmit(skb);

drop:
    kfree_skb(skb);
    return -EINVAL;
}

 

/*
 * Record that the neighbour entry was just used and decide whether the
 * packet may be sent right away.
 *
 * Returns 0 when the entry is in one of NUD_CONNECTED, NUD_DELAY or
 * NUD_PROBE; otherwise returns whatever __neigh_event_send() reports.
 */
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
    const unsigned long stamp = jiffies;

    /* Only write the timestamp when it actually changes. */
    if (neigh->used != stamp)
        neigh->used = stamp;

    if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
        return 0;

    return __neigh_event_send(neigh, skb);
}
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
    int rc;
    bool immediate_probe = false;

    write_lock_bh(&neigh->lock);

    rc = 0;
    if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
        goto out_unlock_bh;
    /*
        去掉NUD_CONNECT NUD_DELAY NUD_PROBE  状态
        那么就只剩下 NUD_STALE NUD_INCOMPLETE NUD_NONE NUD_FAILD
    */
    if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { //NUD_NONE状态
        if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
            //如果允许发送广播请求或者应用程序发送请求解析neigh地址
            unsigned long next, now = jiffies;

            atomic_set(&neigh->probes, neigh->parms->ucast_probes);
            neigh->nud_state     = NUD_INCOMPLETE;
            neigh->updated = now;
            next = now + max(neigh->parms->retrans_time, HZ/2);
            neigh_add_timer(neigh, next); //启动定时器
            immediate_probe = true;  //发送arp 请求(ipv4) 请求邻居表项
        } else {
            neigh->nud_state = NUD_FAILED;//邻居无效 不能输出
            neigh->updated = jiffies;
            write_unlock_bh(&neigh->lock);

            kfree_skb(skb);
            return 1;
        }
    } else if (neigh->nud_state & NUD_STALE) {
        NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
        neigh->nud_state = NUD_DELAY;//转变为delay 状态
        neigh->updated = jiffies;
        neigh_add_timer(neigh,
                jiffies + neigh->parms->delay_probe_time);
    }

    if (neigh->nud_state == NUD_INCOMPLETE) {//说明之前有报文发送
        if (skb) {
            while (neigh->arp_queue_len_bytes + skb->truesize >
                   neigh->parms->queue_len_bytes) {//如果请求报文已经满了,但还没有收到应答。
                struct sk_buff *buff;//如果缓存队列还没有达到上限,则将报文加入到输出缓存队列中
                                //否者 丢弃队列中最早加入的报文然后加入队列
                                //但是返回值都是1 即 不能立即发送
                buff = __skb_dequeue(&neigh->arp_queue);
                if (!buff)
                    break;
                neigh->arp_queue_len_bytes -= buff->truesize;
                kfree_skb(buff);
                NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
            }
            skb_dst_force(skb);
            __skb_queue_tail(&neigh->arp_queue, skb);
            neigh->arp_queue_len_bytes += skb->truesize;
        }
        rc = 1;
    }
out_unlock_bh:
    if (immediate_probe)
        neigh_probe(neigh); 发出邻居项请求 solict报文 (arp请求等)
    else
        write_unlock(&neigh->lock);
    local_bh_enable();
    return rc;
}

 

neigh_hh_init :缓存二层头,以eth为例:就是缓存二层mac

/**
 * ether_setup - setup Ethernet network device
 * @dev: network device
 *
 * Populate @dev with the Ethernet-generic defaults: header operations,
 * device type, header/address lengths, MTU, queue length, flags and the
 * all-ones broadcast address.
 */
void ether_setup(struct net_device *dev)
{
    /* Link-layer type and header handling. */
    dev->type            = ARPHRD_ETHER;
    dev->header_ops      = &eth_header_ops;
    dev->hard_header_len = ETH_HLEN;

    /* Addressing: ETH_ALEN-byte addresses, broadcast is all 0xFF. */
    dev->addr_len        = ETH_ALEN;
    memset(dev->broadcast, 0xFF, ETH_ALEN);

    /* Transmit sizing. */
    dev->mtu             = ETH_DATA_LEN;
    dev->tx_queue_len    = 1000;    /* Ethernet wants good queues */

    /* Interface flags; priv_flags keeps whatever bits were already set. */
    dev->flags           = IFF_BROADCAST | IFF_MULTICAST;
    dev->priv_flags     |= IFF_TX_SKB_SHARING;
}
/*
 * Ethernet link-layer header operations, installed on a device by
 * ether_setup(). The .cache hook is what neigh_resolve_output() tests
 * for before calling neigh_hh_init().
 */
const struct header_ops eth_header_ops ____cacheline_aligned = {
    /* Building and parsing a header on a packet. */
    .create       = eth_header,
    .parse        = eth_header_parse,
    .rebuild      = eth_rebuild_header,
    /* Header-cache maintenance. */
    .cache        = eth_header_cache,
    .cache_update = eth_header_cache_update,
};

/**
 * eth_header_cache - fill cache entry from neighbour
 * @neigh: source neighbour
 * @hh: destination cache entry
 * @type: Ethernet type field
 *
 * Write an Ethernet header template into @hh: protocol @type, the
 * device address as source and the neighbour's hardware address as
 * destination.
 *
 * Return: 0 on success, -1 if @type is ETH_P_802_3 (no template is
 * written in that case).
 */
int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type)
{
    const struct net_device *dev = neigh->dev;
    struct ethhdr *hdr;
    u8 *slot;

    if (type == htons(ETH_P_802_3))
        return -1;

    /* The template lives HH_DATA_OFF(header size) bytes into hh_data. */
    slot = (u8 *)hh->hh_data + HH_DATA_OFF(sizeof(struct ethhdr));
    hdr = (struct ethhdr *)slot;

    hdr->h_proto = type;
    memcpy(hdr->h_source, dev->dev_addr, ETH_ALEN);
    memcpy(hdr->h_dest, neigh->ha, ETH_ALEN);

    /* A non-zero hh_len marks the cache entry as initialised. */
    hh->hh_len = ETH_HLEN;
    return 0;
}

/*
 * Initialise the neighbour's cached link-layer header through the
 * output device's header_ops->cache hook.
 *
 * Takes n->lock for writing itself.
 * NOTE(review): the previous header comment said "called with
 * read_lock_bh(&n->lock)", which does not match the write lock taken
 * below - confirm the intended locking contract.
 */
static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst)
{
    struct hh_cache *cache = &n->hh;
    struct net_device *out_dev = dst->dev;
    __be16 l3_proto = dst->ops->protocol;

    write_lock_bh(&n->lock);

    /*
     * Re-test under the lock so that only the first thread to get here
     * fills in the hh_cache entry.
     */
    if (cache->hh_len == 0)
        out_dev->header_ops->cache(n, cache, l3_proto);

    write_unlock_bh(&n->lock);
}
//根据代码可以看出  直接拷贝二层头
View Code

 其实在创建网卡接口ethX的时候就已经设置好了这些回调。以ixgbe驱动为例:ixgbe驱动加载后match到设备info,执行ixgbe_probe,其中会创建net_device:

netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter), indices);

 

其最后会调用ether_setup,设置dev->header_ops = &eth_header_ops 等net_device的hook函数;此外驱动还会设置ethtool_ops回调接口的实现;

最后设置 netdev->netdev_ops = &ixgbe_netdev_ops; 设置 报文发送的驱动函数接口实现 ,比如ndo_start_xmit 的实现

快速发送:

/*
 * Output a packet through neighbour @n (per the original note, this is
 * what ip_finish_output2 calls to emit a packet).
 *
 * Uses the cached-header fast path when the neighbour is in a
 * NUD_CONNECTED state and a header is cached; otherwise defers to the
 * neighbour's current ->output handler (e.g. neigh_resolve_output).
 */
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
                   struct sk_buff *skb)
{
    struct hh_cache *cached;

    /* A confirmation is pending on the dst: apply it to the neighbour. */
    if (unlikely(dst->pending_confirm)) {
        n->confirmed = jiffies;
        dst->pending_confirm = 0;
    }

    cached = &n->hh;

    /* Slow path: not connected, or no cached header. */
    if (!(n->nud_state & NUD_CONNECTED) || !cached->hh_len)
        return n->output(n, skb);

    /* Fast path: copy the cached header and transmit. */
    return neigh_hh_output(cached, skb);
}
/*
 * Fast-path output: prepend the cached link-layer header to the skb and
 * transmit it.
 *
 * NOTE(review): nothing in this function checks that the skb has
 * HH_DATA_MOD (or the aligned header length) bytes of headroom before
 * the memcpy - confirm that callers guarantee it.
 */
static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
{
    unsigned int hdr_len;
    unsigned int seq;

    /* Copy the cached header in front of skb->data; retry on a racing update. */
    do {
        seq = read_seqbegin(&hh->hh_lock);
        hdr_len = hh->hh_len;

        if (likely(hdr_len <= HH_DATA_MOD)) {
            /*
             * Header fits in HH_DATA_MOD bytes: copy a fixed
             * HH_DATA_MOD-sized chunk (this is inlined by gcc).
             */
            memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD);
        } else {
            /* Longer than HH_DATA_MOD: copy the aligned header length. */
            unsigned int aligned_len = HH_DATA_ALIGN(hdr_len);

            memcpy(skb->data - aligned_len, hh->hh_data, aligned_len);
        }
    } while (read_seqretry(&hh->hh_lock, seq));

    /* Expose exactly hdr_len bytes of the copied data as the header. */
    skb_push(skb, hdr_len);

    /* Transmit. */
    return dev_queue_xmit(skb);
}

 

neigh_hh_output-缓存输出,直接拷贝二层头部,然后输出;

neigh_connected_output-快速输出,用于连接状态的输出;需要重新构建二层头部,然后输出;

neigh_resolve_output-慢速输出,用于非连接状态的输出;需要对邻居项状态进行检查,然后重新构造二层头部,最后输出;

neigh_direct_output-直接输出,用于没有二层头部时的输出;

/*
 * Output handler for the connected state. Slower than neigh_hh_output()
 * because the link-layer header is rebuilt for every packet instead of
 * being copied from the cache.
 *
 * Returns the dev_queue_xmit() result, or -EINVAL (after freeing the
 * skb) when the header could not be built.
 */
int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)
{
    struct net_device *dev = neigh->dev;
    unsigned int seq;
    int hdr_rc;

    /* Build the L2 header; retry if neigh->ha changed while reading it. */
    do {
        __skb_pull(skb, skb_network_offset(skb));
        seq = read_seqbegin(&neigh->ha_lock);
        hdr_rc = dev_hard_header(skb, dev, ntohs(skb->protocol),
                     neigh->ha, NULL, skb->len);
    } while (read_seqretry(&neigh->ha_lock, seq));

    if (hdr_rc < 0) {
        kfree_skb(skb);
        return -EINVAL;
    }

    /* Send the packet. */
    return dev_queue_xmit(skb);
}

 

posted @ 2019-11-12 20:12  codestacklinuxer  阅读(805)  评论(0)    收藏  举报