dpdk l2fwd

之前在helloworld中主要分析了hugepage的使用,这回在l2fwd中主要分析一下uio和PMD的实现

 

main函数中首先调用了rte_eal_init初始化eal环境,其中主要是hugepage的初始化;

/* Initialise the EAL first; a negative return is treated as fatal. */
ret = rte_eal_init(argc, argv);
if (ret < 0) {
    rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
}

 

 

接着创建了mbuf pool

/*
 * Create the mbuf pool used by l2fwd, passing rte_socket_id() as the socket
 * argument, and abort the application when the pool cannot be created.
 */
l2fwd_pktmbuf_pool = rte_mempool_create("mbuf_pool", NB_MBUF, MBUF_SIZE, 32,
                                        sizeof(struct rte_pktmbuf_pool_private),
                                        rte_pktmbuf_pool_init, NULL,
                                        rte_pktmbuf_init, NULL,
                                        rte_socket_id(), 0);
if (l2fwd_pktmbuf_pool == NULL) {
    rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
}

 

 

然后是PMD驱动的注册和PCI设备驱动加载

/* Register the compiled-in poll-mode drivers; failure is fatal. */
if (rte_pmd_init_all() < 0) {
    rte_exit(EXIT_FAILURE, "Cannot init pmd\n");
}

/* Probe the PCI devices against the registered drivers; failure is fatal. */
if (rte_eal_pci_probe() < 0) {
    rte_exit(EXIT_FAILURE, "Cannot probe PCI\n");
}

 

首先是PMD驱动的注册。目前DPDK支持的PMD驱动有igb、igbvf、em、ixgbe、ixgbevf、virtio和vmxnet3;不过这些驱动具体是什么还不清楚,后面以虚拟机环境中使用的em驱动为例进行分析;

/**
 * Register every poll-mode driver that was enabled at build time.
 *
 * Each RTE_LIBRTE_*_PMD section below is compiled in only when the matching
 * macro is defined. The first init routine that returns non-zero stops the
 * sequence: an error is logged and that value is handed back to the caller.
 *
 * @return
 *   0 when every compiled-in driver registered, the failing init routine's
 *   non-zero value otherwise, or -ENODEV when no driver section was
 *   compiled in at all.
 */
static inline
int rte_pmd_init_all(void)
{
    int ret = -ENODEV;

#ifdef RTE_LIBRTE_IGB_PMD
    ret = rte_igb_pmd_init();
    if (ret != 0) {
        RTE_LOG(ERR, PMD, "Cannot init igb PMD\n");
        return ret;
    }
    ret = rte_igbvf_pmd_init();
    if (ret != 0) {
        RTE_LOG(ERR, PMD, "Cannot init igbvf PMD\n");
        return ret;
    }
#endif /* RTE_LIBRTE_IGB_PMD */

#ifdef RTE_LIBRTE_EM_PMD
    ret = rte_em_pmd_init();
    if (ret != 0) {
        RTE_LOG(ERR, PMD, "Cannot init em PMD\n");
        return ret;
    }
#endif /* RTE_LIBRTE_EM_PMD */

#ifdef RTE_LIBRTE_IXGBE_PMD
    ret = rte_ixgbe_pmd_init();
    if (ret != 0) {
        RTE_LOG(ERR, PMD, "Cannot init ixgbe PMD\n");
        return ret;
    }
    ret = rte_ixgbevf_pmd_init();
    if (ret != 0) {
        RTE_LOG(ERR, PMD, "Cannot init ixgbevf PMD\n");
        return ret;
    }
#endif /* RTE_LIBRTE_IXGBE_PMD */

#ifdef RTE_LIBRTE_VIRTIO_PMD
    ret = rte_virtio_pmd_init();
    if (ret != 0) {
        RTE_LOG(ERR, PMD, "Cannot init virtio PMD\n");
        return ret;
    }
#endif /* RTE_LIBRTE_VIRTIO_PMD */

#ifdef RTE_LIBRTE_VMXNET3_PMD
    ret = rte_vmxnet3_pmd_init();
    if (ret != 0) {
        RTE_LOG(ERR, PMD, "Cannot init vmxnet3 PMD\n");
        return ret;
    }
#endif /* RTE_LIBRTE_VMXNET3_PMD */

    /* ret still holds its initial value only if nothing above was built */
    if (ret == -ENODEV) {
        RTE_LOG(ERR, PMD, "No PMD(s) are configured\n");
    }
    return ret;
}

 

注册EM驱动

/*
 * Register the em poll-mode driver by handing the rte_em_pmd driver
 * structure to rte_eth_driver_register(). Always returns 0.
 */
int
rte_em_pmd_init(void)
{
    rte_eth_driver_register(&rte_em_pmd);
    return 0;
}

/**
 * Register an Ethernet [Poll Mode] driver.
 *
 * Function invoked by the initialization function of an Ethernet driver
 * to simultaneously register itself as a PCI driver and as an Ethernet
 * Poll Mode Driver.
 * Invokes the rte_eal_pci_register() function to register the *pci_drv*
 * structure embedded in the *eth_drv* structure, after having stored the
 * address of the rte_eth_dev_init() function in the *devinit* field of
 * the *pci_drv* structure.
 * During the PCI probing phase, the rte_eth_dev_init() function is
 * invoked for each PCI [Ethernet device] matching the embedded PCI
 * identifiers provided by the driver.
 */
void
rte_eth_driver_register(struct eth_driver *eth_drv)
{
    eth_drv->pci_drv.devinit = rte_eth_dev_init;
    rte_eal_pci_register(&eth_drv->pci_drv);
}

/*
 * Register a PCI driver: append it to the tail of the global driver_list,
 * which is the list walked when devices are probed.
 */
void
rte_eal_pci_register(struct rte_pci_driver *driver)
{
    TAILQ_INSERT_TAIL(&driver_list, driver, next);
}

 

这里PMD驱动结构包含了PMD驱动部分和PCI驱动部分

/**
 * @internal
 * The structure associated with a PMD Ethernet driver.
 *
 * Each Ethernet driver acts as a PCI driver and is represented by a generic
 * *eth_driver* structure that holds:
 *
 * - An *rte_pci_driver* structure (which must be the first field, because
 *   rte_eth_dev_init() casts the rte_pci_driver pointer it receives back to
 *   an eth_driver pointer).
 *
 * - The *eth_dev_init* function invoked for each matching PCI device.
 *
 * - The size of the private data to allocate for each matching device.
 */
struct eth_driver {
    struct rte_pci_driver pci_drv;    /**< The PMD is also a PCI driver. */
    eth_dev_init_t eth_dev_init;      /**< Device init function. */
    unsigned int dev_private_size;    /**< Size of device private data. */
};

 

接下来遍历所有PCI设备:如果不存在白名单,则对每个device依次尝试所有已注册的驱动,加载失败也不报错;如果存在白名单,则只处理白名单中的device,且加载驱动失败时直接退出;

/*
 * Scan the content of the PCI bus, and call the devinit() function for
 * all registered drivers that have a matching entry in its id_table
 * for discovered devices.
 */
int
rte_eal_pci_probe(void)
{
    struct rte_pci_device *dev = NULL;

    TAILQ_FOREACH(dev, &device_list, next)
        if (!eal_dev_whitelist_exists())
            pci_probe_all_drivers(dev);
        else if (pcidev_is_whitelisted(dev) && pci_probe_all_drivers(dev) < 0 )
                rte_exit(EXIT_FAILURE, "Requested device " PCI_PRI_FMT
                        " cannot be used\n", dev->addr.domain,dev->addr.bus,
                        dev->addr.devid, dev->addr.function);

    return 0;
}

对于每个device,依次尝试已注册的driver,直到某个driver匹配并初始化成功;带RTE_PCI_DRV_MULTIPLE标记的驱动会对同一个device重复初始化,直到失败为止,第三方(非Intel)网卡驱动可能需要这一特性;

/*
 * Try each registered driver on the given device, in list order, until one
 * of them matches and initialises it.
 *
 * Returns 0 as soon as a driver accepts the device. Returns -1 when a driver
 * reports an error or when no registered driver matches.
 *
 * Drivers flagged RTE_PCI_DRV_MULTIPLE are probed again and again on the
 * same device until a probe stops succeeding. This is required for
 * non-Intel NIC drivers provided by third-parties such as 6WIND.
 */
static int
pci_probe_all_drivers(struct rte_pci_device *dev)
{
    struct rte_pci_driver *dr = NULL;
    int rc;

    dev->blacklisted = !!is_blacklisted(dev);

    TAILQ_FOREACH(dr, &driver_list, next) {
        rc = rte_eal_pci_probe_one_driver(dr, dev);

        /* negative value is an error: give up on this device */
        if (rc < 0) {
            return -1;
        }

        /* positive value means this driver does not match: try the next */
        if (rc != 0) {
            continue;
        }

        /* initialize subsequent driver instances for this device */
        if (!dev->blacklisted &&
                (dr->drv_flags & RTE_PCI_DRV_MULTIPLE)) {
            while (rte_eal_pci_probe_one_driver(dr, dev) == 0) {
                /* keep probing until it no longer succeeds */
            }
        }
        return 0;
    }

    return -1;
}

 

驱动加载

/*
 * Probe one driver against one device.
 *
 * The driver's id_table is scanned up to its terminating entry
 * (vendor_id == 0). For the first entry whose vendor/device/subsystem ids
 * match the device (PCI_ANY_ID acts as a wildcard), the device resources are
 * prepared and the driver's devinit() function is called.
 *
 * Returns the devinit() result on a match, 0 without initialising when the
 * device is blacklisted, -1 when preparing the device fails, and 1 when no
 * id_table entry matches.
 */
int
rte_eal_pci_probe_one_driver(struct rte_pci_driver *dr, struct rte_pci_device *dev)
{
    struct rte_pci_id *id_table;

    /* the id tables are located in rte_pci_dev_ids.h */
    for (id_table = dr->id_table ; id_table->vendor_id != 0; id_table++) {
        struct rte_pci_addr *loc = &dev->addr;

        /* skip entries whose identifiers do not match the device's */
        if (!(id_table->vendor_id == dev->id.vendor_id ||
                id_table->vendor_id == PCI_ANY_ID)) {
            continue;
        }
        if (!(id_table->device_id == dev->id.device_id ||
                id_table->device_id == PCI_ANY_ID)) {
            continue;
        }
        if (!(id_table->subsystem_vendor_id == dev->id.subsystem_vendor_id ||
                id_table->subsystem_vendor_id == PCI_ANY_ID)) {
            continue;
        }
        if (!(id_table->subsystem_device_id == dev->id.subsystem_device_id ||
                id_table->subsystem_device_id == PCI_ANY_ID)) {
            continue;
        }

        /* from here on the driver matches the device */
        RTE_LOG(DEBUG, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n",
                loc->domain, loc->bus, loc->devid, loc->function,
                dev->numa_node);

        RTE_LOG(DEBUG, EAL, "  probe driver: %x:%x %s\n", dev->id.vendor_id,
                dev->id.device_id, dr->name);

        /* blacklisted devices are never initialised; this is not an error */
        if (dev->blacklisted) {
            RTE_LOG(DEBUG, EAL, "  Device is blacklisted, not initializing\n");
            return 0;
        }

#ifdef RTE_EAL_UNBIND_PORTS
        if (dr->drv_flags & RTE_PCI_DRV_NEED_IGB_UIO) {
            /* unbind driver and load uio resources for Intel NICs */
            if (pci_switch_module(dr, dev, 1, IGB_UIO_NAME) < 0) {
                return -1;
            }
        } else if ((dr->drv_flags & RTE_PCI_DRV_FORCE_UNBIND) &&
                   (rte_eal_process_type() == RTE_PROC_PRIMARY)) {
            /* unbind current driver */
            if (pci_unbind_kernel_driver(dev) < 0) {
                return -1;
            }
        }
#else
        /*
         * Just map resources for Intel NICs: look up the device's UIO
         * mappings (address and size) and map them through /dev/uioX.
         */
        if ((dr->drv_flags & RTE_PCI_DRV_NEED_IGB_UIO) &&
                pci_uio_map_resource(dev) < 0) {
            return -1;
        }
#endif

        /* reference driver structure */
        dev->driver = dr;

        /* hand over to the PCI driver's devinit() function */
        return dr->devinit(dr, dev);
    }

    /* return positive value if driver is not found */
    return 1;
}

 

映射PCI地址空间到用户空间的过程

/*
 * Map the PCI resources (BARs) of a PCI device into virtual memory through
 * the device's UIO node.
 *
 * Secondary processes (except for PCI_VENDOR_ID_QUMRANET devices) reuse the
 * details recorded earlier via pci_uio_map_secondary(). Otherwise the UIO
 * device bound to the PCI device is looked up, its mappings are collected,
 * and every non-empty BAR that has a UIO mapping with the same physical
 * address and size is mapped from /dev/uioX. The collected details are
 * appended to uio_res_list.
 *
 * Returns 0 on success and a negative value on failure. The record
 * allocated for the mapping details is released on every failure path after
 * its allocation, so a failed probe does not leak it.
 * NOTE(review): BARs mapped before a later failure are still left mapped.
 */
static int
pci_uio_map_resource(struct rte_pci_device *dev)
{
    int i, j;
    char dirname[PATH_MAX];
    char filename[PATH_MAX];
    char devname[PATH_MAX]; /* contains the /dev/uioX */
    void *mapaddr;
    int uio_num;
    unsigned long start,size;
    uint64_t phaddr;
    uint64_t offset;
    uint64_t pagesz;
    ssize_t nb_maps;
    struct rte_pci_addr *loc = &dev->addr;
    struct uio_resource *uio_res;
    struct uio_map *maps;

    dev->intr_handle.fd = -1;

    /* only the primary process creates mappings */
    /* secondary processes - use already recorded details */
    if ((rte_eal_process_type() != RTE_PROC_PRIMARY) &&
        (dev->id.vendor_id != PCI_VENDOR_ID_QUMRANET))
        return (pci_uio_map_secondary(dev));

    /*
     * find uio resource: the uio device number associated with this device
     * is found under e.g. /sys/bus/pci/devices/0000:02:01.0/uio/uio0
     */
    uio_num = pci_get_uio_dev(dev, dirname, sizeof(dirname));
    if (uio_num < 0) {
        RTE_LOG(WARNING, EAL, "  "PCI_PRI_FMT" not managed by UIO driver, "
                "skipping\n", loc->domain, loc->bus, loc->devid, loc->function);
        return -1;
    }

    /* port-IO special case for this vendor; not analysed in the article */
    if(dev->id.vendor_id == PCI_VENDOR_ID_QUMRANET) {
        /* get portio size */
        rte_snprintf(filename, sizeof(filename),
             "%s/portio/port0/size", dirname);
        if (eal_parse_sysfs_value(filename, &size) < 0) {
            RTE_LOG(ERR, EAL, "%s(): cannot parse size\n",
                __func__);
            return -1;
        }

        /* get portio start */
        rte_snprintf(filename, sizeof(filename),
             "%s/portio/port0/start", dirname);
        if (eal_parse_sysfs_value(filename, &start) < 0) {
            RTE_LOG(ERR, EAL, "%s(): cannot parse portio start\n",
                __func__);
            return -1;
        }
        dev->mem_resource[0].addr = (void *)(uintptr_t)start;
        dev->mem_resource[0].len =  (uint64_t)size;
        RTE_LOG(DEBUG, EAL, "PCI Port IO found start=0x%lx with size=0x%lx\n", start, size);
        /* rte_virtio_pmd does not need any other bar even if available */
        return (0);
    }

    /* allocate the mapping details for secondary processes*/
    if ((uio_res = rte_zmalloc("UIO_RES", sizeof (*uio_res), 0)) == NULL) {
        RTE_LOG(ERR, EAL,
            "%s(): cannot store uio mmap details\n", __func__);
        return (-1);
    }

    rte_snprintf(devname, sizeof(devname), "/dev/uio%u", uio_num);
    rte_snprintf(uio_res->path, sizeof(uio_res->path), "%s", devname);
    memcpy(&uio_res->pci_addr, &dev->addr, sizeof(uio_res->pci_addr));

    /*
     * collect info about device mappings: every map of the uio device is
     * recorded in uio_res->maps and the number of maps is returned
     */
    if ((nb_maps = pci_uio_get_mappings(dirname, uio_res->maps,
            sizeof (uio_res->maps) / sizeof (uio_res->maps[0])))
            < 0) {
        /* the record is not on uio_res_list yet, so release it here */
        rte_free(uio_res);
        return (nb_maps);
    }

    uio_res->nb_maps = nb_maps;

    /* Map all BARs */
    pagesz = sysconf(_SC_PAGESIZE);

    maps = uio_res->maps;
    for (i = 0; i != PCI_MAX_RESOURCE; i++) {

        /* dev->mem_resource was filled in by rte_eal_init -> rte_eal_pci_init */
        /* It is read from /sys/bus/pci/devices/0000:02:01.0/resource, e.g.
         physical start        physical end          flags (0x200, the tenth
                                                     bit, marks I/O memory)
         0x00000000fd5a0000 0x00000000fd5bffff 0x0000000000140204
        0x0000000000000000 0x0000000000000000 0x0000000000000000
        0x00000000fdff0000 0x00000000fdffffff 0x0000000000140204
        0x0000000000000000 0x0000000000000000 0x0000000000000000
        0x0000000000002000 0x000000000000203f 0x0000000000040101
        0x0000000000000000 0x0000000000000000 0x0000000000000000
        0x00000000e7b00000 0x00000000e7b0ffff 0x000000000004e200
        0x0000000000000000 0x0000000000000000 0x0000000000000000
        0x0000000000000000 0x0000000000000000 0x0000000000000000
        0x0000000000000000 0x0000000000000000 0x0000000000000000
        0x0000000000000000 0x0000000000000000 0x0000000000000000
        0x0000000000000000 0x0000000000000000 0x0000000000000000
        0x0000000000000000 0x0000000000000000 0x0000000000000000
        */
        /* skip empty BAR */
        if ((phaddr = dev->mem_resource[i].phys_addr) == 0)
            continue;

        /* look for the uio map whose address and size match this BAR */
        for (j = 0; j != nb_maps && (phaddr != maps[j].phaddr ||
                dev->mem_resource[i].len != maps[j].size);
                j++)
            ;

        /* if matching map is found, then use it */
        if (j != nb_maps) {
            /* map N of a uio device is selected by an offset of N pages */
            offset = j * pagesz;
            /* a map that is already in use, or a failed mapping, is fatal */
            if (maps[j].addr != NULL ||
                    (mapaddr = pci_map_resource(dev,
                    NULL, devname, (off_t)offset,
                    (size_t)maps[j].size)) == NULL) {
                /* the record is not on uio_res_list yet, so release it here */
                rte_free(uio_res);
                return (-1);
            }

            maps[j].addr = mapaddr;
            maps[j].offset = offset;
            dev->mem_resource[i].addr = mapaddr;
        }
    }

    /* publish the record on uio_res_list */
    TAILQ_INSERT_TAIL(uio_res_list, uio_res, next);

    return (0);
}

 

回到pci驱动的初始化rte_eth_dev_init

/*
 * PCI-level init hook installed by rte_eth_driver_register(): called for a
 * PCI device that matched the driver, it sets up the generic ethdev state
 * and then calls the PMD's own eth_dev_init function.
 *
 * Returns 0 on success, -ENOMEM when no ethdev entry can be allocated, or
 * the non-zero value returned by the PMD's init function.
 */
static int
rte_eth_dev_init(struct rte_pci_driver *pci_drv,
         struct rte_pci_device *pci_dev)
{
    struct eth_driver    *eth_drv;
    struct rte_eth_dev *eth_dev;
    int diag;

    /* valid because pci_drv is the first member of struct eth_driver */
    eth_drv = (struct eth_driver *)pci_drv;

    /*
     * Per the article: allocates or looks up the memzone named
     * rte_eth_dev_data and returns this port's entry of the global
     * rte_eth_devices array.
     */
    eth_dev = rte_eth_dev_allocate();
    if (eth_dev == NULL)
        return -ENOMEM;

    if (rte_eal_process_type() == RTE_PROC_PRIMARY){
        /* allocate the PMD's private data area */
        eth_dev->data->dev_private = rte_zmalloc("ethdev private structure",
                  eth_drv->dev_private_size,
                  CACHE_LINE_SIZE);
        if (eth_dev->data->dev_private == NULL)
            rte_panic("Cannot allocate memzone for private port data\n");
    }
    eth_dev->pci_dev = pci_dev;
    eth_dev->driver = eth_drv;
    eth_dev->data->rx_mbuf_alloc_failed = 0;

    /* init user callbacks */
    TAILQ_INIT(&(eth_dev->callbacks));

    /*
     * Set the default maximum frame size.
     */
    eth_dev->data->max_frame_size = ETHER_MAX_LEN;

    /*
     * Invoke PMD device initialization function. This is the PMD-level
     * init; the current function is itself the PCI-level init hook.
     */
    diag = (*eth_drv->eth_dev_init)(eth_drv, eth_dev);
    if (diag == 0)
        return (0);

    /*
     * Initialisation failed: release the private data and undo the port
     * count. vendor_id is printed with %x so the digits agree with the
     * "0x" prefix (it was previously printed in decimal with %u).
     */
    PMD_DEBUG_TRACE("driver %s: eth_dev_init(vendor_id=0x%x device_id=0x%x)"
            " failed\n", pci_drv->name,
            (unsigned) pci_dev->id.vendor_id,
            (unsigned) pci_dev->id.device_id);
    if (rte_eal_process_type() == RTE_PROC_PRIMARY)
        rte_free(eth_dev->data->dev_private);
    nb_ports--;
    return diag;
}

 

PMD驱动的初始化过程

/*
 * PMD-level initialisation of an em (e1000) Ethernet device.
 *
 * Installs the device operations and the RX/TX burst functions, then, in the
 * primary process only, initialises the hardware, allocates and fills the
 * MAC address table, clears the shadow VFTA and registers the interrupt
 * callback for the device.
 *
 * Returns 0 on success, -ENODEV when the hardware cannot be initialised and
 * -ENOMEM when the MAC address table cannot be allocated.
 */
static int
eth_em_dev_init(__attribute__((unused)) struct eth_driver *eth_drv,
        struct rte_eth_dev *eth_dev)
{
    struct rte_pci_device *pci_dev;
    struct e1000_hw *hw =
        E1000_DEV_PRIVATE_TO_HW(eth_dev->data->dev_private);
    struct e1000_vfta * shadow_vfta =
        E1000_DEV_PRIVATE_TO_VFTA(eth_dev->data->dev_private);

    pci_dev = eth_dev->pci_dev;
    eth_dev->dev_ops = &eth_em_ops;
    eth_dev->rx_pkt_burst = (eth_rx_burst_t)&eth_em_recv_pkts;
    eth_dev->tx_pkt_burst = (eth_tx_burst_t)&eth_em_xmit_pkts;

    /* for secondary processes, we don't initialise any further as primary
     * has already done this work. Only check we don't need a different
     * RX function */
    if (rte_eal_process_type() != RTE_PROC_PRIMARY){
        if (eth_dev->data->scattered_rx)
            eth_dev->rx_pkt_burst =
                (eth_rx_burst_t)&eth_em_recv_scattered_pkts;
        return 0;
    }

    /* the device registers are reached through the mapping of resource 0 */
    hw->hw_addr = (void *)pci_dev->mem_resource[0].addr;
    hw->device_id = pci_dev->id.device_id;

    /* For ICH8 support we'll need to map the flash memory BAR */

    if (e1000_setup_init_funcs(hw, TRUE) != E1000_SUCCESS ||
            em_hw_init(hw) != 0) {
        PMD_INIT_LOG(ERR, "port_id %d vendorID=0x%x deviceID=0x%x: "
            "failed to init HW",
            eth_dev->data->port_id, pci_dev->id.vendor_id,
            pci_dev->id.device_id);
        return -(ENODEV);
    }

    /* Allocate memory for storing MAC addresses */
    eth_dev->data->mac_addrs = rte_zmalloc("e1000", ETHER_ADDR_LEN *
            hw->mac.rar_entry_count, 0);
    if (eth_dev->data->mac_addrs == NULL) {
        PMD_INIT_LOG(ERR, "Failed to allocate %d bytes needed to "
            "store MAC addresses",
            ETHER_ADDR_LEN * hw->mac.rar_entry_count);
        return -(ENOMEM);
    }

    /* Copy the permanent MAC address */
    ether_addr_copy((struct ether_addr *) hw->mac.addr,
        eth_dev->data->mac_addrs);

    /* initialize the vfta */
    memset(shadow_vfta, 0, sizeof(*shadow_vfta));

    PMD_INIT_LOG(INFO, "port_id %d vendorID=0x%x deviceID=0x%x\n",
            eth_dev->data->port_id, pci_dev->id.vendor_id,
            pci_dev->id.device_id);

    /*
     * Register the interrupt callback for this device; eth_dev is passed as
     * the callback argument. NOTE(review): the return value is not checked.
     */
    rte_intr_callback_register(&(pci_dev->intr_handle),
        eth_em_interrupt_handler, (void *)eth_dev);

    return (0);
}

PMD驱动初始化主要是一些硬件相关的寄存器初始化以及函数的初始化,细节就不再分析了;函数的最后注册了一个中断处理函数,下面主要分析中断处理的过程;

/*
 * Register callback `cb` (with argument `cb_arg`) for the interrupt source
 * identified by intr_handle->fd.
 *
 * The callback is appended to the existing source with the same fd, or a new
 * source is created and appended to intr_sources. When the set of fds that
 * must be watched changes, one byte is written to intr_pipe.writefd so the
 * interrupt thread rebuilds its epoll wait list.
 *
 * Returns 0 on success, -EINVAL on invalid parameters, -ENOMEM when an
 * allocation fails and -EPIPE when the notification write fails.
 */
int
rte_intr_callback_register(struct rte_intr_handle *intr_handle,
            rte_intr_callback_fn cb, void *cb_arg)
{
    int ret;
    int wake_thread = 0;
    struct rte_intr_source *src;
    struct rte_intr_callback *callback;

    /*
     * first do parameter checking; per the article, intr_handle.fd is the
     * descriptor of the /dev/uioX file backing the PCI memory mapping
     */
    if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
        RTE_LOG(ERR, EAL,
            "Registering with invalid input parameter\n");
        return -EINVAL;
    }

    /* allocate a new interrupt callback entity */
    callback = rte_zmalloc("interrupt callback list",
                sizeof(*callback), 0);
    if (callback == NULL) {
        RTE_LOG(ERR, EAL, "Can not allocate memory\n");
        return -ENOMEM;
    }
    callback->cb_fn = cb;
    callback->cb_arg = cb_arg;

    rte_spinlock_lock(&intr_lock);

    /* look for a source that already exists for this fd */
    TAILQ_FOREACH(src, &intr_sources, next) {
        if (src->intr_handle.fd != intr_handle->fd) {
            continue;
        }

        /* a source without callbacks is not being watched yet */
        if (TAILQ_EMPTY(&src->callbacks)) {
            wake_thread = 1;
        }

        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
        ret = 0;
        break;
    }

    /* no existing source for this fd - add a new one */
    if (src == NULL) {
        src = rte_zmalloc("interrupt source list", sizeof(*src), 0);
        if (src != NULL) {
            src->intr_handle = *intr_handle;
            TAILQ_INIT(&src->callbacks);
            TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
            TAILQ_INSERT_TAIL(&intr_sources, src, next);
            wake_thread = 1;
            ret = 0;
        } else {
            RTE_LOG(ERR, EAL, "Can not allocate memory\n");
            rte_free(callback);
            ret = -ENOMEM;
        }
    }

    rte_spinlock_unlock(&intr_lock);

    /**
     * notify the pipe fd waited on by epoll_wait so that the interrupt
     * thread rebuilds its wait list with the new fd.
     */
    if (wake_thread && write(intr_pipe.writefd, "1", 1) < 0) {
        return -EPIPE;
    }

    return ret;
}

 

在rte_eal_init初始化过程中调用了rte_eal_intr_init, rte_eal_intr_init里面会初始化一个中断处理线程

/*
 * Set up interrupt handling: initialise the global list of interrupt
 * sources, create the notification pipe and start the interrupt thread.
 *
 * Returns 0 on success, -1 when the pipe cannot be created, or the negated
 * pthread_create() result when the thread cannot be started.
 */
int
rte_eal_intr_init(void)
{
    int rc;

    /* init the global interrupt source head */
    TAILQ_INIT(&intr_sources);

    /**
     * this pipe is waited on by epoll; writing to it tells the interrupt
     * thread to rebuild its wait list.
     */
    if (pipe(intr_pipe.pipefd) < 0) {
        return -1;
    }

    /*
     * create the host thread that waits on the fds of intr_sources and
     * calls the matching callbacks
     */
    rc = pthread_create(&intr_thread, NULL,
            eal_intr_thread_main, NULL);
    if (rc == 0) {
        return 0;
    }

    RTE_LOG(ERR, EAL,
        "Failed to create thread for interrupt handling\n");
    return -rc;
}

 

/**
 * Body of the interrupt-handling host thread.
 *
 * Each pass of the outer loop creates a fresh epoll instance, adds the
 * notification pipe and the fd of every interrupt source that currently has
 * callbacks, and hands the set to eal_intr_handle_interrupts(). Once that
 * call returns the epoll fd is closed and the set is rebuilt from scratch.
 *
 * @param arg
 *  pointer. (unused)
 *
 * @return
 *  never returns.
 */
static __attribute__((noreturn)) void *
eal_intr_thread_main(__rte_unused void *arg)
{
    struct epoll_event ev;

    /* host thread, never break out */
    for (;;) {
        static struct epoll_event pipe_event = {
            .events = EPOLLIN | EPOLLPRI,
        };
        struct rte_intr_source *src;
        unsigned numfds = 0;
        int pfd;

        /* create epoll fd */
        pfd = epoll_create(1);
        if (pfd < 0) {
            rte_panic("Cannot create epoll instance\n");
        }

        /**
         * the pipe goes into the wait list first: a registration writes
         * to its other end, which makes this thread rebuild the list
         * from intr_sources.
         */
        pipe_event.data.fd = intr_pipe.readfd;
        if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
                        &pipe_event) < 0) {
            rte_panic("Error adding fd to %d epoll_ctl, %s\n",
                    intr_pipe.readfd, strerror(errno));
        }
        numfds++;

        rte_spinlock_lock(&intr_lock);

        /* add the fd of every source in intr_sources that has callbacks */
        TAILQ_FOREACH(src, &intr_sources, next) {
            if (TAILQ_EMPTY(&src->callbacks)) {
                continue; /* skip those with no callbacks */
            }

            ev.events = EPOLLIN | EPOLLPRI;
            ev.data.fd = src->intr_handle.fd;

            /* add the uio device file descriptor into the wait list */
            if (epoll_ctl(pfd, EPOLL_CTL_ADD,
                    src->intr_handle.fd, &ev) < 0) {
                rte_panic("Error adding fd %d epoll_ctl, %s\n",
                    src->intr_handle.fd, strerror(errno));
            } else {
                numfds++;
            }
        }

        rte_spinlock_unlock(&intr_lock);

        /* serve the interrupts: wait for fd events and run the callbacks */
        eal_intr_handle_interrupts(pfd, numfds);

        /**
         * when we return, we need to rebuild the
         * list of fds to monitor.
         */
        close(pfd);
    }
}

 

/*
 * Process the `nfds` ready events reported by epoll.
 *
 * For each event the matching interrupt source is looked up, the fd is read
 * to clear its ready state, and every callback registered on the source is
 * called. intr_lock is held while the lists are inspected but is dropped
 * around the read() and around each callback invocation.
 *
 * Returns -1 as soon as the notification pipe is found ready (the caller is
 * expected to rebuild its wait list), 0 once all events were processed.
 */
static int
eal_intr_process_interrupts(struct epoll_event *events, int nfds)
{
    int n, bytes_read;
    struct rte_intr_source *src;
    struct rte_intr_callback *cb;
    union rte_intr_read_buffer buf;
    struct rte_intr_callback active_cb;

    for (n = 0; n < nfds; n++) {

        /**
         * if the pipe fd is ready to read, drain it and return out to
         * rebuild the wait list; the remaining events are not processed
         * in this pass.
         */
        if (events[n].data.fd == intr_pipe.readfd){
            int r = read(intr_pipe.readfd, buf.charbuf,
                    sizeof(buf.charbuf));
            RTE_SET_USED(r);
            return -1;
        }

        /* an interrupt fd: find the source it belongs to */
        rte_spinlock_lock(&intr_lock);
        TAILQ_FOREACH(src, &intr_sources, next)
            if (src->intr_handle.fd ==
                    events[n].data.fd)
                break;
        if (src == NULL){
            /* no source registered for this fd: ignore the event */
            rte_spinlock_unlock(&intr_lock);
            continue;
        }

        /* mark this interrupt source as active and release the lock. */
        src->active = 1;
        rte_spinlock_unlock(&intr_lock);

        /* per the article, the EM driver only involves these two types */
        /* set the length to be read for different handle type */
        switch (src->intr_handle.type) {
        case RTE_INTR_HANDLE_UIO:
            bytes_read = 4;
            break;
        case RTE_INTR_HANDLE_ALARM:
            bytes_read = sizeof(uint64_t);
            break;
        default:
            bytes_read = 1;
            break;
        }

        /**
         * read out to clear the ready-to-be-read flag
         * for epoll_wait.
         */
        bytes_read = read(events[n].data.fd, &buf, bytes_read);

        if (bytes_read < 0)
            RTE_LOG(ERR, EAL, "Error reading from file "
                "descriptor %d: %s\n", events[n].data.fd,
                            strerror(errno));
        else if (bytes_read == 0)
            RTE_LOG(ERR, EAL, "Read nothing from file "
                "descriptor %d\n", events[n].data.fd);

        /* grab a lock, again to call callbacks and update status. */
        rte_spinlock_lock(&intr_lock);

        /* callbacks run only when the read actually returned data */
        if (bytes_read > 0) {

            /* Finally, call all callbacks. */
            TAILQ_FOREACH(cb, &src->callbacks, next) {

                /* make a copy and unlock. */
                active_cb = *cb;
                rte_spinlock_unlock(&intr_lock);

                /* call the actual callback */
                active_cb.cb_fn(&src->intr_handle,
                    active_cb.cb_arg);

                /* get the lock back. */
                rte_spinlock_lock(&intr_lock);
            }
        }

        /* we done with that interrupt source, release it. */
        src->active = 0;
        rte_spinlock_unlock(&intr_lock);
    }

    return 0;
}

 

对于E1000的驱动注册的callback eth_em_interrupt_handler里面处理了link状态的回调, link down消息则关闭收发包, link up开启收发包;

/*
 * Interrupt callback registered by the em driver. `param` is the
 * rte_eth_dev that was passed as the callback argument at registration;
 * `handle` is unused.
 */
static void
eth_em_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
                            void *param)
{
    struct rte_eth_dev *dev = (struct rte_eth_dev *)param;

    /* read the registers: did the hardware status change? */
    eth_em_interrupt_get_status(dev);
    /* program the RX/TX registers according to the link state */
    eth_em_interrupt_action(dev);

    /*
     * run the user's callbacks; an application interested in this event can
     * register one with rte_eth_dev_callback_register
     */
    _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC);
}

 

后面还有收发包队列的初始化, 待分析;

posted @ 2014-04-01 20:34  chanwai1219  阅读(7607)  评论(0编辑  收藏  举报