linux同步IO复用之-select

最近对libev libuv这些异步事件库,比较感兴趣,为了知其所以然,决定重新认识一下select/poll/epoll, 这里是对select的简单总结

先来看一下其函数声明:

       /* According to POSIX.1-2001, POSIX.1-2008 */
       #include <sys/select.h>

       /* According to earlier standards */
       #include <sys/time.h>
       #include <sys/types.h>
       #include <unistd.h>

       int select(int nfds, fd_set *readfds, fd_set *writefds,
                  fd_set *exceptfds, struct timeval *timeout);

       void FD_CLR(int fd, fd_set *set);
       int  FD_ISSET(int fd, fd_set *set);
       void FD_SET(int fd, fd_set *set);
       void FD_ZERO(fd_set *set);

       #include <sys/select.h>

       int pselect(int nfds, fd_set *readfds, fd_set *writefds,
                   fd_set *exceptfds, const struct timespec *timeout,
                   const sigset_t *sigmask);

   Feature Test Macro Requirements for glibc (see feature_test_macros(7)):

       pselect(): _POSIX_C_SOURCE >= 200112L

 

学习的过程中,要慢慢学着提问题。下面我们就带着问题来看。

问题一、根据手册描述,nfds是三个文件描述符集合中,数值最大的描述符加上1, 这是为什么?

               下面的代码来自内核include/linux/posix_types.h,是fd_set相关的定义。    

 21 #undef __NFDBITS                                                                
 22 #define __NFDBITS   (8 * sizeof(unsigned long))                                 
 23                                                                                 
 24 #undef __FD_SETSIZE                                                             
 25 #define __FD_SETSIZE    1024                                                    
 26                                                                                 
 27 #undef __FDSET_LONGS                                                            
 28 #define __FDSET_LONGS   (__FD_SETSIZE/__NFDBITS)                                
 29                                                                                 
 30 #undef __FDELT                                                                  
 31 #define __FDELT(d)  ((d) / __NFDBITS)                                           
 32                                                                                 
 33 #undef __FDMASK                                                                 
 34 #define __FDMASK(d) (1UL << ((d) % __NFDBITS))                                  
 35                                                                                 
 36 typedef struct {                                                                
 37     unsigned long fds_bits [__FDSET_LONGS];                                     
 38 } __kernel_fd_set;                                

    可以看到内核中,是使用位图来管理文件描述符的。如何来确定文件描述符N是否包含在一个fd_set内呢?

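    fd_set.fds_bits[__FDELT(N)] & __FDMASK(N) ?  True : False

              换句话说,数值为n的文件描述符对应位图中的第n位(从0开始计数),所以要覆盖描述符0到n,总共需要检查n+1个位。

    nfds表示的正是内核需要检查的位数(即描述符的取值范围是[0, nfds)),而不是某个描述符的值。select执行的时候需要遍历所有添加的文件描述符,这当然也包括数值最大的文件描述符了,因此nfds必须是最大描述符加上1。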

    既然说到内核了就多说两句,select实现在文件fs/select.c中,比较关键的两个函数是core_sys_select和do_select(内核版本2.6.32.71),

    可以看到do_select中有一个大的循环来遍历文件描述符,这也就是文件描述符比较多时,select性能不足的原因了。

/*
 * do_select - scan the requested descriptors and count the ready ones.
 *
 * @n:        number of descriptor bits to look at; immediately replaced by
 *            the value returned from max_select_fd().
 * @fds:      in/out/ex are the requested bitmaps, res_in/res_out/res_ex
 *            receive the ready bits.
 * @end_time: timeout; when NULL the `to` pointer handed to
 *            poll_schedule_timeout() stays NULL, and an all-zero value
 *            means a single scan with no waiting.
 *
 * Returns the number of ready (descriptor, condition) hits, 0 when the loop
 * ended with nothing ready, a negative value from max_select_fd(), or
 * table.error.
 */
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
{
    ktime_t expire, *to = NULL;
    struct poll_wqueues table;
    poll_table *wait;
    int retval, i, timed_out = 0;
    unsigned long slack = 0;

    /* max_select_fd() is called under the RCU read lock; its result
     * becomes the new scan limit, a negative result is returned as-is. */
    rcu_read_lock();
    retval = max_select_fd(n, fds);
    rcu_read_unlock();

    if (retval < 0)
        return retval;
    n = retval;

    poll_initwait(&table);
    wait = &table.pt;
    /* Zero timeout: mark as already timed out and pass no wait table
     * to the ->poll() calls below, so only one scan is performed. */
    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
        wait = NULL;
        timed_out = 1;
    }

    if (end_time && !timed_out)
        slack = estimate_accuracy(end_time);

    retval = 0;
    /* Each iteration of this outer loop is one complete scan of all
     * descriptors in [0, n). */
    for (;;) {
        unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

        /* Restart every scan from the first word of each bitmap. */
        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

        /* One iteration per unsigned long word of the bitmaps; i is the
         * descriptor number and is advanced inside the body. */
        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, mask, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;
            const struct file_operations *f_op = NULL;
            struct file *file = NULL;

            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;
            /* Nothing requested in this word: skip all of its
             * descriptors at once. */
            if (all_bits == 0) {
                i += __NFDBITS;
                continue;
            }

            /* One iteration per bit of the word; `bit` is the mask for
             * descriptor i within the current word. */
            for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
                int fput_needed;
                if (i >= n)
                    break;
                if (!(bit & all_bits))
                    continue;
                /* A descriptor with no file behind it is skipped
                 * without reporting anything. */
                file = fget_light(i, &fput_needed);
                if (file) {
                    f_op = file->f_op;
                    /* Used unchanged when the file has no ->poll(). */
                    mask = DEFAULT_POLLMASK;
                    if (f_op && f_op->poll) {
                        wait_key_set(wait, in, out, bit);
                        mask = (*f_op->poll)(file, wait);
                    }
                    fput_light(file, fput_needed);
                    /* A condition counts only if the caller asked for
                     * it. After the first hit wait is cleared, so
                     * later ->poll() calls receive a NULL wait table. */
                    if ((mask & POLLIN_SET) && (in & bit)) {
                        res_in |= bit;
                        retval++;
                        wait = NULL;
                    }
                    if ((mask & POLLOUT_SET) && (out & bit)) {
                        res_out |= bit;
                        retval++;
                        wait = NULL;
                    }
                    if ((mask & POLLEX_SET) && (ex & bit)) {
                        res_ex |= bit;
                        retval++;
                        wait = NULL;
                    }
                }
            }
            /* Result words are written only when non-zero. */
            if (res_in)
                *rinp = res_in;
            if (res_out)
                *routp = res_out;
            if (res_ex)
                *rexp = res_ex;
            cond_resched();
        }
        /* After the first full scan no later scan passes a wait table. */
        wait = NULL;
        /* Stop when something is ready, the timeout has expired, or a
         * signal is pending for the current task. */
        if (retval || timed_out || signal_pending(current))
            break;
        if (table.error) {
            retval = table.error;
            break;
        }

        /*
         * If this is the first loop and we have a timeout
         * given, then we convert to ktime_t and set the to
         * pointer to the expiry value.
         */
        if (end_time && !to) {
            expire = timespec_to_ktime(*end_time);
            to = &expire;
        }

        /* A zero return from poll_schedule_timeout() is treated as the
         * timeout having expired; the next scan is then the last one. */
        if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                       to, slack))
            timed_out = 1;
    }

    poll_freewait(&table);

    return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
/* Expands to ((unsigned long)(MAX_SCHEDULE_TIMEOUT / HZ)) - 1; presumably an
 * upper bound on a timeout in seconds - it is not referenced in this excerpt. */
#define MAX_SELECT_SECONDS \
    ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)

/*
 * core_sys_select - copy the three user fd_sets in, run do_select() on
 * them, and copy the result sets back to user space.
 *
 * @n:        number of descriptors to examine; a negative value yields
 *            -EINVAL, a value above fdt->max_fds is clamped to it.
 * @inp, @outp, @exp: user-space sets, read as input and then overwritten
 *            with the result sets.
 * @end_time: passed through to do_select() unchanged.
 *
 * Returns the do_select() result on success. Error returns are -EINVAL,
 * -ENOMEM (kmalloc failure), the non-zero value of a failing get_fd_set(),
 * a negative do_select() result, -ERESTARTNOHAND (nothing ready and a
 * signal is pending), or -EFAULT (a set_fd_set() call returned non-zero).
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
               fd_set __user *exp, struct timespec *end_time)
{
    fd_set_bits fds;
    void *bits;
    int ret, max_fds;
    unsigned int size;
    struct fdtable *fdt;
    /* Allocate small arguments on the stack to save memory and be faster */
    long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

    /* ret is staged before each check so the goto paths return it. */
    ret = -EINVAL;
    if (n < 0)
        goto out_nofds;

    /* max_fds can increase, so grab it once to avoid race */
    rcu_read_lock();
    fdt = files_fdtable(current->files);
    max_fds = fdt->max_fds;
    rcu_read_unlock();
    if (n > max_fds)
        n = max_fds;

    /*
     * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
     * since we used fdset we need to allocate memory in units of
     * long-words. 
     */
    size = FDS_BYTES(n);
    bits = stack_fds;
    /* The on-stack buffer is used only if all six bitmaps fit in it. */
    if (size > sizeof(stack_fds) / 6) {
        /* Not enough space in on-stack array; must use kmalloc */
        ret = -ENOMEM;
        bits = kmalloc(6 * size, GFP_KERNEL);
        if (!bits)
            goto out_nofds;
    }
    /* Carve the buffer into six consecutive bitmaps of `size` each
     * (bits is void *, so the offsets are byte offsets under GNU C). */
    fds.in      = bits;
    fds.out     = bits +   size;
    fds.ex      = bits + 2*size;
    fds.res_in  = bits + 3*size;
    fds.res_out = bits + 4*size;
    fds.res_ex  = bits + 5*size;

    /* The first get_fd_set() that returns non-zero aborts the call with
     * that value as the result. */
    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;
    /* Result bitmaps start out empty; do_select() only writes non-zero
     * words into them. */
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    ret = do_select(n, &fds, end_time);

    if (ret < 0)
        goto out;
    /* Nothing ready: return -ERESTARTNOHAND if a signal is pending,
     * otherwise continue with ret == 0 and copy the empty sets back. */
    if (!ret) {
        ret = -ERESTARTNOHAND;
        if (signal_pending(current))
            goto out;
        ret = 0;
    }

    /* Any non-zero return from set_fd_set() turns the result into
     * -EFAULT. */
    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;

out:
    /* Free the buffer only when it came from kmalloc(). */
    if (bits != stack_fds)
        kfree(bits);
out_nofds:
    return ret;
}
select内核实现核心代码

 


 

问题二、如果select在阻塞的时候被信号中断,会发生什么?

         根据man手册(man 7 signal)可以知道,select将返回-1,并将errno设置为EINTR

    系统调用被信号中断后,是什么情况? 这个不仅与具体的系统调用有关,还与信号处理函数的安装有关(SA_RESTART标志)。不过对于select/pselect/poll/epoll_wait这类多路复用接口,即使设置了SA_RESTART也不会自动重启,被信号处理函数中断后总是以EINTR失败返回

 

BUGS:

       Under Linux, select() may report a socket file descriptor as "ready for reading", while nevertheless a subsequent read blocks.  This  could
       for  example  happen  when  data has arrived but upon examination has wrong checksum and is discarded.  There may be other circumstances in
       which a file descriptor is spuriously reported as ready.  Thus it may be safer to use O_NONBLOCK on sockets that should not block.


       在Linux下,可能出现如下情形: select报告一个套接字描述符状态可读,但是随后的read()调用却被阻塞了。

       这是可能发生的,举个例子,数据已经到达,但是检查时发现校验和错误而被丢弃。也可能存在其他情形,使得一个描述符被虚假地报告为就绪。所以,在一个不应该

  被阻塞的套接字上使用O_NONBLOCK(非阻塞模式)是更加安全的做法。

 

posted @ 2016-03-28 21:59  我的湖  阅读(324)  评论(0)    收藏  举报