linux同步IO复用之-select
先来看一下其函数声明:
/* According to POSIX.1-2001, POSIX.1-2008 */ #include <sys/select.h> /* According to earlier standards */ #include <sys/time.h> #include <sys/types.h> #include <unistd.h> int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout); void FD_CLR(int fd, fd_set *set); int FD_ISSET(int fd, fd_set *set); void FD_SET(int fd, fd_set *set); void FD_ZERO(fd_set *set); #include <sys/select.h> int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask); Feature Test Macro Requirements for glibc (see feature_test_macros(7)): pselect(): _POSIX_C_SOURCE >= 200112L
学习的过程中,要慢慢学着提问题? 我们就带着问题来看
问题一、根据手册描述,nfds是三个文件描述符集合中,数值最大的描述符加上1, 这是为什么?
下面的代码来自内核include/linux/posix_types.h,是fd_set相关的定义。
21 #undef __NFDBITS 22 #define __NFDBITS (8 * sizeof(unsigned long)) 23 24 #undef __FD_SETSIZE 25 #define __FD_SETSIZE 1024 26 27 #undef __FDSET_LONGS 28 #define __FDSET_LONGS (__FD_SETSIZE/__NFDBITS) 29 30 #undef __FDELT 31 #define __FDELT(d) ((d) / __NFDBITS) 32 33 #undef __FDMASK 34 #define __FDMASK(d) (1UL << ((d) % __NFDBITS)) 35 36 typedef struct { 37 unsigned long fds_bits [__FDSET_LONGS]; 38 } __kernel_fd_set;
可以看到内核中,是使用位图来管理文件描述符的。如何来确定文件描述符N是否包含在一个fd_set内呢?
_FDMASK & fd_set[_FDELT[N + 1] ? True : False
换句话说,数值为n的文件描述符在位图中的索引就是n+1
因为select执行的时候需要遍历所有添加的文件描述符,这当然也包括最大的文件描述符了,而其索引就是n+1
既然说到内核了就多说两句,select实现在文件fs/select.c中,比较关键的两个函数是core_sys_select和do_select(内核版本2.6.32.71),
可以看到do_select中有一个大的循环来遍历文件描述符,这也就是文件描述符比较多时,select性能不足的原因了。
int do_select(int n, fd_set_bits *fds, struct timespec *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; unsigned long slack = 0; rcu_read_lock(); retval = max_select_fd(n, fds); rcu_read_unlock(); if (retval < 0) return retval; n = retval; poll_initwait(&table); wait = &table.pt; if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { wait = NULL; timed_out = 1; } if (end_time && !timed_out) slack = estimate_accuracy(end_time); retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; for (i = 0; i < n; ++rinp, ++routp, ++rexp) { unsigned long in, out, ex, all_bits, bit = 1, mask, j; unsigned long res_in = 0, res_out = 0, res_ex = 0; const struct file_operations *f_op = NULL; struct file *file = NULL; in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; if (all_bits == 0) { i += __NFDBITS; continue; } for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) { int fput_needed; if (i >= n) break; if (!(bit & all_bits)) continue; file = fget_light(i, &fput_needed); if (file) { f_op = file->f_op; mask = DEFAULT_POLLMASK; if (f_op && f_op->poll) { wait_key_set(wait, in, out, bit); mask = (*f_op->poll)(file, wait); } fput_light(file, fput_needed); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; wait = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; wait = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; wait = NULL; } } } if (res_in) *rinp = res_in; if (res_out) *routp = res_out; if (res_ex) *rexp = res_ex; cond_resched(); } wait = NULL; if (retval || timed_out || signal_pending(current)) break; if (table.error) { retval = table.error; break; } /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } poll_freewait(&table); return retval; } /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ #define MAX_SELECT_SECONDS \ ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec *end_time) { fd_set_bits fds; void *bits; int ret, max_fds; unsigned int size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; bits = kmalloc(6 * size, GFP_KERNEL); if (!bits) goto out_nofds; } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; fds.res_in = bits + 3*size; fds.res_out = bits + 4*size; fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (set_fd_set(n, inp, fds.res_in) || set_fd_set(n, outp, fds.res_out) || set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kfree(bits); out_nofds: return ret; }
问题二、如果select在阻塞的时候被信号中断,会发生什么?
根据man手册(man 7 signal)可以知道,select将返回-1,并将errno设置为EINTER
系统调用被信号中断后,是什么情况? 这个不仅与具体的系统调用有关,还与信号处理函数的安装有关(SA_RESTART标志)
BUGS:
Under Linux, select() may report a socket file descriptor as "ready for reading", while nevertheless a subsequent read blocks. This could
for example happen when data has arrived but upon examination has wrong checksum and is discarded. There may be other circumstances in
which a file descriptor is spuriously reported as ready. Thus it may be safer to use O_NONBLOCK on sockets that should not block.
在Linux下,可能出现如下情形: select报告一个套结字描述符状态可读,但是下一次read()调用却被阻塞了。
这是可能发生的,举个例子,数据已经被接收但是因为校验和错误而被丢弃。也可能存在其他情形,一个描述符被虚假的报告为可读。所以,在一个不应该
被阻塞的套结字上使用非阻塞模式读取是更加安全的做法。
浙公网安备 33010602011771号