tcp_notsent_lowat +epoll
tcp_notsent_lowat控制发送缓存队列中未发送数据量的阈值:只有未发送数据量低于此值时,socket才被视为可写,对应用层的表现就是epoll返回EPOLLOUT事件。
内核函数tcp_poll通过__sk_stream_is_writeable(sk, 1)的结果,判定何时可以通知用户层发送数据。当发送缓存队列中尚未发送的数据量小于notsent_lowat值时(以wake=1调用时,未发送数据量会先左移一位再比较,即实际要求小于notsent_lowat的一半),通过EPOLLOUT通知应用层可写,无需等待;反之,应用层需要等待。
if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { if (__sk_stream_is_writeable(sk, 1)) { mask |= EPOLLOUT | EPOLLWRNORM; }
static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake) { return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && __sk_stream_memory_free(sk, wake); } static inline int sk_stream_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued); } /* * Compute minimal free write space needed to queue new packets. */ static inline int sk_stream_min_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_wmem_queued) >> 1; }
/* @wake is one when sk_stream_write_space() calls us. * This sends EPOLLOUT only if notsent_bytes is half the limit. * This mimics the strategy used in sock_def_write_space(). */ static inline bool tcp_stream_memory_free(const struct sock *sk, int wake) { const struct tcp_sock *tp = tcp_sk(sk); u32 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); return (notsent_bytes << wake) < tcp_notsent_lowat(tp); }
/*
 * Effective not-sent low-water mark for a TCP socket.
 *
 * If the socket's own notsent_lowat (i.e. TCP_NOTSENT_LOWAT) has been
 * set to a non-zero value, that value is used; otherwise the
 * per-namespace sysctl value is used
 * (system default: net.ipv4.tcp_notsent_lowat = 4294967295).
 */
static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
{
	struct net *net = sock_net((struct sock *)tp);
	u32 per_socket = tp->notsent_lowat;

	if (per_socket)
		return per_socket;

	return net->ipv4.sysctl_tcp_notsent_lowat;
}
判断发送缓存何时变得可写时,内核使用sk_stream_is_writeable函数。首要条件是发送缓存的剩余空间(sk_stream_wspace,即sk_sndbuf - sk_wmem_queued)大于等于当前发送队列占用空间(sk_wmem_queued)的一半,也就是sk_wmem_queued不超过sk_sndbuf的2/3,即发送缓存中还有1/3以上的空余空间。其次,是未发送的数据量(左移wake位之后)低于notsent_lowat的值。
// Test the behavior of EPOLLET for EPOLLOUT with custom notsent lowat.
//
// When sendbuffer is cleared, epoll_wait returns EPOLLOUT after a partial
// write, regardless of whether the user received EAGAIN or not.

`../../common/defaults.sh ../common/set_sysctls.py /proc/sys/net/ipv4/tcp_wmem="4096 50000 4194304" `

0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0 bind(3, ..., ...) = 0
+0 listen(3, 1) = 0

+0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+0 < . 1:1(0) ack 1 win 257
+0 accept(3, ..., ...) = 4

// Set not-sent lowat to 32K.
+0 setsockopt(4, SOL_TCP, TCP_NOTSENT_LOWAT, [32000], 4) = 0
+0 getsockopt(4, SOL_TCP, TCP_NOTSENT_LOWAT, [32000], [4]) = 0
+0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0

+0 epoll_create(1) = 5
// Add the FD as EPOLLET and clear all the events.
+0 epoll_ctl(5, EPOLL_CTL_ADD, 4, {events=EPOLLOUT|EPOLLIN|EPOLLET, fd=4}) = 0
+0 epoll_wait(5, {events=EPOLLOUT, fd=4}, 1, 0) = 1
+0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0

// We have a partial write here.
+0 write(4, ..., 59000) = 42000
// Now epoll_wait should not return any event because there is no space.
+0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0

// The ack does not open up enough space to send EPOLLOUT to the user.
+0 > P. 1:10001(10000) ack 1
+0 < . 1:1(0) ack 10001 win 257
+0 > P. 10001:26001(16000) ack 1
// 42000 bytes were queued and 26000 have been sent, so 16000 are still unsent.
+0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0

// Now this ack opens up substantial space and we will send EPOLLOUT.
+0 < . 1:1(0) ack 26001 win 257
+0 > P. 26001:42001(16000) ack 1
// All 42000 queued bytes have now been sent.
+0 epoll_wait(5, {events=EPOLLOUT, fd=4}, 1, 0) = 1
// EPOLLET prevents a second EPOLLOUT.
+0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0

// We have another partial write here.
+0 write(4, ..., 180000) = 48000
// The ack does not open enough space to send EPOLLOUT to the user.
+0 < . 1:1(0) ack 42001 win 257
+0 > P. 42001:74001(32000) ack 1
+0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0

// The ack opens more space, and we send EPOLLOUT to the user.
+0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0
+0 < . 1:1(0) ack 74001 win 257
+0 epoll_wait(5, {events=EPOLLOUT, fd=4}, 1, 0) = 1
+0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0
+0 > P. 74001:90001(16000) ack 1

// Another partial write.
+0 write(4, ..., 180000) = 48000
+0 < . 1:1(0) ack 90001 win 257
+0 > P. 90001:106001(16000) ack 1
+0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0

// This ack opens more space, so we send EPOLLOUT.
+0 < . 1:1(0) ack 106001 win 257
+0 > P. 106001:138001(32000) ack 1
+0 epoll_wait(5, {events=EPOLLOUT, fd=4}, 1, 0) = 1
http代理服务器(3-4-7层代理)-网络事件库公共组件、内核kernel驱动 摄像头驱动 tcpip网络协议栈、netfilter、bridge 好像看过!!!!
但行好事 莫问前程
--身高体重180的胖子

浙公网安备 33010602011771号