tcp_notsent_lowat +epoll

tcp_notsent_lowat 控制发送缓存队列中“尚未发送的数据量”的门限:只有未发送数据量低于该值时,socket 才被认为可写,对外表现为 epoll 的 EPOLLOUT 事件。

  内核函数 tcp_poll 根据 __sk_stream_is_writeable 的结果,判定何时通知用户层可以发送数据。当发送队列中尚未发送的数据量小于 notsent_lowat(并且发送缓存仍有足够余量)时,通过 EPOLLOUT 通知应用层可写,无需等待;反之,应用层需要等待。

 

if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
            if (__sk_stream_is_writeable(sk, 1)) {
                mask |= EPOLLOUT | EPOLLWRNORM;
 } 

/*
 * Free room left in the stream send buffer: the send buffer limit
 * (sk_sndbuf) minus what is currently queued (sk_wmem_queued).
 */
static inline int sk_stream_wspace(const struct sock *sk)
{
    return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued);
}

/*
 * Compute minimal free write space needed to queue new packets:
 * half of what is currently queued.
 */
static inline int sk_stream_min_wspace(const struct sock *sk)
{
    return READ_ONCE(sk->sk_wmem_queued) >> 1;
}

/*
 * A stream socket is writeable when BOTH conditions hold:
 *   1. the free send-buffer space is at least the minimal write space, and
 *   2. __sk_stream_memory_free() agrees for the given @wake value.
 * The second check is only evaluated when the first one passes.
 */
static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake)
{
    /* Not enough free buffer space: no need to ask the protocol. */
    if (sk_stream_wspace(sk) < sk_stream_min_wspace(sk))
        return false;

    return __sk_stream_memory_free(sk, wake);
}
/*
 * Report whether the amount of not-yet-sent data is below the socket's
 * notsent_lowat threshold.
 *
 * @wake is one when sk_stream_write_space() calls us. In that case the
 * unsent byte count is doubled (shifted left by @wake) before the
 * comparison, so EPOLLOUT is only signalled once unsent data has dropped
 * below half of the limit. This mimics the strategy used in
 * sock_def_write_space().
 */
static inline bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
    const struct tcp_sock *tp = tcp_sk(sk);
    /* Data queued by the application but not yet transmitted. */
    u32 unsent = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
    u32 scaled_unsent = unsent << wake;

    return scaled_unsent < tcp_notsent_lowat(tp);
}
/*
 * Effective not-sent low-water mark for a TCP socket: the per-socket
 * notsent_lowat when it is non-zero, otherwise the network namespace's
 * sysctl_tcp_notsent_lowat value.
 */
static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
{
    struct net *net = sock_net((struct sock *)tp);
    u32 per_socket = tp->notsent_lowat;

    if (per_socket != 0)
        return per_socket;

    return net->ipv4.sysctl_tcp_notsent_lowat;
}
//如果该 socket 通过 TCP_NOTSENT_LOWAT 设置了非零的 notsent_lowat,则使用该设置值;否则使用系统默认值 net.ipv4.tcp_notsent_lowat = 4294967295

 

  判断发送缓存何时变得可用时,内核使用 __sk_stream_is_writeable 函数。首要条件是发送队列缓存空间的余量(sk_stream_wspace = sk_sndbuf - sk_wmem_queued)大于等于当前发送队列占用空间的一半(sk_wmem_queued / 2),即 sk_wmem_queued 不超过 sk_sndbuf 的 2/3,还有 1/3 以上的空余空间。其次,是未发送的数据量低于 notsent_lowat 的值。

// Test the behavior of EPOLLET for EPOLLOUT with custom notsent lowat.
//
// When sendbuffer is cleared, epoll_wait returns EPOLLOUT after a partial
// write, regardless of whether the user received EAGAIN or not.
//
// Init commands: load the common defaults and set the tcp_wmem sysctl.
`../../common/defaults.sh
../common/set_sysctls.py /proc/sys/net/ipv4/tcp_wmem="4096 50000 4194304"
`
    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
   +0 bind(3, ..., ...) = 0
   +0 listen(3, 1) = 0

   // Three-way handshake initiated by the remote peer.
   +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
   +0 < . 1:1(0) ack 1 win 257

   +0 accept(3, ..., ...) = 4

   // Set not-sent lowat to 32K and read it back to confirm.
   +0 setsockopt(4, SOL_TCP, TCP_NOTSENT_LOWAT, [32000], 4) = 0
   +0 getsockopt(4, SOL_TCP, TCP_NOTSENT_LOWAT, [32000], [4]) = 0

   // Make the socket non-blocking so that writes can be partial.
   +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
   +0 epoll_create(1) = 5

   // Add the FD as EPOLLET and clear all the events.
   +0 epoll_ctl(5, EPOLL_CTL_ADD, 4, {events=EPOLLOUT|EPOLLIN|EPOLLET, fd=4}) = 0
   +0 epoll_wait(5, {events=EPOLLOUT, fd=4}, 1, 0) = 1
   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0

   // We have a partial write here: only 42000 of the 59000 bytes are accepted.
   +0 write(4, ..., 59000) = 42000

   // Now epoll_wait should not return any event because there is no space.
   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0

   // The ack does not open up enough space to send EPOLLOUT to the user.
   +0 > P. 1:10001(10000) ack 1
   +0 < . 1:1(0) ack 10001 win 257
   +0 > P. 10001:26001(16000) ack 1
   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0

   // Now this ack opens up substantial space and we will send EPOLLOUT.
   +0 < . 1:1(0) ack 26001 win 257
   +0 > P. 26001:42001(16000) ack 1
   +0 epoll_wait(5, {events=EPOLLOUT, fd=4}, 1, 0) = 1
   // EPOLLET prevents a second EPOLLOUT.
   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0

   // We have another partial write here: 48000 of 180000 bytes are accepted.
   +0 write(4, ..., 180000) = 48000

   // The ack does not open enough space to send EPOLLOUT to the user.
   +0 < . 1:1(0) ack 42001 win 257
   +0 > P. 42001:74001(32000) ack 1
   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0

   // Still no event is pending before the next ack arrives.
   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0
   // The ack opens more space, and we send EPOLLOUT to the user
   // (exactly once, because of EPOLLET).
   +0 < . 1:1(0) ack 74001 win 257
   +0 epoll_wait(5, {events=EPOLLOUT, fd=4}, 1, 0) = 1
   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0
   +0 > P. 74001:90001(16000) ack 1

   // Another partial write.
   +0 write(4, ..., 180000) = 48000

   // This ack does not yet produce an EPOLLOUT.
   +0 < . 1:1(0) ack 90001 win 257
   +0 > P. 90001:106001(16000) ack 1
   +0 epoll_wait(5, {events=0, ptr=0}, 1, 0) = 0
   // This ack opens more space, so we send EPOLLOUT.
   +0 < . 1:1(0) ack 106001 win 257
   +0 > P. 106001:138001(32000) ack 1
   +0 epoll_wait(5, {events=EPOLLOUT, fd=4}, 1, 0) = 1

 

posted @ 2024-12-21 00:24  codestacklinuxer  阅读(45)  评论(0)    收藏  举报