转载-网络基础-UDP传输队列长度sk_wmem_alloc统计
原文链接:UDP传输队列长度sk_wmem_alloc统计_siocoutq udp-CSDN博客
UDP协议使用sk_wmem_alloc统计当前UDP相关套接口发送缓存的占用。
统计初始化
在应用层创建套接口时,内核将新分配的套接口结构的成员变量sk_wmem_alloc初始化为1。
/*
 * sk_alloc (abridged excerpt): obtains a sock from sk_prot_alloc() and, when
 * that succeeds, sets sk_wmem_alloc to 1. The remainder of the function,
 * including its return statement, is not shown in this excerpt.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern)
{
struct sock *sk;
/* __GFP_ZERO is OR-ed into the caller's allocation flags. */
sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
if (sk) {
/* The counter starts at 1, not 0; sk_wmem_alloc_get() subtracts this 1 when reporting. */
refcount_set(&sk->sk_wmem_alloc, 1);
}
}
增加sk_wmem_alloc统计
增加sk_wmem_alloc统计的基础函数为skb_set_owner_w。在发送流程相关的函数中,其能够将数据包skb占用的空间truesize添加到sk_wmem_alloc统计中,同时skb的销毁回调函数destructor赋值为sock_wfree函数,其将在skb销毁时,减去skb缓存长度相应的sk_wmem_alloc统计值。
/*
 * skb_set_owner_w (abridged excerpt): installs sock_wfree as the skb's
 * destructor and adds skb->truesize to sk->sk_wmem_alloc.
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
/* sock_wfree subtracts the same truesize again when it runs. */
skb->destructor = sock_wfree;
/*
* We used to take a refcount on sk, but following operation
* is enough to guarantee sk_free() wont free this sock until
* all in-flight packets are completed
*/
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
内核UDP的两个发送函数为udp_sendmsg和udp_sendpage,此处以发送数据函数udp_sendmsg为例,无论是非CORK模式的ip_make_skb函数或是CORK模式的ip_append_data函数,最终都会调用到__ip_append_data函数,前者(非CORK模式的ip_make_skb函数)利用__ip_append_data函数将数据包添加到自定义的队列中发送,后者(CORK模式的ip_append_data函数)将数据添加到通用的套接口发送队列中(sk_write_queue)。__ip_append_data在分配将数据由用户层拷贝到内核的skb缓存时,使用skb_set_owner_w增加sk_wmem_alloc的统计值。
/*
 * udp_sendmsg (abridged excerpt): shows the two ways payload is handed to the
 * IP layer. Local declarations, the code that computes corkreq/ulen/fl4/ipc/rt,
 * the "out" label and the return are not shown in this excerpt.
 */
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
/* Lockless fast path for the non-corking case. */
if (!corkreq) {
/* Build the skb via ip_make_skb(); it is sent only if the result is neither an error pointer nor NULL. */
skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, msg->msg_flags);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
err = udp_send_skb(skb, fl4);
goto out;
}
do_append_data:
/* Add this call's length to the pending total, then append the data via ip_append_data(). */
up->len += ulen;
/* MSG_MORE is OR-ed into the flags when corkreq is set. */
err = ip_append_data(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
/*
 * On error, flush the pending frames; otherwise push them now when not
 * corking, or clear up->pending if the write queue is unexpectedly empty.
 */
if (err)
udp_flush_pending_frames(sk);
else if (!corkreq)
err = udp_push_pending_frames(sk);
else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
up->pending = 0;
}
函数__ip_append_data如下所示:
- 根据transhdrlen(transport header len 传输头部长度,与分片业务有关),选择使用sock_alloc_send_skb还是sock_wmalloc,两个函数内部都会去申请skb空间并调用skb_set_owner_w增加sk_wmem_alloc的统计值
- sock_alloc_send_skb(经由sock_alloc_send_pskb)调用alloc_skb_with_frags函数分配指定的线性缓存和页面片段
- sock_wmalloc调用alloc_skb申请skb
- 如果发送设备支持SG特性(NETIF_F_SG标志),将数据拷贝到skb的共享页面片段区,并且增加sk_wmem_alloc的统计值(待研究)
+++
一般先走sock_wmalloc流程:
- 在分配skb缓存之前执行了两次判断,首先在函数__ip_append_data中,只有sk_wmem_alloc统计值不超过两倍的套接口指定最大发送缓存值sk_sndbuf(net.core.wmem_default)时,才能执行sock_wmalloc分配函数,否则返回无内存错误-ENOBUFS。
- 其次进入sock_wmalloc函数,如果强制分配force为真(__ip_append_data将其固定为真1)或者sk_wmem_alloc统计值小于套接口指定的最大发送缓存值sk_sndbuf,则进行skb分配。最后也是使用skb_set_owner_w函数增加sk_wmem_alloc的统计值。
/*
 * __ip_append_data (abridged excerpt): only the choice of skb allocator and
 * the scatter-gather accounting are shown; the rest of the loop body, the
 * local declarations and the return are elided.
 */
static int __ip_append_data(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, ...)
{
while (length > 0) {
if (copy <= 0) {
if (transhdrlen) {
/* transhdrlen is non-zero: allocate through sock_alloc_send_skb(). */
skb = sock_alloc_send_skb(sk, alloclen + hh_len + 15, (flags & MSG_DONTWAIT), &err);
} else {
/*
 * Otherwise allocate with sock_wmalloc() (force argument is 1), but
 * only while sk_wmem_alloc is at most twice sk_sndbuf.
 */
if (refcount_read(&sk->sk_wmem_alloc) <= 2 * sk->sk_sndbuf)
skb = sock_wmalloc(sk, alloclen + hh_len + 15, 1, sk->sk_allocation);
}
}
if (!(rt->dst.dev->features&NETIF_F_SG)) {
/* The body of the non-SG branch is not shown in this excerpt. */
} else {
/*
 * Device advertises NETIF_F_SG: grow frags[i - 1] by copy bytes and add
 * the same amount to both skb->truesize and sk_wmem_alloc.
 */
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
skb->truesize += copy;
refcount_add(copy, &sk->sk_wmem_alloc);
}
}
}
/*
 * sock_alloc_send_pskb (abridged excerpt, allocation part): allocates an skb
 * with alloc_skb_with_frags() and, on success, charges it to the socket via
 * skb_set_owner_w(). Returns the skb (NULL when allocation failed). The
 * preceding wait loop is quoted separately later in this article.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, ...)
{
skb = alloc_skb_with_frags(header_len, data_len, max_page_order, errcode, sk->sk_allocation);
if (skb)
skb_set_owner_w(skb, sk);
return skb;
}
/*
 * sock_wmalloc (abridged excerpt): allocates an skb of the given size and
 * charges it to the socket, provided force is non-zero or sk_wmem_alloc is
 * still below sk_sndbuf. The return for the failure path is not shown in
 * this excerpt.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority)
{
/* force bypasses the sk_sndbuf limit check entirely. */
if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
struct sk_buff *skb = alloc_skb(size, priority);
if (skb) {
/* Add the new skb's truesize to sk_wmem_alloc. */
skb_set_owner_w(skb, sk);
return skb;
}
}
}
最后,两个IP层发送函数ip_finish_output2和ip_do_fragment在有需要的情况下会分配skb缓存。对于前者而言,当skb缓存的头部可用空间不足以容纳二层协议头部信息时,需要重新分配一个skb缓存,并将新的长度值添加到sk_wmem_alloc统计中。
/*
 * ip_finish_output2 (abridged excerpt): shows only the headroom reallocation.
 * Declarations, the handling of the original skb and the rest of the
 * function are not shown in this excerpt.
 */
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
/* Headroom is smaller than hh_len and the device has header_ops: get a copy with more headroom. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
/* Charge the replacement skb to the socket that owns the original, if there is one. */
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
}
}
对于分片函数ip_do_fragment,在执行慢速路径中,需要分配新的skb缓存拷贝需分片的skb缓存的部分长度,内核将新分配的skb缓存长度增加到sk_wmem_alloc统计中。
/*
 * ip_do_fragment (abridged excerpt): shows only the slow-path loop step that
 * allocates a new skb per fragment. Declarations, the fast path, the data
 * copy, the output call and the return are not shown in this excerpt.
 */
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *))
{
slow_path:
while (left > 0) {
/* Allocate a fresh skb for this fragment with GFP_ATOMIC. */
skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
ip_copy_metadata(skb2, skb);
/* Charge the fragment to the socket that owns the original skb, if there is one. */
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
}
}
减少sk_wmem_alloc
UDP两个基本的sk_wmem_alloc统计值减少函数分别为:套接口释放函数sk_free和skb缓存释放函数sock_wfree,另外,相对应的TCP的skb缓存释放函数为__sock_wfree。由于在套接口创建时,sk_wmem_alloc初始化为1,如果在sk_free中减1之后其值为0,表明套接口关联的发送队列已空,调用__sk_free释放套接口。否则,此套接口将在发送队列中最后一个skb缓存释放时被清除(调用__sk_free),参见函数sock_wfree。
/*
 * sk_free: decrements sk_wmem_alloc by one and calls __sk_free() only when
 * the counter reaches zero. If it does not reach zero here, sock_wfree()
 * makes the same zero check when it subtracts an skb's truesize.
 */
void sk_free(struct sock *sk)
{
if (refcount_dec_and_test(&sk->sk_wmem_alloc))
__sk_free(sk);
}
/*
 * sock_wfree: skb destructor (installed by skb_set_owner_w) that subtracts
 * skb->truesize from the owning socket's sk_wmem_alloc and calls __sk_free()
 * if the counter drops to zero.
 */
void sock_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
unsigned int len = skb->truesize;
if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
/*
 * Subtract all but one unit first, so the counter is still non-zero
 * while sk_write_space() runs; reaching zero here triggers WARN_ON.
 */
WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
sk->sk_write_space(sk);
/* Only the single held-back unit remains to be subtracted below. */
len = 1;
}
/* Final subtraction; free the sock if this brought the counter to zero. */
if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
__sk_free(sk);
}
统计值判断控制
如前所述的UDP发送路径上的skb分配函数sock_alloc_send_pskb,如果sk_wmem_alloc统计值低于套接口限定的发送缓存最大值sk_sndbuf,直接进行skb分配。否则,设置套接口的空间不足标志SOCKWQ_ASYNC_NOSPACE,如果用户设置了不需等待标志MSG_DONTWAIT立即返回错误码-EAGAIN,反之,等待缓存可用,参见函数sock_wait_for_wmem。
需要注意sk_wmem_alloc_get函数,其返回值为sk_wmem_alloc统计值减去1,由于在套接口分配时sk_wmem_alloc初始化为1,此处减去1意味着取出的是发送缓存skb及数据占用的空间长度。
/*
 * sock_alloc_send_pskb (abridged excerpt, wait part): loops until the
 * socket's send memory usage is below sk_sndbuf, then allocates the skb.
 * Declarations, the "failure"/"interrupted" labels and the return are not
 * shown in this excerpt.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, ...)
{
timeo = sock_sndtimeo(sk, noblock);
for (;;) {
/* sk_wmem_alloc_get() is the counter minus 1; leave the loop once it is below sk_sndbuf. */
if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
break;
/* No room: mark the socket as out of space before failing or sleeping. */
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
err = -EAGAIN;
/* A zero timeout takes the failure path with err = -EAGAIN. */
if (!timeo)
goto failure;
if (signal_pending(current))
goto interrupted;
/* Sleep for up to the remaining timeout, then re-check. */
timeo = sock_wait_for_wmem(sk, timeo);
}
skb = alloc_skb_with_frags(header_len, data_len, max_page_order, errcode, sk->sk_allocation);
}
/*
 * sk_wmem_alloc_get: returns sk_wmem_alloc minus 1, i.e. the counter without
 * the initial 1 that sk_alloc() sets.
 */
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
return refcount_read(&sk->sk_wmem_alloc) - 1;
}
直到发送缓存sk_wmem_alloc统计值降低到小于sk_sndbuf、超时或者当前进程有待处理的信号,sock_wait_for_wmem函数才会退出。
/*
 * sock_wait_for_wmem (abridged excerpt): sleeps until sk_wmem_alloc falls
 * below sk_sndbuf, the timeout runs out, or a signal is pending. The
 * declaration of "wait", any cleanup after the loop and the return are not
 * shown in this excerpt.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
for (;;) {
/* Timeout exhausted. */
if (!timeo)
break;
if (signal_pending(current))
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
/* Queue on the socket's wait queue before testing the condition. */
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
/* Note: compares the raw counter here, not sk_wmem_alloc_get(). */
if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
break;
/* Sleep; schedule_timeout() returns the time still remaining. */
timeo = schedule_timeout(timeo);
}
}
另外,内核网络协议中最重要的缓存空间判断函数__sk_mem_raise_allocated,对于UDP协议而言,如果sk_wmem_alloc统计值小于系统限定的协议最小值(/proc/sys/net/ipv4/udp_wmem_min),允许内存分配。
/*
 * __sk_mem_raise_allocated (abridged excerpt): shows only the send-side check
 * for non-stream sockets. The receive branch, the SOCK_STREAM branch, the
 * declaration of prot and all other logic are elided.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
if (kind == SK_MEM_RECV) {
/* Receive-side handling is not shown in this excerpt. */
} else { /* SK_MEM_SEND */
int wmem0 = sk_get_wmem0(sk, prot);
if (sk->sk_type == SOCK_STREAM) {
/* Stream-socket handling is not shown in this excerpt. */
} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
/* Non-stream socket below wmem0: return 1 (per the article text, allocation is allowed). */
return 1;
}
}
}
最后,在函数sock_def_write_space中,如果发送缓存空间sk_sndbuf大于等于sk_wmem_alloc统计值的两倍(即 sk_wmem_alloc << 1 <= sk_sndbuf),说明套接口已有足够的空间,唤醒等待在套接口队列上的进程;对于异步等待的进程,先使用函数sock_writeable再次确认sk_wmem_alloc统计值低于sk_sndbuf的一半大小,然后才唤醒异步等待的进程。此函数在skb缓存释放函数sock_wfree和用户层重设置最大发送缓存空间sk_sndbuf值的函数sock_setsockopt中调用。
/*
 * sock_def_write_space (abridged excerpt): wakes writers once sk_wmem_alloc
 * has dropped to at most half of sk_sndbuf. Any locking around the
 * rcu_dereference() is not shown in this excerpt.
 */
static void sock_def_write_space(struct sock *sk)
{
struct socket_wq *wq;
/* Proceed only when twice the counter is no larger than sk_sndbuf. */
if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
wq = rcu_dereference(sk->sk_wq);
/* Wake sleepers on the wait queue, if any. */
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | POLLWRNORM | POLLWRBAND);
/* Should agree with poll, otherwise some programs break */
if (sock_writeable(sk))
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
}
/*
 * sock_writeable: returns true when sk_wmem_alloc is strictly below half of
 * sk_sndbuf.
 */
static inline bool sock_writeable(const struct sock *sk)
{
return refcount_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1);
}
/*
 * sock_setsockopt (abridged excerpt): shows only the SO_SNDBUF case.
 * Declarations, the code that reads val from optval, the other options and
 * the return are not shown in this excerpt.
 */
int sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
switch (optname) {
case SO_SNDBUF:
/* Cap the requested value at sysctl_wmem_max. */
val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
/* Store twice the requested value, but never less than SOCK_MIN_SNDBUF. */
sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
/* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk);
break;
}
}
sk_wmem_alloc获取
应用层可通过IOCTL命令SIOCOUTQ获取到当前套接口的sk_wmem_alloc统计值,内核处理函数为如下udp_ioctl。
/*
 * udp_ioctl (abridged excerpt): shows only the SIOCOUTQ case, which copies
 * sk_wmem_alloc_get(sk) to the user-supplied int pointer. Other commands and
 * the end of the function are not shown in this excerpt.
 */
int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
switch (cmd) {
case SIOCOUTQ:
{
/* The reported value is the counter minus 1 (see sk_wmem_alloc_get). */
int amount = sk_wmem_alloc_get(sk);
return put_user(amount, (int __user *)arg);
}
}
另外,通过getsockopt接口选项SO_MEMINFO可获得所有的套接口缓存统计信息,其中包括sk_wmem_alloc统计值。UDP的diag接口也可获取到sk_wmem_alloc统计值,见函数udp_diag_get_info。
/*
 * sock_getsockopt (abridged excerpt): shows only that the SO_MEMINFO case
 * fills a local array through sk_get_meminfo(). Copying the result to user
 * space, the other options and the return are not shown in this excerpt.
 */
int sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
{
switch (optname) {
case SO_MEMINFO:
{
u32 meminfo[SK_MEMINFO_VARS];
sk_get_meminfo(sk, meminfo);
}
}
}
/*
 * sk_get_meminfo: fills mem[] (SK_MEMINFO_VARS entries) with a snapshot of
 * the socket's memory counters, indexed by the SK_MEMINFO_* constants.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
/* Zero the whole array first so any slot not assigned below reads as 0. */
memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
/* Send-side usage, reported as the counter minus 1 (see sk_wmem_alloc_get). */
mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
/*
 * udp_diag_get_info: stores the socket's receive and send memory usage in the
 * diag message's idiag_rqueue and idiag_wqueue fields. The info argument is
 * not used in the lines shown.
 */
static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info)
{
r->idiag_rqueue = sk_rmem_alloc_get(sk);
/* Send-side usage is the counter minus 1 (see sk_wmem_alloc_get). */
r->idiag_wqueue = sk_wmem_alloc_get(sk);
}
内核版本 4.15
本文来自博客园,作者:LiYanbin,转载请注明原文链接:https://www.cnblogs.com/stellar-liyanbin/p/18786899