Linux fork之后，到底是子进程先运行还是父进程先运行【转】

转自：https://blog.csdn.net/dog250/article/details/105756168

大约10年前，我写过两篇关于Linux内核CFS调度器的文章：
https://blog.csdn.net/dog250/article/details/5302865
https://blog.csdn.net/dog250/article/details/5302864

我觉得这两篇文章是垃圾，但我又不删，留着给自己喷吧！

不就是一个内核参数 kernel.sched_child_runs_first 吗？在今天看来，验证它是否起作用实在太简单了。

首先解释一下为什么要子进程先运行。

因为fork之后父子进程以COW(copy on write)的方式共享地址空间：如果父进程先运行并且写了内存，就会触发页面的复制；而一般而言子进程会马上调用exec替换掉整个地址空间，这些复制就白做了。所以让子进程先运行，可以避免这种不必要的COW开销。

那么对于CFS调度器而言，kernel.sched_child_runs_first是否有作用呢？我们试一下便知道，依然使用那两篇垃圾文章中的例子：

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>
/*
 * Demo: after fork(), does the parent or the child print first?
 *
 * argv[1] is parsed with atoi() and handed to nice() before forking.
 * The process prints its PID, spins briefly, forks, and then the child
 * prints "sub" while both processes print "main,<v>".
 *
 * Returns 1 with a usage message when the argument is missing,
 * 0 otherwise.
 */
int main(int argc,char *argv[])
{
   /* Without this check a missing argument makes atoi() read argv[1] == NULL. */
   if (argc < 2) {
       fprintf(stderr, "usage: %s <nice-increment>\n", argc > 0 ? argv[0] : "a.out");
       return 1;
   }

   int v = atoi(argv[1]);
   printf("%d\n", getpid());
   nice(v);
   /* Spin for a while before forking; v ends up as the argument plus 90000. */
   int i = 90000;
   while (i-->0) {
       v++;
   }

   if(fork() == 0) {
       printf("sub\n");
   }
   /* Reached by both the parent and the child. */
   printf("main,%d\n",v);
   return 0;
}

我们设置好内核参数后，看看到底哪个先打印出来：

[root@localhost test]# sysctl -w kernel.sched_child_runs_first=1
kernel.sched_child_runs_first = 1
[root@localhost test]#
[root@localhost test]# ./a.out 10
5101
main,90010
[root@localhost test]# sub
[root@localhost test]# ./a.out -10
5105
sub
main,89990
[root@localhost test]# ./a.out -10
5108
main,89990
[root@localhost test]# sub
[root@localhost test]# ./a.out -10
5112
main,89990
[root@localhost test]# sub
[root@localhost test]# ./a.out 10
5117
main,90010
[root@localhost test]# sub

不用试了，它不起作用，不管你有没有设置START_DEBIT这个feature！它和START_DEBIT根本没有关系，dog250在2010年写的那些东西故弄玄虚，把简单问题复杂化！还扯什么START_DEBIT，还扯什么统计概览，真是无中生有，垃圾啊垃圾。

正确的排查问题的方法完全就不是这个思路！

现在，我来展示正确的做法。在实验之前，澄清一个事实，不要用printf来确认到底谁先运行！因为printf太复杂了，执行它的周期太久，有可能虽然子进程先运行但却是父进程先打印出来。

所以，我用exit：

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>
/*
 * Demo: fork and have both processes exit immediately, so that the exit
 * order can be observed from outside the program rather than via printf.
 *
 * argv[1] is parsed with atoi() and handed to nice() before forking.
 * The process prints its PID, forks, and then parent and child each call
 * exit(0).
 *
 * Exits with status 1 and a usage message when the argument is missing.
 */
int main(int argc,char *argv[])
{
   /* Without this check a missing argument makes atoi() read argv[1] == NULL. */
   if (argc < 2) {
       fprintf(stderr, "usage: %s <nice-increment>\n", argc > 0 ? argv[0] : "a.out");
       exit(1);
   }

   int v = atoi(argv[1]);
   printf("%d\n", getpid());
   nice(v);

   if(fork() == 0) {
       /* Child: exit right away. */
       exit(0);
   }
   /* Parent: exit right away as well. */
   exit(0);
}

我们用操作系统的方式去观测，而不是用printf，这次，我们用stap：

#!/usr/bin/stap -g

// State shared between entry probes and their matching .return probes.
// g_se:     address of the sched_entity captured at function entry (0 = none).
// g_cfs_rq: address of the cfs_rq captured alongside it (0 = none).
global g_se;
global g_cfs_rq;

// Start with nothing captured.
probe begin {
   g_cfs_rq = 0;
   g_se = 0;
}

// Log each entry into __schedule that happens while the current task's
// executable name is "a.out".
probe kernel.function("__schedule")
{
   t_curr = task_current();
   if (task_execname(t_curr) == "a.out")
       printf("[_schedule] current task: %s[%d]\n", task_execname(t_curr), task_pid(t_curr));
}

// Log each entry into do_exit made by a task whose executable name is
// "a.out", printing its name and pid.
probe kernel.function("do_exit")
{
   t_curr = task_current();
   if (task_execname(t_curr) == "a.out")
       printf("Exit task: %s[%d]\n", task_execname(t_curr), task_pid(t_curr));
}

// On entry to pick_next_task_fair, store the address of rq->cfs in g_cfs_rq.
// NOTE(review): the pick_next_task_fair.return probe does not read g_cfs_rq,
// and the wake_up_new_task / place_entity probes assign it themselves; this
// store could overwrite a value they captured if it runs between their entry
// and return -- confirm whether it is needed.
probe kernel.function("pick_next_task_fair")
{
   g_cfs_rq = &$rq->cfs;
}

// Given the address of a sched_entity embedded in a task_struct as its `se`
// member, return the address of that enclosing task_struct.
// se: address of the embedded sched_entity.
function container_of_entity:long(se:long)
{
   // Taking the member address on a base of 0 yields the member's offset.
   offset = &@cast(0, "struct task_struct")->se;
   return se - offset;
}

// When pick_next_task_fair returns a non-null task, print the task it picked
// together with the current task, but only if either of them is "a.out".
probe kernel.function("pick_next_task_fair").return
{
   if($return != 0) {
       // Go through the embedded sched_entity and back to the task address.
       se = &$return->se;
       t_se = container_of_entity(se);
       t_curr = task_current();
       if (task_execname(t_se) == "a.out" || task_execname(t_curr) == "a.out") {
           printf("[pick_next_task_fair] Return task: %s[%d] From current: %s[%d]\n", task_execname(t_se), task_pid(t_se), task_execname(t_curr), task_pid(t_curr));
       }
   }
}

// On entry to wake_up_new_task, remember the new task's sched_entity and the
// cfs_rq that entity points to, for the matching .return probe.
probe kernel.function("wake_up_new_task")
{
   g_se = &$p->se;
   g_cfs_rq = @cast(g_se, "struct sched_entity")->cfs_rq;
}

// On return from wake_up_new_task, if the task captured at entry is "a.out",
// print the name/pid of the task owning cfs_rq->curr, that entity's vruntime
// ("curr"), the new entity's vruntime ("new"), and curr minus new ("del").
// The captured globals are cleared afterwards in every case.
probe kernel.function("wake_up_new_task").return
{
   t_se = container_of_entity(g_se);
   tname = task_execname(t_se);
   vruntime = @cast(g_se, "struct sched_entity")->vruntime;
   if (tname == "a.out") {
       curr = @cast(g_cfs_rq, "struct cfs_rq")->curr;
       t_curr = container_of_entity(curr);
       curr_vruntime = @cast(curr, "struct sched_entity")->vruntime;
       printf("[wake_up_new_task] current:[%s][%d] curr:%d new:%d del:%d\n",
               task_execname(t_curr), task_pid(t_curr), curr_vruntime, vruntime,
               curr_vruntime - vruntime);
   }
   g_se = 0;
   g_cfs_rq = 0;
}

// On entry to place_entity, capture the cfs_rq and sched_entity arguments,
// but only for calls made with initial == 1; other calls leave the globals
// untouched.
probe kernel.function("place_entity")
{
   t_initial = $initial;
   if (t_initial == 1) {
       g_cfs_rq = $cfs_rq;
       g_se = $se;
   }
}
// On return from place_entity, if an entity was captured at entry and it
// belongs to an "a.out" task, print the vruntime of cfs_rq->curr ("curr"),
// the vruntime of the captured entity ("new"), and curr minus new ("delta").
// Note that the name/pid printed after "name:" are those of the task owning
// cfs_rq->curr, not of the captured entity.
// The captured globals are cleared whenever an entity had been captured.
probe kernel.function("place_entity").return
{
   if (g_se) {
       t_se = container_of_entity(g_se);
       tname = task_execname(t_se);
       vruntime = @cast(g_se, "struct sched_entity")->vruntime;
       if (tname == "a.out") {
           curr = @cast(g_cfs_rq, "struct cfs_rq")->curr;
           t_curr = container_of_entity(curr);
           curr_vruntime = @cast(curr, "struct sched_entity")->vruntime;
           printf("[place_entity] name:[%s][%d] curr:%d new:%d   delta:%d\n",
               task_execname(t_curr), task_pid(t_curr), curr_vruntime, vruntime,
               curr_vruntime - vruntime);
       }
       g_se = 0;
       g_cfs_rq = 0;
   }
}

执行它，然后运行多次a.out，到底发生了什么，你就彻底知道了，下面是一个结果：

[root@localhost test]# ./a.out 10
5653
main,90010
[root@localhost test]# sub

# 另一个终端上打印的stap信息
[_schedule] current task: a.out[5653]
[pick_next_task_fair] Return task: a.out[5653] From current: a.out[5653]
# 父进程fork子进程，并设置了它的初始vruntime。
# 后续的child runs first检查会resched current
[place_entity] name:[a.out][5653] curr:74161009564 new:74192039854   delta:-31030290
# 注意，这里在fork中发生了切换，why？？因为在fork中spin_unlock的时候会check resched！
# 这就发生了task_fork_fair最后释放rq lock时！
[_schedule] current task: a.out[5653]
[pick_next_task_fair] Return task: rcu_sched[10] From current: a.out[5653]
[pick_next_task_fair] Return task: a.out[5653] From current: sshd[1392]
# 父进程返回运行，wakeup子进程，然而vruntime的delta却不足一个granularity！
# 不足一个granularity，子进程无法抢占父进程！
# 换句话说，之前由于child runs first进行的resched已经失效！
[wake_up_new_task] current:[a.out][5653] curr:74192443179 new:74192132619 del:310560
# 依然是父进程先运行。
Exit task: a.out[5653]
[_schedule] current task: a.out[5653]
[pick_next_task_fair] Return task: rcu_sched[10] From current: a.out[5653]
[pick_next_task_fair] Return task: a.out[5654] From current: sshd[1392]
# 子进程被调度运行
Exit task: a.out[5654]

很尴尬的事发生了，为了child runs first而执行resched_task，需要lock住rq，之所以要resched_task可能是交换父子的vruntime之后，希望子进程继续运行下去，替换父进程。

然而unlock rq时的check preempt却白白消耗了这次resched的机会！为什么说白白消耗呢？因为调用sched_fork的时候，子进程尚未准备好，也就是说，它尚不足以被wakeup！

只要在rq unlock并check preempt的时候，父进程被其它进程抢占(在父进程优先级低时更容易发生！！)，那么子进程大概率不会runs first了，因为后面还要check granularity！

如果我们用更小的nice值运行a.out，那么子进程还是有机会runs first的，因为在rq unlock的时候，父进程不容易被其它进程抢占进而消费掉resched的机会，留到后面wakeup child的时候，还可以使用，此时子进程已经拥有了执行的条件，进而抢占掉父进程！

或者说，即便这次抢占父进程失败，那么它的vruntime已经低于父进程，它在红黑树中的位置是比父进程更加leftmost的，终究还是要比父进程runs first。

好了，情景分析完毕，该解题了！如何让kernel.sched_child_runs_first如其意之所表达，真正做到child runs first呢？

2010年dog250写的那个patch是错的，没有意义。真正的解法是：

    在wakeup child的时候再check sched_child_runs_first，进而resched。

我们来验证一下，由于我懒得为这个重新编译一遍内核，所以我采用stap guru hook的方式来玩玩。

首先我们废除原始的sched_child_runs_first判断，这很容易，关掉这个开关即可：

[root@localhost test]# sysctl -w kernel.sched_child_runs_first=0
kernel.sched_child_runs_first = 0

    1
    2

然后，我们以guru模式运行下面的stap脚本：

#!/usr/bin/stap -g

// Task pointer captured on entry to check_preempt_wakeup and consumed by the
// matching .return probe (0 = none).
global g_p;

// Start with nothing captured.
probe begin {
   g_p = 0;
}

%{
/*
 * Cached address of the kernel's resched_task(), looked up at runtime with
 * kallsyms_lookup_name(); NULL until the first successful lookup.
 * resched_task() returns void, so the pointer type is declared to match.
 */
static void (*_resched_task)(struct task_struct *p);
%}

// Guru-mode helper written in embedded C.
// tsk:  address of a task_struct (the caller passes the task being woken).
// tskp: address of another task_struct (the caller passes that task's parent).
// If tskp's vruntime is smaller than tsk's, swap the two vruntimes, print
// both, and call the kernel's resched_task() on `current`.
function resched(tsk:long, tskp:long)
%{
   struct task_struct *task = NULL, *parent = NULL;
   struct sched_entity *pse = NULL, *cse = NULL;

   task = (struct task_struct *)STAP_ARG_tsk;
   parent = (struct task_struct *)STAP_ARG_tskp;
   cse = &task->se;
   pse = &parent->se;

   /* Resolve resched_task by name on first use and cache the address. */
   if (_resched_task == NULL)
       _resched_task = (void *)kallsyms_lookup_name("resched_task");
   /* Do nothing if the lookup failed or the parent is not ahead of the child. */
   if (_resched_task && pse->vruntime < cse->vruntime) {
       swap(pse->vruntime, cse->vruntime);
       STAP_PRINTF("---[%lu]------[%lu]-------\n", pse->vruntime, cse->vruntime);
       _resched_task(current);
   }

%}

// On entry to check_preempt_wakeup, remember its task argument p for the
// matching .return probe.
probe kernel.function("check_preempt_wakeup")
{
   g_p = $p;
}
// The trick here: both the parent and the child a.out are purely CPU-bound
// and are woken up only once, at creation, so this is the point to hook.
// On return from check_preempt_wakeup, call resched() with the task captured
// at entry and its parent, then clear the captured pointer.
probe kernel.function("check_preempt_wakeup").return
{
   parent = @cast(g_p, "struct task_struct")->parent;
   // This filters out every wakeup other than our fork scenario.
   if (task_execname(g_p) == "a.out" || task_execname(parent) == "a.out") {
       resched(g_p, parent);
   }
   g_p = 0;
}

来吧，执行之！为了观测效果，我们可以再次同时执行之前的脚本(hook不要冲突即可)：

[root@localhost test]# ./a.out 10
6988
sub
main,90010

# 以下是输出
[_schedule] current task: a.out[6988]
[pick_next_task_fair] Return task: kworker/0:0[5380] From current: a.out[6988]
[pick_next_task_fair] Return task: a.out[6988] From current: sshd[1392]
[_schedule] current task: a.out[6988]
[pick_next_task_fair] Return task: rcu_sched[10] From current: a.out[6988]
[pick_next_task_fair] Return task: a.out[6988] From current: sshd[1392]
[place_entity] name:[a.out][6988] curr:78347075684 new:78440166555   delta:-93090871
# 在place_entity和wake_up_new_task之间没有被打断！
# 因为把resched从place移动到了wakeup的时候。
[wake_up_new_task] current:[a.out][6988] curr:78440166555 new:78347480815 del:92685740
[_schedule] current task: a.out[6988]
[pick_next_task_fair] Return task: a.out[6989] From current: a.out[6988]
# 子进程runs first，优先退出！
Exit task: a.out[6989]
[_schedule] current task: a.out[6989]
[pick_next_task_fair] Return task: kworker/0:0[5380] From current: a.out[6989]
[pick_next_task_fair] Return task: a.out[6988] From current: sshd[1392]
[_schedule] current task: a.out[6988]
[pick_next_task_fair] Return task: kworker/0:0[5380] From current: a.out[6988]
[pick_next_task_fair] Return task: a.out[6988] From current: sshd[1392]
# 父进程在后
Exit task: a.out[6988]
[_schedule] current task: a.out[6988]
[pick_next_task_fair] Return task: systemd[1] From current: a.out[6988]

多试几次，还是这样的结果。

现在，是时候回到printf了，虽然它可能并不准，但是肉眼观测，让经理信服，也只能靠它了：

[root@localhost test]# ./a.out 18
sub
main,90018
[root@localhost test]# ./a.out -18
sub
main,89982
[root@localhost test]# ./a.out -10
sub
main,89990
[root@localhost test]# ./a.out 10
sub
main,90010
[root@localhost test]# ./a.out 1
sub
main,90001
[root@localhost test]# ./a.out -1
sub
main,89999
[root@localhost test]# ./a.out 0
sub
main,90000

咋试咋舒服！

好了，现在该出patch了。这个patch才是真的有效的：

Date: Thu, 23 Apr 2020 22:42:07 +0800
Subject: [PATCH] fix child runs first

---
kernel/sched/fair.c | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a33137..f7f83a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4564,6 +4564,17 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
    int scale = cfs_rq->nr_running >= sched_nr_latency;
    int next_buddy_marked = 0;

+   if ((wake_flags&WF_FORK) && sysctl_sched_child_runs_first && se &&
+       entity_before(se, pse)) {
+       /*
+       * Upon rescheduling, sched_class::put_prev_task() will place
+       * 'current' within the tree based on its new key value.
+       */
+       swap(se->vruntime, pse->vruntime);
+       resched_task(curr);
+   }
+
+
    if (unlikely(se == pse))
        return;

@@ -7086,15 +7097,6 @@ static void task_fork_fair(struct task_struct *p)
        se->vruntime = curr->vruntime;
    place_entity(cfs_rq, se, 1);

-   if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
-       /*
-       * Upon rescheduling, sched_class::put_prev_task() will place
-       * 'current' within the tree based on its new key value.
-       */
-       swap(curr->vruntime, se->vruntime);
-       resched_task(rq->curr);
-   }
-
    se->vruntime -= cfs_rq->min_vruntime;

    raw_spin_unlock_irqrestore(&rq->lock, flags);
--
1.8.3.1

对了，为了让这个patch不能真正被合入内核，我特意在老旧的内核上制作了它。真正手艺人的玩法永远不是制作真正的patch，二进制hook不好吗？哈哈！

浙江温州皮鞋湿，下雨进水不会胖。
————————————————
版权声明：本文为CSDN博主「dog250」的原创文章，遵循CC 4.0 BY-SA版权协议，转载请附上原文出处链接及本声明。
原文链接：https://blog.csdn.net/dog250/article/details/105756168

posted @ 2020-06-09 17:21 Sky&Zhang 阅读(2842) 评论(0) 收藏举报

刷新页面返回顶部

sky

我所做的事情都是源于自己对梦想的追求--分享技术、共同创造新世界---欢迎交流：zhangbinghua2012@163.com skyzhangbinghua@gmai.com

Linux fork之后，到底是子进程先运行还是父进程先运行【转】

公告