sched_ext的update_idle函数(linux 6.15.7) - 指南

一、update_idle功能及注意点

先看注释,见 kernel/sched/ext.c

	struct sched_ext_ops {
        /**
	     * @update_idle: Update the idle state of a CPU
	     * @cpu: CPU to update the idle state for
	     * @idle: whether entering or exiting the idle state
	     *
	     * This operation is called when @rq's CPU goes or leaves the idle
	     * state. By default, implementing this operation disables the built-in
	     * idle CPU tracking and the following helpers become unavailable:
	     *
	     * - scx_bpf_select_cpu_dfl()
	     * - scx_bpf_test_and_clear_cpu_idle()
	     * - scx_bpf_pick_idle_cpu()
	     *
	     * The user also must implement ops.select_cpu() as the default
	     * implementation relies on scx_bpf_select_cpu_dfl().
	     *
	     * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
	     * tracking.
	     */
	    void (*update_idle)(s32 cpu, bool idle);
}

当cpu进入或退出idle状态时(对应的是,cpu开始执行init_task、停止执行init_task),调度器框架代码会执行struct sched_ext_ops -> update_idle。

默认情况,如果实现了update_idle函数,那么内置的idle tracking机制就不可用了,依赖于内置idle tracking机制的一些函数也不能用了(scx_bpf_select_cpu_dfl、scx_bpf_test_and_clear_cpu_idle、scx_bpf_pick_idle_cpu)。

如果sched_ext调度器指定了SCX_OPS_KEEP_BUILTIN_IDLE标记,那么“update_idle”与内置的“全局idle tracking”是同时生效的。

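如果sched_ext调度器实现了update_idle并且只指定了SCX_OPS_BUILTIN_IDLE_PER_NODE标记,需要注意:从下文的scx_idle_enable可以看到,SCX_OPS_BUILTIN_IDLE_PER_NODE只控制内置idle tracking是否使用“per-node的idle cpumask”,它本身并不会保留内置idle tracking;要让“update_idle”与“per-node的idle机制”同时生效,还必须同时指定SCX_OPS_KEEP_BUILTIN_IDLE。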

如果SCX_OPS_KEEP_BUILTIN_IDLE与SCX_OPS_BUILTIN_IDLE_PER_NODE都指定了,那么“用户自己实现的update_idle”与内置idle tracking同时生效,此时内置idle tracking使用“per-node的idle cpumask”(全局的idle cpumask只在未指定SCX_OPS_BUILTIN_IDLE_PER_NODE时使用)。

二、内置idle tracking机制

kernel中内置了一个idle tracking机制(built-in idle tracking),见kernel/sched/ext_idle.c

1,cpu idle mask

kernel/sched/ext_idle.c

/*
 * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE
 * is not enabled).
 */
static struct scx_idle_cpus scx_idle_global_masks;
/*
 * Per-node idle cpumasks.
 */
static struct scx_idle_cpus **scx_idle_node_masks;

scx_pick_idle_cpu等函数会用到这些cpu mask。

2,全局标志(static key)表示是否启用 built-in idle

kernel/sched/ext_idle.c

/* Enable/disable built-in idle CPU selection policy */
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
/* Enable/disable per-node idle cpumasks */
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);

3,built-in idle的启用

/*
 * Configure the two built-in idle tracking static keys for the scheduler
 * described by @ops.
 *
 * scx_builtin_idle_enabled is switched on when @ops does not implement
 * update_idle(), or when it does but also sets SCX_OPS_KEEP_BUILTIN_IDLE;
 * in every other case it is switched off.
 *
 * scx_builtin_idle_per_node follows the SCX_OPS_BUILTIN_IDLE_PER_NODE flag
 * alone; this function does not consult update_idle() for it.
 */
void scx_idle_enable(struct sched_ext_ops *ops)
{
	if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))
		static_branch_enable_cpuslocked(&scx_builtin_idle_enabled);
	else
		static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
	if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
		static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);
	else
		static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
#ifdef CONFIG_SMP
	/*
	 * Only compiled in on SMP builds. NOTE(review): presumably
	 * re-initializes the idle cpumasks for the new configuration;
	 * confirm against reset_idle_masks().
	 */
	reset_idle_masks(ops);
#endif
}

对于全局 idle tracking:

sched_ext未实现update_idle或者sched_ext调度器指定了 SCX_OPS_KEEP_BUILTIN_IDLE,使能全局 idle tracking;否则关闭全局 idle tracking。

对于per-node idle tracking:

sched_ext调度器指定了 SCX_OPS_BUILTIN_IDLE_PER_NODE,使能per-node idle tracking;否则关闭per-node idle tracking。

三、空闲任务init_task

cpu开始执行init_task时,表示进入idle;停止执行init_task,表示退出idle。这两个状态都会调用update_idle函数。

理解update_idle,需要搞清楚以下问题:

1,init_task的pid是多少?

    简答:pid为0

2,init_task的sched class是什么?

    简答:idle_sched_class

3,何时执行init_task?

     简答:系统中没有其他task要执行。

1,init_task的pid

init_task是一个全局变量,未显式初始化的字段将会被设置成0,所以init_task ->pid、init_task ->tgid全部为0。我们可以利用pid=0这个特点,来判断cpu正在执行的task是不是init_task,从而判断出cpu是否处于idle状态。

include/linux/sched.h

struct task_struct {
    ……
    pid_t				pid;
	pid_t				tgid;
    ……
}

init/init_task.c

struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
#ifdef CONFIG_THREAD_INFO_IN_TASK
	.thread_info	= INIT_THREAD_INFO(init_task),
	.stack_refcount	= REFCOUNT_INIT(1),
#endif
	.__state	= 0,
	.stack		= init_stack,
	.usage		= REFCOUNT_INIT(2),
	.flags		= PF_KTHREAD,
	.prio		= MAX_PRIO - 20,
	.static_prio	= MAX_PRIO - 20,
	.normal_prio	= MAX_PRIO - 20,
	.policy		= SCHED_NORMAL,
	.cpus_ptr	= &init_task.cpus_mask,
	.user_cpus_ptr	= NULL,
    ……
}

init_task 没有显式设置pid、tgid,所以这2个字段为0。

2,init_task的sched class

sched_init在init_task的上下文中执行(不是本文重点,不展开,知道就可以了),所以sched_init的current就是init_task。

sched_init -->  __sched_fork(0, current); 
                 -->  init_idle(current, smp_processor_id());

void __init init_idle(struct task_struct *idle, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
    idle->__state = TASK_RUNNING;   /* 注意点1) */
	rq->idle = idle;   /* 注意点2) */
	idle->sched_class = &idle_sched_class;  /* 注意点3) */
}

在init_idle函数中,有3点需要注意:

1)idle task 永远 runnable,但只有当 CPU 上 没有其他可运行任务 时才被调度。

2)cpu 运行队列rq->idle指向传入的idle task;对于执行sched_init的启动cpu,它就是init_task(其它cpu的idle task由fork_idle创建,同样会经过init_idle初始化)。

3)sched_class 是idle_sched_class。

3,何时执行init_task

schedule -> __schedule_loop -> __schedule -> pick_next_task 

pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	struct task_struct *next, *p, *max = NULL;
	const struct cpumask *smt_mask;
	bool fi_before = false;
	bool core_clock_updated = (rq == rq->core);
	unsigned long cookie;
	int i, cpu, occ = 0;
	struct rq *rq_i;
	bool need_sync;
	if (!sched_core_enabled(rq))
		return __pick_next_task(rq, prev, rf);

没配置sched_core的情况下,执行__pick_next_task(常规 CFS/RT 选择下一个待运行的task)。

static inline struct task_struct *
__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	const struct sched_class *class;
	struct task_struct *p;
    ……
	for_each_active_class(class) {
		if (class->pick_next_task) {
			p = class->pick_next_task(rq, prev);
			if (p)
				return p;
		} else {
			p = class->pick_task(rq);
			if (p) {
				put_prev_set_next_task(rq, prev, p);
				return p;
			}
		}
	}
	BUG(); /* The idle class should always have a runnable task. */
}

__pick_next_task按照sched class优先级,从高到低选出一个task。最低优先级的class是idle_sched_class,且没有pick_next_task,所以执行pick_task,即pick_task_idle函数。

kernel/sched/idle.c

DEFINE_SCHED_CLASS(idle) = {
	/* no enqueue/yield_task for idle tasks */
	/* dequeue is not valid, we print a debug message there: */
	.dequeue_task		= dequeue_task_idle,
	.wakeup_preempt		= wakeup_preempt_idle,
	.pick_task		= pick_task_idle,
	.put_prev_task		= put_prev_task_idle,
	.set_next_task          = set_next_task_idle,
#ifdef CONFIG_SMP
	.balance		= balance_idle,
	.select_task_rq		= select_task_rq_idle,
	.set_cpus_allowed	= set_cpus_allowed_common,
#endif
	.task_tick		= task_tick_idle,
	.prio_changed		= prio_changed_idle,
	.switched_to		= switched_to_idle,
	.update_curr		= update_curr_idle,
}

kernel/sched/idle.c

/*
 * .pick_task callback of the idle sched class: reports this rq's CPU as
 * idle to sched_ext and hands back the rq's idle task.
 *
 * The third argument of scx_update_idle() is do_notify, and it is false on
 * this path.
 */
struct task_struct *pick_task_idle(struct rq *rq)
{
	scx_update_idle(rq, true, false);
	return rq->idle;
}

前面说过,rq->idle就是init_task,将会被调度到cpu上执行。

四、何时执行update_idle

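调度init_task执行时,cpu进入idle状态。需要注意,pick_task_idle -> scx_update_idle(rq, true, false) 这条路径的do_notify为false,不会调用ops.update_idle;真正触发ops.update_idle的是do_notify为true的调用,即set_next_task_idle(进入idle)和put_prev_task_idle(退出idle),见下面__scx_update_idle中的注释。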

kernel/sched/ext.h

/*
 * Inline gate in front of __scx_update_idle(): @rq, @idle and @do_notify are
 * forwarded unchanged, and only when scx_enabled() evaluates true. Otherwise
 * this function does nothing.
 */
static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
	if (scx_enabled())
		__scx_update_idle(rq, idle, do_notify);
}

kernel/sched/ext_idle.c

void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
	int cpu = cpu_of(rq);
	lockdep_assert_rq_held(rq);
	/*
	 * Trigger ops.update_idle() only when transitioning from a task to
	 * the idle thread and vice versa.
	 *
	 * Idle transitions are indicated by do_notify being set to true,
	 * managed by put_prev_task_idle()/set_next_task_idle().
	 */
	if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
		SCX_CALL_OP(SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);

如果用户实现了update_idle(SCX_HAS_OP(update_idle)),并且do_notify为true、该rq没有处于bypass状态(!scx_rq_bypassing(rq)),就会执行update_idle函数。

posted @ 2025-12-17 12:12  clnchanpin  阅读(19)  评论(0)    收藏  举报