sched_ext的update_idle函数(linux 6.15.7) - 指南
一、update_idle功能及注意点
先看注释,见 kernel/sched/ext.c
struct sched_ext_ops {
/**
* @update_idle: Update the idle state of a CPU
* @cpu: CPU to update the idle state for
* @idle: whether entering or exiting the idle state
*
* This operation is called when @rq's CPU goes or leaves the idle
* state. By default, implementing this operation disables the built-in
* idle CPU tracking and the following helpers become unavailable:
*
* - scx_bpf_select_cpu_dfl()
* - scx_bpf_test_and_clear_cpu_idle()
* - scx_bpf_pick_idle_cpu()
*
* The user also must implement ops.select_cpu() as the default
* implementation relies on scx_bpf_select_cpu_dfl().
*
* Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
* tracking.
*/
void (*update_idle)(s32 cpu, bool idle);
}
当cpu进入或退出idle状态时(对应的是,cpu开始执行init_task、停止执行init_task),调度器框架代码会执行struct sched_ext_ops -> update_idle。
默认情况,如果实现了update_idle函数,那么内置的idle tracking机制就不可用了,依赖于内置idle tracking机制的一些函数也不能用了(scx_bpf_select_cpu_dfl、scx_bpf_test_and_clear_cpu_idle、scx_bpf_pick_idle_cpu)。
如果sched_ext调度器指定了SCX_OPS_KEEP_BUILTIN_IDLE标记,那么“update_idle”与内置的“全局idle tracking”是同时生效的。
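如果sched_ext调度器指定了SCX_OPS_BUILTIN_IDLE_PER_NODE标记,那么内置idle tracking使用“per-node的idle cpumask”而不是全局的idle cpumask。注意:该标记本身并不会保留内置idle tracking;在实现了update_idle的情况下,仍需同时指定SCX_OPS_KEEP_BUILTIN_IDLE,内置idle tracking才会与“update_idle”同时生效(见下文scx_idle_enable)。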
如果SCX_OPS_KEEP_BUILTIN_IDLE与SCX_OPS_BUILTIN_IDLE_PER_NODE都指定了,那么“用户自己实现的update_idle”与内置idle tracking同时生效,并且内置idle tracking使用的是“per-node的idle机制”(全局的idle cpumask只在未指定SCX_OPS_BUILTIN_IDLE_PER_NODE时使用)。
二、内置idle tracking机制
kernel中内置了一个idle tracking机制(built-in idle tracking),见kernel/sched/ext_idle.c
1,cpu idle mask
kernel/sched/ext_idle.c
/*
* Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE
* is not enabled).
*/
static struct scx_idle_cpus scx_idle_global_masks;
/*
* Per-node idle cpumasks.
*/
static struct scx_idle_cpus **scx_idle_node_masks;
scx_pick_idle_cpu等函数会用到这些cpu mask。
2,全局标志(static key)表示是否启用 built-in idle
kernel/sched/ext_idle.c
/* Enable/disable built-in idle CPU selection policy */
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
/* Enable/disable per-node idle cpumasks */
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
3,built-in idle的启用
/*
 * Configure the two built-in idle tracking static keys for the scheduler
 * described by @ops.
 *
 * @ops: scheduler operations; only ->update_idle and ->flags are read here.
 */
void scx_idle_enable(struct sched_ext_ops *ops)
{
/*
 * Built-in idle tracking is enabled when the scheduler does not provide
 * ops.update_idle(), or provides it together with
 * SCX_OPS_KEEP_BUILTIN_IDLE. Otherwise it is disabled.
 */
if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))
static_branch_enable_cpuslocked(&scx_builtin_idle_enabled);
else
static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
/* The per-node key depends solely on SCX_OPS_BUILTIN_IDLE_PER_NODE. */
if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);
else
static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
#ifdef CONFIG_SMP
/*
 * Only compiled on SMP builds. reset_idle_masks() is defined elsewhere;
 * presumably it reinitializes the idle cpumasks -- confirm against its
 * definition.
 */
reset_idle_masks(ops);
#endif
}
对于全局 idle tracking:
sched_ext未实现update_idle或者sched_ext调度器指定了 SCX_OPS_KEEP_BUILTIN_IDLE,使能全局 idle tracking;否则关闭全局 idle tracking。
对于per-node idle tracking:
sched_ext调度器指定了 SCX_OPS_BUILTIN_IDLE_PER_NODE,使能per-node idle tracking;否则关闭per-node idle tracking。
三、空闲任务init_task
cpu开始执行init_task时,表示进入idle;停止执行init_task,表示退出idle。这两个状态都会调用update_idle函数。
理解update_idle,需要搞清楚以下问题:
1,init_task的pid是多少?
简答:pid为0
2,init_task的sched class是什么?
简答:idle_sched_class
3,何时执行init_task?
简答:系统中没有其他task要执行。
1,init_task的pid
init_task是一个全局变量,未显式初始化的字段将会被设置成0,所以init_task->pid、init_task->tgid全部为0。我们可以利用pid=0这个特点,来判断cpu正在执行的task是不是init_task,从而判断出cpu是否处于idle状态。
include/linux/sched.h
struct task_struct {
……
pid_t pid;
pid_t tgid;
……
}
init/init_task.c
struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
#ifdef CONFIG_THREAD_INFO_IN_TASK
.thread_info = INIT_THREAD_INFO(init_task),
.stack_refcount = REFCOUNT_INIT(1),
#endif
.__state = 0,
.stack = init_stack,
.usage = REFCOUNT_INIT(2),
.flags = PF_KTHREAD,
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
.policy = SCHED_NORMAL,
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
……
}
init_task 没有显式设置pid、tgid,所以这2个字段为0。
2,init_task的sched class
sched_init在init_task的上下文中执行(不是本文重点,不展开,知道就可以了),所以sched_init的current就是init_task。
sched_init --> __sched_fork(0, current);
--> init_idle(current, smp_processor_id());
void __init init_idle(struct task_struct *idle, int cpu)
{
struct rq *rq = cpu_rq(cpu);
idle->__state = TASK_RUNNING; 注意点1)
rq->idle = idle; 注意点2)
idle->sched_class = &idle_sched_class; 注意点3)
}
在init_idle函数中,有3点需要注意:
1)idle task 永远 runnable,但只有当 CPU 上 没有其他可运行任务 时才被调度。
2)cpu 运行队列rq->idle指向init_task。
3)sched_class 是idle_sched_class。
3,何时执行init_task
schedule -> __schedule_loop -> __schedule -> pick_next_task
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
bool core_clock_updated = (rq == rq->core);
unsigned long cookie;
int i, cpu, occ = 0;
struct rq *rq_i;
bool need_sync;
if (!sched_core_enabled(rq))
return __pick_next_task(rq, prev, rf);
没配置sched_core的情况下,执行__pick_next_task(常规 CFS/RT 选择下一个待运行的task)。
static inline struct task_struct *
__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
……
for_each_active_class(class) {
if (class->pick_next_task) {
p = class->pick_next_task(rq, prev);
if (p)
return p;
} else {
p = class->pick_task(rq);
if (p) {
put_prev_set_next_task(rq, prev, p);
return p;
}
}
}
BUG(); /* The idle class should always have a runnable task. */
}
__pick_next_task按照sched class优先级,从高到低选出一个task。最低优先级的class是idle_sched_class,且没有pick_next_task,所以执行pick_task,即pick_task_idle函数。
kernel/sched/idle.c
/*
 * Callback table for the idle scheduling class.
 *
 * Only .pick_task is provided (no .pick_next_task entry is set), and the
 * .balance/.select_task_rq/.set_cpus_allowed hooks exist only when
 * CONFIG_SMP is defined.
 */
DEFINE_SCHED_CLASS(idle) = {
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
.wakeup_preempt = wakeup_preempt_idle,
.pick_task = pick_task_idle,
.put_prev_task = put_prev_task_idle,
.set_next_task = set_next_task_idle,
#ifdef CONFIG_SMP
.balance = balance_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
.task_tick = task_tick_idle,
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
}
kernel/sched/idle.c
/*
 * .pick_task callback of the idle scheduling class.
 *
 * @rq: runqueue for which a task is being picked.
 *
 * Returns @rq->idle unconditionally.
 */
struct task_struct *pick_task_idle(struct rq *rq)
{
/*
 * Arguments are (rq, idle = true, do_notify = false): the CPU is reported
 * as idle, but with do_notify cleared.
 */
scx_update_idle(rq, true, false);
return rq->idle;
}
前面说过,rq->idle就是init_task(严格来说,这只对boot cpu成立;其他cpu的rq->idle是各自通过fork_idle()创建的idle线程,pid同样为0),将会被调度到cpu上执行。
四、何时执行update_idle
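调度init_task执行时,cpu进入idle状态,所以需要在这里更新idle状态,pick_task_idle -> scx_update_idle(rq, true, false)。需要注意的是,这里第三个参数do_notify为false,因此这条路径并不会触发ops.update_idle();真正触发ops.update_idle()的是set_next_task_idle()/put_prev_task_idle()(它们传入的do_notify为true),见下面__scx_update_idle中的注释。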
kernel/sched/ext.h
/*
 * Thin wrapper that forwards an idle-state update to __scx_update_idle().
 *
 * @rq: runqueue whose CPU is changing idle state.
 * @idle: passed through unchanged.
 * @do_notify: passed through unchanged.
 *
 * Does nothing unless scx_enabled() returns true.
 */
static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
if (scx_enabled())
__scx_update_idle(rq, idle, do_notify);
}
kernel/sched/ext_idle.c
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
int cpu = cpu_of(rq);
lockdep_assert_rq_held(rq);
/*
* Trigger ops.update_idle() only when transitioning from a task to
* the idle thread and vice versa.
*
* Idle transitions are indicated by do_notify being set to true,
* managed by put_prev_task_idle()/set_next_task_idle().
*/
if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
SCX_CALL_OP(SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
如果用户实现了update_idle(SCX_HAS_OP(update_idle)),并且do_notify为true、该rq没有处于bypass状态(!scx_rq_bypassing(rq)),就会执行update_idle函数。
浙公网安备 33010602011771号