Linux Kernel Source: CFS Scheduling (4.20.17)
cfs_rq
Every CPU has its own run queue rq, and within rq the kernel maintains a separate queue per scheduling class:
struct rq {
    ...
    struct cfs_rq cfs;
    struct rt_rq rt;
    struct dl_rq dl;
    ...
};
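The run queues themselves are per-CPU variables, declared in kernel/sched/core.c and reached through accessor macros in kernel/sched/sched.h such as cpu_rq(), which __schedule() uses later in this article:

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#define cpu_rq(cpu)     (&per_cpu(runqueues, (cpu)))
#define this_rq()       this_cpu_ptr(&runqueues)
#define task_rq(p)      cpu_rq(task_cpu(p))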
The CFS run queue is maintained as a red-black tree. Inside cfs_rq, struct rb_root_cached tasks_timeline bundles the tree itself (struct rb_root rb_root) with a cached pointer to its leftmost node (struct rb_node *rb_leftmost).
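struct rb_root_cached is defined in include/linux/rbtree.h:

struct rb_root_cached {
    struct rb_root rb_root;
    struct rb_node *rb_leftmost;
};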
struct cfs_rq {
    struct load_weight load;            /* load weight of this CFS run queue */
    unsigned long runnable_weight;
    unsigned int nr_running;
    unsigned int h_nr_running;

    u64 exec_clock;
    u64 min_vruntime;
#ifndef CONFIG_64BIT
    u64 min_vruntime_copy;
#endif

    struct rb_root_cached tasks_timeline;   /* red-black tree of scheduling entities */

    /*
     * 'curr' points to currently running entity on this cfs_rq.
     * It is set to NULL otherwise (i.e when none are currently running).
     */
    struct sched_entity *curr;          /* currently running entity */
    struct sched_entity *next;          /* entity preferred at the next pick */
    struct sched_entity *last;          /* entity that ran last */
    struct sched_entity *skip;          /* entity to skip over */

#ifdef CONFIG_SCHED_DEBUG
    unsigned int nr_spread_over;
#endif

#ifdef CONFIG_SMP
    /*
     * CFS load tracking
     */
    struct sched_avg avg;
#ifndef CONFIG_64BIT
    u64 load_last_update_time_copy;
#endif
    struct {
        raw_spinlock_t lock ____cacheline_aligned;
        int nr;
        unsigned long load_avg;
        unsigned long util_avg;
        unsigned long runnable_sum;
    } removed;

#ifdef CONFIG_FAIR_GROUP_SCHED
    unsigned long tg_load_avg_contrib;
    long propagate;
    long prop_runnable_sum;

    /*
     * h_load = weight * f(tg)
     *
     * Where f(tg) is the recursive weight fraction assigned to
     * this group.
     */
    unsigned long h_load;
    u64 last_h_load_update;
    struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */

#ifdef CONFIG_FAIR_GROUP_SCHED
    struct rq *rq;      /* CPU runqueue to which this cfs_rq is attached */

    /*
     * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
     * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
     * (like users, containers etc.)
     *
     * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
     * This list is used during load balance.
     */
    int on_list;
    struct list_head leaf_cfs_rq_list;
    struct task_group *tg;  /* group that "owns" this runqueue */

#ifdef CONFIG_CFS_BANDWIDTH
    int runtime_enabled;
    int expires_seq;
    u64 runtime_expires;
    s64 runtime_remaining;

    u64 throttled_clock;
    u64 throttled_clock_task;
    u64 throttled_clock_task_time;
    int throttled;
    int throttle_count;
    struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
vruntime
So what key does CFS use to order tasks? The answer is the virtual runtime, vruntime.
The update_curr() function (kernel/sched/fair.c) keeps vruntime up to date: it computes how long the current entity has been running (delta_exec), then scales delta_exec by the ratio of the nice-0 load weight to the entity's own weight and adds the result to its vruntime.
static void update_curr(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;   /* entity currently running on this cfs_rq */
    u64 now = rq_clock_task(rq_of(cfs_rq));     /* current task clock of the owning rq */
    u64 delta_exec;

    if (unlikely(!curr))
        return;

    delta_exec = now - curr->exec_start;        /* time run since exec_start was stamped */
    if (unlikely((s64)delta_exec <= 0))
        return;

    curr->exec_start = now;

    schedstat_set(curr->statistics.exec_max,
                  max(delta_exec, curr->statistics.exec_max));

    curr->sum_exec_runtime += delta_exec;       /* accumulate total real runtime */
    schedstat_add(cfs_rq->exec_clock, delta_exec);

    curr->vruntime += calc_delta_fair(delta_exec, curr);  /* advance weighted virtual runtime */
    update_min_vruntime(cfs_rq);

    if (entity_is_task(curr)) {   /* for a task, also charge the runtime to its cgroup and thread group */
        struct task_struct *curtask = task_of(curr);

        trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
        cgroup_account_cputime(curtask, delta_exec);
        account_group_exec_runtime(curtask, delta_exec);
    }

    account_cfs_rq_runtime(cfs_rq, delta_exec);
}
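update_min_vruntime() nudges cfs_rq->min_vruntime forward so that it tracks the smallest vruntime on the queue without ever moving backwards; newly woken or newly created entities are placed relative to this value. In 4.20 it looks roughly like this (kernel/sched/fair.c, with the 32-bit min_vruntime_copy update omitted):

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
    u64 vruntime = cfs_rq->min_vruntime;

    if (curr) {
        if (curr->on_rq)
            vruntime = curr->vruntime;
        else
            curr = NULL;
    }

    if (leftmost) { /* non-empty tree */
        struct sched_entity *se;

        se = rb_entry(leftmost, struct sched_entity, run_node);
        if (!curr)
            vruntime = se->vruntime;
        else
            vruntime = min_vruntime(vruntime, se->vruntime);
    }

    /* ensure we never gain time by being placed backwards. */
    cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
}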
calc_delta_fair(delta_exec, curr) performs the virtual-runtime calculation:

    vruntime increment = delta_exec * NICE_0_LOAD / weight of the current entity

Internally, __calc_delta() evaluates this as (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT: the division is replaced by a multiplication with the precomputed fixed-point inverse lw->inv_weight = 2^32 / weight followed by a right shift by WMULT_SHIFT (32), avoiding both floating point and a hardware divide.
The formula shows that an entity's vruntime grows slowly when it has run little real time or carries a large weight, and such an entity deserves to run sooner. For example, after the same 10 ms of wall-clock execution a nice-0 task (weight 1024) accrues 10 ms of vruntime, while a nice -5 task (weight 3121) accrues only about 3.3 ms. The red-black tree is keyed by vruntime, the entity with the smallest vruntime is picked each time, and that node is cached in struct rb_node *rb_leftmost.
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
    if (unlikely(se->load.weight != NICE_0_LOAD))
        delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

    return delta;
}
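To see the fixed-point trick in isolation, here is a minimal standalone sketch; this is not kernel code: it skips the overflow handling that the real __calc_delta() performs by shrinking the shift, and hard-codes a few weights from the kernel's sched_prio_to_weight[] table:

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD  1024ULL
#define WMULT_SHIFT  32
#define WMULT_CONST  (1ULL << WMULT_SHIFT)

static uint64_t calc_delta_sketch(uint64_t delta_exec, uint64_t weight)
{
    /* the kernel precomputes this as lw->inv_weight */
    uint64_t inv_weight = WMULT_CONST / weight;

    /* delta * NICE_0_LOAD / weight, with the divide replaced by a
     * multiply by the fixed-point inverse and a right shift */
    return (delta_exec * NICE_0_LOAD * inv_weight) >> WMULT_SHIFT;
}

int main(void)
{
    uint64_t delta = 10000000ULL;   /* 10 ms of real execution, in ns */

    printf("nice  0 (weight 1024): %llu ns\n",
           (unsigned long long)calc_delta_sketch(delta, 1024)); /* ~10.0 ms */
    printf("nice -5 (weight 3121): %llu ns\n",
           (unsigned long long)calc_delta_sketch(delta, 3121)); /* ~3.3 ms  */
    printf("nice  5 (weight  335): %llu ns\n",
           (unsigned long long)calc_delta_sketch(delta, 335));  /* ~30.6 ms */
    return 0;
}

Compiled and run, this prints roughly 10 ms, 3.3 ms, and 30.6 ms of vruntime for the same 10 ms of real execution, matching the example above.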
Process selection
A process must be inserted into the red-black tree when it becomes runnable (is woken up) or when it is first created via fork(); __enqueue_entity() implements the insertion. Removing a node (__dequeue_entity()) works along the same lines.
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;  /* root of the rbtree */
    struct rb_node *parent = NULL;
    struct sched_entity *entry;
    bool leftmost = true;

    /*
     * Find the right place in the rbtree:
     */
    while (*link) {
        parent = *link;
        entry = rb_entry(parent, struct sched_entity, run_node);  /* rb_entry is just a container_of() wrapper */
        /*
         * We dont care about collisions. Nodes with
         * the same key stay together.
         */
        if (entity_before(se, entry)) {
            link = &parent->rb_left;
        } else {
            link = &parent->rb_right;
            leftmost = false;
        }
    }

    rb_link_node(&se->run_node, parent, link);  /* link the new node into the tree */
    rb_insert_color_cached(&se->run_node,       /* recolor/rebalance, update leftmost cache */
                           &cfs_rq->tasks_timeline, leftmost);
}
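With the leftmost node cached, picking the entity with the smallest vruntime is O(1): __pick_first_entity() simply reads the cache (kernel/sched/fair.c):

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

    if (!left)
        return NULL;

    return rb_entry(left, struct sched_entity, run_node);
}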
Process scheduling
The main entry point into the scheduler is schedule() (kernel/sched/core.c). It picks the next process via pick_next_task(), and if the pick differs from the currently running process, it calls context_switch() to perform the context switch.
static void __sched notrace __schedule(bool preempt)
{
    struct task_struct *prev, *next;
    struct rq_flags rf;
    struct rq *rq;
    int cpu;

    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    prev = rq->curr;                        /* the task currently running on this CPU */
    ...
    next = pick_next_task(rq, prev, &rf);   /* pick the highest-priority runnable task */
    clear_tsk_need_resched(prev);
    clear_preempt_need_resched();

    if (likely(prev != next)) {
        ...
        rq = context_switch(rq, prev, next, &rf);   /* switch to the chosen task */
    } else {
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
        rq_unlock_irq(rq, &rf);
    }
    ...
}
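For reference, the schedule() entry point mentioned above is a thin wrapper around __schedule(): it disables preemption, runs one scheduling pass, and repeats while need_resched() is still set (kernel/sched/core.c):

asmlinkage __visible void __sched schedule(void)
{
    struct task_struct *tsk = current;

    sched_submit_work(tsk);
    do {
        preempt_disable();
        __schedule(false);
        sched_preempt_enable_no_resched();
    } while (need_resched());
}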
pick_next_task() is not complicated, but it contains one optimization: if every runnable task on the rq belongs to the fair class, the CFS pick_next_task() can be called directly; otherwise each scheduling class is tried in priority order.
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
    const struct sched_class *class;
    struct task_struct *p;

    /*
     * Optimization: we know that if all tasks are in the fair class we can
     * call that function directly, but only if the @prev task wasn't of a
     * higher scheduling class, because otherwise those loose the
     * opportunity to pull in more work from other CPUs.
     */
    if (likely((prev->sched_class == &idle_sched_class ||
                prev->sched_class == &fair_sched_class) &&
               rq->nr_running == rq->cfs.h_nr_running)) {

        p = fair_sched_class.pick_next_task(rq, prev, rf);
        if (unlikely(p == RETRY_TASK))
            goto again;

        /* Assumes fair_sched_class->next == idle_sched_class */
        if (unlikely(!p))
            p = idle_sched_class.pick_next_task(rq, prev, rf);

        return p;
    }

again:
    for_each_class(class) {
        p = class->pick_next_task(rq, prev, rf);
        if (p) {
            if (unlikely(p == RETRY_TASK))
                goto again;
            return p;
        }
    }

    /* The idle class should always have a runnable task: */
    BUG();
}
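The priority order walked by the fallback loop comes from the singly linked list of scheduling classes: for_each_class() starts at the highest class (stop, then deadline, realtime, fair, idle) and follows the ->next pointers (kernel/sched/sched.h):

#ifdef CONFIG_SMP
#define sched_class_highest (&stop_sched_class)
#else
#define sched_class_highest (&dl_sched_class)
#endif

#define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)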