Linux Kernel Source: CFS Scheduling (4.20.17)
cfs_rq
Every CPU has its own run queue rq, and within rq the kernel maintains a separate queue per scheduling class:
struct rq {
    ...
    struct cfs_rq cfs;
    struct rt_rq rt;
    struct dl_rq dl;
    ...
};
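The run queues themselves are per-CPU variables, declared in kernel/sched/core.c and reached through accessor macros in kernel/sched/sched.h such as cpu_rq(), which __schedule() uses later in this article:

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#define cpu_rq(cpu)     (&per_cpu(runqueues, (cpu)))
#define this_rq()       this_cpu_ptr(&runqueues)
#define task_rq(p)      cpu_rq(task_cpu(p))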
The CFS run queue is maintained as a red-black tree. Inside cfs_rq, struct rb_root_cached tasks_timeline bundles the tree itself (struct rb_root rb_root) with a cached pointer to its leftmost node (struct rb_node *rb_leftmost).
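struct rb_root_cached is defined in include/linux/rbtree.h:

struct rb_root_cached {
    struct rb_root rb_root;
    struct rb_node *rb_leftmost;
};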
struct cfs_rq {
    struct load_weight load;            /* load weight of this CFS run queue */
    unsigned long runnable_weight;
    unsigned int nr_running;
    unsigned int h_nr_running;

    u64 exec_clock;
    u64 min_vruntime;
#ifndef CONFIG_64BIT
    u64 min_vruntime_copy;
#endif

    struct rb_root_cached tasks_timeline;   /* red-black tree of scheduling entities */

    /*
     * 'curr' points to currently running entity on this cfs_rq.
     * It is set to NULL otherwise (i.e when none are currently running).
     */
    struct sched_entity *curr;          /* currently running entity */
    struct sched_entity *next;          /* entity preferred at the next pick */
    struct sched_entity *last;          /* entity that ran last */
    struct sched_entity *skip;          /* entity to skip over */

#ifdef CONFIG_SCHED_DEBUG
    unsigned int nr_spread_over;
#endif

#ifdef CONFIG_SMP
    /*
     * CFS load tracking
     */
    struct sched_avg avg;
#ifndef CONFIG_64BIT
    u64 load_last_update_time_copy;
#endif
    struct {
        raw_spinlock_t lock ____cacheline_aligned;
        int nr;
        unsigned long load_avg;
        unsigned long util_avg;
        unsigned long runnable_sum;
    } removed;

#ifdef CONFIG_FAIR_GROUP_SCHED
    unsigned long tg_load_avg_contrib;
    long propagate;
    long prop_runnable_sum;

    /*
     * h_load = weight * f(tg)
     *
     * Where f(tg) is the recursive weight fraction assigned to
     * this group.
     */
    unsigned long h_load;
    u64 last_h_load_update;
    struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */

#ifdef CONFIG_FAIR_GROUP_SCHED
    struct rq *rq;      /* CPU runqueue to which this cfs_rq is attached */

    /*
     * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
     * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
     * (like users, containers etc.)
     *
     * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
     * This list is used during load balance.
     */
    int on_list;
    struct list_head leaf_cfs_rq_list;
    struct task_group *tg;  /* group that "owns" this runqueue */

#ifdef CONFIG_CFS_BANDWIDTH
    int runtime_enabled;
    int expires_seq;
    u64 runtime_expires;
    s64 runtime_remaining;

    u64 throttled_clock;
    u64 throttled_clock_task;
    u64 throttled_clock_task_time;
    int throttled;
    int throttle_count;
    struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
vruntime
So what key does CFS use to order tasks? The answer is the virtual runtime, vruntime.
The update_curr() function (kernel/sched/fair.c) keeps vruntime up to date: it computes how long the current entity has been running (delta_exec), then scales delta_exec by the ratio of the nice-0 load weight to the entity's own weight and adds the result to its vruntime.
static void update_curr(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;   /* entity currently running on this cfs_rq */
    u64 now = rq_clock_task(rq_of(cfs_rq));     /* current task clock of the owning rq */
    u64 delta_exec;

    if (unlikely(!curr))
        return;

    delta_exec = now - curr->exec_start;        /* time run since exec_start was stamped */
    if (unlikely((s64)delta_exec <= 0))
        return;

    curr->exec_start = now;

    schedstat_set(curr->statistics.exec_max,
                  max(delta_exec, curr->statistics.exec_max));

    curr->sum_exec_runtime += delta_exec;       /* accumulate total real runtime */
    schedstat_add(cfs_rq->exec_clock, delta_exec);

    curr->vruntime += calc_delta_fair(delta_exec, curr);  /* advance weighted virtual runtime */
    update_min_vruntime(cfs_rq);

    if (entity_is_task(curr)) {   /* for a task, also charge the runtime to its cgroup and thread group */
        struct task_struct *curtask = task_of(curr);

        trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
        cgroup_account_cputime(curtask, delta_exec);
        account_group_exec_runtime(curtask, delta_exec);
    }

    account_cfs_rq_runtime(cfs_rq, delta_exec);
}
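update_min_vruntime() nudges cfs_rq->min_vruntime forward so that it tracks the smallest vruntime on the queue without ever moving backwards; newly woken or newly created entities are placed relative to this value. In 4.20 it looks roughly like this (kernel/sched/fair.c, with the 32-bit min_vruntime_copy update omitted):

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
    u64 vruntime = cfs_rq->min_vruntime;

    if (curr) {
        if (curr->on_rq)
            vruntime = curr->vruntime;
        else
            curr = NULL;
    }

    if (leftmost) { /* non-empty tree */
        struct sched_entity *se;

        se = rb_entry(leftmost, struct sched_entity, run_node);
        if (!curr)
            vruntime = se->vruntime;
        else
            vruntime = min_vruntime(vruntime, se->vruntime);
    }

    /* ensure we never gain time by being placed backwards. */
    cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
}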
calc_delta_fair(delta_exec, curr) performs the virtual-runtime calculation:

    vruntime increment = delta_exec * NICE_0_LOAD / weight of the current entity

Internally, __calc_delta() evaluates this as (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT: the division is replaced by a multiplication with the precomputed fixed-point inverse lw->inv_weight = 2^32 / weight followed by a right shift by WMULT_SHIFT (32), avoiding both floating point and a hardware divide.
The formula shows that an entity's vruntime grows slowly when it has run little real time or carries a large weight, and such an entity deserves to run sooner. For example, after the same 10 ms of wall-clock execution a nice-0 task (weight 1024) accrues 10 ms of vruntime, while a nice -5 task (weight 3121) accrues only about 3.3 ms. The red-black tree is keyed by vruntime, the entity with the smallest vruntime is picked each time, and that node is cached in struct rb_node *rb_leftmost.
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
    if (unlikely(se->load.weight != NICE_0_LOAD))
        delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

    return delta;
}
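To see the fixed-point trick in isolation, here is a minimal standalone sketch; this is not kernel code: it skips the overflow handling that the real __calc_delta() performs by shrinking the shift, and hard-codes a few weights from the kernel's sched_prio_to_weight[] table:

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD  1024ULL
#define WMULT_SHIFT  32
#define WMULT_CONST  (1ULL << WMULT_SHIFT)

static uint64_t calc_delta_sketch(uint64_t delta_exec, uint64_t weight)
{
    /* the kernel precomputes this as lw->inv_weight */
    uint64_t inv_weight = WMULT_CONST / weight;

    /* delta * NICE_0_LOAD / weight, with the divide replaced by a
     * multiply by the fixed-point inverse and a right shift */
    return (delta_exec * NICE_0_LOAD * inv_weight) >> WMULT_SHIFT;
}

int main(void)
{
    uint64_t delta = 10000000ULL;   /* 10 ms of real execution, in ns */

    printf("nice  0 (weight 1024): %llu ns\n",
           (unsigned long long)calc_delta_sketch(delta, 1024)); /* ~10.0 ms */
    printf("nice -5 (weight 3121): %llu ns\n",
           (unsigned long long)calc_delta_sketch(delta, 3121)); /* ~3.3 ms  */
    printf("nice  5 (weight  335): %llu ns\n",
           (unsigned long long)calc_delta_sketch(delta, 335));  /* ~30.6 ms */
    return 0;
}

Compiled and run, this prints roughly 10 ms, 3.3 ms, and 30.6 ms of vruntime for the same 10 ms of real execution, matching the example above.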
Process selection
A process must be inserted into the red-black tree when it becomes runnable (is woken up) or when it is first created via fork(); __enqueue_entity() implements the insertion. Removing a node (__dequeue_entity()) works along the same lines.
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;  /* root of the rbtree */
    struct rb_node *parent = NULL;
    struct sched_entity *entry;
    bool leftmost = true;

    /*
     * Find the right place in the rbtree:
     */
    while (*link) {
        parent = *link;
        entry = rb_entry(parent, struct sched_entity, run_node);  /* rb_entry is just a container_of() wrapper */
        /*
         * We dont care about collisions. Nodes with
         * the same key stay together.
         */
        if (entity_before(se, entry)) {
            link = &parent->rb_left;
        } else {
            link = &parent->rb_right;
            leftmost = false;
        }
    }

    rb_link_node(&se->run_node, parent, link);  /* link the new node into the tree */
    rb_insert_color_cached(&se->run_node,       /* recolor/rebalance, update leftmost cache */
                           &cfs_rq->tasks_timeline, leftmost);
}
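With the leftmost node cached, picking the entity with the smallest vruntime is O(1): __pick_first_entity() simply reads the cache (kernel/sched/fair.c):

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

    if (!left)
        return NULL;

    return rb_entry(left, struct sched_entity, run_node);
}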
Process scheduling
The main entry point into the scheduler is schedule() (kernel/sched/core.c). It picks the next process via pick_next_task(), and if the pick differs from the currently running process, it calls context_switch() to perform the context switch.
static void __sched notrace __schedule(bool preempt)
{
    struct task_struct *prev, *next;
    struct rq_flags rf;
    struct rq *rq;
    int cpu;

    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    prev = rq->curr;                        /* the task currently running on this CPU */
    ...
    next = pick_next_task(rq, prev, &rf);   /* pick the highest-priority runnable task */
    clear_tsk_need_resched(prev);
    clear_preempt_need_resched();

    if (likely(prev != next)) {
        ...
        rq = context_switch(rq, prev, next, &rf);   /* switch to the chosen task */
    } else {
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
        rq_unlock_irq(rq, &rf);
    }
    ...
}
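For reference, the schedule() entry point mentioned above is a thin wrapper around __schedule(): it disables preemption, runs one scheduling pass, and repeats while need_resched() is still set (kernel/sched/core.c):

asmlinkage __visible void __sched schedule(void)
{
    struct task_struct *tsk = current;

    sched_submit_work(tsk);
    do {
        preempt_disable();
        __schedule(false);
        sched_preempt_enable_no_resched();
    } while (need_resched());
}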
pick_next_task() is not complicated, but it contains one optimization: if every runnable task on the rq belongs to the fair class, the CFS pick_next_task() can be called directly; otherwise each scheduling class is tried in priority order.
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
    const struct sched_class *class;
    struct task_struct *p;

    /*
     * Optimization: we know that if all tasks are in the fair class we can
     * call that function directly, but only if the @prev task wasn't of a
     * higher scheduling class, because otherwise those loose the
     * opportunity to pull in more work from other CPUs.
     */
    if (likely((prev->sched_class == &idle_sched_class ||
                prev->sched_class == &fair_sched_class) &&
               rq->nr_running == rq->cfs.h_nr_running)) {

        p = fair_sched_class.pick_next_task(rq, prev, rf);
        if (unlikely(p == RETRY_TASK))
            goto again;

        /* Assumes fair_sched_class->next == idle_sched_class */
        if (unlikely(!p))
            p = idle_sched_class.pick_next_task(rq, prev, rf);

        return p;
    }

again:
    for_each_class(class) {
        p = class->pick_next_task(rq, prev, rf);
        if (p) {
            if (unlikely(p == RETRY_TASK))
                goto again;
            return p;
        }
    }

    /* The idle class should always have a runnable task: */
    BUG();
}
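The priority order walked by the fallback loop comes from the singly linked list of scheduling classes: for_each_class() starts at the highest class (stop, then deadline, realtime, fair, idle) and follows the ->next pointers (kernel/sched/sched.h):

#ifdef CONFIG_SMP
#define sched_class_highest (&stop_sched_class)
#else
#define sched_class_highest (&dl_sched_class)
#endif

#define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)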