基于Linux-2.6.23的进程调度分析

1.前言

本文基于Linux Kernel 2.6.23的源码，深入分析其进程模型及调度机制

2.进程是什么

在对进程模型分析前让我们先来看看什么是进程，进程(process)是具有一定独立功能的程序关于某个数据集合上的一次运行活动,是系统进行资源(内存、CPU)分配和调度的一个独立单位。而Linux是多任务抢占操作系统，多任务就是指多个进程通过分时切换来并发执行，所以对于这些进程间的切换就要有一套完善的调度机制。

在Linux中，我们可以用 ps 的命令来查询系统当前正在运行的进程（PID：进程号，TTY：控制终端，TIME：进程使用cpu时间，CMD：命令名）

3.进程的组织过程

本节主要深入源码对进程的组织过程进行解析

3.1进程描述符

要组织进程第一步当然就是对进程的定义了，在源码中进程描述符定义在 sched.h 文件下的 task_struct 结构体中，每个进程有且只有一个进程描述符，它里面包含了这个进程相关的所有信息。下面是对 task_struct 结构体的部分截取。

 1 struct task_struct {
 2 
 3     ......
 4 
 5     volatile long state; /* 进程状态 */
 6     void *stack; /* 指向内核栈 */
 7     struct list_head tasks; /* 用于加入进程链表 */
 8     ......
 9 
10     struct mm_struct *mm, *active_mm;/* 指向该进程的内存区描述符 */
11 
12     ........
13 
14     pid_t pid;/* 进程ID，每个进程(线程)的PID都不同 */
15     pid_t tgid;  /* 线程组ID，同一个线程组拥有相同的pid，与领头线程(该组中第一个轻量级进程)pid一致，保存在tgid中，线程组领头线程的pid和tgid相同 */
16     struct pid_link pids[PIDTYPE_MAX]; /* 用于连接到PID、TGID、PGRP、SESSION哈希表 */
17 
18     ........
19 }

如果你想看整个结构体的代码可以点击这里

  1 struct task_struct {
  2     volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
  3     void *stack;
  4     atomic_t usage;
  5     unsigned int flags;    /* per process flags, defined below */
  6     unsigned int ptrace;
  7 
  8     int lock_depth;        /* BKL lock depth */
  9 
 10 #ifdef CONFIG_SMP
 11 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
 12     int oncpu;
 13 #endif
 14 #endif
 15 
 16     int prio, static_prio, normal_prio;
 17     struct list_head run_list;
 18     struct sched_class *sched_class;
 19     struct sched_entity se;
 20 
 21 #ifdef CONFIG_PREEMPT_NOTIFIERS
 22     /* list of struct preempt_notifier: */
 23     struct hlist_head preempt_notifiers;
 24 #endif
 25 
 26     unsigned short ioprio;
 27 #ifdef CONFIG_BLK_DEV_IO_TRACE
 28     unsigned int btrace_seq;
 29 #endif
 30 
 31     unsigned int policy;
 32     cpumask_t cpus_allowed;
 33     unsigned int time_slice;
 34 
 35 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 36     struct sched_info sched_info;
 37 #endif
 38 
 39     struct list_head tasks;
 40     /*
 41      * ptrace_list/ptrace_children forms the list of my children
 42      * that were stolen by a ptracer.
 43      */
 44     struct list_head ptrace_children;
 45     struct list_head ptrace_list;
 46 
 47     struct mm_struct *mm, *active_mm;
 48 
 49 /* task state */
 50     struct linux_binfmt *binfmt;
 51     int exit_state;
 52     int exit_code, exit_signal;
 53     int pdeath_signal;  /*  The signal sent when the parent dies  */
 54     /* ??? */
 55     unsigned int personality;
 56     unsigned did_exec:1;
 57     pid_t pid;
 58     pid_t tgid;
 59 
 60 #ifdef CONFIG_CC_STACKPROTECTOR
 61     /* Canary value for the -fstack-protector gcc feature */
 62     unsigned long stack_canary;
 63 #endif
 64     /* 
 65      * pointers to (original) parent process, youngest child, younger sibling,
 66      * older sibling, respectively.  (p->father can be replaced with 
 67      * p->parent->pid)
 68      */
 69     struct task_struct *real_parent; /* real parent process (when being debugged) */
 70     struct task_struct *parent;    /* parent process */
 71     /*
 72      * children/sibling forms the list of my children plus the
 73      * tasks I'm ptracing.
 74      */
 75     struct list_head children;    /* list of my children */
 76     struct list_head sibling;    /* linkage in my parent's children list */
 77     struct task_struct *group_leader;    /* threadgroup leader */
 78 
 79     /* PID/PID hash table linkage. */
 80     struct pid_link pids[PIDTYPE_MAX];
 81     struct list_head thread_group;
 82 
 83     struct completion *vfork_done;        /* for vfork() */
 84     int __user *set_child_tid;        /* CLONE_CHILD_SETTID */
 85     int __user *clear_child_tid;        /* CLONE_CHILD_CLEARTID */
 86 
 87     unsigned int rt_priority;
 88     cputime_t utime, stime;
 89     unsigned long nvcsw, nivcsw; /* context switch counts */
 90     struct timespec start_time;         /* monotonic time */
 91     struct timespec real_start_time;    /* boot based time */
 92 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 93     unsigned long min_flt, maj_flt;
 94 
 95       cputime_t it_prof_expires, it_virt_expires;
 96     unsigned long long it_sched_expires;
 97     struct list_head cpu_timers[3];
 98 
 99 /* process credentials */
100     uid_t uid,euid,suid,fsuid;
101     gid_t gid,egid,sgid,fsgid;
102     struct group_info *group_info;
103     kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
104     unsigned keep_capabilities:1;
105     struct user_struct *user;
106 #ifdef CONFIG_KEYS
107     struct key *request_key_auth;    /* assumed request_key authority */
108     struct key *thread_keyring;    /* keyring private to this thread */
109     unsigned char jit_keyring;    /* default keyring to attach requested keys to */
110 #endif
111     /*
112      * fpu_counter contains the number of consecutive context switches
113      * that the FPU is used. If this is over a threshold, the lazy fpu
114      * saving becomes unlazy to save the trap. This is an unsigned char
115      * so that after 256 times the counter wraps and the behavior turns
116      * lazy again; this to deal with bursty apps that only use FPU for
117      * a short time
118      */
119     unsigned char fpu_counter;
120     int oomkilladj; /* OOM kill score adjustment (bit shift). */
121     char comm[TASK_COMM_LEN]; /* executable name excluding path
122                      - access with [gs]et_task_comm (which lock
123                        it with task_lock())
124                      - initialized normally by flush_old_exec */
125 /* file system info */
126     int link_count, total_link_count;
127 #ifdef CONFIG_SYSVIPC
128 /* ipc stuff */
129     struct sysv_sem sysvsem;
130 #endif
131 /* CPU-specific state of this task */
132     struct thread_struct thread;
133 /* filesystem information */
134     struct fs_struct *fs;
135 /* open file information */
136     struct files_struct *files;
137 /* namespaces */
138     struct nsproxy *nsproxy;
139 /* signal handlers */
140     struct signal_struct *signal;
141     struct sighand_struct *sighand;
142 
143     sigset_t blocked, real_blocked;
144     sigset_t saved_sigmask;        /* To be restored with TIF_RESTORE_SIGMASK */
145     struct sigpending pending;
146 
147     unsigned long sas_ss_sp;
148     size_t sas_ss_size;
149     int (*notifier)(void *priv);
150     void *notifier_data;
151     sigset_t *notifier_mask;
152     
153     void *security;
154     struct audit_context *audit_context;
155     seccomp_t seccomp;
156 
157 /* Thread group tracking */
158        u32 parent_exec_id;
159        u32 self_exec_id;
160 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
161     spinlock_t alloc_lock;
162 
163     /* Protection of the PI data structures: */
164     spinlock_t pi_lock;
165 
166 #ifdef CONFIG_RT_MUTEXES
167     /* PI waiters blocked on a rt_mutex held by this task */
168     struct plist_head pi_waiters;
169     /* Deadlock detection and priority inheritance handling */
170     struct rt_mutex_waiter *pi_blocked_on;
171 #endif
172 
173 #ifdef CONFIG_DEBUG_MUTEXES
174     /* mutex deadlock detection */
175     struct mutex_waiter *blocked_on;
176 #endif
177 #ifdef CONFIG_TRACE_IRQFLAGS
178     unsigned int irq_events;
179     int hardirqs_enabled;
180     unsigned long hardirq_enable_ip;
181     unsigned int hardirq_enable_event;
182     unsigned long hardirq_disable_ip;
183     unsigned int hardirq_disable_event;
184     int softirqs_enabled;
185     unsigned long softirq_disable_ip;
186     unsigned int softirq_disable_event;
187     unsigned long softirq_enable_ip;
188     unsigned int softirq_enable_event;
189     int hardirq_context;
190     int softirq_context;
191 #endif
192 #ifdef CONFIG_LOCKDEP
193 # define MAX_LOCK_DEPTH 30UL
194     u64 curr_chain_key;
195     int lockdep_depth;
196     struct held_lock held_locks[MAX_LOCK_DEPTH];
197     unsigned int lockdep_recursion;
198 #endif
199 
200 /* journalling filesystem info */
201     void *journal_info;
202 
203 /* stacked block device info */
204     struct bio *bio_list, **bio_tail;
205 
206 /* VM state */
207     struct reclaim_state *reclaim_state;
208 
209     struct backing_dev_info *backing_dev_info;
210 
211     struct io_context *io_context;
212 
213     unsigned long ptrace_message;
214     siginfo_t *last_siginfo; /* For ptrace use.  */
215 /*
216  * current io wait handle: wait queue entry to use for io waits
217  * If this thread is processing aio, this points at the waitqueue
218  * inside the currently handled kiocb. It may be NULL (i.e. default
219  * to a stack based synchronous wait) if its doing sync IO.
220  */
221     wait_queue_t *io_wait;
222 #ifdef CONFIG_TASK_XACCT
223 /* i/o counters(bytes read/written, #syscalls */
224     u64 rchar, wchar, syscr, syscw;
225 #endif
226     struct task_io_accounting ioac;
227 #if defined(CONFIG_TASK_XACCT)
228     u64 acct_rss_mem1;    /* accumulated rss usage */
229     u64 acct_vm_mem1;    /* accumulated virtual memory usage */
230     cputime_t acct_stimexpd;/* stime since last update */
231 #endif
232 #ifdef CONFIG_NUMA
233       struct mempolicy *mempolicy;
234     short il_next;
235 #endif
236 #ifdef CONFIG_CPUSETS
237     struct cpuset *cpuset;
238     nodemask_t mems_allowed;
239     int cpuset_mems_generation;
240     int cpuset_mem_spread_rotor;
241 #endif
242     struct robust_list_head __user *robust_list;
243 #ifdef CONFIG_COMPAT
244     struct compat_robust_list_head __user *compat_robust_list;
245 #endif
246     struct list_head pi_state_list;
247     struct futex_pi_state *pi_state_cache;
248 
249     atomic_t fs_excl;    /* holding fs exclusive resources */
250     struct rcu_head rcu;
251 
252     /*
253      * cache last used pipe for splice
254      */
255     struct pipe_inode_info *splice_pipe;
256 #ifdef    CONFIG_TASK_DELAY_ACCT
257     struct task_delay_info *delays;
258 #endif
259 #ifdef CONFIG_FAULT_INJECTION
260     int make_it_fail;
261 #endif
262 };

View Code

这当中定义了进程的状态，调度信息，通讯状况，插入进程树时联系父子兄弟的指针，时间信息，内存信息，进程上下文，内核上下文以及处理器上下文等相关信息。

3.2进程标识符

 pid_t pid;/* 进程ID，每个进程(线程)的PID都不同 */
 pid_t tgid;  /* 线程组ID，同一个线程组拥有相同的pid，与领头线程(该组中第一个轻量级进程)pid一致，保存在tgid中，线程组领头线程的pid和tgid相同 */

Unix系统通过pid来标识进程，linux把不同的pid与系统中每个进程或轻量级线程关联，而unix程序员希望同一组线程具有共同的pid，遵照这个标准linux引入线程组的概念。在Linux系统中，一个线程组中的所有线程使用和该线程组的领头线程（该组中的第一个轻量级进程）相同的PID，并被存放在tgid成员中。只有线程组的领头线程的pid成员才会被设置为与tgid相同的值。

#define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)

在CONFIG_BASE_SMALL配置为0的情况下，PID的取值范围是0到32767，即系统中的进程数最大为32768个。

3.3进程标记

unsigned int flags;	/*定义每个进程标记*/

进程标记用于反应进程状态的信息，但不是运行状态，用于内核识别进程当前的状态，以备下一步操作。

flags成员的可能取值如下，这些宏以PF(ProcessFlag)开头

/*
 * Per process flags
 */
#define PF_ALIGNWARN	0x00000001	/* Print alignment warning msgs */
					/* Not implemented yet, only for 486*/
#define PF_STARTING	0x00000002	/* being created */
#define PF_EXITING	0x00000004	/* getting shut down */
#define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
#define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
#define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
#define PF_DUMPCORE	0x00000200	/* dumped core */
#define PF_SIGNALED	0x00000400	/* killed by a signal */
#define PF_MEMALLOC	0x00000800	/* Allocating memory */
#define PF_FLUSHER	0x00001000	/* responsible for disk writeback */
#define PF_USED_MATH	0x00002000	/* if unset the fpu must be initialized before use */
#define PF_NOFREEZE	0x00008000	/* this thread should not be frozen */
#define PF_FROZEN	0x00010000	/* frozen for system suspend */
#define PF_FSTRANS	0x00020000	/* inside a filesystem transaction */
#define PF_KSWAPD	0x00040000	/* I am kswapd */
#define PF_SWAPOFF	0x00080000	/* I am in swapoff */
#define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
#define PF_BORROWED_MM	0x00200000	/* I am a kthread doing use_mm */
#define PF_RANDOMIZE	0x00400000	/* randomize virtual address space */
#define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
#define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
#define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
#define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
#define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
#define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezeable */

3.4进程的状态

进程状态的定义

 volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */

state成员的可能取值

/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->state
 * is about runnability, while task->exit_state are
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */
#define TASK_RUNNING		0
#define TASK_INTERRUPTIBLE	1
#define TASK_UNINTERRUPTIBLE	2
#define TASK_STOPPED		4
#define TASK_TRACED		8
/* in tsk->exit_state */
#define EXIT_ZOMBIE		16
#define EXIT_DEAD		32
/* in tsk->state again */
#define TASK_NONINTERACTIVE	64
#define TASK_DEAD		128

#define __set_task_state(tsk, state_value)		\
	do { (tsk)->state = (state_value); } while (0)
#define set_task_state(tsk, state_value)		\
	set_mb((tsk)->state, (state_value))

那就让我们来分析一下这些进程状态的具体含义：

1、TASK_RUNNING：（R）
进程当前正在运行，或者正在运行队列中等待调度。只有在该状态的进程才可能在CPU上运行，同一时刻可能有多个进程处于可执行状态。

2、TASK_INTERRUPTIBLE：(S)
进程处于睡眠状态，处于这个状态的进程因为等待某事件的发生（比如等待socket连接、等待信号量），而被挂起。当这些事件发生时，对应的等待队列中的一个或多个进程将被唤醒。一般情况下，
进程列表中的绝大多数进程都处于TASK_INTERRUPTIBLE状态。进程可以被信号中断。接收到信号或被显式的唤醒呼叫唤醒之后，进程将转变为 TASK_RUNNING 状态。

3、TASK_UNINTERRUPTIBLE：(D)
不可中断的睡眠状态,此进程状态类似于 TASK_INTERRUPTIBLE，只是它不会处理信号。不可中断，指的是进程不响应异步信号，无法用kill命令关闭处于TASK_UNINTERRUPTIBLE状态的进程。中
断处于这种状态的进程是不合适的，因为它可能正在完成某些重要的任务。 当它所等待的事件发生时，进程将被显式的唤醒呼叫唤醒。可处理signal,　有延迟

4、TASK_STOPPED：
进程已中止执行，它没有运行，并且不能运行。接收到 SIGSTOP 和 SIGTSTP 等信号时，进程将进入这种状态。接收到 SIGCONT 信号之后，进程将再次变得可运行。

5、TASK_TRACED：(T)
正被调试程序等其他进程监控时，进程将进入这种状态。

6、EXIT_ZOMBIE：(Z)
进程已终止，它正等待其父进程收集关于它的一些统计信息。不可被kill,　即不响应任务信号,　无法用SIGKILL杀死

7、EXIT_DEAD：(X)
最终状态（正如其名）。将进程从系统中删除时，它将进入此状态，因为其父进程已经通过 wait4() 或 waitpid() 调用收集了所有统计信息。EXIT_DEAD状态是非常短暂的，几乎不可能通过ps命
令捕捉到。

在Linux-2.6.25的版本后还引入了 TASK_KILLABLE 这种进程状态，用于将进程置为睡眠状态，它可以替代有效但可能无法终止的 TASK_UNINTERRUPTIBLE 进程状态，以及易于唤醒但更加安全的 TASK_INTERRUPTIBLE 进程状态。

4.进程调度

上文提到了Linux是多任务抢占操作系统，抢占式系统表示即使当前进程没有用完时间片，也没有主动放弃cpu，如果调度系统发现有更高动态优先级的进程，则强制剥夺当前的cpu，选择更高动态优先级的进程执行。

4.1.时间片和优先级进程优先级

时间片是一个数值，它表明进程在被抢占前所能持续运行的时间。I/O消耗型不需要长的时间片，而处理器消耗型的进程则希望越长越好。

#define MAX_USER_RT_PRIO	100
#define MAX_RT_PRIO		MAX_USER_RT_PRIO

#define MAX_PRIO		(MAX_RT_PRIO + 40)
#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)

Linux进程的静态优先级。静态优先级分为两个范围：0~99是实时进程的静态优先级（值越大优先级越高），100~139是普通进程的静态优先级（通过nice值表示，-20~19，值越大优先级越低）。

4.2.进程调度策略

/*
 * Scheduling policies
 */
#define SCHED_NORMAL		0
#define SCHED_FIFO		1
#define SCHED_RR		2
#define SCHED_BATCH		3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE		5

内核2.6的进程调度策略一共有五种： SCHED_NORMAL,SCHED-FIFO,SCHED_RR,SCHED_BATCH,SCHED_IDLE 。其中 SCHED_NORMAL 和 SCHED_BATCH 用于普通进程的调度通过CFS调度器实现，后三种用于实时进程的调度。

4.3.普通进程的调度策略（CFS）

CFS（ Completely Fair Scheduler）即完全公平调度器，不再跟踪进程的睡眠时间，也不再企图区分交互式进程，它将所有的进程统一对待，这就是公平的含义。

先简单说一下CFS调度算法的思想：理想状态下每个进程都能获得相同的时间片，并且同时运行在CPU上，但实际上一个CPU同一时刻运行的进程只能有一个。也就是说，当一个进程占用CPU时，其他进程就必须等待。CFS为了实现公平，必须惩罚当前正在运行的进程，以使那些正在等待的进程下次被调度。

4.3.1.调度实体

/*
 * CFS stats for a schedulable entity (task, task-group etc)
 *
 * Current field usage histogram:
 *
 *     4 se->block_start
 *     4 se->run_node
 *     4 se->sleep_start
 *     4 se->sleep_start_fair
 *     6 se->load.weight
 *     7 se->delta_fair
 *    15 se->wait_runtime
 */
struct sched_entity {
	long			wait_runtime;
	unsigned long		delta_fair_run;
	unsigned long		delta_fair_sleep;
	unsigned long		delta_exec;
	s64			fair_key;
	struct load_weight	load;		/* for load-balancing */
	struct rb_node		run_node;
	unsigned int		on_rq;

	u64			exec_start;
	u64			sum_exec_runtime;
	u64			prev_sum_exec_runtime;
	u64			wait_start_fair;
	u64			sleep_start_fair;

#ifdef CONFIG_SCHEDSTATS
	u64			wait_start;
	u64			wait_max;
	s64			sum_wait_runtime;

	u64			sleep_start;
	u64			sleep_max;
	s64			sum_sleep_runtime;

	u64			block_start;
	u64			block_max;
	u64			exec_max;

	unsigned long		wait_runtime_overruns;
	unsigned long		wait_runtime_underruns;
#endif

运行实体结构为sched_entity，所有的调度器都必须对进程运行时间做记账。CFS不再有时间片的概念，但是它也必须维护每个进程运行的时间记账，因为他需要确保每个进程只在公平分配给他的处理器时间内运行。CFS使用调度器实体结构来最终运行记账。

4.3.2.运行队列

    struct cfs_rq {  
        struct load_weight load;/*运行负载*/  
        unsigned long nr_running;/*运行进程个数*/  
      
        u64 exec_clock;  
        u64 min_vruntime;/*保存的最小运行时间*/  
      
        struct rb_root tasks_timeline;/*运行队列树根*/  
        struct rb_node *rb_leftmost;/*保存的红黑树最左边的节点，这个为最小运行时间的节点，当进程选择下一个来运行时，直接选择这个*/  
      
        struct list_head tasks;  
        struct list_head *balance_iterator;  
      
        /* 
         * 'curr' points to currently running entity on this cfs_rq. 
         * It is set to NULL otherwise (i.e when none are currently running). 
         */  
        struct sched_entity *curr, *next, *last;  
      
        unsigned int nr_spread_over;  
      
    #ifdef CONFIG_FAIR_GROUP_SCHED  
        struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */  
      
        /* 
         * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 
         * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 
         * (like users, containers etc.) 
         * 
         * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 
         * list is used during load balance. 
         */  
        struct list_head leaf_cfs_rq_list;  
        struct task_group *tg;  /* group that "owns" this runqueue */  
      
    #ifdef CONFIG_SMP  
        /* 
         * the part of load.weight contributed by tasks 
         */  
        unsigned long task_weight;  
      
        /* 
         *   h_load = weight * f(tg) 
         * 
         * Where f(tg) is the recursive weight fraction assigned to 
         * this group. 
         */  
        unsigned long h_load;  
      
        /* 
         * this cpu's part of tg->shares 
         */  
        unsigned long shares;  
      
        /* 
         * load.weight at the time we set shares 
         */  
        unsigned long rq_weight;  
    #endif  
    #endif  
    };

4.3.3.虚拟运行时间（vruntime)

struct sched_entity
{
    struct load_weight      load;           /* for load-balancing负荷权重，这个决定了进程在CPU上的运行时间和被调度次数 */
    struct rb_node          run_node;
    unsigned int            on_rq;          /*  是否在就绪队列上  */

    u64                     exec_start;         /*  上次启动的时间*/

    u64                     sum_exec_runtime;
    u64                     vruntime;
    u64                     prev_sum_exec_runtime;
    /* rq on which this entity is (to be) queued: */
    struct cfs_rq           *cfs_rq;
    ...
};

实现CFS算法时，CFS通过每个进程的虚拟运行时间（vruntime）来衡量哪个进程最值得被调度。CFS中的就绪队列是一棵以vruntime为键值的红黑树，虚拟时间越小的进程越靠近整个红黑树的最左端。因此，调度器每次选择位于红黑树最左端的那个进程，该进程的vruntime最小。

虚拟运行时间是通过进程的实际运行时间和进程的权重（weight）计算出来的。在CFS调度器中，将进程优先级这个概念弱化，而是强调进程的权重。一个进程的权重越大，则说明这个进程更需要运行，因此它的虚拟运行时间就越小，这样被调度的机会就越大。

而在内核中，进程的虚拟运行时间是自进程诞生以来进行累加的，每个时钟周期内一个进程的虚拟运行时间是通过下面的方法计算的：

一次调度间隔的虚拟运行时间=实际运行时间*（NICE_0_LOAD/权重）

其中，NICE_0_LOAD是nice为0时的权重。也就是说，nice值为0的进程实际运行时间和虚拟运行时间相同。通过这个公式可以看到，权重越大的进程获得的虚拟运行时间越小，那么它将被调度器所调度的机会就越大。

4.3.4.进程选择

CFS调度算法的核心是选择具有最小vruntine的任务。运行队列采用红黑树方式存放，其中节点的键值便是可运行进程的虚拟运行时间。CFS调度器选取待运行的下一个进程，是所有进程中vruntime最小的那个，他对应的便是在树中最左侧的叶子节点。实现选择的函数为pick_next_task_fair。

    static struct task_struct *pick_next_task_fair(struct rq *rq)  
    {  
        struct task_struct *p;  
        struct cfs_rq *cfs_rq = &rq->cfs;  
        struct sched_entity *se;  
      
        if (unlikely(!cfs_rq->nr_running))  
            return NULL;  
      
        do {/*此循环为了考虑组调度*/  
            se = pick_next_entity(cfs_rq);  
            set_next_entity(cfs_rq, se);/*设置为当前运行进程*/  
            cfs_rq = group_cfs_rq(se);  
        } while (cfs_rq);  
      
        p = task_of(se);  
        hrtick_start_fair(rq, p);  
      
        return p;  
    }

该函数最终调用__pick_next_entity完成实质工作

    /*函数本身并不会遍历数找到最左叶子节点(是 
    所有进程中vruntime最小的那个),因为该值已经缓存 
    在rb_leftmost字段中*/  
    static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)  
    {  
        /*rb_leftmost为保存的红黑树的最左边的节点*/  
        struct rb_node *left = cfs_rq->rb_leftmost;  
      
        if (!left)  
            return NULL;  
      
        return rb_entry(left, struct sched_entity, run_node);  
    }

CFS调度运行队列采用红黑树方式组织，红黑树种的key值以vruntime排序。每次选择下一个进程运行时即是选择最左边的一个进程运行。而对于入队和处队都会更新调度队列、调度实体的相关信息。

5.对操作系统进程模型的看法

操作系统其实就相当于一个大型的软件，是用户和计算机的接口，同时也是计算机硬件和其他软件的接口。操作系统管理着计算机的资源，同时为用户的使用分配着资源，进程的调度就是操作系统在幕后控制的。从一开始的非抢占式单任务运行系统到现在的抢占式多任务并行系统，进程的调度算法一次又一次的优化创新，以此来达到更好的效率，来更好的满足用户的需求。而我认为这个创新是永无止境的，随着硬件以及时间的变化，调度算法也会一次又一次的更新换代。

posted @ 2018-05-01 18:13 Wbingo 阅读(515) 评论(1) 收藏举报

刷新页面返回顶部

Wbingo