Linux kernel source code - CFS scheduling (4.20.17)


Each cpu has a corresponding running queue rq, in which the scheduling queues with different scheduling strategies are maintained.

struct rq {
    struct cfs_rq       cfs;
    struct rt_rq        rt;
    struct dl_rq        dl;

The scheduling queue of cfs is maintained through the red black tree_ In the data structure of RQ, struct rb_root_cached   tasks_timeline contains red black tree struct rb_root rb_root and leftmost leaf node cache {struct rb_node *rb_leftmost .

struct cfs_rq {
	struct load_weight	load;  //Load weight value of CFS running queue
	unsigned long		runnable_weight;
	unsigned int		nr_running;
	unsigned int		h_nr_running;

	u64			exec_clock;
	u64			min_vruntime;
#ifndef CONFIG_64BIT
	u64			min_vruntime_copy;

	struct rb_root_cached	tasks_timeline;  //Red black tree, maintenance scheduling entity

	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	struct sched_entity	*curr;  //Currently running scheduling entity
	struct sched_entity	*next;  //Next scheduling entity
	struct sched_entity	*last;  //Last scheduled entity in queue
	struct sched_entity	*skip;  //Skipped scheduling entity

	unsigned int		nr_spread_over;

	 * CFS load tracking
	struct sched_avg	avg;
#ifndef CONFIG_64BIT
	u64			load_last_update_time_copy;
	struct {
		raw_spinlock_t	lock ____cacheline_aligned;
		int		nr;
		unsigned long	load_avg;
		unsigned long	util_avg;
		unsigned long	runnable_sum;
	} removed;

	unsigned long		tg_load_avg_contrib;
	long			propagate;
	long			prop_runnable_sum;

	 *   h_load = weight * f(tg)
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	unsigned long		h_load;
	u64			last_h_load_update;
	struct sched_entity	*h_load_next;
#endif /* CONFIG_SMP */

	struct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached */

	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
	 * This list is used during load balance.
	int			on_list;
	struct list_head	leaf_cfs_rq_list;
	struct task_group	*tg;	/* group that "owns" this runqueue */

	int			runtime_enabled;
	int			expires_seq;
	u64			runtime_expires;
	s64			runtime_remaining;

	u64			throttled_clock;
	u64			throttled_clock_task;
	u64			throttled_clock_task_time;
	int			throttled;
	int			throttle_count;
	struct list_head	throttled_list;



So what sort of tasks does CFS use? -------------- Virtual runtime.

update_ The curr function (/ kernel/sched/fair.c) implements the update of vruntime. Its step is to calculate the running time delta of the current process_ Exec, combined with the current total number of runnable processes to delta_exec performs weighting operation.

static void update_curr(struct cfs_rq *cfs_rq)
	struct sched_entity *curr = cfs_rq->curr;  //Get the current scheduling entity
	u64 now = rq_clock_task(rq_of(cfs_rq));  //Get current time
	u64 delta_exec;

	if (unlikely(!curr))

	delta_exec = now - curr->exec_start;  //Calculate the execution time of the current process, exec_start is the start execution time of the scheduling entity
	if (unlikely((s64)delta_exec <= 0))

	curr->exec_start = now;

		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;  //Total execution time of modifying scheduling entity
	schedstat_add(cfs_rq->exec_clock, delta_exec);

	curr->vruntime += calc_delta_fair(delta_exec, curr);  //Modify virtual running time of scheduling entity

	if (entity_is_task(curr)) {  //If the scheduling entity is a task, the execution time should also be recorded for its scheduling group
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cgroup_account_cputime(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);

	account_cfs_rq_runtime(cfs_rq, delta_exec);

calc_delta_fair(delta_exec, curr) realizes the calculation of virtual running time:

Virtual runtime = delta_exec * NICE_0_LOAD / weight of current process

And specifically__ calc_ In Delta, it is through (delta_exec * (weight * LW - > inv_weight)) > > wmult_ Shift is implemented to avoid floating-point operation by shifting left and right.

It can be concluded from the formula that if the virtual running time of a process is smaller, it means that the actual running time is less or the weight of the process is significant, then it should have a higher priority. The red black tree maintains the vruntime value of the process. Each time the process with the smallest vruntime is selected for execution, the node is cached in the leftmost leaf node struct rb_node *rb_leftmost.

static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;


Process selection

When the process becomes runnable (awakened) or is created for the first time through fork(), you need to insert the process into the red black tree and call__ enqueue_entity implements this process. The same is true for deleting nodes.

static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;  //Red black tree root node
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	bool leftmost = true;

	 * Find the right place in the rbtree:
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);  //rb_entry is just a container_of, find the first address
		 * We dont care about collisions. Nodes with
		 * the same key stay together.
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;

	rb_link_node(&se->run_node, parent, link);  //Insert node in red black tree
	rb_insert_color_cached(&se->run_node,  //Sets the color of the node
			       &cfs_rq->tasks_timeline, leftmost);


Process scheduling

The main entry point of process scheduling is the function schedule(/kernel/sched/core.c), which uses pick_ next_ The task() function selects the next process. If the selected process is inconsistent with the current running process, it calls context_ The switch() function performs context switching.

static void __sched notrace __schedule(bool preempt)
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	prev = rq->curr;  //Get the currently running process


	next = pick_next_task(rq, prev, &rf);

	if (likely(prev != next)) {
... rq = context_switch(rq, prev, next, &rf); } else { rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); rq_unlock_irq(rq, &rf); }
... }

pick_ next_ The implementation of the task () function is not complicated. A little optimization is used here. If all the runnable processes are in cfs, you can directly call the pick of cfs_ next_ Task(), otherwise it needs to be selected according to the priority of the scheduler.

static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
	const struct sched_class *class;
	struct task_struct *p;

	 * Optimization: we know that if all tasks are in the fair class we can
	 * call that function directly, but only if the @prev task wasn't of a
	 * higher scheduling class, because otherwise those loose the
	 * opportunity to pull in more work from other CPUs.
	if (likely((prev->sched_class == &idle_sched_class ||
		    prev->sched_class == &fair_sched_class) &&
		   rq->nr_running == rq->cfs.h_nr_running)) {

		p = fair_sched_class.pick_next_task(rq, prev, rf);
		if (unlikely(p == RETRY_TASK))
			goto again;

		/* Assumes fair_sched_class->next == idle_sched_class */
		if (unlikely(!p))
			p = idle_sched_class.pick_next_task(rq, prev, rf);

		return p;

	for_each_class(class) {
		p = class->pick_next_task(rq, prev, rf);
		if (p) {
			if (unlikely(p == RETRY_TASK))
				goto again;
			return p;

	/* The idle class should always have a runnable task: */





  1. [original] (V) Linux Process Scheduling - CFS scheduler
  2. CFS scheduling code analysis I

Keywords: Operating System

Added by vestax1984 on Tue, 08 Mar 2022 13:12:12 +0200