《Linux内核设计与实现》第四章——进程调度笔记

Source

参考网址：

Linux CFS调度器之队列操作--Linux进程的管理与调度(二十七） - 云+社区 - 腾讯云

4.1多任务

多任务处理器上，能让多个进程处于阻塞或者睡眠状态，实际上不被调度，直到被唤醒

多任务操作系统分为抢占式和非抢占式两种多任务方式，Linux采用抢占式多任务，其普通进程使用的调度器是完全公平调度器（CFS）。

非抢占模式下，除非进程主动让出（yield）处理器，否则它会一直执行下去；绝大多数现代操作系统采用的是抢占式多任务。

4.2 linux进程的调度

早期Linux内核采用基于时间片的调度；schedule()在选择进程之前还会先处理alarm定时（到期时向进程发送SIGALRM闹钟信号）。下面是早期内核中schedule()的实现：

/*
 * schedule() - choose the next task to run and switch to it.
 *
 * Pass 1 walks the task slots from LAST_TASK down to (but not including)
 * FIRST_TASK.  For every used slot it turns an expired alarm into a pending
 * SIGALRM, and it moves a TASK_INTERRUPTIBLE task that has a pending,
 * non-blocked signal to TASK_RUNNING so that it takes part in the selection
 * below.
 *
 * Pass 2 picks the TASK_RUNNING task with the largest counter and passes
 * its slot number to switch_to().
 *
 * Parameters: none.
 * Returns:    nothing.
 */
void schedule(void)
{
	int i, next, c;
	struct task_struct **p;

	/* check alarm, wake up any interruptible tasks that have got a signal */
	for (p = &LAST_TASK; p > &FIRST_TASK; --p)
		if (*p) {
			/* An alarm is armed and its jiffies deadline has passed. */
			if ((*p)->alarm && (*p)->alarm < jiffies) {
				/*
				 * Mark SIGALRM pending in the task's signal bitmap.
				 * This expression was truncated in the notes and has
				 * been restored.
				 * NOTE(review): assumes signal n is stored in bit n-1,
				 * as the _S() macro quoted below does upstream - confirm.
				 */
				(*p)->signal |= (1 << (SIGALRM - 1));
				(*p)->alarm = 0;	/* the alarm fires only once */
			}
			/*
			 * Per the original note, _BLOCKABLE is defined in sched.c as
			 * (~(_S(SIGKILL) | _S(SIGSTOP))), i.e. SIGKILL and SIGSTOP can
			 * never be blocked.
			 *
			 *   _BLOCKABLE & blocked   = signals this task really blocks
			 *   signal & ~(that mask)  = pending signals it may respond to
			 *
			 * If such a signal is pending and the task is sleeping
			 * interruptibly, make it runnable.
			 */
			if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
			    (*p)->state == TASK_INTERRUPTIBLE)
				(*p)->state = TASK_RUNNING;
		}

	/* this is the scheduler proper: */
	while (1) {
		c = -1;
		next = 0;
		i = NR_TASKS;
		p = &task[NR_TASKS];
		/*
		 * Scan slots NR_TASKS-1 down to 1.  The loop ends when i reaches
		 * 0, so slot 0 is never examined here.
		 */
		while (--i) {
			if (!*--p)	/* skip empty slots */
				continue;
			/*
			 * Track the runnable task with the largest remaining
			 * counter: c holds that counter, next holds its slot.
			 */
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c) {
				c = (*p)->counter;
				next = i;
			}
		}
		/*
		 * c != 0 means either a runnable task with time left was found
		 * (c > 0), or no task was runnable at all (c is still -1 and
		 * next is still 0).  Either way, stop searching and switch.
		 */
		if (c)
			break;
		/*
		 * c == 0: every runnable task has used up its counter.
		 * Recompute the counter of every task in the scanned slots, not
		 * only the runnable ones: counter = counter / 2 + priority.
		 * Then run the selection again.
		 */
		for (p = &LAST_TASK; p > &FIRST_TASK; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	}
	/* The loop above has settled on the slot to run. */
	switch_to(next);
}

注意：当所有处于就绪（可运行）状态的进程的时间片都已经用完（counter为0）时，没有进程可以直接投入运行，此时需要重新为进程分配时间片，然后再重新进行选择。

现在的Linux采用的是CFS（完全公平调度）：允许每个进程运行一段时间、循环轮转，并选择运行最少的进程作为下一个运行的进程。

4.5 linux调度的实现

四个重要组成部分：

时间记账、进程选择、调度器入口、睡眠和唤醒。

CFS使用调度器实体结构，来追踪进程运行记账

/*
 * Scheduler entity: the per-entity bookkeeping CFS uses to account run time.
 * Per the surrounding notes it is embedded in task_struct as the member `se`.
 */
struct sched_entity {
	struct load_weight	load;		/* for load-balancing */
	struct rb_node		run_node;	/* node linked into cfs_rq->tasks_timeline */
	struct list_head	group_node;
	unsigned int		on_rq;		/* set to 1 by enqueue_entity(), 0 by dequeue_entity() */

	u64			exec_start;		/* clock value stored by the last update_curr() */
	u64			sum_exec_runtime;	/* real run time, accumulated by update_curr() */
	u64			vruntime;		/* virtual run time, advanced by update_curr() */
	u64			prev_sum_exec_runtime;


	u64			nr_migrations;


#ifdef CONFIG_SCHEDSTATS
	/* Scheduler statistics, e.g. statistics.exec_max updated in update_curr(). */
	struct sched_statistics statistics;
#endif


#ifdef CONFIG_FAIR_GROUP_SCHED
	int			depth;
	struct sched_entity	*parent;
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq		*cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq		*my_q;
#endif


#ifdef CONFIG_SMP
	/*
	 * Per entity load average tracking.
	 *
	 * Put into separate cache line so it does not
	 * collide with read-mostly values above.
	 */
	struct sched_avg	avg ____cacheline_aligned_in_smp;
#endif
};

调度器实体作为一个名字是se的成员变量，嵌入在进程描述符task_struct里面。

2.虚拟实时

update_curr实现了记账功能

/*
 * Update the current task's runtime statistics.
 *
 * Computes the time elapsed since curr->exec_start, restarts the measuring
 * interval, and charges that delta to the entity (sum_exec_runtime and
 * vruntime) and to the cfs_rq.  Does nothing when the cfs_rq has no current
 * entity or when the elapsed time is not positive.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	u64 delta_exec;

	if (unlikely(!curr))
		return;
    /* (3.2.1.1)  Compute how long cfs_rq->curr has actually run since exec_start. */
	delta_exec = now - curr->exec_start;
	/* The s64 cast lets a clock value that went backwards be detected as negative. */
	if (unlikely((s64)delta_exec <= 0))
		return;

	/* Start the next measuring interval at the clock value just read. */
	curr->exec_start = now;

	/* Remember the longest single delta seen so far. */
	schedstat_set(curr->statistics.exec_max, max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;// (1) accumulate the real run time of the current entity
// Add the same delta to the cfs_rq's real execution time, cfs_rq->exec_clock.
	schedstat_add(cfs_rq, exec_clock, delta_exec);
/* (3.2.1.2)  Compute the virtual run time (vruntime) of cfs_rq->curr. */
	curr->vruntime += calc_delta_fair(delta_exec, curr);// (2) accumulate the vruntime of the current entity
	update_min_vruntime(cfs_rq);
/* (3.2.1.3)  If the entity is a task rather than a task group,
        update the time statistics kept for that task.
     */
	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
        // Per the original note: charges the per-CPU usage of the task's
        // cpuacct cgroup (ca->cpuusage[cpu]->cpuusage). Helper not shown here.
		cpuacct_charge(curtask, delta_exec);
        // Per the original note: accounts the run time of the task's thread
        // group (tsk->signal->cputimer.cputime_atomic.sum_exec_runtime).
		account_group_exec_runtime(curtask, delta_exec);
	}
/* (3.2.1.4)  Per the original note: account the cfs_rq's run time and check
        it against the cfs_bandwidth limit (cfs_rq->runtime_remaining).
     */
	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

update_curr()完成后，exec_start被更新为本次读取到的rq时钟now。

在较早的内核版本中，update_curr()函数只负责计算delta_exec以及更新exec_start，其它工作由__update_curr()函数完成（上面这一版代码已经把这些工作直接放在update_curr()里）：
        1、更新当前进程的实际运行时间（抽象模型中的runtime）。
        2、更新当前进程的虚拟时间vruntime。
        3、更新cfs_rq->min_vruntime。
           在当前进程和下一个将要被调度的进程中选择vruntime较小的值。然后用该值和cfs_rq->min_vruntime比较，如果比min_vruntime大，则把cfs_rq的min_vruntime更新为所求出的值。

4.5.2 进程选择

红黑树上存储了所有可以运行的进程

/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 *
 * @cfs_rq: run queue to pick from
 * @curr:   the currently running entity, or NULL
 *
 * Returns the chosen entity.  Before returning, clear_buddies() is called
 * with the chosen entity so that a used next/last hint does not influence
 * the following pick.
 */
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
    /*  Take the leftmost entity of the red-black tree.  */
    struct sched_entity *left = __pick_first_entity(cfs_rq);
    struct sched_entity *se;

    /*
     * If curr is set we have to see if its left of the leftmost entity
     * still in the tree, provided there was anything in the tree at all.
     *
     * That is: if the tree is empty (left == NULL), or curr is non-NULL and
     * entity_before(curr, left) holds, curr becomes the best candidate and
     * left is pointed at it.
     */
    if (!left || (curr && entity_before(curr, left)))
    {
        left = curr;
    }

    /*
     * se starts out as the best candidate found above: either curr (when
     * it sorts before the tree's leftmost entity) or that leftmost entity.
     */
    se = left; /* ideally we run the leftmost entity */

    /*
     * Avoid running the skip buddy, if running something else can
     * be done without getting too unfair.
     *
     * cfs_rq->skip names an entity that should be passed over.  If the best
     * candidate se happens to be the skip entity, look for a runner-up:
     *  - se == curr == skip: the runner-up is the tree's leftmost entity;
     *  - se == left == skip: the runner-up is the entity after it in the
     *    tree, or curr if curr sorts before that one.
     */
    if (cfs_rq->skip == se)
    {
        struct sched_entity *second;

        if (se == curr) /* se == curr == skip: use the leftmost entity in the tree */
        {
            second = __pick_first_entity(cfs_rq);
        }
        else    /* se == left == skip: use the runner-up entity */
        {
            /*  Take the second-leftmost node of the red-black tree.  */
            second = __pick_next_entity(se);
            /*
             * Same rule as for left above: if there is no runner-up in
             * the tree, or curr is non-NULL and sorts before it, curr
             * becomes the runner-up.
             */
            if (!second || (curr && entity_before(curr, second)))
                second = curr;
        }

        /*
         * Only switch to the runner-up when doing so is not too unfair to
         * left.
         * NOTE(review): wakeup_preempt_entity() is not shown here; presumably
         * a result < 1 means second's vruntime is not ahead of left's by more
         * than the wakeup granularity - confirm against its definition.
         */
        if (second && wakeup_preempt_entity(second, left) < 1)
            se = second;
    }

    /*
     * Prefer last buddy, try to return the CPU to a preempted task.
     * The same "< 1" fairness test against left is applied.
     */
    if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
        se = cfs_rq->last;

    /*
     * Someone really wants this to run. If it's not unfair, run it.
     * This check comes after the "last" check, so "next" wins when both
     * qualify.
     */
    if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
        se = cfs_rq->next;

    /*
     * Once a next or last hint has been used it has to be cleared, so that
     * it does not affect the following pick.
     */
    clear_buddies(cfs_rq, se);

    return se;
}

向树中加入进程

enqueue_entity完成了进程真正的入队操作, 其具体流程如下所示

更新一些统计量, 如update_curr, update_cfs_shares等
如果进程此前是在睡眠状态, 则调用place_entity中首先会调整进程的虚拟运行时间
最后如果进程最近在运行, 其虚拟运行时间仍然有效, 那么则直接用__enqueue_entity加入到红黑树

首先如果进程最近正在运行, 其虚拟运行时间仍然有效, 那么(除非它当前在执行中)它可以直接用__enqueue_entity插入到红黑树, 该函数需要处理一些红黑树的机制, 这可以依靠内核的标准实现, 参见__enqueue_entity函数,

/*
 * enqueue_entity() - put a scheduling entity onto a CFS run queue.
 * @cfs_rq: run queue receiving the entity
 * @se:     entity being enqueued
 * @flags:  ENQUEUE_* flags; ENQUEUE_WAKEUP and ENQUEUE_WAKING are tested here
 *
 * Updates the accounting of the running entity, adjusts se->vruntime,
 * inserts se into the red-black tree unless it is the running entity, and
 * marks it as on the run queue.
 */
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    /*
     * Update the normalized vruntime before updating min_vruntime
     * through calling update_curr().
     *
     * When the entity is not being enqueued by a wakeup (or ENQUEUE_WAKING
     * is set), add cfs_rq->min_vruntime to its vruntime.  This mirrors the
     * subtraction dequeue_entity() performs when DEQUEUE_SLEEP is not set.
     */
    if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
        se->vruntime += cfs_rq->min_vruntime;

    /*
     * Update run-time statistics of the 'current'.
     * Then update the load and accounting figures for the new entity.
     */
    update_curr(cfs_rq);
    enqueue_entity_load_avg(cfs_rq, se);
    account_entity_enqueue(cfs_rq, se);
    update_cfs_shares(cfs_rq);

    /*  The entity was asleep and has just been woken up.  */
    if (flags & ENQUEUE_WAKEUP)
    {
        /*  Adjust the entity's virtual run time (place_entity() is not shown here).  */
        place_entity(cfs_rq, se, 0);
        if (schedstat_enabled())
            enqueue_sleeper(cfs_rq, se);
    }

    check_schedstat_required();
    if (schedstat_enabled()) {
        update_stats_enqueue(cfs_rq, se);
        check_spread(cfs_rq, se);
    }

    /*  Insert the entity into the red-black tree, unless it is the running entity.  */
    if (se != cfs_rq->curr)
        __enqueue_entity(cfs_rq, se);
    se->on_rq = 1;

    /* Extra bookkeeping when this is the only entity counted on the cfs_rq. */
    if (cfs_rq->nr_running == 1) {
        list_add_leaf_cfs_rq(cfs_rq);
        check_enqueue_throttle(cfs_rq);
    }
}

从红黑树中删除，进程变为了挂起等待的进程

/*
 * dequeue_entity() - take a scheduling entity off a CFS run queue.
 * @cfs_rq: run queue the entity is removed from
 * @se:     entity being dequeued
 * @flags:  DEQUEUE_* flags; DEQUEUE_SLEEP is tested here
 *
 * Updates the accounting of the running entity, drops any buddy hints that
 * refer to se, removes se from the red-black tree unless it is the running
 * entity, and marks it as no longer on the run queue.
 */
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    /*
     * Update run-time statistics of the 'current'.
     */
    update_curr(cfs_rq);
    dequeue_entity_load_avg(cfs_rq, se);

    if (schedstat_enabled())
        update_stats_dequeue(cfs_rq, se, flags);

    clear_buddies(cfs_rq, se);

    /* The running entity is not unlinked from the tree here. */
    if (se != cfs_rq->curr)
        __dequeue_entity(cfs_rq, se);
    se->on_rq = 0;
    account_entity_dequeue(cfs_rq, se);

    /*
     * Normalize the entity after updating the min_vruntime because the
     * update can refer to the ->curr item and we need to reflect this
     * movement in our normalized position.
     *
     * Skipped when the entity is going to sleep (DEQUEUE_SLEEP).
     */
    if (!(flags & DEQUEUE_SLEEP))
        se->vruntime -= cfs_rq->min_vruntime;

    /* return excess runtime on last dequeue */
    return_cfs_rq_runtime(cfs_rq);

    update_min_vruntime(cfs_rq);
    update_cfs_shares(cfs_rq);
}

/*
 * __dequeue_entity() - unlink an entity's node from the CFS red-black tree.
 * @cfs_rq: run queue whose tasks_timeline tree holds the entity
 * @se:     entity to remove
 *
 * If the entity is the cached leftmost node, the cache is first moved on to
 * the node's in-order successor.
 */
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    struct rb_node *victim = &se->run_node;

    /* Keep the leftmost cache valid before the node goes away. */
    if (cfs_rq->rb_leftmost == victim)
        cfs_rq->rb_leftmost = rb_next(victim);

    rb_erase(victim, &cfs_rq->tasks_timeline);
}

4.5.3 调度器入口

调度器的核心函数是schedule，调用pick_next_task选择下一个要被调度的进程

schedule->__schedule->pick_next_task

static inline struct task_struct *
pick_next_task(struct rq *rq， struct task_struct *prev， struct rq_flags *rf)
{
    const struct sched_class *class;
    struct task_struct *p;

    if (likely((prev->sched_class == &idle_sched_class ||
            prev->sched_class == &fair_sched_class) &&
           rq->nr_running == rq->cfs.h_nr_running)) {     ........1

        p = fair_sched_class.pick_next_task(rq， prev， rf);  .....2
        if (unlikely(p == RETRY_TASK))
            goto again;

        /* Assumes fair_sched_class->next == idle_sched_class */
        if (unlikely(!p))
            p = idle_sched_class.pick_next_task(rq， prev， rf);

        return p;
    }

again:
    for_each_class(class) {            ...........................3
        p = class->pick_next_task(rq， prev， rf);
        if (p) {
            if (unlikely(p == RETRY_TASK))
                goto again;
            return p;
        }
    }
}

4.5.4 睡眠和唤醒

只记几个核心

//定义一个等待队列
DEFINE_WAIT

//加入等待队列
add_wait_queue

while (!condition) {

    //设置当前进程可被唤醒状态
    prepare_to_wait

    //调度
    schedule()
}

//移除等待队列
finish_wait

唤醒使用wake_up系列函数（如wake_up()）就可以

4.6 上下文切换和抢占

上下文切换核心是context_switch

/*
 * context_switch - switch to the new MM and the new thread's register state.
 * @rq:   run queue on which the switch takes place
 * @prev: task being switched out
 * @next: task being switched in
 * @rf:   passed on to prepare_lock_switch()
 *
 * The address space is handled first (lazily when @next has no mm of its
 * own), then switch_to() switches the register state and the stack.
 * Returns whatever finish_task_switch(prev) returns.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next, struct rq_flags *rf)
{
	prepare_task_switch(rq, prev, next);

	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	/*
	 * kernel -> kernel   lazy + transfer active
	 *   user -> kernel   lazy + mmgrab() active
	 *
	 * kernel ->   user   switch + mmdrop() active
	 *   user ->   user   switch
	 */
	if (!next->mm) {                                // to kernel
		enter_lazy_tlb(prev->active_mm, next);

		/* next has no mm of its own: it borrows prev's active_mm. */
		next->active_mm = prev->active_mm;
		if (prev->mm)                           // from user
			mmgrab(prev->active_mm);
		else
			prev->active_mm = NULL;
	} else {                                        // to user
		membarrier_switch_mm(rq, prev->active_mm, next->mm);
		/*
		 * sys_membarrier() requires an smp_mb() between setting
		 * rq->curr / membarrier_switch_mm() and returning to userspace.
		 *
		 * The below provides this either through switch_mm(), or in
		 * case 'prev->active_mm == next->mm' through
		 * finish_task_switch()'s mmdrop().
		 */
		switch_mm_irqs_off(prev->active_mm, next->mm, next);

		if (!prev->mm) {                        // from kernel
			/* will mmdrop() in finish_task_switch(). */
			rq->prev_mm = prev->active_mm;
			prev->active_mm = NULL;
		}
	}

	/* Clear the clock-update skip flags on the rq. */
	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);

	prepare_lock_switch(rq, next, rf);

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);
	barrier();

	return finish_task_switch(prev);
}

上下文切换的分类：

根据引发context switch的原因，又分为两种情况——CSWCH（自愿上下文切换）和NVCSWCH（非自愿上下文切换），详见man pidstat中对cswch/s和nvcswch/s的说明。

症状：上下文切换过多，会导致sys系统态CPU增加，换句话说CPU资源都被内核用了
1. 如果是CSWCH过多，那要定位是什么资源短缺导致频繁context switch影响系统性能
2. 如果是NVCSWCH过多，那要看是哪里导致那么多R态进程等待调度运行

switch_mm()负责把虚拟内存映射从旧进程切换到新进程，switch_to()负责切换处理器状态（寄存器和栈）。