Linux进程调度:完全公平调度(CFS)下新进程的创建与唤醒

本文解析了Linux内核中进程创建时的调度策略,包括如何初始化新进程的虚拟运行时间,以及如何确保新进程不会因初始虚拟运行时间优势而长期占据调度资源。此外,还介绍了sysctl参数对进程调度的影响。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

创建新的进程时,如何调度这个进程呢?如果新创建的进程vruntime为0,那么它将长期保持调度优势,这显然是不合理的。kernel/sched/core.c中定义了sched_fork函数来处理新创建进程时的情况。

/*

* fork()/clone()-time setup:

*/

int sched_fork(unsigned long clone_flags, struct task_struct *p)

{

unsigned long flags;

int cpu = get_cpu();

__sched_fork(clone_flags, p);

/*

* We mark the process as NEW here. This guarantees that

* nobody will actually run it, and a signal or other external

* event cannot wake it up and insert it on the runqueue either.

*/

p->state = TASK_NEW;

/*

* Make sure we do not leak PI boosting priority to the child.

*/

p->prio = current->normal_prio;

/*

* Revert to default priority/policy on fork if requested.

*/

if (unlikely(p->sched_reset_on_fork)) {

if (task_has_dl_policy(p) || task_has_rt_policy(p)) {

p->policy = SCHED_NORMAL;

p->static_prio = NICE_TO_PRIO(0);

p->rt_priority = 0;

} else if (PRIO_TO_NICE(p->static_prio) < 0)

p->static_prio = NICE_TO_PRIO(0);

p->prio = p->normal_prio = __normal_prio(p);

set_load_weight(p);

/*

* We don't need the reset flag anymore after the fork. It has

* fulfilled its duty:

*/

p->sched_reset_on_fork = 0;

}

if (dl_prio(p->prio)) {

put_cpu();

return -EAGAIN;

} else if (rt_prio(p->prio)) {

p->sched_class = &rt_sched_class;

} else {

p->sched_class = &fair_sched_class;

}

init_entity_runnable_average(&p->se);

/*

* The child is not yet in the pid-hash so no cgroup attach races,

* and the cgroup is pinned to this child due to cgroup_fork()

* is ran before sched_fork().

*

* Silence PROVE_RCU.

*/

raw_spin_lock_irqsave(&p->pi_lock, flags);

/*

* We're setting the cpu for the first time, we don't migrate,

* so use __set_task_cpu().

*/

__set_task_cpu(p, cpu);

if (p->sched_class->task_fork)

p->sched_class->task_fork(p);

raw_spin_unlock_irqrestore(&p->pi_lock, flags);

#ifdef CONFIG_SCHED_INFO

if (likely(sched_info_on()))

memset(&p->sched_info, 0, sizeof(p->sched_info));

#endif

#if defined(CONFIG_SMP)

p->on_cpu = 0;

#endif

init_task_preempt_count(p);

#ifdef CONFIG_SMP

plist_node_init(&p->pushable_tasks, MAX_PRIO);

RB_CLEAR_NODE(&p->pushable_dl_tasks);

#endif

put_cpu();

return 0;

}

其中,task_fork在完全公平调度类中对应的是kernel/sched/fair.c中的task_fork_fair。

/*

* called on fork with the child task as argument from the parent's context

* - child not yet on the tasklist

* - preemption disabled

*/

static void task_fork_fair(struct task_struct *p)

{

struct cfs_rq *cfs_rq;

struct sched_entity *se = &p->se, *curr;

struct rq *rq = this_rq();

raw_spin_lock(&rq->lock);

update_rq_clock(rq);

cfs_rq = task_cfs_rq(current);

curr = cfs_rq->curr;

if (curr) {

update_curr(cfs_rq);

se->vruntime = curr->vruntime;

}

/*调整虚拟运行时间*/

place_entity(cfs_rq, se, 1);

if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {

/*

* Upon rescheduling, sched_class::put_prev_task() will place

* 'current' within the tree based on its new key value.

*/

swap(curr->vruntime, se->vruntime);

resched_curr(rq);

}

se->vruntime -= cfs_rq->min_vruntime;

raw_spin_unlock(&rq->lock);

}

static void

place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)

{

u64 vruntime = cfs_rq->min_vruntime;

/*

* The 'current' period is already promised to the current tasks,

* however the extra weight of the new task will slow them down a

* little, place the new task so that it fits in the slot that

* stays open at the end.

*/

if (initial && sched_feat(START_DEBIT))

vruntime += sched_vslice(cfs_rq, se);

/*sleeps up to a single latency don't count.*/

if (!initial) {

unsigned long thresh = sysctl_sched_latency;

/*

* Halve their sleep time's effect, to allow

* for a gentler effect of sleepers:

*/

if (sched_feat(GENTLE_FAIR_SLEEPERS))

thresh >>= 1;

vruntime -= thresh;

}

/*ensure we never gain time by being placed backwards.*/

se->vruntime = max_vruntime(se->vruntime, vruntime);

}

如果没有开启START_DEBIT,子进程的虚拟运行时间取父进程的虚拟运行时间与CFS运行队列最小虚拟运行时间中的较大值(max_vruntime保证进程不会因为被放置到更靠前的位置而凭空获得时间)。如果设置了START_DEBIT,会通过增大虚拟运行时间来惩罚新创建的进程,增加的时间为一个虚拟时间片。

注意到sysctl_sched_child_runs_first那一行,可以指定

/proc/sys/kernel/sched_child_runs_first为1使子进程优先获得调度,如果是0,则

父进程优先获得调度。但这只是一个偏好设置,并不是保证。

再看这一行:

se->vruntime -= cfs_rq->min_vruntime;

在多处理器结构中,新创建的进程和父进程不一定在同一个CPU上,不同CPU运行队列的min_vruntime可能相差较大。为了消除这个差距,在迁移之前减去所在CPU运行队列的最小虚拟运行时间;在迁移后入队时,再加上目标CPU运行队列的最小虚拟运行时间。在enqueue_task中可以看到vruntime被重新加回来,enqueue_task在完全公平调度类中对应的是enqueue_task_fair。

/*

* called on fork with the child task as argument from the parent's context

* - child not yet on the tasklist

* - preemption disabled

*/

static void task_fork_fair(struct task_struct *p)

{

struct cfs_rq *cfs_rq;

struct sched_entity *se = &p->se, *curr;

struct rq *rq = this_rq();

raw_spin_lock(&rq->lock);

update_rq_clock(rq);

cfs_rq = task_cfs_rq(current);

curr = cfs_rq->curr;

if (curr) {

update_curr(cfs_rq);

se->vruntime = curr->vruntime;

}

place_entity(cfs_rq, se, 1);

if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {

/*

* Upon rescheduling, sched_class::put_prev_task() will place

* 'current' within the tree based on its new key value.

*/

swap(curr->vruntime, se->vruntime);

resched_curr(rq);

}

se->vruntime -= cfs_rq->min_vruntime;

raw_spin_unlock(&rq->lock);

}

static void

place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)

{

u64 vruntime = cfs_rq->min_vruntime;

/*

* The 'current' period is already promised to the current tasks,

* however the extra weight of the new task will slow them down a

* little, place the new task so that it fits in the slot that

* stays open at the end.

*/

if (initial && sched_feat(START_DEBIT))

vruntime += sched_vslice(cfs_rq, se);

/*sleeps up to a single latency don't count.*/

if (!initial) {

unsigned long thresh = sysctl_sched_latency;

/*

* Halve their sleep time's effect, to allow

* for a gentler effect of sleepers:

*/

if (sched_feat(GENTLE_FAIR_SLEEPERS))

thresh >>= 1;

vruntime -= thresh;

}

/*ensure we never gain time by being placed backwards.*/

se->vruntime = max_vruntime(se->vruntime, vruntime);

}

try_to_wake_up负责将睡眠进程唤醒,对应代码在kernel/sched/core.c中,其中也使用了enqueue_task_fair。在place_entity中可以看到,当initial为0即进程被唤醒时,进程获得的虚拟运行时间下限是最小虚拟运行时间减去一个调度延迟(sysctl_sched_latency);若开启GENTLE_FAIR_SLEEPERS,则只减去半个调度延迟。

无论是创建新进程还是通过try_to_wake_up唤醒进程,最后都会调用check_preempt_wakeup,检查被唤醒或者新创建的进程是否可以抢占当前进程。

/*

* Preempt the current task with a newly woken task if needed:

*/

static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)

{

struct task_struct *curr = rq->curr;

struct sched_entity *se = &curr->se, *pse = &p->se;

struct cfs_rq *cfs_rq = task_cfs_rq(curr);

int scale = cfs_rq->nr_running >= sched_nr_latency;

int next_buddy_marked = 0;

if (unlikely(se == pse))

return;

/*

* This is possible from callers such as attach_tasks(), in which we

* unconditionally check_prempt_curr() after an enqueue (which may have

* lead to a throttle). This both saves work and prevents false

* next-buddy nomination below.

*/

if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))

return;

if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {

set_next_buddy(pse);

next_buddy_marked = 1;

}

/*

* We can come here with TIF_NEED_RESCHED already set from new task

* wake up path.

*

* Note: this also catches the edge-case of curr being in a throttled

* group (e.g. via set_curr_task), since update_curr() (in the

* enqueue of curr) will have resulted in resched being set. This

* prevents us from potentially nominating it as a false LAST_BUDDY

* below.

*/

if (test_tsk_need_resched(curr))

return;

/*Idle tasks are by definition preempted by non-idle tasks.*/

if (unlikely(curr->policy == SCHED_IDLE) &&

likely(p->policy != SCHED_IDLE))

goto preempt;

/*

* Batch and idle tasks do not preempt non-idle tasks (their preemption

* is driven by the tick):

*/

if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))

return;

find_matching_se(&se, &pse);

update_curr(cfs_rq_of(se));

BUG_ON(!pse);

if (wakeup_preempt_entity(se, pse) == 1) {

/*

* Bias pick_next to pick the sched entity that is

* triggering this preemption.

*/

if (!next_buddy_marked)

set_next_buddy(pse);

goto preempt;

}

return;

preempt:

resched_curr(rq);

/*

* Only set the backward buddy when the current task is still

* on the rq. This can happen when a wakeup gets interleaved

* with schedule on the ->pre_schedule() or idle_balance()

* point, either of which can * drop the rq lock.

*

* Also, during early boot the idle thread is in the fair class,

* for obvious reasons its a bad idea to schedule back to it.

*/

if (unlikely(!se->on_rq || curr == rq->idle))

return;

if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))

set_last_buddy(se);

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值