From 6d5ab2932a21ea54406ab95c43ecff90a3eddfda Mon Sep 17 00:00:00 2001
From: Paul Turner
Date: Fri, 21 Jan 2011 20:45:01 -0800
Subject: sched: Simplify update_cfs_shares parameters

Re-visiting this: Since update_cfs_shares will now only ever re-weight an
entity that is a relative parent of the current entity in enqueue_entity,
we can safely issue the account_entity_enqueue relative to that cfs_rq and
avoid the requirement for special handling of the enqueue case in
update_cfs_shares.

Signed-off-by: Paul Turner
Signed-off-by: Peter Zijlstra
LKML-Reference: <20110122044851.915214637@google.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4..e0fa3ff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8510,7 +8510,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		/* Propagate contribution to hierarchy */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		for_each_sched_entity(se)
-			update_cfs_shares(group_cfs_rq(se), 0);
+			update_cfs_shares(group_cfs_rq(se));
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
--
cgit v1.1

From f07333bf6ee66d9b49286cec4371cf375e745b7a Mon Sep 17 00:00:00 2001
From: Paul Turner
Date: Fri, 21 Jan 2011 20:45:03 -0800
Subject: sched: Avoid expensive initial update_cfs_load()

Since cfs->{load_stamp,load_last} are zero-initialized, the initial load
update will consider the delta to be 'since the beginning of time'.

This results in a lot of pointless divisions to bring this large period
to be within the sysctl_sched_shares_window.

Fix this by initializing load_stamp to be 1 at cfs_rq initialization;
this allows for an initial load_stamp > load_last, which then lets
standard idle truncation proceed.

We avoid spinning (and slightly improve consistency) by fixing delta to
be [period - 1] in this path, resulting in a slightly more predictable
shares ramp.  (Previously the amount of idle time preserved by the
overflow would range between [period/2, period-1].)

Signed-off-by: Paul Turner
Signed-off-by: Peter Zijlstra
LKML-Reference: <20110122044852.102126037@google.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index e0fa3ff..6820b5b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7796,6 +7796,8 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 	INIT_LIST_HEAD(&cfs_rq->tasks);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
+	/* allow initial update_cfs_load() to truncate */
+	cfs_rq->load_stamp = 1;
 #endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
--
cgit v1.1
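The effect the changelog above describes can be seen with a small user-space
model (illustration only, not kernel code; the window length and the decay
step are simplified assumptions):

#include <stdio.h>

#define WINDOW_NS 10000000ULL	/* assumed ~10ms sysctl_sched_shares_window */

static unsigned long decay_steps(unsigned long long delta_ns)
{
	unsigned long steps = 0;

	/* stand-in for the per-window decay work done on a load update */
	while (delta_ns >= WINDOW_NS) {
		delta_ns -= WINDOW_NS;
		steps++;
	}
	return steps;
}

int main(void)
{
	unsigned long long now = 1000000000000ULL;	/* "current" clock, in ns */

	/* load_stamp == 0: the very first update sees a delta since boot */
	printf("zero-initialized: %lu decay steps\n", decay_steps(now));

	/* load_stamp = 1 (> load_last): idle truncation clamps the delta to
	 * (period - 1), so the first update does a bounded amount of work */
	printf("load_stamp = 1  : %lu decay steps\n", decay_steps(WINDOW_NS - 1));
	return 0;
}

The second case is what the patch arranges for: a bounded, predictable amount
of work on the very first update instead of one step per window since boot.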
From 4dd53d891ca46dcc1fde0376a33540d3fd83cb9a Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Tue, 21 Dec 2010 17:09:00 -0800
Subject: softirqs: Free up pf flag PF_KSOFTIRQD

Cleanup patch, freeing up PF_KSOFTIRQD and using a per_cpu ksoftirqd
pointer instead, as suggested by Eric Dumazet.

Tested-by: Shaun Ruffell
Signed-off-by: Venkatesh Pallipadi
Signed-off-by: Peter Zijlstra
LKML-Reference: <1292980144-28796-2-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6820b5b..8b89b3b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1880,7 +1880,7 @@ void account_system_vtime(struct task_struct *curr)
 	 */
 	if (hardirq_count())
 		__this_cpu_add(cpu_hardirq_time, delta);
-	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
 		__this_cpu_add(cpu_softirq_time, delta);

 	irq_time_write_end();
--
cgit v1.1

From 70a89a6620f658d47a1488515bada4b8ee6291d8 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Tue, 21 Dec 2010 17:09:02 -0800
Subject: sched: Refactor account_system_time separating id-update

Refactor account_system_time to separate the logic that identifies which
field needs updating from the code that does the actual update. This is
used by a following patch for IRQ_TIME_ACCOUNTING, which has different
identification logic but the same update logic.

Tested-by: Shaun Ruffell
Signed-off-by: Venkatesh Pallipadi
Signed-off-by: Peter Zijlstra
LKML-Reference: <1292980144-28796-4-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 46 +++++++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 15 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8b89b3b..e3fa921 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3568,6 +3568,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 }

 /*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+			cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+	cputime64_t tmp = cputime_to_cputime64(cputime);
+
+	/* Add system time to process. */
+	p->stime = cputime_add(p->stime, cputime);
+	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+	account_group_system_time(p, cputime);
+
+	/* Add system time to cpustat. */
+	*target_cputime64 = cputime64_add(*target_cputime64, tmp);
+	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+	/* Account for system time used */
+	acct_update_integrals(p);
+}
+
+/*
  * Account system cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,31 +3604,21 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 			 cputime_t cputime, cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t tmp;
+	cputime64_t *target_cputime64;

 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 		account_guest_time(p, cputime, cputime_scaled);
 		return;
 	}

-	/* Add system time to process. */
-	p->stime = cputime_add(p->stime, cputime);
-	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-	account_group_system_time(p, cputime);
-
-	/* Add system time to cpustat. */
-	tmp = cputime_to_cputime64(cputime);
 	if (hardirq_count() - hardirq_offset)
-		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+		target_cputime64 = &cpustat->irq;
 	else if (in_serving_softirq())
-		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+		target_cputime64 = &cpustat->softirq;
 	else
-		cpustat->system = cputime64_add(cpustat->system, tmp);
+		target_cputime64 = &cpustat->system;

-	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
-
-	/* Account for system time used */
-	acct_update_integrals(p);
+	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }

 /*
--
cgit v1.1
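The refactoring pattern above, choosing the target cpustat field first and
then handing it to one shared update helper, can be sketched in plain C
(illustrative only; the names and types are simplified stand-ins for the
kernel ones):

#include <stdint.h>
#include <stdio.h>

struct cpustat { uint64_t irq, softirq, system; };

/* shared update step, in the spirit of __account_system_time() */
static void account_to(uint64_t *target, uint64_t delta)
{
	*target += delta;
}

/* identification step, in the spirit of the new account_system_time() */
static void account_system(struct cpustat *stat, int in_hardirq,
			   int in_softirq, uint64_t delta)
{
	uint64_t *target;

	if (in_hardirq)
		target = &stat->irq;
	else if (in_softirq)
		target = &stat->softirq;
	else
		target = &stat->system;

	account_to(target, delta);
}

int main(void)
{
	struct cpustat stat = { 0 };

	account_system(&stat, 0, 1, 1000);
	printf("softirq=%llu\n", (unsigned long long)stat.softirq);
	return 0;
}

Keeping the update in one place is what allows the next patch to reuse it
with a different identification step.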
From abb74cefa9c682fb38ba86c17ca3c86fed6cc464 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Tue, 21 Dec 2010 17:09:03 -0800
Subject: sched: Export ns irqtimes through /proc/stat

CONFIG_IRQ_TIME_ACCOUNTING adds ns granularity irq time on each CPU.
This info is already used in the scheduler to do proper task chargeback
(earlier patches). This patch retro-fits this ns granularity hardirq and
softirq information to the /proc/stat irq and softirq fields.

The update is still done on timer tick, where we look at the accumulated
ns hardirq/softirq time and account the tick to user/system/irq/hardirq/guest
accordingly.

No new interface is added.

Earlier versions looked at adding this as new fields in some /proc files.
This one seems to be the best in terms of impact to existing apps, even
though it has somewhat more kernel code than earlier versions.

Tested-by: Shaun Ruffell
Signed-off-by: Venkatesh Pallipadi
Signed-off-by: Peter Zijlstra
LKML-Reference: <1292980144-28796-5-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index e3fa921..2a3c979 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1920,8 +1920,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	sched_rt_avg_update(rq, irq_delta);
 }

+static int irqtime_account_hi_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_hardirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_softirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */

+#define sched_clock_irqtime	(0)
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 	rq->clock_task += delta;
@@ -3621,6 +3653,65 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }

+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq)
+{
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+	if (irqtime_account_hi_update()) {
+		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+	} else if (irqtime_account_si_update()) {
+		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	} else if (user_tick) {
+		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else if (p == rq->idle) {
+		account_idle_time(cputime_one_jiffy);
+	} else if (p->flags & PF_VCPU) { /* System time or guest time */
+		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else {
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->system);
+	}
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+	int i;
+	struct rq *rq = this_rq();
+
+	for (i = 0; i < ticks; i++)
+		irqtime_account_process_tick(current, 0, rq);
+}
+#else
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq) {}
+#endif
+
 /*
  * Account for involuntary wait time.
  * @steal: the cpu time spent in involuntary wait
@@ -3661,6 +3752,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 	struct rq *rq = this_rq();

+	if (sched_clock_irqtime) {
+		irqtime_account_process_tick(p, user_tick, rq);
+		return;
+	}
+
 	if (user_tick)
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3686,6 +3782,12 @@ void account_steal_ticks(unsigned long ticks)
  */
 void account_idle_ticks(unsigned long ticks)
 {
+
+	if (sched_clock_irqtime) {
+		irqtime_account_idle_ticks(ticks);
+		return;
+	}
+
 	account_idle_time(jiffies_to_cputime(ticks));
 }
--
cgit v1.1
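The tick demultiplexing order documented in the comment above can be modelled
with a small stand-alone C program (a sketch under simplified assumptions, not
the kernel implementation; the predicates stand in for
irqtime_account_hi_update() and friends):

#include <stdio.h>

enum bucket { HARDIRQ, SOFTIRQ, USER, IDLE, GUEST, SYSTEM };

static enum bucket classify_tick(int hi_pending, int si_pending,
				 int user_tick, int is_idle, int is_guest)
{
	if (hi_pending)		/* ns hardirq time accumulated since last tick */
		return HARDIRQ;
	if (si_pending)		/* ns softirq time accumulated since last tick */
		return SOFTIRQ;
	if (user_tick)
		return USER;
	if (is_idle)
		return IDLE;
	if (is_guest)
		return GUEST;
	return SYSTEM;
}

int main(void)
{
	/* a tick arriving in user context, but with pending softirq time:
	 * the pending irq time wins, matching the documented order */
	printf("%d\n", classify_tick(0, 1, 1, 0, 0));	/* prints SOFTIRQ (1) */
	return 0;
}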
From 414bee9ba613adb3804965e2d84db32d0599f9c6 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Tue, 21 Dec 2010 17:09:04 -0800
Subject: softirqs: Account ksoftirqd time as cpustat softirq

Softirq time spent in ksoftirqd context is not accounted in the ns
granularity per-cpu softirq stats, as we want that time to be part of
ksoftirqd's exec_runtime. Account it separately as softirq in /proc/stat.

Tested-by: Shaun Ruffell
Signed-off-by: Venkatesh Pallipadi
Signed-off-by: Peter Zijlstra
LKML-Reference: <1292980144-28796-6-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2a3c979..8b718b5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3686,6 +3686,14 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
 	} else if (irqtime_account_si_update()) {
 		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	} else if (this_cpu_ksoftirqd() == p) {
+		/*
+		 * ksoftirqd time do not get accounted in cpu_softirq_time.
+		 * So, we have to handle it separately here.
+		 * Also, p->stime needs to be updated for ksoftirqd.
+		 */
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->softirq);
 	} else if (user_tick) {
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	} else if (p == rq->idle) {
--
cgit v1.1

From da7a735e51f9622eb3e1672594d4a41da01d7e4f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 17 Jan 2011 17:03:27 +0100
Subject: sched: Fix switch_from_fair()

When a task is taken out of the fair class we must ensure the vruntime
is properly normalized, because when we put it back in it will be assumed
to be normalized.

The case that goes wrong is when changing away from the fair class while
sleeping. Sleeping tasks have non-normalized vruntime in order to make
sleeper-fairness work. So treat the switch away from fair as a wakeup and
preserve the relative vruntime.

Also update sysrq-n to call the ->switch_{to,from} methods.

Reported-by: Onkalo Samu
Signed-off-by: Peter Zijlstra
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8b718b5..78fa753 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2057,14 +2057,14 @@ inline int task_curr(const struct task_struct *p)

 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
-				       int oldprio, int running)
+				       int oldprio)
 {
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
-			prev_class->switched_from(rq, p, running);
-		p->sched_class->switched_to(rq, p, running);
-	} else
-		p->sched_class->prio_changed(rq, p, oldprio, running);
+			prev_class->switched_from(rq, p);
+		p->sched_class->switched_to(rq, p);
+	} else if (oldprio != p->prio)
+		p->sched_class->prio_changed(rq, p, oldprio);
 }

 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2598,6 +2598,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
+	p->se.vruntime			= 0;

 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -4696,11 +4697,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)

 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
 		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);

-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, &flags);
 }

@@ -5028,11 +5028,10 @@ recheck:

 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
 		activate_task(rq, p, 0);

-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
 	__task_rq_unlock(rq);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

@@ -8237,6 +8236,8 @@ EXPORT_SYMBOL(__might_sleep);
 #ifdef CONFIG_MAGIC_SYSRQ
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
+	const struct sched_class *prev_class = p->sched_class;
+	int old_prio = p->prio;
 	int on_rq;

 	on_rq = p->se.on_rq;
@@ -8247,6 +8248,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 		activate_task(rq, p, 0);
 		resched_task(rq->curr);
 	}
+
+	check_class_changed(rq, p, prev_class, old_prio);
 }

 void normalize_rt_tasks(void)
--
cgit v1.1
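A minimal numeric model of the normalization issue fixed above (illustrative
only; the values are arbitrary and the helpers are not the kernel ones):

#include <stdio.h>

int main(void)
{
	unsigned long long min_vruntime = 1000000;	/* cfs_rq->min_vruntime */
	unsigned long long se_vruntime  = 1000250;	/* sleeping task, absolute */

	/* treating the switch away from fair as a wakeup keeps only the
	 * relative part of the sleeper's vruntime */
	unsigned long long normalized = se_vruntime - min_vruntime;

	/* when the task later rejoins the fair class, enqueue re-adds the
	 * (possibly much larger) current min_vruntime, so the task is not
	 * left hugely ahead of or behind everyone else */
	unsigned long long later_min = 5000000;

	printf("re-enqueued vruntime: %llu\n", normalized + later_min);
	return 0;
}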
From 6ea72f12069306b235151c5b05ac0cca7e1dedfa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 26 Jan 2011 13:36:03 +0100
Subject: sched: Avoid expensive initial update_cfs_load(), on UP too

Fix the build on UP.

Signed-off-by: Peter Zijlstra
Cc: Paul Turner
LKML-Reference: <20110122044852.102126037@google.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 78fa753..477e1bc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7922,8 +7922,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
 	/* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
 	cfs_rq->load_stamp = 1;
 #endif
+#endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
--
cgit v1.1

From ac53db596cc08ecb8040cfb6f71ae40c6f2041c4 Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Tue, 1 Feb 2011 09:51:03 -0500
Subject: sched: Use a buddy to implement yield_task_fair()

Use the buddy mechanism to implement yield_task_fair. This allows us to
skip onto the next highest priority se at every level in the CFS tree,
unless doing so would introduce gross unfairness in CPU time distribution.

We order the buddy selection in pick_next_entity to check yield first,
then last, then next. We need next to be able to override yield, because
it is possible for the "next" and "yield" task to be different processes
in the same sub-tree of the CFS tree. When they are, we need to go into
that sub-tree regardless of the "yield" hint, and pick the correct entity
once we get to the right level.

Signed-off-by: Rik van Riel
Signed-off-by: Peter Zijlstra
LKML-Reference: <20110201095103.3a79e92a@annuminas.surriel.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 477e1bc..ae5e1a1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -324,7 +324,7 @@ struct cfs_rq {
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr, *next, *last;
+	struct sched_entity *curr, *next, *last, *skip;

 	unsigned int nr_spread_over;
--
cgit v1.1
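A rough user-space sketch of the buddy-selection order described in the
changelog above (not the sched_fair.c implementation; the fairness check,
the runner-up lookup, and the granularity constant are simplified
assumptions):

#include <stdio.h>

struct entity { const char *name; long long vruntime; };

#define GRAN 1000	/* stand-in for the wakeup-granularity fairness bound */

/* buddy is acceptable only if it is not too far right of the leftmost */
static int acceptable(const struct entity *left, const struct entity *buddy)
{
	return buddy && (buddy->vruntime - left->vruntime) < GRAN;
}

/* order from the changelog: honour 'skip' (yield) first, then 'last',
 * and let 'next' override both */
static const struct entity *pick_next(const struct entity *leftmost,
				      const struct entity *second,
				      const struct entity *skip,
				      const struct entity *last,
				      const struct entity *next)
{
	const struct entity *se = leftmost;

	if (se == skip && acceptable(leftmost, second))
		se = second;		/* yielded: take the runner-up */
	if (acceptable(leftmost, last))
		se = last;		/* prefer the cache-hot 'last' */
	if (acceptable(leftmost, next))
		se = next;		/* 'next' overrides 'yield' and 'last' */
	return se;
}

int main(void)
{
	struct entity a = { "A", 100 }, b = { "B", 200 }, c = { "C", 300 };

	/* A yielded (skip buddy), C was marked 'next': C gets picked */
	printf("picked: %s\n", pick_next(&a, &b, &a, NULL, &c)->name);
	return 0;
}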
From d95f412200652694e63e64bfd49f0ae274a54479 Mon Sep 17 00:00:00 2001
From: Mike Galbraith
Date: Tue, 1 Feb 2011 09:50:51 -0500
Subject: sched: Add yield_to(task, preempt) functionality

Currently only implemented for fair class tasks.

Add a yield_to_task() method to the fair scheduling class, allowing the
caller of yield_to() to accelerate another thread in its thread group /
task group.

Implemented via a scheduler hint, using cfs_rq->next to encourage the
target being selected. We can rely on pick_next_entity to keep things
fair, so no one can accelerate a thread that has already used its fair
share of CPU time.

This also means callers should only call yield_to when they really mean
it. Calling it too often can result in the scheduler just ignoring the
hint.

Signed-off-by: Rik van Riel
Signed-off-by: Marcelo Tosatti
Signed-off-by: Mike Galbraith
Signed-off-by: Peter Zijlstra
LKML-Reference: <20110201095051.4ddb7738@annuminas.surriel.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index ae5e1a1..2effcb7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1686,6 +1686,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 		__release(rq2->lock);
 }

+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	BUG_ON(!irqs_disabled());
+	BUG_ON(rq1 != rq2);
+	raw_spin_lock(&rq1->lock);
+	__acquire(rq2->lock);	/* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+	__releases(rq1->lock)
+	__releases(rq2->lock)
+{
+	BUG_ON(rq1 != rq2);
+	raw_spin_unlock(&rq1->lock);
+	__release(rq2->lock);
+}
+
 #endif

 static void calc_load_account_idle(struct rq *this_rq);
@@ -5448,6 +5481,58 @@ void __sched yield(void)
 }
 EXPORT_SYMBOL(yield);

+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+	struct task_struct *curr = current;
+	struct rq *rq, *p_rq;
+	unsigned long flags;
+	bool yielded = 0;
+
+	local_irq_save(flags);
+	rq = this_rq();
+
+again:
+	p_rq = task_rq(p);
+	double_rq_lock(rq, p_rq);
+	while (task_rq(p) != p_rq) {
+		double_rq_unlock(rq, p_rq);
+		goto again;
+	}
+
+	if (!curr->sched_class->yield_to_task)
+		goto out;
+
+	if (curr->sched_class != p->sched_class)
+		goto out;
+
+	if (task_running(p_rq, p) || p->state)
+		goto out;
+
+	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+	if (yielded)
+		schedstat_inc(rq, yld_count);
+
+out:
+	double_rq_unlock(rq, p_rq);
+	local_irq_restore(flags);
+
+	if (yielded)
+		schedule();
+
+	return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
--
cgit v1.1
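The lock-and-recheck loop in yield_to() above can be mirrored in a small
pthread-based model (a user-space analogue only; 'rq' and 'task' here are
simplified stand-ins, not the kernel structures):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct rq { pthread_mutex_t lock; };
struct task { struct rq *rq; };	/* which queue the task is currently on */

static void double_lock(struct rq *a, struct rq *b)
{
	/* lock in a fixed (address) order to avoid deadlock, as
	 * double_rq_lock() does */
	if ((uintptr_t)a <= (uintptr_t)b) {
		pthread_mutex_lock(&a->lock);
		if (b != a)
			pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void double_unlock(struct rq *a, struct rq *b)
{
	pthread_mutex_unlock(&a->lock);
	if (b != a)
		pthread_mutex_unlock(&b->lock);
}

static void yield_to_model(struct rq *my_rq, struct task *p)
{
	struct rq *p_rq;

again:
	p_rq = p->rq;
	double_lock(my_rq, p_rq);
	if (p->rq != p_rq) {		/* target migrated meanwhile: retry */
		double_unlock(my_rq, p_rq);
		goto again;
	}
	/* ... hint the scheduler about p here ... */
	double_unlock(my_rq, p_rq);
}

int main(void)
{
	struct rq r0 = { PTHREAD_MUTEX_INITIALIZER };
	struct rq r1 = { PTHREAD_MUTEX_INITIALIZER };
	struct task t = { &r1 };

	yield_to_model(&r0, &t);
	puts("done");
	return 0;
}

The re-check after taking both locks is the key point: the snapshot of the
target's runqueue taken before locking may already be stale.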
From 7e9498705e810404ecf29bb2d6fa632b9484c609 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Fri, 25 Feb 2011 14:32:28 +0100
Subject: sched: Add #ifdef around irq time accounting functions

Get rid of this:

 kernel/sched.c:3731:13: warning: 'irqtime_account_idle_ticks' defined but not used
 kernel/sched.c:3732:13: warning: 'irqtime_account_process_tick' defined but not used

Signed-off-by: Heiko Carstens
Cc: Venkatesh Pallipadi
Cc: Peter Zijlstra
LKML-Reference: <20110225133228.GD7469@osiris.boeblingen.de.ibm.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2effcb7..79bca16 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3687,6 +3687,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }

+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 /*
  * Account a tick to a process and cpustat
@@ -3753,6 +3754,7 @@ static void irqtime_account_idle_ticks(int ticks) {}
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 						struct rq *rq) {}
 #endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */

 /*
  * Account for involuntary wait time.
--
cgit v1.1

From 544b4a1f309d18f40969dbab7e08bafd136b2f55 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Fri, 25 Feb 2011 15:13:16 -0800
Subject: sched: Clean up the IRQ_TIME_ACCOUNTING code

Fix this warning (lkml.org/lkml/2011/1/30/124):

 kernel/sched.c:3719: warning: 'irqtime_account_idle_ticks' defined but not used
 kernel/sched.c:3720: warning: 'irqtime_account_process_tick' defined but not used

in a cleaner way than commit:

 7e9498705e81: sched: Add #ifdef around irq time accounting functions

This patch will not have any functional impact.

Signed-off-by: Venkatesh Pallipadi
Cc: heiko.carstens@de.ibm.com
Cc: a.p.zijlstra@chello.nl
LKML-Reference: <1298675596-10992-1-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 64 ++++++++++++++++++++++++++++------------------------------
 1 file changed, 31 insertions(+), 33 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 79bca16..0c87126 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3687,7 +3687,36 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }

+/*
+ * Account for involuntary wait time.
+ * @cputime: the cpu time spent in involuntary wait
+ */
+void account_steal_time(cputime_t cputime)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t cputime64 = cputime_to_cputime64(cputime);
+
+	cpustat->steal = cputime64_add(cpustat->steal, cputime64);
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+void account_idle_time(cputime_t cputime)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t cputime64 = cputime_to_cputime64(cputime);
+	struct rq *rq = this_rq();
+
+	if (atomic_read(&rq->nr_iowait) > 0)
+		cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+	else
+		cpustat->idle = cputime64_add(cpustat->idle, cputime64);
+}
+
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
+
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 /*
  * Account a tick to a process and cpustat
@@ -3749,42 +3778,11 @@ static void irqtime_account_idle_ticks(int ticks)
 	for (i = 0; i < ticks; i++)
 		irqtime_account_process_tick(current, 0, rq);
 }
-#else
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
 static void irqtime_account_idle_ticks(int ticks) {}
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 						struct rq *rq) {}
-#endif
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
-
-/*
- * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
- */
-void account_steal_time(cputime_t cputime)
-{
-	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t cputime64 = cputime_to_cputime64(cputime);
-
-	cpustat->steal = cputime64_add(cpustat->steal, cputime64);
-}
-
-/*
- * Account for idle time.
- * @cputime: the cpu time spent in idle wait
- */
-void account_idle_time(cputime_t cputime)
-{
-	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t cputime64 = cputime_to_cputime64(cputime);
-	struct rq *rq = this_rq();
-
-	if (atomic_read(&rq->nr_iowait) > 0)
-		cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
-	else
-		cpustat->idle = cputime64_add(cpustat->idle, cputime64);
-}
-
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

 /*
  * Account a single tick of cpu time.
--
cgit v1.1

From c02aa73b1d18e43cfd79c2f193b225e84ca497c8 Mon Sep 17 00:00:00 2001
From: Darren Hart
Date: Thu, 17 Feb 2011 15:37:07 -0800
Subject: sched: Allow users with sufficient RLIMIT_NICE to change from SCHED_IDLE policy

The current scheduler implementation returns -EPERM when trying to
change from SCHED_IDLE to SCHED_OTHER or SCHED_BATCH. Since SCHED_IDLE
is considered to be a nice 20 on steroids, changing to another policy
should be allowed provided the RLIMIT_NICE is accounted for.

This patch allows the following test-case to pass with RLIMIT_NICE=40,
but still fail with RLIMIT_NICE=10 when the calling process is run from
a typical shell (nice 0, or 20 in rlimit terms).
#include <stdio.h>
#include <sched.h>

int main()
{
	int ret;
	struct sched_param sp;

	sp.sched_priority = 0;

	/* switch to SCHED_IDLE */
	ret = sched_setscheduler(0, SCHED_IDLE, &sp);
	printf("setscheduler IDLE: %d\n", ret);
	if (ret) return ret;

	/* switch back to SCHED_OTHER */
	ret = sched_setscheduler(0, SCHED_OTHER, &sp);
	printf("setscheduler OTHER: %d\n", ret);

	return ret;
}

 $ ulimit -e 40
 $ ./test
 setscheduler IDLE: 0
 setscheduler OTHER: 0

 $ ulimit -e 10
 $ ./test
 setscheduler IDLE: 0
 setscheduler OTHER: -1

Signed-off-by: Darren Hart
Signed-off-by: Peter Zijlstra
Cc: Richard Purdie
LKML-Reference: <4D657BEE.4040608@linux.intel.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0c87126..f303070 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4981,12 +4981,15 @@ recheck:
 			    param->sched_priority > rlim_rtprio)
 				return -EPERM;
 		}
+
 		/*
-		 * Like positive nice levels, dont allow tasks to
-		 * move out of SCHED_IDLE either:
+		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
+		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
-		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
-			return -EPERM;
+		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+			if (!can_nice(p, TASK_NICE(p)))
+				return -EPERM;
+		}

 		/* can't change other user's priorities */
 		if (!check_same_owner(p))
--
cgit v1.1

From 6d1cafd8b56ea726c10a5a104de57cc3ed8fa953 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Tue, 1 Mar 2011 16:28:21 -0800
Subject: sched: Resched proper CPU on yield_to()

yield_to_task_fair() has code to resched the CPU of the yielding task
when the intention is to resched the CPU of the task that is being
yielded to. The change here fixes the problem and also makes the resched
conditional on rq != p_rq.

Signed-off-by: Venkatesh Pallipadi
Reviewed-by: Rik van Riel
Signed-off-by: Peter Zijlstra
LKML-Reference: <1299025701-22168-1-git-send-email-venki@google.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index f303070..61452e8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5522,8 +5522,15 @@ again:
 		goto out;

 	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
-	if (yielded)
+	if (yielded) {
 		schedstat_inc(rq, yld_count);
+		/*
+		 * Make p's CPU reschedule; pick_next_entity takes care of
+		 * fairness.
+		 */
+		if (preempt && rq != p_rq)
+			resched_task(p_rq->curr);
+	}

 out:
 	double_rq_unlock(rq, p_rq);
--
cgit v1.1
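A minimal model of the fix above (illustrative only, not the kernel code):
after a successful yield, it is the target's runqueue that gets the resched
request, and only when the two tasks run on different runqueues.

#include <stdio.h>

struct rq { int cpu; int need_resched; };

static void resched(struct rq *rq)
{
	rq->need_resched = 1;
}

static void after_yield(struct rq *rq, struct rq *p_rq, int yielded, int preempt)
{
	if (yielded && preempt && rq != p_rq)
		resched(p_rq);		/* kick the target's CPU, not our own */
}

int main(void)
{
	struct rq mine = { 0, 0 }, theirs = { 1, 0 };

	after_yield(&mine, &theirs, 1, 1);
	printf("target cpu%d need_resched=%d\n", theirs.cpu, theirs.need_resched);
	return 0;
}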