From 39be350127ec60a078edffe5b4915dafba4ba514 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Jan 2012 12:44:34 +0100 Subject: sched, block: Unify cache detection The block layer has some code trying to determine if two CPUs share a cache, the scheduler has a similar function. Expose the function used by the scheduler and make the block layer use it, thereby removing the block layers usage of CONFIG_SCHED* and topology bits. Signed-off-by: Peter Zijlstra Acked-by: Jens Axboe Link: http://lkml.kernel.org/r/1327579450.2446.95.camel@twins --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..d7c43227311d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1507,7 +1507,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) } #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -static inline int ttwu_share_cache(int this_cpu, int that_cpu) +bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } @@ -1518,7 +1518,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) struct rq *rq = cpu_rq(cpu); #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { + if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; @@ -5754,7 +5754,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if - * two cpus are in the same cache domain, see ttwu_share_cache(). + * two cpus are in the same cache domain, see cpus_share_cache(). */ DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_id); -- cgit v1.2.3 From 4ec4412e1e91f44a3dcb97b6c9172a13fc78bac9 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 12 Dec 2011 20:21:08 +0100 Subject: sched: Ensure cpu_power periodic update With a lot of small tasks, the softirq sched is nearly never called when no_hz is enabled. In this case load_balance() is mainly called with the newly_idle mode which doesn't update the cpu_power. Add a next_update field which ensure a maximum update period when there is short activity. Having stale cpu_power information can skew the load-balancing decisions, this is cured by the guaranteed update. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1323717668-2143-1-git-send-email-vincent.guittot@linaro.org --- include/linux/sched.h | 1 + kernel/sched/fair.c | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0e1959568836..92313a3f6f77 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -905,6 +905,7 @@ struct sched_group_power { * single CPU. */ unsigned int power, power_orig; + unsigned long next_update; /* * Number of busy cpus in this group. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669d..8e77a6bd597b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -215,6 +215,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, const struct sched_class fair_sched_class; +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + /************************************************************** * CFS operations on generic schedulable entities: */ @@ -3776,6 +3778,11 @@ void update_group_power(struct sched_domain *sd, int cpu) struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; unsigned long power; + unsigned long interval; + + interval = msecs_to_jiffies(sd->balance_interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + sdg->sgp->next_update = jiffies + interval; if (!child) { update_cpu_power(sd, cpu); @@ -3883,12 +3890,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * domains. In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ - if (idle != CPU_NEWLY_IDLE && local_group) { - if (balance_cpu != this_cpu) { - *balance = 0; - return; - } - update_group_power(sd, this_cpu); + if (local_group) { + if (idle != CPU_NEWLY_IDLE) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); + } else if (time_after_eq(jiffies, group->sgp->next_update)) + update_group_power(sd, this_cpu); } /* Adjust by relative CPU power of the group */ @@ -4945,8 +4955,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, static DEFINE_SPINLOCK(balancing); -static unsigned long __read_mostly max_load_balance_interval = HZ/10; - /* * Scale the max load_balance interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. -- cgit v1.2.3 From 30fd049afcfed50e022704036e8629d6bdfe84e6 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 24 Jan 2012 22:33:56 +0600 Subject: sched: Remove sched_switch Currently we don't utilize the sched_switch field anymore. But, simply removing sched_switch field from the middle of the sched_stat output will break tools. So, to stay compatible we hardcode it to zero and remove the field from the scheduler data structures. Update the schedstat documentation accordingly. Signed-off-by: Rakib Mullick Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1327422836.27181.5.camel@localhost.localdomain Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-stats.txt | 3 ++- kernel/sched/debug.c | 1 - kernel/sched/sched.h | 1 - kernel/sched/stats.c | 4 ++-- 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/scheduler/sched-stats.txt b/Documentation/scheduler/sched-stats.txt index 1cd5d51bc761..8259b34a66ae 100644 --- a/Documentation/scheduler/sched-stats.txt +++ b/Documentation/scheduler/sched-stats.txt @@ -38,7 +38,8 @@ First field is a sched_yield() statistic: 1) # of times sched_yield() was called Next three are schedule() statistics: - 2) # of times we switched to the expired queue and reused it + 2) This field is a legacy array expiration count field used in the O(1) + scheduler. We kept it for ABI compatibility, but it is always set to zero. 3) # of times schedule() was called 4) # of times schedule() left the processor idle diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a075e10004b..09acaa15161d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -288,7 +288,6 @@ static void print_cpu(struct seq_file *m, int cpu) P(yld_count); - P(sched_switch); P(sched_count); P(sched_goidle); #ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db8..8a2c768d2f98 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -462,7 +462,6 @@ struct rq { unsigned int yld_count; /* schedule() stats */ - unsigned int sched_switch; unsigned int sched_count; unsigned int sched_goidle; diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 2a581ba8e190..903ffa9e8872 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -32,9 +32,9 @@ static int show_schedstat(struct seq_file *seq, void *v) /* runqueue-specific stats */ seq_printf(seq, - "cpu%d %u %u %u %u %u %u %llu %llu %lu", + "cpu%d %u 0 %u %u %u %u %llu %llu %lu", cpu, rq->yld_count, - rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); -- cgit v1.2.3 From ed387b781ea6e14b78f449aa2ee4f270b60b01ac Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 31 Jan 2012 11:40:32 +0900 Subject: sched: Move SMP-only variable into the SMP section This also fixes the following compilation warning on !SMP: CC kernel/sched/fair.o kernel/sched/fair.c:218:36: warning: 'max_load_balance_interval' defined but not used [-Wunused-variable] Signed-off-by: Hiroshi Shimamoto Reviewed-by: Vincent Guittot Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4F2754A0.9090306@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e77a6bd597b..4ab60a24ea49 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -215,8 +215,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, const struct sched_class fair_sched_class; -static unsigned long __read_mostly max_load_balance_interval = HZ/10; - /************************************************************** * CFS operations on generic schedulable entities: */ @@ -3086,6 +3084,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * Fair scheduling class load-balancing methods: */ +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + /* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. -- cgit v1.2.3 From 62f6536a630affe3176deb48554d27ee58b65077 Mon Sep 17 00:00:00 2001 From: "Nikunj A. Dadhania" Date: Fri, 17 Feb 2012 08:34:30 +0530 Subject: sched: Remove rcu_read_lock/unlock() from select_idle_sibling() select_idle_sibling() is called from select_task_rq_fair(), which already has the RCU read lock held. Signed-off-by: Nikunj A. Dadhania Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120217030409.11748.12491.stgit@abhimanyu Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4ab60a24ea49..6ce9992926d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2670,8 +2670,6 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * Otherwise, iterate the domains and find an elegible idle cpu. */ - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, target)); for_each_lower_domain(sd) { sg = sd->groups; @@ -2693,8 +2691,6 @@ next: } while (sg != sd->groups); } done: - rcu_read_unlock(); - return target; } -- cgit v1.2.3 From de5bdff7a72acc281219be2b8edeeca1fd81c542 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 16 Feb 2012 14:52:21 +0900 Subject: sched: Make initial SCHED_RR timeslace DEF_TIMESLICE Current the initial SCHED_RR timeslice of init_task is HZ, which means 1s, and is not same as the default SCHED_RR timeslice DEF_TIMESLICE. Change that initial timeslice to the DEF_TIMESLICE. Signed-off-by: Hiroshi Shimamoto [ s/DEF_TIMESLICE/RR_TIMESLICE/g ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4F3C9995.3010800@ct.jp.nec.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 2 +- include/linux/sched.h | 6 ++++++ kernel/sched/rt.c | 4 ++-- kernel/sched/sched.h | 4 ---- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9c66b1ada9d7..f994d51f70f2 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -149,7 +149,7 @@ extern struct cred init_cred; }, \ .rt = { \ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ - .time_slice = HZ, \ + .time_slice = RR_TIMESLICE, \ .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5dba2ad52431..eb5de466f099 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1234,6 +1234,12 @@ struct sched_rt_entity { #endif }; +/* + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. + */ +#define RR_TIMESLICE (100 * HZ / 1000) + struct rcu_node; enum perf_event_task_context { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f42ae7fb5ec5..f70206c2c802 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1972,7 +1972,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) if (--p->rt.time_slice) return; - p->rt.time_slice = DEF_TIMESLICE; + p->rt.time_slice = RR_TIMESLICE; /* * Requeue to the end of queue if we are not the only element @@ -2000,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) * Time slice is 0 for SCHED_FIFO tasks */ if (task->policy == SCHED_RR) - return DEF_TIMESLICE; + return RR_TIMESLICE; else return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8a2c768d2f98..c0660a1a0cd1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -36,11 +36,7 @@ extern __read_mostly int scheduler_running; /* * These are the 'tuning knobs' of the scheduler: - * - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. */ -#define DEF_TIMESLICE (100 * HZ / 1000) /* * single value that denotes runtime == period, ie unlimited time. -- cgit v1.2.3 From 42c62a589f1ccbf38a02cb732231f9c2fccc5ab0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Oct 2011 22:03:48 +0200 Subject: sched/rt: Keep period timer ticking when rt throttling is active When a runqueue is throttled we cannot disable the period timer because that timer is the only way to undo the throttling. We got stale throttling entries when a rq was throttled and then the global sysctl was disabled, which stopped the timer. Signed-off-by: Peter Zijlstra [ Added changelog ] Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-nuj34q52p6ro7szapuz84i0v@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f70206c2c802..6d1eb0be1870 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -778,12 +778,9 @@ static inline int balance_runtime(struct rt_rq *rt_rq) static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { - int i, idle = 1; + int i, idle = 1, throttled = 0; const struct cpumask *span; - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return 1; - span = sched_rt_period_mask(); for_each_cpu(i, span) { int enqueue = 0; @@ -818,12 +815,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (!rt_rq_throttled(rt_rq)) enqueue = 1; } + if (rt_rq->rt_throttled) + throttled = 1; if (enqueue) sched_rt_rq_enqueue(rt_rq); raw_spin_unlock(&rq->lock); } + if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) + return 1; + return idle; } @@ -884,7 +886,8 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); -- cgit v1.2.3 From 7abc63b1bd412f7655b62ef3e35c3c11c5134636 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Oct 2011 22:03:48 +0200 Subject: sched/rt: Do not throttle when PI boosting When a runqueue has rt_runtime_us = 0 then the only way it can accumulate rt_time is via PI boosting. That causes the runqueue to be throttled and replenishing does not change anything due to rt_runtime_us = 0. So avoid that situation by clearing rt_time and skip the throttling alltogether. Signed-off-by: Peter Zijlstra [ Changelog ] Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-7x70cypsotjb4jvcor3edctk@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6d1eb0be1870..7f7e7cdcb472 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -857,8 +857,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; if (rt_rq->rt_time > runtime) { - rt_rq->rt_throttled = 1; - printk_once(KERN_WARNING "sched: RT throttling activated\n"); + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + /* + * Don't actually throttle groups that have no runtime assigned + * but accrue some time due to boosting. + */ + if (likely(rt_b->rt_runtime)) { + rt_rq->rt_throttled = 1; + printk_once(KERN_WARNING "sched: RT throttling activated\n"); + } else { + /* + * In case we did anyway, make it go away, + * replenishment is a joke, since it will replenish us + * with exactly 0 ns. + */ + rt_rq->rt_time = 0; + } + if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; -- cgit v1.2.3 From c5491ea779793f977d282754db478157cc409d82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Mar 2011 12:09:35 +0100 Subject: sched/rt: Add schedule_preempt_disabled() Add helper to get rid of the ever repeating: preempt_enable_no_resched(); schedule(); preempt_disable(); patterns. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-wxx7btox7coby6ifv5vzhzgp@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1a6398424fab..03dd224d0667 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -361,6 +361,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); +extern void schedule_preempt_disabled(void); extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 91fe6a0e9098..73022395c00e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3246,6 +3246,18 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); +/** + * schedule_preempt_disabled - called with preemption disabled + * + * Returns with preemption disabled. Note: preempt_count must be 1 + */ +void __sched schedule_preempt_disabled(void) +{ + preempt_enable_no_resched(); + schedule(); + preempt_disable(); +} + #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -- cgit v1.2.3 From bd2f55361f18347e890d52ff9cfd8895455ec11b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Mar 2011 12:33:18 +0100 Subject: sched/rt: Use schedule_preempt_disabled() Coccinelle based conversion. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-24swm5zut3h9c4a6s46x8rws@git.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/kernel/process.c | 4 +--- arch/avr32/kernel/process.c | 4 +--- arch/blackfin/kernel/process.c | 4 +--- arch/cris/kernel/process.c | 4 +--- arch/frv/kernel/process.c | 4 +--- arch/h8300/kernel/process.c | 4 +--- arch/ia64/kernel/process.c | 4 +--- arch/m32r/kernel/process.c | 4 +--- arch/m68k/kernel/process_mm.c | 4 +--- arch/m68k/kernel/process_no.c | 4 +--- arch/microblaze/kernel/process.c | 4 +--- arch/mips/kernel/process.c | 4 +--- arch/mn10300/kernel/process.c | 4 +--- arch/parisc/kernel/process.c | 4 +--- arch/powerpc/kernel/idle.c | 8 ++++---- arch/powerpc/platforms/iseries/setup.c | 8 ++------ arch/s390/kernel/process.c | 4 +--- arch/score/kernel/process.c | 4 +--- arch/sh/kernel/idle.c | 4 +--- arch/sparc/kernel/process_32.c | 8 ++------ arch/sparc/kernel/process_64.c | 10 ++++------ arch/tile/kernel/process.c | 4 +--- arch/x86/kernel/process_32.c | 4 +--- arch/x86/kernel/process_64.c | 4 +--- arch/xtensa/kernel/process.c | 4 +--- init/main.c | 5 +---- kernel/mutex.c | 4 +--- kernel/softirq.c | 4 +--- 28 files changed, 36 insertions(+), 95 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 971d65c253a9..c2ae3cd331fe 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -239,9 +239,7 @@ void cpu_idle(void) leds_event(led_idle_end); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index ea3395750324..92c5af98a6f7 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -40,9 +40,7 @@ void cpu_idle(void) cpu_idle_sleep(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 8dd0416673cb..a80a643f3691 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -94,9 +94,7 @@ void cpu_idle(void) idle(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index aa585e4e979e..d8f50ff6fadd 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -115,9 +115,7 @@ void cpu_idle (void) idle = default_idle; idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 3901df1213c0..29cc49783787 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -92,9 +92,7 @@ void cpu_idle(void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 933bd388efb2..1a173b35f475 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -81,9 +81,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 6d33c5cc94f0..9dc52b63fc87 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -330,9 +330,7 @@ cpu_idle (void) normal_xtp(); #endif } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); if (cpu_is_offline(cpu)) play_dead(); diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 422bea9f1dbc..3a4a32b27208 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -90,9 +90,7 @@ void cpu_idle (void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/m68k/kernel/process_mm.c b/arch/m68k/kernel/process_mm.c index 099283ee1a8f..fe4186b5fc32 100644 --- a/arch/m68k/kernel/process_mm.c +++ b/arch/m68k/kernel/process_mm.c @@ -78,9 +78,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/m68k/kernel/process_no.c b/arch/m68k/kernel/process_no.c index 5e1078cabe0e..f7fe6c348595 100644 --- a/arch/m68k/kernel/process_no.c +++ b/arch/m68k/kernel/process_no.c @@ -73,9 +73,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 7dcb5bfffb75..9155f7d92669 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -110,9 +110,7 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 7955409051c4..61f1cb45a1d5 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -80,9 +80,7 @@ void __noreturn cpu_idle(void) #endif rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 28eec3102535..cac401d37f75 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -123,9 +123,7 @@ void cpu_idle(void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 62c60b87d039..d4b94b395c16 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -71,9 +71,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) barrier(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 0a48bf5db6c8..65035141552b 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -101,11 +101,11 @@ void cpu_idle(void) ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - if (cpu_should_die()) + if (cpu_should_die()) { + preempt_enable_no_resched(); cpu_die(); - schedule(); - preempt_disable(); + } + schedule_preempt_disabled(); } } diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index 8fc62586a973..a5fbf4cb6329 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -584,9 +584,7 @@ static void iseries_shared_idle(void) if (hvlpevent_is_pending()) process_iSeries_events(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } @@ -615,9 +613,7 @@ static void iseries_dedicated_idle(void) ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index e795933eb2cb..7618085b4164 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -97,9 +97,7 @@ void cpu_idle(void) tick_nohz_idle_exit(); if (test_thread_flag(TIF_MCCK_PENDING)) s390_handle_mcck(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c index 25d08030a883..2707023c7563 100644 --- a/arch/score/kernel/process.c +++ b/arch/score/kernel/process.c @@ -53,9 +53,7 @@ void __noreturn cpu_idle(void) while (!need_resched()) barrier(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index 406508d4ce74..7e4892826563 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -114,9 +114,7 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index f793742eec2b..935fdbcd88c2 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -113,9 +113,7 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } @@ -138,9 +136,7 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 39d8b05201a2..ab9a29268213 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -104,15 +104,13 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - #ifdef CONFIG_HOTPLUG_CPU - if (cpu_is_offline(cpu)) + if (cpu_is_offline(cpu)) { + preempt_enable_no_resched(); cpu_play_dead(); + } #endif - - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 4c1ac6e5347a..6ae495ef2b99 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -108,9 +108,7 @@ void cpu_idle(void) } rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c08d1ff12b7c..49888fefe794 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -119,9 +119,7 @@ void cpu_idle(void) } rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index cfa5c90c01db..e34257c70c28 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -156,9 +156,7 @@ void cpu_idle(void) } tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 47041e7c088c..2c9004770c4e 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -113,9 +113,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) platform_idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/init/main.c b/init/main.c index ff49a6dacfbb..4990f7ec776a 100644 --- a/init/main.c +++ b/init/main.c @@ -374,11 +374,8 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); - preempt_enable_no_resched(); - schedule(); - + schedule_preempt_disabled(); /* Call into cpu_idle with preempt disabled */ - preempt_disable(); cpu_idle(); } diff --git a/kernel/mutex.c b/kernel/mutex.c index 89096dd8786f..a307cc9c9526 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 4eb3a0fa351e..79b524767a24 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -744,9 +744,7 @@ static int run_ksoftirqd(void * __bind_cpu) while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From ba74c1448f127649046615ec017bded7b2a76f29 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Mar 2011 13:32:17 +0100 Subject: sched/rt: Document scheduler related skip-resched-check sites Create a distinction between scheduler related preempt_enable_no_resched() calls and the nearly one hundred other places in the kernel that do not want to reschedule, for one reason or another. This distinction matters for -rt, where the scheduler and the non-scheduler preempt models (and checks) are different. For upstream it's purely documentational. Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-gs88fvx2mdv5psnzxnv575ke@git.kernel.org Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/idle.c | 2 +- arch/sparc/kernel/process_64.c | 2 +- include/linux/preempt.h | 5 ++++- kernel/sched/core.c | 6 +++--- kernel/softirq.c | 4 ++-- 5 files changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 65035141552b..c97fc60c790c 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -102,7 +102,7 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); if (cpu_should_die()) { - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); cpu_die(); } schedule_preempt_disabled(); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index ab9a29268213..06b5b5fc20c7 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -106,7 +106,7 @@ void cpu_idle(void) #ifdef CONFIG_HOTPLUG_CPU if (cpu_is_offline(cpu)) { - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); cpu_play_dead(); } #endif diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 58969b2a8a82..5a710b9c578e 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -48,12 +48,14 @@ do { \ barrier(); \ } while (0) -#define preempt_enable_no_resched() \ +#define sched_preempt_enable_no_resched() \ do { \ barrier(); \ dec_preempt_count(); \ } while (0) +#define preempt_enable_no_resched() sched_preempt_enable_no_resched() + #define preempt_enable() \ do { \ preempt_enable_no_resched(); \ @@ -92,6 +94,7 @@ do { \ #else /* !CONFIG_PREEMPT_COUNT */ #define preempt_disable() do { } while (0) +#define sched_preempt_enable_no_resched() do { } while (0) #define preempt_enable_no_resched() do { } while (0) #define preempt_enable() do { } while (0) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 73022395c00e..643cc37fcb23 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3220,7 +3220,7 @@ need_resched: post_schedule(rq); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); if (need_resched()) goto need_resched; } @@ -3253,7 +3253,7 @@ EXPORT_SYMBOL(schedule); */ void __sched schedule_preempt_disabled(void) { - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); schedule(); preempt_disable(); } @@ -4486,7 +4486,7 @@ SYSCALL_DEFINE0(sched_yield) __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); do_raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); schedule(); diff --git a/kernel/softirq.c b/kernel/softirq.c index 79b524767a24..f268369ebe1f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -353,7 +353,7 @@ void irq_exit(void) tick_nohz_irq_exit(); #endif rcu_irq_exit(); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); } /* @@ -759,7 +759,7 @@ static int run_ksoftirqd(void * __bind_cpu) if (local_softirq_pending()) __do_softirq(); local_irq_enable(); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); cond_resched(); preempt_disable(); rcu_note_context_switch((long)__bind_cpu); -- cgit v1.2.3 From 63b2001169e75cd71e917ec953fdab572e3f944a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Dec 2011 00:04:00 +0100 Subject: sched/wait: Add __wake_up_all_locked() API For code which protects the waitqueue itself with another lock it makes no sense to acquire the waitqueue lock for wakeup all. Provide __wake_up_all_locked(). This is an optimization on the vanilla kernel (to be used by the PCI code) and an important semantic distinction on -rt. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ux6m4b8jonb9inx8xafh77ds@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 5 +++-- kernel/sched/core.c | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index a9ce45e8501c..7d9a9e990ce6 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -157,7 +157,7 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_bit(wait_queue_head_t *, void *, int); int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); @@ -170,7 +170,8 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) -#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL) +#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) +#define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 643cc37fcb23..820f7453fda9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3418,9 +3418,9 @@ EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) { - __wake_up_common(q, mode, 1, 0, NULL); + __wake_up_common(q, mode, nr, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); -- cgit v1.2.3 From 1c4dd99bed5f6f70932bf8dacdd54d04a2619778 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Jun 2011 20:07:38 +0200 Subject: sched/rt: Prevent idle task boosting Idle task boosting is a nono in general. There is one exception, when PREEMPT_RT and NOHZ is active: The idle task calls get_next_timer_interrupt() and holds the timer wheel base->lock on the CPU and another CPU wants to access the timer (probably to cancel it). We can safely ignore the boosting request, as the idle CPU runs this code with interrupts disabled and will complete the lock protected section without being interrupted. So there is no real need to boost. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-755rvsosz7sdzot12a3gbha6@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 820f7453fda9..c2bbdecaa27d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3779,6 +3779,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio) rq = __task_rq_lock(p); + /* + * Idle task boosting is a nono in general. There is one + * exception, when PREEMPT_RT and NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. + */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; @@ -3802,11 +3820,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio); +out_unlock: __task_rq_unlock(rq); } - #endif - void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; -- cgit v1.2.3 From 3c7d51843b03a6839e9ec7cda724e54d2319a63a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 20:46:52 +0200 Subject: sched/rt: Do not submit new work when PI-blocked When we are PI-blocked then we want to get things done ASAP. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-vw8et3445km5b8mpihf4trae@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++++++ kernel/sched/core.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 03dd224d0667..c628a9151437 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2082,12 +2082,20 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice; extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); extern void rt_mutex_adjust_pi(struct task_struct *p); +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return tsk->pi_blocked_on != NULL; +} #else static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } # define rt_mutex_adjust_pi(p) do { } while (0) +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return false; +} #endif extern bool yield_to(struct task_struct *p, bool preempt); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c2bbdecaa27d..25e06a5e048b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3227,7 +3227,7 @@ need_resched: static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state) + if (!tsk->state || tsk_is_pi_blocked(tsk)) return; /* * If we are going to sleep and we have plugged IO queued, -- cgit v1.2.3 From 8e45cb545d98bc58e75b7de89ec8d3e5c8459ee6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Feb 2012 12:47:19 +0100 Subject: sched: Move load-balancing arguments into helper struct Passing large sets of similar arguments all around the load-balancer gets tiresom when you want to modify something. Stick them all in a helper structure and pass the structure around. Signed-off-by: Peter Zijlstra Cc: pjt@google.com Link: http://lkml.kernel.org/n/tip-5slqz0vhsdzewrfk9eza1aon@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 177 +++++++++++++++++++++++++++------------------------- 1 file changed, 93 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79e9e13c31ab..55b1f117419a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3135,13 +3135,25 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) #define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ #define LBF_ABORT 0x10 +struct lb_env { + struct sched_domain *sd; + + int this_cpu; + struct rq *this_rq; + + struct rq *busiest_rq; + struct cfs_rq *busiest_cfs_rq; + + enum cpu_idle_type idle; + unsigned long max_load_move; + unsigned int flags; +}; + /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot = 0; /* @@ -3150,13 +3162,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { + if (!cpumask_test_cpu(env->this_cpu, tsk_cpus_allowed(p))) { schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } - *lb_flags &= ~LBF_ALL_PINNED; + env->flags &= ~LBF_ALL_PINNED; - if (task_running(rq, p)) { + if (task_running(env->busiest_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } @@ -3167,12 +3179,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq->clock_task, sd); + tsk_cache_hot = task_hot(p, env->busiest_rq->clock_task, env->sd); if (!tsk_cache_hot || - sd->nr_balance_failed > sd->cache_nice_tries) { + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS if (tsk_cache_hot) { - schedstat_inc(sd, lb_hot_gained[idle]); + schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } #endif @@ -3193,31 +3205,27 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * * Called with both runqueues locked. */ -static int -move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) +static int move_one_task(struct lb_env *env) { struct task_struct *p, *n; struct cfs_rq *cfs_rq; - int pinned = 0; - for_each_leaf_cfs_rq(busiest, cfs_rq) { + for_each_leaf_cfs_rq(env->busiest_rq, cfs_rq) { list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { if (throttled_lb_pair(task_group(p), - busiest->cpu, this_cpu)) + env->busiest_rq->cpu, env->this_cpu)) break; - if (!can_migrate_task(p, busiest, this_cpu, - sd, idle, &pinned)) + if (!can_migrate_task(p, env)) continue; - pull_task(busiest, p, this_rq, this_cpu); + pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu); /* * Right now, this is only the second place pull_task() * is called, so we can safely collect pull_task() * stats here rather than inside pull_task(). */ - schedstat_inc(sd, lb_gained[idle]); + schedstat_inc(env->sd, lb_gained[env->idle]); return 1; } } @@ -3225,31 +3233,26 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *lb_flags, - struct cfs_rq *busiest_cfs_rq) +static unsigned long balance_tasks(struct lb_env *env) { int loops = 0, pulled = 0; - long rem_load_move = max_load_move; + long rem_load_move = env->max_load_move; struct task_struct *p, *n; - if (max_load_move == 0) + if (env->max_load_move == 0) goto out; - list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { + list_for_each_entry_safe(p, n, &env->busiest_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) { - *lb_flags |= LBF_NEED_BREAK; + env->flags |= LBF_NEED_BREAK; break; } if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, - lb_flags)) + !can_migrate_task(p, env)) continue; - pull_task(busiest, p, this_rq, this_cpu); + pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu); pulled++; rem_load_move -= p->se.load.weight; @@ -3259,8 +3262,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (idle == CPU_NEWLY_IDLE) { - *lb_flags |= LBF_ABORT; + if (env->idle == CPU_NEWLY_IDLE) { + env->flags |= LBF_ABORT; break; } #endif @@ -3278,9 +3281,9 @@ out: * so we can safely collect pull_task() stats here rather than * inside pull_task(). */ - schedstat_add(sd, lb_gained[idle], pulled); + schedstat_add(env->sd, lb_gained[env->idle], pulled); - return max_load_move - rem_load_move; + return env->max_load_move - rem_load_move; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3363,40 +3366,39 @@ static void update_h_load(long cpu) walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static unsigned long load_balance_fair(struct lb_env *env) { - long rem_load_move = max_load_move; - struct cfs_rq *busiest_cfs_rq; + unsigned long max_load_move = env->max_load_move; + long rem_load_move = env->max_load_move; rcu_read_lock(); - update_h_load(cpu_of(busiest)); + update_h_load(cpu_of(env->busiest_rq)); - for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { - unsigned long busiest_h_load = busiest_cfs_rq->h_load; - unsigned long busiest_weight = busiest_cfs_rq->load.weight; + for_each_leaf_cfs_rq(env->busiest_rq, env->busiest_cfs_rq) { + unsigned long busiest_h_load = env->busiest_cfs_rq->h_load; + unsigned long busiest_weight = env->busiest_cfs_rq->load.weight; u64 rem_load, moved_load; - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) break; /* * empty group or part of a throttled hierarchy */ - if (!busiest_cfs_rq->task_weight || - throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) + if (!env->busiest_cfs_rq->task_weight) + continue; + + if (throttled_lb_pair(env->busiest_cfs_rq->tg, + cpu_of(env->busiest_rq), + env->this_cpu)) continue; rem_load = (u64)rem_load_move * busiest_weight; rem_load = div_u64(rem_load, busiest_h_load + 1); - moved_load = balance_tasks(this_rq, this_cpu, busiest, - rem_load, sd, idle, lb_flags, - busiest_cfs_rq); + env->max_load_move = rem_load; + moved_load = balance_tasks(env); if (!moved_load) continue; @@ -3416,15 +3418,10 @@ static inline void update_shares(int cpu) { } -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static unsigned long load_balance_fair(struct lb_env *env) { - return balance_tasks(this_rq, this_cpu, busiest, - max_load_move, sd, idle, lb_flags, - &busiest->cfs); + env->busiest_cfs_rq = &env->busiest_rq->cfs; + return balance_tasks(env); } #endif @@ -3435,21 +3432,17 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, * * Called with both runqueues locked. */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static int move_tasks(struct lb_env *env) { + unsigned long max_load_move = env->max_load_move; unsigned long total_load_moved = 0, load_moved; do { - load_moved = load_balance_fair(this_rq, this_cpu, busiest, - max_load_move - total_load_moved, - sd, idle, lb_flags); - + env->max_load_move = max_load_move - total_load_moved; + load_moved = load_balance_fair(env); total_load_moved += load_moved; - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) break; #ifdef CONFIG_PREEMPT @@ -3458,8 +3451,8 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { - *lb_flags |= LBF_ABORT; + if (env->idle == CPU_NEWLY_IDLE && env->this_rq->nr_running) { + env->flags |= LBF_ABORT; break; } #endif @@ -4459,13 +4452,20 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, lb_flags = 0, active_balance = 0; + int ld_moved, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + struct lb_env env = { + .sd = sd, + .this_cpu = this_cpu, + .this_rq = this_rq, + .idle = idle, + }; + cpumask_copy(cpus, cpu_active_mask); schedstat_inc(sd, lb_count[idle]); @@ -4500,11 +4500,13 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - lb_flags |= LBF_ALL_PINNED; + env.flags |= LBF_ALL_PINNED; + env.max_load_move = imbalance; + env.busiest_rq = busiest; + local_irq_save(flags); double_rq_lock(this_rq, busiest); - ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, &lb_flags); + ld_moved = move_tasks(&env); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -4514,18 +4516,18 @@ redo: if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); - if (lb_flags & LBF_ABORT) + if (env.flags & LBF_ABORT) goto out_balanced; - if (lb_flags & LBF_NEED_BREAK) { - lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; - if (lb_flags & LBF_ABORT) + if (env.flags & LBF_NEED_BREAK) { + env.flags += LBF_HAD_BREAK - LBF_NEED_BREAK; + if (env.flags & LBF_ABORT) goto out_balanced; goto redo; } /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(lb_flags & LBF_ALL_PINNED)) { + if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); if (!cpumask_empty(cpus)) goto redo; @@ -4555,7 +4557,7 @@ redo: tsk_cpus_allowed(busiest->curr))) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - lb_flags |= LBF_ALL_PINNED; + env.flags |= LBF_ALL_PINNED; goto out_one_pinned; } @@ -4608,7 +4610,7 @@ out_balanced: out_one_pinned: /* tune up the balancing interval */ - if (((lb_flags & LBF_ALL_PINNED) && + if (((env.flags & LBF_ALL_PINNED) && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; @@ -4718,10 +4720,17 @@ static int active_load_balance_cpu_stop(void *data) } if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .this_cpu = target_cpu, + .this_rq = target_rq, + .busiest_rq = busiest_rq, + .idle = CPU_IDLE, + }; + schedstat_inc(sd, alb_count); - if (move_one_task(target_rq, target_cpu, busiest_rq, - sd, CPU_IDLE)) + if (move_one_task(&env)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); -- cgit v1.2.3 From ddcdf6e7d9919d139031fa2a6addd9544a9a833e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Feb 2012 19:27:40 +0100 Subject: sched: Rename load-balancing fields s/env->this_/env->dst_/g s/env->busiest_/env->src_/g s/pull_task/move_task/g Makes everything clearer. Signed-off-by: Peter Zijlstra Cc: pjt@google.com Link: http://lkml.kernel.org/n/tip-0yvgms8t8x962drpvl0fu0kk@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 118 ++++++++++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 55b1f117419a..233d05171bf5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2918,7 +2918,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* - * This is possible from callers such as pull_task(), in which we + * This is possible from callers such as move_task(), in which we * unconditionally check_prempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. @@ -3084,17 +3084,37 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp static unsigned long __read_mostly max_load_balance_interval = HZ/10; +#define LBF_ALL_PINNED 0x01 +#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ +#define LBF_HAD_BREAK 0x04 +#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ +#define LBF_ABORT 0x10 + +struct lb_env { + struct sched_domain *sd; + + int src_cpu; + struct rq *src_rq; + struct cfs_rq *src_cfs_rq; + + int dst_cpu; + struct rq *dst_rq; + + enum cpu_idle_type idle; + unsigned long max_load_move; + unsigned int flags; +}; + /* - * pull_task - move a task from a remote runqueue to the local runqueue. + * move_task - move a task from one runqueue to another runqueue. * Both runqueues must be locked. */ -static void pull_task(struct rq *src_rq, struct task_struct *p, - struct rq *this_rq, int this_cpu) +static void move_task(struct task_struct *p, struct lb_env *env) { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - check_preempt_curr(this_rq, p, 0); + deactivate_task(env->src_rq, p, 0); + set_task_cpu(p, env->dst_cpu); + activate_task(env->dst_rq, p, 0); + check_preempt_curr(env->dst_rq, p, 0); } /* @@ -3129,26 +3149,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } -#define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ -#define LBF_HAD_BREAK 0x04 -#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ -#define LBF_ABORT 0x10 - -struct lb_env { - struct sched_domain *sd; - - int this_cpu; - struct rq *this_rq; - - struct rq *busiest_rq; - struct cfs_rq *busiest_cfs_rq; - - enum cpu_idle_type idle; - unsigned long max_load_move; - unsigned int flags; -}; - /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ @@ -3162,13 +3162,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpumask_test_cpu(env->this_cpu, tsk_cpus_allowed(p))) { + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } env->flags &= ~LBF_ALL_PINNED; - if (task_running(env->busiest_rq, p)) { + if (task_running(env->src_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } @@ -3179,7 +3179,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env->busiest_rq->clock_task, env->sd); + tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS @@ -3210,20 +3210,20 @@ static int move_one_task(struct lb_env *env) struct task_struct *p, *n; struct cfs_rq *cfs_rq; - for_each_leaf_cfs_rq(env->busiest_rq, cfs_rq) { + for_each_leaf_cfs_rq(env->src_rq, cfs_rq) { list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { if (throttled_lb_pair(task_group(p), - env->busiest_rq->cpu, env->this_cpu)) + env->src_cpu, env->dst_cpu)) break; if (!can_migrate_task(p, env)) continue; - pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu); + move_task(p, env); /* - * Right now, this is only the second place pull_task() - * is called, so we can safely collect pull_task() - * stats here rather than inside pull_task(). + * Right now, this is only the second place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). */ schedstat_inc(env->sd, lb_gained[env->idle]); return 1; @@ -3242,7 +3242,7 @@ static unsigned long balance_tasks(struct lb_env *env) if (env->max_load_move == 0) goto out; - list_for_each_entry_safe(p, n, &env->busiest_cfs_rq->tasks, se.group_node) { + list_for_each_entry_safe(p, n, &env->src_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) { env->flags |= LBF_NEED_BREAK; break; @@ -3252,7 +3252,7 @@ static unsigned long balance_tasks(struct lb_env *env) !can_migrate_task(p, env)) continue; - pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu); + move_task(p, env); pulled++; rem_load_move -= p->se.load.weight; @@ -3277,9 +3277,9 @@ static unsigned long balance_tasks(struct lb_env *env) } out: /* - * Right now, this is one of only two places pull_task() is called, - * so we can safely collect pull_task() stats here rather than - * inside pull_task(). + * Right now, this is one of only two places move_task() is called, + * so we can safely collect move_task() stats here rather than + * inside move_task(). */ schedstat_add(env->sd, lb_gained[env->idle], pulled); @@ -3372,11 +3372,11 @@ static unsigned long load_balance_fair(struct lb_env *env) long rem_load_move = env->max_load_move; rcu_read_lock(); - update_h_load(cpu_of(env->busiest_rq)); + update_h_load(cpu_of(env->src_rq)); - for_each_leaf_cfs_rq(env->busiest_rq, env->busiest_cfs_rq) { - unsigned long busiest_h_load = env->busiest_cfs_rq->h_load; - unsigned long busiest_weight = env->busiest_cfs_rq->load.weight; + for_each_leaf_cfs_rq(env->src_rq, env->src_cfs_rq) { + unsigned long busiest_h_load = env->src_cfs_rq->h_load; + unsigned long busiest_weight = env->src_cfs_rq->load.weight; u64 rem_load, moved_load; if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) @@ -3385,12 +3385,12 @@ static unsigned long load_balance_fair(struct lb_env *env) /* * empty group or part of a throttled hierarchy */ - if (!env->busiest_cfs_rq->task_weight) + if (!env->src_cfs_rq->task_weight) continue; - if (throttled_lb_pair(env->busiest_cfs_rq->tg, - cpu_of(env->busiest_rq), - env->this_cpu)) + if (throttled_lb_pair(env->src_cfs_rq->tg, + cpu_of(env->src_rq), + env->dst_cpu)) continue; rem_load = (u64)rem_load_move * busiest_weight; @@ -3420,7 +3420,7 @@ static inline void update_shares(int cpu) static unsigned long load_balance_fair(struct lb_env *env) { - env->busiest_cfs_rq = &env->busiest_rq->cfs; + env->src_cfs_rq = &env->src_rq->cfs; return balance_tasks(env); } #endif @@ -3451,7 +3451,7 @@ static int move_tasks(struct lb_env *env) * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (env->idle == CPU_NEWLY_IDLE && env->this_rq->nr_running) { + if (env->idle == CPU_NEWLY_IDLE && env->dst_rq->nr_running) { env->flags |= LBF_ABORT; break; } @@ -4461,8 +4461,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct lb_env env = { .sd = sd, - .this_cpu = this_cpu, - .this_rq = this_rq, + .dst_cpu = this_cpu, + .dst_rq = this_rq, .idle = idle, }; @@ -4502,7 +4502,8 @@ redo: */ env.flags |= LBF_ALL_PINNED; env.max_load_move = imbalance; - env.busiest_rq = busiest; + env.src_cpu = busiest->cpu; + env.src_rq = busiest; local_irq_save(flags); double_rq_lock(this_rq, busiest); @@ -4722,9 +4723,10 @@ static int active_load_balance_cpu_stop(void *data) if (likely(sd)) { struct lb_env env = { .sd = sd, - .this_cpu = target_cpu, - .this_rq = target_rq, - .busiest_rq = busiest_rq, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, .idle = CPU_IDLE, }; -- cgit v1.2.3 From 367456c756a6b84f493ca9cc5b17b1f5d38ef466 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Feb 2012 21:49:09 +0100 Subject: sched: Ditch per cgroup task lists for load-balancing Per cgroup load-balance has numerous problems, chief amongst them that there is no real sane order in them. So stop pretending it makes sense and enqueue all tasks on a single list. This also allows us to more easily fix the fwd progress issue uncovered by the lock-break stuff. Rotate the list on failure to migreate and limit the total iterations to nr_running (which with releasing the lock isn't strictly accurate but close enough). Also add a filter that skips very light tasks on the first attempt around the list, this attempts to avoid shooting whole cgroups around without affecting over balance. Signed-off-by: Peter Zijlstra Cc: pjt@google.com Link: http://lkml.kernel.org/n/tip-tx8yqydc7eimgq7i4rkc3a4g@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 + kernel/sched/fair.c | 176 ++++++++++++++++++++++----------------------------- kernel/sched/sched.h | 10 +-- 3 files changed, 80 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 25e06a5e048b..95545126be1c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6959,6 +6959,9 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; + + INIT_LIST_HEAD(&rq->cfs_tasks); + rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ rq->nohz_flags = 0; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 233d05171bf5..a0424fc4cc54 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ - cfs_rq->task_weight += weight; -} -#else -static inline void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ -} -#endif - static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, se->load.weight); - list_add(&se->group_node, &cfs_rq->tasks); - } +#ifdef CONFIG_SMP + if (entity_is_task(se)) + list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); +#endif cfs_rq->nr_running++; } @@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, -se->load.weight); + if (entity_is_task(se)) list_del_init(&se->group_node); - } cfs_rq->nr_running--; } @@ -3085,17 +3070,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ -#define LBF_HAD_BREAK 0x04 -#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ -#define LBF_ABORT 0x10 +#define LBF_NEED_BREAK 0x02 +#define LBF_ABORT 0x04 struct lb_env { struct sched_domain *sd; int src_cpu; struct rq *src_rq; - struct cfs_rq *src_cfs_rq; int dst_cpu; struct rq *dst_rq; @@ -3103,6 +3085,10 @@ struct lb_env { enum cpu_idle_type idle; unsigned long max_load_move; unsigned int flags; + + unsigned int loop; + unsigned int loop_break; + unsigned int loop_max; }; /* @@ -3208,53 +3194,69 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) static int move_one_task(struct lb_env *env) { struct task_struct *p, *n; - struct cfs_rq *cfs_rq; - for_each_leaf_cfs_rq(env->src_rq, cfs_rq) { - list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), - env->src_cpu, env->dst_cpu)) - break; + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) + continue; - if (!can_migrate_task(p, env)) - continue; + if (!can_migrate_task(p, env)) + continue; - move_task(p, env); - /* - * Right now, this is only the second place move_task() - * is called, so we can safely collect move_task() - * stats here rather than inside move_task(). - */ - schedstat_inc(env->sd, lb_gained[env->idle]); - return 1; - } + move_task(p, env); + /* + * Right now, this is only the second place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). + */ + schedstat_inc(env->sd, lb_gained[env->idle]); + return 1; } - return 0; } +static unsigned long task_h_load(struct task_struct *p); + static unsigned long balance_tasks(struct lb_env *env) { - int loops = 0, pulled = 0; long rem_load_move = env->max_load_move; struct task_struct *p, *n; + unsigned long load; + int pulled = 0; if (env->max_load_move == 0) goto out; - list_for_each_entry_safe(p, n, &env->src_cfs_rq->tasks, se.group_node) { - if (loops++ > sysctl_sched_nr_migrate) { + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + env->loop++; + /* We've more or less seen every task there is, call it quits */ + if (env->loop > env->loop_max) { + env->flags |= LBF_ABORT; + break; + } + /* take a beather every nr_migrate tasks */ + if (env->loop > env->loop_break) { + env->loop_break += sysctl_sched_nr_migrate; env->flags |= LBF_NEED_BREAK; break; } - if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, env)) - continue; + if (throttled_lb_pair(task_group(p), env->src_rq->cpu, + env->dst_cpu)) + goto next; + + load = task_h_load(p); + if (load < 16 && !env->sd->nr_balance_failed) + goto next; + + if ((load * 2) > rem_load_move) + goto next; + + if (!can_migrate_task(p, env)) + goto next; move_task(p, env); pulled++; - rem_load_move -= p->se.load.weight; + rem_load_move -= load; #ifdef CONFIG_PREEMPT /* @@ -3274,6 +3276,10 @@ static unsigned long balance_tasks(struct lb_env *env) */ if (rem_load_move <= 0) break; + + continue; +next: + list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks); } out: /* @@ -3363,65 +3369,33 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_h_load(long cpu) { + rcu_read_lock(); walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); + rcu_read_unlock(); } -static unsigned long load_balance_fair(struct lb_env *env) +static unsigned long task_h_load(struct task_struct *p) { - unsigned long max_load_move = env->max_load_move; - long rem_load_move = env->max_load_move; - - rcu_read_lock(); - update_h_load(cpu_of(env->src_rq)); - - for_each_leaf_cfs_rq(env->src_rq, env->src_cfs_rq) { - unsigned long busiest_h_load = env->src_cfs_rq->h_load; - unsigned long busiest_weight = env->src_cfs_rq->load.weight; - u64 rem_load, moved_load; - - if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - - /* - * empty group or part of a throttled hierarchy - */ - if (!env->src_cfs_rq->task_weight) - continue; - - if (throttled_lb_pair(env->src_cfs_rq->tg, - cpu_of(env->src_rq), - env->dst_cpu)) - continue; - - rem_load = (u64)rem_load_move * busiest_weight; - rem_load = div_u64(rem_load, busiest_h_load + 1); - - env->max_load_move = rem_load; - - moved_load = balance_tasks(env); - if (!moved_load) - continue; - - moved_load *= busiest_h_load; - moved_load = div_u64(moved_load, busiest_weight + 1); + struct cfs_rq *cfs_rq = task_cfs_rq(p); + unsigned long load; - rem_load_move -= moved_load; - if (rem_load_move < 0) - break; - } - rcu_read_unlock(); + load = p->se.load.weight; + load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); - return max_load_move - rem_load_move; + return load; } #else static inline void update_shares(int cpu) { } -static unsigned long load_balance_fair(struct lb_env *env) +static inline void update_h_load(long cpu) { - env->src_cfs_rq = &env->src_rq->cfs; - return balance_tasks(env); +} + +static unsigned long task_h_load(struct task_struct *p) +{ + return p->se.load.weight; } #endif @@ -3437,9 +3411,10 @@ static int move_tasks(struct lb_env *env) unsigned long max_load_move = env->max_load_move; unsigned long total_load_moved = 0, load_moved; + update_h_load(cpu_of(env->src_rq)); do { env->max_load_move = max_load_move - total_load_moved; - load_moved = load_balance_fair(env); + load_moved = balance_tasks(env); total_load_moved += load_moved; if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) @@ -4464,6 +4439,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .dst_cpu = this_cpu, .dst_rq = this_rq, .idle = idle, + .loop_break = sysctl_sched_nr_migrate, }; cpumask_copy(cpus, cpu_active_mask); @@ -4504,6 +4480,7 @@ redo: env.max_load_move = imbalance; env.src_cpu = busiest->cpu; env.src_rq = busiest; + env.loop_max = busiest->nr_running; local_irq_save(flags); double_rq_lock(this_rq, busiest); @@ -4521,9 +4498,7 @@ redo: goto out_balanced; if (env.flags & LBF_NEED_BREAK) { - env.flags += LBF_HAD_BREAK - LBF_NEED_BREAK; - if (env.flags & LBF_ABORT) - goto out_balanced; + env.flags &= ~LBF_NEED_BREAK; goto redo; } @@ -5357,7 +5332,6 @@ static void set_curr_task_fair(struct rq *rq) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; - INIT_LIST_HEAD(&cfs_rq->tasks); cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c0660a1a0cd1..753bdd567416 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -212,9 +212,6 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; - struct list_head tasks; - struct list_head *balance_iterator; - /* * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). @@ -241,11 +238,6 @@ struct cfs_rq { struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_SMP - /* - * the part of load.weight contributed by tasks - */ - unsigned long task_weight; - /* * h_load = weight * f(tg) * @@ -420,6 +412,8 @@ struct rq { int cpu; int online; + struct list_head cfs_tasks; + u64 rt_avg; u64 age_stamp; u64 idle_stamp; -- cgit v1.2.3 From 2e5b5b3a1b7768c89fbfeca18e75f8ee377e924c Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 23 Feb 2012 17:41:27 +0900 Subject: sched: Clean up parameter passing of proc_sched_autogroup_set_nice() Pass nice as a value to proc_sched_autogroup_set_nice(). No side effect is expected, and the variable err will be overwritten with the return value. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4F45FBB7.5090607@ct.jp.nec.com Signed-off-by: Ingo Molnar --- fs/proc/base.c | 3 +-- include/linux/sched.h | 2 +- kernel/sched/auto_group.c | 12 ++++++------ 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index d4548dd49b02..965d4bde3a3b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1310,8 +1310,7 @@ sched_autogroup_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - err = nice; - err = proc_sched_autogroup_set_nice(p, &err); + err = proc_sched_autogroup_set_nice(p, nice); if (err) count = err; diff --git a/include/linux/sched.h b/include/linux/sched.h index c628a9151437..c298fb9cf5ad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2065,7 +2065,7 @@ extern void sched_autogroup_fork(struct signal_struct *sig); extern void sched_autogroup_exit(struct signal_struct *sig); #ifdef CONFIG_PROC_FS extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); -extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice); +extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); #endif #else static inline void sched_autogroup_create_attach(struct task_struct *p) { } diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e8a1f83ee0e7..0984a21076a3 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup); #ifdef CONFIG_PROC_FS -int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) +int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) { static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; int err; - if (*nice < -20 || *nice > 19) + if (nice < -20 || nice > 19) return -EINVAL; - err = security_task_setnice(current, *nice); + err = security_task_setnice(current, nice); if (err) return err; - if (*nice < 0 && !can_nice(current, *nice)) + if (nice < 0 && !can_nice(current, nice)) return -EPERM; /* this is a heavy operation taking global locks.. */ @@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) ag = autogroup_task_get(p); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); + err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); if (!err) - ag->nice = *nice; + ag->nice = nice; up_write(&ag->lock); autogroup_kref_put(ag); -- cgit v1.2.3 From 5d6523ebd2f67de9d23285aad7f3910e7b0aee83 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 10 Mar 2012 00:07:36 +0100 Subject: sched: Fix load-balance wreckage Commit 367456c ("sched: Ditch per cgroup task lists for load-balancing") completely wrecked load-balancing due to a few silly mistakes. Correct those and remove more pointless code. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-zk04ihygwxn7qqrlpaf73b0r@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 110 +++++++++++++++++++--------------------------------- 1 file changed, 39 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a0424fc4cc54..def17aa302d5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) - list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); + list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); #endif cfs_rq->nr_running++; } @@ -3071,7 +3071,6 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 -#define LBF_ABORT 0x04 struct lb_env { struct sched_domain *sd; @@ -3083,7 +3082,7 @@ struct lb_env { struct rq *dst_rq; enum cpu_idle_type idle; - unsigned long max_load_move; + long load_move; unsigned int flags; unsigned int loop; @@ -3216,39 +3215,47 @@ static int move_one_task(struct lb_env *env) static unsigned long task_h_load(struct task_struct *p); -static unsigned long balance_tasks(struct lb_env *env) +/* + * move_tasks tries to move up to load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct lb_env *env) { - long rem_load_move = env->max_load_move; - struct task_struct *p, *n; + struct list_head *tasks = &env->src_rq->cfs_tasks; + struct task_struct *p; unsigned long load; int pulled = 0; - if (env->max_load_move == 0) - goto out; + if (env->load_move <= 0) + return 0; + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); - list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { env->loop++; /* We've more or less seen every task there is, call it quits */ - if (env->loop > env->loop_max) { - env->flags |= LBF_ABORT; + if (env->loop > env->loop_max) break; - } - /* take a beather every nr_migrate tasks */ + + /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { env->loop_break += sysctl_sched_nr_migrate; env->flags |= LBF_NEED_BREAK; break; } - if (throttled_lb_pair(task_group(p), env->src_rq->cpu, - env->dst_cpu)) + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) goto next; load = task_h_load(p); + if (load < 16 && !env->sd->nr_balance_failed) goto next; - if ((load * 2) > rem_load_move) + if ((load / 2) > env->load_move) goto next; if (!can_migrate_task(p, env)) @@ -3256,7 +3263,7 @@ static unsigned long balance_tasks(struct lb_env *env) move_task(p, env); pulled++; - rem_load_move -= load; + env->load_move -= load; #ifdef CONFIG_PREEMPT /* @@ -3264,24 +3271,22 @@ static unsigned long balance_tasks(struct lb_env *env) * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (env->idle == CPU_NEWLY_IDLE) { - env->flags |= LBF_ABORT; + if (env->idle == CPU_NEWLY_IDLE) break; - } #endif /* * We only want to steal up to the prescribed amount of * weighted load. */ - if (rem_load_move <= 0) + if (env->load_move <= 0) break; continue; next: - list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks); + list_move_tail(&p->se.group_node, tasks); } -out: + /* * Right now, this is one of only two places move_task() is called, * so we can safely collect move_task() stats here rather than @@ -3289,7 +3294,7 @@ out: */ schedstat_add(env->sd, lb_gained[env->idle], pulled); - return env->max_load_move - rem_load_move; + return pulled; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3399,43 +3404,6 @@ static unsigned long task_h_load(struct task_struct *p) } #endif -/* - * move_tasks tries to move up to max_load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int move_tasks(struct lb_env *env) -{ - unsigned long max_load_move = env->max_load_move; - unsigned long total_load_moved = 0, load_moved; - - update_h_load(cpu_of(env->src_rq)); - do { - env->max_load_move = max_load_move - total_load_moved; - load_moved = balance_tasks(env); - total_load_moved += load_moved; - - if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - -#ifdef CONFIG_PREEMPT - /* - * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize - * the critical section. - */ - if (env->idle == CPU_NEWLY_IDLE && env->dst_rq->nr_running) { - env->flags |= LBF_ABORT; - break; - } -#endif - } while (load_moved && max_load_move > total_load_moved); - - return total_load_moved > 0; -} - /********** Helpers for find_busiest_group ************************/ /* * sd_lb_stats - Structure to store the statistics of a sched_domain @@ -4477,31 +4445,31 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.max_load_move = imbalance; + env.load_move = imbalance; env.src_cpu = busiest->cpu; env.src_rq = busiest; env.loop_max = busiest->nr_running; +more_balance: local_irq_save(flags); double_rq_lock(this_rq, busiest); - ld_moved = move_tasks(&env); + if (!env.loop) + update_h_load(env.src_cpu); + ld_moved += move_tasks(&env); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); + if (env.flags & LBF_NEED_BREAK) { + env.flags &= ~LBF_NEED_BREAK; + goto more_balance; + } + /* * some other cpu did the load balance for us. */ if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); - if (env.flags & LBF_ABORT) - goto out_balanced; - - if (env.flags & LBF_NEED_BREAK) { - env.flags &= ~LBF_NEED_BREAK; - goto redo; - } - /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); -- cgit v1.2.3 From 5fbd036b552f633abb394a319f7c62a5c86a9cd7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Dec 2011 17:09:22 +0100 Subject: sched: Cleanup cpu_active madness Stepan found: CPU0 CPUn _cpu_up() __cpu_up() boostrap() notify_cpu_starting() set_cpu_online() while (!cpu_active()) cpu_relax() smp_call_function(.wait=1) /* we find cpu_online() is true */ arch_send_call_function_ipi_mask() /* wait-forever-more */ local_irq_enable() cpu_notify(CPU_ONLINE) sched_cpu_active() set_cpu_active() Now the purpose of cpu_active is mostly with bringing down a cpu, where we mark it !active to avoid the load-balancer from moving tasks to it while we tear down the cpu. This is required because we only update the sched_domain tree after we brought the cpu-down. And this is needed so that some tasks can still run while we bring it down, we just don't want new tasks to appear. On cpu-up however the sched_domain tree doesn't yet include the new cpu, so its invisible to the load-balancer, regardless of the active state. So instead of setting the active state after we boot the new cpu (and consequently having to wait for it before enabling interrupts) set the cpu active before we set it online and avoid the whole mess. Reported-by: Stepan Moskovchenko Signed-off-by: Peter Zijlstra Acked-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1323965362.18942.71.camel@twins Signed-off-by: Ingo Molnar --- arch/arm/kernel/smp.c | 7 ------- arch/hexagon/kernel/smp.c | 2 -- arch/s390/kernel/smp.c | 6 ------ arch/x86/kernel/smpboot.c | 13 ------------- kernel/sched/core.c | 2 +- 5 files changed, 1 insertion(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index cdeb727527d3..d616ed51e7a7 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -295,13 +295,6 @@ asmlinkage void __cpuinit secondary_start_kernel(void) */ percpu_timer_setup(); - while (!cpu_active(cpu)) - cpu_relax(); - - /* - * cpu_active bit is set, so it's safe to enalbe interrupts - * now. - */ local_irq_enable(); local_fiq_enable(); diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index c871a2cffaef..0123c63e9a3a 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -179,8 +179,6 @@ void __cpuinit start_secondary(void) printk(KERN_INFO "%s cpu %d\n", __func__, current_thread_info()->cpu); set_cpu_online(cpu, true); - while (!cpumask_test_cpu(cpu, cpu_active_mask)) - cpu_relax(); local_irq_enable(); cpu_idle(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 2398ce6b15ae..b0e28c47ab83 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -550,12 +550,6 @@ int __cpuinit start_secondary(void *cpuvoid) S390_lowcore.restart_psw.addr = PSW_ADDR_AMODE | (unsigned long) psw_restart_int_handler; __ctl_set_bit(0, 28); /* Enable lowcore protection */ - /* - * Wait until the cpu which brought this one up marked it - * active before enabling interrupts. - */ - while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) - cpu_relax(); local_irq_enable(); /* cpu_idle will call schedule for us */ cpu_idle(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..58f78165d308 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -291,19 +291,6 @@ notrace static void __cpuinit start_secondary(void *unused) per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; x86_platform.nmi_init(); - /* - * Wait until the cpu which brought this one up marked it - * online before enabling interrupts. If we don't do that then - * we can end up waking up the softirq thread before this cpu - * reached the active state, which makes the scheduler unhappy - * and schedule the softirq thread on the wrong cpu. This is - * only observable with forced threaded interrupts, but in - * theory it could also happen w/o them. It's just way harder - * to achieve. - */ - while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) - cpu_relax(); - /* enable local interrupts */ local_irq_enable(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95545126be1c..b1ccce819ce2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5410,7 +5410,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: + case CPU_STARTING: case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; -- cgit v1.2.3 From 554cecaf733623b327eef9652b65965eb1081b81 Mon Sep 17 00:00:00 2001 From: Diwakar Tundlam Date: Wed, 7 Mar 2012 14:44:26 -0800 Subject: sched/nohz: Correctly initialize 'next_balance' in 'nohz' idle balancer The 'next_balance' field of 'nohz' idle balancer must be initialized to jiffies. Since jiffies is initialized to negative 300 seconds the 'nohz' idle balancer does not run for the first 300s (5mins) after bootup. If no new processes are spawed or no idle cycles happen, the load on the cpus will remain unbalanced for that duration. Signed-off-by: Diwakar Tundlam Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1DD7BFEDD3147247B1355BEFEFE4665237994F30EF@HQMAIL04.nvidia.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index def17aa302d5..11f3979bad2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5571,6 +5571,7 @@ __init void init_sched_fair_class(void) open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #ifdef CONFIG_NO_HZ + nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); #endif -- cgit v1.2.3 From 3ccf3e8306156a28213adc720aba807e9a901ad5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Feb 2012 10:47:00 +0100 Subject: printk/sched: Introduce special printk_sched() for those awkward moments There's a few awkward printk()s inside of scheduler guts that people prefer to keep but really are rather deadlock prone. Fudge around it by storing the text in a per-cpu buffer and poll it using the existing printk_tick() handler. This will drop output when its more frequent than once a tick, however only the affinity thing could possible go that fast and for that just one should suffice to notify the admin he's done something silly.. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-wua3lmkt3dg8nfts66o6brne@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/printk.h | 10 ++++++++++ kernel/printk.c | 40 +++++++++++++++++++++++++++++++++++++--- kernel/sched/core.c | 2 +- kernel/sched/rt.c | 8 +++++++- 4 files changed, 55 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index f0e22f75143f..1f77a4174ee0 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -100,6 +100,11 @@ int vprintk(const char *fmt, va_list args); asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); +/* + * Special printk facility for scheduler use only, _DO_NOT_USE_ ! + */ +__printf(1, 2) __cold int printk_sched(const char *fmt, ...); + /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use @@ -127,6 +132,11 @@ int printk(const char *s, ...) { return 0; } +static inline __printf(1, 2) __cold +int printk_sched(const char *s, ...) +{ + return 0; +} static inline int printk_ratelimit(void) { return 0; diff --git a/kernel/printk.c b/kernel/printk.c index 13c0a1143f49..7ca7ba591e21 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1208,13 +1208,47 @@ int is_console_locked(void) return console_locked; } +/* + * Delayed printk facility, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE 512 + +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_SCHED 0x02 + static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); + +int printk_sched(const char *fmt, ...) +{ + unsigned long flags; + va_list args; + char *buf; + int r; + + local_irq_save(flags); + buf = __get_cpu_var(printk_sched_buf); + + va_start(args, fmt); + r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); + va_end(args); + + __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + local_irq_restore(flags); + + return r; +} void printk_tick(void) { if (__this_cpu_read(printk_pending)) { - __this_cpu_write(printk_pending, 0); - wake_up_interruptible(&log_wait); + int pending = __this_cpu_xchg(printk_pending, 0); + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); + } + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); } } @@ -1228,7 +1262,7 @@ int printk_needs_cpu(int cpu) void wake_up_klogd(void) { if (waitqueue_active(&log_wait)) - this_cpu_write(printk_pending, 1); + this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b1ccce819ce2..8781cec7c3e6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1284,7 +1284,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * leave kernel. */ if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", + printk_sched("process %d (%s) no longer affine to cpu%d\n", task_pid_nr(p), p->comm, cpu); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7f7e7cdcb472..b60dad720173 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -864,8 +864,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. */ if (likely(rt_b->rt_runtime)) { + static bool once = false; + rt_rq->rt_throttled = 1; - printk_once(KERN_WARNING "sched: RT throttling activated\n"); + + if (!once) { + once = true; + printk_sched("sched: RT throttling activated\n"); + } } else { /* * In case we did anyway, make it go away, -- cgit v1.2.3 From 8e3fabfde445a872c8aec2296846badf24d7c8b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Mar 2012 18:54:26 +0100 Subject: sched: Update yield() docs Suggested-by: Joe Perches Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1331056466.11248.327.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8781cec7c3e6..47614a5cdd47 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4577,8 +4577,24 @@ EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. * - * This is a shortcut for kernel-space yielding - it marks the - * thread runnable and calls sys_sched_yield(). + * Do not ever use this function, there's a 99% chance you're doing it wrong. + * + * The scheduler is at all times free to pick the calling task as the most + * eligible task to run, if removing the yield() call from your code breaks + * it, its already broken. + * + * Typical broken usage is: + * + * while (!event) + * yield(); + * + * where one assumes that yield() will let 'the other' process run that will + * make event true. If the current task is a SCHED_FIFO task that will never + * happen. Never use yield() as a progress guarantee!! + * + * If you want to use yield() to wait for something, use wait_event(). + * If you want to use yield() to be 'nice' for others, use cond_resched(). + * If you still want to use yield(), do not! */ void __sched yield(void) { -- cgit v1.2.3 From c308b56b5398779cd3da0f62ab26b0453494c3d4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Mar 2012 15:04:46 +0100 Subject: sched: Fix nohz load accounting -- again! MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various people reported nohz load tracking still being wrecked, but Doug spotted the actual problem. We fold the nohz remainder in too soon, causing us to loose samples and under-account. So instead of playing catch-up up-front, always do a single load-fold with whatever state we encounter and only then fold the nohz remainder and play catch-up. Reported-by: Doug Smythies Reported-by: LesÃ…=82aw Kope=C4=87 Reported-by: Aman Gupta Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-4v31etnhgg9kwd6ocgx3rxl8@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 53 ++++++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 47614a5cdd47..e3ccc13c4caa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2266,13 +2266,10 @@ calc_load_n(unsigned long load, unsigned long exp, * Once we've updated the global active value, we need to apply the exponential * weights adjusted to the number of cycles missed. */ -static void calc_global_nohz(unsigned long ticks) +static void calc_global_nohz(void) { long delta, active, n; - if (time_before(jiffies, calc_load_update)) - return; - /* * If we crossed a calc_load_update boundary, make sure to fold * any pending idle changes, the respective CPUs might have @@ -2284,31 +2281,25 @@ static void calc_global_nohz(unsigned long ticks) atomic_long_add(delta, &calc_load_tasks); /* - * If we were idle for multiple load cycles, apply them. + * It could be the one fold was all it took, we done! */ - if (ticks >= LOAD_FREQ) { - n = ticks / LOAD_FREQ; + if (time_before(jiffies, calc_load_update + 10)) + return; - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - calc_load_update += n * LOAD_FREQ; - } + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - /* - * Its possible the remainder of the above division also crosses - * a LOAD_FREQ period, the regular check in calc_global_load() - * which comes after this will take care of that. - * - * Consider us being 11 ticks before a cycle completion, and us - * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will - * age us 4 cycles, and the test in calc_global_load() will - * pick up the final one. - */ + calc_load_update += n * LOAD_FREQ; } #else void calc_load_account_idle(struct rq *this_rq) @@ -2320,7 +2311,7 @@ static inline long calc_load_fold_idle(void) return 0; } -static void calc_global_nohz(unsigned long ticks) +static void calc_global_nohz(void) { } #endif @@ -2348,8 +2339,6 @@ void calc_global_load(unsigned long ticks) { long active; - calc_global_nohz(ticks); - if (time_before(jiffies, calc_load_update + 10)) return; @@ -2361,6 +2350,16 @@ void calc_global_load(unsigned long ticks) avenrun[2] = calc_load(avenrun[2], EXP_15, active); calc_load_update += LOAD_FREQ; + + /* + * Account one period with whatever state we found before + * folding in the nohz state and ageing the entire idle period. + * + * This avoids loosing a sample when we go idle between + * calc_load_account_active() (10 ticks ago) and now and thus + * under-accounting. + */ + calc_global_nohz(); } /* -- cgit v1.2.3 From 600e145882802d6ccbfe2c4aea243d97caeb91a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Mar 2012 12:35:37 +0100 Subject: printk: Make it compile with !CONFIG_PRINTK Commit 3ccf3e830615 ("printk/sched: Introduce special printk_sched() for those awkward moments") overlooked an #ifdef, so move code around to respect these directives. Reported-by: Stephen Rothwell Signed-off-by: Peter Zijlstra Cc: Randy Dunlap Link: http://lkml.kernel.org/r/1331811337.18960.179.camel@twins Signed-off-by: Ingo Molnar --- kernel/printk.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 49a2ae4a8dc7..b64ce71cb2e5 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1222,26 +1222,6 @@ int is_console_locked(void) static DEFINE_PER_CPU(int, printk_pending); static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); -int printk_sched(const char *fmt, ...) -{ - unsigned long flags; - va_list args; - char *buf; - int r; - - local_irq_save(flags); - buf = __get_cpu_var(printk_sched_buf); - - va_start(args, fmt); - r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); - va_end(args); - - __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); - local_irq_restore(flags); - - return r; -} - void printk_tick(void) { if (__this_cpu_read(printk_pending)) { @@ -1658,6 +1638,26 @@ late_initcall(printk_late_init); #if defined CONFIG_PRINTK +int printk_sched(const char *fmt, ...) +{ + unsigned long flags; + va_list args; + char *buf; + int r; + + local_irq_save(flags); + buf = __get_cpu_var(printk_sched_buf); + + va_start(args, fmt); + r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); + va_end(args); + + __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + local_irq_restore(flags); + + return r; +} + /* * printk rate limiting, lifted from the networking subsystem. * -- cgit v1.2.3