From a4c96ae319b8047f62dedbe1eac79e321c185749 Mon Sep 17 00:00:00 2001
From: Peter Boonstoppel
Date: Thu, 9 Aug 2012 15:34:47 -0700
Subject: sched: Unthrottle rt runqueues in __disable_runtime()

migrate_tasks() uses _pick_next_task_rt() to get tasks from the
real-time runqueues to be migrated. When rt_rq is throttled
_pick_next_task_rt() won't return anything, in which case
migrate_tasks() can't move all threads over and gets stuck in an
infinite loop.

Instead unthrottle rt runqueues before migrating tasks.

Additionally: move unthrottle_offline_cfs_rqs() to rq_offline_fair()

Signed-off-by: Peter Boonstoppel
Signed-off-by: Peter Zijlstra
Cc: Paul Turner
Link: http://lkml.kernel.org/r/5FBF8E85CA34454794F0F7ECBA79798F379D3648B7@HQMAIL04.nvidia.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c219bf8d704c..86ad83c45dae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	hrtimer_cancel(&cfs_b->slack_timer);
 }
 
-void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
 {
 	struct cfs_rq *cfs_rq;
 
@@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 	return NULL;
 }
 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 
 #endif /* CONFIG_CFS_BANDWIDTH */
 
@@ -4956,6 +4956,9 @@ static void rq_online_fair(struct rq *rq)
 static void rq_offline_fair(struct rq *rq)
 {
 	update_sysctl();
+
+	/* Ensure any throttled groups are reachable by pick_next_task */
+	unthrottle_offline_cfs_rqs(rq);
 }
 
 #endif /* CONFIG_SMP */
--
cgit v1.2.3
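The infinite loop described in the patch above is easy to model outside the kernel: a drain loop that repeatedly asks a picker for the next task can never finish if the picker refuses to hand out tasks from a throttled queue. The standalone C sketch below illustrates that pattern and why clearing the throttled state first lets the drain complete; toy_rq, pick_next() and drain() are invented names for illustration only, not kernel interfaces.

#include <stdio.h>
#include <stdbool.h>

/* Toy run queue: a few queued tasks plus a throttled flag. */
struct toy_rq {
	int nr_queued;
	bool throttled;
};

/* Mimics the picker: a throttled queue yields nothing. */
static int pick_next(struct toy_rq *rq)
{
	if (rq->throttled || rq->nr_queued == 0)
		return -1;		/* nothing pickable */
	return rq->nr_queued--;		/* "migrate" one task */
}

/* Mimics the drain loop: keep picking until the queue is empty. */
static void drain(struct toy_rq *rq)
{
	while (rq->nr_queued > 0) {
		if (pick_next(rq) < 0) {
			/* Without unthrottling first, this would spin forever. */
			printf("stuck: %d tasks unreachable\n", rq->nr_queued);
			return;
		}
	}
	printf("drained\n");
}

int main(void)
{
	struct toy_rq rq = { .nr_queued = 3, .throttled = true };

	rq.throttled = false;	/* the fix: unthrottle before draining */
	drain(&rq);
	return 0;
}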
From 9450d57eab5cad36774c297da123062744472588 Mon Sep 17 00:00:00 2001
From: Randy Dunlap
Date: Sat, 18 Aug 2012 17:45:08 -0700
Subject: sched: Fix kernel-doc warnings in kernel/sched/fair.c

Fix two kernel-doc warnings in kernel/sched/fair.c:

  Warning(kernel/sched/fair.c:3660): Excess function parameter 'cpus' description in 'update_sg_lb_stats'
  Warning(kernel/sched/fair.c:3806): Excess function parameter 'cpus' description in 'update_sd_lb_stats'

Signed-off-by: Randy Dunlap
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/50303714.3090204@xenotime.net
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 86ad83c45dae..42d9df6a5ca4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3658,7 +3658,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * @group: sched_group whose statistics are to be updated.
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
  * @local_group: Does group contain this_cpu.
- * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sgs: variable to hold the statistics for this group.
  */
@@ -3805,7 +3804,6 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
- * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
  */
--
cgit v1.2.3
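For reference, kernel-doc emits the "Excess function parameter" warning above whenever an @name: line in a /** ... */ block does not name an actual parameter of the function that follows. A minimal illustration with a made-up function (not scheduler code):

/**
 * add_weight - combine a load value with an increment
 * @load: current load value.
 * @inc: increment to apply.
 *
 * Each parameter line must match the signature below; a leftover line
 * such as "@cpus: ..." after that parameter is removed triggers the
 * kind of "Excess function parameter" warning fixed above.
 */
static unsigned long add_weight(unsigned long load, unsigned long inc)
{
	return load + inc;
}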
From 37407ea7f93864c2cfc03edf8f37872ec539ea2b Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 16 Sep 2012 12:29:43 -0700
Subject: Revert "sched: Improve scalability via 'CPU buddies', which withstand random perturbations"

This reverts commit 970e178985cadbca660feb02f4d2ee3a09f7fdda.

Nikolay Ulyanitsky reported that the 3.6-rc5 kernel has a 15-20%
performance drop on PostgreSQL 9.2 on his machine (running "pgbench").

Borislav Petkov was able to reproduce this, and bisected it to this
commit 970e178985ca ("sched: Improve scalability via 'CPU buddies' ...")
apparently because the new single-idle-buddy model simply doesn't find
idle CPU's to reschedule on aggressively enough.

Mike Galbraith suspects that it is likely due to the user-mode spinlocks
in PostgreSQL not reacting well to preemption, but we don't really know
the details - I'll just revert the commit for now.

There are hopefully other approaches to improve scheduler scalability
without it causing these kinds of downsides.

Reported-by: Nikolay Ulyanitsky
Bisected-by: Borislav Petkov
Acked-by: Mike Galbraith
Cc: Andrew Morton
Cc: Thomas Gleixner
Cc: Ingo Molnar
Signed-off-by: Linus Torvalds
---
 include/linux/sched.h |  1 -
 kernel/sched/core.c   | 39 +--------------------------------------
 kernel/sched/fair.c   | 28 +++++++++++++++++++++-------
 3 files changed, 22 insertions(+), 46 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b8c86648a2f9..23bddac4bad8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -954,7 +954,6 @@ struct sched_domain {
 	unsigned int smt_gain;
 	int flags;			/* See SD_* */
 	int level;
-	int idle_buddy;			/* cpu assigned to select_idle_sibling() */
 
 	/* Runtime fields. */
 	unsigned long last_balance;	/* init to jiffies. units in jiffies */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a4ea245f3d85..649c9f876cb1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6014,11 +6014,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
- * Iterate domains and sched_groups downward, assigning CPUs to be
- * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
- * due to random perturbation self canceling, ie sw buddies pull
- * their counterpart to their CPU's hw counterpart.
- *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6032,40 +6027,8 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-	if (sd) {
-		struct sched_domain *tmp = sd;
-		struct sched_group *sg, *prev;
-		bool right;
-
-		/*
-		 * Traverse to first CPU in group, and count hops
-		 * to cpu from there, switching direction on each
-		 * hop, never ever pointing the last CPU rightward.
-		 */
-		do {
-			id = cpumask_first(sched_domain_span(tmp));
-			prev = sg = tmp->groups;
-			right = 1;
-
-			while (cpumask_first(sched_group_cpus(sg)) != id)
-				sg = sg->next;
-
-			while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
-				prev = sg;
-				sg = sg->next;
-				right = !right;
-			}
-
-			/* A CPU went down, never point back to domain start. */
-			if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
-				right = false;
-
-			sg = right ? sg->next : prev;
-			tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
-		} while ((tmp = tmp->child));
-
+	if (sd)
 		id = cpumask_first(sched_domain_span(sd));
-	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_id, cpu) = id;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 42d9df6a5ca4..96e2b18b6283 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
+	struct sched_group *sg;
+	int i;
 
 	/*
 	 * If the task is going to be woken-up on this cpu and if it is
@@ -2653,17 +2655,29 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		return prev_cpu;
 
 	/*
-	 * Otherwise, check assigned siblings to find an elegible idle cpu.
+	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
-
 	for_each_lower_domain(sd) {
-		if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
-			continue;
-		if (idle_cpu(sd->idle_buddy))
-			return sd->idle_buddy;
-	}
+		sg = sd->groups;
+		do {
+			if (!cpumask_intersects(sched_group_cpus(sg),
+						tsk_cpus_allowed(p)))
+				goto next;
 
+			for_each_cpu(i, sched_group_cpus(sg)) {
+				if (!idle_cpu(i))
+					goto next;
+			}
+
+			target = cpumask_first_and(sched_group_cpus(sg),
+					tsk_cpus_allowed(p));
+			goto done;
+next:
+			sg = sg->next;
+		} while (sg != sd->groups);
+	}
+done:
 	return target;
 }
--
cgit v1.2.3
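The behavioural difference behind this revert can be sketched in a small standalone model: the 'CPU buddies' scheme probes a single pre-assigned sibling, while the restored select_idle_sibling() walks the sched_groups of the cache domain and takes the first group whose CPUs are all idle. The toy program below models only that search policy with plain arrays; pick_buddy() and pick_group_scan() are invented names and nothing here is kernel API.

#include <stdio.h>

#define NCPUS 8

/* 1 = CPU is busy, 0 = CPU is idle. */
static const int busy[NCPUS] = { 1, 1, 0, 0, 1, 0, 1, 1 };

/* Buddy-style policy: probe one pre-assigned sibling, else stay put. */
static int pick_buddy(int target, int buddy)
{
	return busy[buddy] ? target : buddy;
}

/*
 * Restored policy: scan groups of siblings (pairs here) and take the
 * first group in which every CPU is idle, like the reinstated loop
 * over sched_groups in select_idle_sibling().
 */
static int pick_group_scan(int target)
{
	int g;

	for (g = 0; g < NCPUS; g += 2) {
		if (!busy[g] && !busy[g + 1])
			return g;	/* first CPU of an all-idle group */
	}
	return target;			/* nothing idle found; stay put */
}

int main(void)
{
	/* The buddy probe gives up; the group scan still finds idle CPU 2. */
	printf("buddy policy picks cpu %d\n", pick_buddy(0, 1));
	printf("group scan picks   cpu %d\n", pick_group_scan(0));
	return 0;
}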