From 4e2dcb73aecbde9fe4e3137c9ea35cb6aa6cb286 Mon Sep 17 00:00:00 2001
From: Zhang Hang
Date: Wed, 10 Apr 2013 14:04:55 +0800
Subject: sched: Simplify can_migrate_task()

At this point tsk_cache_hot is always true, so no need to check it.

Signed-off-by: Zhang Hang
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/51650107.9040606@huawei.com
[ Also remove unnecessary schedstat #ifdefs. ]
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 539760ef00c4..bf8ab4f5e603 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3921,20 +3921,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
         tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
         if (!tsk_cache_hot ||
                 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-#ifdef CONFIG_SCHEDSTATS
+
                 if (tsk_cache_hot) {
                         schedstat_inc(env->sd, lb_hot_gained[env->idle]);
                         schedstat_inc(p, se.statistics.nr_forced_migrations);
                 }
-#endif
+
                 return 1;
         }

-        if (tsk_cache_hot) {
-                schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
-                return 0;
-        }
-        return 1;
+        schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+        return 0;
 }

 /*
-- cgit v1.2.3

From b9b0853a4b377f84a5e6ed091816a9a2d6b10918 Mon Sep 17 00:00:00 2001
From: Libin
Date: Mon, 1 Apr 2013 19:14:01 +0800
Subject: sched: Fix comment in rebalance_domains()

A comment in function rebalance_domains() mentions
arch_init_sched_domains(), but that function does not exist anymore.
The proper function is init_sched_domains().

Signed-off-by: Libin
Cc:
Link: http://lkml.kernel.org/r/1364814841-49156-1-git-send-email-huawei.libin@huawei.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bf8ab4f5e603..155783b4e4bf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5466,7 +5466,7 @@ void update_max_interval(void)
  * It checks each scheduling domain to see if it is due to be balanced,
  * and initiates a balancing operation if so.
  *
- * Balancing parameters are set up in arch_init_sched_domains.
+ * Balancing parameters are set up in init_sched_domains.
  */
 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 {
-- cgit v1.2.3
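A side note on the schedstat #ifdef removed in the first patch: the guard was redundant because schedstat_inc() is already defined to compile away when schedstats are disabled. Paraphrased from kernel/sched/stats.h of that era (a sketch, not the verbatim source):

#ifdef CONFIG_SCHEDSTATS
# define schedstat_inc(rq, field)  do { (rq)->field++; } while (0)
#else
# define schedstat_inc(rq, field)  do { } while (0)
#endif

So call sites need no conditional compilation of their own; with schedstats off, the whole if (tsk_cache_hot) block reduces to a side-effect-free branch the compiler can drop.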
From 642dbc39ab1ea00f47e0fee1b8e8a27da036d940 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Thu, 18 Apr 2013 18:34:26 +0200
Subject: sched: Fix wrong rq's runnable_avg update with rt tasks

The current update of the rq's load can be erroneous when RT tasks are
involved.

The update of the load of an rq that becomes idle is done only if the
avg_idle is less than sysctl_sched_migration_cost. If RT tasks and short
idle durations alternate, the runnable_avg will not be updated correctly
and the time will be accounted as idle time when a CFS task wakes up.

A new idle_enter function is called when the next task is the idle
function, so the elapsed time will be accounted as run time in the load
of the rq, whatever the average idle time is. The function
update_rq_runnable_avg is removed from idle_balance.

When an RT task is scheduled on an idle CPU, the update of the rq's load
is not done when the rq exits idle state because CFS's functions are not
called. Then idle_balance, which is called just before entering the idle
function, updates the rq's load and makes the assumption that the
elapsed time since the last update was only running time. As a
consequence, the rq's load of a CPU that only runs a periodic RT task
stays close to LOAD_AVG_MAX, whatever the actual running duration of the
RT task is.

A new idle_exit function is called when the prev task is the idle
function, so the elapsed time will be accounted as idle time in the rq's
load.

Signed-off-by: Vincent Guittot
Acked-by: Peter Zijlstra
Acked-by: Steven Rostedt
Cc: linaro-kernel@lists.linaro.org
Cc: peterz@infradead.org
Cc: pjt@google.com
Cc: fweisbec@gmail.com
Cc: efault@gmx.de
Link: http://lkml.kernel.org/r/1366302867-5055-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c      | 23 +++++++++++++++++++++--
 kernel/sched/idle_task.c | 16 ++++++++++++++++
 kernel/sched/sched.h     | 12 ++++++++++++
 3 files changed, 49 insertions(+), 2 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 155783b4e4bf..1c977350e322 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1563,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
                 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
         } /* migrations, e.g. sleep=0 leave decay_count == 0 */
 }
+
+/*
+ * Update the rq's load with the elapsed running time before entering
+ * idle. if the last scheduled task is not a CFS task, idle_enter will
+ * be the only way to update the runnable statistic.
+ */
+void idle_enter_fair(struct rq *this_rq)
+{
+        update_rq_runnable_avg(this_rq, 1);
+}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit_fair(struct rq *this_rq)
+{
+        update_rq_runnable_avg(this_rq, 0);
+}
+
 #else
 static inline void update_entity_load_avg(struct sched_entity *se,
                                           int update_cfs_rq) {}
@@ -5217,8 +5238,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
         if (this_rq->avg_idle < sysctl_sched_migration_cost)
                 return;

-        update_rq_runnable_avg(this_rq, 1);
-
         /*
          * Drop the rq->lock, but keep IRQ/preempt disabled.
          */
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae9..b8ce77328341 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
 {
         return task_cpu(p); /* IDLE tasks as never migrated */
 }
+
+static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
+{
+        idle_exit_fair(rq);
+}
+
+static void post_schedule_idle(struct rq *rq)
+{
+        idle_enter_fair(rq);
+}
 #endif /* CONFIG_SMP */

 /*
  * Idle tasks are unconditionally rescheduled:
@@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
         schedstat_inc(rq, sched_goidle);
+#ifdef CONFIG_SMP
+        /* Trigger the post schedule to do an idle_enter for CFS */
+        rq->post_schedule = 1;
+#endif
         return rq->idle;
 }
@@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = {

 #ifdef CONFIG_SMP
         .select_task_rq         = select_task_rq_idle,
+        .pre_schedule           = pre_schedule_idle,
+        .post_schedule          = post_schedule_idle,
 #endif

         .set_curr_task          = set_curr_task_idle,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8116cf8e350f..605426a63588 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1024,6 +1024,18 @@ extern void update_group_power(struct sched_domain *sd, int cpu);
 extern void trigger_load_balance(struct rq *rq, int cpu);
 extern void idle_balance(int this_cpu, struct rq *this_rq);

+/*
+ * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
+ * becomes useful in lb
+ */
+#if defined(CONFIG_FAIR_GROUP_SCHED)
+extern void idle_enter_fair(struct rq *this_rq);
+extern void idle_exit_fair(struct rq *this_rq);
+#else
+static inline void idle_enter_fair(struct rq *this_rq) {}
+static inline void idle_exit_fair(struct rq *this_rq) {}
+#endif
+
 #else /* CONFIG_SMP */

 static inline void idle_balance(int cpu, struct rq *rq)
-- cgit v1.2.3
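The saturation described in that changelog is easy to reproduce with a toy model of a geometric runnable average (a userspace sketch, not kernel code; the per-millisecond decay factor mimics the kernel's 32 ms half-life series, and the 1-in-10 duty cycle stands in for a periodic RT task):

#include <stdio.h>

int main(void)
{
        const double decay = 0.97857206;        /* y with y^32 = 0.5 */
        double avg_buggy = 0.0, avg_fixed = 0.0;

        for (int ms = 0; ms < 10000; ms++) {
                int running = (ms % 10) == 0;   /* task runs 1 ms in 10 */

                /* bug: elapsed idle time accounted as running time */
                avg_buggy = avg_buggy * decay + 1.0;
                /* fix: only real run time contributes */
                avg_fixed = avg_fixed * decay + (running ? 1.0 : 0.0);
        }
        printf("buggy: %3.0f%% runnable\n", 100.0 * avg_buggy * (1.0 - decay));
        printf("fixed: %3.0f%% runnable\n", 100.0 * avg_fixed * (1.0 - decay));
        return 0;
}

The mis-accounted average converges to 100% "runnable" - the analogue of sticking at LOAD_AVG_MAX - while the corrected one settles near the task's real ~10% duty cycle.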
From f1cd0858100c67273f2c74344e0c464344c4a982 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Tue, 23 Apr 2013 17:27:37 +0900
Subject: sched: Change position of resched_cpu() in load_balance()

cur_ld_moved is reset if env.flags hits LBF_NEED_BREAK, so there is a
possibility that we miss doing resched_cpu(). Correct this by moving
resched_cpu() before the LBF_NEED_BREAK check.

Signed-off-by: Joonsoo Kim
Tested-by: Jason Low
Acked-by: Peter Zijlstra
Cc: Srivatsa Vaddagiri
Cc: Davidlohr Bueso
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1366705662-3587-2-git-send-email-iamjoonsoo.kim@lge.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1c977350e322..25aaf93281de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5080,17 +5080,17 @@ more_balance:
                 double_rq_unlock(env.dst_rq, busiest);
                 local_irq_restore(flags);

-                if (env.flags & LBF_NEED_BREAK) {
-                        env.flags &= ~LBF_NEED_BREAK;
-                        goto more_balance;
-                }
-
                 /*
                  * some other cpu did the load balance for us.
                  */
                 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
                         resched_cpu(env.dst_cpu);

+                if (env.flags & LBF_NEED_BREAK) {
+                        env.flags &= ~LBF_NEED_BREAK;
+                        goto more_balance;
+                }
+
                 /*
                  * Revisit (affine) tasks on src_cpu that couldn't be moved to
                  * us and move them to an alternate dst_cpu in our sched_group
-- cgit v1.2.3

From de5eb2dd7f171ee8a45d23cd41aa2efe9ab922b3 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Tue, 23 Apr 2013 17:27:38 +0900
Subject: sched: Explicitly cpu_idle_type checking in rebalance_domains()

After commit 88b8dac0, dst-cpu can be changed in load_balance(), so we
can't know the cpu_idle_type of dst-cpu when load_balance() returns a
positive value. So, add explicit cpu_idle_type checking.

Signed-off-by: Joonsoo Kim
Tested-by: Jason Low
Acked-by: Peter Zijlstra
Cc: Srivatsa Vaddagiri
Cc: Davidlohr Bueso
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1366705662-3587-3-git-send-email-iamjoonsoo.kim@lge.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 25aaf93281de..726e12905725 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5523,10 +5523,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
                         if (load_balance(cpu, rq, sd, idle, &balance)) {
                                 /*
-                                 * We've pulled tasks over so either we're no
-                                 * longer idle.
+                                 * The LBF_SOME_PINNED logic could have changed
+                                 * env->dst_cpu, so we can't know our idle
+                                 * state even if we migrated tasks. Update it.
                                  */
-                                idle = CPU_NOT_IDLE;
+                                idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
                         }
                         sd->last_balance = jiffies;
                 }
-- cgit v1.2.3
From cfc03118047172f5bdc58d63c607d16d33ce5305 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Tue, 23 Apr 2013 17:27:39 +0900
Subject: sched: Don't consider other cpus in our group in case of NEWLY_IDLE

Commit 88b8dac0 makes load_balance() consider other cpus in its group,
regardless of idle type. When we do NEWLY_IDLE balancing we should not
consider them, because the motivation of NEWLY_IDLE balancing is to turn
this cpu into a non-idle state if needed. That is not the case for the
other cpus. So, change the code not to consider other cpus for
NEWLY_IDLE balancing.

With this patch, the 'if (pulled_task) this_rq->idle_stamp = 0'
assignment in idle_balance() is corrected, because NEWLY_IDLE balancing
no longer considers other cpus. Assigning to 'this_rq->idle_stamp' is
now valid.

Signed-off-by: Joonsoo Kim
Tested-by: Jason Low
Acked-by: Peter Zijlstra
Cc: Srivatsa Vaddagiri
Cc: Davidlohr Bueso
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1366705662-3587-4-git-send-email-iamjoonsoo.kim@lge.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 726e12905725..dfa92b7b3dec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5026,8 +5026,21 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                 .cpus           = cpus,
         };

+        /*
+         * For NEWLY_IDLE load_balancing, we don't need to consider
+         * other cpus in our group
+         */
+        if (idle == CPU_NEWLY_IDLE) {
+                env.dst_grpmask = NULL;
+                /*
+                 * we don't care max_lb_iterations in this case,
+                 * in following patch, this will be removed
+                 */
+                max_lb_iterations = 0;
+        } else
+                max_lb_iterations = cpumask_weight(env.dst_grpmask);
+
         cpumask_copy(cpus, cpu_active_mask);
-        max_lb_iterations = cpumask_weight(env.dst_grpmask);

         schedstat_inc(sd, lb_count[idle]);
-- cgit v1.2.3
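The special case above boils down to a one-line policy. A minimal sketch with simplified, hypothetical types (the real lb_env carries far more state, and the kernel uses struct cpumask rather than a raw pointer):

enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE };

/*
 * Offer sibling cpus as alternate pull targets for periodic and idle
 * balancing, but never for NEWLY_IDLE balancing, whose sole purpose is
 * to put *this* cpu back to work.
 */
static const unsigned long *
pick_dst_grpmask(enum cpu_idle_type idle, const unsigned long *group_mask)
{
        return (idle == CPU_NEWLY_IDLE) ? NULL : group_mask;
}

int main(void)
{
        unsigned long group = 0x6;      /* cpus 1 and 2 */
        /* a newly-idle balance gets no alternate destinations */
        return pick_dst_grpmask(CPU_NEWLY_IDLE, &group) ? 1 : 0;
}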
From d31980846f9688db3ee3e5863525c6ff8ace4c7c Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Tue, 23 Apr 2013 17:27:40 +0900
Subject: sched: Move up affinity check to mitigate useless redoing overhead

Currently, LBF_ALL_PINNED is cleared after the affinity check is passed.
So, if task migration is skipped due to a small load value or a small
imbalance value in move_tasks(), we don't clear LBF_ALL_PINNED and
eventually trigger 'redo' in load_balance(). The imbalance value is
often so small that no task can be moved to other cpus, and of course
this situation may continue after we change the target cpu. So this
patch moves the affinity check up and clears LBF_ALL_PINNED before
evaluating the load value, in order to mitigate the useless redoing
overhead. In addition, it re-orders some comments correctly.

Signed-off-by: Joonsoo Kim
Acked-by: Peter Zijlstra
Tested-by: Jason Low
Cc: Srivatsa Vaddagiri
Cc: Davidlohr Bueso
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1366705662-3587-5-git-send-email-iamjoonsoo.kim@lge.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dfa92b7b3dec..b8ef321641df 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3896,10 +3896,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
         int tsk_cache_hot = 0;
         /*
          * We do not migrate tasks that are:
-         * 1) running (obviously), or
+         * 1) throttled_lb_pair, or
          * 2) cannot be migrated to this CPU due to cpus_allowed, or
-         * 3) are cache-hot on their current CPU.
+         * 3) running (obviously), or
+         * 4) are cache-hot on their current CPU.
          */
+        if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+                return 0;
+
         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
                 int new_dst_cpu;

@@ -3967,9 +3971,6 @@ static int move_one_task(struct lb_env *env)
         struct task_struct *p, *n;

         list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
-                if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
-                        continue;
-
                 if (!can_migrate_task(p, env))
                         continue;

@@ -4021,7 +4022,7 @@ static int move_tasks(struct lb_env *env)
                         break;
                 }

-                if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+                if (!can_migrate_task(p, env))
                         goto next;

                 load = task_h_load(p);
@@ -4032,9 +4033,6 @@ static int move_tasks(struct lb_env *env)
                 if ((load / 2) > env->imbalance)
                         goto next;

-                if (!can_migrate_task(p, env))
-                        goto next;
-
                 move_task(p, env);
                 pulled++;
                 env->imbalance -= load;
-- cgit v1.2.3

From e6252c3ef4b9cd251b53f7b68035f395d20b044e Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Tue, 23 Apr 2013 17:27:41 +0900
Subject: sched: Rename load_balance_tmpmask to load_balance_mask

This name doesn't convey a specific meaning, so rename it to imply its
purpose.

Signed-off-by: Joonsoo Kim
Acked-by: Peter Zijlstra
Tested-by: Jason Low
Cc: Srivatsa Vaddagiri
Cc: Davidlohr Bueso
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1366705662-3587-6-git-send-email-iamjoonsoo.kim@lge.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 4 ++--
 kernel/sched/fair.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ee8c1bd703fe..cb49b2ab0e16 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6873,7 +6873,7 @@ struct task_group root_task_group;
 LIST_HEAD(task_groups);
 #endif

-DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);

 void __init sched_init(void)
 {
@@ -6910,7 +6910,7 @@ void __init sched_init(void)
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_CPUMASK_OFFSTACK
         for_each_possible_cpu(i) {
-                per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+                per_cpu(load_balance_mask, i) = (void *)ptr;
                 ptr += cpumask_size();
         }
 #endif /* CONFIG_CPUMASK_OFFSTACK */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8ef321641df..5b1e96687b49 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4977,7 +4977,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 #define MAX_PINNED_INTERVAL     512

 /* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);

 static int need_active_balance(struct lb_env *env)
 {
@@ -5012,7 +5012,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
         struct sched_group *group;
         struct rq *busiest;
         unsigned long flags;
-        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
+        struct cpumask *cpus = __get_cpu_var(load_balance_mask);

         struct lb_env env = {
                 .sd             = sd,
-- cgit v1.2.3
From e02e60c109ca70935bad1131976bdbf5160cf576 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Tue, 23 Apr 2013 17:27:42 +0900
Subject: sched: Prevent to re-select dst-cpu in load_balance()

Commit 88b8dac0 makes load_balance() consider other cpus in its group,
but there is no code to prevent re-selecting the dst-cpu, so the same
dst-cpu can be selected over and over. This patch adds functionality to
load_balance() to exclude a cpu once it has been selected.

We prevent re-selecting the dst_cpu via env's cpus, so env's cpus is now
a candidate mask not only for src cpus but also for dst cpus. With this
patch we can also remove lb_iterations and max_lb_iterations, because we
now decide whether we can go ahead via env's cpus.

Signed-off-by: Joonsoo Kim
Acked-by: Peter Zijlstra
Tested-by: Jason Low
Cc: Srivatsa Vaddagiri
Cc: Davidlohr Bueso
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1366705662-3587-7-git-send-email-iamjoonsoo.kim@lge.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5b1e96687b49..acaf567a03d2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3905,7 +3905,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
                 return 0;

         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
-                int new_dst_cpu;
+                int cpu;

                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);

@@ -3920,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
                 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
                         return 0;

-                new_dst_cpu = cpumask_first_and(env->dst_grpmask,
-                                                tsk_cpus_allowed(p));
-                if (new_dst_cpu < nr_cpu_ids) {
-                        env->flags |= LBF_SOME_PINNED;
-                        env->new_dst_cpu = new_dst_cpu;
+                /* Prevent to re-select dst_cpu via env's cpus */
+                for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+                        if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+                                env->flags |= LBF_SOME_PINNED;
+                                env->new_dst_cpu = cpu;
+                                break;
+                        }
                 }
+
                 return 0;
         }

@@ -5008,7 +5011,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                         int *balance)
 {
         int ld_moved, cur_ld_moved, active_balance = 0;
-        int lb_iterations, max_lb_iterations;
         struct sched_group *group;
         struct rq *busiest;
         unsigned long flags;
@@ -5028,15 +5030,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
          * For NEWLY_IDLE load_balancing, we don't need to consider
          * other cpus in our group
          */
-        if (idle == CPU_NEWLY_IDLE) {
+        if (idle == CPU_NEWLY_IDLE)
                 env.dst_grpmask = NULL;
-                /*
-                 * we don't care max_lb_iterations in this case,
-                 * in following patch, this will be removed
-                 */
-                max_lb_iterations = 0;
-        } else
-                max_lb_iterations = cpumask_weight(env.dst_grpmask);

         cpumask_copy(cpus, cpu_active_mask);

@@ -5064,7 +5059,6 @@ redo:
         schedstat_add(sd, lb_imbalance[idle], env.imbalance);

         ld_moved = 0;
-        lb_iterations = 1;
         if (busiest->nr_running > 1) {
                 /*
                  * Attempt to move tasks. If find_busiest_group has found
@@ -5121,14 +5115,17 @@ more_balance:
                  * moreover subsequent load balance cycles should correct the
                  * excess load moved.
                  */
-                if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
-                                lb_iterations++ < max_lb_iterations) {
+                if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {

                         env.dst_rq       = cpu_rq(env.new_dst_cpu);
                         env.dst_cpu      = env.new_dst_cpu;
                         env.flags       &= ~LBF_SOME_PINNED;
                         env.loop         = 0;
                         env.loop_break   = sched_nr_migrate_break;
+
+                        /* Prevent to re-select dst_cpu via env's cpus */
+                        cpumask_clear_cpu(env.dst_cpu, env.cpus);
+
                         /*
                          * Go back to "more_balance" rather than "redo" since we
                          * need to continue with same src_cpu.
-- cgit v1.2.3
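The invariant introduced above - every destination tried once is struck from the candidate mask, so forward progress needs no iteration counter - can be shown with a plain bitmask (a toy model, not the kernel's cpumask API):

#include <stdio.h>

int main(void)
{
        unsigned long cpus = 0xf;       /* cpus 0-3 are candidates */

        for (int dst = 0; dst < 4; dst++) {
                if (!(cpus & (1UL << dst)))
                        continue;       /* already tried, never re-selected */
                printf("balance attempt with dst_cpu=%d\n", dst);
                /* the LBF_SOME_PINNED retry path: exclude this dst for good */
                cpus &= ~(1UL << dst);
        }
        return 0;
}

The mask, not a counter, guarantees that each destination is attempted at most once, which is what let the lb_iterations/max_lb_iterations bookkeeping go away.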
From 25f55d9d01ad7a7ad248fd5af1d22675ffd202c5 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Tue, 23 Apr 2013 16:59:02 +0200
Subject: sched: Fix init NOHZ_IDLE flag

On my SMP platform, which is made of 5 cores in 2 clusters, the
nr_busy_cpus field of the sched_group_power struct is not null when the
platform is fully idle - which makes the scheduler unhappy.

The root cause is: during the boot sequence, some CPUs reach the idle
loop and set their NOHZ_IDLE flag while waiting for other CPUs to boot.
But the nr_busy_cpus field is initialized later, with the assumption
that all CPUs are in the busy state, whereas some CPUs have already set
their NOHZ_IDLE flag.

More generally, the NOHZ_IDLE flag must be initialized when new
sched_domains are created in order to ensure that NOHZ_IDLE and
nr_busy_cpus are aligned.

This condition can be ensured by adding a synchronize_rcu() between the
destruction of old sched_domains and the creation of new ones, so the
NOHZ_IDLE flag will not be updated with an old sched_domain once it has
been initialized. But this solution introduces an additional latency in
the rebuild sequence that is called during cpu hotplug.

As suggested by Frederic Weisbecker, another solution is to have the
same rcu lifecycle for both NOHZ_IDLE and the sched_domain struct. A new
nohz_idle field is added to sched_domain, so both the status and the
sched_domain will share the same RCU lifecycle and will always be
synchronized. In addition, there is no more need to protect nohz_idle
against concurrent access, as it is only modified by two exclusive
functions called by the local cpu.

This solution has been preferred to the creation of a new struct with an
extra pointer indirection for sched_domain.

The synchronization is done at the cost of:
- an additional indirection and an rcu_dereference for accessing
  nohz_idle;
- using only the nohz_idle field of the top sched_domain.

Signed-off-by: Vincent Guittot
Acked-by: Peter Zijlstra
Cc: linaro-kernel@lists.linaro.org
Cc: peterz@infradead.org
Cc: fweisbec@gmail.com
Cc: pjt@google.com
Cc: rostedt@goodmis.org
Cc: efault@gmx.de
Link: http://lkml.kernel.org/r/1366729142-14662-1-git-send-email-vincent.guittot@linaro.org
[ Fixed !NO_HZ build bug. ]
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h |  2 ++
 kernel/sched/fair.c   | 26 ++++++++++++++++----------
 kernel/sched/sched.h  |  1 -
 3 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6bdaa73ede13..a25168f4ab86 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -808,6 +808,8 @@ struct sched_domain {
         unsigned int wake_idx;
         unsigned int forkexec_idx;
         unsigned int smt_gain;
+
+        int nohz_idle;                  /* NOHZ IDLE status */
         int flags;                      /* See SD_* */
         int level;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index acaf567a03d2..8bf7081b1ec5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5420,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void)
         struct sched_domain *sd;
         int cpu = smp_processor_id();

-        if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
-                return;
-        clear_bit(NOHZ_IDLE, nohz_flags(cpu));
-
         rcu_read_lock();
-        for_each_domain(cpu, sd)
+        sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+
+        if (!sd || !sd->nohz_idle)
+                goto unlock;
+        sd->nohz_idle = 0;
+
+        for (; sd; sd = sd->parent)
                 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+unlock:
         rcu_read_unlock();
 }

@@ -5435,13 +5438,16 @@ void set_cpu_sd_state_idle(void)
         struct sched_domain *sd;
         int cpu = smp_processor_id();

-        if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
-                return;
-        set_bit(NOHZ_IDLE, nohz_flags(cpu));
-
         rcu_read_lock();
-        for_each_domain(cpu, sd)
+        sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+
+        if (!sd || sd->nohz_idle)
+                goto unlock;
+        sd->nohz_idle = 1;
+
+        for (; sd; sd = sd->parent)
                 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+unlock:
         rcu_read_unlock();
 }

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 605426a63588..4c225c4c7111 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1303,7 +1303,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
 enum rq_nohz_flag_bits {
         NOHZ_TICK_STOPPED,
         NOHZ_BALANCE_KICK,
-        NOHZ_IDLE,
 };

 #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
-- cgit v1.2.3
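Why tying the flag to the sched_domain closes the race can be seen in a miniature model (hypothetical userspace types, not the kernel structs): a rebuilt domain starts with nohz_idle == 0 together with freshly initialized busy counters, so flag and counters can never disagree, and the early return makes the busy/idle transitions idempotent.

#include <stdio.h>

struct toy_group  { int nr_busy_cpus; };

struct toy_domain {
        struct toy_domain *parent;
        struct toy_group  *group;
        int nohz_idle;          /* lives and dies with the domain */
};

/* mirrors set_cpu_sd_state_busy(): flip the per-domain flag once, then
 * walk up the hierarchy adjusting each level's group counter */
static void toy_set_busy(struct toy_domain *sd)
{
        if (!sd || !sd->nohz_idle)
                return;
        sd->nohz_idle = 0;
        for (; sd; sd = sd->parent)
                sd->group->nr_busy_cpus++;
}

static void toy_set_idle(struct toy_domain *sd)
{
        if (!sd || sd->nohz_idle)
                return;
        sd->nohz_idle = 1;
        for (; sd; sd = sd->parent)
                sd->group->nr_busy_cpus--;
}

int main(void)
{
        struct toy_group  g0 = { 1 }, g1 = { 1 };       /* one busy cpu */
        struct toy_domain top  = { NULL, &g1, 0 };
        struct toy_domain base = { &top, &g0, 0 };

        toy_set_idle(&base);
        toy_set_idle(&base);    /* duplicate call: no double decrement */
        toy_set_busy(&base);    /* and back: counters return to 1 */
        printf("busy: %d %d\n", g0.nr_busy_cpus, g1.nr_busy_cpus);
        return 0;
}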