From 4ec4412e1e91f44a3dcb97b6c9172a13fc78bac9 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 12 Dec 2011 20:21:08 +0100
Subject: sched: Ensure cpu_power periodic update

With a lot of small tasks, the softirq sched is nearly never called
when no_hz is enabled. In this case load_balance() is mainly called
with the newly_idle mode which doesn't update the cpu_power.

Add a next_update field which ensure a maximum update period when
there is short activity.

Having stale cpu_power information can skew the load-balancing
decisions, this is cured by the guaranteed update.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1323717668-2143-1-git-send-email-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c6414fc669d..8e77a6bd597b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -215,6 +215,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 
 const struct sched_class fair_sched_class;
 
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
 /**************************************************************
  * CFS operations on generic schedulable entities:
  */
@@ -3776,6 +3778,11 @@ void update_group_power(struct sched_domain *sd, int cpu)
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
 	unsigned long power;
+	unsigned long interval;
+
+	interval = msecs_to_jiffies(sd->balance_interval);
+	interval = clamp(interval, 1UL, max_load_balance_interval);
+	sdg->sgp->next_update = jiffies + interval;
 
 	if (!child) {
 		update_cpu_power(sd, cpu);
@@ -3883,12 +3890,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	 * domains. In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
-	if (idle != CPU_NEWLY_IDLE && local_group) {
-		if (balance_cpu != this_cpu) {
-			*balance = 0;
-			return;
-		}
-		update_group_power(sd, this_cpu);
+	if (local_group) {
+		if (idle != CPU_NEWLY_IDLE) {
+			if (balance_cpu != this_cpu) {
+				*balance = 0;
+				return;
+			}
+			update_group_power(sd, this_cpu);
+		} else if (time_after_eq(jiffies, group->sgp->next_update))
+			update_group_power(sd, this_cpu);
 	}
 
 	/* Adjust by relative CPU power of the group */
@@ -4945,8 +4955,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
 
 static DEFINE_SPINLOCK(balancing);
 
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
 /*
  * Scale the max load_balance interval with the number of CPUs in the system.
  * This trades load-balance latency on larger machines for less cross talk.
-- 
cgit v1.2.3


From ed387b781ea6e14b78f449aa2ee4f270b60b01ac Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Tue, 31 Jan 2012 11:40:32 +0900
Subject: sched: Move SMP-only variable into the SMP section

This also fixes the following compilation warning on !SMP:

  CC kernel/sched/fair.o
  kernel/sched/fair.c:218:36: warning: 'max_load_balance_interval' defined but not used [-Wunused-variable]

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/4F2754A0.9090306@ct.jp.nec.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8e77a6bd597b..4ab60a24ea49 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -215,8 +215,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 
 const struct sched_class fair_sched_class;
 
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
 /**************************************************************
  * CFS operations on generic schedulable entities:
  */
@@ -3086,6 +3084,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  * Fair scheduling class load-balancing methods:
  */
 
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
 /*
  * pull_task - move a task from a remote runqueue to the local runqueue.
  * Both runqueues must be locked.
-- 
cgit v1.2.3


From 62f6536a630affe3176deb48554d27ee58b65077 Mon Sep 17 00:00:00 2001
From: "Nikunj A. Dadhania" <nikunj@linux.vnet.ibm.com>
Date: Fri, 17 Feb 2012 08:34:30 +0530
Subject: sched: Remove rcu_read_lock/unlock() from select_idle_sibling()

select_idle_sibling() is called from select_task_rq_fair(), which
already has the RCU read lock held.

Signed-off-by: Nikunj A. Dadhania <nikunj@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120217030409.11748.12491.stgit@abhimanyu
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/fair.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4ab60a24ea49..6ce9992926d0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2670,8 +2670,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	/*
 	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
-	rcu_read_lock();
-
 	sd = rcu_dereference(per_cpu(sd_llc, target));
 	for_each_lower_domain(sd) {
 		sg = sd->groups;
@@ -2693,8 +2691,6 @@ next:
 		} while (sg != sd->groups);
 	}
 done:
-	rcu_read_unlock();
-
 	return target;
 }
 
-- 
cgit v1.2.3


From 8e45cb545d98bc58e75b7de89ec8d3e5c8459ee6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 22 Feb 2012 12:47:19 +0100
Subject: sched: Move load-balancing arguments into helper struct

Passing large sets of similar arguments all around the load-balancer
gets tiresom when you want to modify something. Stick them all in a
helper structure and pass the structure around.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: pjt@google.com
Link: http://lkml.kernel.org/n/tip-5slqz0vhsdzewrfk9eza1aon@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/fair.c | 177 +++++++++++++++++++++++++++-------------------------
 1 file changed, 93 insertions(+), 84 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79e9e13c31ab..55b1f117419a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3135,13 +3135,25 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 #define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */
 #define LBF_ABORT	0x10
 
+struct lb_env {
+	struct sched_domain	*sd;
+
+	int			this_cpu;
+	struct rq		*this_rq;
+
+	struct rq		*busiest_rq;
+	struct cfs_rq		*busiest_cfs_rq;
+
+	enum cpu_idle_type	idle;
+	unsigned long		max_load_move;
+	unsigned int		flags;
+};
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
 static
-int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
-		     struct sched_domain *sd, enum cpu_idle_type idle,
-		     int *lb_flags)
+int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
 	int tsk_cache_hot = 0;
 	/*
@@ -3150,13 +3162,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
+	if (!cpumask_test_cpu(env->this_cpu, tsk_cpus_allowed(p))) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 		return 0;
 	}
-	*lb_flags &= ~LBF_ALL_PINNED;
+	env->flags &= ~LBF_ALL_PINNED;
 
-	if (task_running(rq, p)) {
+	if (task_running(env->busiest_rq, p)) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
 		return 0;
 	}
@@ -3167,12 +3179,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) too many balance attempts have failed.
 	 */
 
-	tsk_cache_hot = task_hot(p, rq->clock_task, sd);
+	tsk_cache_hot = task_hot(p, env->busiest_rq->clock_task, env->sd);
 	if (!tsk_cache_hot ||
-		sd->nr_balance_failed > sd->cache_nice_tries) {
+		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
 		if (tsk_cache_hot) {
-			schedstat_inc(sd, lb_hot_gained[idle]);
+			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
 			schedstat_inc(p, se.statistics.nr_forced_migrations);
 		}
 #endif
@@ -3193,31 +3205,27 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
  *
  * Called with both runqueues locked.
  */
-static int
-move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-	      struct sched_domain *sd, enum cpu_idle_type idle)
+static int move_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 	struct cfs_rq *cfs_rq;
-	int pinned = 0;
 
-	for_each_leaf_cfs_rq(busiest, cfs_rq) {
+	for_each_leaf_cfs_rq(env->busiest_rq, cfs_rq) {
 		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
 			if (throttled_lb_pair(task_group(p),
-					      busiest->cpu, this_cpu))
+					      env->busiest_rq->cpu, env->this_cpu))
 				break;
 
-			if (!can_migrate_task(p, busiest, this_cpu,
-						sd, idle, &pinned))
+			if (!can_migrate_task(p, env))
 				continue;
 
-			pull_task(busiest, p, this_rq, this_cpu);
+			pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu);
 			/*
 			 * Right now, this is only the second place pull_task()
 			 * is called, so we can safely collect pull_task()
 			 * stats here rather than inside pull_task().
 			 */
-			schedstat_inc(sd, lb_gained[idle]);
+			schedstat_inc(env->sd, lb_gained[env->idle]);
 			return 1;
 		}
 	}
@@ -3225,31 +3233,26 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return 0;
 }
 
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-	      unsigned long max_load_move, struct sched_domain *sd,
-	      enum cpu_idle_type idle, int *lb_flags,
-	      struct cfs_rq *busiest_cfs_rq)
+static unsigned long balance_tasks(struct lb_env *env)
 {
 	int loops = 0, pulled = 0;
-	long rem_load_move = max_load_move;
+	long rem_load_move = env->max_load_move;
 	struct task_struct *p, *n;
 
-	if (max_load_move == 0)
+	if (env->max_load_move == 0)
 		goto out;
 
-	list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
+	list_for_each_entry_safe(p, n, &env->busiest_cfs_rq->tasks, se.group_node) {
 		if (loops++ > sysctl_sched_nr_migrate) {
-			*lb_flags |= LBF_NEED_BREAK;
+			env->flags |= LBF_NEED_BREAK;
 			break;
 		}
 
 		if ((p->se.load.weight >> 1) > rem_load_move ||
-		    !can_migrate_task(p, busiest, this_cpu, sd, idle,
-				      lb_flags))
+		    !can_migrate_task(p, env))
 			continue;
 
-		pull_task(busiest, p, this_rq, this_cpu);
+		pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu);
 		pulled++;
 		rem_load_move -= p->se.load.weight;
 
@@ -3259,8 +3262,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		 * kernels will stop after the first task is pulled to minimize
 		 * the critical section.
 		 */
-		if (idle == CPU_NEWLY_IDLE) {
-			*lb_flags |= LBF_ABORT;
+		if (env->idle == CPU_NEWLY_IDLE) {
+			env->flags |= LBF_ABORT;
 			break;
 		}
 #endif
@@ -3278,9 +3281,9 @@ out:
 	 * so we can safely collect pull_task() stats here rather than
 	 * inside pull_task().
 	 */
-	schedstat_add(sd, lb_gained[idle], pulled);
+	schedstat_add(env->sd, lb_gained[env->idle], pulled);
 
-	return max_load_move - rem_load_move;
+	return env->max_load_move - rem_load_move;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3363,40 +3366,39 @@ static void update_h_load(long cpu)
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		  unsigned long max_load_move,
-		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *lb_flags)
+static unsigned long load_balance_fair(struct lb_env *env)
 {
-	long rem_load_move = max_load_move;
-	struct cfs_rq *busiest_cfs_rq;
+	unsigned long max_load_move = env->max_load_move;
+	long rem_load_move = env->max_load_move;
 
 	rcu_read_lock();
-	update_h_load(cpu_of(busiest));
+	update_h_load(cpu_of(env->busiest_rq));
 
-	for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
-		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
-		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
+	for_each_leaf_cfs_rq(env->busiest_rq, env->busiest_cfs_rq) {
+		unsigned long busiest_h_load = env->busiest_cfs_rq->h_load;
+		unsigned long busiest_weight = env->busiest_cfs_rq->load.weight;
 		u64 rem_load, moved_load;
 
-		if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
+		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
 			break;
 
 		/*
 		 * empty group or part of a throttled hierarchy
 		 */
-		if (!busiest_cfs_rq->task_weight ||
-		    throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
+		if (!env->busiest_cfs_rq->task_weight)
+			continue;
+
+		if (throttled_lb_pair(env->busiest_cfs_rq->tg,
+				      cpu_of(env->busiest_rq),
+				      env->this_cpu))
 			continue;
 
 		rem_load = (u64)rem_load_move * busiest_weight;
 		rem_load = div_u64(rem_load, busiest_h_load + 1);
 
-		moved_load = balance_tasks(this_rq, this_cpu, busiest,
-				rem_load, sd, idle, lb_flags,
-				busiest_cfs_rq);
+		env->max_load_move = rem_load;
 
+		moved_load = balance_tasks(env);
 		if (!moved_load)
 			continue;
 
@@ -3416,15 +3418,10 @@ static inline void update_shares(int cpu)
 {
 }
 
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		  unsigned long max_load_move,
-		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *lb_flags)
+static unsigned long load_balance_fair(struct lb_env *env)
 {
-	return balance_tasks(this_rq, this_cpu, busiest,
-			max_load_move, sd, idle, lb_flags,
-			&busiest->cfs);
+	env->busiest_cfs_rq = &env->busiest_rq->cfs;
+	return balance_tasks(env);
 }
 #endif
 
@@ -3435,21 +3432,17 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
  *
  * Called with both runqueues locked.
  */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		      unsigned long max_load_move,
-		      struct sched_domain *sd, enum cpu_idle_type idle,
-		      int *lb_flags)
+static int move_tasks(struct lb_env *env)
 {
+	unsigned long max_load_move = env->max_load_move;
 	unsigned long total_load_moved = 0, load_moved;
 
 	do {
-		load_moved = load_balance_fair(this_rq, this_cpu, busiest,
-				max_load_move - total_load_moved,
-				sd, idle, lb_flags);
-
+		env->max_load_move = max_load_move - total_load_moved;
+		load_moved = load_balance_fair(env);
 		total_load_moved += load_moved;
 
-		if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
+		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
 			break;
 
 #ifdef CONFIG_PREEMPT
@@ -3458,8 +3451,8 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		 * kernels will stop after the first task is pulled to minimize
 		 * the critical section.
 		 */
-		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
-			*lb_flags |= LBF_ABORT;
+		if (env->idle == CPU_NEWLY_IDLE && env->this_rq->nr_running) {
+			env->flags |= LBF_ABORT;
 			break;
 		}
 #endif
@@ -4459,13 +4452,20 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *balance)
 {
-	int ld_moved, lb_flags = 0, active_balance = 0;
+	int ld_moved, active_balance = 0;
 	struct sched_group *group;
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
+	struct lb_env env = {
+		.sd		= sd,
+		.this_cpu	= this_cpu,
+		.this_rq	= this_rq,
+		.idle		= idle,
+	};
+
 	cpumask_copy(cpus, cpu_active_mask);
 
 	schedstat_inc(sd, lb_count[idle]);
@@ -4500,11 +4500,13 @@ redo:
 		 * still unbalanced. ld_moved simply stays zero, so it is
 		 * correctly treated as an imbalance.
 		 */
-		lb_flags |= LBF_ALL_PINNED;
+		env.flags |= LBF_ALL_PINNED;
+		env.max_load_move = imbalance;
+		env.busiest_rq = busiest;
+
 		local_irq_save(flags);
 		double_rq_lock(this_rq, busiest);
-		ld_moved = move_tasks(this_rq, this_cpu, busiest,
-				      imbalance, sd, idle, &lb_flags);
+		ld_moved = move_tasks(&env);
 		double_rq_unlock(this_rq, busiest);
 		local_irq_restore(flags);
 
@@ -4514,18 +4516,18 @@ redo:
 		if (ld_moved && this_cpu != smp_processor_id())
 			resched_cpu(this_cpu);
 
-		if (lb_flags & LBF_ABORT)
+		if (env.flags & LBF_ABORT)
 			goto out_balanced;
 
-		if (lb_flags & LBF_NEED_BREAK) {
-			lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
-			if (lb_flags & LBF_ABORT)
+		if (env.flags & LBF_NEED_BREAK) {
+			env.flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
+			if (env.flags & LBF_ABORT)
 				goto out_balanced;
 			goto redo;
 		}
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
-		if (unlikely(lb_flags & LBF_ALL_PINNED)) {
+		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
 			if (!cpumask_empty(cpus))
 				goto redo;
@@ -4555,7 +4557,7 @@ redo:
 					tsk_cpus_allowed(busiest->curr))) {
 				raw_spin_unlock_irqrestore(&busiest->lock,
 							    flags);
-				lb_flags |= LBF_ALL_PINNED;
+				env.flags |= LBF_ALL_PINNED;
 				goto out_one_pinned;
 			}
 
@@ -4608,7 +4610,7 @@ out_balanced:
 
 out_one_pinned:
 	/* tune up the balancing interval */
-	if (((lb_flags & LBF_ALL_PINNED) &&
+	if (((env.flags & LBF_ALL_PINNED) &&
 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
 			(sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
@@ -4718,10 +4720,17 @@ static int active_load_balance_cpu_stop(void *data)
 	}
 
 	if (likely(sd)) {
+		struct lb_env env = {
+			.sd		= sd,
+			.this_cpu	= target_cpu,
+			.this_rq	= target_rq,
+			.busiest_rq	= busiest_rq,
+			.idle		= CPU_IDLE,
+		};
+
 		schedstat_inc(sd, alb_count);
 
-		if (move_one_task(target_rq, target_cpu, busiest_rq,
-				  sd, CPU_IDLE))
+		if (move_one_task(&env))
 			schedstat_inc(sd, alb_pushed);
 		else
 			schedstat_inc(sd, alb_failed);
-- 
cgit v1.2.3


From ddcdf6e7d9919d139031fa2a6addd9544a9a833e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 22 Feb 2012 19:27:40 +0100
Subject: sched: Rename load-balancing fields

 s/env->this_/env->dst_/g
 s/env->busiest_/env->src_/g
 s/pull_task/move_task/g

Makes everything clearer.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: pjt@google.com
Link: http://lkml.kernel.org/n/tip-0yvgms8t8x962drpvl0fu0kk@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/fair.c | 118 ++++++++++++++++++++++++++--------------------------
 1 file changed, 60 insertions(+), 58 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 55b1f117419a..233d05171bf5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2918,7 +2918,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/*
-	 * This is possible from callers such as pull_task(), in which we
+	 * This is possible from callers such as move_task(), in which we
 	 * unconditionally check_prempt_curr() after an enqueue (which may have
 	 * lead to a throttle).  This both saves work and prevents false
 	 * next-buddy nomination below.
@@ -3084,17 +3084,37 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
+#define LBF_ALL_PINNED	0x01
+#define LBF_NEED_BREAK	0x02	/* clears into HAD_BREAK */
+#define LBF_HAD_BREAK	0x04
+#define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */
+#define LBF_ABORT	0x10
+
+struct lb_env {
+	struct sched_domain	*sd;
+
+	int			src_cpu;
+	struct rq		*src_rq;
+	struct cfs_rq		*src_cfs_rq;
+
+	int			dst_cpu;
+	struct rq		*dst_rq;
+
+	enum cpu_idle_type	idle;
+	unsigned long		max_load_move;
+	unsigned int		flags;
+};
+
 /*
- * pull_task - move a task from a remote runqueue to the local runqueue.
+ * move_task - move a task from one runqueue to another runqueue.
  * Both runqueues must be locked.
  */
-static void pull_task(struct rq *src_rq, struct task_struct *p,
-		      struct rq *this_rq, int this_cpu)
+static void move_task(struct task_struct *p, struct lb_env *env)
 {
-	deactivate_task(src_rq, p, 0);
-	set_task_cpu(p, this_cpu);
-	activate_task(this_rq, p, 0);
-	check_preempt_curr(this_rq, p, 0);
+	deactivate_task(env->src_rq, p, 0);
+	set_task_cpu(p, env->dst_cpu);
+	activate_task(env->dst_rq, p, 0);
+	check_preempt_curr(env->dst_rq, p, 0);
 }
 
 /*
@@ -3129,26 +3149,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 	return delta < (s64)sysctl_sched_migration_cost;
 }
 
-#define LBF_ALL_PINNED	0x01
-#define LBF_NEED_BREAK	0x02	/* clears into HAD_BREAK */
-#define LBF_HAD_BREAK	0x04
-#define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */
-#define LBF_ABORT	0x10
-
-struct lb_env {
-	struct sched_domain	*sd;
-
-	int			this_cpu;
-	struct rq		*this_rq;
-
-	struct rq		*busiest_rq;
-	struct cfs_rq		*busiest_cfs_rq;
-
-	enum cpu_idle_type	idle;
-	unsigned long		max_load_move;
-	unsigned int		flags;
-};
-
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -3162,13 +3162,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (!cpumask_test_cpu(env->this_cpu, tsk_cpus_allowed(p))) {
+	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 		return 0;
 	}
 	env->flags &= ~LBF_ALL_PINNED;
 
-	if (task_running(env->busiest_rq, p)) {
+	if (task_running(env->src_rq, p)) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
 		return 0;
 	}
@@ -3179,7 +3179,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 2) too many balance attempts have failed.
 	 */
 
-	tsk_cache_hot = task_hot(p, env->busiest_rq->clock_task, env->sd);
+	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
 	if (!tsk_cache_hot ||
 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
@@ -3210,20 +3210,20 @@ static int move_one_task(struct lb_env *env)
 	struct task_struct *p, *n;
 	struct cfs_rq *cfs_rq;
 
-	for_each_leaf_cfs_rq(env->busiest_rq, cfs_rq) {
+	for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
 		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
 			if (throttled_lb_pair(task_group(p),
-					      env->busiest_rq->cpu, env->this_cpu))
+					      env->src_cpu, env->dst_cpu))
 				break;
 
 			if (!can_migrate_task(p, env))
 				continue;
 
-			pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu);
+			move_task(p, env);
 			/*
-			 * Right now, this is only the second place pull_task()
-			 * is called, so we can safely collect pull_task()
-			 * stats here rather than inside pull_task().
+			 * Right now, this is only the second place move_task()
+			 * is called, so we can safely collect move_task()
+			 * stats here rather than inside move_task().
 			 */
 			schedstat_inc(env->sd, lb_gained[env->idle]);
 			return 1;
@@ -3242,7 +3242,7 @@ static unsigned long balance_tasks(struct lb_env *env)
 	if (env->max_load_move == 0)
 		goto out;
 
-	list_for_each_entry_safe(p, n, &env->busiest_cfs_rq->tasks, se.group_node) {
+	list_for_each_entry_safe(p, n, &env->src_cfs_rq->tasks, se.group_node) {
 		if (loops++ > sysctl_sched_nr_migrate) {
 			env->flags |= LBF_NEED_BREAK;
 			break;
@@ -3252,7 +3252,7 @@ static unsigned long balance_tasks(struct lb_env *env)
 		    !can_migrate_task(p, env))
 			continue;
 
-		pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu);
+		move_task(p, env);
 		pulled++;
 		rem_load_move -= p->se.load.weight;
 
@@ -3277,9 +3277,9 @@ static unsigned long balance_tasks(struct lb_env *env)
 	}
 out:
 	/*
-	 * Right now, this is one of only two places pull_task() is called,
-	 * so we can safely collect pull_task() stats here rather than
-	 * inside pull_task().
+	 * Right now, this is one of only two places move_task() is called,
+	 * so we can safely collect move_task() stats here rather than
+	 * inside move_task().
 	 */
 	schedstat_add(env->sd, lb_gained[env->idle], pulled);
 
@@ -3372,11 +3372,11 @@ static unsigned long load_balance_fair(struct lb_env *env)
 	long rem_load_move = env->max_load_move;
 
 	rcu_read_lock();
-	update_h_load(cpu_of(env->busiest_rq));
+	update_h_load(cpu_of(env->src_rq));
 
-	for_each_leaf_cfs_rq(env->busiest_rq, env->busiest_cfs_rq) {
-		unsigned long busiest_h_load = env->busiest_cfs_rq->h_load;
-		unsigned long busiest_weight = env->busiest_cfs_rq->load.weight;
+	for_each_leaf_cfs_rq(env->src_rq, env->src_cfs_rq) {
+		unsigned long busiest_h_load = env->src_cfs_rq->h_load;
+		unsigned long busiest_weight = env->src_cfs_rq->load.weight;
 		u64 rem_load, moved_load;
 
 		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
@@ -3385,12 +3385,12 @@ static unsigned long load_balance_fair(struct lb_env *env)
 		/*
 		 * empty group or part of a throttled hierarchy
 		 */
-		if (!env->busiest_cfs_rq->task_weight)
+		if (!env->src_cfs_rq->task_weight)
 			continue;
 
-		if (throttled_lb_pair(env->busiest_cfs_rq->tg,
-				      cpu_of(env->busiest_rq),
-				      env->this_cpu))
+		if (throttled_lb_pair(env->src_cfs_rq->tg,
+				      cpu_of(env->src_rq),
+				      env->dst_cpu))
 			continue;
 
 		rem_load = (u64)rem_load_move * busiest_weight;
@@ -3420,7 +3420,7 @@ static inline void update_shares(int cpu)
 
 static unsigned long load_balance_fair(struct lb_env *env)
 {
-	env->busiest_cfs_rq = &env->busiest_rq->cfs;
+	env->src_cfs_rq = &env->src_rq->cfs;
 	return balance_tasks(env);
 }
 #endif
@@ -3451,7 +3451,7 @@ static int move_tasks(struct lb_env *env)
 		 * kernels will stop after the first task is pulled to minimize
 		 * the critical section.
 		 */
-		if (env->idle == CPU_NEWLY_IDLE && env->this_rq->nr_running) {
+		if (env->idle == CPU_NEWLY_IDLE && env->dst_rq->nr_running) {
 			env->flags |= LBF_ABORT;
 			break;
 		}
@@ -4461,8 +4461,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	struct lb_env env = {
 		.sd		= sd,
-		.this_cpu	= this_cpu,
-		.this_rq	= this_rq,
+		.dst_cpu	= this_cpu,
+		.dst_rq		= this_rq,
 		.idle		= idle,
 	};
 
@@ -4502,7 +4502,8 @@ redo:
 		 */
 		env.flags |= LBF_ALL_PINNED;
 		env.max_load_move = imbalance;
-		env.busiest_rq = busiest;
+		env.src_cpu = busiest->cpu;
+		env.src_rq = busiest;
 
 		local_irq_save(flags);
 		double_rq_lock(this_rq, busiest);
@@ -4722,9 +4723,10 @@ static int active_load_balance_cpu_stop(void *data)
 	if (likely(sd)) {
 		struct lb_env env = {
 			.sd		= sd,
-			.this_cpu	= target_cpu,
-			.this_rq	= target_rq,
-			.busiest_rq	= busiest_rq,
+			.dst_cpu	= target_cpu,
+			.dst_rq		= target_rq,
+			.src_cpu	= busiest_rq->cpu,
+			.src_rq		= busiest_rq,
 			.idle		= CPU_IDLE,
 		};
 
-- 
cgit v1.2.3


From 367456c756a6b84f493ca9cc5b17b1f5d38ef466 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 20 Feb 2012 21:49:09 +0100
Subject: sched: Ditch per cgroup task lists for load-balancing

Per cgroup load-balance has numerous problems, chief amongst them that
there is no real sane order in them. So stop pretending it makes sense
and enqueue all tasks on a single list.

This also allows us to more easily fix the fwd progress issue
uncovered by the lock-break stuff. Rotate the list on failure to
migreate and limit the total iterations to nr_running (which with
releasing the lock isn't strictly accurate but close enough).

Also add a filter that skips very light tasks on the first attempt
around the list, this attempts to avoid shooting whole cgroups around
without affecting over balance.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: pjt@google.com
Link: http://lkml.kernel.org/n/tip-tx8yqydc7eimgq7i4rkc3a4g@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/core.c  |   3 +
 kernel/sched/fair.c  | 176 ++++++++++++++++++++++-----------------------------
 kernel/sched/sched.h |  10 +--
 3 files changed, 80 insertions(+), 109 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 25e06a5e048b..95545126be1c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6959,6 +6959,9 @@ void __init sched_init(void)
 		rq->online = 0;
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
+
+		INIT_LIST_HEAD(&rq->cfs_tasks);
+
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
 		rq->nohz_flags = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 233d05171bf5..a0424fc4cc54 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-	cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
-	if (entity_is_task(se)) {
-		add_cfs_task_weight(cfs_rq, se->load.weight);
-		list_add(&se->group_node, &cfs_rq->tasks);
-	}
+#ifdef CONFIG_SMP
+	if (entity_is_task(se))
+		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+#endif
 	cfs_rq->nr_running++;
 }
 
@@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-	if (entity_is_task(se)) {
-		add_cfs_task_weight(cfs_rq, -se->load.weight);
+	if (entity_is_task(se))
 		list_del_init(&se->group_node);
-	}
 	cfs_rq->nr_running--;
 }
 
@@ -3085,17 +3070,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED	0x01
-#define LBF_NEED_BREAK	0x02	/* clears into HAD_BREAK */
-#define LBF_HAD_BREAK	0x04
-#define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */
-#define LBF_ABORT	0x10
+#define LBF_NEED_BREAK	0x02
+#define LBF_ABORT	0x04
 
 struct lb_env {
 	struct sched_domain	*sd;
 
 	int			src_cpu;
 	struct rq		*src_rq;
-	struct cfs_rq		*src_cfs_rq;
 
 	int			dst_cpu;
 	struct rq		*dst_rq;
@@ -3103,6 +3085,10 @@ struct lb_env {
 	enum cpu_idle_type	idle;
 	unsigned long		max_load_move;
 	unsigned int		flags;
+
+	unsigned int		loop;
+	unsigned int		loop_break;
+	unsigned int		loop_max;
 };
 
 /*
@@ -3208,53 +3194,69 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 static int move_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
-	struct cfs_rq *cfs_rq;
 
-	for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
-		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
-			if (throttled_lb_pair(task_group(p),
-					      env->src_cpu, env->dst_cpu))
-				break;
+	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
+			continue;
 
-			if (!can_migrate_task(p, env))
-				continue;
+		if (!can_migrate_task(p, env))
+			continue;
 
-			move_task(p, env);
-			/*
-			 * Right now, this is only the second place move_task()
-			 * is called, so we can safely collect move_task()
-			 * stats here rather than inside move_task().
-			 */
-			schedstat_inc(env->sd, lb_gained[env->idle]);
-			return 1;
-		}
+		move_task(p, env);
+		/*
+		 * Right now, this is only the second place move_task()
+		 * is called, so we can safely collect move_task()
+		 * stats here rather than inside move_task().
+		 */
+		schedstat_inc(env->sd, lb_gained[env->idle]);
+		return 1;
 	}
-
 	return 0;
 }
 
+static unsigned long task_h_load(struct task_struct *p);
+
 static unsigned long balance_tasks(struct lb_env *env)
 {
-	int loops = 0, pulled = 0;
 	long rem_load_move = env->max_load_move;
 	struct task_struct *p, *n;
+	unsigned long load;
+	int pulled = 0;
 
 	if (env->max_load_move == 0)
 		goto out;
 
-	list_for_each_entry_safe(p, n, &env->src_cfs_rq->tasks, se.group_node) {
-		if (loops++ > sysctl_sched_nr_migrate) {
+	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+		env->loop++;
+		/* We've more or less seen every task there is, call it quits */
+		if (env->loop > env->loop_max) {
+			env->flags |= LBF_ABORT;
+			break;
+		}
+		/* take a beather every nr_migrate tasks */
+		if (env->loop > env->loop_break) {
+			env->loop_break += sysctl_sched_nr_migrate;
 			env->flags |= LBF_NEED_BREAK;
 			break;
 		}
 
-		if ((p->se.load.weight >> 1) > rem_load_move ||
-		    !can_migrate_task(p, env))
-			continue;
+		if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+					env->dst_cpu))
+			goto next;
+
+		load = task_h_load(p);
+		if (load < 16 && !env->sd->nr_balance_failed)
+			goto next;
+
+		if ((load * 2) > rem_load_move)
+			goto next;
+
+		if (!can_migrate_task(p, env))
+			goto next;
 
 		move_task(p, env);
 		pulled++;
-		rem_load_move -= p->se.load.weight;
+		rem_load_move -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
@@ -3274,6 +3276,10 @@ static unsigned long balance_tasks(struct lb_env *env)
 		 */
 		if (rem_load_move <= 0)
 			break;
+
+		continue;
+next:
+		list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks);
 	}
 out:
 	/*
@@ -3363,65 +3369,33 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_h_load(long cpu)
 {
+	rcu_read_lock();
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
+	rcu_read_unlock();
 }
 
-static unsigned long load_balance_fair(struct lb_env *env)
+static unsigned long task_h_load(struct task_struct *p)
 {
-	unsigned long max_load_move = env->max_load_move;
-	long rem_load_move = env->max_load_move;
-
-	rcu_read_lock();
-	update_h_load(cpu_of(env->src_rq));
-
-	for_each_leaf_cfs_rq(env->src_rq, env->src_cfs_rq) {
-		unsigned long busiest_h_load = env->src_cfs_rq->h_load;
-		unsigned long busiest_weight = env->src_cfs_rq->load.weight;
-		u64 rem_load, moved_load;
-
-		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
-			break;
-
-		/*
-		 * empty group or part of a throttled hierarchy
-		 */
-		if (!env->src_cfs_rq->task_weight)
-			continue;
-
-		if (throttled_lb_pair(env->src_cfs_rq->tg,
-				      cpu_of(env->src_rq),
-				      env->dst_cpu))
-			continue;
-
-		rem_load = (u64)rem_load_move * busiest_weight;
-		rem_load = div_u64(rem_load, busiest_h_load + 1);
-
-		env->max_load_move = rem_load;
-
-		moved_load = balance_tasks(env);
-		if (!moved_load)
-			continue;
-
-		moved_load *= busiest_h_load;
-		moved_load = div_u64(moved_load, busiest_weight + 1);
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	unsigned long load;
 
-		rem_load_move -= moved_load;
-		if (rem_load_move < 0)
-			break;
-	}
-	rcu_read_unlock();
+	load = p->se.load.weight;
+	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
 
-	return max_load_move - rem_load_move;
+	return load;
 }
 #else
 static inline void update_shares(int cpu)
 {
 }
 
-static unsigned long load_balance_fair(struct lb_env *env)
+static inline void update_h_load(long cpu)
 {
-	env->src_cfs_rq = &env->src_rq->cfs;
-	return balance_tasks(env);
+}
+
+static unsigned long task_h_load(struct task_struct *p)
+{
+	return p->se.load.weight;
 }
 #endif
 
@@ -3437,9 +3411,10 @@ static int move_tasks(struct lb_env *env)
 	unsigned long max_load_move = env->max_load_move;
 	unsigned long total_load_moved = 0, load_moved;
 
+	update_h_load(cpu_of(env->src_rq));
 	do {
 		env->max_load_move = max_load_move - total_load_moved;
-		load_moved = load_balance_fair(env);
+		load_moved = balance_tasks(env);
 		total_load_moved += load_moved;
 
 		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
@@ -4464,6 +4439,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.dst_cpu	= this_cpu,
 		.dst_rq		= this_rq,
 		.idle		= idle,
+		.loop_break	= sysctl_sched_nr_migrate,
 	};
 
 	cpumask_copy(cpus, cpu_active_mask);
@@ -4504,6 +4480,7 @@ redo:
 		env.max_load_move = imbalance;
 		env.src_cpu = busiest->cpu;
 		env.src_rq = busiest;
+		env.loop_max = busiest->nr_running;
 
 		local_irq_save(flags);
 		double_rq_lock(this_rq, busiest);
@@ -4521,9 +4498,7 @@ redo:
 			goto out_balanced;
 
 		if (env.flags & LBF_NEED_BREAK) {
-			env.flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
-			if (env.flags & LBF_ABORT)
-				goto out_balanced;
+			env.flags &= ~LBF_NEED_BREAK;
 			goto redo;
 		}
 
@@ -5357,7 +5332,6 @@ static void set_curr_task_fair(struct rq *rq)
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT;
-	INIT_LIST_HEAD(&cfs_rq->tasks);
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c0660a1a0cd1..753bdd567416 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -212,9 +212,6 @@ struct cfs_rq {
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
 
-	struct list_head tasks;
-	struct list_head *balance_iterator;
-
 	/*
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
@@ -241,11 +238,6 @@ struct cfs_rq {
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
 #ifdef CONFIG_SMP
-	/*
-	 * the part of load.weight contributed by tasks
-	 */
-	unsigned long task_weight;
-
 	/*
 	 *   h_load = weight * f(tg)
 	 *
@@ -420,6 +412,8 @@ struct rq {
 	int cpu;
 	int online;
 
+	struct list_head cfs_tasks;
+
 	u64 rt_avg;
 	u64 age_stamp;
 	u64 idle_stamp;
-- 
cgit v1.2.3


From 5d6523ebd2f67de9d23285aad7f3910e7b0aee83 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 10 Mar 2012 00:07:36 +0100
Subject: sched: Fix load-balance wreckage

Commit 367456c ("sched: Ditch per cgroup task lists for
load-balancing") completely wrecked load-balancing due to
a few silly mistakes.

Correct those and remove more pointless code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-zk04ihygwxn7qqrlpaf73b0r@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/fair.c | 110 +++++++++++++++++++---------------------------------
 1 file changed, 39 insertions(+), 71 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a0424fc4cc54..def17aa302d5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
 	if (entity_is_task(se))
-		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+		list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
 #endif
 	cfs_rq->nr_running++;
 }
@@ -3071,7 +3071,6 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
-#define LBF_ABORT	0x04
 
 struct lb_env {
 	struct sched_domain	*sd;
@@ -3083,7 +3082,7 @@ struct lb_env {
 	struct rq		*dst_rq;
 
 	enum cpu_idle_type	idle;
-	unsigned long		max_load_move;
+	long			load_move;
 	unsigned int		flags;
 
 	unsigned int		loop;
@@ -3216,39 +3215,47 @@ static int move_one_task(struct lb_env *env)
 
 static unsigned long task_h_load(struct task_struct *p);
 
-static unsigned long balance_tasks(struct lb_env *env)
+/*
+ * move_tasks tries to move up to load_move weighted load from busiest to
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct lb_env *env)
 {
-	long rem_load_move = env->max_load_move;
-	struct task_struct *p, *n;
+	struct list_head *tasks = &env->src_rq->cfs_tasks;
+	struct task_struct *p;
 	unsigned long load;
 	int pulled = 0;
 
-	if (env->max_load_move == 0)
-		goto out;
+	if (env->load_move <= 0)
+		return 0;
+
+	while (!list_empty(tasks)) {
+		p = list_first_entry(tasks, struct task_struct, se.group_node);
 
-	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
 		env->loop++;
 		/* We've more or less seen every task there is, call it quits */
-		if (env->loop > env->loop_max) {
-			env->flags |= LBF_ABORT;
+		if (env->loop > env->loop_max)
 			break;
-		}
-		/* take a beather every nr_migrate tasks */
+
+		/* take a breather every nr_migrate tasks */
 		if (env->loop > env->loop_break) {
 			env->loop_break += sysctl_sched_nr_migrate;
 			env->flags |= LBF_NEED_BREAK;
 			break;
 		}
 
-		if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
-					env->dst_cpu))
+		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 			goto next;
 
 		load = task_h_load(p);
+
 		if (load < 16 && !env->sd->nr_balance_failed)
 			goto next;
 
-		if ((load * 2) > rem_load_move)
+		if ((load / 2) > env->load_move)
 			goto next;
 
 		if (!can_migrate_task(p, env))
@@ -3256,7 +3263,7 @@ static unsigned long balance_tasks(struct lb_env *env)
 
 		move_task(p, env);
 		pulled++;
-		rem_load_move -= load;
+		env->load_move -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
@@ -3264,24 +3271,22 @@ static unsigned long balance_tasks(struct lb_env *env)
 		 * kernels will stop after the first task is pulled to minimize
 		 * the critical section.
 		 */
-		if (env->idle == CPU_NEWLY_IDLE) {
-			env->flags |= LBF_ABORT;
+		if (env->idle == CPU_NEWLY_IDLE)
 			break;
-		}
 #endif
 
 		/*
 		 * We only want to steal up to the prescribed amount of
 		 * weighted load.
 		 */
-		if (rem_load_move <= 0)
+		if (env->load_move <= 0)
 			break;
 
 		continue;
 next:
-		list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks);
+		list_move_tail(&p->se.group_node, tasks);
 	}
-out:
+
 	/*
 	 * Right now, this is one of only two places move_task() is called,
 	 * so we can safely collect move_task() stats here rather than
@@ -3289,7 +3294,7 @@ out:
 	 */
 	schedstat_add(env->sd, lb_gained[env->idle], pulled);
 
-	return env->max_load_move - rem_load_move;
+	return pulled;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3399,43 +3404,6 @@ static unsigned long task_h_load(struct task_struct *p)
 }
 #endif
 
-/*
- * move_tasks tries to move up to max_load_move weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct lb_env *env)
-{
-	unsigned long max_load_move = env->max_load_move;
-	unsigned long total_load_moved = 0, load_moved;
-
-	update_h_load(cpu_of(env->src_rq));
-	do {
-		env->max_load_move = max_load_move - total_load_moved;
-		load_moved = balance_tasks(env);
-		total_load_moved += load_moved;
-
-		if (env->flags & (LBF_NEED_BREAK|LBF_ABORT))
-			break;
-
-#ifdef CONFIG_PREEMPT
-		/*
-		 * NEWIDLE balancing is a source of latency, so preemptible
-		 * kernels will stop after the first task is pulled to minimize
-		 * the critical section.
-		 */
-		if (env->idle == CPU_NEWLY_IDLE && env->dst_rq->nr_running) {
-			env->flags |= LBF_ABORT;
-			break;
-		}
-#endif
-	} while (load_moved && max_load_move > total_load_moved);
-
-	return total_load_moved > 0;
-}
-
 /********** Helpers for find_busiest_group ************************/
 /*
  * sd_lb_stats - Structure to store the statistics of a sched_domain
@@ -4477,31 +4445,31 @@ redo:
 		 * correctly treated as an imbalance.
 		 */
 		env.flags |= LBF_ALL_PINNED;
-		env.max_load_move = imbalance;
+		env.load_move = imbalance;
 		env.src_cpu = busiest->cpu;
 		env.src_rq = busiest;
 		env.loop_max = busiest->nr_running;
 
+more_balance:
 		local_irq_save(flags);
 		double_rq_lock(this_rq, busiest);
-		ld_moved = move_tasks(&env);
+		if (!env.loop)
+			update_h_load(env.src_cpu);
+		ld_moved += move_tasks(&env);
 		double_rq_unlock(this_rq, busiest);
 		local_irq_restore(flags);
 
+		if (env.flags & LBF_NEED_BREAK) {
+			env.flags &= ~LBF_NEED_BREAK;
+			goto more_balance;
+		}
+
 		/*
 		 * some other cpu did the load balance for us.
 		 */
 		if (ld_moved && this_cpu != smp_processor_id())
 			resched_cpu(this_cpu);
 
-		if (env.flags & LBF_ABORT)
-			goto out_balanced;
-
-		if (env.flags & LBF_NEED_BREAK) {
-			env.flags &= ~LBF_NEED_BREAK;
-			goto redo;
-		}
-
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
-- 
cgit v1.2.3


From 554cecaf733623b327eef9652b65965eb1081b81 Mon Sep 17 00:00:00 2001
From: Diwakar Tundlam <dtundlam@nvidia.com>
Date: Wed, 7 Mar 2012 14:44:26 -0800
Subject: sched/nohz: Correctly initialize 'next_balance' in 'nohz' idle
 balancer

The 'next_balance' field of 'nohz' idle balancer must be initialized
to jiffies. Since jiffies is initialized to negative 300 seconds the
'nohz' idle balancer does not run for the first 300s (5mins) after
bootup. If no new processes are spawed or no idle cycles happen, the
load on the cpus will remain unbalanced for that duration.

Signed-off-by: Diwakar Tundlam <dtundlam@nvidia.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1DD7BFEDD3147247B1355BEFEFE4665237994F30EF@HQMAIL04.nvidia.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched/fair.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index def17aa302d5..11f3979bad2a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5571,6 +5571,7 @@ __init void init_sched_fair_class(void)
 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 
 #ifdef CONFIG_NO_HZ
+	nohz.next_balance = jiffies;
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 	cpu_notifier(sched_ilb_notifier, 0);
 #endif
-- 
cgit v1.2.3