sched: Fix cgroup smp fairness

Commit ec4e0e2fe018992d980910db901637c814575914 ("fix inconsistency when redistribute per-cpu tg->cfs_rq shares") broke cgroup smp fairness. In order to avoid starvation of newly placed tasks, we never quite set the share of an empty cpu group-task to 0, but instead we set it as if there's a single NICE-0 task present. If however we actually set this in cfs_rq[cpu]->shares, that means the total shares for that group will be slightly inflated every time we balance, causing the observed unfairness. Fix this by setting cfs_rq[cpu]->shares to 0 but actually setting the effective weight of the related se to the inflated number. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <1248696557.6987.1615.camel@twins> Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2009-07-27 14:04:49 +0200
committer: Ingo Molnar <mingo@elte.hu> 2009-08-02 14:26:06 +0200
commit: a5004278f0525dcb9aa43703ef77bf371ea837cd (patch)
tree: d4beb94aa2e454a6d23425cac856654e507541a1 /kernel/sched.c
parent: 8e9ed8b02490fea577b1eb1704c05bf43c891ed7 (diff)
download: lwn-a5004278f0525dcb9aa43703ef77bf371ea837cd.tar.gz
lwn-a5004278f0525dcb9aa43703ef77bf371ea837cd.zip
1 files changed, 20 insertions, 8 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index ce1056e9b02a..26976cd8be0f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1523,13 +1523,18 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
 			unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-	unsigned long shares;
 	unsigned long rq_weight;
+	unsigned long shares;
+	int boost = 0;
 
 	if (!tg->se[cpu])
 		return;
 
 	rq_weight = tg->cfs_rq[cpu]->rq_weight;
+	if (!rq_weight) {
+		boost = 1;
+		rq_weight = NICE_0_LOAD;
+	}
 
 	/*
 	 *           \Sum shares * rq_weight
@@ -1546,8 +1551,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		tg->cfs_rq[cpu]->shares = shares;
-
+		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 		__set_se_shares(tg->se[cpu], shares);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
@@ -1560,7 +1564,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long weight, rq_weight = 0;
+	unsigned long weight, rq_weight = 0, eff_weight = 0;
 	unsigned long shares = 0;
 	struct sched_domain *sd = data;
 	int i;
@@ -1572,11 +1576,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
 		 * run here it will not get delayed by group starvation.
 		 */
 		weight = tg->cfs_rq[i]->load.weight;
+		tg->cfs_rq[i]->rq_weight = weight;
+		rq_weight += weight;
+
 		if (!weight)
 			weight = NICE_0_LOAD;
 
-		tg->cfs_rq[i]->rq_weight = weight;
-		rq_weight += weight;
+		eff_weight += weight;
 		shares += tg->cfs_rq[i]->shares;
 	}
 
@@ -1586,8 +1592,14 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
 		shares = tg->shares;
 
-	for_each_cpu(i, sched_domain_span(sd))
-		update_group_shares_cpu(tg, i, shares, rq_weight);
+	for_each_cpu(i, sched_domain_span(sd)) {
+		unsigned long sd_rq_weight = rq_weight;
+
+		if (!tg->cfs_rq[i]->rq_weight)
+			sd_rq_weight = eff_weight;
+
+		update_group_shares_cpu(tg, i, shares, sd_rq_weight);
+	}
 
 	return 0;
 }
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2009-07-27 14:04:49 +0200
committer	Ingo Molnar <mingo@elte.hu>	2009-08-02 14:26:06 +0200
commit	a5004278f0525dcb9aa43703ef77bf371ea837cd (patch)
tree	d4beb94aa2e454a6d23425cac856654e507541a1 /kernel/sched.c
parent	8e9ed8b02490fea577b1eb1704c05bf43c891ed7 (diff)
download	lwn-a5004278f0525dcb9aa43703ef77bf371ea837cd.tar.gz lwn-a5004278f0525dcb9aa43703ef77bf371ea837cd.zip