summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2008-03-14 21:12:12 +0100
committer: Ingo Molnar <mingo@elte.hu> 2008-03-15 03:02:50 +0100
commit: aa2ac25229cd4d0280f6174c42712744ad61b140 (patch)
tree: e4450de1bb2cd4cd56d6abf64feb862c1d542653 /kernel
parent: 27d117266097101dcf79c4576903cdcdd0eabffc (diff)
download: lwn-aa2ac25229cd4d0280f6174c42712744ad61b140.tar.gz
download: lwn-aa2ac25229cd4d0280f6174c42712744ad61b140.zip
sched: fix overload performance: buddy wakeups
Currently we schedule to the leftmost task in the runqueue. When the runtimes are very short because of some server/client ping-pong, especially in over-saturated workloads, this will cycle through all tasks, thrashing the cache. Reduce cache thrashing by keeping dependent tasks together by running newly woken tasks first. However, by not running the leftmost task first we could starve tasks because the wakee can gain unlimited runtime. Therefore we only run the wakee if it's within a small (wakeup_granularity) window of the leftmost task. This preserves fairness, but does alternate server/client task groups. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/sched.c 2
-rw-r--r-- kernel/sched_fair.c 26
2 files changed, 27 insertions, 1 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 6b06f23261c0..d1ad69b270ca 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -301,7 +301,7 @@ struct cfs_rq {
/* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr;
+ struct sched_entity *curr, *next;
unsigned long nr_spread_over;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9d003c9d2a48..31c4a2988b64 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -207,6 +207,9 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
}
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
@@ -626,12 +629,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
+static struct sched_entity *
+pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 diff, gran;
+
+ if (!cfs_rq->next)
+ return se;
+
+ diff = cfs_rq->next->vruntime - se->vruntime;
+ if (diff < 0)
+ return se;
+
+ gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
+ if (diff > gran)
+ return se;
+
+ return cfs_rq->next;
+}
+
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = NULL;
if (first_fair(cfs_rq)) {
se = __pick_next_entity(cfs_rq);
+ se = pick_next(cfs_rq, se);
set_next_entity(cfs_rq, se);
}
@@ -1070,6 +1093,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
resched_task(curr);
return;
}
+
+ cfs_rq_of(pse)->next = pse;
+
/*
* Batch tasks do not preempt (their preemption is driven by
* the tick):