diff options
author | Christoph Lameter <clameter@sgi.com> | 2006-11-11 12:13:53 +0100 |
---|---|---|
committer | Adrian Bunk <bunk@stusta.de> | 2006-11-11 12:13:53 +0100 |
commit | ef147950bee58ecfa672433badf679608fb90668 (patch) | |
tree | 499fda5911ff48ac9ef93d0fddfe1d20eaa796c9 /kernel | |
parent | f9a198cc962f1347564493d406789b79cab4294b (diff) | |
download | lwn-ef147950bee58ecfa672433badf679608fb90668.tar.gz lwn-ef147950bee58ecfa672433badf679608fb90668.zip |
Fix longstanding load balancing bug in the scheduler
The scheduler will stop load balancing if the most busy processor contains
processes pinned via processor affinity.
The scheduler currently only does one search for busiest cpu. If it cannot
pull any tasks away from the busiest cpu because they were pinned then the
scheduler goes into a corner and sulks leaving the idle processors idle.
F.e. If you have processor 0 busy running four tasks pinned via taskset,
there are none on processor 1 and one just started two processes on
processor 2 then the scheduler will not move one of the two processes away
from processor 2.
This patch fixes that issue by forcing the scheduler to come out of its
corner and retrying the load balancing by considering other processors for
load balancing.
This patch was originally developed by John Hawkes and discussed at
http://marc.theaimsgroup.com/?l=linux-kernel&m=113901368523205&w=2.
I have removed extraneous material and gone back to equipping struct rq
with the cpu the queue is associated with since this makes the patch much
easier and it is likely that others in the future will have the same
difficulty of figuring out which processor owns which runqueue.
The overhead added through these patches is a single word on the stack if
the kernel is configured to support 32 cpus or less (32 bit). For 32 bit
environments the maximum number of cpus that can be configued is 255 which
would result in the use of 32 bytes additional on the stack. On IA64 up to
1k cpus can be configured which will result in the use of 128 additional
bytes on the stack. The maximum additional cache footprint is one
cacheline. Typically memory use will be much less than a cacheline and the
additional cpumask will be placed on the stack in a cacheline that already
contains other local variable.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/sched.c | 38 |
1 files changed, 31 insertions, 7 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 4e7efac7b1ec..d1707c5f2880 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1924,7 +1924,8 @@ out: */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle, int *sd_idle) + unsigned long *imbalance, enum idle_type idle, int *sd_idle, + cpumask_t *cpus) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -1950,6 +1951,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load = 0; for_each_cpu_mask(i, group->cpumask) { + if (!cpu_isset(i, *cpus)) + continue; + if (*sd_idle && !idle_cpu(i)) *sd_idle = 0; @@ -2063,13 +2067,16 @@ out_balanced: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static runqueue_t *find_busiest_queue(struct sched_group *group, - enum idle_type idle) + enum idle_type idle, cpumask_t *cpus) { unsigned long load, max_load = 0; runqueue_t *busiest = NULL; int i; for_each_cpu_mask(i, group->cpumask) { + if (!cpu_isset(i, *cpus)) + continue; + load = source_load(i, 0); if (load > max_load) { @@ -2102,19 +2109,22 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, int nr_moved, all_pinned = 0; int active_balance = 0; int sd_idle = 0; + cpumask_t cpus = CPU_MASK_ALL; if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) sd_idle = 1; schedstat_inc(sd, lb_cnt[idle]); - group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); +redo: + group = find_busiest_group(sd, this_cpu, &imbalance, idle, + &sd_idle, &cpus); if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; } - busiest = find_busiest_queue(group, idle); + busiest = find_busiest_queue(group, idle, &cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -2138,8 +2148,12 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, double_rq_unlock(this_rq, busiest); /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(all_pinned)) + if (unlikely(all_pinned)) { + cpu_clear(busiest->cpu, cpus); + if (!cpus_empty(cpus)) + goto redo; goto out_balanced; + } } if (!nr_moved) { @@ -2226,18 +2240,21 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, unsigned long imbalance; int nr_moved = 0; int sd_idle = 0; + cpumask_t cpus = CPU_MASK_ALL; if (sd->flags & SD_SHARE_CPUPOWER) sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); - group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); +redo: + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, + &sd_idle, &cpus); if (!group) { schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, NEWLY_IDLE); + busiest = find_busiest_queue(group, NEWLY_IDLE, &cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); goto out_balanced; @@ -2254,6 +2271,12 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, NEWLY_IDLE, NULL); spin_unlock(&busiest->lock); + + if (!nr_moved) { + cpu_clear(busiest->cpu, cpus); + if (!cpus_empty(cpus)) + goto redo; + } } if (!nr_moved) { @@ -6037,6 +6060,7 @@ void __init sched_init(void) rq->cpu_load[j] = 0; rq->active_balance = 0; rq->push_cpu = 0; + rq->cpu = i; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); rq->cpu = i; |