diff options
author | Paul Menage <menage@google.com> | 2007-10-18 23:40:22 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-19 11:53:41 -0700 |
commit | 8707d8b8c0cbdf4441507f8dded194167da896c7 (patch) | |
tree | 1e9ac6b15027bd55263378e551c1595a937d66d6 /kernel/cpuset.c | |
parent | 020958b6272882c1a8bfbe5f3e0927f3845c2698 (diff) | |
download | lwn-8707d8b8c0cbdf4441507f8dded194167da896c7.tar.gz lwn-8707d8b8c0cbdf4441507f8dded194167da896c7.zip |
Fix cpusets update_cpumask
Cause writes to cpuset "cpus" file to update cpus_allowed for member tasks:
- collect batches of tasks under tasklist_lock and then call
set_cpus_allowed() on them outside the lock (since this can sleep).
- add a simple generic priority heap type to allow efficient collection
of batches of tasks to be processed without duplicating or missing any
tasks in subsequent batches.
- make "cpus" file update a no-op if the mask hasn't changed
- fix race between update_cpumask() and sched_setaffinity() by making
sched_setaffinity() post-check that it's not running on any cpus outside
cpuset_cpus_allowed().
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r-- | kernel/cpuset.c | 105 |
1 files changed, 101 insertions, 4 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64ad59cfad9b..fa31cb9f9898 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -38,6 +38,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/pagemap.h> +#include <linux/prio_heap.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> @@ -701,6 +702,36 @@ done: /* Don't kfree(doms) -- partition_sched_domains() does that. */ } +static inline int started_after_time(struct task_struct *t1, + struct timespec *time, + struct task_struct *t2) +{ + int start_diff = timespec_compare(&t1->start_time, time); + if (start_diff > 0) { + return 1; + } else if (start_diff < 0) { + return 0; + } else { + /* + * Arbitrarily, if two processes started at the same + * time, we'll say that the lower pointer value + * started first. Note that t2 may have exited by now + * so this may not be a valid pointer any longer, but + * that's fine - it still serves to distinguish + * between two tasks started (effectively) + * simultaneously. + */ + return t1 > t2; + } +} + +static inline int started_after(void *p1, void *p2) +{ + struct task_struct *t1 = p1; + struct task_struct *t2 = p2; + return started_after_time(t1, &t2->start_time, t2); +} + /* * Call with manage_mutex held. May take callback_mutex during call. */ @@ -708,8 +739,15 @@ done: static int update_cpumask(struct cpuset *cs, char *buf) { struct cpuset trialcs; - int retval; - int cpus_changed, is_load_balanced; + int retval, i; + int is_load_balanced; + struct cgroup_iter it; + struct cgroup *cgrp = cs->css.cgroup; + struct task_struct *p, *dropped; + /* Never dereference latest_task, since it's not refcounted */ + struct task_struct *latest_task = NULL; + struct ptr_heap heap; + struct timespec latest_time = { 0, 0 }; /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ if (cs == &top_cpuset) @@ -736,14 +774,73 @@ static int update_cpumask(struct cpuset *cs, char *buf) if (retval < 0) return retval; - cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); + /* Nothing to do if the cpus didn't change */ + if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) + return 0; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); + if (retval) + return retval; + is_load_balanced = is_sched_load_balance(&trialcs); mutex_lock(&callback_mutex); cs->cpus_allowed = trialcs.cpus_allowed; mutex_unlock(&callback_mutex); - if (cpus_changed && is_load_balanced) + again: + /* + * Scan tasks in the cpuset, and update the cpumasks of any + * that need an update. Since we can't call set_cpus_allowed() + * while holding tasklist_lock, gather tasks to be processed + * in a heap structure. If the statically-sized heap fills up, + * overflow tasks that started later, and in future iterations + * only consider tasks that started after the latest task in + * the previous pass. This guarantees forward progress and + * that we don't miss any tasks + */ + heap.size = 0; + cgroup_iter_start(cgrp, &it); + while ((p = cgroup_iter_next(cgrp, &it))) { + /* Only affect tasks that don't have the right cpus_allowed */ + if (cpus_equal(p->cpus_allowed, cs->cpus_allowed)) + continue; + /* + * Only process tasks that started after the last task + * we processed + */ + if (!started_after_time(p, &latest_time, latest_task)) + continue; + dropped = heap_insert(&heap, p); + if (dropped == NULL) { + get_task_struct(p); + } else if (dropped != p) { + get_task_struct(p); + put_task_struct(dropped); + } + } + cgroup_iter_end(cgrp, &it); + if (heap.size) { + for (i = 0; i < heap.size; i++) { + struct task_struct *p = heap.ptrs[i]; + if (i == 0) { + latest_time = p->start_time; + latest_task = p; + } + set_cpus_allowed(p, cs->cpus_allowed); + put_task_struct(p); + } + /* + * If we had to process any tasks at all, scan again + * in case some of them were in the middle of forking + * children that didn't notice the new cpumask + * restriction. Not the most efficient way to do it, + * but it avoids having to take callback_mutex in the + * fork path + */ + goto again; + } + heap_free(&heap); + if (is_load_balanced) rebuild_sched_domains(); return 0; |