diff options
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r-- | kernel/cpuset.c | 152 |
1 files changed, 119 insertions, 33 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c86ee051b734..18aea1bd1284 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -4,15 +4,14 @@ * Processor and Memory placement constraints for sets of tasks. * * Copyright (C) 2003 BULL SA. - * Copyright (C) 2004 Silicon Graphics, Inc. + * Copyright (C) 2004-2006 Silicon Graphics, Inc. * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel - * Portions Copyright (c) 2004 Silicon Graphics, Inc. * - * 2003-10-10 Written by Simon Derr <simon.derr@bull.net> + * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. - * 2004 May-July Rework by Paul Jackson <pj@sgi.com> + * 2004 May-July Rework by Paul Jackson. * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux @@ -108,37 +107,49 @@ typedef enum { CS_MEM_EXCLUSIVE, CS_MEMORY_MIGRATE, CS_REMOVED, - CS_NOTIFY_ON_RELEASE + CS_NOTIFY_ON_RELEASE, + CS_SPREAD_PAGE, + CS_SPREAD_SLAB, } cpuset_flagbits_t; /* convenient tests for these bits */ static inline int is_cpu_exclusive(const struct cpuset *cs) { - return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags); + return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); } static inline int is_mem_exclusive(const struct cpuset *cs) { - return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags); + return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); } static inline int is_removed(const struct cpuset *cs) { - return !!test_bit(CS_REMOVED, &cs->flags); + return test_bit(CS_REMOVED, &cs->flags); } static inline int notify_on_release(const struct cpuset *cs) { - return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); + return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); } static inline int is_memory_migrate(const struct cpuset *cs) { - return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); + return test_bit(CS_MEMORY_MIGRATE, &cs->flags); +} + +static inline int is_spread_page(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_PAGE, &cs->flags); +} + +static inline int is_spread_slab(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_SLAB, &cs->flags); } /* - * Increment this atomic integer everytime any cpuset changes its + * Increment this integer everytime any cpuset changes its * mems_allowed value. Users of cpusets can track this generation * number, and avoid having to lock and reload mems_allowed unless * the cpuset they're using changes generation. @@ -152,8 +163,11 @@ static inline int is_memory_migrate(const struct cpuset *cs) * on every visit to __alloc_pages(), to efficiently check whether * its current->cpuset->mems_allowed has changed, requiring an update * of its current->mems_allowed. + * + * Since cpuset_mems_generation is guarded by manage_mutex, + * there is no need to mark it atomic. */ -static atomic_t cpuset_mems_generation = ATOMIC_INIT(1); +static int cpuset_mems_generation; static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), @@ -657,6 +671,14 @@ void cpuset_update_task_memory_state(void) cs = tsk->cpuset; /* Maybe changed when task not locked */ guarantee_online_mems(cs, &tsk->mems_allowed); tsk->cpuset_mems_generation = cs->mems_generation; + if (is_spread_page(cs)) + tsk->flags |= PF_SPREAD_PAGE; + else + tsk->flags &= ~PF_SPREAD_PAGE; + if (is_spread_slab(cs)) + tsk->flags |= PF_SPREAD_SLAB; + else + tsk->flags &= ~PF_SPREAD_SLAB; task_unlock(tsk); mutex_unlock(&callback_mutex); mpol_rebind_task(tsk, &tsk->mems_allowed); @@ -858,8 +880,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) mutex_lock(&callback_mutex); cs->mems_allowed = trialcs.mems_allowed; - atomic_inc(&cpuset_mems_generation); - cs->mems_generation = atomic_read(&cpuset_mems_generation); + cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ @@ -957,7 +978,8 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, - * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) + * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, + * CS_SPREAD_PAGE, CS_SPREAD_SLAB) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 * @@ -1188,6 +1210,8 @@ typedef enum { FILE_NOTIFY_ON_RELEASE, FILE_MEMORY_PRESSURE_ENABLED, FILE_MEMORY_PRESSURE, + FILE_SPREAD_PAGE, + FILE_SPREAD_SLAB, FILE_TASKLIST, } cpuset_filetype_t; @@ -1247,6 +1271,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us case FILE_MEMORY_PRESSURE: retval = -EACCES; break; + case FILE_SPREAD_PAGE: + retval = update_flag(CS_SPREAD_PAGE, cs, buffer); + cs->mems_generation = cpuset_mems_generation++; + break; + case FILE_SPREAD_SLAB: + retval = update_flag(CS_SPREAD_SLAB, cs, buffer); + cs->mems_generation = cpuset_mems_generation++; + break; case FILE_TASKLIST: retval = attach_task(cs, buffer, &pathbuf); break; @@ -1356,6 +1388,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, case FILE_MEMORY_PRESSURE: s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); break; + case FILE_SPREAD_PAGE: + *s++ = is_spread_page(cs) ? '1' : '0'; + break; + case FILE_SPREAD_SLAB: + *s++ = is_spread_slab(cs) ? '1' : '0'; + break; default: retval = -EINVAL; goto out; @@ -1719,6 +1757,16 @@ static struct cftype cft_memory_pressure = { .private = FILE_MEMORY_PRESSURE, }; +static struct cftype cft_spread_page = { + .name = "memory_spread_page", + .private = FILE_SPREAD_PAGE, +}; + +static struct cftype cft_spread_slab = { + .name = "memory_spread_slab", + .private = FILE_SPREAD_SLAB, +}; + static int cpuset_populate_dir(struct dentry *cs_dentry) { int err; @@ -1737,6 +1785,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) return err; if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) return err; + if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0) + return err; + if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0) + return err; if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) return err; return 0; @@ -1765,13 +1817,16 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); + if (is_spread_page(parent)) + set_bit(CS_SPREAD_PAGE, &cs->flags); + if (is_spread_slab(parent)) + set_bit(CS_SPREAD_SLAB, &cs->flags); cs->cpus_allowed = CPU_MASK_NONE; cs->mems_allowed = NODE_MASK_NONE; atomic_set(&cs->count, 0); INIT_LIST_HEAD(&cs->sibling); INIT_LIST_HEAD(&cs->children); - atomic_inc(&cpuset_mems_generation); - cs->mems_generation = atomic_read(&cpuset_mems_generation); + cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); cs->parent = parent; @@ -1861,7 +1916,7 @@ int __init cpuset_init_early(void) struct task_struct *tsk = current; tsk->cpuset = &top_cpuset; - tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); + tsk->cpuset->mems_generation = cpuset_mems_generation++; return 0; } @@ -1880,8 +1935,7 @@ int __init cpuset_init(void) top_cpuset.mems_allowed = NODE_MASK_ALL; fmeter_init(&top_cpuset.fmeter); - atomic_inc(&cpuset_mems_generation); - top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); + top_cpuset.mems_generation = cpuset_mems_generation++; init_task.cpuset = &top_cpuset; @@ -1972,7 +2026,7 @@ void cpuset_fork(struct task_struct *child) * because tsk is already marked PF_EXITING, so attach_task() won't * mess with it, or task is a failed fork, never visible to attach_task. * - * Hack: + * the_top_cpuset_hack: * * Set the exiting tasks cpuset to the root cpuset (top_cpuset). * @@ -2011,7 +2065,7 @@ void cpuset_exit(struct task_struct *tsk) struct cpuset *cs; cs = tsk->cpuset; - tsk->cpuset = &top_cpuset; /* Hack - see comment above */ + tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */ if (notify_on_release(cs)) { char *pathbuf = NULL; @@ -2151,7 +2205,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { int node; /* node that zone z is on */ const struct cpuset *cs; /* current cpuset ancestors */ - int allowed = 1; /* is allocation in zone z allowed? */ + int allowed; /* is allocation in zone z allowed? */ if (in_interrupt()) return 1; @@ -2204,6 +2258,44 @@ void cpuset_unlock(void) } /** + * cpuset_mem_spread_node() - On which node to begin search for a page + * + * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for + * tasks in a cpuset with is_spread_page or is_spread_slab set), + * and if the memory allocation used cpuset_mem_spread_node() + * to determine on which node to start looking, as it will for + * certain page cache or slab cache pages such as used for file + * system buffers and inode caches, then instead of starting on the + * local node to look for a free page, rather spread the starting + * node around the tasks mems_allowed nodes. + * + * We don't have to worry about the returned node being offline + * because "it can't happen", and even if it did, it would be ok. + * + * The routines calling guarantee_online_mems() are careful to + * only set nodes in task->mems_allowed that are online. So it + * should not be possible for the following code to return an + * offline node. But if it did, that would be ok, as this routine + * is not returning the node where the allocation must be, only + * the node where the search should start. The zonelist passed to + * __alloc_pages() will include all nodes. If the slab allocator + * is passed an offline node, it will fall back to the local node. + * See kmem_cache_alloc_node(). + */ + +int cpuset_mem_spread_node(void) +{ + int node; + + node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); + if (node == MAX_NUMNODES) + node = first_node(current->mems_allowed); + current->cpuset_mem_spread_rotor = node; + return node; +} +EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); + +/** * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? * @p: pointer to task_struct of some other task. * @@ -2284,12 +2376,12 @@ void __cpuset_memory_pressure_bump(void) * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, * and we take manage_mutex, keeping attach_task() from changing it - * anyway. + * anyway. No need to check that tsk->cpuset != NULL, thanks to + * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks + * cpuset to top_cpuset. */ - static int proc_cpuset_show(struct seq_file *m, void *v) { - struct cpuset *cs; struct task_struct *tsk; char *buf; int retval = 0; @@ -2300,13 +2392,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) tsk = m->private; mutex_lock(&manage_mutex); - cs = tsk->cpuset; - if (!cs) { - retval = -EINVAL; - goto out; - } - - retval = cpuset_path(cs, buf, PAGE_SIZE); + retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); if (retval < 0) goto out; seq_puts(m, buf); |