From 08604bd9935dc98fb62ef61d5b7baa7ccc10f8c2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jun 2009 15:31:12 -0700 Subject: time: move PIT_TICK_RATE to linux/timex.h PIT_TICK_RATE is currently defined in four architectures, but in three different places. While linux/timex.h is not the perfect place for it, it is still a reasonable replacement for those drivers that traditionally use asm/timex.h to get CLOCK_TICK_RATE and expect it to be the PIT frequency. Note that for Alpha, the actual value changed from 1193182UL to 1193180UL. This is unlikely to make a difference, and probably can only improve accuracy. There was a discussion on the correct value of CLOCK_TICK_RATE a few years ago, after which every existing instance was changed to 1193182. According to the specification, it should be 1193181.818181... Signed-off-by: Arnd Bergmann Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Len Brown Cc: john stultz Cc: Dmitry Torokhov Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timex.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/timex.h b/include/linux/timex.h index 9910e3bd5b31..e6967d10d9e5 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -280,6 +280,9 @@ extern int do_adjtimex(struct timex *); int read_current_timer(unsigned long *timer_val); +/* The clock frequency of the i8253/i8254 PIT */ +#define PIT_TICK_RATE 1193182ul + #endif /* __KERNEL__ */ #endif /* _LINUX_TIMEX_H */ -- cgit v1.2.3 From 3b0fde0fac19c180317eb0601b3504083f4b9bf5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 16 Jun 2009 15:31:16 -0700 Subject: firmware_map: fix hang with x86/32bit Addresses http://bugzilla.kernel.org/show_bug.cgi?id=13484 Peer reported: | The bug is introduced from kernel 2.6.27, if E820 table reserve the memory | above 4G in 32bit OS(BIOS-e820: 00000000fff80000 - 0000000120000000 | (reserved)), system will report Int 6 error and hang up. The bug is caused by | the following code in drivers/firmware/memmap.c, the resource_size_t is 32bit | variable in 32bit OS, the BUG_ON() will be invoked to result in the Int 6 | error. I try the latest 32bit Ubuntu and Fedora distributions, all hit this | bug. |====== |static int firmware_map_add_entry(resource_size_t start, resource_size_t end, | const char *type, | struct firmware_map_entry *entry) It only happens when CONFIG_PHYS_ADDR_T_64BIT is not set; it turns out we need to pass u64 instead of resource_size_t here. [akpm@linux-foundation.org: add comment] Reported-and-tested-by: Peer Chen Signed-off-by: Yinghai Lu Cc: Ingo Molnar Acked-by: H. Peter Anvin Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/memmap.c | 16 +++++++++------- include/linux/firmware-map.h | 12 ++++-------- 2 files changed, 13 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 05aa2d406ac6..d5ea8a68d338 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -31,8 +31,12 @@ * information is necessary as for the resource tree. */ struct firmware_map_entry { - resource_size_t start; /* start of the memory range */ - resource_size_t end; /* end of the memory range (incl.) */ + /* + * start and end must be u64 rather than resource_size_t, because e820 + * resources can lie at addresses above 4G.
+ */ + u64 start; /* start of the memory range */ + u64 end; /* end of the memory range (incl.) */ const char *type; /* type of the memory range */ struct list_head list; /* entry for the linked list */ struct kobject kobj; /* kobject for each entry */ @@ -101,7 +105,7 @@ static LIST_HEAD(map_entries); * Common implementation of firmware_map_add() and firmware_map_add_early() * which expects a pre-allocated struct firmware_map_entry. **/ -static int firmware_map_add_entry(resource_size_t start, resource_size_t end, +static int firmware_map_add_entry(u64 start, u64 end, const char *type, struct firmware_map_entry *entry) { @@ -132,8 +136,7 @@ static int firmware_map_add_entry(resource_size_t start, resource_size_t end, * * Returns 0 on success, or -ENOMEM if no memory could be allocated. **/ -int firmware_map_add(resource_size_t start, resource_size_t end, - const char *type) +int firmware_map_add(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; @@ -157,8 +160,7 @@ int firmware_map_add(resource_size_t start, resource_size_t end, * * Returns 0 on success, or -ENOMEM if no memory could be allocated. **/ -int __init firmware_map_add_early(resource_size_t start, resource_size_t end, - const char *type) +int __init firmware_map_add_early(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h index cca686b39123..875451f1373a 100644 --- a/include/linux/firmware-map.h +++ b/include/linux/firmware-map.h @@ -24,21 +24,17 @@ */ #ifdef CONFIG_FIRMWARE_MEMMAP -int firmware_map_add(resource_size_t start, resource_size_t end, - const char *type); -int firmware_map_add_early(resource_size_t start, resource_size_t end, - const char *type); +int firmware_map_add(u64 start, u64 end, const char *type); +int firmware_map_add_early(u64 start, u64 end, const char *type); #else /* CONFIG_FIRMWARE_MEMMAP */ -static inline int firmware_map_add(resource_size_t start, resource_size_t end, - const char *type) +static inline int firmware_map_add(u64 start, u64 end, const char *type) { return 0; } -static inline int firmware_map_add_early(resource_size_t start, - resource_size_t end, const char *type) +static inline int firmware_map_add_early(u64 start, u64 end, const char *type) { return 0; } -- cgit v1.2.3 From bb1f17b0372de93758653ca3454bc0df18dc2e5c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jun 2009 15:31:18 -0700 Subject: mm: consolidate init_mm definition * create mm/init-mm.c, move init_mm there * remove INIT_MM, initialize init_mm with C99 initializer * unexport init_mm on all arches: init_mm is already unexported on x86. One strange place is some OMAP driver (drivers/video/omap/) which won't build modular, but it already wants the get_vm_area() export. Somebody should look there.
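For readers unfamiliar with the two initialization styles, here is a minimal userspace sketch (illustrative only; the struct and macro names are invented, this is not kernel code) of what replacing an INIT_XXX() macro with a plain C99 designated initializer looks like:

    #include <stdio.h>

    struct counters {
            int users;
            int count;
    };

    /* old style: a macro that expands to a brace initializer */
    #define INIT_COUNTERS(name) { .users = 2, .count = 1 }

    struct counters old_way = INIT_COUNTERS(old_way);

    /* new style: initialize the single real instance directly */
    struct counters new_way = {
            .users = 2,
            .count = 1,
    };

    int main(void)
    {
            printf("%d %d\n", new_way.users, new_way.count); /* prints "2 1" */
            return 0;
    }

Since there is exactly one init_mm in the kernel, the macro indirection buys nothing, which is why the initializer moves inline into the new mm/init-mm.c.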
[akpm@linux-foundation.org: add missing #includes] Signed-off-by: Alexey Dobriyan Cc: Mike Frysinger Cc: Americo Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/init_task.c | 3 --- arch/arm/kernel/init_task.c | 4 ---- arch/avr32/kernel/init_task.c | 4 ---- arch/blackfin/kernel/init_task.c | 4 ---- arch/cris/kernel/process.c | 4 ---- arch/frv/kernel/init_task.c | 4 ---- arch/h8300/kernel/init_task.c | 4 ---- arch/ia64/kernel/init_task.c | 4 ---- arch/m32r/kernel/init_task.c | 4 ---- arch/m68k/kernel/process.c | 4 ---- arch/m68knommu/kernel/init_task.c | 4 ---- arch/mips/kernel/init_task.c | 4 ---- arch/mn10300/kernel/init_task.c | 3 --- arch/parisc/kernel/init_task.c | 4 ---- arch/powerpc/kernel/init_task.c | 4 ---- arch/s390/kernel/init_task.c | 4 ---- arch/sh/kernel/init_task.c | 3 --- arch/sparc/kernel/init_task.c | 3 --- arch/um/kernel/init_task.c | 3 --- arch/x86/kernel/init_task.c | 1 - arch/xtensa/kernel/init_task.c | 4 ---- include/linux/init_task.h | 12 ------------ mm/Makefile | 1 + mm/init-mm.c | 20 ++++++++++++++++++++ 24 files changed, 21 insertions(+), 88 deletions(-) create mode 100644 mm/init-mm.c (limited to 'include') diff --git a/arch/alpha/kernel/init_task.c b/arch/alpha/kernel/init_task.c index c2938e574a40..19b86328ffd7 100644 --- a/arch/alpha/kernel/init_task.c +++ b/arch/alpha/kernel/init_task.c @@ -10,10 +10,7 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); struct task_struct init_task = INIT_TASK(init_task); - -EXPORT_SYMBOL(init_mm); EXPORT_SYMBOL(init_task); union thread_union init_thread_union diff --git a/arch/arm/kernel/init_task.c b/arch/arm/kernel/init_task.c index e859af349467..3f470866bb89 100644 --- a/arch/arm/kernel/init_task.c +++ b/arch/arm/kernel/init_task.c @@ -14,10 +14,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/avr32/kernel/init_task.c b/arch/avr32/kernel/init_task.c index 993d56ee3cf3..57ec9f2dcd95 100644 --- a/arch/avr32/kernel/init_task.c +++ b/arch/avr32/kernel/init_task.c @@ -15,10 +15,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. Must be aligned on an 8192-byte boundary. */ diff --git a/arch/blackfin/kernel/init_task.c b/arch/blackfin/kernel/init_task.c index 2c228c020978..c26c34de9f3c 100644 --- a/arch/blackfin/kernel/init_task.c +++ b/arch/blackfin/kernel/init_task.c @@ -35,10 +35,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); - -struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. 
* diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 4df0b320d524..51dcd04d2777 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -38,10 +38,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/frv/kernel/init_task.c b/arch/frv/kernel/init_task.c index 29429a8b7f6a..1d3df1d9495c 100644 --- a/arch/frv/kernel/init_task.c +++ b/arch/frv/kernel/init_task.c @@ -12,10 +12,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/h8300/kernel/init_task.c b/arch/h8300/kernel/init_task.c index cb5dc552da97..089c65ed6eb3 100644 --- a/arch/h8300/kernel/init_task.c +++ b/arch/h8300/kernel/init_task.c @@ -14,10 +14,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c index 5b0e830c6f33..c475fc281be7 100644 --- a/arch/ia64/kernel/init_task.c +++ b/arch/ia64/kernel/init_task.c @@ -19,10 +19,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/m32r/kernel/init_task.c b/arch/m32r/kernel/init_task.c index 016885c6f260..fce57e5d3f91 100644 --- a/arch/m32r/kernel/init_task.c +++ b/arch/m32r/kernel/init_task.c @@ -13,10 +13,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index ec37fb56c127..72bad65dba3a 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -42,10 +42,6 @@ */ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - union thread_union init_thread_union __attribute__((section(".data.init_task"), aligned(THREAD_SIZE))) = { INIT_THREAD_INFO(init_task) }; diff --git a/arch/m68knommu/kernel/init_task.c b/arch/m68knommu/kernel/init_task.c index fe282de1d596..45e97a207fed 100644 --- a/arch/m68knommu/kernel/init_task.c +++ b/arch/m68knommu/kernel/init_task.c @@ -14,10 +14,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. 
* diff --git a/arch/mips/kernel/init_task.c b/arch/mips/kernel/init_task.c index 149cd914526e..5b457a40c784 100644 --- a/arch/mips/kernel/init_task.c +++ b/arch/mips/kernel/init_task.c @@ -11,10 +11,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/mn10300/kernel/init_task.c b/arch/mn10300/kernel/init_task.c index 5ac3566f8c98..80d423b80af3 100644 --- a/arch/mn10300/kernel/init_task.c +++ b/arch/mn10300/kernel/init_task.c @@ -20,9 +20,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/parisc/kernel/init_task.c b/arch/parisc/kernel/init_task.c index 1e25a45d64c1..82974b20fc10 100644 --- a/arch/parisc/kernel/init_task.c +++ b/arch/parisc/kernel/init_task.c @@ -36,10 +36,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c index 688b329800bd..ffc4253fef55 100644 --- a/arch/powerpc/kernel/init_task.c +++ b/arch/powerpc/kernel/init_task.c @@ -9,10 +9,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/s390/kernel/init_task.c b/arch/s390/kernel/init_task.c index 7db95c0b8693..fe787f9e5f3f 100644 --- a/arch/s390/kernel/init_task.c +++ b/arch/s390/kernel/init_task.c @@ -18,10 +18,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. * diff --git a/arch/sh/kernel/init_task.c b/arch/sh/kernel/init_task.c index 80c35ff71d56..1719957c0a69 100644 --- a/arch/sh/kernel/init_task.c +++ b/arch/sh/kernel/init_task.c @@ -10,9 +10,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct pt_regs fake_swapper_regs; -struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_SYMBOL(init_mm); - /* * Initial thread structure. 
* diff --git a/arch/sparc/kernel/init_task.c b/arch/sparc/kernel/init_task.c index f28cb8278e98..28125c5b3d3c 100644 --- a/arch/sparc/kernel/init_task.c +++ b/arch/sparc/kernel/init_task.c @@ -10,10 +10,7 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); struct task_struct init_task = INIT_TASK(init_task); - -EXPORT_SYMBOL(init_mm); EXPORT_SYMBOL(init_task); /* .text section in head.S is aligned at 8k boundary and this gets linked diff --git a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c index 806d381947bf..b25121b537d8 100644 --- a/arch/um/kernel/init_task.c +++ b/arch/um/kernel/init_task.c @@ -10,11 +10,8 @@ #include "linux/mqueue.h" #include "asm/uaccess.h" -struct mm_struct init_mm = INIT_MM(init_mm); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -EXPORT_SYMBOL(init_mm); - /* * Initial task structure. * diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index df3bf269beab..270ff83efc11 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -12,7 +12,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); /* * Initial thread structure. diff --git a/arch/xtensa/kernel/init_task.c b/arch/xtensa/kernel/init_task.c index e07f5c9fcd35..c4302f0e4ba0 100644 --- a/arch/xtensa/kernel/init_task.c +++ b/arch/xtensa/kernel/init_task.c @@ -23,10 +23,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = { INIT_THREAD_INFO(init_task) }; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 28b1f30601b5..5368fbdc7801 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -15,18 +15,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; -#define INIT_MM(name) \ -{ \ - .mm_rb = RB_ROOT, \ - .pgd = swapper_pg_dir, \ - .mm_users = ATOMIC_INIT(2), \ - .mm_count = ATOMIC_INIT(1), \ - .mmap_sem = __RWSEM_INITIALIZER(name.mmap_sem), \ - .page_table_lock = __SPIN_LOCK_UNLOCKED(name.page_table_lock), \ - .mmlist = LIST_HEAD_INIT(name.mmlist), \ - .cpu_vm_mask = CPU_MASK_ALL, \ -} - #define INIT_SIGNALS(sig) { \ .count = ATOMIC_INIT(1), \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ diff --git a/mm/Makefile b/mm/Makefile index e89acb090b4d..cf76be785add 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o $(mmu-y) +obj-y += init-mm.o obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o diff --git a/mm/init-mm.c b/mm/init-mm.c new file mode 100644 index 000000000000..57aba0da9668 --- /dev/null +++ b/mm/init-mm.c @@ -0,0 +1,20 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +struct mm_struct init_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = 
ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; -- cgit v1.2.3 From 1ebf26a9b338534def47f307c6c8694b6dfc0a79 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:19 -0700 Subject: readahead: make mmap_miss an unsigned int This makes the performance impact of a possible mmap_miss wrap-around temporary and tolerable: i.e. at most MMAP_LOTSAMISS=100 extra readarounds. Otherwise, if mmap_miss ever wraps around to negative, it takes INT_MAX cache misses to bring it back to its normal state, and during that time mmap readaround will be _enabled_ for whatever wild random workload is running. That's an almost permanent performance impact. Signed-off-by: Wu Fengguang Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index ede84fa7da5d..8146e0264ef9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -879,7 +879,7 @@ struct file_ra_state { there are only # of pages ahead */ unsigned int ra_pages; /* Maximum readahead window */ - int mmap_miss; /* Cache miss stat for mmap accesses */ + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */ loff_t prev_pos; /* Cache last read() position */ }; -- cgit v1.2.3 From d30a11004e3411909f2448546f036a011978062e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:30 -0700 Subject: readahead: record mmap read-around states in file_ra_state Mmap read-around now shares the same code style and data structure with the readahead code. This also removes do_page_cache_readahead(). Its last user, mmap read-around, has been changed to call ra_submit(). The no-readahead-if-congested logic is dropped along the way: users are pretty sensitive to the slow loading of executables, so it is unfavorable to disable mmap read-around on a congested queue.
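As a rough illustration of the window placement that now lives in file_ra_state, here is a hedged userspace sketch (illustrative only, not the kernel code; see the filemap.c hunk below for the real logic) of how the faulting page is centered in a window of ra_pages pages:

    #include <stdio.h>

    /* read-around: center the faulting page in a window of ra_pages */
    static void readaround_window(long offset, long ra_pages)
    {
            long start = offset - ra_pages / 2;

            if (start < 0)
                    start = 0;      /* clamp at file start, like max_t(long, 0, ...) */
            printf("fault@%ld -> window [%ld, %ld), async_size=0\n",
                   offset, start, start + ra_pages);
    }

    int main(void)
    {
            readaround_window(100, 32);     /* window [84, 116) */
            readaround_window(3, 32);       /* window [0, 32)   */
            return 0;
    }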
[akpm@linux-foundation.org: coding-style fixes] Cc: Nick Piggin Signed-off-by: Fengguang Wu Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++-- mm/filemap.c | 12 +++++++----- mm/readahead.c | 23 ++--------------------- 3 files changed, 12 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index ad613ed66ab0..33da7f538841 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1178,8 +1178,6 @@ void task_dirty_inc(struct task_struct *tsk); #define VM_MAX_READAHEAD 128 /* kbytes */ #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read); int force_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read); @@ -1197,6 +1195,9 @@ void page_cache_async_readahead(struct address_space *mapping, unsigned long size); unsigned long max_sane_readahead(unsigned long nr); +unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, + struct file *filp); /* Do stack extension */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff --git a/mm/filemap.c b/mm/filemap.c index 5c0c6518f341..734891d0663d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1488,13 +1488,15 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, if (ra->mmap_miss > MMAP_LOTSAMISS) return; + /* + * mmap read-around + */ ra_pages = max_sane_readahead(ra->ra_pages); if (ra_pages) { - pgoff_t start = 0; - - if (offset > ra_pages / 2) - start = offset - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); + ra->start = max_t(long, 0, offset - ra_pages/2); + ra->size = ra_pages; + ra->async_size = 0; + ra_submit(ra, mapping, file); } } diff --git a/mm/readahead.c b/mm/readahead.c index d7c6e143a129..a7f01fcce9e7 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -133,15 +133,12 @@ out: } /* - * do_page_cache_readahead actually reads a chunk of disk. It allocates all + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all * the pages first, then submits them all for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. * * Returns the number of pages requested, or the maximum amount of I/O allowed. - * - * do_page_cache_readahead() returns -1 if it encountered request queue - * congestion. */ static int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, @@ -231,22 +228,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return ret; } -/* - * This version skips the IO if the queue is read-congested, and will tell the - * block layer to abandon the readahead if request allocation would block. - * - * force_page_cache_readahead() will ignore queue congestion and will block on - * request queues. - */ -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) -{ - if (bdi_read_congested(mapping->backing_dev_info)) - return -1; - - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); -} - /* * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a * sensible upper limit. 
@@ -260,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr) /* * Submit IO for the read-ahead request in file_ra_state. */ -static unsigned long ra_submit(struct file_ra_state *ra, +unsigned long ra_submit(struct file_ra_state *ra, struct address_space *mapping, struct file *filp) { int actual; -- cgit v1.2.3 From dc566127dd161b6c997466a2349ac179527ea89b Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:32 -0700 Subject: radix-tree: add radix_tree_prev_hole() The counterpart of radix_tree_next_hole(). To be used by context readahead. Signed-off-by: Wu Fengguang Cc: Vladislav Bolkhovitin Cc: Jens Axboe Cc: Jeff Moyer Cc: Nick Piggin Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 2 ++ lib/radix-tree.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 355f6e80db0d..c5da74918096 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -167,6 +167,8 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long first_index, unsigned int max_items); unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); +unsigned long radix_tree_prev_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan); int radix_tree_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 4bb42a0344ec..5301a52cdb4d 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -666,6 +666,43 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_next_hole); +/** + * radix_tree_prev_hole - find the prev hole (not-present entry) + * @root: tree root + * @index: index key + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] + * for the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= max_scan' + * will be true). In rare cases of wrap-around, LONG_MAX will be returned. + * + * radix_tree_next_hole may be called under rcu_read_lock. However, like + * radix_tree_gang_lookup, this will not atomically search a snapshot of + * the tree at a single point in time. For example, if a hole is created + * at index 10, then subsequently a hole is created at index 5, + * radix_tree_prev_hole covering both indexes may return 5 if called under + * rcu_read_lock. + */ +unsigned long radix_tree_prev_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + if (!radix_tree_lookup(root, index)) + break; + index--; + if (index == LONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(radix_tree_prev_hole); + static unsigned int __lookup(struct radix_tree_node *slot, void ***results, unsigned long index, unsigned int max_items, unsigned long *next_index) -- cgit v1.2.3 From d2bf6be8ab63aa84e6149aac934649aadf3828b1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 16 Jun 2009 15:31:39 -0700 Subject: mm: clean up get_user_pages_fast() documentation Move more documentation for get_user_pages_fast into the new kerneldoc comment. Add some comments for get_user_pages as well. 
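To make the documented calling convention concrete, here is a hedged sketch of a typical pin/use/release sequence (illustrative only; example_pin() is an invented helper, not part of this patch):

    #include <linux/mm.h>
    #include <linux/errno.h>

    /* pin up to four user pages for writing, use them, then release */
    static int example_pin(unsigned long uaddr)
    {
            struct page *pages[4];
            int i, got;

            got = get_user_pages_fast(uaddr, 4, 1, pages);  /* write = 1 */
            if (got <= 0)
                    return got ? got : -EFAULT;

            /* ... access the pages via kmap() or DMA here ... */

            for (i = 0; i < got; i++) {
                    set_page_dirty_lock(pages[i]);  /* we wrote to them */
                    put_page(pages[i]);             /* drop the pin */
            }
            return 0;
    }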
Also, move get_user_pages_fast declaration up to get_user_pages. It wasn't there initially because it was once a static inline function. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Nick Piggin Cc: Andy Grover Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 20 +++++--------------- mm/memory.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/util.c | 16 ++++++++++++---- 3 files changed, 67 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 33da7f538841..a880161a3854 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -824,8 +824,11 @@ static inline int handle_mm_fault(struct mm_struct *mm, extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas); +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned long offset); @@ -849,19 +852,6 @@ extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); -/* - * get_user_pages_fast provides equivalent functionality to get_user_pages, - * operating on current and current->mm (force=0 and doesn't return any vmas). - * - * get_user_pages_fast may take mmap_sem and page tables, so no assumptions - * can be made about locking. get_user_pages_fast is to be implemented in a - * way that is advantageous (vs get_user_pages()) when the user memory area is - * already faulted in and present in ptes. However if the pages have to be - * faulted in, it may turn out to be slightly slower). - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); - /* * A callback you can register to apply pressure to ageable caches. * diff --git a/mm/memory.c b/mm/memory.c index 4126dd16778c..891bad0613f4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i; } +/** + * get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @len: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force write access even if user mapping is + * readonly. This will result in the page being COWed even + * in MAP_SHARED mappings. You do not want this. + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If len is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. 
Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) diff --git a/mm/util.c b/mm/util.c index abc65aa7cdfc..d5d2213728c5 100644 --- a/mm/util.c +++ b/mm/util.c @@ -233,13 +233,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. + * + * get_user_pages_fast provides equivalent functionality to get_user_pages, + * operating on current and current->mm, with force=0 and vma=NULL. However + * unlike get_user_pages, it must be called without mmap_sem held. + * + * get_user_pages_fast may take mmap_sem and page table locks, so no + * assumptions can be made about lack of locking. get_user_pages_fast is to be + * implemented in a way that is advantageous (vs get_user_pages()) when the + * user memory area is already faulted in and present in ptes. However if the + * pages have to be faulted in, it may turn out to be slightly slower so + * callers need to carefully consider what to use. On many architectures, + * get_user_pages_fast simply falls back to get_user_pages. 
*/ int __attribute__((weak)) get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) -- cgit v1.2.3 From 58568d2a8215cb6f55caf2332017d7bdff954e1c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Jun 2009 15:31:49 -0700 Subject: cpuset,mm: update tasks' mems_allowed in time Fix allocation of page cache/slab objects on a disallowed node when memory spread is set, by updating tasks' mems_allowed after their cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the memory policy code, because memory policy was originally applied in the process's own context. After applying this patch, one task directly manipulates another's mems_allowed, and we use alloc_lock in the task_struct to protect the task's mems_allowed and memory policy. In the fast path, however, we do not take that lock, because doing so may lead to a performance regression. But without the lock, the task might see no nodes at all while its cpuset's mems_allowed is changed to some non-overlapping set. In order to avoid that, we set all new allowed nodes first, then clear the newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Paul Menage Cc: Nick Piggin Cc: Yasunori Goto Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 13 ++-- include/linux/sched.h | 8 +-- init/main.c | 6 +- kernel/cpuset.c | 184 +++++++++++++------------------------------------ kernel/kthread.c | 2 + mm/mempolicy.c | 143 +++++++++++++++++++++++++++----------- mm/page_alloc.c | 5 +- 7 files changed, 170 insertions(+), 191 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 05ea1dd7d681..a5740fc4d04b 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -18,7 +18,6 @@ extern int number_of_cpusets; /* How many cpusets are defined in system?
*/ -extern int cpuset_init_early(void); extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); @@ -27,7 +26,6 @@ extern void cpuset_cpus_allowed_locked(struct task_struct *p, extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); -void cpuset_update_task_memory_state(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask); @@ -92,9 +90,13 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); +static inline void set_mems_allowed(nodemask_t nodemask) +{ + current->mems_allowed = nodemask; +} + #else /* !CONFIG_CPUSETS */ -static inline int cpuset_init_early(void) { return 0; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} @@ -116,7 +118,6 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) #define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY]) static inline void cpuset_init_current_mems_allowed(void) {} -static inline void cpuset_update_task_memory_state(void) {} static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { @@ -188,6 +189,10 @@ static inline void cpuset_print_task_mems_allowed(struct task_struct *p) { } +static inline void set_mems_allowed(nodemask_t nodemask) +{ +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index c900aa530070..1048bf50540a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1318,7 +1318,8 @@ struct task_struct { /* Thread group tracking */ u32 parent_exec_id; u32 self_exec_id; -/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ +/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, + * mempolicy */ spinlock_t alloc_lock; #ifdef CONFIG_GENERIC_HARDIRQS @@ -1386,8 +1387,7 @@ struct task_struct { cputime_t acct_timexpd; /* stime + utime since last update */ #endif #ifdef CONFIG_CPUSETS - nodemask_t mems_allowed; - int cpuset_mems_generation; + nodemask_t mems_allowed; /* Protected by alloc_lock */ int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS @@ -1410,7 +1410,7 @@ struct task_struct { struct list_head perf_counter_list; #endif #ifdef CONFIG_NUMA - struct mempolicy *mempolicy; + struct mempolicy *mempolicy; /* Protected by alloc_lock */ short il_next; #endif atomic_t fs_excl; /* holding fs exclusive resources */ diff --git a/init/main.c b/init/main.c index f6204f712e7c..5e0d3f047eaf 100644 --- a/init/main.c +++ b/init/main.c @@ -670,7 +670,6 @@ asmlinkage void __init start_kernel(void) initrd_start = 0; } #endif - cpuset_init_early(); page_cgroup_init(); enable_debug_pagealloc(); cpu_hotplug_init(); @@ -867,6 +866,11 @@ static noinline int init_post(void) static int __init kernel_init(void * unused) { lock_kernel(); + + /* + * init can allocate pages on any node + */ + set_mems_allowed(node_possible_map); /* * init can run on any cpu. */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index af5a83d52187..7e75a41bd508 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -97,12 +97,6 @@ struct cpuset { struct cpuset *parent; /* my parent */ - /* - * Copy of global cpuset_mems_generation as of the most - * recent time this cpuset changed its mems_allowed. 
- */ - int mems_generation; - struct fmeter fmeter; /* memory_pressure filter */ /* partition number for rebuild_sched_domains() */ @@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } -/* - * Increment this integer everytime any cpuset changes its - * mems_allowed value. Users of cpusets can track this generation - * number, and avoid having to lock and reload mems_allowed unless - * the cpuset they're using changes generation. - * - * A single, global generation is needed because cpuset_attach_task() could - * reattach a task to a different cpuset, which must not have its - * generation numbers aliased with those of that tasks previous cpuset. - * - * Generations are needed for mems_allowed because one task cannot - * modify another's memory placement. So we must enable every task, - * on every visit to __alloc_pages(), to efficiently check whether - * its current->cpuset->mems_allowed has changed, requiring an update - * of its current->mems_allowed. - * - * Since writes to cpuset_mems_generation are guarded by the cgroup lock - * there is no need to mark it atomic. - */ -static int cpuset_mems_generation; - static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), }; @@ -228,8 +201,9 @@ static struct cpuset top_cpuset = { * If a task is only holding callback_mutex, then it has read-only * access to cpusets. * - * The task_struct fields mems_allowed and mems_generation may only - * be accessed in the context of that task, so require no locks. + * Now, the task_struct fields mems_allowed and mempolicy may be changed + * by other task, we use alloc_lock in the task_struct fields to protect + * them. * * The cpuset_common_file_read() handlers only hold callback_mutex across * small pieces of code, such as when reading out possibly multi-word @@ -349,69 +323,6 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, tsk->flags &= ~PF_SPREAD_SLAB; } -/** - * cpuset_update_task_memory_state - update task memory placement - * - * If the current tasks cpusets mems_allowed changed behind our - * backs, update current->mems_allowed, mems_generation and task NUMA - * mempolicy to the new value. - * - * Task mempolicy is updated by rebinding it relative to the - * current->cpuset if a task has its memory placement changed. - * Do not call this routine if in_interrupt(). - * - * Call without callback_mutex or task_lock() held. May be - * called with or without cgroup_mutex held. Thanks in part to - * 'the_top_cpuset_hack', the task's cpuset pointer will never - * be NULL. This routine also might acquire callback_mutex during - * call. - * - * Reading current->cpuset->mems_generation doesn't need task_lock - * to guard the current->cpuset derefence, because it is guarded - * from concurrent freeing of current->cpuset using RCU. - * - * The rcu_dereference() is technically probably not needed, - * as I don't actually mind if I see a new cpuset pointer but - * an old value of mems_generation. However this really only - * matters on alpha systems using cpusets heavily. If I dropped - * that rcu_dereference(), it would save them a memory barrier. - * For all other arch's, rcu_dereference is a no-op anyway, and for - * alpha systems not using cpusets, another planned optimization, - * avoiding the rcu critical section for tasks in the root cpuset - * which is statically allocated, so can't vanish, will make this - * irrelevant. 
Better to use RCU as intended, than to engage in - * some cute trick to save a memory barrier that is impossible to - * test, for alpha systems using cpusets heavily, which might not - * even exist. - * - * This routine is needed to update the per-task mems_allowed data, - * within the tasks context, when it is trying to allocate memory - * (in various mm/mempolicy.c routines) and notices that some other - * task has been modifying its cpuset. - */ - -void cpuset_update_task_memory_state(void) -{ - int my_cpusets_mem_gen; - struct task_struct *tsk = current; - struct cpuset *cs; - - rcu_read_lock(); - my_cpusets_mem_gen = task_cs(tsk)->mems_generation; - rcu_read_unlock(); - - if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { - mutex_lock(&callback_mutex); - task_lock(tsk); - cs = task_cs(tsk); /* Maybe changed when task not locked */ - guarantee_online_mems(cs, &tsk->mems_allowed); - tsk->cpuset_mems_generation = cs->mems_generation; - task_unlock(tsk); - mutex_unlock(&callback_mutex); - mpol_rebind_task(tsk, &tsk->mems_allowed); - } -} - /* * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? * @@ -1017,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that * migrating memory region. - * - * We call cpuset_update_task_memory_state() before hacking - * our tasks mems_allowed, so that we are assured of being in - * sync with our tasks cpuset, and in particular, callbacks to - * cpuset_update_task_memory_state() from nested page allocations - * won't see any mismatch of our cpuset and task mems_generation - * values, so won't overwrite our hacked tasks mems_allowed - * nodemask. */ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, @@ -1032,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, { struct task_struct *tsk = current; - cpuset_update_task_memory_state(); - - mutex_lock(&callback_mutex); tsk->mems_allowed = *to; - mutex_unlock(&callback_mutex); do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); - mutex_lock(&callback_mutex); guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); - mutex_unlock(&callback_mutex); } /* - * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new - * nodes if memory_migrate flag is set. Called with cgroup_mutex held. + * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy + * @tsk: the task to change + * @newmems: new nodes that the task will be set + * + * In order to avoid seeing no nodes if the old and new nodes are disjoint, + * we structure updates as setting all new allowed nodes, then clearing newly + * disallowed ones. + * + * Called with task's alloc_lock held + */ +static void cpuset_change_task_nodemask(struct task_struct *tsk, + nodemask_t *newmems) +{ + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); + mpol_rebind_task(tsk, &tsk->mems_allowed); + mpol_rebind_task(tsk, newmems); + tsk->mems_allowed = *newmems; +} + +/* + * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy + * of it to cpuset's new mems_allowed, and migrate pages to new nodes if + * memory_migrate flag is set. Called with cgroup_mutex held. 
*/ static void cpuset_change_nodemask(struct task_struct *p, struct cgroup_scanner *scan) @@ -1056,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; + nodemask_t newmems; + + cs = cgroup_cs(scan->cg); + guarantee_online_mems(cs, &newmems); + + task_lock(p); + cpuset_change_task_nodemask(p, &newmems); + task_unlock(p); mm = get_task_mm(p); if (!mm) return; - cs = cgroup_cs(scan->cg); migrate = is_memory_migrate(cs); mpol_rebind_mm(mm, &cs->mems_allowed); @@ -1114,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, /* * Handle user request to change the 'mems' memory placement * of a cpuset. Needs to validate the request, update the - * cpusets mems_allowed and mems_generation, and for each - * task in the cpuset, rebind any vma mempolicies and if - * the cpuset is marked 'memory_migrate', migrate the tasks - * pages to the new memory. + * cpusets mems_allowed, and for each task in the cpuset, + * update mems_allowed and rebind task's mempolicy and any vma + * mempolicies and if the cpuset is marked 'memory_migrate', + * migrate the tasks pages to the new memory. * * Call with cgroup_mutex held. May take callback_mutex during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, @@ -1170,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, mutex_lock(&callback_mutex); cs->mems_allowed = trialcs->mems_allowed; - cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); update_tasks_nodemask(cs, &oldmem, &heap); @@ -1434,15 +1358,18 @@ static void cpuset_attach(struct cgroup_subsys *ss, if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); + to = node_possible_map; } else { - mutex_lock(&callback_mutex); guarantee_online_cpus(cs, cpus_attach); - mutex_unlock(&callback_mutex); + guarantee_online_mems(cs, &to); } err = set_cpus_allowed_ptr(tsk, cpus_attach); if (err) return; + task_lock(tsk); + cpuset_change_task_nodemask(tsk, &to); + task_unlock(tsk); cpuset_update_task_spread_flag(cs, tsk); from = oldcs->mems_allowed; @@ -1848,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create( struct cpuset *parent; if (!cont->parent) { - /* This is early initialization for the top cgroup */ - top_cpuset.mems_generation = cpuset_mems_generation++; return &top_cpuset.css; } parent = cgroup_cs(cont->parent); @@ -1861,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create( return ERR_PTR(-ENOMEM); } - cpuset_update_task_memory_state(); cs->flags = 0; if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); @@ -1870,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create( set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); - cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; @@ -1889,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); - cpuset_update_task_memory_state(); - if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -1911,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = { .early_init = 1, }; -/* - * cpuset_init_early - just enough so that the calls to - * cpuset_update_task_memory_state() in early init code - * are harmless. 
- */ - -int __init cpuset_init_early(void) -{ - alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT); - - top_cpuset.mems_generation = cpuset_mems_generation++; - return 0; -} - - /** * cpuset_init - initialize cpusets at system boot * @@ -1936,11 +1842,13 @@ int __init cpuset_init(void) { int err = 0; + if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) + BUG(); + cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); fmeter_init(&top_cpuset.fmeter); - top_cpuset.mems_generation = cpuset_mems_generation++; set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1; diff --git a/kernel/kthread.c b/kernel/kthread.c index 41c88fe40500..7fa441333529 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +237,7 @@ int kthreadd(void *unused) ignore_signals(tsk); set_user_nice(tsk, KTHREAD_NICE_LEVEL); set_cpus_allowed_ptr(tsk, cpu_all_mask); + set_mems_allowed(node_possible_map); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3eb4a6fdc043..46bdf9ddf2ba 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) return 0; } -/* Create a new policy */ +/* + * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if + * any, for the new policy. mpol_new() has already validated the nodes + * parameter with respect to the policy mode and flags. But, we need to + * handle an empty nodemask with MPOL_PREFERRED here. + * + * Must be called holding task's alloc_lock to protect task's mems_allowed + * and mempolicy. May also be called holding the mmap_semaphore for write. + */ +static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +{ + nodemask_t cpuset_context_nmask; + int ret; + + /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ + if (pol == NULL) + return 0; + + VM_BUG_ON(!nodes); + if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) + nodes = NULL; /* explicit local allocation */ + else { + if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&cpuset_context_nmask, nodes, + &cpuset_current_mems_allowed); + else + nodes_and(cpuset_context_nmask, *nodes, + cpuset_current_mems_allowed); + if (mpol_store_user_nodemask(pol)) + pol->w.user_nodemask = *nodes; + else + pol->w.cpuset_mems_allowed = + cpuset_current_mems_allowed; + } + + ret = mpol_ops[pol->mode].create(pol, + nodes ? &cpuset_context_nmask : NULL); + return ret; +} + +/* + * This function just creates a new policy, does some check and simple + * initialization. You must invoke mpol_set_nodemask() to set nodes. + */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; - nodemask_t cpuset_context_nmask; - int ret; pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? 
nodes_addr(*nodes)[0] : -1); @@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); - nodes = NULL; /* flag local alloc */ } } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); @@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, policy->mode = mode; policy->flags = flags; - if (nodes) { - /* - * cpuset related setup doesn't apply to local allocation - */ - cpuset_update_task_memory_state(); - if (flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); - else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); - if (mpol_store_user_nodemask(policy)) - policy->w.user_nodemask = *nodes; - else - policy->w.cpuset_mems_allowed = - cpuset_mems_allowed(current); - } - - ret = mpol_ops[mode].create(policy, - nodes ? &cpuset_context_nmask : NULL); - if (ret < 0) { - kmem_cache_free(policy_cache, policy); - return ERR_PTR(ret); - } return policy; } @@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol, /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. + * + * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) @@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void) static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { - struct mempolicy *new; + struct mempolicy *new, *old; struct mm_struct *mm = current->mm; + int ret; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) @@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, */ if (mm) down_write(&mm->mmap_sem); - mpol_put(current->mempolicy); + task_lock(current); + ret = mpol_set_nodemask(new, nodes); + if (ret) { + task_unlock(current); + if (mm) + up_write(&mm->mmap_sem); + mpol_put(new); + return ret; + } + old = current->mempolicy; current->mempolicy = new; mpol_set_task_struct_flag(); if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); + task_unlock(current); if (mm) up_write(&mm->mmap_sem); + mpol_put(old); return 0; } /* * Return nodemask for policy for get_mempolicy() query + * + * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) { @@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; - cpuset_update_task_memory_state(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; @@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ + task_lock(current); *nmask = cpuset_current_mems_allowed; + task_unlock(current); return 0; } @@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } err = 0; - if (nmask) + if (nmask) { + task_lock(current); get_policy_nodemask(pol, nmask); + task_unlock(current); + } out: mpol_cond_put(pol); @@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len, return err; } down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask); + task_unlock(current); + if (err) { + 
up_write(&mm->mmap_sem); + mpol_put(new); + return err; + } vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); @@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; - cpuset_update_task_memory_state(); - if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; @@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; - if ((gfp & __GFP_WAIT) && !in_interrupt()) - cpuset_update_task_memory_state(); if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; @@ -1854,6 +1894,8 @@ restart: */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { + int ret; + sp->root = RB_ROOT; /* empty tree == default mempolicy */ spin_lock_init(&sp->lock); @@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - mpol_put(mpol); /* drop our ref on sb mpol */ - if (IS_ERR(new)) + if (IS_ERR(new)) { + mpol_put(mpol); /* drop our ref on sb mpol */ return; /* no valid nodemask intersection */ + } + + task_lock(current); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); + task_unlock(current); + mpol_put(mpol); /* drop our ref on sb mpol */ + if (ret) { + mpol_put(new); + return; + } /* Create pseudo-vma that contains just the policy */ memset(&pvma, 0, sizeof(struct vm_area_struct)); @@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) err = 1; - else if (no_context) - new->w.user_nodemask = nodes; /* save for contextualization */ + else { + int ret; + + task_lock(current); + ret = mpol_set_nodemask(new, &nodes); + task_unlock(current); + if (ret) + err = 1; + else if (no_context) { + /* save for contextualization */ + new->w.user_nodemask = nodes; + } + } out: /* Restore string for error message */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 17d5f539a9aa..7cc3179e3591 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1569,10 +1569,7 @@ nofail_alloc: /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - /* - * The task's cpuset might have expanded its set of allowable nodes - */ - cpuset_update_task_memory_state(); + p->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); -- cgit v1.2.3 From d239171e4f6efd58d7e423853056b1b6a74f1446 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:52 -0700 Subject: page allocator: replace __alloc_pages_internal() with __alloc_pages_nodemask() The start of a large patch series to clean up and optimise the page allocator. The performance improvements are in a wide range depending on the exact machine but the results I've seen so far are approximately: kernbench: 0 to 0.12% (elapsed time) 0.49% to 3.20% (sys time) aim9: -4% to 30% (for page_test and brk_test) tbench: -1% to 4% hackbench: -2.5% to 3.45% (mostly within the noise though) netperf-udp -1.34% to 4.06% (varies between machines a bit) netperf-tcp -0.44% to 5.22% (varies between machines a bit) I don't have sysbench figures at hand, but previously they were within the -0.5% to 2% range.
On netperf, the client and server were bound to opposite number CPUs to maximise the problems with cache line bouncing of the struct pages so I expect different people to report different results for netperf depending on their exact machine and how they ran the test (different machines, same cpus client/server, shared cache but two threads client/server, different socket client/server etc). I also measured the vmlinux sizes for a single x86-based config with CONFIG_DEBUG_INFO enabled but not CONFIG_DEBUG_VM. The core of the .config is based on the Debian Lenny kernel config so I expect it to be reasonably typical. This patch: __alloc_pages_internal is the core page allocator function but essentially it is an alias of __alloc_pages_nodemask. Naming a publicly available and exported function "internal" is also a bit ugly. This patch renames __alloc_pages_internal() to __alloc_pages_nodemask() and deletes the old nodemask function. Warning - This patch renames an exported symbol. No in-kernel driver is affected, but external drivers calling __alloc_pages_internal() should change the call to __alloc_pages_nodemask() without any alteration of parameters. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++---------- mm/page_alloc.c | 4 ++-- 2 files changed, 4 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3760e7c5de02..549ec5583103 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -172,24 +172,16 @@ static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page * -__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask); static inline struct page * __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist) { - return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); + return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL); } -static inline struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) -{ - return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); -} - - static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbed869fd831..d58df9031503 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1458,7 +1458,7 @@ try_next_zone: * This is the 'heart' of the zoned buddy allocator. */ struct page * -__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { const gfp_t wait = gfp_mask & __GFP_WAIT; @@ -1667,7 +1667,7 @@ nopage: got_pg: return page; } -EXPORT_SYMBOL(__alloc_pages_internal); +EXPORT_SYMBOL(__alloc_pages_nodemask); /* * Common helper functions. -- cgit v1.2.3 From b3c466ce512923298ae8c0121d3e9f397a3f1210 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:53 -0700 Subject: page allocator: do not sanity check order in the fast path No user of the allocator API should be passing in an order >= MAX_ORDER but we check for it on each and every allocation. Delete this check and make it a VM_BUG_ON check further down the call path.
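To make the effect concrete, here is a sketch with a hypothetical caller (not code from the patch): with the inline check gone, an out-of-range order travels into the allocator core, where the relocated check catches it once and the allocation fails with NULL.

/*
 * Hypothetical out-of-tree caller, for illustration only. Valid
 * orders are 0 .. MAX_ORDER-1, so this request is one too big; it is
 * now rejected (with a one-time warning) in get_page_from_freelist()
 * rather than by a branch in every inlined alloc_pages() call site.
 */
static struct page *grab_oversized_buffer(void)
{
	return alloc_pages(GFP_KERNEL, MAX_ORDER);	/* returns NULL */
}

Well-behaved callers no longer carry the comparison in their inlined fast path; broken callers are still caught, in one place.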
[akpm@linux-foundation.org: s/VM_BUG_ON/WARN_ON_ONCE/] Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 6 ------ mm/page_alloc.c | 3 +++ 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 549ec5583103..c2d3fe03b5d2 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -185,9 +185,6 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { - if (unlikely(order >= MAX_ORDER)) - return NULL; - /* Unknown node is current node */ if (nid < 0) nid = numa_node_id(); @@ -201,9 +198,6 @@ extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); static inline struct page * alloc_pages(gfp_t gfp_mask, unsigned int order) { - if (unlikely(order >= MAX_ORDER)) - return NULL; - return alloc_pages_current(gfp_mask, order); } extern struct page *alloc_page_vma(gfp_t gfp_mask, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d58df9031503..bfbd95c0610f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1401,6 +1401,9 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, classzone_idx = zone_idx(preferred_zone); + if (WARN_ON_ONCE(order >= MAX_ORDER)) + return NULL; + zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. -- cgit v1.2.3 From 6484eb3e2a81807722c5f28efef94d8338b7b996 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:54 -0700 Subject: page allocator: do not check NUMA node ID when the caller knows the node is valid Callers of alloc_pages_node() can optionally specify -1 as a node to mean "allocate from the current node". However, a number of the callers in fast paths know for a fact their node is valid. To avoid a comparison and branch, this patch adds alloc_pages_exact_node() that only checks the nid with VM_BUG_ON(). Callers that know their node is valid are then converted. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Acked-by: Paul Mundt [for the SLOB NUMA bits] Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/hp/common/sba_iommu.c | 2 +- arch/ia64/kernel/mca.c | 3 +-- arch/ia64/kernel/uncached.c | 3 ++- arch/ia64/sn/pci/pci_dma.c | 3 ++- arch/powerpc/platforms/cell/ras.c | 4 ++-- arch/x86/kvm/vmx.c | 2 +- drivers/misc/sgi-gru/grufile.c | 2 +- drivers/misc/sgi-xp/xpc_uv.c | 2 +- include/linux/gfp.h | 9 +++++++++ include/linux/mm.h | 1 - kernel/profile.c | 8 ++++---- mm/filemap.c | 2 +- mm/hugetlb.c | 4 ++-- mm/mempolicy.c | 2 +- mm/migrate.c | 2 +- mm/slab.c | 4 ++-- mm/slob.c | 4 ++-- 17 files changed, 33 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 56ceb68eb99d..fe63b2dc9d07 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1131,7 +1131,7 @@ sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp #ifdef CONFIG_NUMA { struct page *page; - page = alloc_pages_node(ioc->node == MAX_NUMNODES ? + page = alloc_pages_exact_node(ioc->node == MAX_NUMNODES ? 
numa_node_id() : ioc->node, flags, get_order(size)); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 8f33a8840422..5b17bd402275 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1829,8 +1829,7 @@ ia64_mca_cpu_init(void *cpu_data) data = mca_bootmem(); first_time = 0; } else - data = page_address(alloc_pages_node(numa_node_id(), - GFP_KERNEL, get_order(sz))); + data = __get_free_pages(GFP_KERNEL, get_order(sz)); if (!data) panic("Could not allocate MCA memory for cpu %d\n", cpu); diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index 8eff8c1d40a6..6ba72ab42fcc 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -98,7 +98,8 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) /* attempt to allocate a granule's worth of cached memory pages */ - page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + page = alloc_pages_exact_node(nid, + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, IA64_GRANULE_SHIFT-PAGE_SHIFT); if (!page) { mutex_unlock(&uc_pool->add_chunk_mutex); diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c index d876423e4e75..98b684928e12 100644 --- a/arch/ia64/sn/pci/pci_dma.c +++ b/arch/ia64/sn/pci/pci_dma.c @@ -90,7 +90,8 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size, */ node = pcibus_to_node(pdev->bus); if (likely(node >=0)) { - struct page *p = alloc_pages_node(node, flags, get_order(size)); + struct page *p = alloc_pages_exact_node(node, + flags, get_order(size)); if (likely(p)) cpuaddr = page_address(p); diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c index 296b5268754e..5e0a191764fc 100644 --- a/arch/powerpc/platforms/cell/ras.c +++ b/arch/powerpc/platforms/cell/ras.c @@ -122,8 +122,8 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order) area->nid = nid; area->order = order; - area->pages = alloc_pages_node(area->nid, GFP_KERNEL | GFP_THISNODE, - area->order); + area->pages = alloc_pages_exact_node(area->nid, GFP_KERNEL|GFP_THISNODE, + area->order); if (!area->pages) { printk(KERN_WARNING "%s: no page on node %d\n", diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32d6ae8fb60e..e770bf349ec4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1277,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index bbefe77c67a9..3ce2920e2bf3 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -302,7 +302,7 @@ static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr) pnode = uv_node_to_pnode(nid); if (bid < 0 || gru_base[bid]) continue; - page = alloc_pages_node(nid, GFP_KERNEL, order); + page = alloc_pages_exact_node(nid, GFP_KERNEL, order); if (!page) goto fail; gru_base[bid] = page_address(page); diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 9172fcdee4e2..c76677afda1b 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -232,7 +232,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name, mq->mmr_blade = uv_cpu_to_blade_id(cpu); nid = cpu_to_node(cpu); - page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + page = 
alloc_pages_exact_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, pg_order); if (page == NULL) { dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d " diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c2d3fe03b5d2..4efa33088a82 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -5,6 +5,7 @@ #include #include #include +#include struct vm_area_struct; @@ -192,6 +193,14 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); } +static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask, + unsigned int order) +{ + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + + return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); +} + #ifdef CONFIG_NUMA extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); diff --git a/include/linux/mm.h b/include/linux/mm.h index a880161a3854..7b548e7cfbd9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -7,7 +7,6 @@ #include #include -#include #include #include #include diff --git a/kernel/profile.c b/kernel/profile.c index 28cf26ad2d24..69911b5745eb 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -365,7 +365,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, node = cpu_to_node(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -373,7 +373,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -564,14 +564,14 @@ static int create_hash_tables(void) int node = cpu_to_node(cpu); struct page *page; - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 0); if (!page) diff --git a/mm/filemap.c b/mm/filemap.c index 6846a902f5cf..22396713feb9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, gfp, 0); + return alloc_pages_exact_node(n, gfp, 0); } return alloc_pages(gfp, 0); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e83ad2c9228c..2f8241f300f5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -630,7 +630,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) if (h->order >= MAX_ORDER) return NULL; - page = alloc_pages_node(nid, + page = alloc_pages_exact_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -649,7 +649,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * Use a helper variable to find the next node and then * copy it back to hugetlb_next_nid afterwards: * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really * doesn't matter if occasionally a racer chooses the * same nid as we do. 
Move nid forward in the mask even diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 46bdf9ddf2ba..e08e2c4da63a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -803,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* diff --git a/mm/migrate.c b/mm/migrate.c index 068655d8f883..5a24923e7fd7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_node(pm->node, + return alloc_pages_exact_node(pm->node, GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } diff --git a/mm/slab.c b/mm/slab.c index 18e3164de09a..bb3254c95cd2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1707,7 +1707,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - page = alloc_pages_node(nodeid, flags, cachep->gfporder); + page = alloc_pages_exact_node(nodeid, flags, cachep->gfporder); if (!page) return NULL; @@ -3261,7 +3261,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, -1); + obj = kmem_getpages(cache, local_flags, numa_node_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { diff --git a/mm/slob.c b/mm/slob.c index 12f261499925..64f6db1943bf 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -46,7 +46,7 @@ * NUMA support in SLOB is fairly simplistic, pushing most of the real * logic down to the page allocator, and simply doing the node accounting * on the upper levels. In the event that a node id is explicitly - * provided, alloc_pages_node() with the specified node id is used + * provided, alloc_pages_exact_node() with the specified node id is used * instead. The common case (or when the node id isn't explicitly provided) * will default to the current node, as per numa_node_id(). * @@ -244,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) #ifdef CONFIG_NUMA if (node != -1) - page = alloc_pages_node(node, gfp, order); + page = alloc_pages_exact_node(node, gfp, order); else #endif page = alloc_pages(gfp, order); -- cgit v1.2.3 From 49255c619fbd482d704289b5eb2795f8e3b7ff2e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:58 -0700 Subject: page allocator: move check for disabled anti-fragmentation out of fastpath On low-memory systems, anti-fragmentation gets disabled as there is nothing it can do and it would just incur overhead shuffling pages between lists constantly. Currently the check is made in the free page fast path for every page. This patch moves it to a slow path. On machines with low memory, there will be a small amount of additional overhead as pages get shuffled between lists but it should quickly settle.
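The asymmetry being exploited can be sketched as follows (a simplified free-path fragment, not code from the patch): the pageblock type is read on every page free but written only when a block changes type, so the disabled-grouping branch is far cheaper on the write side.

/*
 * Simplified sketch of the free fast path. After this patch,
 * get_pageblock_migratetype() is a plain bitfield lookup; on
 * low-memory systems the override happens in
 * set_pageblock_migratetype() instead, which steers every block it
 * writes onto the MIGRATE_UNMOVABLE free list.
 */
static void free_one_page_sketch(struct zone *zone, struct page *page,
				 unsigned int order)
{
	int migratetype = get_pageblock_migratetype(page);	/* hot: per free */

	list_add(&page->lru,
		 &zone->free_area[order].free_list[migratetype]);
	zone->free_area[order].nr_free++;
}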
Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 --- mm/page_alloc.c | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a47c879e1304..0aa4445b0b8a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -50,9 +50,6 @@ extern int page_group_by_mobility_disabled; static inline int get_pageblock_migratetype(struct page *page) { - if (unlikely(page_group_by_mobility_disabled)) - return MIGRATE_UNMOVABLE; - return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 512bf9a618c7..b09859629e93 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -168,6 +168,10 @@ int page_group_by_mobility_disabled __read_mostly; static void set_pageblock_migratetype(struct page *page, int migratetype) { + + if (unlikely(page_group_by_mobility_disabled)) + migratetype = MIGRATE_UNMOVABLE; + set_pageblock_flags_group(page, (unsigned long)migratetype, PB_migrate, PB_migrate_end); } -- cgit v1.2.3 From 418589663d6011de9006425b6c5721e1544fb47a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:32:12 -0700 Subject: page allocator: use allocation flags as an index to the zone watermark ALLOC_WMARK_MIN, ALLOC_WMARK_LOW and ALLOC_WMARK_HIGH determine whether pages_min, pages_low or pages_high is used as the zone watermark when allocating the pages. Two branches in the allocator hotpath determine which watermark to use. This patch uses the flags as an array index into a watermark array that is indexed with WMARK_* defines accessed via helpers. All call sites that use zone->pages_* are updated to use the helpers for accessing the values and the array offsets for setting. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 11 +++++----- Documentation/vm/balance | 18 ++++++++-------- arch/m32r/mm/discontig.c | 6 +++--- include/linux/mmzone.h | 16 +++++++++++++- mm/page_alloc.c | 51 +++++++++++++++++++++++---------------------- mm/vmscan.c | 39 ++++++++++++++++++---------------- mm/vmstat.c | 6 +++--- 7 files changed, 83 insertions(+), 64 deletions(-) (limited to 'include') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6fab2dcbb4d3..0ea5adbc5b16 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -233,8 +233,8 @@ These protections are added to score to judge whether this zone should be used for page allocation or should be reclaimed. In this example, if normal pages (index=2) are required to this DMA zone and -pages_high is used for watermark, the kernel judges this zone should not be -used because pages_free(1355) is smaller than watermark + protection[2] +watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should +not be used because pages_free(1355) is smaller than watermark + protection[2] (4 + 2004 = 2008). If this protection value is 0, this zone would be used for normal page requirement. If requirement is DMA zone(index=0), protection[0] (=0) is used. @@ -280,9 +280,10 @@ The default value is 65536.
min_free_kbytes: This is used to force the Linux VM to keep a minimum number -of kilobytes free. The VM uses this number to compute a pages_min -value for each lowmem zone in the system. Each lowmem zone gets -a number of reserved free pages based proportionally on its size. +of kilobytes free. The VM uses this number to compute a +watermark[WMARK_MIN] value for each lowmem zone in the system. +Each lowmem zone gets a number of reserved free pages based +proportionally on its size. Some minimal amount of memory is needed to satisfy PF_MEMALLOC allocations; if you set this to lower than 1024KB, your system will diff --git a/Documentation/vm/balance b/Documentation/vm/balance index bd3d31bc4915..c46e68cf9344 100644 --- a/Documentation/vm/balance +++ b/Documentation/vm/balance @@ -75,15 +75,15 @@ Page stealing from process memory and shm is done if stealing the page would alleviate memory pressure on any zone in the page's node that has fallen below its watermark. -pages_min/pages_low/pages_high/low_on_memory/zone_wake_kswapd: These are -per-zone fields, used to determine when a zone needs to be balanced. When -the number of pages falls below pages_min, the hysteric field low_on_memory -gets set. This stays set till the number of free pages becomes pages_high. -When low_on_memory is set, page allocation requests will try to free some -pages in the zone (providing GFP_WAIT is set in the request). Orthogonal -to this, is the decision to poke kswapd to free some zone pages. That -decision is not hysteresis based, and is done when the number of free -pages is below pages_low; in which case zone_wake_kswapd is also set. +watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These +are per-zone fields, used to determine when a zone needs to be balanced. When +the number of pages falls below watermark[WMARK_MIN], the hysteric field +low_on_memory gets set. This stays set till the number of free pages becomes +watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will +try to free some pages in the zone (providing GFP_WAIT is set in the request). +Orthogonal to this, is the decision to poke kswapd to free some zone pages. +That decision is not hysteresis based, and is done when the number of free +pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set. (Good) Ideas that I have heard: diff --git a/arch/m32r/mm/discontig.c b/arch/m32r/mm/discontig.c index 7daf897292cf..b7a78ad429b7 100644 --- a/arch/m32r/mm/discontig.c +++ b/arch/m32r/mm/discontig.c @@ -154,9 +154,9 @@ unsigned long __init zone_sizes_init(void) * Use all area of internal RAM. 
* see __alloc_pages() */ - NODE_DATA(1)->node_zones->pages_min = 0; - NODE_DATA(1)->node_zones->pages_low = 0; - NODE_DATA(1)->node_zones->pages_high = 0; + NODE_DATA(1)->node_zones->watermark[WMARK_MIN] = 0; + NODE_DATA(1)->node_zones->watermark[WMARK_LOW] = 0; + NODE_DATA(1)->node_zones->watermark[WMARK_HIGH] = 0; return holes; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0aa4445b0b8a..dd8487f0442f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -163,6 +163,17 @@ static inline int is_unevictable_lru(enum lru_list l) #endif } +enum zone_watermarks { + WMARK_MIN, + WMARK_LOW, + WMARK_HIGH, + NR_WMARK +}; + +#define min_wmark_pages(z) (z->watermark[WMARK_MIN]) +#define low_wmark_pages(z) (z->watermark[WMARK_LOW]) +#define high_wmark_pages(z) (z->watermark[WMARK_HIGH]) + struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ @@ -275,7 +286,10 @@ struct zone_reclaim_stat { struct zone { /* Fields commonly accessed by the page allocator */ - unsigned long pages_min, pages_low, pages_high; + + /* zone watermarks, access with *_wmark_pages(zone) macros */ + unsigned long watermark[NR_WMARK]; + /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8485735fc690..abe26003124d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1150,10 +1150,15 @@ failed: return NULL; } -#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ -#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ -#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ -#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +/* The ALLOC_WMARK bits are used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ @@ -1440,14 +1445,10 @@ zonelist_scan: !cpuset_zone_allowed_softwall(zone, gfp_mask)) goto try_next_zone; + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; - if (alloc_flags & ALLOC_WMARK_MIN) - mark = zone->pages_min; - else if (alloc_flags & ALLOC_WMARK_LOW) - mark = zone->pages_low; - else - mark = zone->pages_high; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags)) { if (!zone_reclaim_mode || @@ -1959,7 +1960,7 @@ static unsigned int nr_free_zone_pages(int offset) for_each_zone_zonelist(zone, z, zonelist, offset) { unsigned long size = zone->present_pages; - unsigned long high = zone->pages_high; + unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; } @@ -2096,9 +2097,9 @@ void show_free_areas(void) "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high), + K(min_wmark_pages(zone)), + K(low_wmark_pages(zone)), + K(high_wmark_pages(zone)), K(zone_page_state(zone, NR_ACTIVE_ANON)), K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), @@ -2702,8 +2703,8 @@ static inline unsigned 
long wait_table_bits(unsigned long size) /* * Mark a number of pageblocks as MIGRATE_RESERVE. The number - * of blocks reserved is based on zone->pages_min. The memory within the - * reserve will tend to store contiguous free pages. Setting min_free_kbytes + * of blocks reserved is based on min_wmark_pages(zone). The memory within + * the reserve will tend to store contiguous free pages. Setting min_free_kbytes * higher will lead to a bigger reserve which will get freed as contiguous * blocks as reclaim kicks in */ @@ -2716,7 +2717,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) /* Get the start pfn, end pfn and the number of blocks to reserve */ start_pfn = zone->zone_start_pfn; end_pfn = start_pfn + zone->spanned_pages; - reserve = roundup(zone->pages_min, pageblock_nr_pages) >> + reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { @@ -4319,8 +4320,8 @@ static void calculate_totalreserve_pages(void) max = zone->lowmem_reserve[j]; } - /* we treat pages_high as reserved pages. */ - max += zone->pages_high; + /* we treat the high watermark as reserved pages. */ + max += high_wmark_pages(zone); if (max > zone->present_pages) max = zone->present_pages; @@ -4400,7 +4401,7 @@ void setup_per_zone_pages_min(void) * need highmem pages, so cap pages_min to a small * value here. * - * The (pages_high-pages_low) and (pages_low-pages_min) + * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) * deltas controls asynch page reclaim, and so should * not be capped for highmem. */ @@ -4411,17 +4412,17 @@ void setup_per_zone_pages_min(void) min_pages = SWAP_CLUSTER_MAX; if (min_pages > 128) min_pages = 128; - zone->pages_min = min_pages; + zone->watermark[WMARK_MIN] = min_pages; } else { /* * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = tmp; + zone->watermark[WMARK_MIN] = tmp; } - zone->pages_low = zone->pages_min + (tmp >> 2); - zone->pages_high = zone->pages_min + (tmp >> 1); + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -4566,7 +4567,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * whenever sysctl_lowmem_reserve_ratio changes. * * The reserve ratio obviously has absolutely no relation with the - * pages_min watermarks. The lowmem reserve ratio can only make sense + * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, diff --git a/mm/vmscan.c b/mm/vmscan.c index a6b7d14812e6..e5245d051647 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1401,7 +1401,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. */ - if (unlikely(file + free <= zone->pages_high)) { + if (unlikely(file + free <= high_wmark_pages(zone))) { percent[0] = 100; percent[1] = 0; return; @@ -1533,11 +1533,13 @@ static void shrink_zone(int priority, struct zone *zone, * try to reclaim pages from zones which will satisfy the caller's allocation * request. * - * We reclaim from a zone even if that zone is over pages_high. Because: + * We reclaim from a zone even if that zone is over high_wmark_pages(zone). 
+ * Because: * a) The caller may be trying to free *extra* pages to satisfy a higher-order * allocation or - * b) The zones may be over pages_high but they must go *over* pages_high to - * satisfy the `incremental min' zone defense algorithm. + * b) The target zone may be at high_wmark_pages(zone) but the lower zones + * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' + * zone defense algorithm. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. @@ -1743,7 +1745,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, /* * For kswapd, balance_pgdat() will work across all this node's zones until - * they are all at pages_high. + * they are all at high_wmark_pages(zone). * * Returns the number of pages which were actually freed. * @@ -1756,11 +1758,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * the zone for when the problem goes away. * * kswapd scans the zones in the highmem->normal->dma direction. It skips - * zones which have free_pages > pages_high, but once a zone is found to have - * free_pages <= pages_high, we scan that zone and the lower zones regardless - * of the number of free pages in the lower zones. This interoperates with - * the page allocator fallback scheme to ensure that aging of pages is balanced - * across the zones. + * zones which have free_pages > high_wmark_pages(zone), but once a zone is + * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the + * lower zones regardless of the number of free pages in the lower zones. This + * interoperates with the page allocator fallback scheme to ensure that aging + * of pages is balanced across the zones. */ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) { @@ -1781,7 +1783,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) }; /* * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to free_pages == pages_high. + * this zone was successfully refilled to + * free_pages == high_wmark_pages(zone). */ int temp_priority[MAX_NR_ZONES]; @@ -1826,8 +1829,8 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, zone->pages_high, - 0, 0)) { + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), 0, 0)) { end_zone = i; break; } @@ -1861,8 +1864,8 @@ loop_again: priority != DEF_PRIORITY) continue; - if (!zone_watermark_ok(zone, order, zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), end_zone, 0)) all_zones_ok = 0; temp_priority[i] = priority; sc.nr_scanned = 0; @@ -1871,8 +1874,8 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. 
*/ - if (!zone_watermark_ok(zone, order, 8*zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, @@ -2038,7 +2041,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; diff --git a/mm/vmstat.c b/mm/vmstat.c index 74d66dba0cbe..415110772c73 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -714,9 +714,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), - zone->pages_min, - zone->pages_low, - zone->pages_high, + min_wmark_pages(zone), + low_wmark_pages(zone), + high_wmark_pages(zone), zone->pages_scanned, zone->lru[LRU_ACTIVE_ANON].nr_scan, zone->lru[LRU_INACTIVE_ANON].nr_scan, -- cgit v1.2.3 From 62bc62a873116805774ffd37d7f86aa4faa832b1 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 16 Jun 2009 15:32:15 -0700 Subject: page allocator: use a pre-calculated value instead of num_online_nodes() in fast paths num_online_nodes() is called in a number of places but most often by the page allocator when deciding whether the zonelist needs to be filtered based on cpusets or the zonelist cache. This is actually a heavy function and touches a number of cache lines. This patch stores the number of online nodes at boot time and updates the value when nodes get onlined and offlined. The value is then used in a number of important paths in place of num_online_nodes(). 
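The cost difference can be sketched with an illustrative helper (not from the patch): num_online_nodes() computes a population count over a MAX_NUMNODES-bit nodemask on every call, whereas nr_online_nodes is a plain integer that is recomputed only when a node is onlined or offlined, as the node_set_online()/node_set_offline() hunk below shows.

/*
 * Illustrative fast-path test, assumed caller: deciding whether the
 * zonelist cache is worth setting up used to scan the online
 * nodemask; with the cached count it is a single integer load.
 */
static inline bool zonelist_cache_worthwhile(void)
{
	return NUMA_BUILD && nr_online_nodes > 1;
}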
[rientjes@google.com: do not override definition of node_set_online() with macro] Signed-off-by: Christoph Lameter Signed-off-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 19 ++++++++++++++++--- mm/hugetlb.c | 4 ++-- mm/page_alloc.c | 10 ++++++---- mm/slub.c | 2 +- net/sunrpc/svc.c | 2 +- 5 files changed, 26 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 848025cd7087..829b94b156f2 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -408,6 +408,19 @@ static inline int num_node_state(enum node_states state) #define next_online_node(nid) next_node((nid), node_states[N_ONLINE]) extern int nr_node_ids; +extern int nr_online_nodes; + +static inline void node_set_online(int nid) +{ + node_set_state(nid, N_ONLINE); + nr_online_nodes = num_node_state(N_ONLINE); +} + +static inline void node_set_offline(int nid) +{ + node_clear_state(nid, N_ONLINE); + nr_online_nodes = num_node_state(N_ONLINE); +} #else static inline int node_state(int node, enum node_states state) @@ -434,7 +447,10 @@ static inline int num_node_state(enum node_states state) #define first_online_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define nr_node_ids 1 +#define nr_online_nodes 1 +#define node_set_online(node) node_set_state((node), N_ONLINE) +#define node_set_offline(node) node_clear_state((node), N_ONLINE) #endif #define node_online_map node_states[N_ONLINE] @@ -454,9 +470,6 @@ static inline int num_node_state(enum node_states state) #define node_online(node) node_state((node), N_ONLINE) #define node_possible(node) node_state((node), N_POSSIBLE) -#define node_set_online(node) node_set_state((node), N_ONLINE) -#define node_set_offline(node) node_clear_state((node), N_ONLINE) - #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2f8241f300f5..7b9b6015b2ec 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -875,7 +875,7 @@ static void return_unused_surplus_pages(struct hstate *h, * can no longer free unreserved surplus pages. This occurs when * the nodes with surplus pages have no free pages. 
*/ - unsigned long remaining_iterations = num_online_nodes(); + unsigned long remaining_iterations = nr_online_nodes; /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -904,7 +904,7 @@ static void return_unused_surplus_pages(struct hstate *h, h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; nr_pages--; - remaining_iterations = num_online_nodes(); + remaining_iterations = nr_online_nodes; } } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e60e41474332..0c9f406e3c44 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -161,7 +161,9 @@ static unsigned long __meminitdata dma_reserve; #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; +int nr_online_nodes __read_mostly = 1; EXPORT_SYMBOL(nr_node_ids); +EXPORT_SYMBOL(nr_online_nodes); #endif int page_group_by_mobility_disabled __read_mostly; @@ -1466,7 +1468,7 @@ this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); try_next_zone: - if (NUMA_BUILD && !did_zlc_setup && num_online_nodes() > 1) { + if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { /* * we do zlc_setup after the first zone is tried but only * if there are multiple nodes make it worthwhile @@ -2265,7 +2267,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, } -#define MAX_NODE_LOAD (num_online_nodes()) +#define MAX_NODE_LOAD (nr_online_nodes) static int node_load[MAX_NUMNODES]; /** @@ -2474,7 +2476,7 @@ static void build_zonelists(pg_data_t *pgdat) /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; - load = num_online_nodes(); + load = nr_online_nodes; prev_node = local_node; nodes_clear(used_mask); @@ -2625,7 +2627,7 @@ void build_all_zonelists(void) printk("Built %i zonelists in %s order, mobility grouping %s. " "Total pages: %ld\n", - num_online_nodes(), + nr_online_nodes, zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); diff --git a/mm/slub.c b/mm/slub.c index 30354bfeb43d..1c950775d6be 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3737,7 +3737,7 @@ static int list_locations(struct kmem_cache *s, char *buf, to_cpumask(l->cpus)); } - if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && + if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " nodes="); len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 8847add6ca16..5ed8931dfe98 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -124,7 +124,7 @@ svc_pool_map_choose_mode(void) { unsigned int node; - if (num_online_nodes() > 1) { + if (nr_online_nodes > 1) { /* * Actually have multiple NUMA nodes, * so split pools on NUMA node boundaries -- cgit v1.2.3 From 20a0307c0396c2edb651401d2f2db193dda2f3c9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:22 -0700 Subject: mm: introduce PageHuge() for testing huge/gigantic pages A series of patches to enhance the /proc/pagemap interface and to add a userspace executable which can be used to present the pagemap data. Export 10 more flags to end users (and more for kernel developers): 11. KPF_MMAP (pseudo flag) memory mapped page 12. KPF_ANON (pseudo flag) memory mapped page (anonymous) 13. KPF_SWAPCACHE page is in swap cache 14. KPF_SWAPBACKED page is swap/RAM backed 15. KPF_COMPOUND_HEAD (*) 16. KPF_COMPOUND_TAIL (*) 17. KPF_HUGE hugeTLB pages 18. KPF_UNEVICTABLE page is in the unevictable LRU list 19. KPF_HWPOISON hardware detected corruption 20. 
KPF_NOPAGE (pseudo flag) no page frame at the address (*) For compound pages, exporting _both_ head/tail info enables users to tell where a compound page starts/ends, and its order. a simple demo of the page-types tool # ./page-types -h page-types [options] -r|--raw Raw mode, for kernel developers -a|--addr addr-spec Walk a range of pages -b|--bits bits-spec Walk pages with specified bits -l|--list Show page details in ranges -L|--list-each Show page details one by one -N|--no-summary Don't show summay info -h|--help Show this usage message addr-spec: N one page at offset N (unit: pages) N+M pages range from N to N+M-1 N,M pages range from N to M-1 N, pages range from N to end ,M pages range from 0 to M bits-spec: bit1,bit2 (flags & (bit1|bit2)) != 0 bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1 bit1,~bit2 (flags & (bit1|bit2)) == bit1 =bit1,bit2 flags == (bit1|bit2) bit-names: locked error referenced uptodate dirty lru active slab writeback reclaim buddy mmap anonymous swapcache swapbacked compound_head compound_tail huge unevictable hwpoison nopage reserved(r) mlocked(r) mappedtodisk(r) private(r) private_2(r) owner_private(r) arch(r) uncached(r) readahead(o) slob_free(o) slub_frozen(o) slub_debug(o) (r) raw mode bits (o) overloaded bits # ./page-types flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000000 487369 1903 _________________________________ 0x0000000000000014 5 0 __R_D____________________________ referenced,dirty 0x0000000000000020 1 0 _____l___________________________ lru 0x0000000000000024 34 0 __R__l___________________________ referenced,lru 0x0000000000000028 3838 14 ___U_l___________________________ uptodate,lru 0x0001000000000028 48 0 ___U_l_______________________I___ uptodate,lru,readahead 0x000000000000002c 6478 25 __RU_l___________________________ referenced,uptodate,lru 0x000100000000002c 47 0 __RU_l_______________________I___ referenced,uptodate,lru,readahead 0x0000000000000040 8344 32 ______A__________________________ active 0x0000000000000060 1 0 _____lA__________________________ lru,active 0x0000000000000068 348 1 ___U_lA__________________________ uptodate,lru,active 0x0001000000000068 12 0 ___U_lA______________________I___ uptodate,lru,active,readahead 0x000000000000006c 988 3 __RU_lA__________________________ referenced,uptodate,lru,active 0x000100000000006c 48 0 __RU_lA______________________I___ referenced,uptodate,lru,active,readahead 0x0000000000004078 1 0 ___UDlA_______b__________________ uptodate,dirty,lru,active,swapbacked 0x000000000000407c 34 0 __RUDlA_______b__________________ referenced,uptodate,dirty,lru,active,swapbacked 0x0000000000000400 503 1 __________B______________________ buddy 0x0000000000000804 1 0 __R________M_____________________ referenced,mmap 0x0000000000000828 1029 4 ___U_l_____M_____________________ uptodate,lru,mmap 0x0001000000000828 43 0 ___U_l_____M_________________I___ uptodate,lru,mmap,readahead 0x000000000000082c 382 1 __RU_l_____M_____________________ referenced,uptodate,lru,mmap 0x000100000000082c 12 0 __RU_l_____M_________________I___ referenced,uptodate,lru,mmap,readahead 0x0000000000000868 192 0 ___U_lA____M_____________________ uptodate,lru,active,mmap 0x0001000000000868 12 0 ___U_lA____M_________________I___ uptodate,lru,active,mmap,readahead 0x000000000000086c 800 3 __RU_lA____M_____________________ referenced,uptodate,lru,active,mmap 0x000100000000086c 31 0 __RU_lA____M_________________I___ referenced,uptodate,lru,active,mmap,readahead 0x0000000000004878 2 0 ___UDlA____M__b__________________ 
uptodate,dirty,lru,active,mmap,swapbacked 0x0000000000001000 492 1 ____________a____________________ anonymous 0x0000000000005808 4 0 ___U_______Ma_b__________________ uptodate,mmap,anonymous,swapbacked 0x0000000000005868 2839 11 ___U_lA____Ma_b__________________ uptodate,lru,active,mmap,anonymous,swapbacked 0x000000000000586c 30 0 __RU_lA____Ma_b__________________ referenced,uptodate,lru,active,mmap,anonymous,swapbacked total 513968 2007 # ./page-types -r flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000000 468002 1828 _________________________________ 0x0000000100000000 19102 74 _____________________r___________ reserved 0x0000000000008000 41 0 _______________H_________________ compound_head 0x0000000000010000 188 0 ________________T________________ compound_tail 0x0000000000008014 1 0 __R_D__________H_________________ referenced,dirty,compound_head 0x0000000000010014 4 0 __R_D___________T________________ referenced,dirty,compound_tail 0x0000000000000020 1 0 _____l___________________________ lru 0x0000000800000024 34 0 __R__l__________________P________ referenced,lru,private 0x0000000000000028 3794 14 ___U_l___________________________ uptodate,lru 0x0001000000000028 46 0 ___U_l_______________________I___ uptodate,lru,readahead 0x0000000400000028 44 0 ___U_l_________________d_________ uptodate,lru,mappedtodisk 0x0001000400000028 2 0 ___U_l_________________d_____I___ uptodate,lru,mappedtodisk,readahead 0x000000000000002c 6434 25 __RU_l___________________________ referenced,uptodate,lru 0x000100000000002c 47 0 __RU_l_______________________I___ referenced,uptodate,lru,readahead 0x000000040000002c 14 0 __RU_l_________________d_________ referenced,uptodate,lru,mappedtodisk 0x000000080000002c 30 0 __RU_l__________________P________ referenced,uptodate,lru,private 0x0000000800000040 8124 31 ______A_________________P________ active,private 0x0000000000000040 219 0 ______A__________________________ active 0x0000000800000060 1 0 _____lA_________________P________ lru,active,private 0x0000000000000068 322 1 ___U_lA__________________________ uptodate,lru,active 0x0001000000000068 12 0 ___U_lA______________________I___ uptodate,lru,active,readahead 0x0000000400000068 13 0 ___U_lA________________d_________ uptodate,lru,active,mappedtodisk 0x0000000800000068 12 0 ___U_lA_________________P________ uptodate,lru,active,private 0x000000000000006c 977 3 __RU_lA__________________________ referenced,uptodate,lru,active 0x000100000000006c 48 0 __RU_lA______________________I___ referenced,uptodate,lru,active,readahead 0x000000040000006c 5 0 __RU_lA________________d_________ referenced,uptodate,lru,active,mappedtodisk 0x000000080000006c 3 0 __RU_lA_________________P________ referenced,uptodate,lru,active,private 0x0000000c0000006c 3 0 __RU_lA________________dP________ referenced,uptodate,lru,active,mappedtodisk,private 0x0000000c00000068 1 0 ___U_lA________________dP________ uptodate,lru,active,mappedtodisk,private 0x0000000000004078 1 0 ___UDlA_______b__________________ uptodate,dirty,lru,active,swapbacked 0x000000000000407c 34 0 __RUDlA_______b__________________ referenced,uptodate,dirty,lru,active,swapbacked 0x0000000000000400 538 2 __________B______________________ buddy 0x0000000000000804 1 0 __R________M_____________________ referenced,mmap 0x0000000000000828 1029 4 ___U_l_____M_____________________ uptodate,lru,mmap 0x0001000000000828 43 0 ___U_l_____M_________________I___ uptodate,lru,mmap,readahead 0x000000000000082c 382 1 __RU_l_____M_____________________ referenced,uptodate,lru,mmap 
0x000100000000082c 12 0 __RU_l_____M_________________I___ referenced,uptodate,lru,mmap,readahead 0x0000000000000868 192 0 ___U_lA____M_____________________ uptodate,lru,active,mmap 0x0001000000000868 12 0 ___U_lA____M_________________I___ uptodate,lru,active,mmap,readahead 0x000000000000086c 800 3 __RU_lA____M_____________________ referenced,uptodate,lru,active,mmap 0x000100000000086c 31 0 __RU_lA____M_________________I___ referenced,uptodate,lru,active,mmap,readahead 0x0000000000004878 2 0 ___UDlA____M__b__________________ uptodate,dirty,lru,active,mmap,swapbacked 0x0000000000001000 492 1 ____________a____________________ anonymous 0x0000000000005008 2 0 ___U________a_b__________________ uptodate,anonymous,swapbacked 0x0000000000005808 4 0 ___U_______Ma_b__________________ uptodate,mmap,anonymous,swapbacked 0x000000000000580c 1 0 __RU_______Ma_b__________________ referenced,uptodate,mmap,anonymous,swapbacked 0x0000000000005868 2839 11 ___U_lA____Ma_b__________________ uptodate,lru,active,mmap,anonymous,swapbacked 0x000000000000586c 29 0 __RU_lA____Ma_b__________________ referenced,uptodate,lru,active,mmap,anonymous,swapbacked total 513968 2007 # ./page-types --raw --list --no-summary --bits reserved offset count flags 0 15 _____________________r___________ 31 4 _____________________r___________ 159 97 _____________________r___________ 4096 2067 _____________________r___________ 6752 2390 _____________________r___________ 9355 3 _____________________r___________ 9728 14526 _____________________r___________ This patch: Introduce PageHuge(), which identifies huge/gigantic pages by their dedicated compound destructor functions. Also move prep_compound_gigantic_page() to hugetlb.c and make __free_pages_ok() non-static. Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Cc: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 1 + include/linux/hugetlb.h | 7 ++++ mm/hugetlb.c | 98 +++++++++++++++++++++++++++++++------------------ mm/internal.h | 5 +-- mm/page_alloc.c | 17 --------- 5 files changed, 73 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/fs/proc/page.c b/fs/proc/page.c index e9983837d08d..38dd88b7ce84 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "internal.h" diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 03be7f29ca01..a05a5ef33391 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -11,6 +11,8 @@ struct ctl_table; +int PageHuge(struct page *page); + static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { return vma->vm_flags & VM_HUGETLB; @@ -61,6 +63,11 @@ void hugetlb_change_protection(struct vm_area_struct *vma, #else /* !CONFIG_HUGETLB_PAGE */ +static inline int PageHuge(struct page *page) +{ + return 0; +} + static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7b9b6015b2ec..a56e6f3ce979 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -578,41 +578,6 @@ static void free_huge_page(struct page *page) hugetlb_put_quota(mapping, 1); } -/* - * Increment or decrement surplus_huge_pages. Keep node-specific counters - * balanced by operating on them in a round-robin fashion. - * Returns 1 if an adjustment was made. 
- */ -static int adjust_pool_surplus(struct hstate *h, int delta) -{ - static int prev_nid; - int nid = prev_nid; - int ret = 0; - - VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) - continue; - - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (nid != prev_nid); - - prev_nid = nid; - return ret; -} - static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { set_compound_page_dtor(page, free_huge_page); @@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } +static void prep_compound_gigantic_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + /* we rely on prep_new_huge_page to set the destructor */ + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __SetPageTail(p); + p->first_page = page; + } +} + +int PageHuge(struct page *page) +{ + compound_page_dtor *dtor; + + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + dtor = get_compound_page_dtor(page); + + return dtor == free_huge_page; +} + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) } #endif +/* + * Increment or decrement surplus_huge_pages. Keep node-specific counters + * balanced by operating on them in a round-robin fashion. + * Returns 1 if an adjustment was made. 
+ */ +static int adjust_pool_surplus(struct hstate *h, int delta) +{ + static int prev_nid; + int nid = prev_nid; + int ret = 0; + + VM_BUG_ON(delta != -1 && delta != 1); + do { + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); + + /* To shrink on this node, there must be a surplus page */ + if (delta < 0 && !h->surplus_huge_pages_node[nid]) + continue; + /* Surplus cannot exceed the total number of pages */ + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) + continue; + + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; + ret = 1; + break; + } while (nid != prev_nid); + + prev_nid = nid; + return ret; +} + #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { diff --git a/mm/internal.h b/mm/internal.h index 4b1672a8cf76..b4ac332e8072 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -16,9 +16,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -extern void prep_compound_page(struct page *page, unsigned long order); -extern void prep_compound_gigantic_page(struct page *page, unsigned long order); - static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); @@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page); */ extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); +extern void prep_compound_page(struct page *page, unsigned long order); + /* * function for dealing with page's order in buddy system. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8ca06d87dc1f..131655cdb6b2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -300,23 +300,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -#ifdef CONFIG_HUGETLBFS -void prep_compound_gigantic_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - struct page *p = page + 1; - - set_compound_page_dtor(page, free_compound_page); - set_compound_order(page, order); - __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __SetPageTail(p); - p->first_page = page; - } -} -#endif - static int destroy_compound_page(struct page *page, unsigned long order) { int i; -- cgit v1.2.3 From 56e49d218890f49b0057710a4b6fef31f5ffbfec Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 16 Jun 2009 15:32:28 -0700 Subject: vmscan: evict use-once pages first When the file LRU lists are dominated by streaming IO pages, evict those pages first, before considering evicting other pages. This should be safe from deadlocks or performance problems because only three things can happen to an inactive file page: 1) referenced twice and promoted to the active list 2) evicted by the pageout code 3) under IO, after which it will get evicted or promoted The pages freed in this way can either be reused for streaming IO, or allocated for something else. If the pages are used for streaming IO, this pageout pattern continues. Otherwise, we will fall back to the normal pageout pattern. 
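The heuristic itself reduces to one comparison per zone. A simplified sketch of the global-LRU case (condensed from the patch):

/*
 * Simplified sketch: while the active file list outweighs the
 * inactive one, active file pages keep getting deactivated; once the
 * streaming (use-once) pages make the inactive list the majority,
 * this returns 0 and the active list (the working set) is left
 * alone.
 */
static int inactive_file_is_low_sketch(struct zone *zone)
{
	unsigned long active = zone_page_state(zone, NR_ACTIVE_FILE);
	unsigned long inactive = zone_page_state(zone, NR_INACTIVE_FILE);

	return active > inactive;
}

For example, with 300MB of working set on the active file list and 600MB of streaming pages on the inactive list, the function returns 0, so reclaim takes the use-once pages and the working set survives.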
Signed-off-by: Rik van Riel Reported-by: Elladan Cc: KOSAKI Motohiro Cc: Peter Zijlstra Cc: Lee Schermerhorn Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 +++++++ mm/memcontrol.c | 11 +++++++++++ mm/vmscan.c | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 55 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 25b9ca93d232..45add35dda1b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -94,6 +94,7 @@ extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority); int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg); +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg); unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru); @@ -239,6 +240,12 @@ mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) return 1; } +static inline int +mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +{ + return 1; +} + static inline unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 78eb8552818b..70db6e0a5eec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -570,6 +570,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) return 0; } +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +{ + unsigned long active; + unsigned long inactive; + + inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); + active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); + + return (active > inactive); +} + unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) diff --git a/mm/vmscan.c b/mm/vmscan.c index e5245d051647..9673437a5457 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1351,12 +1351,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) return low; } +static int inactive_file_is_low_global(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_FILE); + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + + return (active > inactive); +} + +/** + * inactive_file_is_low - check if file pages need to be deactivated + * @zone: zone to check + * @sc: scan control of this context + * + * When the system is doing streaming IO, memory pressure here + * ensures that active file pages get deactivated, until more + * than half of the file pages are on the inactive list. + * + * Once we get to that situation, protect the system's working + * set from being evicted by disabling active file page aging. + * + * This uses a different ratio than the anonymous pages, because + * the page cache uses a use-once replacement algorithm. 
+ */ +static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +{ + int low; + + if (scanning_global_lru(sc)) + low = inactive_file_is_low_global(zone); + else + low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); + return low; +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE) { + if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } -- cgit v1.2.3 From 6e08a369ee10b361ac1cdcdf4fabd420fd08beb3 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:29 -0700 Subject: vmscan: cleanup the scan batching code The vmscan batching logic is twisting. Move it into a standalone function nr_scan_try_batch() and document it. No behavior change. Signed-off-by: Wu Fengguang Acked-by: Rik van Riel Cc: Nick Piggin Cc: Christoph Lameter Acked-by: Peter Zijlstra Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- mm/page_alloc.c | 2 +- mm/vmscan.c | 39 ++++++++++++++++++++++++++++----------- mm/vmstat.c | 8 ++++---- 4 files changed, 35 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dd8487f0442f..db976b9f8791 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -334,9 +334,9 @@ struct zone { /* Fields commonly accessed by the page reclaim scanner */ spinlock_t lru_lock; - struct { + struct zone_lru { struct list_head list; - unsigned long nr_scan; + unsigned long nr_saved_scan; /* accumulated for batching */ } lru[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 131655cdb6b2..e5b8f628d166 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3657,7 +3657,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; diff --git a/mm/vmscan.c b/mm/vmscan.c index 9673437a5457..d4da097533ce 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1492,6 +1492,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, percent[1] = 100 - percent[0]; } +/* + * Smallish @nr_to_scan's are deposited in @nr_saved_scan, + * until we collected @swap_cluster_max pages to scan. + */ +static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, + unsigned long *nr_saved_scan, + unsigned long swap_cluster_max) +{ + unsigned long nr; + + *nr_saved_scan += nr_to_scan; + nr = *nr_saved_scan; + + if (nr >= swap_cluster_max) + *nr_saved_scan = 0; + else + nr = 0; + + return nr; +} /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 
@@ -1517,14 +1537,11 @@ static void shrink_zone(int priority, struct zone *zone, scan >>= priority; scan = (scan * percent[file]) / 100; } - if (scanning_global_lru(sc)) { - zone->lru[l].nr_scan += scan; - nr[l] = zone->lru[l].nr_scan; - if (nr[l] >= swap_cluster_max) - zone->lru[l].nr_scan = 0; - else - nr[l] = 0; - } else + if (scanning_global_lru(sc)) + nr[l] = nr_scan_try_batch(scan, + &zone->lru[l].nr_saved_scan, + swap_cluster_max); + else nr[l] = scan; } @@ -2124,11 +2141,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, l == LRU_ACTIVE_FILE)) continue; - zone->lru[l].nr_scan += (lru_pages >> prio) + 1; - if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { + zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; + if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { unsigned long nr_to_scan; - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; nr_to_scan = min(nr_pages, lru_pages); nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); diff --git a/mm/vmstat.c b/mm/vmstat.c index 415110772c73..84c055556911 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -718,10 +718,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, low_wmark_pages(zone), high_wmark_pages(zone), zone->pages_scanned, - zone->lru[LRU_ACTIVE_ANON].nr_scan, - zone->lru[LRU_INACTIVE_ANON].nr_scan, - zone->lru[LRU_ACTIVE_FILE].nr_scan, - zone->lru[LRU_INACTIVE_FILE].nr_scan, + zone->lru[LRU_ACTIVE_ANON].nr_saved_scan, + zone->lru[LRU_INACTIVE_ANON].nr_saved_scan, + zone->lru[LRU_ACTIVE_FILE].nr_saved_scan, + zone->lru[LRU_INACTIVE_FILE].nr_saved_scan, zone->spanned_pages, zone->present_pages); -- cgit v1.2.3 From 3b6748e2dd69906af3835db4dc9d1c8a3ee4c68c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 16 Jun 2009 15:32:35 -0700 Subject: mm: introduce follow_pfn() Analoguous to follow_phys(), add a helper that looks up the PFN at a user virtual address in an IO mapping or a raw PFN mapping. Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Acked-by: Magnus Damm Cc: Hans Verkuil Cc: Paul Mundt Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/memory.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b548e7cfbd9..c80dbd4a62c6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -792,6 +792,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 24ff20480bb7..d5d1653d60a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3140,6 +3140,35 @@ out: return -EINVAL; } +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. 
+ */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + int ret = -EINVAL; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return ret; + + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + if (ret) + return ret; + *pfn = pte_pfn(*ptep); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(follow_pfn); + #ifdef CONFIG_HAVE_IOREMAP_PROT int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, -- cgit v1.2.3 From 7f33d49a2ed546e01f7b1d0607661810f2421859 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 16 Jun 2009 15:32:41 -0700 Subject: mm, PM/Freezer: Disable OOM killer when tasks are frozen Currently, the following scenario appears to be possible in theory: * Tasks are frozen for hibernation or suspend. * Free pages are almost exhausted. * Certain piece of code in the suspend code path attempts to allocate some memory using GFP_KERNEL and allocation order less than or equal to PAGE_ALLOC_COSTLY_ORDER. * __alloc_pages_internal() cannot find a free page so it invokes the OOM killer. * The OOM killer attempts to kill a task, but the task is frozen, so it doesn't die immediately. * __alloc_pages_internal() jumps to 'restart', unsuccessfully tries to find a free page and invokes the OOM killer. * No progress can be made. Although it is now hard to trigger during hibernation due to the memory shrinking carried out by the hibernation code, it is theoretically possible to trigger during suspend after the memory shrinking has been removed from that code path. Moreover, since memory allocations are going to be used for the hibernation memory shrinking, it will be even more likely to happen during hibernation. To prevent it from happening, introduce the oom_killer_disabled switch that will cause __alloc_pages_internal() to fail in the situations in which the OOM killer would have been called and make the freezer set this switch after tasks have been successfully frozen. [akpm@linux-foundation.org: be nicer to the namespace] Signed-off-by: Rafael J. Wysocki Cc: Fengguang Wu Cc: David Rientjes Acked-by: Pavel Machek Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 ++++++++++++ kernel/power/process.c | 5 +++++ mm/page_alloc.c | 4 ++++ 3 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4efa33088a82..06b7e8cc80ac 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -243,4 +243,16 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(void); void drain_local_pages(void *dummy); +extern bool oom_killer_disabled; + +static inline void oom_killer_disable(void) +{ + oom_killer_disabled = true; +} + +static inline void oom_killer_enable(void) +{ + oom_killer_disabled = false; +} + #endif /* __LINUX_GFP_H */ diff --git a/kernel/power/process.c b/kernel/power/process.c index ca634019497a..da2072d73811 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -117,9 +117,12 @@ int freeze_processes(void) if (error) goto Exit; printk("done."); + + oom_killer_disable(); Exit: BUG_ON(in_atomic()); printk("\n"); + return error; } @@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only) void thaw_processes(void) { + oom_killer_enable(); + printk("Restarting tasks ... 
"); thaw_tasks(true); thaw_tasks(false); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 61290ea721c8..5b09488d0f55 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -178,6 +178,8 @@ static void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } +bool oom_killer_disabled __read_mostly; + #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -1769,6 +1771,8 @@ rebalance: */ if (!did_some_progress) { if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (oom_killer_disabled) + goto nopage; page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, -- cgit v1.2.3 From 31c911329e048b715a1dfeaaf617be9430fd7f4e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 16 Jun 2009 15:32:45 -0700 Subject: mm: check the argument of kunmap on architectures without highmem If you're using a non-highmem architecture, passing an argument with the wrong type to kunmap() doesn't give you a warning because the ifdef doesn't check the type. Using a static inline function solves the problem nicely. Reported-by: David Woodhouse Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 1fcb7126a01f..211ff4497269 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -55,7 +55,9 @@ static inline void *kmap(struct page *page) return page_address(page); } -#define kunmap(page) do { (void) (page); } while (0) +static inline void kunmap(struct page *page) +{ +} static inline void *kmap_atomic(struct page *page, enum km_type idx) { -- cgit v1.2.3 From b70d94ee438b3fd9c15c7691d7a932a135c18101 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 16 Jun 2009 15:32:46 -0700 Subject: page-allocator: use integer fields lookup for gfp_zone and check for errors in flags passed to the page allocator This simplifies the code in gfp_zone() and also keeps the ability of the compiler to use constant folding to get rid of gfp_zone processing. The lookup of the zone is done using a bitfield stored in an integer. So the code in gfp_zone is a simple extraction of bits from a constant bitfield. The compiler is generating a load of a constant into a register and then performs a shift and mask operation to get the zone from a gfp_t. No cachelines are touched and no branches have to be predicted by the compiler. We are doing some macro tricks here to convince the compiler to always do the constant folding if possible. 
Signed-off-by: Christoph Lameter Cc: KAMEZAWA Hiroyuki Reviewed-by: Mel Gorman Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 111 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 96 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 06b7e8cc80ac..412178afd423 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -21,7 +21,8 @@ struct vm_area_struct; #define __GFP_DMA ((__force gfp_t)0x01u) #define __GFP_HIGHMEM ((__force gfp_t)0x02u) #define __GFP_DMA32 ((__force gfp_t)0x04u) - +#define __GFP_MOVABLE ((__force gfp_t)0x08u) /* Page is movable */ +#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) /* * Action modifiers - doesn't change the zoning * @@ -51,7 +52,6 @@ struct vm_area_struct; #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ -#define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ #define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) @@ -116,24 +116,105 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags) ((gfp_flags & __GFP_RECLAIMABLE) != 0); } -static inline enum zone_type gfp_zone(gfp_t flags) -{ +#ifdef CONFIG_HIGHMEM +#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM +#else +#define OPT_ZONE_HIGHMEM ZONE_NORMAL +#endif + #ifdef CONFIG_ZONE_DMA - if (flags & __GFP_DMA) - return ZONE_DMA; +#define OPT_ZONE_DMA ZONE_DMA +#else +#define OPT_ZONE_DMA ZONE_NORMAL #endif + #ifdef CONFIG_ZONE_DMA32 - if (flags & __GFP_DMA32) - return ZONE_DMA32; +#define OPT_ZONE_DMA32 ZONE_DMA32 +#else +#define OPT_ZONE_DMA32 ZONE_NORMAL #endif - if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) == - (__GFP_HIGHMEM | __GFP_MOVABLE)) - return ZONE_MOVABLE; -#ifdef CONFIG_HIGHMEM - if (flags & __GFP_HIGHMEM) - return ZONE_HIGHMEM; + +/* + * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the + * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long + * and there are 16 of them to cover all possible combinations of + * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM + * + * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. + * But GFP_MOVABLE is not only a zone specifier but also an allocation + * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. + * Only 1bit of the lowest 3 bit (DMA,DMA32,HIGHMEM) can be set to "1". + * + * bit result + * ================= + * 0x0 => NORMAL + * 0x1 => DMA or NORMAL + * 0x2 => HIGHMEM or NORMAL + * 0x3 => BAD (DMA+HIGHMEM) + * 0x4 => DMA32 or DMA or NORMAL + * 0x5 => BAD (DMA+DMA32) + * 0x6 => BAD (HIGHMEM+DMA32) + * 0x7 => BAD (HIGHMEM+DMA32+DMA) + * 0x8 => NORMAL (MOVABLE+0) + * 0x9 => DMA or NORMAL (MOVABLE+DMA) + * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) + * 0xb => BAD (MOVABLE+HIGHMEM+DMA) + * 0xc => DMA32 (MOVABLE+HIGHMEM+DMA32) + * 0xd => BAD (MOVABLE+DMA32+DMA) + * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) + * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) + * + * ZONES_SHIFT must be <= 2 on 32 bit platforms. 
+ */ + +#if 16 * ZONES_SHIFT > BITS_PER_LONG +#error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer +#endif + +#define GFP_ZONE_TABLE ( \ + (ZONE_NORMAL << 0 * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << __GFP_DMA * ZONES_SHIFT) \ + | (OPT_ZONE_HIGHMEM << __GFP_HIGHMEM * ZONES_SHIFT) \ + | (OPT_ZONE_DMA32 << __GFP_DMA32 * ZONES_SHIFT) \ + | (ZONE_NORMAL << __GFP_MOVABLE * ZONES_SHIFT) \ + | (OPT_ZONE_DMA << (__GFP_MOVABLE | __GFP_DMA) * ZONES_SHIFT) \ + | (ZONE_MOVABLE << (__GFP_MOVABLE | __GFP_HIGHMEM) * ZONES_SHIFT)\ + | (OPT_ZONE_DMA32 << (__GFP_MOVABLE | __GFP_DMA32) * ZONES_SHIFT)\ +) + +/* + * GFP_ZONE_BAD is a bitmap for all combination of __GFP_DMA, __GFP_DMA32 + * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per + * entry starting with bit 0. Bit is set if the combination is not + * allowed. + */ +#define GFP_ZONE_BAD ( \ + 1 << (__GFP_DMA | __GFP_HIGHMEM) \ + | 1 << (__GFP_DMA | __GFP_DMA32) \ + | 1 << (__GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_MOVABLE | __GFP_HIGHMEM | __GFP_DMA) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_HIGHMEM) \ + | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA | __GFP_HIGHMEM)\ +) + +static inline enum zone_type gfp_zone(gfp_t flags) +{ + enum zone_type z; + int bit = flags & GFP_ZONEMASK; + + z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) & + ((1 << ZONES_SHIFT) - 1); + + if (__builtin_constant_p(bit)) + BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + else { +#ifdef CONFIG_DEBUG_VM + BUG_ON((GFP_ZONE_BAD >> bit) & 1); #endif - return ZONE_NORMAL; + } + return z; } /* -- cgit v1.2.3 From bc75d33f0fc1d56e734db1f56d3cfc8097b8e0cf Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:48 -0700 Subject: page-allocator: clean up functions related to pages_min Change the names of two functions. It doesn't affect behavior. Presently, setup_per_zone_pages_min() changes low, high of zone as well as min. So a better name is setup_per_zone_wmarks(). That's because Mel changed zone->pages_[hig/low/min] to zone->watermark array in "page allocator: replace the watermark-related union in struct zone with a watermark[] array". * setup_per_zone_pages_min => setup_per_zone_wmarks Of course, we have to change init_per_zone_pages_min, too. There are not pages_min any more. 
* init_per_zone_pages_min => init_per_zone_wmark_min [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Minchan Kim Acked-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 17 +++++++++-------- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c80dbd4a62c6..6e11cda1ba02 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1052,7 +1052,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn); extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); -extern void setup_per_zone_pages_min(void); +extern void setup_per_zone_wmarks(void); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c083cf5fd6df..037291e15b27 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -422,7 +422,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); if (onlined_pages) { kswapd_run(zone_to_nid(zone)); node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b09488d0f55..8629c84fd9ba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4396,12 +4396,13 @@ static void setup_per_zone_lowmem_reserve(void) } /** - * setup_per_zone_pages_min - called when min_free_kbytes changes. + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-added * - * Ensures that the pages_{min,low,high} values for each zone are set correctly - * with respect to min_free_kbytes. + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. */ -void setup_per_zone_pages_min(void) +void setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -4519,7 +4520,7 @@ static void __init setup_per_zone_inactive_ratio(void) * 8192MB: 11584k * 16384MB: 16384k */ -static int __init init_per_zone_pages_min(void) +static int __init init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; @@ -4530,12 +4531,12 @@ static int __init init_per_zone_pages_min(void) min_free_kbytes = 128; if (min_free_kbytes > 65536) min_free_kbytes = 65536; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); setup_per_zone_lowmem_reserve(); setup_per_zone_inactive_ratio(); return 0; } -module_init(init_per_zone_pages_min) +module_init(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so @@ -4547,7 +4548,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, { proc_dointvec(table, write, file, buffer, length, ppos); if (write) - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); return 0; } -- cgit v1.2.3 From 96cb4df5ddf5e6d5785b5acd4003e3689b87f896 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:49 -0700 Subject: page-allocator: add inactive ratio calculation function of each zone Factor the per-zone arithmetic inside setup_per_zone_inactive_ratio()'s loop into a separate function, calculate_zone_inactive_ratio().
This function will be used in a later patch [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/page_alloc.c | 28 ++++++++++++++++------------ 2 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6e11cda1ba02..f0473a88ca2e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1053,6 +1053,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); extern void setup_per_zone_wmarks(void); +extern void calculate_zone_inactive_ratio(struct zone *zone); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8629c84fd9ba..303607f1d323 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4478,22 +4478,26 @@ void setup_per_zone_wmarks(void) * 1TB 101 10GB * 10TB 320 32GB */ -static void __init setup_per_zone_inactive_ratio(void) +void calculate_zone_inactive_ratio(struct zone *zone) { - struct zone *zone; + unsigned int gb, ratio; - for_each_zone(zone) { - unsigned int gb, ratio; + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + if (gb) + ratio = int_sqrt(10 * gb); + else + ratio = 1; - /* Zone size in gigabytes */ - gb = zone->present_pages >> (30 - PAGE_SHIFT); - if (gb) - ratio = int_sqrt(10 * gb); - else - ratio = 1; + zone->inactive_ratio = ratio; +} - zone->inactive_ratio = ratio; - } +static void __init setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) + calculate_zone_inactive_ratio(zone); } /* -- cgit v1.2.3 From 6837765963f1723e80ca97b1fae660f3a60d77df Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Jun 2009 15:32:51 -0700 Subject: mm: remove CONFIG_UNEVICTABLE_LRU config option Currently, nobody wants to turn UNEVICTABLE_LRU off. Thus this configurability is unnecessary. 
Signed-off-by: KOSAKI Motohiro Cc: Johannes Weiner Cc: Andi Kleen Acked-by: Minchan Kim Cc: David Woodhouse Cc: Matt Mackall Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 4 ---- fs/proc/meminfo.c | 4 ---- fs/proc/page.c | 2 -- include/linux/mmzone.h | 13 ------------- include/linux/page-flags.h | 16 +--------------- include/linux/pagemap.h | 12 ------------ include/linux/rmap.h | 7 ------- include/linux/swap.h | 19 ------------------- include/linux/vmstat.h | 2 -- kernel/sysctl.c | 2 -- mm/Kconfig | 14 +------------- mm/internal.h | 6 ------ mm/mlock.c | 22 ---------------------- mm/page_alloc.c | 9 --------- mm/rmap.c | 3 +-- mm/vmscan.c | 17 ----------------- mm/vmstat.c | 4 ---- 17 files changed, 3 insertions(+), 153 deletions(-) (limited to 'include') diff --git a/drivers/base/node.c b/drivers/base/node.c index 40b809742a1c..91d4087b4039 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -72,10 +72,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, "Node %d Inactive(anon): %8lu kB\n" "Node %d Active(file): %8lu kB\n" "Node %d Inactive(file): %8lu kB\n" -#ifdef CONFIG_UNEVICTABLE_LRU "Node %d Unevictable: %8lu kB\n" "Node %d Mlocked: %8lu kB\n" -#endif #ifdef CONFIG_HIGHMEM "Node %d HighTotal: %8lu kB\n" "Node %d HighFree: %8lu kB\n" @@ -105,10 +103,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, nid, K(node_page_state(nid, NR_INACTIVE_ANON)), nid, K(node_page_state(nid, NR_ACTIVE_FILE)), nid, K(node_page_state(nid, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU nid, K(node_page_state(nid, NR_UNEVICTABLE)), nid, K(node_page_state(nid, NR_MLOCK)), -#endif #ifdef CONFIG_HIGHMEM nid, K(i.totalhigh), nid, K(i.freehigh), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c6b0302af4c4..d5c410d47fae 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -64,10 +64,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "Inactive(anon): %8lu kB\n" "Active(file): %8lu kB\n" "Inactive(file): %8lu kB\n" -#ifdef CONFIG_UNEVICTABLE_LRU "Unevictable: %8lu kB\n" "Mlocked: %8lu kB\n" -#endif #ifdef CONFIG_HIGHMEM "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" @@ -109,10 +107,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(pages[LRU_INACTIVE_ANON]), K(pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_FILE]), -#ifdef CONFIG_UNEVICTABLE_LRU K(pages[LRU_UNEVICTABLE]), K(global_page_state(NR_MLOCK)), -#endif #ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh), diff --git a/fs/proc/page.c b/fs/proc/page.c index 9d926bd279a4..2707c6c7a20f 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -172,10 +172,8 @@ static u64 get_uflags(struct page *page) u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); -#ifdef CONFIG_UNEVICTABLE_LRU u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); -#endif #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index db976b9f8791..889598537370 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -83,13 +83,8 @@ enum zone_stat_item { NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ -#ifdef CONFIG_UNEVICTABLE_LRU NR_UNEVICTABLE, /* " " " " " */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ -#else - NR_UNEVICTABLE = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */ - 
NR_MLOCK = NR_ACTIVE_FILE, -#endif NR_ANON_PAGES, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ @@ -132,11 +127,7 @@ enum lru_list { LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, -#ifdef CONFIG_UNEVICTABLE_LRU LRU_UNEVICTABLE, -#else - LRU_UNEVICTABLE = LRU_ACTIVE_FILE, /* avoid compiler errors in dead code */ -#endif NR_LRU_LISTS }; @@ -156,11 +147,7 @@ static inline int is_active_lru(enum lru_list l) static inline int is_unevictable_lru(enum lru_list l) { -#ifdef CONFIG_UNEVICTABLE_LRU return (l == LRU_UNEVICTABLE); -#else - return 0; -#endif } enum zone_watermarks { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 62214c7d2d93..d6792f88a176 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -95,9 +95,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_buddy, /* Page is free, on buddy lists */ PG_swapbacked, /* Page is backed by RAM/swap */ -#ifdef CONFIG_UNEVICTABLE_LRU PG_unevictable, /* Page is "unevictable" */ -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif @@ -248,14 +246,8 @@ PAGEFLAG_FALSE(SwapCache) SETPAGEFLAG_NOOP(SwapCache) CLEARPAGEFLAG_NOOP(SwapCache) #endif -#ifdef CONFIG_UNEVICTABLE_LRU PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) TESTCLEARFLAG(Unevictable, unevictable) -#else -PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) - SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) - __CLEARPAGEFLAG_NOOP(Unevictable) -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define MLOCK_PAGES 1 @@ -382,12 +374,6 @@ static inline void __ClearPageTail(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ -#ifdef CONFIG_UNEVICTABLE_LRU -#define __PG_UNEVICTABLE (1 << PG_unevictable) -#else -#define __PG_UNEVICTABLE 0 -#endif - #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define __PG_MLOCKED (1 << PG_mlocked) #else @@ -403,7 +389,7 @@ static inline void __ClearPageTail(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - __PG_UNEVICTABLE | __PG_MLOCKED) + 1 << PG_unevictable | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 34da5230faab..aec3252afcf5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -22,9 +22,7 @@ enum mapping_flags { AS_EIO = __GFP_BITS_SHIFT + 0, /* IO error on async write */ AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ -#ifdef CONFIG_UNEVICTABLE_LRU AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ -#endif }; static inline void mapping_set_error(struct address_space *mapping, int error) @@ -37,8 +35,6 @@ static inline void mapping_set_error(struct address_space *mapping, int error) } } -#ifdef CONFIG_UNEVICTABLE_LRU - static inline void mapping_set_unevictable(struct address_space *mapping) { set_bit(AS_UNEVICTABLE, &mapping->flags); @@ -55,14 +51,6 @@ static inline int mapping_unevictable(struct address_space *mapping) return test_bit(AS_UNEVICTABLE, &mapping->flags); return !!mapping; } -#else -static inline void mapping_set_unevictable(struct address_space *mapping) { } -static inline void mapping_clear_unevictable(struct address_space *mapping) { } -static inline int mapping_unevictable(struct address_space *mapping) -{ - return 0; -} -#endif static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b35bc0e19cd9..619379a1dd98 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -105,18 +105,11 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int page_mkclean(struct page *); -#ifdef CONFIG_UNEVICTABLE_LRU /* * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ int try_to_munlock(struct page *); -#else -static inline int try_to_munlock(struct page *page) -{ - return 0; /* a.k.a. 
SWAP_SUCCESS */ -} -#endif #else /* !CONFIG_MMU */ diff --git a/include/linux/swap.h b/include/linux/swap.h index d476aad3ff57..f30c06908f09 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -235,7 +235,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU extern int page_evictable(struct page *page, struct vm_area_struct *vma); extern void scan_mapping_unevictable_pages(struct address_space *); @@ -244,24 +243,6 @@ extern int scan_unevictable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); extern int scan_unevictable_register_node(struct node *node); extern void scan_unevictable_unregister_node(struct node *node); -#else -static inline int page_evictable(struct page *page, - struct vm_area_struct *vma) -{ - return 1; -} - -static inline void scan_mapping_unevictable_pages(struct address_space *mapping) -{ -} - -static inline int scan_unevictable_register_node(struct node *node) -{ - return 0; -} - -static inline void scan_unevictable_unregister_node(struct node *node) { } -#endif extern int kswapd_run(int nid); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 524cd1b28ecb..ff4696c6dce3 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -41,7 +41,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, #endif -#ifdef CONFIG_UNEVICTABLE_LRU UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ UNEVICTABLE_PGRESCUED, /* rescued from noreclaim list */ @@ -50,7 +49,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ UNEVICTABLE_MLOCKFREED, -#endif NR_VM_EVENT_ITEMS }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0e51a35a4486..2ccee08f92f1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1325,7 +1325,6 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif -#ifdef CONFIG_UNEVICTABLE_LRU { .ctl_name = CTL_UNNUMBERED, .procname = "scan_unevictable_pages", @@ -1334,7 +1333,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &scan_unevictable_handler, }, -#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/mm/Kconfig b/mm/Kconfig index 71830ba7b986..97d2c88b745e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -203,25 +203,13 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config UNEVICTABLE_LRU - bool "Add LRU list to track non-evictable pages" - default y - help - Keeps unevictable pages off of the active and inactive pageout - lists, so kswapd will not waste CPU time or have its balancing - algorithms thrown off by scanning these pages. Selecting this - will use one page flag and increase the code size a little, - say Y unless you know what you are doing. - - See Documentation/vm/unevictable-lru.txt for more information. 
- config HAVE_MLOCK bool default y if MMU=y config HAVE_MLOCKED_PAGE_BIT bool - default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y + default y if HAVE_MLOCK=y config MMU_NOTIFIER bool diff --git a/mm/internal.h b/mm/internal.h index b4ac332e8072..f02c7508068d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -73,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * unevictable_migrate_page() called only from migrate_page_copy() to * migrate unevictable flag to new page. @@ -85,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) if (TestClearPageUnevictable(old)) SetPageUnevictable(new); } -#else -static inline void unevictable_migrate_page(struct page *new, struct page *old) -{ -} -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* diff --git a/mm/mlock.c b/mm/mlock.c index ac130433c7d3..45eb650b9654 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -31,7 +31,6 @@ int can_do_mlock(void) } EXPORT_SYMBOL(can_do_mlock); -#ifdef CONFIG_UNEVICTABLE_LRU /* * Mlocked pages are marked with PageMlocked() flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate @@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval) return retval; } -#else /* CONFIG_UNEVICTABLE_LRU */ - -/* - * Just make pages present if VM_LOCKED. No-op if unlocking. - */ -static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int mlock) -{ - if (mlock && (vma->vm_flags & VM_LOCKED)) - return make_pages_present(start, end); - return 0; -} - -static inline int __mlock_posix_error_return(long retval) -{ - return 0; -} - -#endif /* CONFIG_UNEVICTABLE_LRU */ - /** * mlock_vma_pages_range() - mlock pages in specified vma range. * @vma - the vma containing the specfied address range diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 00e293734fc9..c95a77cd581b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2077,19 +2077,14 @@ void show_free_areas(void) printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" " inactive_file:%lu" -//TODO: check/adjust line lengths -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lu" -#endif " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_INACTIVE_FILE), -#ifdef CONFIG_UNEVICTABLE_LRU global_page_state(NR_UNEVICTABLE), -#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -2113,9 +2108,7 @@ void show_free_areas(void) " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lukB" -#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -2129,9 +2122,7 @@ void show_free_areas(void) K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU K(zone_page_state(zone, NR_UNEVICTABLE)), -#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? 
"yes" : "no") diff --git a/mm/rmap.c b/mm/rmap.c index 23122af32611..316c9d6930ad 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1202,7 +1202,6 @@ int try_to_unmap(struct page *page, int migration) return ret; } -#ifdef CONFIG_UNEVICTABLE_LRU /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked @@ -1226,4 +1225,4 @@ int try_to_munlock(struct page *page) else return try_to_unmap_file(page, 1, 0); } -#endif + diff --git a/mm/vmscan.c b/mm/vmscan.c index 879d034930c4..2c4b945b011f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -514,7 +514,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) * * lru_lock must not be held, interrupts must be enabled. */ -#ifdef CONFIG_UNEVICTABLE_LRU void putback_lru_page(struct page *page) { int lru; @@ -568,20 +567,6 @@ redo: put_page(page); /* drop ref from isolate */ } -#else /* CONFIG_UNEVICTABLE_LRU */ - -void putback_lru_page(struct page *page) -{ - int lru; - VM_BUG_ON(PageLRU(page)); - - lru = !!TestClearPageActive(page) + page_is_file_cache(page); - lru_cache_add_lru(page, lru); - put_page(page); -} -#endif /* CONFIG_UNEVICTABLE_LRU */ - - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -2470,7 +2455,6 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * page_evictable - test whether a page is evictable * @page: the page to test @@ -2717,4 +2701,3 @@ void scan_unevictable_unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } -#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 1e151cf6bf86..1e3aa8139f22 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -629,10 +629,8 @@ static const char * const vmstat_text[] = { "nr_active_anon", "nr_inactive_file", "nr_active_file", -#ifdef CONFIG_UNEVICTABLE_LRU "nr_unevictable", "nr_mlock", -#endif "nr_anon_pages", "nr_mapped", "nr_file_pages", @@ -687,7 +685,6 @@ static const char * const vmstat_text[] = { "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif -#ifdef CONFIG_UNEVICTABLE_LRU "unevictable_pgs_culled", "unevictable_pgs_scanned", "unevictable_pgs_rescued", @@ -697,7 +694,6 @@ static const char * const vmstat_text[] = { "unevictable_pgs_stranded", "unevictable_pgs_mlockfreed", #endif -#endif }; static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, -- cgit v1.2.3 From cb4b86ba47bb0937b71fb825b3ed88adf7a190f0 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:32:52 -0700 Subject: mm: add swap cache interface for swap reference In a following patch, the usage of swap cache is recorded into swap_map. This patch is for necessary interface changes to do that. 2 interfaces: - swapcache_prepare() - swapcache_free() are added for allocating/freeing refcnt from swap-cache to existing swap entries. But implementation itself is not changed under this patch. At adding swapcache_free(), memcg's hook code is moved under swapcache_free(). This is better than using scattered hooks. 
Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Daisuke Nishimura Acked-by: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 7 +++++++ mm/shmem.c | 2 +- mm/swap_state.c | 11 +++++------ mm/swapfile.c | 19 +++++++++++++++++++ mm/vmscan.c | 3 +-- 5 files changed, 33 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index f30c06908f09..259e96c150ef 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -282,8 +282,10 @@ extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); extern int swap_duplicate(swp_entry_t); +extern int swapcache_prepare(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern void swap_free(swp_entry_t); +extern void swapcache_free(swp_entry_t, struct page *page); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); @@ -352,11 +354,16 @@ static inline void show_swap_cache_info(void) #define free_swap_and_cache(swp) is_migration_entry(swp) #define swap_duplicate(swp) is_migration_entry(swp) +#define swapcache_prepare(swp) is_migration_entry(swp) static inline void swap_free(swp_entry_t swp) { } +static inline void swapcache_free(swp_entry_t swp, struct page *page) +{ +} + static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { diff --git a/mm/shmem.c b/mm/shmem.c index 0132fbd45a23..47ab19182287 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) shmem_swp_unmap(entry); unlock: spin_unlock(&info->lock); - swap_free(swap); + swapcache_free(swap, NULL); redirty: set_page_dirty(page); if (wbc->for_reclaim) diff --git a/mm/swap_state.c b/mm/swap_state.c index 1416e7e9e02d..19bdf3017a9e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -162,11 +162,11 @@ int add_to_swap(struct page *page) return 1; case -EEXIST: /* Raced with "speculative" read_swap_cache_async */ - swap_free(entry); + swapcache_free(entry, NULL); continue; default: /* -ENOMEM radix-tree allocation failure */ - swap_free(entry); + swapcache_free(entry, NULL); return 0; } } @@ -188,8 +188,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&swapper_space.tree_lock); - mem_cgroup_uncharge_swapcache(page, entry); - swap_free(entry); + swapcache_free(entry, page); page_cache_release(page); } @@ -293,7 +292,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Swap entry may have been freed since our caller observed it. */ - if (!swap_duplicate(entry)) + if (!swapcache_prepare(entry)) break; /* @@ -317,7 +316,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } ClearPageSwapBacked(new_page); __clear_page_locked(new_page); - swap_free(entry); + swapcache_free(entry, NULL); } while (err != -ENOMEM); if (new_page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 312fafe0ab6e..3187079903fd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -509,6 +509,16 @@ void swap_free(swp_entry_t entry) } } +/* + * Called after dropping swapcache to decrease refcnt to swap entries. 
+ */ +void swapcache_free(swp_entry_t entry, struct page *page) +{ + if (page) + mem_cgroup_uncharge_swapcache(page, entry); + return swap_free(entry); +} + /* * How many references to page are currently swapped out? */ @@ -1979,6 +1989,15 @@ bad_file: goto out; } +/* + * Called when allocating swap cache for exising swap entry, + */ +int swapcache_prepare(swp_entry_t entry) +{ + return swap_duplicate(entry); +} + + struct swap_info_struct * get_swap_info_struct(unsigned type) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 2c4b945b011f..52339dd7bf85 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_swapcache(page, swap); - swap_free(swap); + swapcache_free(swap, page); } else { __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); -- cgit v1.2.3 From 355cfa73ddff2fb8fa14e93bd94a057cc022512e Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 16 Jun 2009 15:32:53 -0700 Subject: mm: modify swap_map and add SWAP_HAS_CACHE flag This is a part of the patches for fixing memcg's swap accounting leak. But, IMHO, not a bad patch even if no memcg. There are 2 kinds of references to swap. - reference from swap entry - reference from swap cache Then, - If there is swap cache && swap's refcnt is 1, there is only swap cache. (*) swapcount(entry) == 1 && find_get_page(swapper_space, entry) != NULL This counting logic has worked well for a long time. But considering that we cannot know from swap_map[] whether there is a _real_ reference or not, the current usage of the counter is not very good. This patch adds a flag SWAP_HAS_CACHE and records whether a swap entry has a cache or not. This will remove the -1 magic used in swapfile.c and help to avoid unnecessary find_get_page(). Signed-off-by: KAMEZAWA Hiroyuki Tested-by: Daisuke Nishimura Cc: Balbir Singh Cc: Hugh Dickins Cc: Johannes Weiner Cc: Li Zefan Cc: Dhaval Giani Cc: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 14 ++-- mm/swap_state.c | 5 +- mm/swapfile.c | 214 ++++++++++++++++++++++++++++++++++++++------------- 3 files changed, 172 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 259e96c150ef..fed5e8e1104b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -129,9 +129,10 @@ enum { #define SWAP_CLUSTER_MAX 32 -#define SWAP_MAP_MAX 0x7fff -#define SWAP_MAP_BAD 0x8000 - +#define SWAP_MAP_MAX 0x7ffe +#define SWAP_MAP_BAD 0x7fff +#define SWAP_HAS_CACHE 0x8000 /* There is a swap cache of entry. */ +#define SWAP_COUNT_MASK (~SWAP_HAS_CACHE) /* * The in-memory structure used to track swap areas.
*/ @@ -281,7 +282,7 @@ extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); -extern int swap_duplicate(swp_entry_t); +extern void swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern void swap_free(swp_entry_t); @@ -353,9 +354,12 @@ static inline void show_swap_cache_info(void) } #define free_swap_and_cache(swp) is_migration_entry(swp) -#define swap_duplicate(swp) is_migration_entry(swp) #define swapcache_prepare(swp) is_migration_entry(swp) +static inline void swap_duplicate(swp_entry_t swp) +{ +} + static inline void swap_free(swp_entry_t swp) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 19bdf3017a9e..b9ca029673a5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -292,7 +292,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Swap entry may have been freed since our caller observed it. */ - if (!swapcache_prepare(entry)) + err = swapcache_prepare(entry); + if (err == -EEXIST) /* seems racy */ + continue; + if (err) /* swp entry is obsolete ? */ break; /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 3187079903fd..0d7296971ad9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,6 +53,33 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); +/* For reference count accounting in swap_map */ +/* enum for swap_map[] handling. internal use only */ +enum { + SWAP_MAP = 0, /* ops for reference from swap users */ + SWAP_CACHE, /* ops for reference from swap cache */ +}; + +static inline int swap_count(unsigned short ent) +{ + return ent & SWAP_COUNT_MASK; +} + +static inline bool swap_has_cache(unsigned short ent) +{ + return !!(ent & SWAP_HAS_CACHE); +} + +static inline unsigned short encode_swapmap(int count, bool has_cache) +{ + unsigned short ret = count; + + if (has_cache) + return SWAP_HAS_CACHE | ret; + return ret; +} + + /* * We need this because the bdev->unplug_fn can sleep and we cannot * hold swap_lock while calling the unplug_fn. 
And swap_lock @@ -167,7 +194,8 @@ static int wait_for_discard(void *word) #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 -static inline unsigned long scan_swap_map(struct swap_info_struct *si) +static inline unsigned long scan_swap_map(struct swap_info_struct *si, + int cache) { unsigned long offset; unsigned long scan_base; @@ -285,7 +313,10 @@ checks: si->lowest_bit = si->max; si->highest_bit = 0; } - si->swap_map[offset] = 1; + if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ + si->swap_map[offset] = encode_swapmap(0, true); + else /* at suspend */ + si->swap_map[offset] = encode_swapmap(1, false); si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -401,7 +432,8 @@ swp_entry_t get_swap_page(void) continue; swap_list.next = next; - offset = scan_swap_map(si); + /* This is called for allocating swap entry for cache */ + offset = scan_swap_map(si, SWAP_CACHE); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -415,6 +447,7 @@ noswap: return (swp_entry_t) {0}; } +/* The only caller of this function is now susupend routine */ swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; @@ -424,7 +457,8 @@ swp_entry_t get_swap_page_of_type(int type) si = swap_info + type; if (si->flags & SWP_WRITEOK) { nr_swap_pages--; - offset = scan_swap_map(si); + /* This is called for allocating swap entry, not cache */ + offset = scan_swap_map(si, SWAP_MAP); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -471,25 +505,38 @@ out: return NULL; } -static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) +static int swap_entry_free(struct swap_info_struct *p, + swp_entry_t ent, int cache) { unsigned long offset = swp_offset(ent); - int count = p->swap_map[offset]; - - if (count < SWAP_MAP_MAX) { - count--; - p->swap_map[offset] = count; - if (!count) { - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) - p->highest_bit = offset; - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = p - swap_info; - nr_swap_pages++; - p->inuse_pages--; - mem_cgroup_uncharge_swap(ent); + int count = swap_count(p->swap_map[offset]); + bool has_cache; + + has_cache = swap_has_cache(p->swap_map[offset]); + + if (cache == SWAP_MAP) { /* dropping usage count of swap */ + if (count < SWAP_MAP_MAX) { + count--; + p->swap_map[offset] = encode_swapmap(count, has_cache); } + } else { /* dropping swap cache flag */ + VM_BUG_ON(!has_cache); + p->swap_map[offset] = encode_swapmap(count, false); + + } + /* return code. 
*/ + count = p->swap_map[offset]; + /* free if no reference */ + if (!count) { + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; + if (p->prio > swap_info[swap_list.next].prio) + swap_list.next = p - swap_info; + nr_swap_pages++; + p->inuse_pages--; + mem_cgroup_uncharge_swap(ent); } return count; } @@ -504,7 +551,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { - swap_entry_free(p, entry); + swap_entry_free(p, entry, SWAP_MAP); spin_unlock(&swap_lock); } } @@ -514,9 +561,16 @@ void swap_free(swp_entry_t entry) */ void swapcache_free(swp_entry_t entry, struct page *page) { + struct swap_info_struct *p; + if (page) mem_cgroup_uncharge_swapcache(page, entry); - return swap_free(entry); + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, entry, SWAP_CACHE); + spin_unlock(&swap_lock); + } + return; } /* @@ -531,8 +585,7 @@ static inline int page_swapcount(struct page *page) entry.val = page_private(page); p = swap_info_get(entry); if (p) { - /* Subtract the 1 for the swap cache itself */ - count = p->swap_map[swp_offset(entry)] - 1; + count = swap_count(p->swap_map[swp_offset(entry)]); spin_unlock(&swap_lock); } return count; @@ -594,7 +647,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry) == 1) { + if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { page = find_get_page(&swapper_space, entry.val); if (page && !trylock_page(page)) { page_cache_release(page); @@ -901,7 +954,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, i = 1; } count = si->swap_map[i]; - if (count && count != SWAP_MAP_BAD) + if (count && swap_count(count) != SWAP_MAP_BAD) break; } return i; @@ -1005,13 +1058,13 @@ static int try_to_unuse(unsigned int type) */ shmem = 0; swcount = *swap_map; - if (swcount > 1) { + if (swap_count(swcount)) { if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); else retval = unuse_mm(start_mm, entry, page); } - if (*swap_map > 1) { + if (swap_count(*swap_map)) { int set_start_mm = (*swap_map >= swcount); struct list_head *p = &start_mm->mmlist; struct mm_struct *new_start_mm = start_mm; @@ -1021,7 +1074,7 @@ static int try_to_unuse(unsigned int type) atomic_inc(&new_start_mm->mm_users); atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); - while (*swap_map > 1 && !retval && !shmem && + while (swap_count(*swap_map) && !retval && !shmem && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!atomic_inc_not_zero(&mm->mm_users)) @@ -1033,14 +1086,16 @@ static int try_to_unuse(unsigned int type) cond_resched(); swcount = *swap_map; - if (swcount <= 1) + if (!swap_count(swcount)) /* any usage ? */ ; else if (mm == &init_mm) { set_start_mm = 1; shmem = shmem_unuse(entry, page); } else retval = unuse_mm(mm, entry, page); - if (set_start_mm && *swap_map < swcount) { + + if (set_start_mm && + swap_count(*swap_map) < swcount) { mmput(new_start_mm); atomic_inc(&mm->mm_users); new_start_mm = mm; @@ -1067,21 +1122,25 @@ static int try_to_unuse(unsigned int type) } /* - * How could swap count reach 0x7fff when the maximum - * pid is 0x7fff, and there's no way to repeat a swap - * page within an mm (except in shmem, where it's the - * shared object which takes the reference count)? - * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. - * + * How could swap count reach 0x7ffe ? 
+ * There's no way to repeat a swap page within an mm + * (except in shmem, where it's the shared object which takes + * the reference count). + * We believe SWAP_MAP_MAX cannot occur (if it did, the unsigned + * short would be too small...). * If that's wrong, then we should worry more about * exit_mmap() and do_munmap() cases described above: * we might be resetting SWAP_MAP_MAX too early here. * We know "Undead"s can happen, they're okay, so don't * report them; but do report if we reset SWAP_MAP_MAX. */ - if (*swap_map == SWAP_MAP_MAX) { + /* We might release the lock_page() in unuse_mm(). */ + if (!PageSwapCache(page) || page_private(page) != entry.val) + goto retry; + + if (swap_count(*swap_map) == SWAP_MAP_MAX) { spin_lock(&swap_lock); - *swap_map = 1; + *swap_map = encode_swapmap(0, true); spin_unlock(&swap_lock); reset_overflow = 1; } @@ -1099,7 +1158,8 @@ static int try_to_unuse(unsigned int type) * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. */ - if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { + if (swap_count(*swap_map) && + PageDirty(page) && PageSwapCache(page)) { struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; @@ -1126,6 +1186,7 @@ static int try_to_unuse(unsigned int type) * mark page dirty so shrink_page_list will preserve it. */ SetPageDirty(page); +retry: unlock_page(page); page_cache_release(page); @@ -1952,15 +2013,23 @@ void si_swapinfo(struct sysinfo *val) * * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as * "permanent", but will be reclaimed by the next swapoff. + * Returns an error code in the following cases. + * - success -> 0 + * - swp_entry is invalid -> EINVAL + * - swp_entry is migration entry -> EINVAL + * - swap-cache reference is requested but there is already one. -> EEXIST + * - swap-cache reference is requested but the entry is not used. 
-> ENOENT */ -int swap_duplicate(swp_entry_t entry) +static int __swap_duplicate(swp_entry_t entry, bool cache) { struct swap_info_struct * p; unsigned long offset, type; - int result = 0; + int result = -EINVAL; + int count; + bool has_cache; if (is_migration_entry(entry)) - return 1; + return -EINVAL; type = swp_type(entry); if (type >= nr_swapfiles) @@ -1969,17 +2038,40 @@ offset = swp_offset(entry); spin_lock(&swap_lock); - if (offset < p->max && p->swap_map[offset]) { - if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { - p->swap_map[offset]++; - result = 1; - } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { + + if (unlikely(offset >= p->max)) + goto unlock_out; + + count = swap_count(p->swap_map[offset]); + has_cache = swap_has_cache(p->swap_map[offset]); + + if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ + + /* set SWAP_HAS_CACHE if there is no cache and entry is used */ + if (!has_cache && count) { + p->swap_map[offset] = encode_swapmap(count, true); + result = 0; + } else if (has_cache) /* someone added cache */ + result = -EEXIST; + else if (!count) /* no users */ + result = -ENOENT; + + } else if (count || has_cache) { + if (count < SWAP_MAP_MAX - 1) { + p->swap_map[offset] = encode_swapmap(count + 1, + has_cache); + result = 0; + } else if (count <= SWAP_MAP_MAX) { if (swap_overflow++ < 5) - printk(KERN_WARNING "swap_dup: swap entry overflow\n"); - p->swap_map[offset] = SWAP_MAP_MAX; - result = 1; + printk(KERN_WARNING + "swap_dup: swap entry overflow\n"); + p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, + has_cache); + result = 0; } - } + } else + result = -ENOENT; /* unused swap entry */ +unlock_out: spin_unlock(&swap_lock); out: return result; @@ -1988,13 +2080,25 @@ bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } +/* + * increase reference count of swap entry by 1. + */ +void swap_duplicate(swp_entry_t entry) +{ + __swap_duplicate(entry, SWAP_MAP); +} /* + * @entry: swap entry for which we allocate swap cache. + * * Called when allocating swap cache for existing swap entry. + * This can return error codes. Returns 0 on success. + * -EEXIST means there is already a swap cache. + * Note: return code is different from swap_duplicate(). */ int swapcache_prepare(swp_entry_t entry) { - return swap_duplicate(entry); + return __swap_duplicate(entry, SWAP_CACHE); } @@ -2035,7 +2139,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) /* Don't read in free or bad pages */ if (!si->swap_map[toff]) break; - if (si->swap_map[toff] == SWAP_MAP_BAD) + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; } /* Count contiguous allocated slots below our target */ @@ -2043,7 +2147,7 @@ /* Don't read in free or bad pages */ if (!si->swap_map[toff]) break; - if (si->swap_map[toff] == SWAP_MAP_BAD) + if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; } spin_unlock(&swap_lock); -- cgit v1.2.3 From 2ff05b2b4eac2e63d345fc731ea151a060247f53 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Jun 2009 15:32:56 -0700 Subject: oom: move oom_adj value from task_struct to mm_struct The per-task oom_adj value is a characteristic of its mm more than the task itself since it's not possible to oom kill any thread that shares the mm. 
If a task were to be killed while attached to an mm that could not be freed because another thread were set to OOM_DISABLE, it would have needlessly been terminated since there is no potential for future memory freeing. This patch moves oomkilladj (now more appropriately named oom_adj) from struct task_struct to struct mm_struct. This requires task_lock() on a task to check its oom_adj value to protect against exec, but it's already necessary to take the lock when dereferencing the mm to find the total VM size for the badness heuristic. This fixes a livelock if the oom killer chooses a task and another thread sharing the same memory has an oom_adj value of OOM_DISABLE. This occurs because oom_kill_task() repeatedly returns 1 and refuses to kill the chosen task while select_bad_process() will repeatedly choose the same task during the next retry. Taking task_lock() in select_bad_process() to check for OOM_DISABLE and in oom_kill_task() to check for threads sharing the same memory will be removed in the next patch in this series where it will no longer be necessary. Writing to /proc/pid/oom_adj for a kthread will now return -EINVAL since these threads are immune from oom killing already. They simply report an oom_adj value of OOM_DISABLE. Cc: Nick Piggin Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 15 ++++++++++----- fs/proc/base.c | 19 ++++++++++++++++--- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 1 - mm/oom_kill.c | 34 ++++++++++++++++++++++------------ 5 files changed, 50 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index cd8717a36271..ebff3c10a07f 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1003,11 +1003,13 @@ CHAPTER 3: PER-PROCESS PARAMETERS 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score ------------------------------------------------------ -This file can be used to adjust the score used to select which processes -should be killed in an out-of-memory situation. Giving it a high score will -increase the likelihood of this process being killed by the oom-killer. Valid -values are in the range -16 to +15, plus the special value -17, which disables -oom-killing altogether for this process. +This file can be used to adjust the score used to select which processes should +be killed in an out-of-memory situation. The oom_adj value is a characteristic +of the task's mm, so all threads that share an mm with pid will have the same +oom_adj value. A high value will increase the likelihood of this process being +killed by the oom-killer. Valid values are in the range -16 to +15 as +explained below and a special value of -17, which disables oom-killing +altogether for threads sharing pid's mm. The process to be killed in an out-of-memory situation is selected among all others based on its badness score. This value equals the original memory size of the process @@ -1021,6 +1023,9 @@ the parent's score if they do not share the same memory. Thus forking servers are the prime candidates to be killed. Having only one 'hungry' child will make parent less preferable than the child. +/proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from +oom-killing already. + /proc/<pid>/oom_score shows process' current badness score. 
The following heuristics are then applied: diff --git a/fs/proc/base.c b/fs/proc/base.c index 1539e630c47d..3ce5ae9e3d2d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1006,7 +1006,12 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, if (!task) return -ESRCH; - oom_adjust = task->oomkilladj; + task_lock(task); + if (task->mm) + oom_adjust = task->mm->oom_adj; + else + oom_adjust = OOM_DISABLE; + task_unlock(task); put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1035,11 +1040,19 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { + task_lock(task); + if (!task->mm) { + task_unlock(task); + put_task_struct(task); + return -EINVAL; + } + if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { + task_unlock(task); put_task_struct(task); return -EACCES; } - task->oomkilladj = oom_adjust; + task->mm->oom_adj = oom_adjust; + task_unlock(task); put_task_struct(task); if (end - buffer == 0) return -EIO; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e80e26ecf21..f4408106fcbc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -232,6 +232,8 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + s8 oom_adj; /* OOM kill score adjustment (bit shift) */ + cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 1048bf50540a..1bc6fae0c135 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1178,7 +1178,6 @@ struct task_struct { * a short time */ unsigned char fpu_counter; - s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a7b2460e922b..b60913520ef3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; + int oom_adj; task_lock(p); mm = p->mm; @@ -65,6 +66,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } + oom_adj = mm->oom_adj; /* * The memory size of the process is the basis for the badness. @@ -148,15 +150,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 8; /* - * Adjust the score by oomkilladj. + * Adjust the score by oom_adj. 
*/ - if (p->oomkilladj) { - if (p->oomkilladj > 0) { + if (oom_adj) { + if (oom_adj > 0) { if (!points) points = 1; - points <<= p->oomkilladj; + points <<= oom_adj; } else - points >>= -(p->oomkilladj); + points >>= -(oom_adj); } #ifdef DEBUG @@ -251,8 +253,12 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } - if (p->oomkilladj == OOM_DISABLE) + task_lock(p); + if (p->mm && p->mm->oom_adj == OOM_DISABLE) { + task_unlock(p); continue; + } + task_unlock(p); points = badness(p, uptime.tv_sec); if (points > *ppoints || !chosen) { @@ -304,8 +310,7 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, - p->comm); + get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -367,8 +372,12 @@ static int oom_kill_task(struct task_struct *p) * Don't kill the process if any threads are set to OOM_DISABLE */ do_each_thread(g, q) { - if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + task_lock(q); + if (q->mm == mm && q->mm && q->mm->oom_adj == OOM_DISABLE) { + task_unlock(q); return 1; + } + task_unlock(q); } while_each_thread(g, q); __oom_kill_task(p, 1); @@ -393,10 +402,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", - current->comm, gfp_mask, order, current->oomkilladj); task_lock(current); + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oom_adj=%d\n", + current->comm, gfp_mask, order, + current->mm ? current->mm->oom_adj : OOM_DISABLE); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); -- cgit v1.2.3 From 286973552f051404abdb58dd9b2f8f7558efe4e5 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Tue, 16 Jun 2009 15:32:59 -0700 Subject: mm: remove __invalidate_mapping_pages variant Remove __invalidate_mapping_pages atomic variant now that its sole caller can sleep (fixed in eccb95cee4f0d56faa46ef22fb94dd4a3578d3eb ("vfs: fix lock inversion in drop_pagecache_sb()")). This fixes softlockups that can occur while in the drop_caches path. 
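As a usage sketch (editorial illustration, not part of the original patch): the softlockup-prone path this removes is reached when root writes to /proc/sys/vm/drop_caches, which walks every inode via drop_pagecache_sb() and hence invalidate_mapping_pages(). A minimal trigger from userspace:

	#include <stdio.h>
	#include <stdlib.h>

	/* Ask the kernel to drop clean caches.  Values: 1 = page cache,
	 * 2 = dentries and inodes, 3 = both.  Requires root. */
	int main(void)
	{
		FILE *f = fopen("/proc/sys/vm/drop_caches", "w");

		if (!f) {
			perror("drop_caches");
			return EXIT_FAILURE;
		}
		fputs("3\n", f);	/* only clean caches are dropped; sync(2) first if needed */
		return fclose(f) ? EXIT_FAILURE : EXIT_SUCCESS;
	}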
Signed-off-by: Mike Waychison Cc: Jan Kara Cc: Wu Fengguang Cc: Dave Chinner Cc: Nick Piggin Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/drop_caches.c | 2 +- include/linux/fs.h | 3 --- mm/truncate.c | 39 ++++++++++++++++----------------------- 3 files changed, 17 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/fs/drop_caches.c b/fs/drop_caches.c index b6a719a909f8..a2edb7913447 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -24,7 +24,7 @@ static void drop_pagecache_sb(struct super_block *sb) continue; __iget(inode); spin_unlock(&inode_lock); - __invalidate_mapping_pages(inode->i_mapping, 0, -1, true); + invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; spin_lock(&inode_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 8146e0264ef9..f5ae9f19b8a3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2036,9 +2036,6 @@ extern int __invalidate_device(struct block_device *); extern int invalidate_partition(struct gendisk *, int); #endif extern int invalidate_inodes(struct super_block *); -unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, - bool be_atomic); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); diff --git a/mm/truncate.c b/mm/truncate.c index 12e1579f9165..ccc3ecf7cb98 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) } EXPORT_SYMBOL(truncate_inode_pages); -unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, bool be_atomic) +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) { struct pagevec pvec; pgoff_t next = start; @@ -309,30 +322,10 @@ unlock: break; } pagevec_release(&pvec); - if (likely(!be_atomic)) - cond_resched(); + cond_resched(); } return ret; } - -/** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. 
- */ -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) -{ - return __invalidate_mapping_pages(mapping, start, end, false); -} EXPORT_SYMBOL(invalidate_mapping_pages); /* -- cgit v1.2.3 From aca8bf323edd31ad462dc98c107c23a5c6022ca2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:33:02 -0700 Subject: mm: remove file argument from swap_readpage() The file argument resulted from address_space's readpage a long time ago. We don't use it any more. Let's remove the unnecessary argument. Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/page_io.c | 2 +- mm/swap_state.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index fed5e8e1104b..0cedf31af0b0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -256,7 +256,7 @@ extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ -extern int swap_readpage(struct file *, struct page *); +extern int swap_readpage(struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); extern void end_swap_bio_read(struct bio *bio, int err); diff --git a/mm/page_io.c b/mm/page_io.c index 3023c475e041..c6f3e5071de3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -120,7 +120,7 @@ out: return ret; } -int swap_readpage(struct file *file, struct page *page) +int swap_readpage(struct page *page) { struct bio *bio; int ret = 0; diff --git a/mm/swap_state.c b/mm/swap_state.c index b62e7f56051c..42cd38eba79f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -313,7 +313,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Initiate read into locked page and return. */ lru_cache_add_anon(new_page); - swap_readpage(NULL, new_page); + swap_readpage(new_page); return new_page; } ClearPageSwapBacked(new_page); -- cgit v1.2.3 From 168f5ac668f63dfb64439766e3ef9e866b83719d Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Tue, 16 Jun 2009 15:33:02 -0700 Subject: mm cleanup: shmem_file_setup: 'char *' -> 'const char *' for name argument As the function shmem_file_setup() does not modify/allocate/free/pass on the given filename, mark it as const. 
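For illustration (an editorial sketch, not part of the patch), the practical effect of the const qualifier is that callers can pass string literals and const-qualified buffers without casts or warnings; a small userspace analogue:

	#include <stdio.h>

	/* A function that only reads its name argument should take
	 * 'const char *', just as shmem_file_setup() now does. */
	static void describe(const char *name)
	{
		printf("name: %s\n", name);	/* read-only use */
	}

	int main(void)
	{
		const char *label = "dev/zero";	/* hypothetical label */

		describe("anon_region");	/* string literal: fine */
		describe(label);		/* const pointer: fine too */
		return 0;
	}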
Signed-off-by: Sergei Trofimovich Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/shmem.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f0473a88ca2e..d88d6fc530ad 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -724,7 +724,7 @@ static inline int shmem_lock(struct file *file, int lock, return 0; } #endif -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags); +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); int shmem_zero_setup(struct vm_area_struct *); diff --git a/mm/shmem.c b/mm/shmem.c index 47ab19182287..e89d7ec18eda 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2612,7 +2612,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { int error; struct file *file; -- cgit v1.2.3 From 6fe6b7e35785e3232ffe7f81d3893f1316710a02 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:05 -0700 Subject: vmscan: report vm_flags in page_referenced() Collect vma->vm_flags of the VMAs that actually referenced the page. This is preparing for more informed reclaim heuristics, eg. to protect executable file pages more aggressively. For now only the VM_EXEC bit will be used by the caller. Thanks to Johannes, Peter and Minchan for all the good tips. Acked-by: Peter Zijlstra Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 5 +++-- mm/rmap.c | 37 ++++++++++++++++++++++++++----------- mm/vmscan.c | 7 +++++-- 3 files changed, 34 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 619379a1dd98..216d024f830d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -83,7 +83,8 @@ static inline void page_dup_rmap(struct page *page, struct vm_area_struct *vma, /* * Called from mm/vmscan.c to handle paging out */ -int page_referenced(struct page *, int is_locked, struct mem_cgroup *cnt); +int page_referenced(struct page *, int is_locked, + struct mem_cgroup *cnt, unsigned long *vm_flags); int try_to_unmap(struct page *, int ignore_refs); /* @@ -117,7 +118,7 @@ int try_to_munlock(struct page *); #define anon_vma_prepare(vma) (0) #define anon_vma_link(vma) do {} while (0) -#define page_referenced(page,l,cnt) TestClearPageReferenced(page) +#define page_referenced(page, locked, cnt, flags) TestClearPageReferenced(page) #define try_to_unmap(page, refs) SWAP_FAIL static inline int page_mkclean(struct page *page) diff --git a/mm/rmap.c b/mm/rmap.c index 316c9d6930ad..c9ccc1a72dc3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) * repeatedly from either page_referenced_anon or page_referenced_file. 
*/ static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, unsigned int *mapcount) + struct vm_area_struct *vma, + unsigned int *mapcount, + unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -381,11 +383,14 @@ out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); out: + if (referenced) + *vm_flags |= vma->vm_flags; return referenced; } static int page_referenced_anon(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct anon_vma *anon_vma; @@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page, * page_referenced_file - referenced check for object-based rmap * @page: the page we're checking references on. * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * For an object-based mapped page, find all the places it is mapped and * check/clear the referenced flag. This is done by following the page->mapping @@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page, * This function is only called from page_referenced for object-based pages. */ static int page_referenced_file(struct page *page, - struct mem_cgroup *mem_cont) + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { unsigned int mapcount; struct address_space *mapping = page->mapping; @@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, &mapcount); + referenced += page_referenced_one(page, vma, + &mapcount, vm_flags); if (!mapcount) break; } @@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page, * @page: the page to test * @is_locked: caller holds lock on the page * @mem_cont: target memory controller + * @vm_flags: collect encountered vma->vm_flags who actually referenced the page * * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. 
*/ -int page_referenced(struct page *page, int is_locked, - struct mem_cgroup *mem_cont) +int page_referenced(struct page *page, + int is_locked, + struct mem_cgroup *mem_cont, + unsigned long *vm_flags) { int referenced = 0; if (TestClearPageReferenced(page)) referenced++; + *vm_flags = 0; if (page_mapped(page) && page->mapping) { if (PageAnon(page)) - referenced += page_referenced_anon(page, mem_cont); + referenced += page_referenced_anon(page, mem_cont, + vm_flags); else if (is_locked) - referenced += page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, mem_cont, + vm_flags); else if (!trylock_page(page)) referenced++; else { if (page->mapping) - referenced += - page_referenced_file(page, mem_cont); + referenced += page_referenced_file(page, + mem_cont, vm_flags); unlock_page(page); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 52339dd7bf85..6be2068f61c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -577,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct pagevec freed_pvec; int pgactivate = 0; unsigned long nr_reclaimed = 0; + unsigned long vm_flags; cond_resched(); @@ -627,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; } - referenced = page_referenced(page, 1, sc->mem_cgroup); + referenced = page_referenced(page, 1, + sc->mem_cgroup, &vm_flags); /* In active use or really unfreeable? Activate it. */ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced && page_mapping_inuse(page)) @@ -1208,6 +1210,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, { unsigned long pgmoved; unsigned long pgscanned; + unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); struct page *page; @@ -1248,7 +1251,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup)) + page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) pgmoved++; list_add(&page->lru, &l_inactive); -- cgit v1.2.3 From 24cf72518c79cdcda486ed26074ff8151291cf65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:33:23 -0700 Subject: vmscan: count the number of times zone_reclaim() scans and fails On NUMA machines, the administrator can configure zone_reclaim_mode that is a more targeted form of direct reclaim. On machines with large NUMA distances for example, zone_reclaim_mode defaults to 1 meaning that clean unmapped pages will be reclaimed if the zone watermarks are not being met. There is a heuristic that determines if the scan is worthwhile but it is possible that the heuristic will fail and the CPU gets tied up scanning uselessly. Detecting the situation requires some guesswork and experimentation so this patch adds a counter "zreclaim_failed" to /proc/vmstat. If during high CPU utilisation this counter is increasing rapidly, then the resolution to the problem may be to set /proc/sys/vm/zone_reclaim_mode to 0. 
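A monitoring sketch (editorial illustration, not part of the patch; note the field is named "zone_reclaim_failed" in the final patch below):

	#include <stdio.h>
	#include <string.h>

	/* Print the zone_reclaim failure counter from /proc/vmstat. */
	int main(void)
	{
		char line[128];
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f) {
			perror("/proc/vmstat");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			if (!strncmp(line, "zone_reclaim_failed", 19))
				fputs(line, stdout);
		fclose(f);
		return 0;
	}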
[akpm@linux-foundation.org: name things consistently] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Wu Fengguang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 3 +++ mm/vmscan.c | 3 +++ mm/vmstat.c | 3 +++ 3 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ff4696c6dce3..81a97cf8f0a0 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -36,6 +36,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGSTEAL), FOR_ALL_ZONES(PGSCAN_KSWAPD), FOR_ALL_ZONES(PGSCAN_DIRECT), +#ifdef CONFIG_NUMA + PGSCAN_ZONE_RECLAIM_FAILED, +#endif PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL, PAGEOUTRUN, ALLOCSTALL, PGROTATED, #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/vmscan.c b/mm/vmscan.c index 16c82a868e2b..3018ad756133 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2519,6 +2519,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) ret = __zone_reclaim(zone, gfp_mask, order); zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + if (!ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); + return ret; } #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 1e3aa8139f22..138bed53706e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -673,6 +673,9 @@ static const char * const vmstat_text[] = { TEXTS_FOR_ZONES("pgscan_kswapd") TEXTS_FOR_ZONES("pgscan_direct") +#ifdef CONFIG_NUMA + "zone_reclaim_failed", +#endif "pginodesteal", "slabs_scanned", "kswapd_steal", -- cgit v1.2.3 From a7d932af06e8eee2163627d19898e18da5635449 Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Tue, 16 Jun 2009 15:33:33 -0700 Subject: utsname.h: make new_utsname fields use the proper length constant The members of the new_utsname structure are defined with magic numbers that *should* correspond to the constant __NEW_UTS_LEN+1. Everywhere else, code assumes this and uses the constant, so this patch makes the structure match. Originally suggested by Serge here: https://lists.linux-foundation.org/pipermail/containers/2009-March/016258.html Signed-off-by: Dan Smith Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/utsname.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 11232676bfff..3656b300de3a 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -22,12 +22,12 @@ struct old_utsname { }; struct new_utsname { - char sysname[65]; - char nodename[65]; - char release[65]; - char version[65]; - char machine[65]; - char domainname[65]; + char sysname[__NEW_UTS_LEN + 1]; + char nodename[__NEW_UTS_LEN + 1]; + char release[__NEW_UTS_LEN + 1]; + char version[__NEW_UTS_LEN + 1]; + char machine[__NEW_UTS_LEN + 1]; + char domainname[__NEW_UTS_LEN + 1]; }; #ifdef __KERNEL__ -- cgit v1.2.3 From 4938d7e0233a455f04507bac81d0886c71529537 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Jun 2009 15:33:36 -0700 Subject: poll: avoid extra wakeups in select/poll After the introduction of keyed wakeups, which Davide Libenzi did for epoll, we are able to avoid spurious wakeups in poll()/select() code too. For example, a typical use of poll()/select() is to wait for incoming network frames on many sockets. But TX completion for UDP/TCP frames calls sock_wfree(), which in turn schedules the thread. 
When scheduled, the thread does a full scan of all polled fds and can sleep again, because nothing is really available. If the number of fds is large, this causes significant load. This patch makes select()/poll() aware of keyed wakeups and useless wakeups are avoided. This reduces the number of context switches by about 50% on some setups, and the work performed by softirq handlers. Signed-off-by: Eric Dumazet Acked-by: David S. Miller Acked-by: Andi Kleen Acked-by: Ingo Molnar Acked-by: Davide Libenzi Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/select.c | 40 ++++++++++++++++++++++++++++++++++++---- include/linux/poll.h | 3 +++ 2 files changed, 39 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/select.c b/fs/select.c index 0fe0e1469df3..d870237e42c7 100644 --- a/fs/select.c +++ b/fs/select.c @@ -168,7 +168,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) return table->entry++; } -static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); @@ -194,6 +194,16 @@ static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) return default_wake_function(&dummy_wait, mode, sync, key); } +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_table_entry *entry; + + entry = container_of(wait, struct poll_table_entry, wait); + if (key && !((unsigned long)key & entry->key)) + return 0; + return __pollwake(wait, mode, sync, key); +} + /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) @@ -205,6 +215,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, get_file(filp); entry->filp = filp; entry->wait_address = wait_address; + entry->key = p->key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); @@ -362,6 +373,18 @@ get_max: #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) #define POLLEX_SET (POLLPRI) +static inline void wait_key_set(poll_table *wait, unsigned long in, + unsigned long out, unsigned long bit) +{ + if (wait) { + wait->key = POLLEX_SET; + if (in & bit) + wait->key |= POLLIN_SET; + if (out & bit) + wait->key |= POLLOUT_SET; + } +} + int do_select(int n, fd_set_bits *fds, struct timespec *end_time) { ktime_t expire, *to = NULL; @@ -418,20 +441,25 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) if (file) { f_op = file->f_op; mask = DEFAULT_POLLMASK; - if (f_op && f_op->poll) - mask = (*f_op->poll)(file, retval ? NULL : wait); + if (f_op && f_op->poll) { + wait_key_set(wait, in, out, bit); + mask = (*f_op->poll)(file, wait); + } fput_light(file, fput_needed); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; + wait = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; + wait = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; + wait = NULL; } } } @@ -685,8 +713,12 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) mask = POLLNVAL; if (file != NULL) { mask = DEFAULT_POLLMASK; - if (file->f_op && file->f_op->poll) + if (file->f_op && file->f_op->poll) { + if (pwait) + pwait->key = pollfd->events | + POLLERR | POLLHUP; mask = file->f_op->poll(file, pwait); + } /* Mask out unneeded events. 
*/ mask &= pollfd->events | POLLERR | POLLHUP; fput_light(file, fput_needed); diff --git a/include/linux/poll.h b/include/linux/poll.h index 8c24ef8d9976..fa287f25138d 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -32,6 +32,7 @@ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_ typedef struct poll_table_struct { poll_queue_proc qproc; + unsigned long key; } poll_table; static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) @@ -43,10 +44,12 @@ static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_addres static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) { pt->qproc = qproc; + pt->key = ~0UL; /* all events enabled */ } struct poll_table_entry { struct file *filp; + unsigned long key; wait_queue_t wait; wait_queue_head_t *wait_address; }; -- cgit v1.2.3 From 0d9c25dde878a636ee9a9b53923569171bf9a55b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 16 Jun 2009 15:33:37 -0700 Subject: headers: move module_bug_finalize()/module_bug_cleanup() definitions into module.h They're in linux/bug.h at present, which causes include order tangles. In particular, linux/bug.h cannot be used by linux/atomic.h because, according to Nikanth: linux/bug.h pulls in linux/module.h => linux/spinlock.h => asm/spinlock.h (which uses atomic_inc) => asm/atomic.h. bug.h is a pretty low-level thing and module.h is a higher-level thing, IMO. Cc: Nikanth Karthikesan Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bug.h | 12 ------------ include/linux/module.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index 54398d2c6d8d..d276b5510c83 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -1,7 +1,6 @@ #ifndef _LINUX_BUG_H #define _LINUX_BUG_H -#include #include enum bug_trap_type { @@ -24,10 +23,6 @@ const struct bug_entry *find_bug(unsigned long bugaddr); enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); -int module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, - struct module *); -void module_bug_cleanup(struct module *); - /* These are defined by the architecture */ int is_valid_bugaddr(unsigned long addr); @@ -38,13 +33,6 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, { return BUG_TRAP_TYPE_BUG; } -static inline int module_bug_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *mod) -{ - return 0; -} -static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ #endif /* _LINUX_BUG_H */ diff --git a/include/linux/module.h b/include/linux/module.h index a7bc6e7b43a7..505f20dcc1c7 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -697,4 +697,21 @@ static inline void module_remove_modinfo_attrs(struct module *mod) #define __MODULE_STRING(x) __stringify(x) + +#ifdef CONFIG_GENERIC_BUG +int module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, + struct module *); +void module_bug_cleanup(struct module *); + +#else /* !CONFIG_GENERIC_BUG */ + +static inline int module_bug_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *mod) +{ + return 0; +} +static inline void module_bug_cleanup(struct module *mod) {} +#endif /* CONFIG_GENERIC_BUG */ + #endif /* _LINUX_MODULE_H */ -- cgit v1.2.3 From 10fc89d01a7ea2ecc2a58d2f4e7700f47178cd62 Mon Sep 17 00:00:00 2001 From: Philipp Reisner 
Date: Tue, 16 Jun 2009 15:33:39 -0700 Subject: remove put_cpu_no_resched() put_cpu_no_resched() is an optimization of put_cpu() which unfortunately can cause high latencies. The nfs iostats code uses put_cpu_no_resched() in a code sequence where a reschedule request caused by an interrupt between the get_cpu() and the put_cpu_no_resched() can delay the reschedule for at least HZ. The other users of put_cpu_no_resched() optimize correctly in interrupt code, but there is no real harm in using the put_cpu() function which is an alias for preempt_enable(). The extra check of the preempt count is not as critical as the potential source of missing a reschedule. Debugged in the preempt-rt tree and verified in mainline. Impact: remove a high latency source [akpm@linux-foundation.org: build fix] Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Tony Luck Cc: Trond Myklebust Cc: "J. 
Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 2 +- fs/nfs/iostat.h | 6 +++--- include/linux/smp.h | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 8a06dc480594..bdc176cb5e85 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -5595,7 +5595,7 @@ pfm_interrupt_handler(int irq, void *arg) (*pfm_alt_intr_handler->handler)(irq, arg, regs); } - put_cpu_no_resched(); + put_cpu(); return IRQ_HANDLED; } diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index a2ab2529b5ca..ceda50aad73c 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -31,7 +31,7 @@ static inline void nfs_inc_server_stats(const struct nfs_server *server, cpu = get_cpu(); iostats = per_cpu_ptr(server->io_stats, cpu); iostats->events[stat]++; - put_cpu_no_resched(); + put_cpu(); } static inline void nfs_inc_stats(const struct inode *inode, @@ -50,7 +50,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server, cpu = get_cpu(); iostats = per_cpu_ptr(server->io_stats, cpu); iostats->bytes[stat] += addend; - put_cpu_no_resched(); + put_cpu(); } static inline void nfs_add_stats(const struct inode *inode, @@ -71,7 +71,7 @@ static inline void nfs_add_fscache_stats(struct inode *inode, cpu = get_cpu(); iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); iostats->fscache[stat] += addend; - put_cpu_no_resched(); + put_cpu(); } #endif diff --git a/include/linux/smp.h b/include/linux/smp.h index a69db820eed6..9e3d8af09207 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -177,7 +177,6 @@ static inline void init_call_single_data(void) #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() -#define put_cpu_no_resched() preempt_enable_no_resched() /* * Callback to arch code if there's nosmp or maxcpus=0 on the -- cgit v1.2.3 From e4c9dd0fbad60c098a026e9b06d9de1bc98c5e89 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 16 Jun 2009 15:33:47 -0700 Subject: kmap_types: make most arches use generic header file Convert most arches to use asm-generic/kmap_types.h. Move the KM_FENCE_ macro additions into asm-generic/kmap_types.h, controlled by __WITH_KM_FENCE from each arch's kmap_types.h file. Would be nice to be able to add custom KM_types per arch, but I don't yet see a nice, clean way to do that. Built on x86_64, i386, mips, sparc, alpha(tonyb), powerpc(tonyb), and 68k(tonyb). Note: avr32 should be able to remove KM_PTE2 (since it's not used) and then just use the generic kmap_types.h file. Get avr32 maintainer approval. Signed-off-by: Randy Dunlap Cc: Acked-by: Mike Frysinger Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Bryan Wu Cc: Mikael Starvik Cc: Hirokazu Takata Cc: "Luck Tony" Cc: Geert Uytterhoeven Cc: Ralf Baechle Cc: David Howells Cc: Kyle McMartin Cc: Martin Schwidefsky Cc: Paul Mundt Cc: "David S. Miller" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Arnd Bergmann Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/kmap_types.h | 24 +++--------------------- arch/blackfin/include/asm/kmap_types.h | 17 +---------------- arch/cris/include/asm/kmap_types.h | 17 +---------------- arch/h8300/include/asm/kmap_types.h | 17 +---------------- arch/ia64/include/asm/kmap_types.h | 24 +++--------------------- arch/m32r/include/asm/kmap_types.h | 23 +++-------------------- arch/m68k/include/asm/kmap_types.h | 17 +---------------- arch/microblaze/include/asm/kmap_types.h | 25 +------------------------ arch/mips/include/asm/kmap_types.h | 24 +++--------------------- arch/mn10300/include/asm/kmap_types.h | 27 +-------------------------- arch/parisc/include/asm/kmap_types.h | 24 +++--------------------- arch/s390/include/asm/kmap_types.h | 17 +---------------- arch/sh/include/asm/kmap_types.h | 24 +++--------------------- arch/sparc/include/asm/kmap_types.h | 17 +---------------- arch/x86/include/asm/kmap_types.h | 23 +++-------------------- arch/xtensa/include/asm/kmap_types.h | 27 +-------------------------- include/asm-generic/kmap_types.h | 2 +- 17 files changed, 31 insertions(+), 318 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/asm/kmap_types.h b/arch/alpha/include/asm/kmap_types.h index 3e6735a34c57..a8d4ec8ea4b6 100644 --- a/arch/alpha/include/asm/kmap_types.h +++ b/arch/alpha/include/asm/kmap_types.h @@ -3,30 +3,12 @@ /* Dummy header just to define km_type. */ - #ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif diff --git a/arch/blackfin/include/asm/kmap_types.h b/arch/blackfin/include/asm/kmap_types.h index e215f7104974..0a88622339ee 100644 --- a/arch/blackfin/include/asm/kmap_types.h +++ b/arch/blackfin/include/asm/kmap_types.h @@ -1,21 +1,6 @@ #ifndef _ASM_KMAP_TYPES_H #define _ASM_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif diff --git a/arch/cris/include/asm/kmap_types.h b/arch/cris/include/asm/kmap_types.h index 492988cb9077..d2d643c4ea59 100644 --- a/arch/cris/include/asm/kmap_types.h +++ b/arch/cris/include/asm/kmap_types.h @@ -5,21 +5,6 @@ * is actually used on cris. 
*/ -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif diff --git a/arch/h8300/include/asm/kmap_types.h b/arch/h8300/include/asm/kmap_types.h index 1ec8a3427120..be12a7160116 100644 --- a/arch/h8300/include/asm/kmap_types.h +++ b/arch/h8300/include/asm/kmap_types.h @@ -1,21 +1,6 @@ #ifndef _ASM_H8300_KMAP_TYPES_H #define _ASM_H8300_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif diff --git a/arch/ia64/include/asm/kmap_types.h b/arch/ia64/include/asm/kmap_types.h index 5d1658aa2b3b..05d5f9996105 100644 --- a/arch/ia64/include/asm/kmap_types.h +++ b/arch/ia64/include/asm/kmap_types.h @@ -1,30 +1,12 @@ #ifndef _ASM_IA64_KMAP_TYPES_H #define _ASM_IA64_KMAP_TYPES_H - #ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif /* _ASM_IA64_KMAP_TYPES_H */ diff --git a/arch/m32r/include/asm/kmap_types.h b/arch/m32r/include/asm/kmap_types.h index fa94dc6410ea..4cdb5e3a06bf 100644 --- a/arch/m32r/include/asm/kmap_types.h +++ b/arch/m32r/include/asm/kmap_types.h @@ -2,28 +2,11 @@ #define __M32R_KMAP_TYPES_H #ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif /* __M32R_KMAP_TYPES_H */ diff --git a/arch/m68k/include/asm/kmap_types.h b/arch/m68k/include/asm/kmap_types.h index c843c63d3801..3413cc1390ec 100644 --- a/arch/m68k/include/asm/kmap_types.h +++ b/arch/m68k/include/asm/kmap_types.h @@ -1,21 +1,6 @@ #ifndef __ASM_M68K_KMAP_TYPES_H #define __ASM_M68K_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif /* __ASM_M68K_KMAP_TYPES_H */ diff --git a/arch/microblaze/include/asm/kmap_types.h b/arch/microblaze/include/asm/kmap_types.h index 4d7e222f5dd7..25975252d83d 100644 --- a/arch/microblaze/include/asm/kmap_types.h +++ b/arch/microblaze/include/asm/kmap_types.h @@ -1,29 +1,6 @@ -/* - * Copyright (C) 2006 Atmark Techno, Inc. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. 
- */ - #ifndef _ASM_MICROBLAZE_KMAP_TYPES_H #define _ASM_MICROBLAZE_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR, -}; +#include #endif /* _ASM_MICROBLAZE_KMAP_TYPES_H */ diff --git a/arch/mips/include/asm/kmap_types.h b/arch/mips/include/asm/kmap_types.h index 806aae3c5338..58e91ed0388f 100644 --- a/arch/mips/include/asm/kmap_types.h +++ b/arch/mips/include/asm/kmap_types.h @@ -1,30 +1,12 @@ #ifndef _ASM_KMAP_TYPES_H #define _ASM_KMAP_TYPES_H - #ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif diff --git a/arch/mn10300/include/asm/kmap_types.h b/arch/mn10300/include/asm/kmap_types.h index 3398f9f35603..76d093b58d4f 100644 --- a/arch/mn10300/include/asm/kmap_types.h +++ b/arch/mn10300/include/asm/kmap_types.h @@ -1,31 +1,6 @@ -/* MN10300 kmap_atomic() slot IDs - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ #ifndef _ASM_KMAP_TYPES_H #define _ASM_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif /* _ASM_KMAP_TYPES_H */ diff --git a/arch/parisc/include/asm/kmap_types.h b/arch/parisc/include/asm/kmap_types.h index 806aae3c5338..58e91ed0388f 100644 --- a/arch/parisc/include/asm/kmap_types.h +++ b/arch/parisc/include/asm/kmap_types.h @@ -1,30 +1,12 @@ #ifndef _ASM_KMAP_TYPES_H #define _ASM_KMAP_TYPES_H - #ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif diff --git a/arch/s390/include/asm/kmap_types.h b/arch/s390/include/asm/kmap_types.h index fd1574648223..94ec3ee07983 100644 --- a/arch/s390/include/asm/kmap_types.h +++ b/arch/s390/include/asm/kmap_types.h @@ -2,22 +2,7 @@ #ifndef _ASM_KMAP_TYPES_H #define _ASM_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif #endif /* __KERNEL__ */ diff --git a/arch/sh/include/asm/kmap_types.h b/arch/sh/include/asm/kmap_types.h index 84d565c696be..5962b08b6dd8 100644 --- a/arch/sh/include/asm/kmap_types.h +++ 
b/arch/sh/include/asm/kmap_types.h @@ -3,30 +3,12 @@ /* Dummy header just to define km_type. */ - #ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif diff --git a/arch/sparc/include/asm/kmap_types.h b/arch/sparc/include/asm/kmap_types.h index 602f5e034f7a..aad21745fbb9 100644 --- a/arch/sparc/include/asm/kmap_types.h +++ b/arch/sparc/include/asm/kmap_types.h @@ -5,21 +5,6 @@ * is actually used on sparc. -DaveM */ -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h index 5759c165a5cf..9e00a731a7fb 100644 --- a/arch/x86/include/asm/kmap_types.h +++ b/arch/x86/include/asm/kmap_types.h @@ -2,28 +2,11 @@ #define _ASM_X86_KMAP_TYPES_H #if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) -# define D(n) __KM_FENCE_##n , -#else -# define D(n) +#define __WITH_KM_FENCE #endif -enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_IRQ0, -D(10) KM_IRQ1, -D(11) KM_SOFTIRQ0, -D(12) KM_SOFTIRQ1, -D(13) KM_TYPE_NR -}; +#include -#undef D +#undef __WITH_KM_FENCE #endif /* _ASM_X86_KMAP_TYPES_H */ diff --git a/arch/xtensa/include/asm/kmap_types.h b/arch/xtensa/include/asm/kmap_types.h index 9e822d2e3bce..11c687e527f1 100644 --- a/arch/xtensa/include/asm/kmap_types.h +++ b/arch/xtensa/include/asm/kmap_types.h @@ -1,31 +1,6 @@ -/* - * include/asm-xtensa/kmap_types.h - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - #ifndef _XTENSA_KMAP_TYPES_H #define _XTENSA_KMAP_TYPES_H -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; +#include #endif /* _XTENSA_KMAP_TYPES_H */ diff --git a/include/asm-generic/kmap_types.h b/include/asm-generic/kmap_types.h index 58c33055c304..54e8b3d956b7 100644 --- a/include/asm-generic/kmap_types.h +++ b/include/asm-generic/kmap_types.h @@ -1,7 +1,7 @@ #ifndef _ASM_GENERIC_KMAP_TYPES_H #define _ASM_GENERIC_KMAP_TYPES_H -#ifdef CONFIG_DEBUG_HIGHMEM +#ifdef __WITH_KM_FENCE # define D(n) __KM_FENCE_##n , #else # define D(n) -- cgit v1.2.3 From cc6f26774136b7f5307abcd3887f08360c9b7554 Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Tue, 16 Jun 2009 15:33:49 -0700 Subject: syscalls.h: remove duplicated declarations for sys_pipe2 sys_pipe2 is declared twice in include/linux/syscalls.h. 
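As a usage sketch (editorial illustration, not part of the patch), the syscall behind the duplicated declaration is reachable from userspace via the glibc pipe2() wrapper:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* pipe2() is pipe() plus creation-time flags, so the usual
	 * fcntl(F_SETFD)/fcntl(F_SETFL) follow-up calls are unnecessary. */
	int main(void)
	{
		int fds[2];
		char buf[8] = "";

		if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) < 0) {
			perror("pipe2");
			return 1;
		}
		write(fds[1], "hi", 2);
		if (read(fds[0], buf, sizeof(buf) - 1) > 0)
			printf("read back: %s\n", buf);
		close(fds[0]);
		close(fds[1]);
		return 0;
	}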
Signed-off-by: Masatake YAMATO Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 418d90f5effe..fa4242cdade8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -434,6 +434,7 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg); asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg); #endif +asmlinkage long sys_pipe(int __user *fildes); asmlinkage long sys_pipe2(int __user *fildes, int flags); asmlinkage long sys_dup(unsigned int fildes); asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd); @@ -751,8 +752,6 @@ asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, struct timespec __user *, const sigset_t __user *, size_t); -asmlinkage long sys_pipe2(int __user *, int); -asmlinkage long sys_pipe(int __user *); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -- cgit v1.2.3 From 55e331cf7ebe20665253770589cd9eb06048bf25 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 16 Jun 2009 15:33:53 -0700 Subject: drivers: add support for the TI VLYNQ bus Add support for the TI VLYNQ high-speed, serial and packetized bus. This bus allows external devices to be connected to the System-on-Chip and appear in the main system memory just like any memory mapped peripheral. It is widely used in TI's networking and multimedia SoC, including the AR7 SoC. Signed-off-by: Eugene Konev Signed-off-by: Florian Fainelli Cc: Ralf Baechle Cc: Alan Cox Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 8 + drivers/Kconfig | 2 + drivers/Makefile | 1 + drivers/vlynq/Kconfig | 20 ++ drivers/vlynq/Makefile | 5 + drivers/vlynq/vlynq.c | 814 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/vlynq.h | 161 ++++++++++ 7 files changed, 1011 insertions(+) create mode 100644 drivers/vlynq/Kconfig create mode 100644 drivers/vlynq/Makefile create mode 100644 drivers/vlynq/vlynq.c create mode 100644 include/linux/vlynq.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 2cb7566904b1..b735f7dc91d9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6281,6 +6281,14 @@ F: drivers/net/macvlan.c F: include/linux/if_*vlan.h F: net/8021q/ +VLYNQ BUS +P: Florian Fainelli +M: florian@openwrt.org +L: openwrt-devel@lists.openwrt.org +S: Maintained +F: drivers/vlynq/vlynq.c +F: include/linux/vlynq.h + VOLTAGE AND CURRENT REGULATOR FRAMEWORK P: Liam Girdwood M: lrg@slimlogic.co.uk diff --git a/drivers/Kconfig b/drivers/Kconfig index 00cf9553f740..a442c8f29fc1 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -104,6 +104,8 @@ source "drivers/auxdisplay/Kconfig" source "drivers/uio/Kconfig" +source "drivers/vlynq/Kconfig" + source "drivers/xen/Kconfig" source "drivers/staging/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 9e7d4e56c85b..00b44f4ccf03 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -105,6 +105,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ obj-$(CONFIG_VIRTIO) += virtio/ +obj-$(CONFIG_VLYNQ) += vlynq/ obj-$(CONFIG_STAGING) += staging/ obj-y += platform/ obj-y += ieee802154/ diff --git a/drivers/vlynq/Kconfig b/drivers/vlynq/Kconfig new file mode 100644 index 000000000000..f6542211db48 --- /dev/null +++ 
b/drivers/vlynq/Kconfig @@ -0,0 +1,20 @@ +menu "TI VLYNQ" + +config VLYNQ + bool "TI VLYNQ bus support" + depends on AR7 && EXPERIMENTAL + help + Support for Texas Instruments(R) VLYNQ bus. + The VLYNQ bus is a high-speed, serial and packetized + data bus which allows external peripherals of a SoC + to appear into the system's main memory. + + If unsure, say N + +config VLYNQ_DEBUG + bool "VLYNQ bus debug" + depends on VLYNQ && KERNEL_DEBUG + help + Turn on VLYNQ bus debugging. + +endmenu diff --git a/drivers/vlynq/Makefile b/drivers/vlynq/Makefile new file mode 100644 index 000000000000..b3f61149b599 --- /dev/null +++ b/drivers/vlynq/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for kernel vlynq drivers +# + +obj-$(CONFIG_VLYNQ) += vlynq.o diff --git a/drivers/vlynq/vlynq.c b/drivers/vlynq/vlynq.c new file mode 100644 index 000000000000..7335433b067b --- /dev/null +++ b/drivers/vlynq/vlynq.c @@ -0,0 +1,814 @@ +/* + * Copyright (C) 2006, 2007 Eugene Konev + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Parts of the VLYNQ specification can be found here: + * http://www.ti.com/litv/pdf/sprue36a + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define VLYNQ_CTRL_PM_ENABLE 0x80000000 +#define VLYNQ_CTRL_CLOCK_INT 0x00008000 +#define VLYNQ_CTRL_CLOCK_DIV(x) (((x) & 7) << 16) +#define VLYNQ_CTRL_INT_LOCAL 0x00004000 +#define VLYNQ_CTRL_INT_ENABLE 0x00002000 +#define VLYNQ_CTRL_INT_VECTOR(x) (((x) & 0x1f) << 8) +#define VLYNQ_CTRL_INT2CFG 0x00000080 +#define VLYNQ_CTRL_RESET 0x00000001 + +#define VLYNQ_CTRL_CLOCK_MASK (0x7 << 16) + +#define VLYNQ_INT_OFFSET 0x00000014 +#define VLYNQ_REMOTE_OFFSET 0x00000080 + +#define VLYNQ_STATUS_LINK 0x00000001 +#define VLYNQ_STATUS_LERROR 0x00000080 +#define VLYNQ_STATUS_RERROR 0x00000100 + +#define VINT_ENABLE 0x00000100 +#define VINT_TYPE_EDGE 0x00000080 +#define VINT_LEVEL_LOW 0x00000040 +#define VINT_VECTOR(x) ((x) & 0x1f) +#define VINT_OFFSET(irq) (8 * ((irq) % 4)) + +#define VLYNQ_AUTONEGO_V2 0x00010000 + +struct vlynq_regs { + u32 revision; + u32 control; + u32 status; + u32 int_prio; + u32 int_status; + u32 int_pending; + u32 int_ptr; + u32 tx_offset; + struct vlynq_mapping rx_mapping[4]; + u32 chip; + u32 autonego; + u32 unused[6]; + u32 int_device[8]; +}; + +#ifdef VLYNQ_DEBUG +static void vlynq_dump_regs(struct vlynq_device *dev) +{ + int i; + + printk(KERN_DEBUG "VLYNQ local=%p remote=%p\n", + dev->local, dev->remote); + for (i = 0; i < 32; i++) { + printk(KERN_DEBUG "VLYNQ: local %d: %08x\n", + i + 1, ((u32 *)dev->local)[i]); + printk(KERN_DEBUG "VLYNQ: remote %d: %08x\n", + i + 1, ((u32 *)dev->remote)[i]); + } +} + +static void vlynq_dump_mem(u32 *base, int count) +{ + int i; + + for (i = 0; i < (count + 3) / 4; i++) { + if (i % 4 == 0) + printk(KERN_DEBUG "\nMEM[0x%04x]:", i * 4); + 
printk(KERN_DEBUG " 0x%08x", *(base + i)); + } + printk(KERN_DEBUG "\n"); +} +#endif + +/* Check the VLYNQ link status with a given device */ +static int vlynq_linked(struct vlynq_device *dev) +{ + int i; + + for (i = 0; i < 100; i++) + if (readl(&dev->local->status) & VLYNQ_STATUS_LINK) + return 1; + else + cpu_relax(); + + return 0; +} + +static void vlynq_reset(struct vlynq_device *dev) +{ + writel(readl(&dev->local->control) | VLYNQ_CTRL_RESET, + &dev->local->control); + + /* Wait for the devices to finish resetting */ + msleep(5); + + /* Remove reset bit */ + writel(readl(&dev->local->control) & ~VLYNQ_CTRL_RESET, + &dev->local->control); + + /* Give some time for the devices to settle */ + msleep(5); +} + +static void vlynq_irq_unmask(unsigned int irq) +{ + u32 val; + struct vlynq_device *dev = get_irq_chip_data(irq); + int virq; + + BUG_ON(!dev); + virq = irq - dev->irq_start; + val = readl(&dev->remote->int_device[virq >> 2]); + val |= (VINT_ENABLE | virq) << VINT_OFFSET(virq); + writel(val, &dev->remote->int_device[virq >> 2]); +} + +static void vlynq_irq_mask(unsigned int irq) +{ + u32 val; + struct vlynq_device *dev = get_irq_chip_data(irq); + int virq; + + BUG_ON(!dev); + virq = irq - dev->irq_start; + val = readl(&dev->remote->int_device[virq >> 2]); + val &= ~(VINT_ENABLE << VINT_OFFSET(virq)); + writel(val, &dev->remote->int_device[virq >> 2]); +} + +static int vlynq_irq_type(unsigned int irq, unsigned int flow_type) +{ + u32 val; + struct vlynq_device *dev = get_irq_chip_data(irq); + int virq; + + BUG_ON(!dev); + virq = irq - dev->irq_start; + val = readl(&dev->remote->int_device[virq >> 2]); + switch (flow_type & IRQ_TYPE_SENSE_MASK) { + case IRQ_TYPE_EDGE_RISING: + case IRQ_TYPE_EDGE_FALLING: + case IRQ_TYPE_EDGE_BOTH: + val |= VINT_TYPE_EDGE << VINT_OFFSET(virq); + val &= ~(VINT_LEVEL_LOW << VINT_OFFSET(virq)); + break; + case IRQ_TYPE_LEVEL_HIGH: + val &= ~(VINT_TYPE_EDGE << VINT_OFFSET(virq)); + val &= ~(VINT_LEVEL_LOW << VINT_OFFSET(virq)); + break; + case IRQ_TYPE_LEVEL_LOW: + val &= ~(VINT_TYPE_EDGE << VINT_OFFSET(virq)); + val |= VINT_LEVEL_LOW << VINT_OFFSET(virq); + break; + default: + return -EINVAL; + } + writel(val, &dev->remote->int_device[virq >> 2]); + return 0; +} + +static void vlynq_local_ack(unsigned int irq) +{ + struct vlynq_device *dev = get_irq_chip_data(irq); + + u32 status = readl(&dev->local->status); + + pr_debug("%s: local status: 0x%08x\n", + dev_name(&dev->dev), status); + writel(status, &dev->local->status); +} + +static void vlynq_remote_ack(unsigned int irq) +{ + struct vlynq_device *dev = get_irq_chip_data(irq); + + u32 status = readl(&dev->remote->status); + + pr_debug("%s: remote status: 0x%08x\n", + dev_name(&dev->dev), status); + writel(status, &dev->remote->status); +} + +static irqreturn_t vlynq_irq(int irq, void *dev_id) +{ + struct vlynq_device *dev = dev_id; + u32 status; + int virq = 0; + + status = readl(&dev->local->int_status); + writel(status, &dev->local->int_status); + + if (unlikely(!status)) + spurious_interrupt(); + + while (status) { + if (status & 1) + do_IRQ(dev->irq_start + virq); + status >>= 1; + virq++; + } + + return IRQ_HANDLED; +} + +static struct irq_chip vlynq_irq_chip = { + .name = "vlynq", + .unmask = vlynq_irq_unmask, + .mask = vlynq_irq_mask, + .set_type = vlynq_irq_type, +}; + +static struct irq_chip vlynq_local_chip = { + .name = "vlynq local error", + .unmask = vlynq_irq_unmask, + .mask = vlynq_irq_mask, + .ack = vlynq_local_ack, +}; + +static struct irq_chip vlynq_remote_chip = { + .name = "vlynq 
local error", + .unmask = vlynq_irq_unmask, + .mask = vlynq_irq_mask, + .ack = vlynq_remote_ack, +}; + +static int vlynq_setup_irq(struct vlynq_device *dev) +{ + u32 val; + int i, virq; + + if (dev->local_irq == dev->remote_irq) { + printk(KERN_ERR + "%s: local vlynq irq should be different from remote\n", + dev_name(&dev->dev)); + return -EINVAL; + } + + /* Clear local and remote error bits */ + writel(readl(&dev->local->status), &dev->local->status); + writel(readl(&dev->remote->status), &dev->remote->status); + + /* Now setup interrupts */ + val = VLYNQ_CTRL_INT_VECTOR(dev->local_irq); + val |= VLYNQ_CTRL_INT_ENABLE | VLYNQ_CTRL_INT_LOCAL | + VLYNQ_CTRL_INT2CFG; + val |= readl(&dev->local->control); + writel(VLYNQ_INT_OFFSET, &dev->local->int_ptr); + writel(val, &dev->local->control); + + val = VLYNQ_CTRL_INT_VECTOR(dev->remote_irq); + val |= VLYNQ_CTRL_INT_ENABLE; + val |= readl(&dev->remote->control); + writel(VLYNQ_INT_OFFSET, &dev->remote->int_ptr); + writel(val, &dev->remote->int_ptr); + writel(val, &dev->remote->control); + + for (i = dev->irq_start; i <= dev->irq_end; i++) { + virq = i - dev->irq_start; + if (virq == dev->local_irq) { + set_irq_chip_and_handler(i, &vlynq_local_chip, + handle_level_irq); + set_irq_chip_data(i, dev); + } else if (virq == dev->remote_irq) { + set_irq_chip_and_handler(i, &vlynq_remote_chip, + handle_level_irq); + set_irq_chip_data(i, dev); + } else { + set_irq_chip_and_handler(i, &vlynq_irq_chip, + handle_simple_irq); + set_irq_chip_data(i, dev); + writel(0, &dev->remote->int_device[virq >> 2]); + } + } + + if (request_irq(dev->irq, vlynq_irq, IRQF_SHARED, "vlynq", dev)) { + printk(KERN_ERR "%s: request_irq failed\n", + dev_name(&dev->dev)); + return -EAGAIN; + } + + return 0; +} + +static void vlynq_device_release(struct device *dev) +{ + struct vlynq_device *vdev = to_vlynq_device(dev); + kfree(vdev); +} + +static int vlynq_device_match(struct device *dev, + struct device_driver *drv) +{ + struct vlynq_device *vdev = to_vlynq_device(dev); + struct vlynq_driver *vdrv = to_vlynq_driver(drv); + struct vlynq_device_id *ids = vdrv->id_table; + + while (ids->id) { + if (ids->id == vdev->dev_id) { + vdev->divisor = ids->divisor; + vlynq_set_drvdata(vdev, ids); + printk(KERN_INFO "Driver found for VLYNQ " + "device: %08x\n", vdev->dev_id); + return 1; + } + printk(KERN_DEBUG "Not using the %08x VLYNQ device's driver" + " for VLYNQ device: %08x\n", ids->id, vdev->dev_id); + ids++; + } + return 0; +} + +static int vlynq_device_probe(struct device *dev) +{ + struct vlynq_device *vdev = to_vlynq_device(dev); + struct vlynq_driver *drv = to_vlynq_driver(dev->driver); + struct vlynq_device_id *id = vlynq_get_drvdata(vdev); + int result = -ENODEV; + + if (drv->probe) + result = drv->probe(vdev, id); + if (result) + put_device(dev); + return result; +} + +static int vlynq_device_remove(struct device *dev) +{ + struct vlynq_driver *drv = to_vlynq_driver(dev->driver); + + if (drv->remove) + drv->remove(to_vlynq_device(dev)); + + return 0; +} + +int __vlynq_register_driver(struct vlynq_driver *driver, struct module *owner) +{ + driver->driver.name = driver->name; + driver->driver.bus = &vlynq_bus_type; + return driver_register(&driver->driver); +} +EXPORT_SYMBOL(__vlynq_register_driver); + +void vlynq_unregister_driver(struct vlynq_driver *driver) +{ + driver_unregister(&driver->driver); +} +EXPORT_SYMBOL(vlynq_unregister_driver); + +/* + * A VLYNQ remote device can clock the VLYNQ bus master + * using a dedicated clock line. 
In that case, both the + remote device and the bus master should have the same + serial clock dividers configured. Iterate through the + 8 possible dividers until we actually link with the + device. + */ +static int __vlynq_try_remote(struct vlynq_device *dev) +{ + int i; + + vlynq_reset(dev); + for (i = dev->dev_id ? vlynq_rdiv2 : vlynq_rdiv8; dev->dev_id ? + i <= vlynq_rdiv8 : i >= vlynq_rdiv2; + dev->dev_id ? i++ : i--) { + + if (!vlynq_linked(dev)) + break; + + writel((readl(&dev->remote->control) & + ~VLYNQ_CTRL_CLOCK_MASK) | + VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_DIV(i - vlynq_rdiv1), + &dev->remote->control); + writel((readl(&dev->local->control) + & ~(VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_MASK)) | + VLYNQ_CTRL_CLOCK_DIV(i - vlynq_rdiv1), + &dev->local->control); + + if (vlynq_linked(dev)) { + printk(KERN_DEBUG + "%s: using remote clock divisor %d\n", + dev_name(&dev->dev), i - vlynq_rdiv1 + 1); + dev->divisor = i; + return 0; + } else { + vlynq_reset(dev); + } + } + + return -ENODEV; +} + +/* + * A VLYNQ remote device can be clocked by the VLYNQ bus + * master using a dedicated clock line. In that case, only + * the bus master configures the serial clock divider. + * Iterate through the 8 possible dividers until we + * actually get a link with the device. + */ +static int __vlynq_try_local(struct vlynq_device *dev) +{ + int i; + + vlynq_reset(dev); + + for (i = dev->dev_id ? vlynq_ldiv2 : vlynq_ldiv8; dev->dev_id ? + i <= vlynq_ldiv8 : i >= vlynq_ldiv2; + dev->dev_id ? i++ : i--) { + + writel((readl(&dev->local->control) & + ~VLYNQ_CTRL_CLOCK_MASK) | + VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_DIV(i - vlynq_ldiv1), + &dev->local->control); + + if (vlynq_linked(dev)) { + printk(KERN_DEBUG + "%s: using local clock divisor %d\n", + dev_name(&dev->dev), i - vlynq_ldiv1 + 1); + dev->divisor = i; + return 0; + } else { + vlynq_reset(dev); + } + } + + return -ENODEV; +} + +/* + * When using external clocking method, serial clock + * is supplied by an external oscillator, therefore we + * should mask the local clock bit in the clock control + * register for both the bus master and the remote device. + */ +static int __vlynq_try_external(struct vlynq_device *dev) +{ + vlynq_reset(dev); + if (!vlynq_linked(dev)) + return -ENODEV; + + writel((readl(&dev->remote->control) & + ~VLYNQ_CTRL_CLOCK_INT), + &dev->remote->control); + + writel((readl(&dev->local->control) & + ~VLYNQ_CTRL_CLOCK_INT), + &dev->local->control); + + if (vlynq_linked(dev)) { + printk(KERN_DEBUG "%s: using external clock\n", + dev_name(&dev->dev)); + dev->divisor = vlynq_div_external; + return 0; + } + + return -ENODEV; +} + +static int __vlynq_enable_device(struct vlynq_device *dev) +{ + int result; + struct plat_vlynq_ops *ops = dev->dev.platform_data; + + result = ops->on(dev); + if (result) + return result; + + switch (dev->divisor) { + case vlynq_div_external: + case vlynq_div_auto: + /* When the device is brought from reset it should have clock + * generation negotiated by hardware.
+ * Check which device is generating clocks and perform setup + * accordingly */ + if (vlynq_linked(dev) && readl(&dev->remote->control) & + VLYNQ_CTRL_CLOCK_INT) { + if (!__vlynq_try_remote(dev) || + !__vlynq_try_local(dev) || + !__vlynq_try_external(dev)) + return 0; + } else { + if (!__vlynq_try_external(dev) || + !__vlynq_try_local(dev) || + !__vlynq_try_remote(dev)) + return 0; + } + break; + case vlynq_ldiv1: + case vlynq_ldiv2: + case vlynq_ldiv3: + case vlynq_ldiv4: + case vlynq_ldiv5: + case vlynq_ldiv6: + case vlynq_ldiv7: + case vlynq_ldiv8: + writel(VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_DIV(dev->divisor - + vlynq_ldiv1), &dev->local->control); + writel(0, &dev->remote->control); + if (vlynq_linked(dev)) { + printk(KERN_DEBUG + "%s: using local clock divisor %d\n", + dev_name(&dev->dev), + dev->divisor - vlynq_ldiv1 + 1); + return 0; + } + break; + case vlynq_rdiv1: + case vlynq_rdiv2: + case vlynq_rdiv3: + case vlynq_rdiv4: + case vlynq_rdiv5: + case vlynq_rdiv6: + case vlynq_rdiv7: + case vlynq_rdiv8: + writel(0, &dev->local->control); + writel(VLYNQ_CTRL_CLOCK_INT | + VLYNQ_CTRL_CLOCK_DIV(dev->divisor - + vlynq_rdiv1), &dev->remote->control); + if (vlynq_linked(dev)) { + printk(KERN_DEBUG + "%s: using remote clock divisor %d\n", + dev_name(&dev->dev), + dev->divisor - vlynq_rdiv1 + 1); + return 0; + } + break; + } + + ops->off(dev); + return -ENODEV; +} + +int vlynq_enable_device(struct vlynq_device *dev) +{ + struct plat_vlynq_ops *ops = dev->dev.platform_data; + int result = -ENODEV; + + result = __vlynq_enable_device(dev); + if (result) + return result; + + result = vlynq_setup_irq(dev); + if (result) + ops->off(dev); + + dev->enabled = !result; + return result; +} +EXPORT_SYMBOL(vlynq_enable_device); + + +void vlynq_disable_device(struct vlynq_device *dev) +{ + struct plat_vlynq_ops *ops = dev->dev.platform_data; + + dev->enabled = 0; + free_irq(dev->irq, dev); + ops->off(dev); +} +EXPORT_SYMBOL(vlynq_disable_device); + +int vlynq_set_local_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping) +{ + int i; + + if (!dev->enabled) + return -ENXIO; + + writel(tx_offset, &dev->local->tx_offset); + for (i = 0; i < 4; i++) { + writel(mapping[i].offset, &dev->local->rx_mapping[i].offset); + writel(mapping[i].size, &dev->local->rx_mapping[i].size); + } + return 0; +} +EXPORT_SYMBOL(vlynq_set_local_mapping); + +int vlynq_set_remote_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping) +{ + int i; + + if (!dev->enabled) + return -ENXIO; + + writel(tx_offset, &dev->remote->tx_offset); + for (i = 0; i < 4; i++) { + writel(mapping[i].offset, &dev->remote->rx_mapping[i].offset); + writel(mapping[i].size, &dev->remote->rx_mapping[i].size); + } + return 0; +} +EXPORT_SYMBOL(vlynq_set_remote_mapping); + +int vlynq_set_local_irq(struct vlynq_device *dev, int virq) +{ + int irq = dev->irq_start + virq; + if (dev->enabled) + return -EBUSY; + + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + return -EINVAL; + + if (virq == dev->remote_irq) + return -EINVAL; + + dev->local_irq = virq; + + return 0; +} +EXPORT_SYMBOL(vlynq_set_local_irq); + +int vlynq_set_remote_irq(struct vlynq_device *dev, int virq) +{ + int irq = dev->irq_start + virq; + if (dev->enabled) + return -EBUSY; + + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + return -EINVAL; + + if (virq == dev->local_irq) + return -EINVAL; + + dev->remote_irq = virq; + + return 0; +} +EXPORT_SYMBOL(vlynq_set_remote_irq); + +static int vlynq_probe(struct platform_device 
*pdev) +{ + struct vlynq_device *dev; + struct resource *regs_res, *mem_res, *irq_res; + int len, result; + + regs_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "regs"); + if (!regs_res) + return -ENODEV; + + mem_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "mem"); + if (!mem_res) + return -ENODEV; + + irq_res = platform_get_resource_byname(pdev, IORESOURCE_IRQ, "devirq"); + if (!irq_res) + return -ENODEV; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + printk(KERN_ERR + "vlynq: failed to allocate device structure\n"); + return -ENOMEM; + } + + dev->id = pdev->id; + dev->dev.bus = &vlynq_bus_type; + dev->dev.parent = &pdev->dev; + dev_set_name(&dev->dev, "vlynq%d", dev->id); + dev->dev.platform_data = pdev->dev.platform_data; + dev->dev.release = vlynq_device_release; + + dev->regs_start = regs_res->start; + dev->regs_end = regs_res->end; + dev->mem_start = mem_res->start; + dev->mem_end = mem_res->end; + + len = regs_res->end - regs_res->start; + if (!request_mem_region(regs_res->start, len, dev_name(&dev->dev))) { + printk(KERN_ERR "%s: Can't request vlynq registers\n", + dev_name(&dev->dev)); + result = -ENXIO; + goto fail_request; + } + + dev->local = ioremap(regs_res->start, len); + if (!dev->local) { + printk(KERN_ERR "%s: Can't remap vlynq registers\n", + dev_name(&dev->dev)); + result = -ENXIO; + goto fail_remap; + } + + dev->remote = (struct vlynq_regs *)((void *)dev->local + + VLYNQ_REMOTE_OFFSET); + + dev->irq = platform_get_irq_byname(pdev, "irq"); + dev->irq_start = irq_res->start; + dev->irq_end = irq_res->end; + dev->local_irq = dev->irq_end - dev->irq_start; + dev->remote_irq = dev->local_irq - 1; + + if (device_register(&dev->dev)) + goto fail_register; + platform_set_drvdata(pdev, dev); + + printk(KERN_INFO "%s: regs 0x%p, irq %d, mem 0x%p\n", + dev_name(&dev->dev), (void *)dev->regs_start, dev->irq, + (void *)dev->mem_start); + + dev->dev_id = 0; + dev->divisor = vlynq_div_auto; + result = __vlynq_enable_device(dev); + if (result == 0) { + dev->dev_id = readl(&dev->remote->chip); + ((struct plat_vlynq_ops *)(dev->dev.platform_data))->off(dev); + } + if (dev->dev_id) + printk(KERN_INFO "Found a VLYNQ device: %08x\n", dev->dev_id); + + return 0; + +fail_register: + iounmap(dev->local); +fail_remap: +fail_request: + release_mem_region(regs_res->start, len); + kfree(dev); + return result; +} + +static int vlynq_remove(struct platform_device *pdev) +{ + struct vlynq_device *dev = platform_get_drvdata(pdev); + + device_unregister(&dev->dev); + iounmap(dev->local); + release_mem_region(dev->regs_start, dev->regs_end - dev->regs_start); + + kfree(dev); + + return 0; +} + +static struct platform_driver vlynq_platform_driver = { + .driver.name = "vlynq", + .probe = vlynq_probe, + .remove = __devexit_p(vlynq_remove), +}; + +struct bus_type vlynq_bus_type = { + .name = "vlynq", + .match = vlynq_device_match, + .probe = vlynq_device_probe, + .remove = vlynq_device_remove, +}; +EXPORT_SYMBOL(vlynq_bus_type); + +static int __devinit vlynq_init(void) +{ + int res = 0; + + res = bus_register(&vlynq_bus_type); + if (res) + goto fail_bus; + + res = platform_driver_register(&vlynq_platform_driver); + if (res) + goto fail_platform; + + return 0; + +fail_platform: + bus_unregister(&vlynq_bus_type); +fail_bus: + return res; +} + +static void __devexit vlynq_exit(void) +{ + platform_driver_unregister(&vlynq_platform_driver); + bus_unregister(&vlynq_bus_type); +} + +module_init(vlynq_init); +module_exit(vlynq_exit); diff --git a/include/linux/vlynq.h 
b/include/linux/vlynq.h new file mode 100644 index 000000000000..8f6a95882b09 --- /dev/null +++ b/include/linux/vlynq.h @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2006, 2007 Eugene Konev + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __VLYNQ_H__ +#define __VLYNQ_H__ + +#include +#include +#include + +#define VLYNQ_NUM_IRQS 32 + +struct vlynq_mapping { + u32 size; + u32 offset; +}; + +enum vlynq_divisor { + vlynq_div_auto = 0, + vlynq_ldiv1, + vlynq_ldiv2, + vlynq_ldiv3, + vlynq_ldiv4, + vlynq_ldiv5, + vlynq_ldiv6, + vlynq_ldiv7, + vlynq_ldiv8, + vlynq_rdiv1, + vlynq_rdiv2, + vlynq_rdiv3, + vlynq_rdiv4, + vlynq_rdiv5, + vlynq_rdiv6, + vlynq_rdiv7, + vlynq_rdiv8, + vlynq_div_external +}; + +struct vlynq_device_id { + u32 id; + enum vlynq_divisor divisor; + unsigned long driver_data; +}; + +struct vlynq_regs; +struct vlynq_device { + u32 id, dev_id; + int local_irq; + int remote_irq; + enum vlynq_divisor divisor; + u32 regs_start, regs_end; + u32 mem_start, mem_end; + u32 irq_start, irq_end; + int irq; + int enabled; + struct vlynq_regs *local; + struct vlynq_regs *remote; + struct device dev; +}; + +struct vlynq_driver { + char *name; + struct vlynq_device_id *id_table; + int (*probe)(struct vlynq_device *dev, struct vlynq_device_id *id); + void (*remove)(struct vlynq_device *dev); + struct device_driver driver; +}; + +struct plat_vlynq_ops { + int (*on)(struct vlynq_device *dev); + void (*off)(struct vlynq_device *dev); +}; + +static inline struct vlynq_driver *to_vlynq_driver(struct device_driver *drv) +{ + return container_of(drv, struct vlynq_driver, driver); +} + +static inline struct vlynq_device *to_vlynq_device(struct device *device) +{ + return container_of(device, struct vlynq_device, dev); +} + +extern struct bus_type vlynq_bus_type; + +extern int __vlynq_register_driver(struct vlynq_driver *driver, + struct module *owner); + +static inline int vlynq_register_driver(struct vlynq_driver *driver) +{ + return __vlynq_register_driver(driver, THIS_MODULE); +} + +static inline void *vlynq_get_drvdata(struct vlynq_device *dev) +{ + return dev_get_drvdata(&dev->dev); +} + +static inline void vlynq_set_drvdata(struct vlynq_device *dev, void *data) +{ + dev_set_drvdata(&dev->dev, data); +} + +static inline u32 vlynq_mem_start(struct vlynq_device *dev) +{ + return dev->mem_start; +} + +static inline u32 vlynq_mem_end(struct vlynq_device *dev) +{ + return dev->mem_end; +} + +static inline u32 vlynq_mem_len(struct vlynq_device *dev) +{ + return dev->mem_end - dev->mem_start + 1; +} + +static inline int vlynq_virq_to_irq(struct vlynq_device *dev, int virq) +{ + int irq = dev->irq_start + virq; + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + return -EINVAL; + + return irq; +} + +static inline int vlynq_irq_to_virq(struct vlynq_device *dev, int irq) +{ + if ((irq < dev->irq_start) || (irq > dev->irq_end)) + 
return -EINVAL; + + return irq - dev->irq_start; +} + +extern void vlynq_unregister_driver(struct vlynq_driver *driver); +extern int vlynq_enable_device(struct vlynq_device *dev); +extern void vlynq_disable_device(struct vlynq_device *dev); +extern int vlynq_set_local_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping); +extern int vlynq_set_remote_mapping(struct vlynq_device *dev, u32 tx_offset, + struct vlynq_mapping *mapping); +extern int vlynq_set_local_irq(struct vlynq_device *dev, int virq); +extern int vlynq_set_remote_irq(struct vlynq_device *dev, int virq); + +#endif /* __VLYNQ_H__ */ -- cgit v1.2.3 From 8f3128e714ded7cf1e8c786c204a4f253b5d8ff4 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 16 Jun 2009 15:34:17 -0700 Subject: lis3: add click function The LIS302DL accelerometer chip has a 'click' feature which can be used to detect sudden motion on any of the three axis. Configuration data is passed via spi platform_data and no action is taken if that's not specified, so it won't harm any existing platform. To make the configuration effective, the IRQ lines need to be set up appropriately. This patch also adds a way to do that from board support code. The DD_* definitions were factored out to an own enum because they are specific to LIS3LV02D devices. Signed-off-by: Daniel Mack Acked-by: Pavel Machek Acked-by: Eric Piel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hwmon/lis3lv02d.c | 20 ++++++++++++++++++++ drivers/hwmon/lis3lv02d.h | 19 ++++++++++++++++++- drivers/hwmon/lis3lv02d_spi.c | 1 + include/linux/lis3lv02d.h | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 include/linux/lis3lv02d.h (limited to 'include') diff --git a/drivers/hwmon/lis3lv02d.c b/drivers/hwmon/lis3lv02d.c index 366190609c73..271338bdb6be 100644 --- a/drivers/hwmon/lis3lv02d.c +++ b/drivers/hwmon/lis3lv02d.c @@ -438,6 +438,26 @@ int lis3lv02d_init_device(struct lis3lv02d *dev) if (lis3lv02d_joystick_enable()) printk(KERN_ERR DRIVER_NAME ": joystick initialization failed\n"); + /* passing in platform specific data is purely optional and only + * used by the SPI transport layer at the moment */ + if (dev->pdata) { + struct lis3lv02d_platform_data *p = dev->pdata; + + if (p->click_flags && (dev->whoami == LIS_SINGLE_ID)) { + dev->write(dev, CLICK_CFG, p->click_flags); + dev->write(dev, CLICK_TIMELIMIT, p->click_time_limit); + dev->write(dev, CLICK_LATENCY, p->click_latency); + dev->write(dev, CLICK_WINDOW, p->click_window); + dev->write(dev, CLICK_THSZ, p->click_thresh_z & 0xf); + dev->write(dev, CLICK_THSY_X, + (p->click_thresh_x & 0xf) | + (p->click_thresh_y << 4)); + } + + if (p->irq_cfg) + dev->write(dev, CTRL_REG3, p->irq_cfg); + } + /* bail if we did not get an IRQ from the bus layer */ if (!dev->irq) { printk(KERN_ERR DRIVER_NAME diff --git a/drivers/hwmon/lis3lv02d.h b/drivers/hwmon/lis3lv02d.h index 5a5a196e6a66..e320e2f511f1 100644 --- a/drivers/hwmon/lis3lv02d.h +++ b/drivers/hwmon/lis3lv02d.h @@ -29,12 +29,14 @@ * They can also be connected via I²C. 
*/ +#include + /* 2-byte registers */ #define LIS_DOUBLE_ID 0x3A /* LIS3LV02D[LQ] */ /* 1-byte registers */ #define LIS_SINGLE_ID 0x3B /* LIS[32]02DL and others */ -enum lis3lv02d_reg { +enum lis3_reg { WHO_AM_I = 0x0F, OFFSET_X = 0x16, OFFSET_Y = 0x17, @@ -62,6 +64,19 @@ enum lis3lv02d_reg { FF_WU_THS_L = 0x34, FF_WU_THS_H = 0x35, FF_WU_DURATION = 0x36, +}; + +enum lis302d_reg { + CLICK_CFG = 0x38, + CLICK_SRC = 0x39, + CLICK_THSY_X = 0x3B, + CLICK_THSZ = 0x3C, + CLICK_TIMELIMIT = 0x3D, + CLICK_LATENCY = 0x3E, + CLICK_WINDOW = 0x3F, +}; + +enum lis3lv02d_reg { DD_CFG = 0x38, DD_SRC = 0x39, DD_ACK = 0x3A, @@ -183,6 +198,8 @@ struct lis3lv02d { struct fasync_struct *async_queue; /* queue for the misc device */ wait_queue_head_t misc_wait; /* Wait queue for the misc device */ unsigned long misc_opened; /* bit0: whether the device is open */ + + struct lis3lv02d_platform_data *pdata; /* for passing board config */ }; int lis3lv02d_init_device(struct lis3lv02d *lis3); diff --git a/drivers/hwmon/lis3lv02d_spi.c b/drivers/hwmon/lis3lv02d_spi.c index 07ae74b0e191..3827ff04485f 100644 --- a/drivers/hwmon/lis3lv02d_spi.c +++ b/drivers/hwmon/lis3lv02d_spi.c @@ -72,6 +72,7 @@ static int __devinit lis302dl_spi_probe(struct spi_device *spi) lis3_dev.write = lis3_spi_write; lis3_dev.irq = spi->irq; lis3_dev.ac = lis3lv02d_axis_normal; + lis3_dev.pdata = spi->dev.platform_data; spi_set_drvdata(spi, &lis3_dev); ret = lis3lv02d_init_device(&lis3_dev); diff --git a/include/linux/lis3lv02d.h b/include/linux/lis3lv02d.h new file mode 100644 index 000000000000..ad651f4e45ac --- /dev/null +++ b/include/linux/lis3lv02d.h @@ -0,0 +1,39 @@ +#ifndef __LIS3LV02D_H_ +#define __LIS3LV02D_H_ + +struct lis3lv02d_platform_data { + /* please note: the 'click' feature is only supported for + * LIS[32]02DL variants of the chip and will be ignored for + * others */ +#define LIS3_CLICK_SINGLE_X (1 << 0) +#define LIS3_CLICK_DOUBLE_X (1 << 1) +#define LIS3_CLICK_SINGLE_Y (1 << 2) +#define LIS3_CLICK_DOUBLE_Y (1 << 3) +#define LIS3_CLICK_SINGLE_Z (1 << 4) +#define LIS3_CLICK_DOUBLE_Z (1 << 5) + unsigned char click_flags; + unsigned char click_thresh_x; + unsigned char click_thresh_y; + unsigned char click_thresh_z; + unsigned char click_time_limit; + unsigned char click_latency; + unsigned char click_window; + +#define LIS3_IRQ1_DISABLE (0 << 0) +#define LIS3_IRQ1_FF_WU_1 (1 << 0) +#define LIS3_IRQ1_FF_WU_2 (2 << 0) +#define LIS3_IRQ1_FF_WU_12 (3 << 0) +#define LIS3_IRQ1_DATA_READY (4 << 0) +#define LIS3_IRQ1_CLICK (7 << 0) +#define LIS3_IRQ2_DISABLE (0 << 3) +#define LIS3_IRQ2_FF_WU_1 (1 << 3) +#define LIS3_IRQ2_FF_WU_2 (2 << 3) +#define LIS3_IRQ2_FF_WU_12 (3 << 3) +#define LIS3_IRQ2_DATA_READY (4 << 3) +#define LIS3_IRQ2_CLICK (7 << 3) +#define LIS3_IRQ_OPEN_DRAIN (1 << 6) +#define LIS3_IRQ_ACTIVE_HIGH (1 << 7) + unsigned char irq_cfg; +}; + +#endif /* __LIS3LV02D_H_ */ -- cgit v1.2.3 From ae52bb2384f721562f15f719de1acb8e934733cb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 16 Jun 2009 15:34:19 -0700 Subject: fbdev: move logo externs to header file Now we have __initconst, we can finally move the external declarations for the various Linux logo structures to . James' ack dates back to the previous submission (way to long ago), when the logos were still __initdata, which caused failures on some platforms with some toolchain versions. 
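[Editor's context: to make the convention concrete, here is a sketch of the pattern this change standardizes — the logo name and dimensions are made up (logo_example_clut224 is hypothetical), but the shape mirrors the extern that now lives in linux/linux_logo.h and the definition scripts/pnmtologo emits:

/* declared once in the shared header: */
extern const struct linux_logo logo_example_clut224;

/* emitted by scripts/pnmtologo into drivers/video/logo/: */
static const unsigned char logo_example_clut224_data[] __initconst = {
	0x01, 0x02, 0x03, /* ... one byte per pixel, indexing the clut ... */
};

static const unsigned char logo_example_clut224_clut[] __initconst = {
	0x10, 0x20, 0x30, /* ... one R, G, B triplet per clut entry ... */
};

const struct linux_logo logo_example_clut224 __initconst = {
	.type		= LINUX_LOGO_CLUT224,
	.width		= 80,
	.height		= 80,
	.clutsize	= 1,
	.clut		= logo_example_clut224_clut,
	.data		= logo_example_clut224_data
};

Because both the arrays and the struct are const and __initconst, they land in the same init section regardless of compiler version, which is what avoids the conflicting-section failures mentioned above.]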
Signed-off-by: Geert Uytterhoeven Acked-by: James Simmons Cc: Krzysztof Helt Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/prom_init.c | 3 --- arch/powerpc/platforms/cell/spu_base.c | 11 +---------- arch/um/include/shared/init.h | 2 +- drivers/video/logo/logo.c | 15 --------------- include/linux/init.h | 2 +- include/linux/linux_logo.h | 16 ++++++++++++++++ scripts/pnmtologo.c | 18 +++++++++--------- 7 files changed, 28 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 2f0e64b53642..ef6f64950e9b 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -44,10 +44,7 @@ #include #include -#ifdef CONFIG_LOGO_LINUX_CLUT224 #include -extern const struct linux_logo logo_linux_clut224; -#endif /* * Properties whose value is longer than this get excluded from our diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 9abd210d87c1..8547e86bfb42 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -752,17 +752,8 @@ static int __init init_spu_base(void) goto out_unregister_sysdev_class; } - if (ret > 0) { - /* - * We cannot put the forward declaration in - * because of conflicting session type - * conflicts for const and __initdata with different compiler - * versions - */ - extern const struct linux_logo logo_spe_clut224; - + if (ret > 0) fb_append_extra_logo(&logo_spe_clut224, ret); - } mutex_lock(&spu_full_list_mutex); xmon_register_spus(&spu_full_list); diff --git a/arch/um/include/shared/init.h b/arch/um/include/shared/init.h index 37dd097c16c0..b3906f860a87 100644 --- a/arch/um/include/shared/init.h +++ b/arch/um/include/shared/init.h @@ -27,7 +27,7 @@ * sign followed by value, e.g.: * * static int init_variable __initdata = 0; - * static char linux_logo[] __initdata = { 0x32, 0x36, ... }; + * static const char linux_logo[] __initconst = { 0x32, 0x36, ... }; * * Don't forget to initialize data not at file scope, i.e. within a function, * as gcc otherwise puts the data into the bss section and not into the init diff --git a/drivers/video/logo/logo.c b/drivers/video/logo/logo.c index 2e85a2b52d05..ea7a8ccc830c 100644 --- a/drivers/video/logo/logo.c +++ b/drivers/video/logo/logo.c @@ -21,21 +21,6 @@ #include #endif -extern const struct linux_logo logo_linux_mono; -extern const struct linux_logo logo_linux_vga16; -extern const struct linux_logo logo_linux_clut224; -extern const struct linux_logo logo_blackfin_vga16; -extern const struct linux_logo logo_blackfin_clut224; -extern const struct linux_logo logo_dec_clut224; -extern const struct linux_logo logo_mac_clut224; -extern const struct linux_logo logo_parisc_clut224; -extern const struct linux_logo logo_sgi_clut224; -extern const struct linux_logo logo_sun_clut224; -extern const struct linux_logo logo_superh_mono; -extern const struct linux_logo logo_superh_vga16; -extern const struct linux_logo logo_superh_clut224; -extern const struct linux_logo logo_m32r_clut224; - static int nologo; module_param(nologo, bool, 0); MODULE_PARM_DESC(nologo, "Disables startup logo"); diff --git a/include/linux/init.h b/include/linux/init.h index b2189803f19a..8c2c9989626d 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -29,7 +29,7 @@ * sign followed by value, e.g.: * * static int init_variable __initdata = 0; - * static char linux_logo[] __initdata = { 0x32, 0x36, ... 
}; + * static const char linux_logo[] __initconst = { 0x32, 0x36, ... }; * * Don't forget to initialize data not at file scope, i.e. within a function, * as gcc otherwise puts the data into the bss section and not into the init diff --git a/include/linux/linux_logo.h b/include/linux/linux_logo.h index 08a92969c76e..ca5bd91d12e1 100644 --- a/include/linux/linux_logo.h +++ b/include/linux/linux_logo.h @@ -32,6 +32,22 @@ struct linux_logo { const unsigned char *data; }; +extern const struct linux_logo logo_linux_mono; +extern const struct linux_logo logo_linux_vga16; +extern const struct linux_logo logo_linux_clut224; +extern const struct linux_logo logo_blackfin_vga16; +extern const struct linux_logo logo_blackfin_clut224; +extern const struct linux_logo logo_dec_clut224; +extern const struct linux_logo logo_mac_clut224; +extern const struct linux_logo logo_parisc_clut224; +extern const struct linux_logo logo_sgi_clut224; +extern const struct linux_logo logo_sun_clut224; +extern const struct linux_logo logo_superh_mono; +extern const struct linux_logo logo_superh_vga16; +extern const struct linux_logo logo_superh_clut224; +extern const struct linux_logo logo_m32r_clut224; +extern const struct linux_logo logo_spe_clut224; + extern const struct linux_logo *fb_find_logo(int depth); #ifdef CONFIG_FB_LOGO_EXTRA extern void fb_append_extra_logo(const struct linux_logo *logo, diff --git a/scripts/pnmtologo.c b/scripts/pnmtologo.c index 6aa2a2483f8d..64f5ddb09ea6 100644 --- a/scripts/pnmtologo.c +++ b/scripts/pnmtologo.c @@ -237,22 +237,22 @@ static void write_header(void) fprintf(out, " * Linux logo %s\n", logoname); fputs(" */\n\n", out); fputs("#include \n\n", out); - fprintf(out, "static unsigned char %s_data[] __initdata = {\n", + fprintf(out, "static const unsigned char %s_data[] __initconst = {\n", logoname); } static void write_footer(void) { fputs("\n};\n\n", out); - fprintf(out, "struct linux_logo %s __initdata = {\n", logoname); - fprintf(out, " .type\t= %s,\n", logo_types[logo_type]); - fprintf(out, " .width\t= %d,\n", logo_width); - fprintf(out, " .height\t= %d,\n", logo_height); + fprintf(out, "const struct linux_logo %s __initconst = {\n", logoname); + fprintf(out, "\t.type\t\t= %s,\n", logo_types[logo_type]); + fprintf(out, "\t.width\t\t= %d,\n", logo_width); + fprintf(out, "\t.height\t\t= %d,\n", logo_height); if (logo_type == LINUX_LOGO_CLUT224) { - fprintf(out, " .clutsize\t= %d,\n", logo_clutsize); - fprintf(out, " .clut\t= %s_clut,\n", logoname); + fprintf(out, "\t.clutsize\t= %d,\n", logo_clutsize); + fprintf(out, "\t.clut\t\t= %s_clut,\n", logoname); } - fprintf(out, " .data\t= %s_data\n", logoname); + fprintf(out, "\t.data\t\t= %s_data\n", logoname); fputs("};\n\n", out); /* close logo file */ @@ -374,7 +374,7 @@ static void write_logo_clut224(void) fputs("\n};\n\n", out); /* write logo clut */ - fprintf(out, "static unsigned char %s_clut[] __initdata = {\n", + fprintf(out, "static const unsigned char %s_clut[] __initconst = {\n", logoname); write_hex_cnt = 0; for (i = 0; i < logo_clutsize; i++) { -- cgit v1.2.3 From 4410f3910947dcea8672280b3adecd53cec4e85e Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 16 Jun 2009 15:34:38 -0700 Subject: fbdev: add support for handoff from firmware to hw framebuffers With KMS we have ran into an issue where we really want the KMS fb driver to be the one running the console, so panics etc can be shown by switching out of X etc. 
However with vesafb/efifb built-in, we end up with those on fb0 and the KMS fb driver on fb1, driving the same piece of hw, so this adds an fb info flag to denote a firmware fbdev, and adds a new aperture base/size range which can be compared when the hw drivers are installed to see if there is a conflict with a firmware driver, and if there is the firmware driver is unregistered and the hw driver takes over. It uses new aperture_base/size members instead of comparing on the fix smem_start/length, as smem_start/length might for example only cover the first 1MB of the PCI aperture, and we could allocate the kms fb from 8MB into the aperture, thus they would never overlap. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Dave Airlie Acked-by: Peter Jones Cc: Geert Uytterhoeven Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/intel_fb.c | 8 ++++++++ drivers/video/efifb.c | 5 ++++- drivers/video/fbmem.c | 31 +++++++++++++++++++++++++++++++ drivers/video/vesafb.c | 15 ++++++++++++++- include/linux/fb.h | 12 +++++++++++- 5 files changed, 68 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/intel_fb.c b/drivers/gpu/drm/i915/intel_fb.c index 0ecf6b76a401..8e28e5993df5 100644 --- a/drivers/gpu/drm/i915/intel_fb.c +++ b/drivers/gpu/drm/i915/intel_fb.c @@ -504,6 +504,14 @@ static int intelfb_create(struct drm_device *dev, uint32_t fb_width, info->fbops = &intelfb_ops; info->fix.line_length = fb->pitch; + + /* setup aperture base/size for vesafb takeover */ + info->aperture_base = dev->mode_config.fb_base; + if (IS_I9XX(dev)) + info->aperture_size = pci_resource_len(dev->pdev, 2); + else + info->aperture_size = pci_resource_len(dev->pdev, 0); + info->fix.smem_start = dev->mode_config.fb_base + obj_priv->gtt_offset; info->fix.smem_len = size; diff --git a/drivers/video/efifb.c b/drivers/video/efifb.c index 8dea2bc92705..eb12182b2059 100644 --- a/drivers/video/efifb.c +++ b/drivers/video/efifb.c @@ -280,6 +280,9 @@ static int __init efifb_probe(struct platform_device *dev) info->pseudo_palette = info->par; info->par = NULL; + info->aperture_base = efifb_fix.smem_start; + info->aperture_size = size_total; + info->screen_base = ioremap(efifb_fix.smem_start, efifb_fix.smem_len); if (!info->screen_base) { printk(KERN_ERR "efifb: abort, cannot ioremap video memory " @@ -337,7 +340,7 @@ static int __init efifb_probe(struct platform_device *dev) info->fbops = &efifb_ops; info->var = efifb_defined; info->fix = efifb_fix; - info->flags = FBINFO_FLAG_DEFAULT; + info->flags = FBINFO_FLAG_DEFAULT | FBINFO_MISC_FIRMWARE; if ((err = fb_alloc_cmap(&info->cmap, 256, 0)) < 0) { printk(KERN_ERR "efifb: cannot allocate colormap\n"); diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index d412a1ddc12f..f8a09bf8d0cd 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1462,6 +1462,16 @@ static int fb_check_foreignness(struct fb_info *fi) return 0; } +static bool fb_do_apertures_overlap(struct fb_info *gen, struct fb_info *hw) +{ + /* is the generic aperture base the same as the HW one */ + if (gen->aperture_base == hw->aperture_base) + return true; + /* is the generic aperture base inside the hw base->hw base+size */ + if (gen->aperture_base > hw->aperture_base && gen->aperture_base <= hw->aperture_base + hw->aperture_size) + return true; + return false; +} /** * register_framebuffer - registers a frame buffer device * @fb_info: frame buffer info structure @@ -1485,6 +1495,23 @@ 
register_framebuffer(struct fb_info *fb_info) if (fb_check_foreignness(fb_info)) return -ENOSYS; + /* check all firmware fbs and kick off if the base addr overlaps */ + for (i = 0 ; i < FB_MAX; i++) { + if (!registered_fb[i]) + continue; + + if (registered_fb[i]->flags & FBINFO_MISC_FIRMWARE) { + if (fb_do_apertures_overlap(registered_fb[i], fb_info)) { + printk(KERN_ERR "fb: conflicting fb hw usage " + "%s vs %s - removing generic driver\n", + fb_info->fix.id, + registered_fb[i]->fix.id); + unregister_framebuffer(registered_fb[i]); + break; + } + } + } + num_registered_fb++; for (i = 0 ; i < FB_MAX; i++) if (!registered_fb[i]) @@ -1586,6 +1613,10 @@ unregister_framebuffer(struct fb_info *fb_info) device_destroy(fb_class, MKDEV(FB_MAJOR, i)); event.info = fb_info; fb_notifier_call_chain(FB_EVENT_FB_UNREGISTERED, &event); + + /* this may free fb info */ + if (fb_info->fbops->fb_destroy) + fb_info->fbops->fb_destroy(fb_info); done: return ret; } diff --git a/drivers/video/vesafb.c b/drivers/video/vesafb.c index d6856f43d241..bd37ee1f6a25 100644 --- a/drivers/video/vesafb.c +++ b/drivers/video/vesafb.c @@ -174,8 +174,17 @@ static int vesafb_setcolreg(unsigned regno, unsigned red, unsigned green, return err; } +static void vesafb_destroy(struct fb_info *info) +{ + if (info->screen_base) + iounmap(info->screen_base); + release_mem_region(info->aperture_base, info->aperture_size); + framebuffer_release(info); +} + static struct fb_ops vesafb_ops = { .owner = THIS_MODULE, + .fb_destroy = vesafb_destroy, .fb_setcolreg = vesafb_setcolreg, .fb_pan_display = vesafb_pan_display, .fb_fillrect = cfb_fillrect, @@ -286,6 +295,10 @@ static int __init vesafb_probe(struct platform_device *dev) info->pseudo_palette = info->par; info->par = NULL; + /* set vesafb aperture size for generic probing */ + info->aperture_base = screen_info.lfb_base; + info->aperture_size = size_total; + info->screen_base = ioremap(vesafb_fix.smem_start, vesafb_fix.smem_len); if (!info->screen_base) { printk(KERN_ERR @@ -437,7 +450,7 @@ static int __init vesafb_probe(struct platform_device *dev) info->fbops = &vesafb_ops; info->var = vesafb_defined; info->fix = vesafb_fix; - info->flags = FBINFO_FLAG_DEFAULT | + info->flags = FBINFO_FLAG_DEFAULT | FBINFO_MISC_FIRMWARE | (ypan ? FBINFO_HWACCEL_YPAN : 0); if (!ypan) diff --git a/include/linux/fb.h b/include/linux/fb.h index 330c4b1bfcaa..3c5562a52167 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -677,6 +677,9 @@ struct fb_ops { /* get capability given var */ void (*fb_get_caps)(struct fb_info *info, struct fb_blit_caps *caps, struct fb_var_screeninfo *var); + + /* teardown any resources to do with this framebuffer */ + void (*fb_destroy)(struct fb_info *info); }; #ifdef CONFIG_FB_TILEBLITTING @@ -786,6 +789,8 @@ struct fb_tile_ops { #define FBINFO_MISC_USEREVENT 0x10000 /* event request from userspace */ #define FBINFO_MISC_TILEBLITTING 0x20000 /* use tile blitting */ +#define FBINFO_MISC_FIRMWARE 0x40000 /* a replaceable firmware + inited framebuffer */ /* A driver may set this flag to indicate that it does want a set_par to be * called every time when fbcon_switch is executed. 
The advantage is that with @@ -854,7 +859,12 @@ struct fb_info { u32 state; /* Hardware state i.e suspend */ void *fbcon_par; /* fbcon use-only private area */ /* From here on everything is device dependent */ - void *par; + void *par; + /* we need the PCI or similiar aperture base/size not + smem_start/size as smem_start may just be an object + allocated inside the aperture so may not actually overlap */ + resource_size_t aperture_base; + resource_size_t aperture_size; }; #ifdef MODULE -- cgit v1.2.3 From 3ed167af96ed098187ea41353fe02d1af20d38a1 Mon Sep 17 00:00:00 2001 From: Kristoffer Ericson Date: Tue, 16 Jun 2009 15:34:40 -0700 Subject: fbdev: s1d13xxxfb: add accelerated bitblt functions Add accelerated bitblt functions to s1d13xxx based video chipsets, more specificly functions copyarea and fillrect. It has only been tested and activated for 13506 chipsets but is expected to work for the majority of s1d13xxx based chips. This patch also cleans up the driver with respect of whitespaces and other formatting issues. We update the current status comments. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kristoffer Ericson Cc: Russell King Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/s1d13xxxfb.c | 341 ++++++++++++++++++++++++++++++++++++++++----- include/video/s1d13xxxfb.h | 9 ++ 2 files changed, 317 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/drivers/video/s1d13xxxfb.c b/drivers/video/s1d13xxxfb.c index 0726aecf3b7e..0deb0a8867b7 100644 --- a/drivers/video/s1d13xxxfb.c +++ b/drivers/video/s1d13xxxfb.c @@ -2,6 +2,7 @@ * * (c) 2004 Simtec Electronics * (c) 2005 Thibaut VARENE + * (c) 2009 Kristoffer Ericson * * Driver for Epson S1D13xxx series framebuffer chips * @@ -10,18 +11,10 @@ * linux/drivers/video/epson1355fb.c * linux/drivers/video/epson/s1d13xxxfb.c (2.4 driver by Epson) * - * Note, currently only tested on S1D13806 with 16bit CRT. - * As such, this driver might still contain some hardcoded bits relating to - * S1D13806. - * Making it work on other S1D13XXX chips should merely be a matter of adding - * a few switch()s, some missing glue here and there maybe, and split header - * files. - * * TODO: - handle dual screen display (CRT and LCD at the same time). * - check_var(), mode change, etc. - * - PM untested. - * - Accelerated interfaces. - * - Probably not SMP safe :) + * - probably not SMP safe :) + * - support all bitblt operations on all cards * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive for @@ -31,19 +24,24 @@ #include #include #include - #include #include #include #include #include +#include +#include #include #include