From 8005aba69a6440a535a4cc2aed99ffca580847e0 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 21 Jun 2005 15:39:22 -0700
Subject: [SPARC64]: Fix cmsg length checks in Solaris emulation layer.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc64/solaris/socket.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/sparc64')

diff --git a/arch/sparc64/solaris/socket.c b/arch/sparc64/solaris/socket.c
index ec8e074c4eac..06740582717e 100644
--- a/arch/sparc64/solaris/socket.c
+++ b/arch/sparc64/solaris/socket.c
@@ -317,8 +317,10 @@ asmlinkage int solaris_sendmsg(int fd, struct sol_nmsghdr __user *user_msg, unsi
 		unsigned long *kcmsg;
 		compat_size_t cmlen;
 
-		if(kern_msg.msg_controllen > sizeof(ctl) &&
-		   kern_msg.msg_controllen <= 256) {
+		if (kern_msg.msg_controllen <= sizeof(compat_size_t))
+			return -EINVAL;
+
+		if(kern_msg.msg_controllen > sizeof(ctl)) {
 			err = -ENOBUFS;
 			ctl_buf = kmalloc(kern_msg.msg_controllen, GFP_KERNEL);
 			if(!ctl_buf)
-- 
cgit v1.2.3


From 39c715b71740c4a78ba4769fb54826929bac03cb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 21 Jun 2005 17:14:34 -0700
Subject: [PATCH] smp_processor_id() cleanup

This patch implements a number of smp_processor_id() cleanup ideas that
Arjan van de Ven and I came up with.

The previous __smp_processor_id/_smp_processor_id/smp_processor_id API
spaghetti was hard to follow both on the implementational and on the
usage side.

Some of the complexity arose from picking wrong names, some of the
complexity comes from the fact that not all architectures defined
__smp_processor_id.

In the new code, there are two externally visible symbols:

 - smp_processor_id(): debug variant.

 - raw_smp_processor_id(): nondebug variant. Replaces all existing
   uses of _smp_processor_id() and __smp_processor_id(). Defined
   by every SMP architecture in include/asm-*/smp.h.

There is one new internal symbol, dependent on DEBUG_PREEMPT:

 - debug_smp_processor_id(): internal debug variant, mapped to
                             smp_processor_id().

Also, i moved debug_smp_processor_id() from lib/kernel_lock.c into a new
lib/smp_processor_id.c file.  All related comments got updated and/or
clarified.

I have build/boot tested the following 8 .config combinations on x86:

 {SMP,UP} x {PREEMPT,!PREEMPT} x {DEBUG_PREEMPT,!DEBUG_PREEMPT}

I have also build/boot tested x64 on UP/PREEMPT/DEBUG_PREEMPT.  (Other
architectures are untested, but should work just fine.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/traps.c          |  2 +-
 arch/i386/lib/delay.c             |  2 +-
 arch/ppc/lib/locks.c              |  4 +--
 arch/ppc64/kernel/idle.c          |  2 +-
 arch/sh/lib/delay.c               |  2 +-
 arch/sparc64/lib/delay.c          |  2 +-
 arch/x86_64/lib/delay.c           |  2 +-
 drivers/acpi/processor_idle.c     |  2 +-
 drivers/input/gameport/gameport.c |  2 +-
 drivers/oprofile/buffer_sync.c    |  4 +--
 fs/xfs/linux-2.6/xfs_linux.h      |  6 ++---
 include/asm-alpha/smp.h           |  2 +-
 include/asm-arm/smp.h             |  2 +-
 include/asm-i386/smp.h            |  2 +-
 include/asm-ia64/smp.h            |  2 +-
 include/asm-m32r/smp.h            |  2 +-
 include/asm-mips/smp.h            |  2 +-
 include/asm-parisc/smp.h          |  2 +-
 include/asm-ppc/smp.h             |  2 +-
 include/asm-ppc64/smp.h           |  2 +-
 include/asm-s390/smp.h            |  2 +-
 include/asm-sh/smp.h              |  2 +-
 include/asm-sparc/smp.h           |  2 +-
 include/asm-sparc64/smp.h         |  2 +-
 include/asm-um/smp.h              |  3 ++-
 include/asm-x86_64/smp.h          |  2 +-
 include/linux/mmzone.h            |  2 +-
 include/linux/smp.h               | 40 ++++++++++++----------------
 include/net/route.h               |  2 +-
 include/net/snmp.h                | 14 +++++-----
 kernel/module.c                   |  2 +-
 kernel/power/smp.c                |  4 +--
 kernel/sched.c                    |  4 +--
 kernel/stop_machine.c             |  4 +--
 lib/Makefile                      |  1 +
 lib/kernel_lock.c                 | 55 ---------------------------------------
 lib/smp_processor_id.c            | 55 +++++++++++++++++++++++++++++++++++++++
 37 files changed, 119 insertions(+), 125 deletions(-)
 create mode 100644 lib/smp_processor_id.c

(limited to 'arch/sparc64')

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 00c63419c06f..83c579e82a81 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -306,7 +306,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 	};
 	static int die_counter;
 
-	if (die.lock_owner != _smp_processor_id()) {
+	if (die.lock_owner != raw_smp_processor_id()) {
 		console_verbose();
 		spin_lock_irq(&die.lock);
 		die.lock_owner = smp_processor_id();
diff --git a/arch/i386/lib/delay.c b/arch/i386/lib/delay.c
index 080639f262b1..eb0cdfe9280f 100644
--- a/arch/i386/lib/delay.c
+++ b/arch/i386/lib/delay.c
@@ -34,7 +34,7 @@ inline void __const_udelay(unsigned long xloops)
 	xloops *= 4;
 	__asm__("mull %0"
 		:"=d" (xloops), "=&a" (d0)
-		:"1" (xloops),"0" (cpu_data[_smp_processor_id()].loops_per_jiffy * (HZ/4)));
+		:"1" (xloops),"0" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4)));
         __delay(++xloops);
 }
 
diff --git a/arch/ppc/lib/locks.c b/arch/ppc/lib/locks.c
index 694163d696d8..c450dc4b766e 100644
--- a/arch/ppc/lib/locks.c
+++ b/arch/ppc/lib/locks.c
@@ -130,7 +130,7 @@ void _raw_read_lock(rwlock_t *rw)
 		while (!read_can_lock(rw)) {
 			if (--stuck == 0) {
 				printk("_read_lock(%p) CPU#%d lock %d\n",
-				       rw, _smp_processor_id(), rw->lock);
+				       rw, raw_smp_processor_id(), rw->lock);
 				stuck = INIT_STUCK;
 			}
 		}
@@ -158,7 +158,7 @@ void _raw_write_lock(rwlock_t *rw)
 		while (!write_can_lock(rw)) {
 			if (--stuck == 0) {
 				printk("write_lock(%p) CPU#%d lock %d)\n",
-				       rw, _smp_processor_id(), rw->lock);
+				       rw, raw_smp_processor_id(), rw->lock);
 				stuck = INIT_STUCK;
 			}
 		}
diff --git a/arch/ppc64/kernel/idle.c b/arch/ppc64/kernel/idle.c
index f24ce2b87200..ff8a7db142d3 100644
--- a/arch/ppc64/kernel/idle.c
+++ b/arch/ppc64/kernel/idle.c
@@ -292,7 +292,7 @@ static int native_idle(void)
 		if (need_resched())
 			schedule();
 
-		if (cpu_is_offline(_smp_processor_id()) &&
+		if (cpu_is_offline(raw_smp_processor_id()) &&
 		    system_state == SYSTEM_RUNNING)
 			cpu_die();
 	}
diff --git a/arch/sh/lib/delay.c b/arch/sh/lib/delay.c
index 50b36037d86b..351714694d6d 100644
--- a/arch/sh/lib/delay.c
+++ b/arch/sh/lib/delay.c
@@ -24,7 +24,7 @@ inline void __const_udelay(unsigned long xloops)
 	__asm__("dmulu.l	%0, %2\n\t"
 		"sts	mach, %0"
 		: "=r" (xloops)
-		: "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy)
+		: "0" (xloops), "r" (cpu_data[raw_smp_processor_id()].loops_per_jiffy)
 		: "macl", "mach");
 	__delay(xloops * HZ);
 }
diff --git a/arch/sparc64/lib/delay.c b/arch/sparc64/lib/delay.c
index f6b4c784d53e..e8808727617a 100644
--- a/arch/sparc64/lib/delay.c
+++ b/arch/sparc64/lib/delay.c
@@ -31,7 +31,7 @@ void __const_udelay(unsigned long n)
 {
 	n *= 4;
 
-	n *= (cpu_data(_smp_processor_id()).udelay_val * (HZ/4));
+	n *= (cpu_data(raw_smp_processor_id()).udelay_val * (HZ/4));
 	n >>= 32;
 
 	__delay(n + 1);
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
index 6e2d66472eb1..aed61a668a1b 100644
--- a/arch/x86_64/lib/delay.c
+++ b/arch/x86_64/lib/delay.c
@@ -34,7 +34,7 @@ void __delay(unsigned long loops)
 
 inline void __const_udelay(unsigned long xloops)
 {
-	__delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ);
+	__delay(((xloops * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) * HZ);
 }
 
 void __udelay(unsigned long usecs)
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index ff64d333e95f..c9d671cf7857 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -171,7 +171,7 @@ static void acpi_processor_idle (void)
 	int			sleep_ticks = 0;
 	u32			t1, t2 = 0;
 
-	pr = processors[_smp_processor_id()];
+	pr = processors[raw_smp_processor_id()];
 	if (!pr)
 		return;
 
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index 9b8ff396e6f8..e152d0fa0cdd 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -134,7 +134,7 @@ static int gameport_measure_speed(struct gameport *gameport)
 	}
 
 	gameport_close(gameport);
-	return (cpu_data[_smp_processor_id()].loops_per_jiffy * (unsigned long)HZ / (1000 / 50)) / (tx < 1 ? 1 : tx);
+	return (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (unsigned long)HZ / (1000 / 50)) / (tx < 1 ? 1 : tx);
 
 #else
 
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index 55720dc6ec43..745a14183634 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -62,7 +62,7 @@ static int task_exit_notify(struct notifier_block * self, unsigned long val, voi
 	/* To avoid latency problems, we only process the current CPU,
 	 * hoping that most samples for the task are on this CPU
 	 */
-	sync_buffer(_smp_processor_id());
+	sync_buffer(raw_smp_processor_id());
   	return 0;
 }
 
@@ -86,7 +86,7 @@ static int munmap_notify(struct notifier_block * self, unsigned long val, void *
 		/* To avoid latency problems, we only process the current CPU,
 		 * hoping that most samples for the task are on this CPU
 		 */
-		sync_buffer(_smp_processor_id());
+		sync_buffer(raw_smp_processor_id());
 		return 0;
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 71bb41019a12..7d7c8788ea75 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -145,10 +145,10 @@ static inline void set_buffer_unwritten_io(struct buffer_head *bh)
 #define xfs_inherit_nosymlinks	xfs_params.inherit_nosym.val
 #define xfs_rotorstep		xfs_params.rotorstep.val
 
-#ifndef __smp_processor_id
-#define __smp_processor_id()	smp_processor_id()
+#ifndef raw_smp_processor_id
+#define raw_smp_processor_id()	smp_processor_id()
 #endif
-#define current_cpu()		__smp_processor_id()
+#define current_cpu()		raw_smp_processor_id()
 #define current_pid()		(current->pid)
 #define current_fsuid(cred)	(current->fsuid)
 #define current_fsgid(cred)	(current->fsgid)
diff --git a/include/asm-alpha/smp.h b/include/asm-alpha/smp.h
index cbc173ae45aa..9950706abdf8 100644
--- a/include/asm-alpha/smp.h
+++ b/include/asm-alpha/smp.h
@@ -43,7 +43,7 @@ extern struct cpuinfo_alpha cpu_data[NR_CPUS];
 #define PROC_CHANGE_PENALTY     20
 
 #define hard_smp_processor_id()	__hard_smp_processor_id()
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id()	(current_thread_info()->cpu)
 
 extern cpumask_t cpu_present_mask;
 extern cpumask_t cpu_online_map;
diff --git a/include/asm-arm/smp.h b/include/asm-arm/smp.h
index bd44f894690f..6c6c60adbbaa 100644
--- a/include/asm-arm/smp.h
+++ b/include/asm-arm/smp.h
@@ -21,7 +21,7 @@
 # error "<asm-arm/smp.h> included in non-SMP build"
 #endif
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id() (current_thread_info()->cpu)
 
 extern cpumask_t cpu_present_mask;
 #define cpu_possible_map cpu_present_mask
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index e03a206dfa36..55ef31f66bbe 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -51,7 +51,7 @@ extern u8 x86_cpu_to_apicid[];
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
  */
-#define __smp_processor_id() (current_thread_info()->cpu)
+#define raw_smp_processor_id() (current_thread_info()->cpu)
 
 extern cpumask_t cpu_callout_map;
 extern cpumask_t cpu_callin_map;
diff --git a/include/asm-ia64/smp.h b/include/asm-ia64/smp.h
index 3ba1a061e4ae..a3914352c995 100644
--- a/include/asm-ia64/smp.h
+++ b/include/asm-ia64/smp.h
@@ -46,7 +46,7 @@ ia64_get_lid (void)
 #define SMP_IRQ_REDIRECTION	(1 << 0)
 #define SMP_IPI_REDIRECTION	(1 << 1)
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id() (current_thread_info()->cpu)
 
 extern struct smp_boot_data {
 	int cpu_count;
diff --git a/include/asm-m32r/smp.h b/include/asm-m32r/smp.h
index 8cd4d0da4be1..b9a20cdad65f 100644
--- a/include/asm-m32r/smp.h
+++ b/include/asm-m32r/smp.h
@@ -66,7 +66,7 @@ extern volatile int cpu_2_physid[NR_CPUS];
 #define physid_to_cpu(physid)	physid_2_cpu[physid]
 #define cpu_to_physid(cpu_id)	cpu_2_physid[cpu_id]
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id()	(current_thread_info()->cpu)
 
 extern cpumask_t cpu_callout_map;
 #define cpu_possible_map cpu_callout_map
diff --git a/include/asm-mips/smp.h b/include/asm-mips/smp.h
index 8ba370ecfd4c..5618f1e12f40 100644
--- a/include/asm-mips/smp.h
+++ b/include/asm-mips/smp.h
@@ -21,7 +21,7 @@
 #include <linux/cpumask.h>
 #include <asm/atomic.h>
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id() (current_thread_info()->cpu)
 
 /* Map from cpu id to sequential logical cpu number.  This will only
    not be idempotent when cpus failed to come on-line.  */
diff --git a/include/asm-parisc/smp.h b/include/asm-parisc/smp.h
index fde77ac35463..9413f67a540b 100644
--- a/include/asm-parisc/smp.h
+++ b/include/asm-parisc/smp.h
@@ -51,7 +51,7 @@ extern void smp_send_reschedule(int cpu);
 
 extern unsigned long cpu_present_mask;
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id()	(current_thread_info()->cpu)
 
 #endif /* CONFIG_SMP */
 
diff --git a/include/asm-ppc/smp.h b/include/asm-ppc/smp.h
index ebfb614f55f6..17530c232c76 100644
--- a/include/asm-ppc/smp.h
+++ b/include/asm-ppc/smp.h
@@ -44,7 +44,7 @@ extern void smp_message_recv(int, struct pt_regs *);
 #define NO_PROC_ID		0xFF            /* No processor magic marker */
 #define PROC_CHANGE_PENALTY	20
 
-#define smp_processor_id() (current_thread_info()->cpu)
+#define raw_smp_processor_id()	(current_thread_info()->cpu)
 
 extern int __cpu_up(unsigned int cpu);
 
diff --git a/include/asm-ppc64/smp.h b/include/asm-ppc64/smp.h
index c8646fa999c2..8115ecb8feee 100644
--- a/include/asm-ppc64/smp.h
+++ b/include/asm-ppc64/smp.h
@@ -45,7 +45,7 @@ void generic_cpu_die(unsigned int cpu);
 void generic_mach_cpu_die(void);
 #endif
 
-#define __smp_processor_id() (get_paca()->paca_index)
+#define raw_smp_processor_id()	(get_paca()->paca_index)
 #define hard_smp_processor_id() (get_paca()->hw_cpu_id)
 
 extern cpumask_t cpu_sibling_map[NR_CPUS];
diff --git a/include/asm-s390/smp.h b/include/asm-s390/smp.h
index 9473786387a3..dd50e57a928f 100644
--- a/include/asm-s390/smp.h
+++ b/include/asm-s390/smp.h
@@ -47,7 +47,7 @@ extern int smp_call_function_on(void (*func) (void *info), void *info,
  
 #define PROC_CHANGE_PENALTY	20		/* Schedule penalty */
 
-#define smp_processor_id() (S390_lowcore.cpu_data.cpu_nr)
+#define raw_smp_processor_id()	(S390_lowcore.cpu_data.cpu_nr)
 
 extern int smp_get_cpu(cpumask_t cpu_map);
 extern void smp_put_cpu(int cpu);
diff --git a/include/asm-sh/smp.h b/include/asm-sh/smp.h
index 38b54469d7d1..f19a8b3b69a6 100644
--- a/include/asm-sh/smp.h
+++ b/include/asm-sh/smp.h
@@ -25,7 +25,7 @@ extern cpumask_t cpu_possible_map;
 
 #define cpu_online(cpu)		cpu_isset(cpu, cpu_online_map)
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id()	(current_thread_info()->cpu)
 
 /* I've no idea what the real meaning of this is */
 #define PROC_CHANGE_PENALTY	20
diff --git a/include/asm-sparc/smp.h b/include/asm-sparc/smp.h
index f986c0d0922a..4f96d8333a12 100644
--- a/include/asm-sparc/smp.h
+++ b/include/asm-sparc/smp.h
@@ -148,7 +148,7 @@ extern __inline__ int hard_smp_processor_id(void)
 }
 #endif
 
-#define smp_processor_id()	(current_thread_info()->cpu)
+#define raw_smp_processor_id()		(current_thread_info()->cpu)
 
 #define prof_multiplier(__cpu)		cpu_data(__cpu).multiplier
 #define prof_counter(__cpu)		cpu_data(__cpu).counter
diff --git a/include/asm-sparc64/smp.h b/include/asm-sparc64/smp.h
index 5e3e06d908fe..110a2de89123 100644
--- a/include/asm-sparc64/smp.h
+++ b/include/asm-sparc64/smp.h
@@ -64,7 +64,7 @@ static __inline__ int hard_smp_processor_id(void)
 	}
 }
 
-#define smp_processor_id() (current_thread_info()->cpu)
+#define raw_smp_processor_id() (current_thread_info()->cpu)
 
 #endif /* !(__ASSEMBLY__) */
 
diff --git a/include/asm-um/smp.h b/include/asm-um/smp.h
index 4412d5d9c26b..d879eba2b52c 100644
--- a/include/asm-um/smp.h
+++ b/include/asm-um/smp.h
@@ -8,7 +8,8 @@
 #include "asm/current.h"
 #include "linux/cpumask.h"
 
-#define smp_processor_id() (current_thread->cpu)
+#define raw_smp_processor_id() (current_thread->cpu)
+
 #define cpu_logical_map(n) (n)
 #define cpu_number_map(n) (n)
 #define PROC_CHANGE_PENALTY	15 /* Pick a number, any number */
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 96844fecbde8..a7425aa5a3b7 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -68,7 +68,7 @@ static inline int num_booting_cpus(void)
 	return cpus_weight(cpu_callout_map);
 }
 
-#define __smp_processor_id() read_pda(cpunumber)
+#define raw_smp_processor_id() read_pda(cpunumber)
 
 extern __inline int hard_smp_processor_id(void)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e530c6c092f1..beacd931b606 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -381,7 +381,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
-#define numa_node_id()		(cpu_to_node(_smp_processor_id()))
+#define numa_node_id()		(cpu_to_node(raw_smp_processor_id()))
 
 #ifndef CONFIG_DISCONTIGMEM
 
diff --git a/include/linux/smp.h b/include/linux/smp.h
index dcf1db3b35d3..9dfa3ee769ae 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -92,10 +92,7 @@ void smp_prepare_boot_cpu(void);
 /*
  *	These macros fold the SMP functionality into a single CPU system
  */
-
-#if !defined(__smp_processor_id) || !defined(CONFIG_PREEMPT)
-# define smp_processor_id()			0
-#endif
+#define raw_smp_processor_id()			0
 #define hard_smp_processor_id()			0
 #define smp_call_function(func,info,retry,wait)	({ 0; })
 #define on_each_cpu(func,info,retry,wait)	({ func(info); 0; })
@@ -106,30 +103,25 @@ static inline void smp_send_reschedule(int cpu) { }
 #endif /* !SMP */
 
 /*
- * DEBUG_PREEMPT support: check whether smp_processor_id() is being
- * used in a preemption-safe way.
+ * smp_processor_id(): get the current CPU ID.
  *
- * An architecture has to enable this debugging code explicitly.
- * It can do so by renaming the smp_processor_id() macro to
- * __smp_processor_id().  This should only be done after some minimal
- * testing, because usually there are a number of false positives
- * that an architecture will trigger.
+ * if DEBUG_PREEMPT is enabled the we check whether it is
+ * used in a preemption-safe way. (smp_processor_id() is safe
+ * if it's used in a preemption-off critical section, or in
+ * a thread that is bound to the current CPU.)
  *
- * To fix a false positive (i.e. smp_processor_id() use that the
- * debugging code reports but which use for some reason is legal),
- * change the smp_processor_id() reference to _smp_processor_id(),
- * which is the nondebug variant.  NOTE: don't use this to hack around
- * real bugs.
+ * NOTE: raw_smp_processor_id() is for internal use only
+ * (smp_processor_id() is the preferred variant), but in rare
+ * instances it might also be used to turn off false positives
+ * (i.e. smp_processor_id() use that the debugging code reports but
+ * which use for some reason is legal). Don't use this to hack around
+ * the warning message, as your code might not work under PREEMPT.
  */
-#ifdef __smp_processor_id
-# if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
-   extern unsigned int smp_processor_id(void);
-# else
-#  define smp_processor_id() __smp_processor_id()
-# endif
-# define _smp_processor_id() __smp_processor_id()
+#ifdef CONFIG_DEBUG_PREEMPT
+  extern unsigned int debug_smp_processor_id(void);
+# define smp_processor_id() debug_smp_processor_id()
 #else
-# define _smp_processor_id() smp_processor_id()
+# define smp_processor_id() raw_smp_processor_id()
 #endif
 
 #define get_cpu()		({ preempt_disable(); smp_processor_id(); })
diff --git a/include/net/route.h b/include/net/route.h
index d34ca8fc6756..c3cd069a9aca 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -107,7 +107,7 @@ struct rt_cache_stat
 
 extern struct rt_cache_stat *rt_cache_stat;
 #define RT_CACHE_STAT_INC(field)					  \
-		(per_cpu_ptr(rt_cache_stat, _smp_processor_id())->field++)
+		(per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
 
 extern struct ip_rt_acct *ip_rt_acct;
 
diff --git a/include/net/snmp.h b/include/net/snmp.h
index a15ab256276e..a36bed8ea210 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -128,18 +128,18 @@ struct linux_mib {
 #define SNMP_STAT_USRPTR(name)	(name[1])
 
 #define SNMP_INC_STATS_BH(mib, field) 	\
-	(per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field]++)
+	(per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field]++)
 #define SNMP_INC_STATS_OFFSET_BH(mib, field, offset)	\
-	(per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field + (offset)]++)
+	(per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field + (offset)]++)
 #define SNMP_INC_STATS_USER(mib, field) \
-	(per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field]++)
+	(per_cpu_ptr(mib[1], raw_smp_processor_id())->mibs[field]++)
 #define SNMP_INC_STATS(mib, field) 	\
-	(per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]++)
+	(per_cpu_ptr(mib[!in_softirq()], raw_smp_processor_id())->mibs[field]++)
 #define SNMP_DEC_STATS(mib, field) 	\
-	(per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]--)
+	(per_cpu_ptr(mib[!in_softirq()], raw_smp_processor_id())->mibs[field]--)
 #define SNMP_ADD_STATS_BH(mib, field, addend) 	\
-	(per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field] += addend)
+	(per_cpu_ptr(mib[0], raw_smp_processor_id())->mibs[field] += addend)
 #define SNMP_ADD_STATS_USER(mib, field, addend) 	\
-	(per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field] += addend)
+	(per_cpu_ptr(mib[1], raw_smp_processor_id())->mibs[field] += addend)
 
 #endif
diff --git a/kernel/module.c b/kernel/module.c
index 83b3d376708c..a566745dde62 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -379,7 +379,7 @@ static void module_unload_init(struct module *mod)
 	for (i = 0; i < NR_CPUS; i++)
 		local_set(&mod->ref[i].count, 0);
 	/* Hold reference count during initialization. */
-	local_set(&mod->ref[_smp_processor_id()].count, 1);
+	local_set(&mod->ref[raw_smp_processor_id()].count, 1);
 	/* Backwards compatibility macros put refcount during init. */
 	mod->waiter = current;
 }
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index cba3584b80fe..457c2302ed42 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -48,11 +48,11 @@ void disable_nonboot_cpus(void)
 {
 	oldmask = current->cpus_allowed;
 	set_cpus_allowed(current, cpumask_of_cpu(0));
-	printk("Freezing CPUs (at %d)", _smp_processor_id());
+	printk("Freezing CPUs (at %d)", raw_smp_processor_id());
 	current->state = TASK_INTERRUPTIBLE;
 	schedule_timeout(HZ);
 	printk("...");
-	BUG_ON(_smp_processor_id() != 0);
+	BUG_ON(raw_smp_processor_id() != 0);
 
 	/* FIXME: for this to work, all the CPUs must be running
 	 * "idle" thread (or we deadlock). Is that guaranteed? */
diff --git a/kernel/sched.c b/kernel/sched.c
index f12a0c8a7d98..deca041fc364 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3814,7 +3814,7 @@ EXPORT_SYMBOL(yield);
  */
 void __sched io_schedule(void)
 {
-	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+	struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
 
 	atomic_inc(&rq->nr_iowait);
 	schedule();
@@ -3825,7 +3825,7 @@ EXPORT_SYMBOL(io_schedule);
 
 long __sched io_schedule_timeout(long timeout)
 {
-	struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+	struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
 	long ret;
 
 	atomic_inc(&rq->nr_iowait);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 6116b25aa7cf..84a9d18aa8da 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -100,7 +100,7 @@ static int stop_machine(void)
 	stopmachine_state = STOPMACHINE_WAIT;
 
 	for_each_online_cpu(i) {
-		if (i == _smp_processor_id())
+		if (i == raw_smp_processor_id())
 			continue;
 		ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
 		if (ret < 0)
@@ -182,7 +182,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
 
 	/* If they don't care which CPU fn runs on, bind to any online one. */
 	if (cpu == NR_CPUS)
-		cpu = _smp_processor_id();
+		cpu = raw_smp_processor_id();
 
 	p = kthread_create(do_stop, &smdata, "kstopmachine");
 	if (!IS_ERR(p)) {
diff --git a/lib/Makefile b/lib/Makefile
index 9eccea9429a7..5f10cb898407 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -20,6 +20,7 @@ lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
+obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
 
 ifneq ($(CONFIG_HAVE_DEC_LOCK),y) 
   lib-y += dec_and_lock.o
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index 99b0ae3d51dd..bd2bc5d887b8 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -9,61 +9,6 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 
-#if defined(CONFIG_PREEMPT) && defined(__smp_processor_id) && \
-		defined(CONFIG_DEBUG_PREEMPT)
-
-/*
- * Debugging check.
- */
-unsigned int smp_processor_id(void)
-{
-	unsigned long preempt_count = preempt_count();
-	int this_cpu = __smp_processor_id();
-	cpumask_t this_mask;
-
-	if (likely(preempt_count))
-		goto out;
-
-	if (irqs_disabled())
-		goto out;
-
-	/*
-	 * Kernel threads bound to a single CPU can safely use
-	 * smp_processor_id():
-	 */
-	this_mask = cpumask_of_cpu(this_cpu);
-
-	if (cpus_equal(current->cpus_allowed, this_mask))
-		goto out;
-
-	/*
-	 * It is valid to assume CPU-locality during early bootup:
-	 */
-	if (system_state != SYSTEM_RUNNING)
-		goto out;
-
-	/*
-	 * Avoid recursion:
-	 */
-	preempt_disable();
-
-	if (!printk_ratelimit())
-		goto out_enable;
-
-	printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid);
-	print_symbol("caller is %s\n", (long)__builtin_return_address(0));
-	dump_stack();
-
-out_enable:
-	preempt_enable_no_resched();
-out:
-	return this_cpu;
-}
-
-EXPORT_SYMBOL(smp_processor_id);
-
-#endif /* PREEMPT && __smp_processor_id && DEBUG_PREEMPT */
-
 #ifdef CONFIG_PREEMPT_BKL
 /*
  * The 'big kernel semaphore'
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
new file mode 100644
index 000000000000..42c08ef828c5
--- /dev/null
+++ b/lib/smp_processor_id.c
@@ -0,0 +1,55 @@
+/*
+ * lib/smp_processor_id.c
+ *
+ * DEBUG_PREEMPT variant of smp_processor_id().
+ */
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+
+unsigned int debug_smp_processor_id(void)
+{
+	unsigned long preempt_count = preempt_count();
+	int this_cpu = raw_smp_processor_id();
+	cpumask_t this_mask;
+
+	if (likely(preempt_count))
+		goto out;
+
+	if (irqs_disabled())
+		goto out;
+
+	/*
+	 * Kernel threads bound to a single CPU can safely use
+	 * smp_processor_id():
+	 */
+	this_mask = cpumask_of_cpu(this_cpu);
+
+	if (cpus_equal(current->cpus_allowed, this_mask))
+		goto out;
+
+	/*
+	 * It is valid to assume CPU-locality during early bootup:
+	 */
+	if (system_state != SYSTEM_RUNNING)
+		goto out;
+
+	/*
+	 * Avoid recursion:
+	 */
+	preempt_disable();
+
+	if (!printk_ratelimit())
+		goto out_enable;
+
+	printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid);
+	print_symbol("caller is %s\n", (long)__builtin_return_address(0));
+	dump_stack();
+
+out_enable:
+	preempt_enable_no_resched();
+out:
+	return this_cpu;
+}
+
+EXPORT_SYMBOL(debug_smp_processor_id);
+
-- 
cgit v1.2.3


From 63551ae0feaaa23807ebea60de1901564bbef32e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 21 Jun 2005 17:14:44 -0700
Subject: [PATCH] Hugepage consolidation

A lot of the code in arch/*/mm/hugetlbpage.c is quite similar.  This patch
attempts to consolidate a lot of the code across the arch's, putting the
combined version in mm/hugetlb.c.  There are a couple of uglyish hacks in
order to covert all the hugepage archs, but the result is a very large
reduction in the total amount of code.  It also means things like hugepage
lazy allocation could be implemented in one place, instead of six.

Tested, at least a little, on ppc64, i386 and x86_64.

Notes:
	- this patch changes the meaning of set_huge_pte() to be more
	  analagous to set_pte()
	- does SH4 need s special huge_ptep_get_and_clear()??

Acked-by: William Lee Irwin <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/hugetlbpage.c    | 170 ++----------------------------------
 arch/ia64/mm/hugetlbpage.c    | 158 +---------------------------------
 arch/ppc64/mm/hugetlbpage.c   | 180 +-------------------------------------
 arch/sh/mm/hugetlbpage.c      | 196 ++++++------------------------------------
 arch/sh64/mm/hugetlbpage.c    |  18 +++-
 arch/sparc64/mm/hugetlbpage.c | 195 ++++++-----------------------------------
 include/asm-i386/page.h       |   1 +
 include/asm-i386/pgtable.h    |   2 +-
 include/asm-ia64/pgtable.h    |   1 +
 include/asm-sh/page.h         |   1 +
 include/asm-sh/pgtable.h      |   1 +
 include/asm-sh64/page.h       |   1 +
 include/asm-sh64/pgtable.h    |   2 +
 include/asm-sparc64/page.h    |   2 +
 include/asm-sparc64/pgtable.h |   1 +
 include/asm-x86_64/page.h     |   1 +
 include/asm-x86_64/pgtable.h  |   3 +-
 include/linux/hugetlb.h       |  40 +++++++--
 mm/hugetlb.c                  | 177 +++++++++++++++++++++++++++++++++++++-
 19 files changed, 300 insertions(+), 850 deletions(-)

(limited to 'arch/sparc64')

diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 171fc925e1e4..5aa06001a4bd 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -18,7 +18,7 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -30,7 +30,7 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	return (pte_t *) pmd;
 }
 
-static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -42,21 +42,6 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return (pte_t *) pmd;
 }
 
-static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access)
-{
-	pte_t entry;
-
-	add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
-	if (write_access) {
-		entry =
-		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
-	} else
-		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
-	entry = pte_mkyoung(entry);
-	mk_pte_huge(entry);
-	set_pte(page_table, entry);
-}
-
 /*
  * This function checks for proper alignment of input addr and len parameters.
  */
@@ -69,77 +54,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 
-int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
-			struct vm_area_struct *vma)
-{
-	pte_t *src_pte, *dst_pte, entry;
-	struct page *ptepage;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-
-	while (addr < end) {
-		dst_pte = huge_pte_alloc(dst, addr);
-		if (!dst_pte)
-			goto nomem;
-		src_pte = huge_pte_offset(src, addr);
-		entry = *src_pte;
-		ptepage = pte_page(entry);
-		get_page(ptepage);
-		set_pte(dst_pte, entry);
-		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
-		addr += HPAGE_SIZE;
-	}
-	return 0;
-
-nomem:
-	return -ENOMEM;
-}
-
-int
-follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		    struct page **pages, struct vm_area_struct **vmas,
-		    unsigned long *position, int *length, int i)
-{
-	unsigned long vpfn, vaddr = *position;
-	int remainder = *length;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-
-	vpfn = vaddr/PAGE_SIZE;
-	while (vaddr < vma->vm_end && remainder) {
-
-		if (pages) {
-			pte_t *pte;
-			struct page *page;
-
-			pte = huge_pte_offset(mm, vaddr);
-
-			/* hugetlb should be locked, and hence, prefaulted */
-			WARN_ON(!pte || pte_none(*pte));
-
-			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
-			WARN_ON(!PageCompound(page));
-
-			get_page(page);
-			pages[i] = page;
-		}
-
-		if (vmas)
-			vmas[i] = vma;
-
-		vaddr += PAGE_SIZE;
-		++vpfn;
-		--remainder;
-		++i;
-	}
-
-	*length = remainder;
-	*position = vaddr;
-
-	return i;
-}
-
 #if 0	/* This is just for testing */
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
@@ -204,83 +118,15 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 }
 #endif
 
-void unmap_hugepage_range(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end)
+void hugetlb_clean_stale_pgtable(pte_t *pte)
 {
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long address;
-	pte_t pte, *ptep;
+	pmd_t *pmd = (pmd_t *) pte;
 	struct page *page;
 
-	BUG_ON(start & (HPAGE_SIZE - 1));
-	BUG_ON(end & (HPAGE_SIZE - 1));
-
-	for (address = start; address < end; address += HPAGE_SIZE) {
-		ptep = huge_pte_offset(mm, address);
-		if (!ptep)
-			continue;
-		pte = ptep_get_and_clear(mm, address, ptep);
-		if (pte_none(pte))
-			continue;
-		page = pte_page(pte);
-		put_page(page);
-	}
-	add_mm_counter(mm ,rss, -((end - start) >> PAGE_SHIFT));
-	flush_tlb_range(vma, start, end);
-}
-
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
-
-	BUG_ON(vma->vm_start & ~HPAGE_MASK);
-	BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-	spin_lock(&mm->page_table_lock);
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
-
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		if (!pte_none(*pte))
-			continue;
-
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = find_get_page(mapping, idx);
-		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			page = alloc_huge_page();
-			if (!page) {
-				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
-				goto out;
-			}
-			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-			if (! ret) {
-				unlock_page(page);
-			} else {
-				hugetlb_put_quota(mapping);
-				free_huge_page(page);
-				goto out;
-			}
-		}
-		set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
-	}
-out:
-	spin_unlock(&mm->page_table_lock);
-	return ret;
+	page = pmd_page(*pmd);
+	pmd_clear(pmd);
+	dec_page_state(nr_page_table_pages);
+	page_cache_release(page);
 }
 
 /* x86_64 also uses this file */
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index df08ae7634b6..e0a776a3044c 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -24,7 +24,7 @@
 
 unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT;
 
-static pte_t *
+pte_t *
 huge_pte_alloc (struct mm_struct *mm, unsigned long addr)
 {
 	unsigned long taddr = htlbpage_to_page(addr);
@@ -43,7 +43,7 @@ huge_pte_alloc (struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
-static pte_t *
+pte_t *
 huge_pte_offset (struct mm_struct *mm, unsigned long addr)
 {
 	unsigned long taddr = htlbpage_to_page(addr);
@@ -67,23 +67,6 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr)
 
 #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
 
-static void
-set_huge_pte (struct mm_struct *mm, struct vm_area_struct *vma,
-	      struct page *page, pte_t * page_table, int write_access)
-{
-	pte_t entry;
-
-	add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
-	if (write_access) {
-		entry =
-		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
-	} else
-		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
-	entry = pte_mkyoung(entry);
-	mk_pte_huge(entry);
-	set_pte(page_table, entry);
-	return;
-}
 /*
  * This function checks for proper alignment of input addr and len parameters.
  */
@@ -99,68 +82,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 
-int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
-			struct vm_area_struct *vma)
-{
-	pte_t *src_pte, *dst_pte, entry;
-	struct page *ptepage;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-
-	while (addr < end) {
-		dst_pte = huge_pte_alloc(dst, addr);
-		if (!dst_pte)
-			goto nomem;
-		src_pte = huge_pte_offset(src, addr);
-		entry = *src_pte;
-		ptepage = pte_page(entry);
-		get_page(ptepage);
-		set_pte(dst_pte, entry);
-		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
-		addr += HPAGE_SIZE;
-	}
-	return 0;
-nomem:
-	return -ENOMEM;
-}
-
-int
-follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		    struct page **pages, struct vm_area_struct **vmas,
-		    unsigned long *st, int *length, int i)
-{
-	pte_t *ptep, pte;
-	unsigned long start = *st;
-	unsigned long pstart;
-	int len = *length;
-	struct page *page;
-
-	do {
-		pstart = start & HPAGE_MASK;
-		ptep = huge_pte_offset(mm, start);
-		pte = *ptep;
-
-back1:
-		page = pte_page(pte);
-		if (pages) {
-			page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT);
-			get_page(page);
-			pages[i] = page;
-		}
-		if (vmas)
-			vmas[i] = vma;
-		i++;
-		len--;
-		start += PAGE_SIZE;
-		if (((start & HPAGE_MASK) == pstart) && len &&
-				(start < vma->vm_end))
-			goto back1;
-	} while (len && start < vma->vm_end);
-	*length = len;
-	*st = start;
-	return i;
-}
-
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
 {
 	struct page *page;
@@ -212,81 +133,6 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb,
 	free_pgd_range(tlb, addr, end, floor, ceiling);
 }
 
-void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long address;
-	pte_t *pte;
-	struct page *page;
-
-	BUG_ON(start & (HPAGE_SIZE - 1));
-	BUG_ON(end & (HPAGE_SIZE - 1));
-
-	for (address = start; address < end; address += HPAGE_SIZE) {
-		pte = huge_pte_offset(mm, address);
-		if (pte_none(*pte))
-			continue;
-		page = pte_page(*pte);
-		put_page(page);
-		pte_clear(mm, address, pte);
-	}
-	add_mm_counter(mm, rss, - ((end - start) >> PAGE_SHIFT));
-	flush_tlb_range(vma, start, end);
-}
-
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
-
-	BUG_ON(vma->vm_start & ~HPAGE_MASK);
-	BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-	spin_lock(&mm->page_table_lock);
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
-
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		if (!pte_none(*pte))
-			continue;
-
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = find_get_page(mapping, idx);
-		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			page = alloc_huge_page();
-			if (!page) {
-				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
-				goto out;
-			}
-			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-			if (! ret) {
-				unlock_page(page);
-			} else {
-				hugetlb_put_quota(mapping);
-				page_cache_release(page);
-				goto out;
-			}
-		}
-		set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
-	}
-out:
-	spin_unlock(&mm->page_table_lock);
-	return ret;
-}
-
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		unsigned long pgoff, unsigned long flags)
 {
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index d3bf86a5c1ad..b4ab766f5980 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -121,7 +121,7 @@ static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr
 	return hugepte_offset(dir, addr);
 }
 
-static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	pud_t *pud;
 
@@ -134,7 +134,7 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return hugepte_offset(pud, addr);
 }
 
-static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pud_t *pud;
 
@@ -147,25 +147,6 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	return hugepte_alloc(mm, pud, addr);
 }
 
-static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-			 unsigned long addr, struct page *page,
-			 pte_t *ptep, int write_access)
-{
-	pte_t entry;
-
-	add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
-	if (write_access) {
-		entry =
-		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
-	} else {
-		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
-	}
-	entry = pte_mkyoung(entry);
-	entry = pte_mkhuge(entry);
-
-	set_pte_at(mm, addr, ptep, entry);
-}
-
 /*
  * This function checks for proper alignment of input addr and len parameters.
  */
@@ -259,80 +240,6 @@ int prepare_hugepage_range(unsigned long addr, unsigned long len)
 	return -EINVAL;
 }
 
-int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
-			struct vm_area_struct *vma)
-{
-	pte_t *src_pte, *dst_pte, entry;
-	struct page *ptepage;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	int err = -ENOMEM;
-
-	while (addr < end) {
-		dst_pte = huge_pte_alloc(dst, addr);
-		if (!dst_pte)
-			goto out;
-
-		src_pte = huge_pte_offset(src, addr);
-		entry = *src_pte;
-		
-		ptepage = pte_page(entry);
-		get_page(ptepage);
-		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
-		set_pte_at(dst, addr, dst_pte, entry);
-
-		addr += HPAGE_SIZE;
-	}
-
-	err = 0;
- out:
-	return err;
-}
-
-int
-follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		    struct page **pages, struct vm_area_struct **vmas,
-		    unsigned long *position, int *length, int i)
-{
-	unsigned long vpfn, vaddr = *position;
-	int remainder = *length;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-
-	vpfn = vaddr/PAGE_SIZE;
-	while (vaddr < vma->vm_end && remainder) {
-		if (pages) {
-			pte_t *pte;
-			struct page *page;
-
-			pte = huge_pte_offset(mm, vaddr);
-
-			/* hugetlb should be locked, and hence, prefaulted */
-			WARN_ON(!pte || pte_none(*pte));
-
-			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
-			WARN_ON(!PageCompound(page));
-
-			get_page(page);
-			pages[i] = page;
-		}
-
-		if (vmas)
-			vmas[i] = vma;
-
-		vaddr += PAGE_SIZE;
-		++vpfn;
-		--remainder;
-		++i;
-	}
-
-	*length = remainder;
-	*position = vaddr;
-
-	return i;
-}
-
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
@@ -363,89 +270,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-void unmap_hugepage_range(struct vm_area_struct *vma,
-			  unsigned long start, unsigned long end)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long addr;
-	pte_t *ptep;
-	struct page *page;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-	BUG_ON((start % HPAGE_SIZE) != 0);
-	BUG_ON((end % HPAGE_SIZE) != 0);
-
-	for (addr = start; addr < end; addr += HPAGE_SIZE) {
-		pte_t pte;
-
-		ptep = huge_pte_offset(mm, addr);
-		if (!ptep || pte_none(*ptep))
-			continue;
-
-		pte = *ptep;
-		page = pte_page(pte);
-		pte_clear(mm, addr, ptep);
-
-		put_page(page);
-	}
-	add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
-	flush_tlb_pending();
-}
-
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-	BUG_ON((vma->vm_start % HPAGE_SIZE) != 0);
-	BUG_ON((vma->vm_end % HPAGE_SIZE) != 0);
-
-	spin_lock(&mm->page_table_lock);
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
-
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		if (! pte_none(*pte))
-			continue;
-
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = find_get_page(mapping, idx);
-		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			page = alloc_huge_page();
-			if (!page) {
-				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
-				goto out;
-			}
-			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-			if (! ret) {
-				unlock_page(page);
-			} else {
-				hugetlb_put_quota(mapping);
-				free_huge_page(page);
-				goto out;
-			}
-		}
-		set_huge_pte(mm, vma, addr, page, pte, vma->vm_flags & VM_WRITE);
-	}
-out:
-	spin_unlock(&mm->page_table_lock);
-	return ret;
-}
-
 /* Because we have an exclusive hugepage region which lies within the
  * normal user address space, we have to take special measures to make
  * non-huge mmap()s evade the hugepage reserved regions. */
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 1f897bab2318..95bb1a6c6060 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -24,7 +24,7 @@
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
-static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
@@ -39,7 +39,7 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
-static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
@@ -56,28 +56,34 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 
 #define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0)
 
-static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-			 struct page *page, pte_t * page_table, int write_access)
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t entry)
 {
-	unsigned long i;
-	pte_t entry;
+	int i;
 
-	add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
+		set_pte_at(mm, addr, ptep, entry);
+		ptep++;
+		addr += PAGE_SIZE;
+		pte_val(entry) += PAGE_SIZE;
+	}
+}
 
-	if (write_access)
-		entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
-						       vma->vm_page_prot)));
-	else
-		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
-	entry = pte_mkyoung(entry);
-	mk_pte_huge(entry);
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep)
+{
+	pte_t entry;
+	int i;
 
-	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-		set_pte(page_table, entry);
-		page_table++;
+	entry = *ptep;
 
-		pte_val(entry) += PAGE_SIZE;
+	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
+		pte_clear(mm, addr, ptep);
+		addr += PAGE_SIZE;
+		ptep++;
 	}
+
+	return entry;
 }
 
 /*
@@ -92,79 +98,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 
-int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
-			    struct vm_area_struct *vma)
-{
-	pte_t *src_pte, *dst_pte, entry;
-	struct page *ptepage;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	int i;
-
-	while (addr < end) {
-		dst_pte = huge_pte_alloc(dst, addr);
-		if (!dst_pte)
-			goto nomem;
-		src_pte = huge_pte_offset(src, addr);
-		BUG_ON(!src_pte || pte_none(*src_pte));
-		entry = *src_pte;
-		ptepage = pte_page(entry);
-		get_page(ptepage);
-		for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-			set_pte(dst_pte, entry);
-			pte_val(entry) += PAGE_SIZE;
-			dst_pte++;
-		}
-		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
-		addr += HPAGE_SIZE;
-	}
-	return 0;
-
-nomem:
-	return -ENOMEM;
-}
-
-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-			struct page **pages, struct vm_area_struct **vmas,
-			unsigned long *position, int *length, int i)
-{
-	unsigned long vaddr = *position;
-	int remainder = *length;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-
-	while (vaddr < vma->vm_end && remainder) {
-		if (pages) {
-			pte_t *pte;
-			struct page *page;
-
-			pte = huge_pte_offset(mm, vaddr);
-
-			/* hugetlb should be locked, and hence, prefaulted */
-			BUG_ON(!pte || pte_none(*pte));
-
-			page = pte_page(*pte);
-
-			WARN_ON(!PageCompound(page));
-
-			get_page(page);
-			pages[i] = page;
-		}
-
-		if (vmas)
-			vmas[i] = vma;
-
-		vaddr += PAGE_SIZE;
-		--remainder;
-		++i;
-	}
-
-	*length = remainder;
-	*position = vaddr;
-
-	return i;
-}
-
 struct page *follow_huge_addr(struct mm_struct *mm,
 			      unsigned long address, int write)
 {
@@ -181,84 +114,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 {
 	return NULL;
 }
-
-void unmap_hugepage_range(struct vm_area_struct *vma,
-			  unsigned long start, unsigned long end)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long address;
-	pte_t *pte;
-	struct page *page;
-	int i;
-
-	BUG_ON(start & (HPAGE_SIZE - 1));
-	BUG_ON(end & (HPAGE_SIZE - 1));
-
-	for (address = start; address < end; address += HPAGE_SIZE) {
-		pte = huge_pte_offset(mm, address);
-		BUG_ON(!pte);
-		if (pte_none(*pte))
-			continue;
-		page = pte_page(*pte);
-		put_page(page);
-		for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-			pte_clear(mm, address+(i*PAGE_SIZE), pte);
-			pte++;
-		}
-	}
-	add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
-	flush_tlb_range(vma, start, end);
-}
-
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
-{
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
-
-	BUG_ON(vma->vm_start & ~HPAGE_MASK);
-	BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-	spin_lock(&mm->page_table_lock);
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
-
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		if (!pte_none(*pte))
-			continue;
-
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = find_get_page(mapping, idx);
-		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			page = alloc_huge_page();
-			if (!page) {
-				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
-				goto out;
-			}
-			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-			if (! ret) {
-				unlock_page(page);
-			} else {
-				hugetlb_put_quota(mapping);
-				free_huge_page(page);
-				goto out;
-			}
-		}
-		set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
-	}
-out:
-	spin_unlock(&mm->page_table_lock);
-	return ret;
-}
diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c
index bcad2aefa4ee..dcd9c8a8baf8 100644
--- a/arch/sh64/mm/hugetlbpage.c
+++ b/arch/sh64/mm/hugetlbpage.c
@@ -24,7 +24,7 @@
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
-static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
@@ -39,7 +39,7 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
-static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
@@ -80,6 +80,20 @@ static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 }
 
+pte_t huge_ptep_get_and_clear(pte_t *ptep)
+{
+	pte_t entry;
+
+	entry = *ptep;
+
+	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
+		pte_clear(pte);
+		pte++;
+	}
+
+	return entry;
+}
+
 /*
  * This function checks for proper alignment of input addr and len parameters.
  */
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index 5a1f831b2de1..625cbb336a23 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -22,7 +22,7 @@
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
 
-static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -41,7 +41,7 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	return pte;
 }
 
-static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -62,30 +62,34 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 
 #define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0)
 
-static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-			 unsigned long addr,
-			 struct page *page, pte_t * page_table, int write_access)
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t entry)
 {
-	unsigned long i;
-	pte_t entry;
+	int i;
+
+	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
+		set_pte_at(mm, addr, ptep, entry);
+		ptep++;
+		addr += PAGE_SIZE;
+		pte_val(entry) += PAGE_SIZE;
+	}
+}
 
-	add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep)
+{
+	pte_t entry;
+	int i;
 
-	if (write_access)
-		entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
-						       vma->vm_page_prot)));
-	else
-		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
-	entry = pte_mkyoung(entry);
-	mk_pte_huge(entry);
+	entry = *ptep;
 
 	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-		set_pte_at(mm, addr, page_table, entry);
-		page_table++;
+		pte_clear(mm, addr, ptep);
 		addr += PAGE_SIZE;
-
-		pte_val(entry) += PAGE_SIZE;
+		ptep++;
 	}
+
+	return entry;
 }
 
 /*
@@ -100,79 +104,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 
-int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
-			    struct vm_area_struct *vma)
-{
-	pte_t *src_pte, *dst_pte, entry;
-	struct page *ptepage;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	int i;
-
-	while (addr < end) {
-		dst_pte = huge_pte_alloc(dst, addr);
-		if (!dst_pte)
-			goto nomem;
-		src_pte = huge_pte_offset(src, addr);
-		BUG_ON(!src_pte || pte_none(*src_pte));
-		entry = *src_pte;
-		ptepage = pte_page(entry);
-		get_page(ptepage);
-		for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-			set_pte_at(dst, addr, dst_pte, entry);
-			pte_val(entry) += PAGE_SIZE;
-			dst_pte++;
-			addr += PAGE_SIZE;
-		}
-		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
-	}
-	return 0;
-
-nomem:
-	return -ENOMEM;
-}
-
-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-			struct page **pages, struct vm_area_struct **vmas,
-			unsigned long *position, int *length, int i)
-{
-	unsigned long vaddr = *position;
-	int remainder = *length;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-
-	while (vaddr < vma->vm_end && remainder) {
-		if (pages) {
-			pte_t *pte;
-			struct page *page;
-
-			pte = huge_pte_offset(mm, vaddr);
-
-			/* hugetlb should be locked, and hence, prefaulted */
-			BUG_ON(!pte || pte_none(*pte));
-
-			page = pte_page(*pte);
-
-			WARN_ON(!PageCompound(page));
-
-			get_page(page);
-			pages[i] = page;
-		}
-
-		if (vmas)
-			vmas[i] = vma;
-
-		vaddr += PAGE_SIZE;
-		--remainder;
-		++i;
-	}
-
-	*length = remainder;
-	*position = vaddr;
-
-	return i;
-}
-
 struct page *follow_huge_addr(struct mm_struct *mm,
 			      unsigned long address, int write)
 {
@@ -190,34 +121,6 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-void unmap_hugepage_range(struct vm_area_struct *vma,
-			  unsigned long start, unsigned long end)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long address;
-	pte_t *pte;
-	struct page *page;
-	int i;
-
-	BUG_ON(start & (HPAGE_SIZE - 1));
-	BUG_ON(end & (HPAGE_SIZE - 1));
-
-	for (address = start; address < end; address += HPAGE_SIZE) {
-		pte = huge_pte_offset(mm, address);
-		BUG_ON(!pte);
-		if (pte_none(*pte))
-			continue;
-		page = pte_page(*pte);
-		put_page(page);
-		for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
-			pte_clear(mm, address+(i*PAGE_SIZE), pte);
-			pte++;
-		}
-	}
-	add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
-	flush_tlb_range(vma, start, end);
-}
-
 static void context_reload(void *__data)
 {
 	struct mm_struct *mm = __data;
@@ -226,12 +129,8 @@ static void context_reload(void *__data)
 		load_secondary_context(mm);
 }
 
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+void hugetlb_prefault_arch_hook(struct mm_struct *mm)
 {
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
-
 	/* On UltraSPARC-III+ and later, configure the second half of
 	 * the Data-TLB for huge pages.
 	 */
@@ -261,50 +160,4 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 		}
 		spin_unlock(&ctx_alloc_lock);
 	}
-
-	BUG_ON(vma->vm_start & ~HPAGE_MASK);
-	BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-	spin_lock(&mm->page_table_lock);
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
-
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		if (!pte_none(*pte))
-			continue;
-
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = find_get_page(mapping, idx);
-		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			page = alloc_huge_page();
-			if (!page) {
-				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
-				goto out;
-			}
-			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-			if (! ret) {
-				unlock_page(page);
-			} else {
-				hugetlb_put_quota(mapping);
-				free_huge_page(page);
-				goto out;
-			}
-		}
-		set_huge_pte(mm, vma, addr, page, pte, vma->vm_flags & VM_WRITE);
-	}
-out:
-	spin_unlock(&mm->page_table_lock);
-	return ret;
 }
diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index ed13969fa2d6..41400d342d44 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -68,6 +68,7 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 #define HPAGE_MASK	(~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+#define ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE
 #endif
 
 #define pgd_val(x)	((x).pgd)
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 8d60c2b4b003..e9efe148fdf7 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -236,6 +236,7 @@ static inline pte_t pte_mkexec(pte_t pte)	{ (pte).pte_low |= _PAGE_USER; return
 static inline pte_t pte_mkdirty(pte_t pte)	{ (pte).pte_low |= _PAGE_DIRTY; return pte; }
 static inline pte_t pte_mkyoung(pte_t pte)	{ (pte).pte_low |= _PAGE_ACCESSED; return pte; }
 static inline pte_t pte_mkwrite(pte_t pte)	{ (pte).pte_low |= _PAGE_RW; return pte; }
+static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= _PAGE_PRESENT | _PAGE_PSE; return pte; }
 
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
@@ -275,7 +276,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
  */
 
 #define mk_pte(page, pgprot)	pfn_pte(page_to_pfn(page), (pgprot))
-#define mk_pte_huge(entry) ((entry).pte_low |= _PAGE_PRESENT | _PAGE_PSE)
 
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index fcc9c3344ab4..48586e08f432 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -283,6 +283,7 @@ ia64_phys_addr_valid (unsigned long addr)
 #define pte_mkyoung(pte)	(__pte(pte_val(pte) | _PAGE_A))
 #define pte_mkclean(pte)	(__pte(pte_val(pte) & ~_PAGE_D))
 #define pte_mkdirty(pte)	(__pte(pte_val(pte) | _PAGE_D))
+#define pte_mkhuge(pte)		(__pte(pte_val(pte) | _PAGE_P))
 
 /*
  * Macro to a page protection value as "uncacheable".  Note that "protection" is really a
diff --git a/include/asm-sh/page.h b/include/asm-sh/page.h
index 4c6d129e7d91..180467be8e7b 100644
--- a/include/asm-sh/page.h
+++ b/include/asm-sh/page.h
@@ -31,6 +31,7 @@
 #define HPAGE_SIZE		(1UL << HPAGE_SHIFT)
 #define HPAGE_MASK		(~(HPAGE_SIZE-1))
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT-PAGE_SHIFT)
+#define ARCH_HAS_SETCLEAR_HUGE_PTE
 #endif
 
 #ifdef __KERNEL__
diff --git a/include/asm-sh/pgtable.h b/include/asm-sh/pgtable.h
index cd847a47a9aa..ecb909572d3f 100644
--- a/include/asm-sh/pgtable.h
+++ b/include/asm-sh/pgtable.h
@@ -196,6 +196,7 @@ static inline pte_t pte_mkexec(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _
 static inline pte_t pte_mkdirty(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; }
 static inline pte_t pte_mkyoung(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; }
 static inline pte_t pte_mkwrite(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; }
+static inline pte_t pte_mkhuge(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_SZHUGE)); return pte; }
 
 /*
  * Macro and implementation to make a page protection as uncachable.
diff --git a/include/asm-sh64/page.h b/include/asm-sh64/page.h
index e1f7f5a41210..d6167f1c0e99 100644
--- a/include/asm-sh64/page.h
+++ b/include/asm-sh64/page.h
@@ -41,6 +41,7 @@
 #define HPAGE_SIZE		(1UL << HPAGE_SHIFT)
 #define HPAGE_MASK		(~(HPAGE_SIZE-1))
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT-PAGE_SHIFT)
+#define ARCH_HAS_SETCLEAR_HUGE_PTE
 #endif
 
 #ifdef __KERNEL__
diff --git a/include/asm-sh64/pgtable.h b/include/asm-sh64/pgtable.h
index 525e1523ef5f..78ac6be2d9ef 100644
--- a/include/asm-sh64/pgtable.h
+++ b/include/asm-sh64/pgtable.h
@@ -430,6 +430,8 @@ extern inline pte_t pte_mkwrite(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) |
 extern inline pte_t pte_mkexec(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_EXECUTE)); return pte; }
 extern inline pte_t pte_mkdirty(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; }
 extern inline pte_t pte_mkyoung(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; }
+extern inline pte_t pte_mkhuge(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_SZHUGE)); return pte; }
+
 
 /*
  * Conversion functions: convert a page and protection to a page entry.
diff --git a/include/asm-sparc64/page.h b/include/asm-sparc64/page.h
index 219ea043a14a..b87dbbd64bc9 100644
--- a/include/asm-sparc64/page.h
+++ b/include/asm-sparc64/page.h
@@ -95,6 +95,8 @@ typedef unsigned long pgprot_t;
 #define HPAGE_SIZE		(_AC(1,UL) << HPAGE_SHIFT)
 #define HPAGE_MASK		(~(HPAGE_SIZE - 1UL))
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
+#define ARCH_HAS_SETCLEAR_HUGE_PTE
+#define ARCH_HAS_HUGETLB_PREFAULT_HOOK
 #endif
 
 #define TASK_UNMAPPED_BASE	(test_thread_flag(TIF_32BIT) ? \
diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h
index ae2cd5b09a7c..1ae00c5087f1 100644
--- a/include/asm-sparc64/pgtable.h
+++ b/include/asm-sparc64/pgtable.h
@@ -286,6 +286,7 @@ static inline pte_t pte_modify(pte_t orig_pte, pgprot_t new_prot)
 #define pte_mkyoung(pte)	(__pte(pte_val(pte) | _PAGE_ACCESSED | _PAGE_R))
 #define pte_mkwrite(pte)	(__pte(pte_val(pte) | _PAGE_WRITE))
 #define pte_mkdirty(pte)	(__pte(pte_val(pte) | _PAGE_MODIFIED | _PAGE_W))
+#define pte_mkhuge(pte)		(__pte(pte_val(pte) | _PAGE_SZHUGE))
 
 /* to find an entry in a page-table-directory. */
 #define pgd_index(address)	(((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index f43048035a03..9ce338c3a71e 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -28,6 +28,7 @@
 #define HPAGE_SIZE	((1UL) << HPAGE_SHIFT)
 #define HPAGE_MASK	(~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
+#define ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE
 
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index db2a0efbf573..4eec176c3c39 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -253,6 +253,7 @@ extern inline int pte_young(pte_t pte)		{ return pte_val(pte) & _PAGE_ACCESSED;
 extern inline int pte_write(pte_t pte)		{ return pte_val(pte) & _PAGE_RW; }
 static inline int pte_file(pte_t pte)		{ return pte_val(pte) & _PAGE_FILE; }
 
+#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
 extern inline pte_t pte_rdprotect(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; }
 extern inline pte_t pte_exprotect(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; }
 extern inline pte_t pte_mkclean(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_DIRTY)); return pte; }
@@ -263,6 +264,7 @@ extern inline pte_t pte_mkexec(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _
 extern inline pte_t pte_mkdirty(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; }
 extern inline pte_t pte_mkyoung(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; }
 extern inline pte_t pte_mkwrite(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; }
+extern inline pte_t pte_mkhuge(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | __LARGE_PTE)); return pte; }
 
 struct vm_area_struct;
 
@@ -290,7 +292,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
  */
 #define pgprot_noncached(prot)	(__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
 
-#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT) 
 static inline int pmd_large(pmd_t pte) { 
 	return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; 
 } 	
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6af1ae4a8211..f529d1442815 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -4,6 +4,7 @@
 #ifdef CONFIG_HUGETLB_PAGE
 
 #include <linux/mempolicy.h>
+#include <asm/tlbflush.h>
 
 struct ctl_table;
 
@@ -22,12 +23,6 @@ int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
 int is_hugepage_mem_enough(size_t);
 unsigned long hugetlb_total_pages(void);
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
-			      int write);
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-				pmd_t *pmd, int write);
-int is_aligned_hugepage_range(unsigned long addr, unsigned long len);
-int pmd_huge(pmd_t pmd);
 struct page *alloc_huge_page(void);
 void free_huge_page(struct page *);
 
@@ -35,6 +30,17 @@ extern unsigned long max_huge_pages;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
 extern int sysctl_hugetlb_shm_group;
 
+/* arch callbacks */
+
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr);
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);
+struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
+			      int write);
+struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+				pmd_t *pmd, int write);
+int is_aligned_hugepage_range(unsigned long addr, unsigned long len);
+int pmd_huge(pmd_t pmd);
+
 #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
@@ -48,6 +54,28 @@ extern int sysctl_hugetlb_shm_group;
 int prepare_hugepage_range(unsigned long addr, unsigned long len);
 #endif
 
+#ifndef ARCH_HAS_SETCLEAR_HUGE_PTE
+#define set_huge_pte_at(mm, addr, ptep, pte)	set_pte_at(mm, addr, ptep, pte)
+#define huge_ptep_get_and_clear(mm, addr, ptep) ptep_get_and_clear(mm, addr, ptep)
+#else
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t pte);
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep);
+#endif
+
+#ifndef ARCH_HAS_HUGETLB_PREFAULT_HOOK
+#define hugetlb_prefault_arch_hook(mm)		do { } while (0)
+#else
+void hugetlb_prefault_arch_hook(struct mm_struct *mm);
+#endif
+
+#ifndef ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE
+#define hugetlb_clean_stale_pgtable(pte)	BUG()
+#else
+void hugetlb_clean_stale_pgtable(pte_t *pte);
+#endif
+
 #else /* !CONFIG_HUGETLB_PAGE */
 
 static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4eb5ae3fbe10..fbd1111ea119 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,10 +7,14 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
-#include <linux/hugetlb.h>
 #include <linux/sysctl.h>
 #include <linux/highmem.h>
 #include <linux/nodemask.h>
+#include <linux/pagemap.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include <linux/hugetlb.h>
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static unsigned long nr_huge_pages, free_huge_pages;
@@ -249,6 +253,72 @@ struct vm_operations_struct hugetlb_vm_ops = {
 	.nopage = hugetlb_nopage,
 };
 
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+{
+	pte_t entry;
+
+	if (vma->vm_flags & VM_WRITE) {
+		entry =
+		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+	} else {
+		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+	}
+	entry = pte_mkyoung(entry);
+	entry = pte_mkhuge(entry);
+
+	return entry;
+}
+
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+			    struct vm_area_struct *vma)
+{
+	pte_t *src_pte, *dst_pte, entry;
+	struct page *ptepage;
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+
+	while (addr < end) {
+		dst_pte = huge_pte_alloc(dst, addr);
+		if (!dst_pte)
+			goto nomem;
+		src_pte = huge_pte_offset(src, addr);
+		BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
+		entry = *src_pte;
+		ptepage = pte_page(entry);
+		get_page(ptepage);
+		add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+		set_huge_pte_at(dst, addr, dst_pte, entry);
+		addr += HPAGE_SIZE;
+	}
+	return 0;
+
+nomem:
+	return -ENOMEM;
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+			  unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t pte;
+	struct page *page;
+
+	WARN_ON(!is_vm_hugetlb_page(vma));
+	BUG_ON(start & ~HPAGE_MASK);
+	BUG_ON(end & ~HPAGE_MASK);
+
+	for (address = start; address < end; address += HPAGE_SIZE) {
+		pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address));
+		if (pte_none(pte))
+			continue;
+		page = pte_page(pte);
+		put_page(page);
+	}
+	add_mm_counter(mm, rss,  -((end - start) >> PAGE_SHIFT));
+	flush_tlb_range(vma, start, end);
+}
+
 void zap_hugepage_range(struct vm_area_struct *vma,
 			unsigned long start, unsigned long length)
 {
@@ -258,3 +328,108 @@ void zap_hugepage_range(struct vm_area_struct *vma,
 	unmap_hugepage_range(vma, start, start + length);
 	spin_unlock(&mm->page_table_lock);
 }
+
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr;
+	int ret = 0;
+
+	WARN_ON(!is_vm_hugetlb_page(vma));
+	BUG_ON(vma->vm_start & ~HPAGE_MASK);
+	BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+	hugetlb_prefault_arch_hook(mm);
+
+	spin_lock(&mm->page_table_lock);
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+		unsigned long idx;
+		pte_t *pte = huge_pte_alloc(mm, addr);
+		struct page *page;
+
+		if (!pte) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		if (! pte_none(*pte))
+			hugetlb_clean_stale_pgtable(pte);
+
+		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+		page = find_get_page(mapping, idx);
+		if (!page) {
+			/* charge the fs quota first */
+			if (hugetlb_get_quota(mapping)) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			page = alloc_huge_page();
+			if (!page) {
+				hugetlb_put_quota(mapping);
+				ret = -ENOMEM;
+				goto out;
+			}
+			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+			if (! ret) {
+				unlock_page(page);
+			} else {
+				hugetlb_put_quota(mapping);
+				free_huge_page(page);
+				goto out;
+			}
+		}
+		add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+		set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+	}
+out:
+	spin_unlock(&mm->page_table_lock);
+	return ret;
+}
+
+int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			struct page **pages, struct vm_area_struct **vmas,
+			unsigned long *position, int *length, int i)
+{
+	unsigned long vpfn, vaddr = *position;
+	int remainder = *length;
+
+	BUG_ON(!is_vm_hugetlb_page(vma));
+
+	vpfn = vaddr/PAGE_SIZE;
+	while (vaddr < vma->vm_end && remainder) {
+
+		if (pages) {
+			pte_t *pte;
+			struct page *page;
+
+			/* Some archs (sparc64, sh*) have multiple
+			 * pte_ts to each hugepage.  We have to make
+			 * sure we get the first, for the page
+			 * indexing below to work. */
+			pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+
+			/* hugetlb should be locked, and hence, prefaulted */
+			WARN_ON(!pte || pte_none(*pte));
+
+			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+
+			WARN_ON(!PageCompound(page));
+
+			get_page(page);
+			pages[i] = page;
+		}
+
+		if (vmas)
+			vmas[i] = vma;
+
+		vaddr += PAGE_SIZE;
+		++vpfn;
+		--remainder;
+		++i;
+	}
+
+	*length = remainder;
+	*position = vaddr;
+
+	return i;
+}
-- 
cgit v1.2.3


From 1363c3cd8603a913a27e2995dccbd70d5312d8e6 Mon Sep 17 00:00:00 2001
From: Wolfgang Wander <wwc@rentec.com>
Date: Tue, 21 Jun 2005 17:14:49 -0700
Subject: [PATCH] Avoiding mmap fragmentation

Ingo recently introduced a great speedup for allocating new mmaps using the
free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and
causes huge performance increases in thread creation.

The downside of this patch is that it does lead to fragmentation in the
mmap-ed areas (visible via /proc/self/maps), such that some applications
that work fine under 2.4 kernels quickly run out of memory on any 2.6
kernel.

The problem is twofold:

  1) the free_area_cache is used to continue a search for memory where
     the last search ended.  Before the change new areas were always
     searched from the base address on.

     So now new small areas are cluttering holes of all sizes
     throughout the whole mmap-able region whereas before small holes
     tended to close holes near the base leaving holes far from the base
     large and available for larger requests.

  2) the free_area_cache also is set to the location of the last
     munmap-ed area so in scenarios where we allocate e.g.  five regions of
     1K each, then free regions 4 2 3 in this order the next request for 1K
     will be placed in the position of the old region 3, whereas before we
     appended it to the still active region 1, placing it at the location
     of the old region 2.  Before we had 1 free region of 2K, now we only
     get two free regions of 1K -> fragmentation.

The patch addresses thes issues by introducing yet another cache descriptor
cached_hole_size that contains the largest known hole size below the
current free_area_cache.  If a new request comes in the size is compared
against the cached_hole_size and if the request can be filled with a hole
below free_area_cache the search is started from the base instead.

The results look promising: Whereas 2.6.12-rc4 fragments quickly and my
(earlier posted) leakme.c test program terminates after 50000+ iterations
with 96 distinct and fragmented maps in /proc/self/maps it performs nicely
(as expected) with thread creation, Ingo's test_str02 with 20000 threads
requires 0.7s system time.

Taking out Ingo's patch (un-patch available per request) by basically
deleting all mentions of free_area_cache from the kernel and starting the
search for new memory always at the respective bases we observe: leakme
terminates successfully with 11 distinctive hardly fragmented areas in
/proc/self/maps but thread creating is gringdingly slow: 30+s(!) system
time for Ingo's test_str02 with 20000 threads.

Now - drumroll ;-) the appended patch works fine with leakme: it ends with
only 7 distinct areas in /proc/self/maps and also thread creation seems
sufficiently fast with 0.71s for 20000 threads.

Signed-off-by: Wolfgang Wander <wwc@rentec.com>
Credit-to: "Richard Purdie" <rpurdie@rpsys.net>
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu> (partly)
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/mm/mmap.c              | 10 +++++++-
 arch/i386/mm/hugetlbpage.c      | 34 ++++++++++++++++++++++----
 arch/ppc64/mm/hugetlbpage.c     | 34 ++++++++++++++++++++++----
 arch/sh/kernel/sys_sh.c         |  8 +++++++
 arch/sparc64/kernel/sys_sparc.c |  8 +++++++
 arch/x86_64/ia32/ia32_aout.c    |  1 +
 arch/x86_64/kernel/sys_x86_64.c |  9 +++++++
 fs/binfmt_aout.c                |  1 +
 fs/binfmt_elf.c                 |  1 +
 fs/hugetlbfs/inode.c            |  3 +++
 include/linux/sched.h           | 11 +++++----
 kernel/fork.c                   |  2 ++
 mm/mmap.c                       | 53 +++++++++++++++++++++++++++++++----------
 mm/nommu.c                      |  2 +-
 14 files changed, 147 insertions(+), 30 deletions(-)

(limited to 'arch/sparc64')

diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 32c4b0e35b37..3de7f84b53c2 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -73,7 +73,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
+	if (len > mm->cached_hole_size) {
+	        start_addr = addr = mm->free_area_cache;
+	} else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
 
 full_search:
 	if (do_align)
@@ -90,6 +95,7 @@ full_search:
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -101,6 +107,8 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 		if (do_align)
 			addr = COLOUR_ALIGN(addr, pgoff);
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 5aa06001a4bd..3b099f32b948 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -140,7 +140,12 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 	struct vm_area_struct *vma;
 	unsigned long start_addr;
 
-	start_addr = mm->free_area_cache;
+	if (len > mm->cached_hole_size) {
+	        start_addr = mm->free_area_cache;
+	} else {
+	        start_addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
 
 full_search:
 	addr = ALIGN(start_addr, HPAGE_SIZE);
@@ -154,6 +159,7 @@ full_search:
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -162,6 +168,8 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
 	}
 }
@@ -173,12 +181,17 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev_vma;
 	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long largest_hole = mm->cached_hole_size;
 	int first_time = 1;
 
 	/* don't allow allocations above current base */
 	if (mm->free_area_cache > base)
 		mm->free_area_cache = base;
 
+	if (len <= largest_hole) {
+	        largest_hole = 0;
+		mm->free_area_cache  = base;
+	}
 try_again:
 	/* make sure it can fit in the remaining address space */
 	if (mm->free_area_cache < len)
@@ -199,13 +212,21 @@ try_again:
 		 * vma->vm_start, use it:
 		 */
 		if (addr + len <= vma->vm_start &&
-				(!prev_vma || (addr >= prev_vma->vm_end)))
+		            (!prev_vma || (addr >= prev_vma->vm_end))) {
 			/* remember the address as a hint for next time */
-			return (mm->free_area_cache = addr);
-		else
+		        mm->cached_hole_size = largest_hole;
+		        return (mm->free_area_cache = addr);
+		} else {
 			/* pull free_area_cache down to the first hole */
-			if (mm->free_area_cache == vma->vm_end)
+		        if (mm->free_area_cache == vma->vm_end) {
 				mm->free_area_cache = vma->vm_start;
+				mm->cached_hole_size = largest_hole;
+			}
+		}
+
+		/* remember the largest hole we saw so far */
+		if (addr + largest_hole < vma->vm_start)
+		        largest_hole = vma->vm_start - addr;
 
 		/* try just below the current vma->vm_start */
 		addr = (vma->vm_start - len) & HPAGE_MASK;
@@ -218,6 +239,7 @@ fail:
 	 */
 	if (first_time) {
 		mm->free_area_cache = base;
+		largest_hole = 0;
 		first_time = 0;
 		goto try_again;
 	}
@@ -228,6 +250,7 @@ fail:
 	 * allocations.
 	 */
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 	addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
 			len, pgoff, flags);
 
@@ -235,6 +258,7 @@ fail:
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = base;
+	mm->cached_hole_size = ~0UL;
 
 	return addr;
 }
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index b4ab766f5980..fdcfe97c75c1 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -292,7 +292,12 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		    && !is_hugepage_only_range(mm, addr,len))
 			return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
+	if (len > mm->cached_hole_size) {
+	        start_addr = addr = mm->free_area_cache;
+	} else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
 
 full_search:
 	vma = find_vma(mm, addr);
@@ -316,6 +321,8 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 		vma = vma->vm_next;
 	}
@@ -323,6 +330,7 @@ full_search:
 	/* Make sure we didn't miss any holes */
 	if (start_addr != TASK_UNMAPPED_BASE) {
 		start_addr = addr = TASK_UNMAPPED_BASE;
+		mm->cached_hole_size = 0;
 		goto full_search;
 	}
 	return -ENOMEM;
@@ -344,6 +352,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	struct vm_area_struct *vma, *prev_vma;
 	struct mm_struct *mm = current->mm;
 	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long largest_hole = mm->cached_hole_size;
 	int first_time = 1;
 
 	/* requested length too big for entire address space */
@@ -364,6 +373,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 			return addr;
 	}
 
+	if (len <= largest_hole) {
+	        largest_hole = 0;
+		mm->free_area_cache = base;
+	}
 try_again:
 	/* make sure it can fit in the remaining address space */
 	if (mm->free_area_cache < len)
@@ -392,13 +405,21 @@ hugepage_recheck:
 		 * vma->vm_start, use it:
 		 */
 		if (addr+len <= vma->vm_start &&
-				(!prev_vma || (addr >= prev_vma->vm_end)))
+		          (!prev_vma || (addr >= prev_vma->vm_end))) {
 			/* remember the address as a hint for next time */
-			return (mm->free_area_cache = addr);
-		else
+		        mm->cached_hole_size = largest_hole;
+		        return (mm->free_area_cache = addr);
+		} else {
 			/* pull free_area_cache down to the first hole */
-			if (mm->free_area_cache == vma->vm_end)
+		        if (mm->free_area_cache == vma->vm_end) {
 				mm->free_area_cache = vma->vm_start;
+				mm->cached_hole_size = largest_hole;
+			}
+		}
+
+		/* remember the largest hole we saw so far */
+		if (addr + largest_hole < vma->vm_start)
+		        largest_hole = vma->vm_start - addr;
 
 		/* try just below the current vma->vm_start */
 		addr = vma->vm_start-len;
@@ -411,6 +432,7 @@ fail:
 	 */
 	if (first_time) {
 		mm->free_area_cache = base;
+		largest_hole = 0;
 		first_time = 0;
 		goto try_again;
 	}
@@ -421,11 +443,13 @@ fail:
 	 * allocations.
 	 */
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 	/*
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = base;
+	mm->cached_hole_size = ~0UL;
 
 	return addr;
 }
diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c
index df5ac294c379..917b2f32f260 100644
--- a/arch/sh/kernel/sys_sh.c
+++ b/arch/sh/kernel/sys_sh.c
@@ -79,6 +79,10 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
+	if (len <= mm->cached_hole_size) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = TASK_UNMAPPED_BASE;
+	}
 	if (flags & MAP_PRIVATE)
 		addr = PAGE_ALIGN(mm->free_area_cache);
 	else
@@ -95,6 +99,7 @@ full_search:
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -106,6 +111,9 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+
 		addr = vma->vm_end;
 		if (!(flags & MAP_PRIVATE))
 			addr = COLOUR_ALIGN(addr);
diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c
index 0077f02f4b37..5f8c822a2b4a 100644
--- a/arch/sparc64/kernel/sys_sparc.c
+++ b/arch/sparc64/kernel/sys_sparc.c
@@ -84,6 +84,10 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
 			return addr;
 	}
 
+	if (len <= mm->cached_hole_size) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = TASK_UNMAPPED_BASE;
+	}
 	start_addr = addr = mm->free_area_cache;
 
 	task_size -= len;
@@ -103,6 +107,7 @@ full_search:
 		if (task_size < addr) {
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -114,6 +119,9 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+
 		addr = vma->vm_end;
 		if (do_color_align)
 			addr = COLOUR_ALIGN(addr, pgoff);
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 1965efc974dc..c12edf5d97f0 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -312,6 +312,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
+	current->mm->cached_hole_size = 0;
 
 	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
index d9798dd433fc..cc7821c68851 100644
--- a/arch/x86_64/kernel/sys_x86_64.c
+++ b/arch/x86_64/kernel/sys_x86_64.c
@@ -105,6 +105,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
+	if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
+	    && len <= mm->cached_hole_size) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = begin;
+	}
 	addr = mm->free_area_cache;
 	if (addr < begin) 
 		addr = begin; 
@@ -120,6 +125,7 @@ full_search:
 			 */
 			if (start_addr != begin) {
 				start_addr = addr = begin;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -131,6 +137,9 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+
 		addr = vma->vm_end;
 	}
 }
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 009b8920c1ff..dd9baabaf016 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -316,6 +316,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = 0;
 
 	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f8f6b6b76179..7976a238f0a3 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -775,6 +775,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	   change some of these later */
 	set_mm_counter(current->mm, rss, 0);
 	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = 0;
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
 				 executable_stack);
 	if (retval < 0) {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2af3338f891b..3a9b6d179cbd 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -122,6 +122,9 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
 	start_addr = mm->free_area_cache;
 
+	if (len <= mm->cached_hole_size)
+		start_addr = TASK_UNMAPPED_BASE;
+
 full_search:
 	addr = ALIGN(start_addr, HPAGE_SIZE);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4dbb109022f3..b58afd97a180 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -201,8 +201,8 @@ extern unsigned long
 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 			  unsigned long len, unsigned long pgoff,
 			  unsigned long flags);
-extern void arch_unmap_area(struct vm_area_struct *area);
-extern void arch_unmap_area_topdown(struct vm_area_struct *area);
+extern void arch_unmap_area(struct mm_struct *, unsigned long);
+extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 
 #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
 #define get_mm_counter(mm, member) ((mm)->_##member)
@@ -218,9 +218,10 @@ struct mm_struct {
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
-	void (*unmap_area) (struct vm_area_struct *area);
-	unsigned long mmap_base;		/* base of mmap area */
-	unsigned long free_area_cache;		/* first hole */
+	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
+        unsigned long mmap_base;		/* base of mmap area */
+        unsigned long cached_hole_size;         /* if non-zero, the largest hole below free_area_cache */
+	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
 	pgd_t * pgd;
 	atomic_t mm_users;			/* How many users with user space? */
 	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
diff --git a/kernel/fork.c b/kernel/fork.c
index f42a17f88699..876b31cd822d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -194,6 +194,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
 	mm->free_area_cache = oldmm->mmap_base;
+	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
 	set_mm_counter(mm, rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
@@ -322,6 +323,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	mm->ioctx_list = NULL;
 	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
diff --git a/mm/mmap.c b/mm/mmap.c
index de54acd9942f..9da23c1ef9dc 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1175,7 +1175,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
+	if (len > mm->cached_hole_size) {
+	        start_addr = addr = mm->free_area_cache;
+	} else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
 
 full_search:
 	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
@@ -1186,7 +1191,9 @@ full_search:
 			 * some holes.
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
-				start_addr = addr = TASK_UNMAPPED_BASE;
+				addr = TASK_UNMAPPED_BASE;
+			        start_addr = addr;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -1198,19 +1205,22 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 	}
 }
 #endif	
 
-void arch_unmap_area(struct vm_area_struct *area)
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
 {
 	/*
 	 * Is this a new hole at the lowest possible address?
 	 */
-	if (area->vm_start >= TASK_UNMAPPED_BASE &&
-			area->vm_start < area->vm_mm->free_area_cache)
-		area->vm_mm->free_area_cache = area->vm_start;
+	if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
+		mm->free_area_cache = addr;
+		mm->cached_hole_size = ~0UL;
+	}
 }
 
 /*
@@ -1240,6 +1250,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 			return addr;
 	}
 
+	/* check if free_area_cache is useful for us */
+	if (len <= mm->cached_hole_size) {
+ 	        mm->cached_hole_size = 0;
+ 		mm->free_area_cache = mm->mmap_base;
+ 	}
+
 	/* either no address requested or can't fit in requested address hole */
 	addr = mm->free_area_cache;
 
@@ -1264,6 +1280,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 			/* remember the address as a hint for next time */
 			return (mm->free_area_cache = addr);
 
+ 		/* remember the largest hole we saw so far */
+ 		if (addr + mm->cached_hole_size < vma->vm_start)
+ 		        mm->cached_hole_size = vma->vm_start - addr;
+
 		/* try just below the current vma->vm_start */
 		addr = vma->vm_start-len;
 	} while (len < vma->vm_start);
@@ -1274,28 +1294,30 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	 * can happen with large stack limits and large mmap()
 	 * allocations.
 	 */
-	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
+  	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 	/*
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = mm->mmap_base;
+	mm->cached_hole_size = ~0UL;
 
 	return addr;
 }
 #endif
 
-void arch_unmap_area_topdown(struct vm_area_struct *area)
+void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
 {
 	/*
 	 * Is this a new hole at the highest possible address?
 	 */
-	if (area->vm_end > area->vm_mm->free_area_cache)
-		area->vm_mm->free_area_cache = area->vm_end;
+	if (addr > mm->free_area_cache)
+		mm->free_area_cache = addr;
 
 	/* dont allow allocations above current base */
-	if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base)
-		area->vm_mm->free_area_cache = area->vm_mm->mmap_base;
+	if (mm->free_area_cache > mm->mmap_base)
+		mm->free_area_cache = mm->mmap_base;
 }
 
 unsigned long
@@ -1595,7 +1617,6 @@ static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
 	if (area->vm_flags & VM_LOCKED)
 		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
 	vm_stat_unaccount(area);
-	area->vm_mm->unmap_area(area);
 	remove_vm_struct(area);
 }
 
@@ -1649,6 +1670,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct vm_area_struct **insertion_point;
 	struct vm_area_struct *tail_vma = NULL;
+	unsigned long addr;
 
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	do {
@@ -1659,6 +1681,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	} while (vma && vma->vm_start < end);
 	*insertion_point = vma;
 	tail_vma->vm_next = NULL;
+	if (mm->unmap_area == arch_unmap_area)
+		addr = prev ? prev->vm_end : mm->mmap_base;
+	else
+		addr = vma ?  vma->vm_start : mm->mmap_base;
+	mm->unmap_area(mm, addr);
 	mm->mmap_cache = NULL;		/* Kill the cache. */
 }
 
diff --git a/mm/nommu.c b/mm/nommu.c
index c53e9c8f6b4a..ce74452c02d9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1067,7 +1067,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
 	return -ENOMEM;
 }
 
-void arch_unmap_area(struct vm_area_struct *area)
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
 {
 }
 
-- 
cgit v1.2.3