35 files changed, 5245 insertions, 0 deletions
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
new file mode 100644
index 000000000000..329da276c6f1
--- /dev/null
+++ b/arch/x86/lib/Makefile
@@ -0,0 +1,5 @@
+ifeq ($(CONFIG_X86_32),y)
+include ${srctree}/arch/x86/lib/Makefile_32
+else
+include ${srctree}/arch/x86/lib/Makefile_64
+endif
diff --git a/arch/x86/lib/Makefile_32 b/arch/x86/lib/Makefile_32
new file mode 100644
index 000000000000..98d1f1e2e2ef
--- /dev/null
+++ b/arch/x86/lib/Makefile_32
@@ -0,0 +1,11 @@
+#
+# Makefile for i386-specific library files..
+#
+
+
+lib-y = checksum_32.o delay_32.o usercopy_32.o getuser_32.o putuser_32.o memcpy_32.o strstr_32.o \
+	bitops_32.o semaphore_32.o string_32.o
+
+lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
+
+obj-$(CONFIG_SMP)	+= msr-on-cpu.o
diff --git a/arch/x86/lib/Makefile_64 b/arch/x86/lib/Makefile_64
new file mode 100644
index 000000000000..bbabad3c9335
--- /dev/null
+++ b/arch/x86/lib/Makefile_64
@@ -0,0 +1,13 @@
+#
+# Makefile for x86_64-specific library files.
+#
+
+CFLAGS_csum-partial_64.o := -funroll-loops
+
+obj-y := io_64.o iomap_copy_64.o
+obj-$(CONFIG_SMP)	+= msr-on-cpu.o
+
+lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
+	usercopy_64.o getuser_64.o putuser_64.o  \
+	thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
+lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c
new file mode 100644
index 000000000000..afd0045595d4
--- /dev/null
+++ b/arch/x86/lib/bitops_32.c
@@ -0,0 +1,70 @@
+#include <linux/bitops.h>
+#include <linux/module.h>
+
+/**
+ * find_next_bit - find the first set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+int find_next_bit(const unsigned long *addr, int size, int offset)
+{
+	const unsigned long *p = addr + (offset >> 5);
+	int set = 0, bit = offset & 31, res;
+
+	if (bit) {
+		/*
+		 * Look for nonzero in the first 32 bits:
+		 */
+		__asm__("bsfl %1,%0\n\t"
+			"jne 1f\n\t"
+			"movl $32, %0\n"
+			"1:"
+			: "=r" (set)
+			: "r" (*p >> bit));
+		if (set < (32 - bit))
+			return set + offset;
+		set = 32 - bit;
+		p++;
+	}
+	/*
+	 * No set bit yet, search remaining full words for a bit
+	 */
+	res = find_first_bit (p, size - 32 * (p - addr));
+	return (offset + set + res);
+}
+EXPORT_SYMBOL(find_next_bit);
+
+/**
+ * find_next_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+int find_next_zero_bit(const unsigned long *addr, int size, int offset)
+{
+	const unsigned long *p = addr + (offset >> 5);
+	int set = 0, bit = offset & 31, res;
+
+	if (bit) {
+		/*
+		 * Look for zero in the first 32 bits.
+		 */
+		__asm__("bsfl %1,%0\n\t"
+			"jne 1f\n\t"
+			"movl $32, %0\n"
+			"1:"
+			: "=r" (set)
+			: "r" (~(*p >> bit)));
+		if (set < (32 - bit))
+			return set + offset;
+		set = 32 - bit;
+		p++;
+	}
+	/*
+	 * No zero yet, search remaining full bytes for a zero
+	 */
+	res = find_first_zero_bit(p, size - 32 * (p - addr));
+	return (offset + set + res);
+}
+EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c
new file mode 100644
index 000000000000..95b6d9639fba
--- /dev/null
+++ b/arch/x86/lib/bitops_64.c
@@ -0,0 +1,175 @@
+#include <linux/bitops.h>
+
+#undef find_first_zero_bit
+#undef find_next_zero_bit
+#undef find_first_bit
+#undef find_next_bit
+
+static inline long
+__find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+	long d0, d1, d2;
+	long res;
+
+	/*
+	 * We must test the size in words, not in bits, because
+	 * otherwise incoming sizes in the range -63..-1 will not run
+	 * any scasq instructions, and then the flags used by the je
+	 * instruction will have whatever random value was in place
+	 * before.  Nobody should call us like that, but
+	 * find_next_zero_bit() does when offset and size are at the
+	 * same word and it fails to find a zero itself.
+	 */
+	size += 63;
+	size >>= 6;
+	if (!size)
+		return 0;
+	asm volatile(
+		"  repe; scasq\n"
+		"  je 1f\n"
+		"  xorq -8(%%rdi),%%rax\n"
+		"  subq $8,%%rdi\n"
+		"  bsfq %%rax,%%rdx\n"
+		"1:  subq %[addr],%%rdi\n"
+		"  shlq $3,%%rdi\n"
+		"  addq %%rdi,%%rdx"
+		:"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
+		:"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
+		 [addr] "S" (addr) : "memory");
+	/*
+	 * Any register would do for [addr] above, but GCC tends to
+	 * prefer rbx over rsi, even though rsi is readily available
+	 * and doesn't have to be saved.
+	 */
+	return res;
+}
+
+/**
+ * find_first_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first zero bit, not the number of the byte
+ * containing a bit.
+ */
+long find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+	return __find_first_zero_bit (addr, size);
+}
+
+/**
+ * find_next_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+long find_next_zero_bit (const unsigned long * addr, long size, long offset)
+{
+	const unsigned long * p = addr + (offset >> 6);
+	unsigned long set = 0;
+	unsigned long res, bit = offset&63;
+
+	if (bit) {
+		/*
+		 * Look for zero in first word
+		 */
+		asm("bsfq %1,%0\n\t"
+		    "cmoveq %2,%0"
+		    : "=r" (set)
+		    : "r" (~(*p >> bit)), "r"(64L));
+		if (set < (64 - bit))
+			return set + offset;
+		set = 64 - bit;
+		p++;
+	}
+	/*
+	 * No zero yet, search remaining full words for a zero
+	 */
+	res = __find_first_zero_bit (p, size - 64 * (p - addr));
+
+	return (offset + set + res);
+}
+
+static inline long
+__find_first_bit(const unsigned long * addr, unsigned long size)
+{
+	long d0, d1;
+	long res;
+
+	/*
+	 * We must test the size in words, not in bits, because
+	 * otherwise incoming sizes in the range -63..-1 will not run
+	 * any scasq instructions, and then the flags used by the jz
+	 * instruction will have whatever random value was in place
+	 * before.  Nobody should call us like that, but
+	 * find_next_bit() does when offset and size are at the same
+	 * word and it fails to find a one itself.
+	 */
+	size += 63;
+	size >>= 6;
+	if (!size)
+		return 0;
+	asm volatile(
+		"   repe; scasq\n"
+		"   jz 1f\n"
+		"   subq $8,%%rdi\n"
+		"   bsfq (%%rdi),%%rax\n"
+		"1: subq %[addr],%%rdi\n"
+		"   shlq $3,%%rdi\n"
+		"   addq %%rdi,%%rax"
+		:"=a" (res), "=&c" (d0), "=&D" (d1)
+		:"0" (0ULL), "1" (size), "2" (addr),
+		 [addr] "r" (addr) : "memory");
+	return res;
+}
+
+/**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first set bit, not the number of the byte
+ * containing a bit.
+ */
+long find_first_bit(const unsigned long * addr, unsigned long size)
+{
+	return __find_first_bit(addr,size);
+}
+
+/**
+ * find_next_bit - find the first set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
+ */
+long find_next_bit(const unsigned long * addr, long size, long offset)
+{
+	const unsigned long * p = addr + (offset >> 6);
+	unsigned long set = 0, bit = offset & 63, res;
+
+	if (bit) {
+		/*
+		 * Look for nonzero in the first 64 bits:
+		 */
+		asm("bsfq %1,%0\n\t"
+		    "cmoveq %2,%0\n\t"
+		    : "=r" (set)
+		    : "r" (*p >> bit), "r" (64L));
+		if (set < (64 - bit))
+			return set + offset;
+		set = 64 - bit;
+		p++;
+	}
+	/*
+	 * No set bit yet, search remaining full words for a bit
+	 */
+	res = __find_first_bit (p, size - 64 * (p - addr));
+	return (offset + set + res);
+}
+
+#include <linux/module.h>
+
+EXPORT_SYMBOL(find_next_bit);
+EXPORT_SYMBOL(find_first_bit);
+EXPORT_SYMBOL(find_first_zero_bit);
+EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/bitstr_64.c b/arch/x86/lib/bitstr_64.c
new file mode 100644
index 000000000000..24676609a6ac
--- /dev/null
+++ b/arch/x86/lib/bitstr_64.c
@@ -0,0 +1,28 @@
+#include <linux/module.h>
+#include <linux/bitops.h>
+
+/* Find string of zero bits in a bitmap */ 
+unsigned long 
+find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
+{ 
+	unsigned long n, end, i; 	
+
+ again:
+	n = find_next_zero_bit(bitmap, nbits, start);
+	if (n == -1) 
+		return -1;
+	
+	/* could test bitsliced, but it's hardly worth it */
+	end = n+len;
+	if (end >= nbits) 
+		return -1; 
+	for (i = n+1; i < end; i++) { 
+		if (test_bit(i, bitmap)) {  
+			start = i+1; 
+			goto again; 
+		} 
+	}
+	return n;
+}
+
+EXPORT_SYMBOL(find_next_zero_string);
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
new file mode 100644
index 000000000000..adbccd0bbb78
--- /dev/null
+++ b/arch/x86/lib/checksum_32.S
@@ -0,0 +1,546 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IP/TCP/UDP checksumming routines
+ *
+ * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Tom May, <ftom@netcom.com>
+ *              Pentium Pro/II routines:
+ *              Alexander Kjeldaas <astor@guardian.no>
+ *              Finn Arne Gangstad <finnag@guardian.no>
+ *		Lots of code moved from tcp.c and ip.c; see those files
+ *		for more names.
+ *
+ * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
+ *			     handling.
+ *		Andi Kleen,  add zeroing on error
+ *                   converted to pure assembler
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/errno.h>
+				
+/*
+ * computes a partial checksum, e.g. for TCP/UDP fragments
+ */
+
+/*	
+unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
+ */
+		
+.text
+		
+#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
+
+	  /*		
+	   * Experiments with Ethernet and SLIP connections show that buff
+	   * is aligned on either a 2-byte or 4-byte boundary.  We get at
+	   * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
+	   * Fortunately, it is easy to convert 2-byte alignment to 4-byte
+	   * alignment for the unrolled loop.
+	   */		
+ENTRY(csum_partial)
+	CFI_STARTPROC
+	pushl %esi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET esi, 0
+	pushl %ebx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ebx, 0
+	movl 20(%esp),%eax	# Function arg: unsigned int sum
+	movl 16(%esp),%ecx	# Function arg: int len
+	movl 12(%esp),%esi	# Function arg: unsigned char *buff
+	testl $3, %esi		# Check alignment.
+	jz 2f			# Jump if alignment is ok.
+	testl $1, %esi		# Check alignment.
+	jz 10f			# Jump if alignment is boundary of 2bytes.
+
+	# buf is odd
+	dec %ecx
+	jl 8f
+	movzbl (%esi), %ebx
+	adcl %ebx, %eax
+	roll $8, %eax
+	inc %esi
+	testl $2, %esi
+	jz 2f
+10:
+	subl $2, %ecx		# Alignment uses up two bytes.
+	jae 1f			# Jump if we had at least two bytes.
+	addl $2, %ecx		# ecx was < 2.  Deal with it.
+	jmp 4f
+1:	movw (%esi), %bx
+	addl $2, %esi
+	addw %bx, %ax
+	adcl $0, %eax
+2:
+	movl %ecx, %edx
+	shrl $5, %ecx
+	jz 2f
+	testl %esi, %esi
+1:	movl (%esi), %ebx
+	adcl %ebx, %eax
+	movl 4(%esi), %ebx
+	adcl %ebx, %eax
+	movl 8(%esi), %ebx
+	adcl %ebx, %eax
+	movl 12(%esi), %ebx
+	adcl %ebx, %eax
+	movl 16(%esi), %ebx
+	adcl %ebx, %eax
+	movl 20(%esi), %ebx
+	adcl %ebx, %eax
+	movl 24(%esi), %ebx
+	adcl %ebx, %eax
+	movl 28(%esi), %ebx
+	adcl %ebx, %eax
+	lea 32(%esi), %esi
+	dec %ecx
+	jne 1b
+	adcl $0, %eax
+2:	movl %edx, %ecx
+	andl $0x1c, %edx
+	je 4f
+	shrl $2, %edx		# This clears CF
+3:	adcl (%esi), %eax
+	lea 4(%esi), %esi
+	dec %edx
+	jne 3b
+	adcl $0, %eax
+4:	andl $3, %ecx
+	jz 7f
+	cmpl $2, %ecx
+	jb 5f
+	movw (%esi),%cx
+	leal 2(%esi),%esi
+	je 6f
+	shll $16,%ecx
+5:	movb (%esi),%cl
+6:	addl %ecx,%eax
+	adcl $0, %eax 
+7:	
+	testl $1, 12(%esp)
+	jz 8f
+	roll $8, %eax
+8:
+	popl %ebx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE ebx
+	popl %esi
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE esi
+	ret
+	CFI_ENDPROC
+ENDPROC(csum_partial)
+
+#else
+
+/* Version for PentiumII/PPro */
+
+ENTRY(csum_partial)
+	CFI_STARTPROC
+	pushl %esi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET esi, 0
+	pushl %ebx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ebx, 0
+	movl 20(%esp),%eax	# Function arg: unsigned int sum
+	movl 16(%esp),%ecx	# Function arg: int len
+	movl 12(%esp),%esi	# Function arg:	const unsigned char *buf
+
+	testl $3, %esi         
+	jnz 25f                 
+10:
+	movl %ecx, %edx
+	movl %ecx, %ebx
+	andl $0x7c, %ebx
+	shrl $7, %ecx
+	addl %ebx,%esi
+	shrl $2, %ebx  
+	negl %ebx
+	lea 45f(%ebx,%ebx,2), %ebx
+	testl %esi, %esi
+	jmp *%ebx
+
+	# Handle 2-byte-aligned regions
+20:	addw (%esi), %ax
+	lea 2(%esi), %esi
+	adcl $0, %eax
+	jmp 10b
+25:
+	testl $1, %esi         
+	jz 30f                 
+	# buf is odd
+	dec %ecx
+	jl 90f
+	movzbl (%esi), %ebx
+	addl %ebx, %eax
+	adcl $0, %eax
+	roll $8, %eax
+	inc %esi
+	testl $2, %esi
+	jz 10b
+
+30:	subl $2, %ecx          
+	ja 20b                 
+	je 32f
+	addl $2, %ecx
+	jz 80f
+	movzbl (%esi),%ebx	# csumming 1 byte, 2-aligned
+	addl %ebx, %eax
+	adcl $0, %eax
+	jmp 80f
+32:
+	addw (%esi), %ax	# csumming 2 bytes, 2-aligned
+	adcl $0, %eax
+	jmp 80f
+
+40: 
+	addl -128(%esi), %eax
+	adcl -124(%esi), %eax
+	adcl -120(%esi), %eax
+	adcl -116(%esi), %eax   
+	adcl -112(%esi), %eax   
+	adcl -108(%esi), %eax
+	adcl -104(%esi), %eax
+	adcl -100(%esi), %eax
+	adcl -96(%esi), %eax
+	adcl -92(%esi), %eax
+	adcl -88(%esi), %eax
+	adcl -84(%esi), %eax
+	adcl -80(%esi), %eax
+	adcl -76(%esi), %eax
+	adcl -72(%esi), %eax
+	adcl -68(%esi), %eax
+	adcl -64(%esi), %eax     
+	adcl -60(%esi), %eax     
+	adcl -56(%esi), %eax     
+	adcl -52(%esi), %eax   
+	adcl -48(%esi), %eax   
+	adcl -44(%esi), %eax
+	adcl -40(%esi), %eax
+	adcl -36(%esi), %eax
+	adcl -32(%esi), %eax
+	adcl -28(%esi), %eax
+	adcl -24(%esi), %eax
+	adcl -20(%esi), %eax
+	adcl -16(%esi), %eax
+	adcl -12(%esi), %eax
+	adcl -8(%esi), %eax
+	adcl -4(%esi), %eax
+45:
+	lea 128(%esi), %esi
+	adcl $0, %eax
+	dec %ecx
+	jge 40b
+	movl %edx, %ecx
+50:	andl $3, %ecx
+	jz 80f
+
+	# Handle the last 1-3 bytes without jumping
+	notl %ecx		# 1->2, 2->1, 3->0, higher bits are masked
+	movl $0xffffff,%ebx	# by the shll and shrl instructions
+	shll $3,%ecx
+	shrl %cl,%ebx
+	andl -128(%esi),%ebx	# esi is 4-aligned so should be ok
+	addl %ebx,%eax
+	adcl $0,%eax
+80: 
+	testl $1, 12(%esp)
+	jz 90f
+	roll $8, %eax
+90: 
+	popl %ebx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE ebx
+	popl %esi
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE esi
+	ret
+	CFI_ENDPROC
+ENDPROC(csum_partial)
+				
+#endif
+
+/*
+unsigned int csum_partial_copy_generic (const char *src, char *dst,
+				  int len, int sum, int *src_err_ptr, int *dst_err_ptr)
+ */ 
+
+/*
+ * Copy from ds while checksumming, otherwise like csum_partial
+ *
+ * The macros SRC and DST specify the type of access for the instruction.
+ * thus we can call a custom exception handler for all access types.
+ *
+ * FIXME: could someone double-check whether I haven't mixed up some SRC and
+ *	  DST definitions? It's damn hard to trigger all cases.  I hope I got
+ *	  them all but there's no guarantee.
+ */
+
+#define SRC(y...)			\
+	9999: y;			\
+	.section __ex_table, "a";	\
+	.long 9999b, 6001f	;	\
+	.previous
+
+#define DST(y...)			\
+	9999: y;			\
+	.section __ex_table, "a";	\
+	.long 9999b, 6002f	;	\
+	.previous
+
+#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
+
+#define ARGBASE 16		
+#define FP		12
+		
+ENTRY(csum_partial_copy_generic)
+	CFI_STARTPROC
+	subl  $4,%esp	
+	CFI_ADJUST_CFA_OFFSET 4
+	pushl %edi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edi, 0
+	pushl %esi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET esi, 0
+	pushl %ebx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ebx, 0
+	movl ARGBASE+16(%esp),%eax	# sum
+	movl ARGBASE+12(%esp),%ecx	# len
+	movl ARGBASE+4(%esp),%esi	# src
+	movl ARGBASE+8(%esp),%edi	# dst
+
+	testl $2, %edi			# Check alignment. 
+	jz 2f				# Jump if alignment is ok.
+	subl $2, %ecx			# Alignment uses up two bytes.
+	jae 1f				# Jump if we had at least two bytes.
+	addl $2, %ecx			# ecx was < 2.  Deal with it.
+	jmp 4f
+SRC(1:	movw (%esi), %bx	)
+	addl $2, %esi
+DST(	movw %bx, (%edi)	)
+	addl $2, %edi
+	addw %bx, %ax	
+	adcl $0, %eax
+2:
+	movl %ecx, FP(%esp)
+	shrl $5, %ecx
+	jz 2f
+	testl %esi, %esi
+SRC(1:	movl (%esi), %ebx	)
+SRC(	movl 4(%esi), %edx	)
+	adcl %ebx, %eax
+DST(	movl %ebx, (%edi)	)
+	adcl %edx, %eax
+DST(	movl %edx, 4(%edi)	)
+
+SRC(	movl 8(%esi), %ebx	)
+SRC(	movl 12(%esi), %edx	)
+	adcl %ebx, %eax
+DST(	movl %ebx, 8(%edi)	)
+	adcl %edx, %eax
+DST(	movl %edx, 12(%edi)	)
+
+SRC(	movl 16(%esi), %ebx 	)
+SRC(	movl 20(%esi), %edx	)
+	adcl %ebx, %eax
+DST(	movl %ebx, 16(%edi)	)
+	adcl %edx, %eax
+DST(	movl %edx, 20(%edi)	)
+
+SRC(	movl 24(%esi), %ebx	)
+SRC(	movl 28(%esi), %edx	)
+	adcl %ebx, %eax
+DST(	movl %ebx, 24(%edi)	)
+	adcl %edx, %eax
+DST(	movl %edx, 28(%edi)	)
+
+	lea 32(%esi), %esi
+	lea 32(%edi), %edi
+	dec %ecx
+	jne 1b
+	adcl $0, %eax
+2:	movl FP(%esp), %edx
+	movl %edx, %ecx
+	andl $0x1c, %edx
+	je 4f
+	shrl $2, %edx			# This clears CF
+SRC(3:	movl (%esi), %ebx	)
+	adcl %ebx, %eax
+DST(	movl %ebx, (%edi)	)
+	lea 4(%esi), %esi
+	lea 4(%edi), %edi
+	dec %edx
+	jne 3b
+	adcl $0, %eax
+4:	andl $3, %ecx
+	jz 7f
+	cmpl $2, %ecx
+	jb 5f
+SRC(	movw (%esi), %cx	)
+	leal 2(%esi), %esi
+DST(	movw %cx, (%edi)	)
+	leal 2(%edi), %edi
+	je 6f
+	shll $16,%ecx
+SRC(5:	movb (%esi), %cl	)
+DST(	movb %cl, (%edi)	)
+6:	addl %ecx, %eax
+	adcl $0, %eax
+7:
+5000:
+
+# Exception handler:
+.section .fixup, "ax"							
+
+6001:
+	movl ARGBASE+20(%esp), %ebx	# src_err_ptr
+	movl $-EFAULT, (%ebx)
+
+	# zero the complete destination - computing the rest
+	# is too much work 
+	movl ARGBASE+8(%esp), %edi	# dst
+	movl ARGBASE+12(%esp), %ecx	# len
+	xorl %eax,%eax
+	rep ; stosb
+
+	jmp 5000b
+
+6002:
+	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
+	movl $-EFAULT,(%ebx)
+	jmp 5000b
+
+.previous
+
+	popl %ebx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE ebx
+	popl %esi
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE esi
+	popl %edi
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE edi
+	popl %ecx			# equivalent to addl $4,%esp
+	CFI_ADJUST_CFA_OFFSET -4
+	ret	
+	CFI_ENDPROC
+ENDPROC(csum_partial_copy_generic)
+
+#else
+
+/* Version for PentiumII/PPro */
+
+#define ROUND1(x) \
+	SRC(movl x(%esi), %ebx	)	;	\
+	addl %ebx, %eax			;	\
+	DST(movl %ebx, x(%edi)	)	; 
+
+#define ROUND(x) \
+	SRC(movl x(%esi), %ebx	)	;	\
+	adcl %ebx, %eax			;	\
+	DST(movl %ebx, x(%edi)	)	;
+
+#define ARGBASE 12
+		
+ENTRY(csum_partial_copy_generic)
+	CFI_STARTPROC
+	pushl %ebx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ebx, 0
+	pushl %edi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edi, 0
+	pushl %esi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET esi, 0
+	movl ARGBASE+4(%esp),%esi	#src
+	movl ARGBASE+8(%esp),%edi	#dst	
+	movl ARGBASE+12(%esp),%ecx	#len
+	movl ARGBASE+16(%esp),%eax	#sum
+#	movl %ecx, %edx  
+	movl %ecx, %ebx  
+	movl %esi, %edx
+	shrl $6, %ecx     
+	andl $0x3c, %ebx  
+	negl %ebx
+	subl %ebx, %esi  
+	subl %ebx, %edi  
+	lea  -1(%esi),%edx
+	andl $-32,%edx
+	lea 3f(%ebx,%ebx), %ebx
+	testl %esi, %esi 
+	jmp *%ebx
+1:	addl $64,%esi
+	addl $64,%edi 
+	SRC(movb -32(%edx),%bl)	; SRC(movb (%edx),%bl)
+	ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)	
+	ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)	
+	ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)	
+	ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)	
+3:	adcl $0,%eax
+	addl $64, %edx
+	dec %ecx
+	jge 1b
+4:	movl ARGBASE+12(%esp),%edx	#len
+	andl $3, %edx
+	jz 7f
+	cmpl $2, %edx
+	jb 5f
+SRC(	movw (%esi), %dx         )
+	leal 2(%esi), %esi
+DST(	movw %dx, (%edi)         )
+	leal 2(%edi), %edi
+	je 6f
+	shll $16,%edx
+5:
+SRC(	movb (%esi), %dl         )
+DST(	movb %dl, (%edi)         )
+6:	addl %edx, %eax
+	adcl $0, %eax
+7:
+.section .fixup, "ax"
+6001:	movl	ARGBASE+20(%esp), %ebx	# src_err_ptr	
+	movl $-EFAULT, (%ebx)
+	# zero the complete destination (computing the rest is too much work)
+	movl ARGBASE+8(%esp),%edi	# dst
+	movl ARGBASE+12(%esp),%ecx	# len
+	xorl %eax,%eax
+	rep; stosb
+	jmp 7b
+6002:	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
+	movl $-EFAULT, (%ebx)
+	jmp  7b			
+.previous				
+
+	popl %esi
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE esi
+	popl %edi
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE edi
+	popl %ebx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE ebx
+	ret
+	CFI_ENDPROC
+ENDPROC(csum_partial_copy_generic)
+				
+#undef ROUND
+#undef ROUND1		
+		
+#endif
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
new file mode 100644
index 000000000000..9a10a78bb4a4
--- /dev/null
+++ b/arch/x86/lib/clear_page_64.S
@@ -0,0 +1,59 @@
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * Zero a page. 	
+ * rdi	page
+ */			
+	ALIGN
+clear_page_c:
+	CFI_STARTPROC
+	movl $4096/8,%ecx
+	xorl %eax,%eax
+	rep stosq
+	ret
+	CFI_ENDPROC
+ENDPROC(clear_page)
+
+ENTRY(clear_page)
+	CFI_STARTPROC
+	xorl   %eax,%eax
+	movl   $4096/64,%ecx
+	.p2align 4
+.Lloop:
+	decl	%ecx
+#define PUT(x) movq %rax,x*8(%rdi)
+	movq %rax,(%rdi)
+	PUT(1)
+	PUT(2)
+	PUT(3)
+	PUT(4)
+	PUT(5)
+	PUT(6)
+	PUT(7)
+	leaq	64(%rdi),%rdi
+	jnz	.Lloop
+	nop
+	ret
+	CFI_ENDPROC
+.Lclear_page_end:
+ENDPROC(clear_page)
+
+	/* Some CPUs run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+	.section .altinstr_replacement,"ax"
+1:	.byte 0xeb					/* jmp <disp8> */
+	.byte (clear_page_c - clear_page) - (2f - 1b)	/* offset */
+2:
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad clear_page
+	.quad 1b
+	.byte X86_FEATURE_REP_GOOD
+	.byte .Lclear_page_end - clear_page
+	.byte 2b - 1b
+	.previous
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
new file mode 100644
index 000000000000..727a5d46d2fc
--- /dev/null
+++ b/arch/x86/lib/copy_page_64.S
@@ -0,0 +1,119 @@
+/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+	ALIGN
+copy_page_c:
+	CFI_STARTPROC
+	movl $4096/8,%ecx
+	rep movsq
+	ret
+	CFI_ENDPROC
+ENDPROC(copy_page_c)
+
+/* Don't use streaming store because it's better when the target
+   ends up in cache. */
+	    
+/* Could vary the prefetch distance based on SMP/UP */
+
+ENTRY(copy_page)
+	CFI_STARTPROC
+	subq	$3*8,%rsp
+	CFI_ADJUST_CFA_OFFSET 3*8
+	movq	%rbx,(%rsp)
+	CFI_REL_OFFSET rbx, 0
+	movq	%r12,1*8(%rsp)
+	CFI_REL_OFFSET r12, 1*8
+	movq	%r13,2*8(%rsp)
+	CFI_REL_OFFSET r13, 2*8
+
+	movl	$(4096/64)-5,%ecx
+	.p2align 4
+.Loop64:
+  	dec     %rcx
+
+	movq        (%rsi), %rax
+	movq      8 (%rsi), %rbx
+	movq     16 (%rsi), %rdx
+	movq     24 (%rsi), %r8
+	movq     32 (%rsi), %r9
+	movq     40 (%rsi), %r10
+	movq     48 (%rsi), %r11
+	movq     56 (%rsi), %r12
+
+	prefetcht0 5*64(%rsi)
+
+	movq     %rax,    (%rdi)
+	movq     %rbx,  8 (%rdi)
+	movq     %rdx, 16 (%rdi)
+	movq     %r8,  24 (%rdi)
+	movq     %r9,  32 (%rdi)
+	movq     %r10, 40 (%rdi)
+	movq     %r11, 48 (%rdi)
+	movq     %r12, 56 (%rdi)
+
+	leaq    64 (%rsi), %rsi
+	leaq    64 (%rdi), %rdi
+
+	jnz     .Loop64
+
+	movl	$5,%ecx
+	.p2align 4
+.Loop2:
+	decl   %ecx
+
+	movq        (%rsi), %rax
+	movq      8 (%rsi), %rbx
+	movq     16 (%rsi), %rdx
+	movq     24 (%rsi), %r8
+	movq     32 (%rsi), %r9
+	movq     40 (%rsi), %r10
+	movq     48 (%rsi), %r11
+	movq     56 (%rsi), %r12
+
+	movq     %rax,    (%rdi)
+	movq     %rbx,  8 (%rdi)
+	movq     %rdx, 16 (%rdi)
+	movq     %r8,  24 (%rdi)
+	movq     %r9,  32 (%rdi)
+	movq     %r10, 40 (%rdi)
+	movq     %r11, 48 (%rdi)
+	movq     %r12, 56 (%rdi)
+
+	leaq	64(%rdi),%rdi
+	leaq	64(%rsi),%rsi
+
+	jnz	.Loop2
+
+	movq	(%rsp),%rbx
+	CFI_RESTORE rbx
+	movq	1*8(%rsp),%r12
+	CFI_RESTORE r12
+	movq	2*8(%rsp),%r13
+	CFI_RESTORE r13
+	addq	$3*8,%rsp
+	CFI_ADJUST_CFA_OFFSET -3*8
+	ret
+.Lcopy_page_end:
+	CFI_ENDPROC
+ENDPROC(copy_page)
+
+	/* Some CPUs run faster using the string copy instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+	.section .altinstr_replacement,"ax"
+1:	.byte 0xeb					/* jmp <disp8> */
+	.byte (copy_page_c - copy_page) - (2f - 1b)	/* offset */
+2:
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad copy_page
+	.quad 1b
+	.byte X86_FEATURE_REP_GOOD
+	.byte .Lcopy_page_end - copy_page
+	.byte 2b - 1b
+	.previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
new file mode 100644
index 000000000000..70bebd310408
--- /dev/null
+++ b/arch/x86/lib/copy_user_64.S
@@ -0,0 +1,354 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ * 
+ * Functions to copy from and to user space.		
+ */		 
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+	.macro ALTERNATIVE_JUMP feature,orig,alt
+0:
+	.byte 0xe9	/* 32bit jump */
+	.long \orig-1f	/* by default jump to orig */
+1:
+	.section .altinstr_replacement,"ax"
+2:	.byte 0xe9	             /* near jump with 32bit immediate */
+	.long \alt-1b /* offset */   /* or alternatively to alt */
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad  0b
+	.quad  2b
+	.byte  \feature		     /* when feature is set */
+	.byte  5
+	.byte  5
+	.previous
+	.endm
+
+/* Standard copy_to_user with segment limit checking */		
+ENTRY(copy_to_user)
+	CFI_STARTPROC
+	GET_THREAD_INFO(%rax)
+	movq %rdi,%rcx
+	addq %rdx,%rcx
+	jc  bad_to_user
+	cmpq threadinfo_addr_limit(%rax),%rcx
+	jae bad_to_user
+	xorl %eax,%eax	/* clear zero flag */
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	CFI_ENDPROC
+
+ENTRY(copy_user_generic)
+	CFI_STARTPROC
+	movl $1,%ecx	/* set zero flag */
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	CFI_ENDPROC
+
+ENTRY(__copy_from_user_inatomic)
+	CFI_STARTPROC
+	xorl %ecx,%ecx	/* clear zero flag */
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	CFI_ENDPROC
+
+/* Standard copy_from_user with segment limit checking */	
+ENTRY(copy_from_user)
+	CFI_STARTPROC
+	GET_THREAD_INFO(%rax)
+	movq %rsi,%rcx
+	addq %rdx,%rcx
+	jc  bad_from_user
+	cmpq threadinfo_addr_limit(%rax),%rcx
+	jae  bad_from_user
+	movl $1,%ecx	/* set zero flag */
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	CFI_ENDPROC
+ENDPROC(copy_from_user)
+	
+	.section .fixup,"ax"
+	/* must zero dest */
+bad_from_user:
+	CFI_STARTPROC
+	movl %edx,%ecx
+	xorl %eax,%eax
+	rep
+	stosb
+bad_to_user:
+	movl	%edx,%eax
+	ret
+	CFI_ENDPROC
+END(bad_from_user)
+	.previous
+	
+		
+/*
+ * copy_user_generic_unrolled - memory copy with exception handling.
+ * This version is for CPUs like P4 that don't have efficient micro code for rep movsq
+ * 	
+ * Input:	
+ * rdi destination
+ * rsi source
+ * rdx count
+ * ecx zero flag -- if true zero destination on error
+ *
+ * Output:		
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_generic_unrolled)
+	CFI_STARTPROC
+	pushq %rbx
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rbx, 0
+	pushq %rcx
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rcx, 0
+	xorl %eax,%eax		/*zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $7,%ecx
+	jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+	movq %rdx,%rcx
+
+	movl $64,%ebx
+	shrq $6,%rdx
+	decq %rdx
+	js   .Lhandle_tail
+
+	.p2align 4
+.Lloop:
+.Ls1:	movq (%rsi),%r11
+.Ls2:	movq 1*8(%rsi),%r8
+.Ls3:	movq 2*8(%rsi),%r9
+.Ls4:	movq 3*8(%rsi),%r10
+.Ld1:	movq %r11,(%rdi)
+.Ld2:	movq %r8,1*8(%rdi)
+.Ld3:	movq %r9,2*8(%rdi)
+.Ld4:	movq %r10,3*8(%rdi)
+
+.Ls5:	movq 4*8(%rsi),%r11
+.Ls6:	movq 5*8(%rsi),%r8
+.Ls7:	movq 6*8(%rsi),%r9
+.Ls8:	movq 7*8(%rsi),%r10
+.Ld5:	movq %r11,4*8(%rdi)
+.Ld6:	movq %r8,5*8(%rdi)
+.Ld7:	movq %r9,6*8(%rdi)
+.Ld8:	movq %r10,7*8(%rdi)
+
+	decq %rdx
+
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+
+	jns  .Lloop
+
+	.p2align 4
+.Lhandle_tail:
+	movl %ecx,%edx
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz   .Lhandle_7
+	movl $8,%ebx
+	.p2align 4
+.Lloop_8:
+.Ls9:	movq (%rsi),%r8
+.Ld9:	movq %r8,(%rdi)
+	decl %ecx
+	leaq 8(%rdi),%rdi
+	leaq 8(%rsi),%rsi
+	jnz .Lloop_8
+
+.Lhandle_7:
+	movl %edx,%ecx
+	andl $7,%ecx
+	jz   .Lende
+	.p2align 4
+.Lloop_1:
+.Ls10:	movb (%rsi),%bl
+.Ld10:	movb %bl,(%rdi)
+	incq %rdi
+	incq %rsi
+	decl %ecx
+	jnz .Lloop_1
+
+	CFI_REMEMBER_STATE
+.Lende:
+	popq %rcx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE rcx
+	popq %rbx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE rbx
+	ret
+	CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+	/* align destination */
+	.p2align 4
+.Lbad_alignment:
+	movl $8,%r9d
+	subl %ecx,%r9d
+	movl %r9d,%ecx
+	cmpq %r9,%rdx
+	jz   .Lhandle_7
+	js   .Lhandle_7
+.Lalign_1:
+.Ls11:	movb (%rsi),%bl
+.Ld11:	movb %bl,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz .Lalign_1
+	subq %r9,%rdx
+	jmp .Lafter_bad_alignment
+#endif
+
+	/* table sorted by exception address */
+	.section __ex_table,"a"
+	.align 8
+	.quad .Ls1,.Ls1e
+	.quad .Ls2,.Ls2e
+	.quad .Ls3,.Ls3e
+	.quad .Ls4,.Ls4e
+	.quad .Ld1,.Ls1e
+	.quad .Ld2,.Ls2e
+	.quad .Ld3,.Ls3e
+	.quad .Ld4,.Ls4e
+	.quad .Ls5,.Ls5e
+	.quad .Ls6,.Ls6e
+	.quad .Ls7,.Ls7e
+	.quad .Ls8,.Ls8e
+	.quad .Ld5,.Ls5e
+	.quad .Ld6,.Ls6e
+	.quad .Ld7,.Ls7e
+	.quad .Ld8,.Ls8e
+	.quad .Ls9,.Le_quad
+	.quad .Ld9,.Le_quad
+	.quad .Ls10,.Le_byte
+	.quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+	.quad .Ls11,.Lzero_rest
+	.quad .Ld11,.Lzero_rest
+#endif
+	.quad .Le5,.Le_zero
+	.previous
+
+	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
+	   pessimistic side. this is gross. it would be better to fix the
+	   interface. */
+	/* eax: zero, ebx: 64 */
+.Ls1e: 	addl $8,%eax
+.Ls2e: 	addl $8,%eax
+.Ls3e: 	addl $8,%eax
+.Ls4e: 	addl $8,%eax
+.Ls5e: 	addl $8,%eax
+.Ls6e: 	addl $8,%eax
+.Ls7e: 	addl $8,%eax
+.Ls8e: 	addl $8,%eax
+	addq %rbx,%rdi	/* +64 */
+	subq %rax,%rdi  /* correct destination with computed offset */
+
+	shlq $6,%rdx	/* loop counter * 64 (stride length) */
+	addq %rax,%rdx	/* add offset to loopcnt */
+	andl $63,%ecx	/* remaining bytes */
+	addq %rcx,%rdx	/* add them */
+	jmp .Lzero_rest
+
+	/* exception on quad word loop in tail handling */
+	/* ecx:	loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+	shll $3,%ecx
+	andl $7,%edx
+	addl %ecx,%edx
+	/* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+	cmpl $0,(%rsp)
+	jz   .Le_zero
+	movq %rdx,%rcx
+.Le_byte:
+	xorl %eax,%eax
+.Le5:	rep
+	stosb
+	/* when there is another exception while zeroing the rest just return */
+.Le_zero:
+	movq %rdx,%rax
+	jmp .Lende
+	CFI_ENDPROC
+ENDPROC(copy_user_generic)
+
+
+	/* Some CPUs run faster using the string copy instructions.
+	   This is also a lot simpler. Use them when possible.
+	   Patch in jmps to this code instead of copying it fully
+	   to avoid unwanted aliasing in the exception tables. */
+
+ /* rdi	destination
+  * rsi source
+  * rdx count
+  * ecx zero flag
+  *
+  * Output:
+  * eax uncopied bytes or 0 if successfull.
+  *
+  * Only 4GB of copy is supported. This shouldn't be a problem
+  * because the kernel normally only writes from/to page sized chunks
+  * even if user space passed a longer buffer.
+  * And more would be dangerous because both Intel and AMD have
+  * errata with rep movsq > 4GB. If someone feels the need to fix
+  * this please consider this.
+  */
+ENTRY(copy_user_generic_string)
+	CFI_STARTPROC
+	movl %ecx,%r8d		/* save zero flag */
+	movl %edx,%ecx
+	shrl $3,%ecx
+	andl $7,%edx	
+	jz   10f
+1:	rep 
+	movsq 
+	movl %edx,%ecx
+2:	rep
+	movsb
+9:	movl %ecx,%eax
+	ret
+
+	/* multiple of 8 byte */
+10:	rep
+	movsq
+	xor %eax,%eax
+	ret
+
+	/* exception handling */
+3:      lea (%rdx,%rcx,8),%rax	/* exception on quad loop */
+	jmp 6f
+5:	movl %ecx,%eax		/* exception on byte loop */
+	/* eax: left over bytes */
+6:	testl %r8d,%r8d		/* zero flag set? */
+	jz 7f
+	movl %eax,%ecx		/* initialize x86 loop counter */
+	push %rax
+	xorl %eax,%eax
+8:	rep
+	stosb 			/* zero the rest */
+11:	pop %rax
+7:	ret
+	CFI_ENDPROC
+END(copy_user_generic_c)
+
+	.section __ex_table,"a"
+	.quad 1b,3b
+	.quad 2b,5b
+	.quad 8b,11b
+	.quad 10b,3b
+	.previous
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
new file mode 100644
index 000000000000..4620efb12f13
--- /dev/null
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -0,0 +1,217 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ * rcx zero flag	when 1 zero on exception
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(__copy_user_nocache)
+	CFI_STARTPROC
+	pushq %rbx
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rbx, 0
+	pushq %rcx		/* save zero flag */
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rcx, 0
+
+	xorl %eax,%eax		/* zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $7,%ecx
+	jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+	movq %rdx,%rcx
+
+	movl $64,%ebx
+	shrq $6,%rdx
+	decq %rdx
+	js   .Lhandle_tail
+
+	.p2align 4
+.Lloop:
+.Ls1:	movq (%rsi),%r11
+.Ls2:	movq 1*8(%rsi),%r8
+.Ls3:	movq 2*8(%rsi),%r9
+.Ls4:	movq 3*8(%rsi),%r10
+.Ld1:	movnti %r11,(%rdi)
+.Ld2:	movnti %r8,1*8(%rdi)
+.Ld3:	movnti %r9,2*8(%rdi)
+.Ld4:	movnti %r10,3*8(%rdi)
+
+.Ls5:	movq 4*8(%rsi),%r11
+.Ls6:	movq 5*8(%rsi),%r8
+.Ls7:	movq 6*8(%rsi),%r9
+.Ls8:	movq 7*8(%rsi),%r10
+.Ld5:	movnti %r11,4*8(%rdi)
+.Ld6:	movnti %r8,5*8(%rdi)
+.Ld7:	movnti %r9,6*8(%rdi)
+.Ld8:	movnti %r10,7*8(%rdi)
+
+	dec  %rdx
+
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+
+	jns  .Lloop
+
+	.p2align 4
+.Lhandle_tail:
+	movl %ecx,%edx
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz   .Lhandle_7
+	movl $8,%ebx
+	.p2align 4
+.Lloop_8:
+.Ls9:	movq (%rsi),%r8
+.Ld9:	movnti %r8,(%rdi)
+	decl %ecx
+	leaq 8(%rdi),%rdi
+	leaq 8(%rsi),%rsi
+	jnz .Lloop_8
+
+.Lhandle_7:
+	movl %edx,%ecx
+	andl $7,%ecx
+	jz   .Lende
+	.p2align 4
+.Lloop_1:
+.Ls10:	movb (%rsi),%bl
+.Ld10:	movb %bl,(%rdi)
+	incq %rdi
+	incq %rsi
+	decl %ecx
+	jnz .Lloop_1
+
+	CFI_REMEMBER_STATE
+.Lende:
+	popq %rcx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE %rcx
+	popq %rbx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE rbx
+	ret
+	CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+	/* align destination */
+	.p2align 4
+.Lbad_alignment:
+	movl $8,%r9d
+	subl %ecx,%r9d
+	movl %r9d,%ecx
+	cmpq %r9,%rdx
+	jz   .Lhandle_7
+	js   .Lhandle_7
+.Lalign_1:
+.Ls11:	movb (%rsi),%bl
+.Ld11:	movb %bl,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz .Lalign_1
+	subq %r9,%rdx
+	jmp .Lafter_bad_alignment
+#endif
+
+	/* table sorted by exception address */
+	.section __ex_table,"a"
+	.align 8
+	.quad .Ls1,.Ls1e
+	.quad .Ls2,.Ls2e
+	.quad .Ls3,.Ls3e
+	.quad .Ls4,.Ls4e
+	.quad .Ld1,.Ls1e
+	.quad .Ld2,.Ls2e
+	.quad .Ld3,.Ls3e
+	.quad .Ld4,.Ls4e
+	.quad .Ls5,.Ls5e
+	.quad .Ls6,.Ls6e
+	.quad .Ls7,.Ls7e
+	.quad .Ls8,.Ls8e
+	.quad .Ld5,.Ls5e
+	.quad .Ld6,.Ls6e
+	.quad .Ld7,.Ls7e
+	.quad .Ld8,.Ls8e
+	.quad .Ls9,.Le_quad
+	.quad .Ld9,.Le_quad
+	.quad .Ls10,.Le_byte
+	.quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+	.quad .Ls11,.Lzero_rest
+	.quad .Ld11,.Lzero_rest
+#endif
+	.quad .Le5,.Le_zero
+	.previous
+
+	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
+	   pessimistic side. this is gross. it would be better to fix the
+	   interface. */
+	/* eax: zero, ebx: 64 */
+.Ls1e: 	addl $8,%eax
+.Ls2e: 	addl $8,%eax
+.Ls3e: 	addl $8,%eax
+.Ls4e: 	addl $8,%eax
+.Ls5e: 	addl $8,%eax
+.Ls6e: 	addl $8,%eax
+.Ls7e: 	addl $8,%eax
+.Ls8e: 	addl $8,%eax
+	addq %rbx,%rdi	/* +64 */
+	subq %rax,%rdi  /* correct destination with computed offset */
+
+	shlq $6,%rdx	/* loop counter * 64 (stride length) */
+	addq %rax,%rdx	/* add offset to loopcnt */
+	andl $63,%ecx	/* remaining bytes */
+	addq %rcx,%rdx	/* add them */
+	jmp .Lzero_rest
+
+	/* exception on quad word loop in tail handling */
+	/* ecx:	loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+	shll $3,%ecx
+	andl $7,%edx
+	addl %ecx,%edx
+	/* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+	cmpl $0,(%rsp)	/* zero flag set? */
+	jz   .Le_zero
+	movq %rdx,%rcx
+.Le_byte:
+	xorl %eax,%eax
+.Le5:	rep
+	stosb
+	/* when there is another exception while zeroing the rest just return */
+.Le_zero:
+	movq %rdx,%rax
+	jmp .Lende
+	CFI_ENDPROC
+ENDPROC(__copy_user_nocache)
+
+
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
new file mode 100644
index 000000000000..f0dba36578ea
--- /dev/null
+++ b/arch/x86/lib/csum-copy_64.S
@@ -0,0 +1,249 @@
+/*
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ *	
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive
+ * for more details. No warranty for anything given at all.
+ */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/errno.h>
+
+/*
+ * Checksum copy with exception handling.
+ * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the 
+ * destination is zeroed.
+ * 
+ * Input
+ * rdi  source
+ * rsi  destination
+ * edx  len (32bit)
+ * ecx  sum (32bit) 
+ * r8   src_err_ptr (int)
+ * r9   dst_err_ptr (int)
+ *
+ * Output
+ * eax  64bit sum. undefined in case of exception.
+ * 
+ * Wrappers need to take care of valid exception sum and zeroing.		 
+ * They also should align source or destination to 8 bytes.
+ */
+
+	.macro source
+10:
+	.section __ex_table,"a"
+	.align 8
+	.quad 10b,.Lbad_source
+	.previous
+	.endm
+		
+	.macro dest
+20:
+	.section __ex_table,"a"
+	.align 8
+	.quad 20b,.Lbad_dest
+	.previous
+	.endm
+			
+	.macro ignore L=.Lignore
+30:
+	.section __ex_table,"a"
+	.align 8
+	.quad 30b,\L
+	.previous
+	.endm
+	
+				
+ENTRY(csum_partial_copy_generic)
+	CFI_STARTPROC
+	cmpl	 $3*64,%edx
+	jle	 .Lignore
+
+.Lignore:		
+	subq  $7*8,%rsp
+	CFI_ADJUST_CFA_OFFSET 7*8
+	movq  %rbx,2*8(%rsp)
+	CFI_REL_OFFSET rbx, 2*8
+	movq  %r12,3*8(%rsp)
+	CFI_REL_OFFSET r12, 3*8
+	movq  %r14,4*8(%rsp)
+	CFI_REL_OFFSET r14, 4*8
+	movq  %r13,5*8(%rsp)
+	CFI_REL_OFFSET r13, 5*8
+	movq  %rbp,6*8(%rsp)
+	CFI_REL_OFFSET rbp, 6*8
+
+	movq  %r8,(%rsp)
+	movq  %r9,1*8(%rsp)
+	
+	movl  %ecx,%eax
+	movl  %edx,%ecx
+
+	xorl  %r9d,%r9d
+	movq  %rcx,%r12
+
+	shrq  $6,%r12
+	jz    .Lhandle_tail       /* < 64 */
+
+	clc
+	
+	/* main loop. clear in 64 byte blocks */
+	/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
+	/* r11:	temp3, rdx: temp4, r12 loopcnt */
+	/* r10:	temp5, rbp: temp6, r14 temp7, r13 temp8 */
+	.p2align 4
+.Lloop:
+	source
+	movq  (%rdi),%rbx
+	source
+	movq  8(%rdi),%r8
+	source
+	movq  16(%rdi),%r11
+	source
+	movq  24(%rdi),%rdx
+
+	source
+	movq  32(%rdi),%r10
+	source
+	movq  40(%rdi),%rbp
+	source
+	movq  48(%rdi),%r14
+	source
+	movq  56(%rdi),%r13
+		
+	ignore 2f
+	prefetcht0 5*64(%rdi)
+2:							
+	adcq  %rbx,%rax
+	adcq  %r8,%rax
+	adcq  %r11,%rax
+	adcq  %rdx,%rax
+	adcq  %r10,%rax
+	adcq  %rbp,%rax
+	adcq  %r14,%rax
+	adcq  %r13,%rax
+
+	decl %r12d
+	
+	dest
+	movq %rbx,(%rsi)
+	dest
+	movq %r8,8(%rsi)
+	dest
+	movq %r11,16(%rsi)
+	dest
+	movq %rdx,24(%rsi)
+
+	dest
+	movq %r10,32(%rsi)
+	dest
+	movq %rbp,40(%rsi)
+	dest
+	movq %r14,48(%rsi)
+	dest
+	movq %r13,56(%rsi)
+	
+3:
+	
+	leaq 64(%rdi),%rdi
+	leaq 64(%rsi),%rsi
+
+	jnz   .Lloop
+
+	adcq  %r9,%rax
+
+	/* do last upto 56 bytes */
+.Lhandle_tail:
+	/* ecx:	count */
+	movl %ecx,%r10d
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz 	 .Lfold
+	clc
+	.p2align 4
+.Lloop_8:	
+	source
+	movq (%rdi),%rbx
+	adcq %rbx,%rax
+	decl %ecx
+	dest
+	movq %rbx,(%rsi)
+	leaq 8(%rsi),%rsi /* preserve carry */
+	leaq 8(%rdi),%rdi
+	jnz	.Lloop_8
+	adcq %r9,%rax	/* add in carry */
+
+.Lfold:
+	/* reduce checksum to 32bits */
+	movl %eax,%ebx
+	shrq $32,%rax
+	addl %ebx,%eax
+	adcl %r9d,%eax
+
+	/* do last upto 6 bytes */	
+.Lhandle_7:
+	movl %r10d,%ecx
+	andl $7,%ecx
+	shrl $1,%ecx
+	jz   .Lhandle_1
+	movl $2,%edx
+	xorl %ebx,%ebx
+	clc  
+	.p2align 4
+.Lloop_1:	
+	source
+	movw (%rdi),%bx
+	adcl %ebx,%eax
+	decl %ecx
+	dest
+	movw %bx,(%rsi)
+	leaq 2(%rdi),%rdi
+	leaq 2(%rsi),%rsi
+	jnz .Lloop_1
+	adcl %r9d,%eax	/* add in carry */
+	
+	/* handle last odd byte */
+.Lhandle_1:
+	testl $1,%r10d
+	jz    .Lende
+	xorl  %ebx,%ebx
+	source
+	movb (%rdi),%bl
+	dest
+	movb %bl,(%rsi)
+	addl %ebx,%eax
+	adcl %r9d,%eax		/* carry */
+			
+	CFI_REMEMBER_STATE
+.Lende:
+	movq 2*8(%rsp),%rbx
+	CFI_RESTORE rbx
+	movq 3*8(%rsp),%r12
+	CFI_RESTORE r12
+	movq 4*8(%rsp),%r14
+	CFI_RESTORE r14
+	movq 5*8(%rsp),%r13
+	CFI_RESTORE r13
+	movq 6*8(%rsp),%rbp
+	CFI_RESTORE rbp
+	addq $7*8,%rsp
+	CFI_ADJUST_CFA_OFFSET -7*8
+	ret
+	CFI_RESTORE_STATE
+
+	/* Exception handlers. Very simple, zeroing is done in the wrappers */
+.Lbad_source:
+	movq (%rsp),%rax
+	testq %rax,%rax
+	jz   .Lende
+	movl $-EFAULT,(%rax)
+	jmp  .Lende
+	
+.Lbad_dest:
+	movq 8(%rsp),%rax
+	testq %rax,%rax
+	jz   .Lende	
+	movl $-EFAULT,(%rax)
+	jmp .Lende
+	CFI_ENDPROC
+ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
new file mode 100644
index 000000000000..bc503f506903
--- /dev/null
+++ b/arch/x86/lib/csum-partial_64.c
@@ -0,0 +1,150 @@
+/*
+ * arch/x86_64/lib/csum-partial.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed.
+ */
+ 
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <asm/checksum.h>
+
+static inline unsigned short from32to16(unsigned a) 
+{
+	unsigned short b = a >> 16; 
+	asm("addw %w2,%w0\n\t"
+	    "adcw $0,%w0\n" 
+	    : "=r" (b)
+	    : "0" (b), "r" (a));
+	return b;
+}
+
+/*
+ * Do a 64-bit checksum on an arbitrary memory area.
+ * Returns a 32bit checksum.
+ *
+ * This isn't as time critical as it used to be because many NICs
+ * do hardware checksumming these days.
+ * 
+ * Things tried and found to not make it faster:
+ * Manual Prefetching
+ * Unrolling to an 128 bytes inner loop.
+ * Using interleaving with more registers to break the carry chains.
+ */
+static unsigned do_csum(const unsigned char *buff, unsigned len)
+{
+	unsigned odd, count;
+	unsigned long result = 0;
+
+	if (unlikely(len == 0))
+		return result; 
+	odd = 1 & (unsigned long) buff;
+	if (unlikely(odd)) {
+		result = *buff << 8;
+		len--;
+		buff++;
+	}
+	count = len >> 1;		/* nr of 16-bit words.. */
+	if (count) {
+		if (2 & (unsigned long) buff) {
+			result += *(unsigned short *)buff;
+			count--;
+			len -= 2;
+			buff += 2;
+		}
+		count >>= 1;		/* nr of 32-bit words.. */
+		if (count) {
+			unsigned long zero;
+			unsigned count64;
+			if (4 & (unsigned long) buff) {
+				result += *(unsigned int *) buff;
+				count--;
+				len -= 4;
+				buff += 4;
+			}
+			count >>= 1;	/* nr of 64-bit words.. */
+
+			/* main loop using 64byte blocks */
+			zero = 0;
+			count64 = count >> 3;
+			while (count64) { 
+				asm("addq 0*8(%[src]),%[res]\n\t"
+				    "adcq 1*8(%[src]),%[res]\n\t"
+				    "adcq 2*8(%[src]),%[res]\n\t"
+				    "adcq 3*8(%[src]),%[res]\n\t"
+				    "adcq 4*8(%[src]),%[res]\n\t"
+				    "adcq 5*8(%[src]),%[res]\n\t"
+				    "adcq 6*8(%[src]),%[res]\n\t"
+				    "adcq 7*8(%[src]),%[res]\n\t"
+				    "adcq %[zero],%[res]"
+				    : [res] "=r" (result)
+				    : [src] "r" (buff), [zero] "r" (zero),
+				    "[res]" (result));
+				buff += 64;
+				count64--;
+			}
+
+			/* last upto 7 8byte blocks */
+			count %= 8; 
+			while (count) { 
+				asm("addq %1,%0\n\t"
+				    "adcq %2,%0\n" 
+					    : "=r" (result)
+				    : "m" (*(unsigned long *)buff), 
+				    "r" (zero),  "0" (result));
+				--count; 
+					buff += 8;
+			}
+			result = add32_with_carry(result>>32,
+						  result&0xffffffff); 
+
+			if (len & 4) {
+				result += *(unsigned int *) buff;
+				buff += 4;
+			}
+		}
+		if (len & 2) {
+			result += *(unsigned short *) buff;
+			buff += 2;
+		}
+	}
+	if (len & 1)
+		result += *buff;
+	result = add32_with_carry(result>>32, result & 0xffffffff); 
+	if (unlikely(odd)) { 
+		result = from32to16(result);
+		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+	}
+	return result;
+}
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 64-bit boundary
+ */
+__wsum csum_partial(const void *buff, int len, __wsum sum)
+{
+	return (__force __wsum)add32_with_carry(do_csum(buff, len),
+						(__force u32)sum);
+}
+
+EXPORT_SYMBOL(csum_partial);
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+__sum16 ip_compute_csum(const void *buff, int len)
+{
+	return csum_fold(csum_partial(buff,len,0));
+}
+EXPORT_SYMBOL(ip_compute_csum);
+
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
new file mode 100644
index 000000000000..fd42a4a095fc
--- /dev/null
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -0,0 +1,135 @@
+/* Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v.2
+ * 
+ * Wrappers of assembly checksum functions for x86-64.
+ */
+
+#include <asm/checksum.h>
+#include <linux/module.h>
+
+/** 
+ * csum_partial_copy_from_user - Copy and checksum from user space. 
+ * @src: source address (user space) 
+ * @dst: destination address
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: set to -EFAULT for an bad source address.
+ * 
+ * Returns an 32bit unfolded checksum of the buffer.
+ * src and dst are best aligned to 64bits. 
+ */ 
+__wsum
+csum_partial_copy_from_user(const void __user *src, void *dst,
+			    int len, __wsum isum, int *errp)
+{ 
+	might_sleep();
+	*errp = 0;
+	if (likely(access_ok(VERIFY_READ,src, len))) { 
+		/* Why 6, not 7? To handle odd addresses aligned we
+		   would need to do considerable complications to fix the
+		   checksum which is defined as an 16bit accumulator. The
+		   fix alignment code is primarily for performance
+		   compatibility with 32bit and that will handle odd
+		   addresses slowly too. */
+		if (unlikely((unsigned long)src & 6)) {			
+			while (((unsigned long)src & 6) && len >= 2) { 
+				__u16 val16;			
+				*errp = __get_user(val16, (const __u16 __user *)src);
+				if (*errp)
+					return isum;
+				*(__u16 *)dst = val16;
+				isum = (__force __wsum)add32_with_carry(
+						(__force unsigned)isum, val16);
+				src += 2; 
+				dst += 2; 
+				len -= 2;
+			}
+		}
+		isum = csum_partial_copy_generic((__force const void *)src,
+					dst, len, isum, errp, NULL);
+		if (likely(*errp == 0)) 
+			return isum;
+	} 
+	*errp = -EFAULT;
+	memset(dst,0,len); 
+	return isum;		
+} 
+
+EXPORT_SYMBOL(csum_partial_copy_from_user);
+
+/** 
+ * csum_partial_copy_to_user - Copy and checksum to user space. 
+ * @src: source address
+ * @dst: destination address (user space)
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: set to -EFAULT for an bad destination address.
+ * 
+ * Returns an 32bit unfolded checksum of the buffer.
+ * src and dst are best aligned to 64bits.
+ */ 
+__wsum
+csum_partial_copy_to_user(const void *src, void __user *dst,
+			  int len, __wsum isum, int *errp)
+{ 
+	might_sleep();
+	if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
+		*errp = -EFAULT;
+		return 0; 
+	}
+
+	if (unlikely((unsigned long)dst & 6)) {
+		while (((unsigned long)dst & 6) && len >= 2) { 
+			__u16 val16 = *(__u16 *)src;
+			isum = (__force __wsum)add32_with_carry(
+					(__force unsigned)isum, val16);
+			*errp = __put_user(val16, (__u16 __user *)dst);
+			if (*errp)
+				return isum;
+			src += 2; 
+			dst += 2; 
+			len -= 2;
+		}
+	}
+
+	*errp = 0;
+	return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp); 
+} 
+
+EXPORT_SYMBOL(csum_partial_copy_to_user);
+
+/** 
+ * csum_partial_copy_nocheck - Copy and checksum.
+ * @src: source address
+ * @dst: destination address
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * 
+ * Returns an 32bit unfolded checksum of the buffer.
+ */ 
+__wsum
+csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
+{ 
+	return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL);
+} 
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
+
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, unsigned short proto, __wsum sum)
+{
+	__u64 rest, sum64;
+     
+	rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
+		(__force __u64)sum;
+	asm("  addq (%[saddr]),%[sum]\n"
+	    "  adcq 8(%[saddr]),%[sum]\n"
+	    "  adcq (%[daddr]),%[sum]\n" 
+	    "  adcq 8(%[daddr]),%[sum]\n"
+	    "  adcq $0,%[sum]\n"
+	    : [sum] "=r" (sum64) 
+	    : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr));
+	return csum_fold((__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32));
+}
+
+EXPORT_SYMBOL(csum_ipv6_magic);
diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c
new file mode 100644
index 000000000000..f6edb11364df
--- /dev/null
+++ b/arch/x86/lib/delay_32.c
@@ -0,0 +1,103 @@
+/*
+ *	Precise Delay Loops for i386
+ *
+ *	Copyright (C) 1993 Linus Torvalds
+ *	Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ *	The __delay function must _NOT_ be inlined as its execution time
+ *	depends wildly on alignment on many x86 processors. The additional
+ *	jump magic is needed to get the timing stable on all the CPU's
+ *	we have to worry about.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+
+#include <asm/processor.h>
+#include <asm/delay.h>
+#include <asm/timer.h>
+
+#ifdef CONFIG_SMP
+# include <asm/smp.h>
+#endif
+
+/* simple loop based delay: */
+static void delay_loop(unsigned long loops)
+{
+	int d0;
+
+	__asm__ __volatile__(
+		"\tjmp 1f\n"
+		".align 16\n"
+		"1:\tjmp 2f\n"
+		".align 16\n"
+		"2:\tdecl %0\n\tjns 2b"
+		:"=&a" (d0)
+		:"0" (loops));
+}
+
+/* TSC based delay: */
+static void delay_tsc(unsigned long loops)
+{
+	unsigned long bclock, now;
+
+	rdtscl(bclock);
+	do {
+		rep_nop();
+		rdtscl(now);
+	} while ((now-bclock) < loops);
+}
+
+/*
+ * Since we calibrate only once at boot, this
+ * function should be set once at boot and not changed
+ */
+static void (*delay_fn)(unsigned long) = delay_loop;
+
+void use_tsc_delay(void)
+{
+	delay_fn = delay_tsc;
+}
+
+int read_current_timer(unsigned long *timer_val)
+{
+	if (delay_fn == delay_tsc) {
+		rdtscl(*timer_val);
+		return 0;
+	}
+	return -1;
+}
+
+void __delay(unsigned long loops)
+{
+	delay_fn(loops);
+}
+
+inline void __const_udelay(unsigned long xloops)
+{
+	int d0;
+
+	xloops *= 4;
+	__asm__("mull %0"
+		:"=d" (xloops), "=&a" (d0)
+		:"1" (xloops), "0"
+		(cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4)));
+
+	__delay(++xloops);
+}
+
+void __udelay(unsigned long usecs)
+{
+	__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
+}
+
+void __ndelay(unsigned long nsecs)
+{
+	__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
+}
+
+EXPORT_SYMBOL(__delay);
+EXPORT_SYMBOL(__const_udelay);
+EXPORT_SYMBOL(__udelay);
+EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c
new file mode 100644
index 000000000000..2dbebd308347
--- /dev/null
+++ b/arch/x86/lib/delay_64.c
@@ -0,0 +1,57 @@
+/*
+ *	Precise Delay Loops for x86-64
+ *
+ *	Copyright (C) 1993 Linus Torvalds
+ *	Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ *	The __delay function must _NOT_ be inlined as its execution time
+ *	depends wildly on alignment on many x86 processors. 
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <asm/delay.h>
+#include <asm/msr.h>
+
+#ifdef CONFIG_SMP
+#include <asm/smp.h>
+#endif
+
+int read_current_timer(unsigned long *timer_value)
+{
+	rdtscll(*timer_value);
+	return 0;
+}
+
+void __delay(unsigned long loops)
+{
+	unsigned bclock, now;
+	
+	rdtscl(bclock);
+	do
+	{
+		rep_nop(); 
+		rdtscl(now);
+	}
+	while((now-bclock) < loops);
+}
+EXPORT_SYMBOL(__delay);
+
+inline void __const_udelay(unsigned long xloops)
+{
+	__delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1);
+}
+EXPORT_SYMBOL(__const_udelay);
+
+void __udelay(unsigned long usecs)
+{
+	__const_udelay(usecs * 0x000010c7);  /* 2**32 / 1000000 (rounded up) */
+}
+EXPORT_SYMBOL(__udelay);
+
+void __ndelay(unsigned long nsecs)
+{
+	__const_udelay(nsecs * 0x00005);  /* 2**32 / 1000000000 (rounded up) */
+}
+EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/getuser_32.S b/arch/x86/lib/getuser_32.S
new file mode 100644
index 000000000000..6d84b53f12a2
--- /dev/null
+++ b/arch/x86/lib/getuser_32.S
@@ -0,0 +1,78 @@
+/*
+ * __get_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/thread_info.h>
+
+
+/*
+ * __get_user_X
+ *
+ * Inputs:	%eax contains the address
+ *
+ * Outputs:	%eax is error code (0 or -EFAULT)
+ *		%edx contains zero-extended value
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+.text
+ENTRY(__get_user_1)
+	CFI_STARTPROC
+	GET_THREAD_INFO(%edx)
+	cmpl TI_addr_limit(%edx),%eax
+	jae bad_get_user
+1:	movzbl (%eax),%edx
+	xorl %eax,%eax
+	ret
+	CFI_ENDPROC
+ENDPROC(__get_user_1)
+
+ENTRY(__get_user_2)
+	CFI_STARTPROC
+	addl $1,%eax
+	jc bad_get_user
+	GET_THREAD_INFO(%edx)
+	cmpl TI_addr_limit(%edx),%eax
+	jae bad_get_user
+2:	movzwl -1(%eax),%edx
+	xorl %eax,%eax
+	ret
+	CFI_ENDPROC
+ENDPROC(__get_user_2)
+
+ENTRY(__get_user_4)
+	CFI_STARTPROC
+	addl $3,%eax
+	jc bad_get_user
+	GET_THREAD_INFO(%edx)
+	cmpl TI_addr_limit(%edx),%eax
+	jae bad_get_user
+3:	movl -3(%eax),%edx
+	xorl %eax,%eax
+	ret
+	CFI_ENDPROC
+ENDPROC(__get_user_4)
+
+bad_get_user:
+	CFI_STARTPROC
+	xorl %edx,%edx
+	movl $-14,%eax
+	ret
+	CFI_ENDPROC
+END(bad_get_user)
+
+.section __ex_table,"a"
+	.long 1b,bad_get_user
+	.long 2b,bad_get_user
+	.long 3b,bad_get_user
+.previous
diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser_64.S
new file mode 100644
index 000000000000..5448876261f8
--- /dev/null
+++ b/arch/x86/lib/getuser_64.S
@@ -0,0 +1,109 @@
+/*
+ * __get_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+
+/*
+ * __get_user_X
+ *
+ * Inputs:	%rcx contains the address.
+ *		The register is modified, but all changes are undone
+ *		before returning because the C code doesn't know about it.
+ *
+ * Outputs:	%rax is error code (0 or -EFAULT)
+ *		%rdx contains zero-extended value
+ * 
+ * %r8 is destroyed.
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/page.h>
+#include <asm/errno.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+
+	.text
+ENTRY(__get_user_1)
+	CFI_STARTPROC
+	GET_THREAD_INFO(%r8)
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae bad_get_user
+1:	movzb (%rcx),%edx
+	xorl %eax,%eax
+	ret
+	CFI_ENDPROC
+ENDPROC(__get_user_1)
+
+ENTRY(__get_user_2)
+	CFI_STARTPROC
+	GET_THREAD_INFO(%r8)
+	addq $1,%rcx
+	jc 20f
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae 20f
+	decq   %rcx
+2:	movzwl (%rcx),%edx
+	xorl %eax,%eax
+	ret
+20:	decq    %rcx
+	jmp	bad_get_user
+	CFI_ENDPROC
+ENDPROC(__get_user_2)
+
+ENTRY(__get_user_4)
+	CFI_STARTPROC
+	GET_THREAD_INFO(%r8)
+	addq $3,%rcx
+	jc 30f
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae 30f
+	subq $3,%rcx
+3:	movl (%rcx),%edx
+	xorl %eax,%eax
+	ret
+30:	subq $3,%rcx
+	jmp bad_get_user
+	CFI_ENDPROC
+ENDPROC(__get_user_4)
+
+ENTRY(__get_user_8)
+	CFI_STARTPROC
+	GET_THREAD_INFO(%r8)
+	addq $7,%rcx
+	jc 40f
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae	40f
+	subq	$7,%rcx
+4:	movq (%rcx),%rdx
+	xorl %eax,%eax
+	ret
+40:	subq $7,%rcx
+	jmp bad_get_user
+	CFI_ENDPROC
+ENDPROC(__get_user_8)
+
+bad_get_user:
+	CFI_STARTPROC
+	xorl %edx,%edx
+	movq $(-EFAULT),%rax
+	ret
+	CFI_ENDPROC
+END(bad_get_user)
+
+.section __ex_table,"a"
+	.quad 1b,bad_get_user
+	.quad 2b,bad_get_user
+	.quad 3b,bad_get_user
+	.quad 4b,bad_get_user
+.previous
diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c
new file mode 100644
index 000000000000..87b4a4e18039
--- /dev/null
+++ b/arch/x86/lib/io_64.c
@@ -0,0 +1,23 @@
+#include <linux/string.h>
+#include <asm/io.h>
+#include <linux/module.h>
+
+void __memcpy_toio(unsigned long dst,const void*src,unsigned len)
+{
+	__inline_memcpy((void *) dst,src,len);
+}
+EXPORT_SYMBOL(__memcpy_toio);
+
+void __memcpy_fromio(void *dst,unsigned long src,unsigned len)
+{
+	__inline_memcpy(dst,(const void *) src,len);
+}
+EXPORT_SYMBOL(__memcpy_fromio);
+
+void memset_io(volatile void __iomem *a, int b, size_t c)
+{
+	/* XXX: memset can mangle the IO patterns quite a bit.
+	   perhaps it would be better to use a dumb one */
+	memset((void *)a,b,c);
+}
+EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
new file mode 100644
index 000000000000..05a95e713da8
--- /dev/null
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2006 PathScale, Inc.  All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * override generic version in lib/iomap_copy.c
+ */
+ENTRY(__iowrite32_copy)
+	CFI_STARTPROC
+	movl %edx,%ecx
+	rep movsd
+	ret
+	CFI_ENDPROC
+ENDPROC(__iowrite32_copy)
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
new file mode 100644
index 000000000000..8ac51b82a632
--- /dev/null
+++ b/arch/x86/lib/memcpy_32.c
@@ -0,0 +1,43 @@
+#include <linux/string.h>
+#include <linux/module.h>
+
+#undef memcpy
+#undef memset
+
+void *memcpy(void *to, const void *from, size_t n)
+{
+#ifdef CONFIG_X86_USE_3DNOW
+	return __memcpy3d(to, from, n);
+#else
+	return __memcpy(to, from, n);
+#endif
+}
+EXPORT_SYMBOL(memcpy);
+
+void *memset(void *s, int c, size_t count)
+{
+	return __memset(s, c, count);
+}
+EXPORT_SYMBOL(memset);
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+	int d0, d1, d2;
+
+	if (dest < src) {
+		memcpy(dest,src,n);
+	} else {
+		__asm__ __volatile__(
+			"std\n\t"
+			"rep\n\t"
+			"movsb\n\t"
+			"cld"
+			: "=&c" (d0), "=&S" (d1), "=&D" (d2)
+			:"0" (n),
+			 "1" (n-1+(const char *)src),
+			 "2" (n-1+(char *)dest)
+			:"memory");
+	}
+	return dest;
+}
+EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
new file mode 100644
index 000000000000..c22981fa2f3a
--- /dev/null
+++ b/arch/x86/lib/memcpy_64.S
@@ -0,0 +1,131 @@
+/* Copyright 2002 Andi Kleen */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+
+/*
+ * memcpy - Copy a memory block.
+ *
+ * Input:	
+ * rdi destination
+ * rsi source
+ * rdx count
+ * 
+ * Output:
+ * rax original destination
+ */	
+
+	ALIGN
+memcpy_c:
+	CFI_STARTPROC
+	movq %rdi,%rax
+	movl %edx,%ecx
+	shrl $3,%ecx
+	andl $7,%edx
+	rep movsq
+	movl %edx,%ecx
+	rep movsb
+	ret
+	CFI_ENDPROC
+ENDPROC(memcpy_c)
+
+ENTRY(__memcpy)
+ENTRY(memcpy)
+	CFI_STARTPROC
+	pushq %rbx
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rbx, 0
+	movq %rdi,%rax
+
+	movl %edx,%ecx
+	shrl $6,%ecx
+	jz .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decl %ecx
+
+	movq (%rsi),%r11
+	movq 8(%rsi),%r8
+
+	movq %r11,(%rdi)
+	movq %r8,1*8(%rdi)
+
+	movq 2*8(%rsi),%r9
+	movq 3*8(%rsi),%r10
+
+	movq %r9,2*8(%rdi)
+	movq %r10,3*8(%rdi)
+
+	movq 4*8(%rsi),%r11
+	movq 5*8(%rsi),%r8
+
+	movq %r11,4*8(%rdi)
+	movq %r8,5*8(%rdi)
+
+	movq 6*8(%rsi),%r9
+	movq 7*8(%rsi),%r10
+
+	movq %r9,6*8(%rdi)
+	movq %r10,7*8(%rdi)
+
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	jnz  .Lloop_64
+
+.Lhandle_tail:
+	movl %edx,%ecx
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz   .Lhandle_7
+	.p2align 4
+.Lloop_8:
+	decl %ecx
+	movq (%rsi),%r8
+	movq %r8,(%rdi)
+	leaq 8(%rdi),%rdi
+	leaq 8(%rsi),%rsi
+	jnz  .Lloop_8
+
+.Lhandle_7:
+	movl %edx,%ecx
+	andl $7,%ecx
+	jz .Lende
+	.p2align 4
+.Lloop_1:
+	movb (%rsi),%r8b
+	movb %r8b,(%rdi)
+	incq %rdi
+	incq %rsi
+	decl %ecx
+	jnz .Lloop_1
+
+.Lende:
+	popq %rbx
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE rbx
+	ret
+.Lfinal:
+	CFI_ENDPROC
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
+
+	/* Some CPUs run faster using the string copy instructions.
+	   It is also a lot simpler. Use this when possible */
+
+	.section .altinstr_replacement,"ax"
+1:	.byte 0xeb				/* jmp <disp8> */
+	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
+2:
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad memcpy
+	.quad 1b
+	.byte X86_FEATURE_REP_GOOD
+	/* Replace only beginning, memcpy is used to apply alternatives, so it
+	 * is silly to overwrite itself with nops - reboot is only outcome... */
+	.byte 2b - 1b
+	.byte 2b - 1b
+	.previous
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
new file mode 100644
index 000000000000..751ebae8ec42
--- /dev/null
+++ b/arch/x86/lib/memmove_64.c
@@ -0,0 +1,21 @@
+/* Normally compiler builtins are used, but sometimes the compiler calls out
+   of line code. Based on asm-i386/string.h.
+ */
+#define _STRING_C
+#include <linux/string.h>
+#include <linux/module.h>
+
+#undef memmove
+void *memmove(void * dest,const void *src,size_t count)
+{
+	if (dest < src) { 
+		return memcpy(dest,src,count);
+	} else {
+		char *p = (char *) dest + count;
+		char *s = (char *) src + count;
+		while (count--)
+			*--p = *--s;
+	}
+	return dest;
+} 
+EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
new file mode 100644
index 000000000000..2c5948116bd2
--- /dev/null
+++ b/arch/x86/lib/memset_64.S
@@ -0,0 +1,133 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * ISO C memset - set a memory block to a byte value.
+ *	
+ * rdi   destination
+ * rsi   value (char) 
+ * rdx   count (bytes) 
+ * 
+ * rax   original destination
+ */	
+	ALIGN
+memset_c:
+	CFI_STARTPROC
+	movq %rdi,%r9
+	movl %edx,%r8d
+	andl $7,%r8d
+	movl %edx,%ecx
+	shrl $3,%ecx
+	/* expand byte value  */
+	movzbl %sil,%esi
+	movabs $0x0101010101010101,%rax
+	mulq %rsi		/* with rax, clobbers rdx */
+	rep stosq
+	movl %r8d,%ecx
+	rep stosb
+	movq %r9,%rax
+	ret
+	CFI_ENDPROC
+ENDPROC(memset_c)
+
+ENTRY(memset)
+ENTRY(__memset)
+	CFI_STARTPROC
+	movq %rdi,%r10
+	movq %rdx,%r11
+
+	/* expand byte value  */
+	movzbl %sil,%ecx
+	movabs $0x0101010101010101,%rax
+	mul    %rcx		/* with rax, clobbers rdx */
+
+	/* align dst */
+	movl  %edi,%r9d
+	andl  $7,%r9d
+	jnz  .Lbad_alignment
+	CFI_REMEMBER_STATE
+.Lafter_bad_alignment:
+
+	movl %r11d,%ecx
+	shrl $6,%ecx
+	jz	 .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decl   %ecx
+	movq  %rax,(%rdi)
+	movq  %rax,8(%rdi)
+	movq  %rax,16(%rdi)
+	movq  %rax,24(%rdi)
+	movq  %rax,32(%rdi)
+	movq  %rax,40(%rdi)
+	movq  %rax,48(%rdi)
+	movq  %rax,56(%rdi)
+	leaq  64(%rdi),%rdi
+	jnz    .Lloop_64
+
+	/* Handle tail in loops. The loops should be faster than hard
+	   to predict jump tables. */
+	.p2align 4
+.Lhandle_tail:
+	movl	%r11d,%ecx
+	andl    $63&(~7),%ecx
+	jz 		.Lhandle_7
+	shrl	$3,%ecx
+	.p2align 4
+.Lloop_8:
+	decl   %ecx
+	movq  %rax,(%rdi)
+	leaq  8(%rdi),%rdi
+	jnz    .Lloop_8
+
+.Lhandle_7:
+	movl	%r11d,%ecx
+	andl	$7,%ecx
+	jz      .Lende
+	.p2align 4
+.Lloop_1:
+	decl    %ecx
+	movb 	%al,(%rdi)
+	leaq	1(%rdi),%rdi
+	jnz     .Lloop_1
+
+.Lende:
+	movq	%r10,%rax
+	ret
+
+	CFI_RESTORE_STATE
+.Lbad_alignment:
+	cmpq $7,%r11
+	jbe	.Lhandle_7
+	movq %rax,(%rdi)	/* unaligned store */
+	movq $8,%r8
+	subq %r9,%r8
+	addq %r8,%rdi
+	subq %r8,%r11
+	jmp .Lafter_bad_alignment
+.Lfinal:
+	CFI_ENDPROC
+ENDPROC(memset)
+ENDPROC(__memset)
+
+	/* Some CPUs run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+	.section .altinstr_replacement,"ax"
+1:	.byte 0xeb				/* jmp <disp8> */
+	.byte (memset_c - memset) - (2f - 1b)	/* offset */
+2:
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad memset
+	.quad 1b
+	.byte X86_FEATURE_REP_GOOD
+	.byte .Lfinal - memset
+	.byte 2b - 1b
+	.previous
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
new file mode 100644
index 000000000000..28084d2e8dd4
--- /dev/null
+++ b/arch/x86/lib/mmx_32.c
@@ -0,0 +1,403 @@
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/module.h>
+
+#include <asm/i387.h>
+
+
+/*
+ *	MMX 3DNow! library helper functions
+ *
+ *	To do:
+ *	We can use MMX just for prefetch in IRQ's. This may be a win. 
+ *		(reported so on K6-III)
+ *	We should use a better code neutral filler for the short jump
+ *		leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
+ *	We also want to clobber the filler register so we don't get any
+ *		register forwarding stalls on the filler. 
+ *
+ *	Add *user handling. Checksums are not a win with MMX on any CPU
+ *	tested so far for any MMX solution figured.
+ *
+ *	22/09/2000 - Arjan van de Ven 
+ *		Improved for non-egineering-sample Athlons 
+ *
+ */
+ 
+void *_mmx_memcpy(void *to, const void *from, size_t len)
+{
+	void *p;
+	int i;
+
+	if (unlikely(in_interrupt()))
+		return __memcpy(to, from, len);
+
+	p = to;
+	i = len >> 6; /* len/64 */
+
+	kernel_fpu_begin();
+
+	__asm__ __volatile__ (
+		"1: prefetch (%0)\n"		/* This set is 28 bytes */
+		"   prefetch 64(%0)\n"
+		"   prefetch 128(%0)\n"
+		"   prefetch 192(%0)\n"
+		"   prefetch 256(%0)\n"
+		"2:  \n"
+		".section .fixup, \"ax\"\n"
+		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
+		"   jmp 2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 4\n"
+		"	.long 1b, 3b\n"
+		".previous"
+		: : "r" (from) );
+		
+	
+	for(; i>5; i--)
+	{
+		__asm__ __volatile__ (
+		"1:  prefetch 320(%0)\n"
+		"2:  movq (%0), %%mm0\n"
+		"  movq 8(%0), %%mm1\n"
+		"  movq 16(%0), %%mm2\n"
+		"  movq 24(%0), %%mm3\n"
+		"  movq %%mm0, (%1)\n"
+		"  movq %%mm1, 8(%1)\n"
+		"  movq %%mm2, 16(%1)\n"
+		"  movq %%mm3, 24(%1)\n"
+		"  movq 32(%0), %%mm0\n"
+		"  movq 40(%0), %%mm1\n"
+		"  movq 48(%0), %%mm2\n"
+		"  movq 56(%0), %%mm3\n"
+		"  movq %%mm0, 32(%1)\n"
+		"  movq %%mm1, 40(%1)\n"
+		"  movq %%mm2, 48(%1)\n"
+		"  movq %%mm3, 56(%1)\n"
+		".section .fixup, \"ax\"\n"
+		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
+		"   jmp 2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 4\n"
+		"	.long 1b, 3b\n"
+		".previous"
+		: : "r" (from), "r" (to) : "memory");
+		from+=64;
+		to+=64;
+	}
+
+	for(; i>0; i--)
+	{
+		__asm__ __volatile__ (
+		"  movq (%0), %%mm0\n"
+		"  movq 8(%0), %%mm1\n"
+		"  movq 16(%0), %%mm2\n"
+		"  movq 24(%0), %%mm3\n"
+		"  movq %%mm0, (%1)\n"
+		"  movq %%mm1, 8(%1)\n"
+		"  movq %%mm2, 16(%1)\n"
+		"  movq %%mm3, 24(%1)\n"
+		"  movq 32(%0), %%mm0\n"
+		"  movq 40(%0), %%mm1\n"
+		"  movq 48(%0), %%mm2\n"
+		"  movq 56(%0), %%mm3\n"
+		"  movq %%mm0, 32(%1)\n"
+		"  movq %%mm1, 40(%1)\n"
+		"  movq %%mm2, 48(%1)\n"
+		"  movq %%mm3, 56(%1)\n"
+		: : "r" (from), "r" (to) : "memory");
+		from+=64;
+		to+=64;
+	}
+	/*
+	 *	Now do the tail of the block
+	 */
+	__memcpy(to, from, len&63);
+	kernel_fpu_end();
+	return p;
+}
+
+#ifdef CONFIG_MK7
+
+/*
+ *	The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
+ *	other MMX using processors do not.
+ */
+
+static void fast_clear_page(void *page)
+{
+	int i;
+
+	kernel_fpu_begin();
+	
+	__asm__ __volatile__ (
+		"  pxor %%mm0, %%mm0\n" : :
+	);
+
+	for(i=0;i<4096/64;i++)
+	{
+		__asm__ __volatile__ (
+		"  movntq %%mm0, (%0)\n"
+		"  movntq %%mm0, 8(%0)\n"
+		"  movntq %%mm0, 16(%0)\n"
+		"  movntq %%mm0, 24(%0)\n"
+		"  movntq %%mm0, 32(%0)\n"
+		"  movntq %%mm0, 40(%0)\n"
+		"  movntq %%mm0, 48(%0)\n"
+		"  movntq %%mm0, 56(%0)\n"
+		: : "r" (page) : "memory");
+		page+=64;
+	}
+	/* since movntq is weakly-ordered, a "sfence" is needed to become
+	 * ordered again.
+	 */
+	__asm__ __volatile__ (
+		"  sfence \n" : :
+	);
+	kernel_fpu_end();
+}
+
+static void fast_copy_page(void *to, void *from)
+{
+	int i;
+
+	kernel_fpu_begin();
+
+	/* maybe the prefetch stuff can go before the expensive fnsave...
+	 * but that is for later. -AV
+	 */
+	__asm__ __volatile__ (
+		"1: prefetch (%0)\n"
+		"   prefetch 64(%0)\n"
+		"   prefetch 128(%0)\n"
+		"   prefetch 192(%0)\n"
+		"   prefetch 256(%0)\n"
+		"2:  \n"
+		".section .fixup, \"ax\"\n"
+		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
+		"   jmp 2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 4\n"
+		"	.long 1b, 3b\n"
+		".previous"
+		: : "r" (from) );
+
+	for(i=0; i<(4096-320)/64; i++)
+	{
+		__asm__ __volatile__ (
+		"1: prefetch 320(%0)\n"
+		"2: movq (%0), %%mm0\n"
+		"   movntq %%mm0, (%1)\n"
+		"   movq 8(%0), %%mm1\n"
+		"   movntq %%mm1, 8(%1)\n"
+		"   movq 16(%0), %%mm2\n"
+		"   movntq %%mm2, 16(%1)\n"
+		"   movq 24(%0), %%mm3\n"
+		"   movntq %%mm3, 24(%1)\n"
+		"   movq 32(%0), %%mm4\n"
+		"   movntq %%mm4, 32(%1)\n"
+		"   movq 40(%0), %%mm5\n"
+		"   movntq %%mm5, 40(%1)\n"
+		"   movq 48(%0), %%mm6\n"
+		"   movntq %%mm6, 48(%1)\n"
+		"   movq 56(%0), %%mm7\n"
+		"   movntq %%mm7, 56(%1)\n"
+		".section .fixup, \"ax\"\n"
+		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
+		"   jmp 2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 4\n"
+		"	.long 1b, 3b\n"
+		".previous"
+		: : "r" (from), "r" (to) : "memory");
+		from+=64;
+		to+=64;
+	}
+	for(i=(4096-320)/64; i<4096/64; i++)
+	{
+		__asm__ __volatile__ (
+		"2: movq (%0), %%mm0\n"
+		"   movntq %%mm0, (%1)\n"
+		"   movq 8(%0), %%mm1\n"
+		"   movntq %%mm1, 8(%1)\n"
+		"   movq 16(%0), %%mm2\n"
+		"   movntq %%mm2, 16(%1)\n"
+		"   movq 24(%0), %%mm3\n"
+		"   movntq %%mm3, 24(%1)\n"
+		"   movq 32(%0), %%mm4\n"
+		"   movntq %%mm4, 32(%1)\n"
+		"   movq 40(%0), %%mm5\n"
+		"   movntq %%mm5, 40(%1)\n"
+		"   movq 48(%0), %%mm6\n"
+		"   movntq %%mm6, 48(%1)\n"
+		"   movq 56(%0), %%mm7\n"
+		"   movntq %%mm7, 56(%1)\n"
+		: : "r" (from), "r" (to) : "memory");
+		from+=64;
+		to+=64;
+	}
+	/* since movntq is weakly-ordered, a "sfence" is needed to become
+	 * ordered again.
+	 */
+	__asm__ __volatile__ (
+		"  sfence \n" : :
+	);
+	kernel_fpu_end();
+}
+
+#else
+
+/*
+ *	Generic MMX implementation without K7 specific streaming
+ */
+ 
+static void fast_clear_page(void *page)
+{
+	int i;
+	
+	kernel_fpu_begin();
+	
+	__asm__ __volatile__ (
+		"  pxor %%mm0, %%mm0\n" : :
+	);
+
+	for(i=0;i<4096/128;i++)
+	{
+		__asm__ __volatile__ (
+		"  movq %%mm0, (%0)\n"
+		"  movq %%mm0, 8(%0)\n"
+		"  movq %%mm0, 16(%0)\n"
+		"  movq %%mm0, 24(%0)\n"
+		"  movq %%mm0, 32(%0)\n"
+		"  movq %%mm0, 40(%0)\n"
+		"  movq %%mm0, 48(%0)\n"
+		"  movq %%mm0, 56(%0)\n"
+		"  movq %%mm0, 64(%0)\n"
+		"  movq %%mm0, 72(%0)\n"
+		"  movq %%mm0, 80(%0)\n"
+		"  movq %%mm0, 88(%0)\n"
+		"  movq %%mm0, 96(%0)\n"
+		"  movq %%mm0, 104(%0)\n"
+		"  movq %%mm0, 112(%0)\n"
+		"  movq %%mm0, 120(%0)\n"
+		: : "r" (page) : "memory");
+		page+=128;
+	}
+
+	kernel_fpu_end();
+}
+
+static void fast_copy_page(void *to, void *from)
+{
+	int i;
+	
+	
+	kernel_fpu_begin();
+
+	__asm__ __volatile__ (
+		"1: prefetch (%0)\n"
+		"   prefetch 64(%0)\n"
+		"   prefetch 128(%0)\n"
+		"   prefetch 192(%0)\n"
+		"   prefetch 256(%0)\n"
+		"2:  \n"
+		".section .fixup, \"ax\"\n"
+		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
+		"   jmp 2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 4\n"
+		"	.long 1b, 3b\n"
+		".previous"
+		: : "r" (from) );
+
+	for(i=0; i<4096/64; i++)
+	{
+		__asm__ __volatile__ (
+		"1: prefetch 320(%0)\n"
+		"2: movq (%0), %%mm0\n"
+		"   movq 8(%0), %%mm1\n"
+		"   movq 16(%0), %%mm2\n"
+		"   movq 24(%0), %%mm3\n"
+		"   movq %%mm0, (%1)\n"
+		"   movq %%mm1, 8(%1)\n"
+		"   movq %%mm2, 16(%1)\n"
+		"   movq %%mm3, 24(%1)\n"
+		"   movq 32(%0), %%mm0\n"
+		"   movq 40(%0), %%mm1\n"
+		"   movq 48(%0), %%mm2\n"
+		"   movq 56(%0), %%mm3\n"
+		"   movq %%mm0, 32(%1)\n"
+		"   movq %%mm1, 40(%1)\n"
+		"   movq %%mm2, 48(%1)\n"
+		"   movq %%mm3, 56(%1)\n"
+		".section .fixup, \"ax\"\n"
+		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
+		"   jmp 2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 4\n"
+		"	.long 1b, 3b\n"
+		".previous"
+		: : "r" (from), "r" (to) : "memory");
+		from+=64;
+		to+=64;
+	}
+	kernel_fpu_end();
+}
+
+
+#endif
+
+/*
+ *	Favour MMX for page clear and copy. 
+ */
+
+static void slow_zero_page(void * page)
+{
+	int d0, d1;
+	__asm__ __volatile__( \
+		"cld\n\t" \
+		"rep ; stosl" \
+		: "=&c" (d0), "=&D" (d1)
+		:"a" (0),"1" (page),"0" (1024)
+		:"memory");
+}
+ 
+void mmx_clear_page(void * page)
+{
+	if(unlikely(in_interrupt()))
+		slow_zero_page(page);
+	else
+		fast_clear_page(page);
+}
+
+static void slow_copy_page(void *to, void *from)
+{
+	int d0, d1, d2;
+	__asm__ __volatile__( \
+		"cld\n\t" \
+		"rep ; movsl" \
+		: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
+		: "0" (1024),"1" ((long) to),"2" ((long) from) \
+		: "memory");
+}
+  
+
+void mmx_copy_page(void *to, void *from)
+{
+	if(unlikely(in_interrupt()))
+		slow_copy_page(to, from);
+	else
+		fast_copy_page(to, from);
+}
+
+EXPORT_SYMBOL(_mmx_memcpy);
+EXPORT_SYMBOL(mmx_clear_page);
+EXPORT_SYMBOL(mmx_copy_page);
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
new file mode 100644
index 000000000000..7767962f25d3
--- /dev/null
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -0,0 +1,119 @@
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/msr.h>
+
+struct msr_info {
+	u32 msr_no;
+	u32 l, h;
+	int err;
+};
+
+static void __rdmsr_on_cpu(void *info)
+{
+	struct msr_info *rv = info;
+
+	rdmsr(rv->msr_no, rv->l, rv->h);
+}
+
+static void __rdmsr_safe_on_cpu(void *info)
+{
+	struct msr_info *rv = info;
+
+	rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
+}
+
+static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe)
+{
+	int err = 0;
+	preempt_disable();
+	if (smp_processor_id() == cpu)
+		if (safe)
+			err = rdmsr_safe(msr_no, l, h);
+		else
+			rdmsr(msr_no, *l, *h);
+	else {
+		struct msr_info rv;
+
+		rv.msr_no = msr_no;
+		if (safe) {
+			smp_call_function_single(cpu, __rdmsr_safe_on_cpu,
+						 &rv, 0, 1);
+			err = rv.err;
+		} else {
+			smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1);
+		}
+		*l = rv.l;
+		*h = rv.h;
+	}
+	preempt_enable();
+	return err;
+}
+
+static void __wrmsr_on_cpu(void *info)
+{
+	struct msr_info *rv = info;
+
+	wrmsr(rv->msr_no, rv->l, rv->h);
+}
+
+static void __wrmsr_safe_on_cpu(void *info)
+{
+	struct msr_info *rv = info;
+
+	rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
+}
+
+static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe)
+{
+	int err = 0;
+	preempt_disable();
+	if (smp_processor_id() == cpu)
+		if (safe)
+			err = wrmsr_safe(msr_no, l, h);
+		else
+			wrmsr(msr_no, l, h);
+	else {
+		struct msr_info rv;
+
+		rv.msr_no = msr_no;
+		rv.l = l;
+		rv.h = h;
+		if (safe) {
+			smp_call_function_single(cpu, __wrmsr_safe_on_cpu,
+						 &rv, 0, 1);
+			err = rv.err;
+		} else {
+			smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1);
+		}
+	}
+	preempt_enable();
+	return err;
+}
+
+void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+	_wrmsr_on_cpu(cpu, msr_no, l, h, 0);
+}
+
+void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+	_rdmsr_on_cpu(cpu, msr_no, l, h, 0);
+}
+
+/* These "safe" variants are slower and should be used when the target MSR
+   may not actually exist. */
+int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+	return _wrmsr_on_cpu(cpu, msr_no, l, h, 1);
+}
+
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+	return _rdmsr_on_cpu(cpu, msr_no, l, h, 1);
+}
+
+EXPORT_SYMBOL(rdmsr_on_cpu);
+EXPORT_SYMBOL(wrmsr_on_cpu);
+EXPORT_SYMBOL(rdmsr_safe_on_cpu);
+EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/lib/putuser_32.S b/arch/x86/lib/putuser_32.S
new file mode 100644
index 000000000000..f58fba109d18
--- /dev/null
+++ b/arch/x86/lib/putuser_32.S
@@ -0,0 +1,98 @@
+/*
+ * __put_user functions.
+ *
+ * (C) Copyright 2005 Linus Torvalds
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/thread_info.h>
+
+
+/*
+ * __put_user_X
+ *
+ * Inputs:	%eax[:%edx] contains the data
+ *		%ecx contains the address
+ *
+ * Outputs:	%eax is error code (0 or -EFAULT)
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#define ENTER	CFI_STARTPROC ; \
+		pushl %ebx ; \
+		CFI_ADJUST_CFA_OFFSET 4 ; \
+		CFI_REL_OFFSET ebx, 0 ; \
+		GET_THREAD_INFO(%ebx)
+#define EXIT	popl %ebx ; \
+		CFI_ADJUST_CFA_OFFSET -4 ; \
+		CFI_RESTORE ebx ; \
+		ret ; \
+		CFI_ENDPROC
+
+.text
+ENTRY(__put_user_1)
+	ENTER
+	cmpl TI_addr_limit(%ebx),%ecx
+	jae bad_put_user
+1:	movb %al,(%ecx)
+	xorl %eax,%eax
+	EXIT
+ENDPROC(__put_user_1)
+
+ENTRY(__put_user_2)
+	ENTER
+	movl TI_addr_limit(%ebx),%ebx
+	subl $1,%ebx
+	cmpl %ebx,%ecx
+	jae bad_put_user
+2:	movw %ax,(%ecx)
+	xorl %eax,%eax
+	EXIT
+ENDPROC(__put_user_2)
+
+ENTRY(__put_user_4)
+	ENTER
+	movl TI_addr_limit(%ebx),%ebx
+	subl $3,%ebx
+	cmpl %ebx,%ecx
+	jae bad_put_user
+3:	movl %eax,(%ecx)
+	xorl %eax,%eax
+	EXIT
+ENDPROC(__put_user_4)
+
+ENTRY(__put_user_8)
+	ENTER
+	movl TI_addr_limit(%ebx),%ebx
+	subl $7,%ebx
+	cmpl %ebx,%ecx
+	jae bad_put_user
+4:	movl %eax,(%ecx)
+5:	movl %edx,4(%ecx)
+	xorl %eax,%eax
+	EXIT
+ENDPROC(__put_user_8)
+
+bad_put_user:
+	CFI_STARTPROC simple
+	CFI_DEF_CFA esp, 2*4
+	CFI_OFFSET eip, -1*4
+	CFI_OFFSET ebx, -2*4
+	movl $-14,%eax
+	EXIT
+END(bad_put_user)
+
+.section __ex_table,"a"
+	.long 1b,bad_put_user
+	.long 2b,bad_put_user
+	.long 3b,bad_put_user
+	.long 4b,bad_put_user
+	.long 5b,bad_put_user
+.previous
diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S
new file mode 100644
index 000000000000..4989f5a8fa9b
--- /dev/null
+++ b/arch/x86/lib/putuser_64.S
@@ -0,0 +1,106 @@
+/*
+ * __put_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+
+/*
+ * __put_user_X
+ *
+ * Inputs:	%rcx contains the address
+ *		%rdx contains new value
+ *
+ * Outputs:	%rax is error code (0 or -EFAULT)
+ *
+ * %r8 is destroyed.
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/page.h>
+#include <asm/errno.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+
+	.text
+ENTRY(__put_user_1)
+	CFI_STARTPROC
+	GET_THREAD_INFO(%r8)
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae bad_put_user
+1:	movb %dl,(%rcx)
+	xorl %eax,%eax
+	ret
+	CFI_ENDPROC
+ENDPROC(__put_user_1)
+
+ENTRY(__put_user_2)
+	CFI_STARTPROC
+	GET_THREAD_INFO(%r8)
+	addq $1,%rcx
+	jc 20f
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae 20f
+	decq %rcx
+2:	movw %dx,(%rcx)
+	xorl %eax,%eax
+	ret
+20:	decq %rcx
+	jmp bad_put_user
+	CFI_ENDPROC
+ENDPROC(__put_user_2)
+
+ENTRY(__put_user_4)
+	CFI_STARTPROC
+	GET_THREAD_INFO(%r8)
+	addq $3,%rcx
+	jc 30f
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae 30f
+	subq $3,%rcx
+3:	movl %edx,(%rcx)
+	xorl %eax,%eax
+	ret
+30:	subq $3,%rcx
+	jmp bad_put_user
+	CFI_ENDPROC
+ENDPROC(__put_user_4)
+
+ENTRY(__put_user_8)
+	CFI_STARTPROC
+	GET_THREAD_INFO(%r8)
+	addq $7,%rcx
+	jc 40f
+	cmpq threadinfo_addr_limit(%r8),%rcx
+	jae 40f
+	subq $7,%rcx
+4:	movq %rdx,(%rcx)
+	xorl %eax,%eax
+	ret
+40:	subq $7,%rcx
+	jmp bad_put_user
+	CFI_ENDPROC
+ENDPROC(__put_user_8)
+
+bad_put_user:
+	CFI_STARTPROC
+	movq $(-EFAULT),%rax
+	ret
+	CFI_ENDPROC
+END(bad_put_user)
+
+.section __ex_table,"a"
+	.quad 1b,bad_put_user
+	.quad 2b,bad_put_user
+	.quad 3b,bad_put_user
+	.quad 4b,bad_put_user
+.previous
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S
new file mode 100644
index 000000000000..0cde1f807314
--- /dev/null
+++ b/arch/x86/lib/rwlock_64.S
@@ -0,0 +1,38 @@
+/* Slow paths of read/write spinlocks. */
+
+#include <linux/linkage.h>
+#include <asm/rwlock.h>
+#include <asm/alternative-asm.i>
+#include <asm/dwarf2.h>
+
+/* rdi:	pointer to rwlock_t */
+ENTRY(__write_lock_failed)
+	CFI_STARTPROC
+	LOCK_PREFIX
+	addl $RW_LOCK_BIAS,(%rdi)
+1:	rep
+	nop
+	cmpl $RW_LOCK_BIAS,(%rdi)
+	jne 1b
+	LOCK_PREFIX
+	subl $RW_LOCK_BIAS,(%rdi)
+	jnz  __write_lock_failed
+	ret
+	CFI_ENDPROC
+END(__write_lock_failed)
+
+/* rdi:	pointer to rwlock_t */
+ENTRY(__read_lock_failed)
+	CFI_STARTPROC
+	LOCK_PREFIX
+	incl (%rdi)
+1:	rep
+	nop
+	cmpl $1,(%rdi)
+	js 1b
+	LOCK_PREFIX
+	decl (%rdi)
+	js __read_lock_failed
+	ret
+	CFI_ENDPROC
+END(__read_lock_failed)
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
new file mode 100644
index 000000000000..c01eb39c0b43
--- /dev/null
+++ b/arch/x86/lib/semaphore_32.S
@@ -0,0 +1,219 @@
+/*
+ * i386 semaphore implementation.
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Portions Copyright 1999 Red Hat, Inc.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/rwlock.h>
+#include <asm/alternative-asm.i>
+#include <asm/frame.i>
+#include <asm/dwarf2.h>
+
+/*
+ * The semaphore operations have a special calling sequence that
+ * allow us to do a simpler in-line version of them. These routines
+ * need to convert that sequence back into the C sequence when
+ * there is contention on the semaphore.
+ *
+ * %eax contains the semaphore pointer on entry. Save the C-clobbered
+ * registers (%eax, %edx and %ecx) except %eax whish is either a return
+ * value or just clobbered..
+ */
+	.section .sched.text
+ENTRY(__down_failed)
+	CFI_STARTPROC
+	FRAME
+	pushl %edx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edx,0
+	pushl %ecx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ecx,0
+	call __down
+	popl %ecx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE ecx
+	popl %edx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE edx
+	ENDFRAME
+	ret
+	CFI_ENDPROC
+	END(__down_failed)
+
+ENTRY(__down_failed_interruptible)
+	CFI_STARTPROC
+	FRAME
+	pushl %edx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edx,0
+	pushl %ecx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ecx,0
+	call __down_interruptible
+	popl %ecx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE ecx
+	popl %edx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE edx
+	ENDFRAME
+	ret
+	CFI_ENDPROC
+	END(__down_failed_interruptible)
+
+ENTRY(__down_failed_trylock)
+	CFI_STARTPROC
+	FRAME
+	pushl %edx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edx,0
+	pushl %ecx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ecx,0
+	call __down_trylock
+	popl %ecx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE ecx
+	popl %edx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE edx
+	ENDFRAME
+	ret
+	CFI_ENDPROC
+	END(__down_failed_trylock)
+
+ENTRY(__up_wakeup)
+	CFI_STARTPROC
+	FRAME
+	pushl %edx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edx,0
+	pushl %ecx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ecx,0
+	call __up
+	popl %ecx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE ecx
+	popl %edx
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE edx
+	ENDFRAME
+	ret
+	CFI_ENDPROC
+	END(__up_wakeup)
+
+/*
+ * rw spinlock fallbacks
+ */
+#ifdef CONFIG_SMP
+ENTRY(__write_lock_failed)
+	CFI_STARTPROC simple
+	FRAME
+2: 	LOCK_PREFIX
+	addl	$ RW_LOCK_BIAS,(%eax)
+1:	rep; nop
+	cmpl	$ RW_LOCK_BIAS,(%eax)
+	jne	1b
+	LOCK_PREFIX
+	subl	$ RW_LOCK_BIAS,(%eax)
+	jnz	2b
+	ENDFRAME
+	ret
+	CFI_ENDPROC
+	END(__write_lock_failed)
+
+ENTRY(__read_lock_failed)
+	CFI_STARTPROC
+	FRAME
+2: 	LOCK_PREFIX
+	incl	(%eax)
+1:	rep; nop
+	cmpl	$1,(%eax)
+	js	1b
+	LOCK_PREFIX
+	decl	(%eax)
+	js	2b
+	ENDFRAME
+	ret
+	CFI_ENDPROC
+	END(__read_lock_failed)
+
+#endif
+
+#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_down_read_failed)
+	CFI_STARTPROC
+	push %ecx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ecx,0
+	push %edx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edx,0
+	call rwsem_down_read_failed
+	pop %edx
+	CFI_ADJUST_CFA_OFFSET -4
+	pop %ecx
+	CFI_ADJUST_CFA_OFFSET -4
+	ret
+	CFI_ENDPROC
+	END(call_rwsem_down_read_failed)
+
+ENTRY(call_rwsem_down_write_failed)
+	CFI_STARTPROC
+	push %ecx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ecx,0
+	calll rwsem_down_write_failed
+	pop %ecx
+	CFI_ADJUST_CFA_OFFSET -4
+	ret
+	CFI_ENDPROC
+	END(call_rwsem_down_write_failed)
+
+ENTRY(call_rwsem_wake)
+	CFI_STARTPROC
+	decw %dx    /* do nothing if still outstanding active readers */
+	jnz 1f
+	push %ecx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ecx,0
+	call rwsem_wake
+	pop %ecx
+	CFI_ADJUST_CFA_OFFSET -4
+1:	ret
+	CFI_ENDPROC
+	END(call_rwsem_wake)
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_downgrade_wake)
+	CFI_STARTPROC
+	push %ecx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ecx,0
+	push %edx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edx,0
+	call rwsem_downgrade_wake
+	pop %edx
+	CFI_ADJUST_CFA_OFFSET -4
+	pop %ecx
+	CFI_ADJUST_CFA_OFFSET -4
+	ret
+	CFI_ENDPROC
+	END(call_rwsem_downgrade_wake)
+
+#endif
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
new file mode 100644
index 000000000000..2c773fefa3dd
--- /dev/null
+++ b/arch/x86/lib/string_32.c
@@ -0,0 +1,257 @@
+/*
+ * Most of the string-functions are rather heavily hand-optimized,
+ * see especially strsep,strstr,str[c]spn. They should work, but are not
+ * very easy to understand. Everything is done entirely within the register
+ * set, making the functions fast and clean. String instructions have been
+ * used through-out, making for "slightly" unclear code :-)
+ *
+ * AK: On P4 and K7 using non string instruction implementations might be faster
+ * for large memory blocks. But most of them are unlikely to be used on large
+ * strings.
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+
+#ifdef __HAVE_ARCH_STRCPY
+char *strcpy(char * dest,const char *src)
+{
+	int d0, d1, d2;
+	asm volatile( "1:\tlodsb\n\t"
+		"stosb\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b"
+		: "=&S" (d0), "=&D" (d1), "=&a" (d2)
+		:"0" (src),"1" (dest) : "memory");
+	return dest;
+}
+EXPORT_SYMBOL(strcpy);
+#endif
+
+#ifdef __HAVE_ARCH_STRNCPY
+char *strncpy(char * dest,const char *src,size_t count)
+{
+	int d0, d1, d2, d3;
+	asm volatile( "1:\tdecl %2\n\t"
+		"js 2f\n\t"
+		"lodsb\n\t"
+		"stosb\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b\n\t"
+		"rep\n\t"
+		"stosb\n"
+		"2:"
+		: "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
+		:"0" (src),"1" (dest),"2" (count) : "memory");
+	return dest;
+}
+EXPORT_SYMBOL(strncpy);
+#endif
+
+#ifdef __HAVE_ARCH_STRCAT
+char *strcat(char * dest,const char * src)
+{
+	int d0, d1, d2, d3;
+	asm volatile( "repne\n\t"
+		"scasb\n\t"
+		"decl %1\n"
+		"1:\tlodsb\n\t"
+		"stosb\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b"
+		: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
+		: "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory");
+	return dest;
+}
+EXPORT_SYMBOL(strcat);
+#endif
+
+#ifdef __HAVE_ARCH_STRNCAT
+char *strncat(char * dest,const char * src,size_t count)
+{
+	int d0, d1, d2, d3;
+	asm volatile( "repne\n\t"
+		"scasb\n\t"
+		"decl %1\n\t"
+		"movl %8,%3\n"
+		"1:\tdecl %3\n\t"
+		"js 2f\n\t"
+		"lodsb\n\t"
+		"stosb\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b\n"
+		"2:\txorl %2,%2\n\t"
+		"stosb"
+		: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
+		: "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count)
+		: "memory");
+	return dest;
+}
+EXPORT_SYMBOL(strncat);
+#endif
+
+#ifdef __HAVE_ARCH_STRCMP
+int strcmp(const char * cs,const char * ct)
+{
+	int d0, d1;
+	int res;
+	asm volatile( "1:\tlodsb\n\t"
+		"scasb\n\t"
+		"jne 2f\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b\n\t"
+		"xorl %%eax,%%eax\n\t"
+		"jmp 3f\n"
+		"2:\tsbbl %%eax,%%eax\n\t"
+		"orb $1,%%al\n"
+		"3:"
+		:"=a" (res), "=&S" (d0), "=&D" (d1)
+		:"1" (cs),"2" (ct)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(strcmp);
+#endif
+
+#ifdef __HAVE_ARCH_STRNCMP
+int strncmp(const char * cs,const char * ct,size_t count)
+{
+	int res;
+	int d0, d1, d2;
+	asm volatile( "1:\tdecl %3\n\t"
+		"js 2f\n\t"
+		"lodsb\n\t"
+		"scasb\n\t"
+		"jne 3f\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b\n"
+		"2:\txorl %%eax,%%eax\n\t"
+		"jmp 4f\n"
+		"3:\tsbbl %%eax,%%eax\n\t"
+		"orb $1,%%al\n"
+		"4:"
+		:"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
+		:"1" (cs),"2" (ct),"3" (count)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(strncmp);
+#endif
+
+#ifdef __HAVE_ARCH_STRCHR
+char *strchr(const char * s, int c)
+{
+	int d0;
+	char * res;
+	asm volatile( "movb %%al,%%ah\n"
+		"1:\tlodsb\n\t"
+		"cmpb %%ah,%%al\n\t"
+		"je 2f\n\t"
+		"testb %%al,%%al\n\t"
+		"jne 1b\n\t"
+		"movl $1,%1\n"
+		"2:\tmovl %1,%0\n\t"
+		"decl %0"
+		:"=a" (res), "=&S" (d0)
+		:"1" (s),"0" (c)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(strchr);
+#endif
+
+#ifdef __HAVE_ARCH_STRRCHR
+char *strrchr(const char * s, int c)
+{
+	int d0, d1;
+	char * res;
+	asm volatile( "movb %%al,%%ah\n"
+		"1:\tlodsb\n\t"
+		"cmpb %%ah,%%al\n\t"
+		"jne 2f\n\t"
+		"leal -1(%%esi),%0\n"
+		"2:\ttestb %%al,%%al\n\t"
+		"jne 1b"
+		:"=g" (res), "=&S" (d0), "=&a" (d1)
+		:"0" (0),"1" (s),"2" (c)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(strrchr);
+#endif
+
+#ifdef __HAVE_ARCH_STRLEN
+size_t strlen(const char * s)
+{
+	int d0;
+	int res;
+	asm volatile( "repne\n\t"
+		"scasb\n\t"
+		"notl %0\n\t"
+		"decl %0"
+		:"=c" (res), "=&D" (d0)
+		:"1" (s),"a" (0), "0" (0xffffffffu)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(strlen);
+#endif
+
+#ifdef __HAVE_ARCH_MEMCHR
+void *memchr(const void *cs,int c,size_t count)
+{
+	int d0;
+	void *res;
+	if (!count)
+		return NULL;
+	asm volatile( "repne\n\t"
+		"scasb\n\t"
+		"je 1f\n\t"
+		"movl $1,%0\n"
+		"1:\tdecl %0"
+		:"=D" (res), "=&c" (d0)
+		:"a" (c),"0" (cs),"1" (count)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(memchr);
+#endif
+
+#ifdef __HAVE_ARCH_MEMSCAN
+void *memscan(void * addr, int c, size_t size)
+{
+	if (!size)
+		return addr;
+	asm volatile("repnz; scasb\n\t"
+	    "jnz 1f\n\t"
+	    "dec %%edi\n"
+	    "1:"
+	    : "=D" (addr), "=c" (size)
+	    : "0" (addr), "1" (size), "a" (c)
+	    : "memory");
+	return addr;
+}
+EXPORT_SYMBOL(memscan);
+#endif
+
+#ifdef __HAVE_ARCH_STRNLEN
+size_t strnlen(const char *s, size_t count)
+{
+	int d0;
+	int res;
+	asm volatile( "movl %2,%0\n\t"
+		"jmp 2f\n"
+		"1:\tcmpb $0,(%0)\n\t"
+		"je 3f\n\t"
+		"incl %0\n"
+		"2:\tdecl %1\n\t"
+		"cmpl $-1,%1\n\t"
+		"jne 1b\n"
+		"3:\tsubl %2,%0"
+		:"=a" (res), "=&d" (d0)
+		:"c" (s),"1" (count)
+		:"memory");
+	return res;
+}
+EXPORT_SYMBOL(strnlen);
+#endif
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
new file mode 100644
index 000000000000..a3dafbf59dae
--- /dev/null
+++ b/arch/x86/lib/strstr_32.c
@@ -0,0 +1,31 @@
+#include <linux/string.h>
+
+char * strstr(const char * cs,const char * ct)
+{
+int	d0, d1;
+register char * __res;
+__asm__ __volatile__(
+	"movl %6,%%edi\n\t"
+	"repne\n\t"
+	"scasb\n\t"
+	"notl %%ecx\n\t"
+	"decl %%ecx\n\t"	/* NOTE! This also sets Z if searchstring='' */
+	"movl %%ecx,%%edx\n"
+	"1:\tmovl %6,%%edi\n\t"
+	"movl %%esi,%%eax\n\t"
+	"movl %%edx,%%ecx\n\t"
+	"repe\n\t"
+	"cmpsb\n\t"
+	"je 2f\n\t"		/* also works for empty string, see above */
+	"xchgl %%eax,%%esi\n\t"
+	"incl %%esi\n\t"
+	"cmpb $0,-1(%%eax)\n\t"
+	"jne 1b\n\t"
+	"xorl %%eax,%%eax\n\t"
+	"2:"
+	:"=a" (__res), "=&c" (d0), "=&S" (d1)
+	:"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
+	:"dx", "di");
+return __res;
+}
+
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
new file mode 100644
index 000000000000..55e586d352d3
--- /dev/null
+++ b/arch/x86/lib/thunk_64.S
@@ -0,0 +1,67 @@
+/*
+ * Save registers before calling assembly functions. This avoids
+ * disturbance of register allocation in some inline assembly constructs.
+ * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ */
+
+	#include <linux/linkage.h>
+	#include <asm/dwarf2.h>
+	#include <asm/calling.h>			
+	#include <asm/rwlock.h>
+		
+	/* rdi:	arg1 ... normal C conventions. rax is saved/restored. */ 	
+	.macro thunk name,func
+	.globl \name
+\name:	
+	CFI_STARTPROC
+	SAVE_ARGS
+	call \func
+	jmp  restore
+	CFI_ENDPROC
+	.endm
+
+	/* rdi:	arg1 ... normal C conventions. rax is passed from C. */ 	
+	.macro thunk_retrax name,func
+	.globl \name
+\name:	
+	CFI_STARTPROC
+	SAVE_ARGS
+	call \func
+	jmp  restore_norax
+	CFI_ENDPROC
+	.endm
+	
+
+	.section .sched.text
+#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
+	thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
+	thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
+	thunk rwsem_wake_thunk,rwsem_wake
+	thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
+#endif	
+	
+	thunk __down_failed,__down
+	thunk_retrax __down_failed_interruptible,__down_interruptible
+	thunk_retrax __down_failed_trylock,__down_trylock
+	thunk __up_wakeup,__up
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+	thunk trace_hardirqs_on_thunk,trace_hardirqs_on
+	thunk trace_hardirqs_off_thunk,trace_hardirqs_off
+#endif
+	
+	/* SAVE_ARGS below is used only for the .cfi directives it contains. */
+	CFI_STARTPROC
+	SAVE_ARGS
+restore:
+	RESTORE_ARGS
+	ret	
+	CFI_ENDPROC
+	
+	CFI_STARTPROC
+	SAVE_ARGS
+restore_norax:	
+	RESTORE_ARGS 1
+	ret
+	CFI_ENDPROC
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
new file mode 100644
index 000000000000..9f38b12b4af1
--- /dev/null
+++ b/arch/x86/lib/usercopy_32.c
@@ -0,0 +1,882 @@
+/* 
+ * User address space access functions.
+ * The non inlined parts of asm-i386/uaccess.h are here.
+ *
+ * Copyright 1997 Andi Kleen <ak@muc.de>
+ * Copyright 1997 Linus Torvalds
+ */
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+#include <asm/mmx.h>
+
+static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
+{
+#ifdef CONFIG_X86_INTEL_USERCOPY
+	if (n >= 64 && ((a1 ^ a2) & movsl_mask.mask))
+		return 0;
+#endif
+	return 1;
+}
+#define movsl_is_ok(a1,a2,n) \
+	__movsl_is_ok((unsigned long)(a1),(unsigned long)(a2),(n))
+
+/*
+ * Copy a null terminated string from userspace.
+ */
+
+#define __do_strncpy_from_user(dst,src,count,res)			   \
+do {									   \
+	int __d0, __d1, __d2;						   \
+	might_sleep();							   \
+	__asm__ __volatile__(						   \
+		"	testl %1,%1\n"					   \
+		"	jz 2f\n"					   \
+		"0:	lodsb\n"					   \
+		"	stosb\n"					   \
+		"	testb %%al,%%al\n"				   \
+		"	jz 1f\n"					   \
+		"	decl %1\n"					   \
+		"	jnz 0b\n"					   \
+		"1:	subl %1,%0\n"					   \
+		"2:\n"							   \
+		".section .fixup,\"ax\"\n"				   \
+		"3:	movl %5,%0\n"					   \
+		"	jmp 2b\n"					   \
+		".previous\n"						   \
+		".section __ex_table,\"a\"\n"				   \
+		"	.align 4\n"					   \
+		"	.long 0b,3b\n"					   \
+		".previous"						   \
+		: "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1),	   \
+		  "=&D" (__d2)						   \
+		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
+		: "memory");						   \
+} while (0)
+
+/**
+ * __strncpy_from_user: - Copy a NUL terminated string from userspace, with less checking.
+ * @dst:   Destination address, in kernel space.  This buffer must be at
+ *         least @count bytes long.
+ * @src:   Source address, in user space.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ * 
+ * Copies a NUL-terminated string from userspace to kernel space.
+ * Caller must check the specified block with access_ok() before calling
+ * this function.
+ *
+ * On success, returns the length of the string (not including the trailing
+ * NUL).
+ *
+ * If access to userspace fails, returns -EFAULT (some data may have been
+ * copied).
+ *
+ * If @count is smaller than the length of the string, copies @count bytes
+ * and returns @count.
+ */
+long
+__strncpy_from_user(char *dst, const char __user *src, long count)
+{
+	long res;
+	__do_strncpy_from_user(dst, src, count, res);
+	return res;
+}
+EXPORT_SYMBOL(__strncpy_from_user);
+
+/**
+ * strncpy_from_user: - Copy a NUL terminated string from userspace.
+ * @dst:   Destination address, in kernel space.  This buffer must be at
+ *         least @count bytes long.
+ * @src:   Source address, in user space.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ * 
+ * Copies a NUL-terminated string from userspace to kernel space.
+ *
+ * On success, returns the length of the string (not including the trailing
+ * NUL).
+ *
+ * If access to userspace fails, returns -EFAULT (some data may have been
+ * copied).
+ *
+ * If @count is smaller than the length of the string, copies @count bytes
+ * and returns @count.
+ */
+long
+strncpy_from_user(char *dst, const char __user *src, long count)
+{
+	long res = -EFAULT;
+	if (access_ok(VERIFY_READ, src, 1))
+		__do_strncpy_from_user(dst, src, count, res);
+	return res;
+}
+EXPORT_SYMBOL(strncpy_from_user);
+
+/*
+ * Zero Userspace
+ */
+
+#define __do_clear_user(addr,size)					\
+do {									\
+	int __d0;							\
+	might_sleep();							\
+  	__asm__ __volatile__(						\
+		"0:	rep; stosl\n"					\
+		"	movl %2,%0\n"					\
+		"1:	rep; stosb\n"					\
+		"2:\n"							\
+		".section .fixup,\"ax\"\n"				\
+		"3:	lea 0(%2,%0,4),%0\n"				\
+		"	jmp 2b\n"					\
+		".previous\n"						\
+		".section __ex_table,\"a\"\n"				\
+		"	.align 4\n"					\
+		"	.long 0b,3b\n"					\
+		"	.long 1b,2b\n"					\
+		".previous"						\
+		: "=&c"(size), "=&D" (__d0)				\
+		: "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0));	\
+} while (0)
+
+/**
+ * clear_user: - Zero a block of memory in user space.
+ * @to:   Destination address, in user space.
+ * @n:    Number of bytes to zero.
+ *
+ * Zero a block of memory in user space.
+ *
+ * Returns number of bytes that could not be cleared.
+ * On success, this will be zero.
+ */
+unsigned long
+clear_user(void __user *to, unsigned long n)
+{
+	might_sleep();
+	if (access_ok(VERIFY_WRITE, to, n))
+		__do_clear_user(to, n);
+	return n;
+}
+EXPORT_SYMBOL(clear_user);
+
+/**
+ * __clear_user: - Zero a block of memory in user space, with less checking.
+ * @to:   Destination address, in user space.
+ * @n:    Number of bytes to zero.
+ *
+ * Zero a block of memory in user space.  Caller must check
+ * the specified block with access_ok() before calling this function.
+ *
+ * Returns number of bytes that could not be cleared.
+ * On success, this will be zero.
+ */
+unsigned long
+__clear_user(void __user *to, unsigned long n)
+{
+	__do_clear_user(to, n);
+	return n;
+}
+EXPORT_SYMBOL(__clear_user);
+
+/**
+ * strnlen_user: - Get the size of a string in user space.
+ * @s: The string to measure.
+ * @n: The maximum valid length
+ *
+ * Get the size of a NUL-terminated string in user space.
+ *
+ * Returns the size of the string INCLUDING the terminating NUL.
+ * On exception, returns 0.
+ * If the string is too long, returns a value greater than @n.
+ */
+long strnlen_user(const char __user *s, long n)
+{
+	unsigned long mask = -__addr_ok(s);
+	unsigned long res, tmp;
+
+	might_sleep();
+
+	__asm__ __volatile__(
+		"	testl %0, %0\n"
+		"	jz 3f\n"
+		"	andl %0,%%ecx\n"
+		"0:	repne; scasb\n"
+		"	setne %%al\n"
+		"	subl %%ecx,%0\n"
+		"	addl %0,%%eax\n"
+		"1:\n"
+		".section .fixup,\"ax\"\n"
+		"2:	xorl %%eax,%%eax\n"
+		"	jmp 1b\n"
+		"3:	movb $1,%%al\n"
+		"	jmp 1b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 4\n"
+		"	.long 0b,2b\n"
+		".previous"
+		:"=r" (n), "=D" (s), "=a" (res), "=c" (tmp)
+		:"0" (n), "1" (s), "2" (0), "3" (mask)
+		:"cc");
+	return res & mask;
+}
+EXPORT_SYMBOL(strnlen_user);
+
+#ifdef CONFIG_X86_INTEL_USERCOPY
+static unsigned long
+__copy_user_intel(void __user *to, const void *from, unsigned long size)
+{
+	int d0, d1;
+	__asm__ __volatile__(
+		       "       .align 2,0x90\n"
+		       "1:     movl 32(%4), %%eax\n"
+		       "       cmpl $67, %0\n"
+		       "       jbe 3f\n"
+		       "2:     movl 64(%4), %%eax\n"
+		       "       .align 2,0x90\n"
+		       "3:     movl 0(%4), %%eax\n"
+		       "4:     movl 4(%4), %%edx\n"
+		       "5:     movl %%eax, 0(%3)\n"
+		       "6:     movl %%edx, 4(%3)\n"
+		       "7:     movl 8(%4), %%eax\n"
+		       "8:     movl 12(%4),%%edx\n"
+		       "9:     movl %%eax, 8(%3)\n"
+		       "10:    movl %%edx, 12(%3)\n"
+		       "11:    movl 16(%4), %%eax\n"
+		       "12:    movl 20(%4), %%edx\n"
+		       "13:    movl %%eax, 16(%3)\n"
+		       "14:    movl %%edx, 20(%3)\n"
+		       "15:    movl 24(%4), %%eax\n"
+		       "16:    movl 28(%4), %%edx\n"
+		       "17:    movl %%eax, 24(%3)\n"
+		       "18:    movl %%edx, 28(%3)\n"
+		       "19:    movl 32(%4), %%eax\n"
+		       "20:    movl 36(%4), %%edx\n"
+		       "21:    movl %%eax, 32(%3)\n"
+		       "22:    movl %%edx, 36(%3)\n"
+		       "23:    movl 40(%4), %%eax\n"
+		       "24:    movl 44(%4), %%edx\n"
+		       "25:    movl %%eax, 40(%3)\n"
+		       "26:    movl %%edx, 44(%3)\n"
+		       "27:    movl 48(%4), %%eax\n"
+		       "28:    movl 52(%4), %%edx\n"
+		       "29:    movl %%eax, 48(%3)\n"
+		       "30:    movl %%edx, 52(%3)\n"
+		       "31:    movl 56(%4), %%eax\n"
+		       "32:    movl 60(%4), %%edx\n"
+		       "33:    movl %%eax, 56(%3)\n"
+		       "34:    movl %%edx, 60(%3)\n"
+		       "       addl $-64, %0\n"
+		       "       addl $64, %4\n"
+		       "       addl $64, %3\n"
+		       "       cmpl $63, %0\n"
+		       "       ja  1b\n"
+		       "35:    movl  %0, %%eax\n"
+		       "       shrl  $2, %0\n"
+		       "       andl  $3, %%eax\n"
+		       "       cld\n"
+		       "99:    rep; movsl\n"
+		       "36:    movl %%eax, %0\n"
+		       "37:    rep; movsb\n"
+		       "100:\n"
+		       ".section .fixup,\"ax\"\n"
+		       "101:   lea 0(%%eax,%0,4),%0\n"
+		       "       jmp 100b\n"
+		       ".previous\n"
+		       ".section __ex_table,\"a\"\n"
+		       "       .align 4\n"
+		       "       .long 1b,100b\n"
+		       "       .long 2b,100b\n"
+		       "       .long 3b,100b\n"
+		       "       .long 4b,100b\n"
+		       "       .long 5b,100b\n"
+		       "       .long 6b,100b\n"
+		       "       .long 7b,100b\n"
+		       "       .long 8b,100b\n"
+		       "       .long 9b,100b\n"
+		       "       .long 10b,100b\n"
+		       "       .long 11b,100b\n"
+		       "       .long 12b,100b\n"
+		       "       .long 13b,100b\n"
+		       "       .long 14b,100b\n"
+		       "       .long 15b,100b\n"
+		       "       .long 16b,100b\n"
+		       "       .long 17b,100b\n"
+		       "       .long 18b,100b\n"
+		       "       .long 19b,100b\n"
+		       "       .long 20b,100b\n"
+		       "       .long 21b,100b\n"
+		       "       .long 22b,100b\n"
+		       "       .long 23b,100b\n"
+		       "       .long 24b,100b\n"
+		       "       .long 25b,100b\n"
+		       "       .long 26b,100b\n"
+		       "       .long 27b,100b\n"
+		       "       .long 28b,100b\n"
+		       "       .long 29b,100b\n"
+		       "       .long 30b,100b\n"
+		       "       .long 31b,100b\n"
+		       "       .long 32b,100b\n"
+		       "       .long 33b,100b\n"
+		       "       .long 34b,100b\n"
+		       "       .long 35b,100b\n"
+		       "       .long 36b,100b\n"
+		       "       .long 37b,100b\n"
+		       "       .long 99b,101b\n"
+		       ".previous"
+		       : "=&c"(size), "=&D" (d0), "=&S" (d1)
+		       :  "1"(to), "2"(from), "0"(size)
+		       : "eax", "edx", "memory");
+	return size;
+}
+
+static unsigned long
+__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
+{
+	int d0, d1;
+	__asm__ __volatile__(
+		       "        .align 2,0x90\n"
+		       "0:      movl 32(%4), %%eax\n"
+		       "        cmpl $67, %0\n"      
+		       "        jbe 2f\n"            
+		       "1:      movl 64(%4), %%eax\n"
+		       "        .align 2,0x90\n"     
+		       "2:      movl 0(%4), %%eax\n" 
+		       "21:     movl 4(%4), %%edx\n" 
+		       "        movl %%eax, 0(%3)\n" 
+		       "        movl %%edx, 4(%3)\n" 
+		       "3:      movl 8(%4), %%eax\n" 
+		       "31:     movl 12(%4),%%edx\n" 
+		       "        movl %%eax, 8(%3)\n" 
+		       "        movl %%edx, 12(%3)\n"
+		       "4:      movl 16(%4), %%eax\n"
+		       "41:     movl 20(%4), %%edx\n"
+		       "        movl %%eax, 16(%3)\n"
+		       "        movl %%edx, 20(%3)\n"
+		       "10:     movl 24(%4), %%eax\n"
+		       "51:     movl 28(%4), %%edx\n"
+		       "        movl %%eax, 24(%3)\n"
+		       "        movl %%edx, 28(%3)\n"
+		       "11:     movl 32(%4), %%eax\n"
+		       "61:     movl 36(%4), %%edx\n"
+		       "        movl %%eax, 32(%3)\n"
+		       "        movl %%edx, 36(%3)\n"
+		       "12:     movl 40(%4), %%eax\n"
+		       "71:     movl 44(%4), %%edx\n"
+		       "        movl %%eax, 40(%3)\n"
+		       "        movl %%edx, 44(%3)\n"
+		       "13:     movl 48(%4), %%eax\n"
+		       "81:     movl 52(%4), %%edx\n"
+		       "        movl %%eax, 48(%3)\n"
+		       "        movl %%edx, 52(%3)\n"
+		       "14:     movl 56(%4), %%eax\n"
+		       "91:     movl 60(%4), %%edx\n"
+		       "        movl %%eax, 56(%3)\n"
+		       "        movl %%edx, 60(%3)\n"
+		       "        addl $-64, %0\n"     
+		       "        addl $64, %4\n"      
+		       "        addl $64, %3\n"      
+		       "        cmpl $63, %0\n"      
+		       "        ja  0b\n"            
+		       "5:      movl  %0, %%eax\n"   
+		       "        shrl  $2, %0\n"      
+		       "        andl $3, %%eax\n"    
+		       "        cld\n"               
+		       "6:      rep; movsl\n"   
+		       "        movl %%eax,%0\n"
+		       "7:      rep; movsb\n"	
+		       "8:\n"			
+		       ".section .fixup,\"ax\"\n"
+		       "9:      lea 0(%%eax,%0,4),%0\n"	
+		       "16:     pushl %0\n"	
+		       "        pushl %%eax\n"	
+		       "        xorl %%eax,%%eax\n"
+		       "        rep; stosb\n"	
+		       "        popl %%eax\n"	
+		       "        popl %0\n"	
+		       "        jmp 8b\n"	
+		       ".previous\n"		
+		       ".section __ex_table,\"a\"\n"
+		       "	.align 4\n"	   
+		       "	.long 0b,16b\n"	 
+		       "	.long 1b,16b\n"
+		       "	.long 2b,16b\n"
+		       "	.long 21b,16b\n"
+		       "	.long 3b,16b\n"	
+		       "	.long 31b,16b\n"
+		       "	.long 4b,16b\n"	
+		       "	.long 41b,16b\n"
+		       "	.long 10b,16b\n"
+		       "	.long 51b,16b\n"
+		       "	.long 11b,16b\n"
+		       "	.long 61b,16b\n"
+		       "	.long 12b,16b\n"
+		       "	.long 71b,16b\n"
+		       "	.long 13b,16b\n"
+		       "	.long 81b,16b\n"
+		       "	.long 14b,16b\n"
+		       "	.long 91b,16b\n"
+		       "	.long 6b,9b\n"	
+		       "        .long 7b,16b\n" 
+		       ".previous"		
+		       : "=&c"(size), "=&D" (d0), "=&S" (d1)
+		       :  "1"(to), "2"(from), "0"(size)
+		       : "eax", "edx", "memory");
+	return size;
+}
+
+/*
+ * Non Temporal Hint version of __copy_user_zeroing_intel.  It is cache aware.
+ * hyoshiok@miraclelinux.com
+ */
+
+static unsigned long __copy_user_zeroing_intel_nocache(void *to,
+				const void __user *from, unsigned long size)
+{
+        int d0, d1;
+
+	__asm__ __volatile__(
+	       "        .align 2,0x90\n"
+	       "0:      movl 32(%4), %%eax\n"
+	       "        cmpl $67, %0\n"
+	       "        jbe 2f\n"
+	       "1:      movl 64(%4), %%eax\n"
+	       "        .align 2,0x90\n"
+	       "2:      movl 0(%4), %%eax\n"
+	       "21:     movl 4(%4), %%edx\n"
+	       "        movnti %%eax, 0(%3)\n"
+	       "        movnti %%edx, 4(%3)\n"
+	       "3:      movl 8(%4), %%eax\n"
+	       "31:     movl 12(%4),%%edx\n"
+	       "        movnti %%eax, 8(%3)\n"
+	       "        movnti %%edx, 12(%3)\n"
+	       "4:      movl 16(%4), %%eax\n"
+	       "41:     movl 20(%4), %%edx\n"
+	       "        movnti %%eax, 16(%3)\n"
+	       "        movnti %%edx, 20(%3)\n"
+	       "10:     movl 24(%4), %%eax\n"
+	       "51:     movl 28(%4), %%edx\n"
+	       "        movnti %%eax, 24(%3)\n"
+	       "        movnti %%edx, 28(%3)\n"
+	       "11:     movl 32(%4), %%eax\n"
+	       "61:     movl 36(%4), %%edx\n"
+	       "        movnti %%eax, 32(%3)\n"
+	       "        movnti %%edx, 36(%3)\n"
+	       "12:     movl 40(%4), %%eax\n"
+	       "71:     movl 44(%4), %%edx\n"
+	       "        movnti %%eax, 40(%3)\n"
+	       "        movnti %%edx, 44(%3)\n"
+	       "13:     movl 48(%4), %%eax\n"
+	       "81:     movl 52(%4), %%edx\n"
+	       "        movnti %%eax, 48(%3)\n"
+	       "        movnti %%edx, 52(%3)\n"
+	       "14:     movl 56(%4), %%eax\n"
+	       "91:     movl 60(%4), %%edx\n"
+	       "        movnti %%eax, 56(%3)\n"
+	       "        movnti %%edx, 60(%3)\n"
+	       "        addl $-64, %0\n"
+	       "        addl $64, %4\n"
+	       "        addl $64, %3\n"
+	       "        cmpl $63, %0\n"
+	       "        ja  0b\n"
+	       "        sfence \n"
+	       "5:      movl  %0, %%eax\n"
+	       "        shrl  $2, %0\n"
+	       "        andl $3, %%eax\n"
+	       "        cld\n"
+	       "6:      rep; movsl\n"
+	       "        movl %%eax,%0\n"
+	       "7:      rep; movsb\n"
+	       "8:\n"
+	       ".section .fixup,\"ax\"\n"
+	       "9:      lea 0(%%eax,%0,4),%0\n"
+	       "16:     pushl %0\n"
+	       "        pushl %%eax\n"
+	       "        xorl %%eax,%%eax\n"
+	       "        rep; stosb\n"
+	       "        popl %%eax\n"
+	       "        popl %0\n"
+	       "        jmp 8b\n"
+	       ".previous\n"
+	       ".section __ex_table,\"a\"\n"
+	       "	.align 4\n"
+	       "	.long 0b,16b\n"
+	       "	.long 1b,16b\n"
+	       "	.long 2b,16b\n"
+	       "	.long 21b,16b\n"
+	       "	.long 3b,16b\n"
+	       "	.long 31b,16b\n"
+	       "	.long 4b,16b\n"
+	       "	.long 41b,16b\n"
+	       "	.long 10b,16b\n"
+	       "	.long 51b,16b\n"
+	       "	.long 11b,16b\n"
+	       "	.long 61b,16b\n"
+	       "	.long 12b,16b\n"
+	       "	.long 71b,16b\n"
+	       "	.long 13b,16b\n"
+	       "	.long 81b,16b\n"
+	       "	.long 14b,16b\n"
+	       "	.long 91b,16b\n"
+	       "	.long 6b,9b\n"
+	       "        .long 7b,16b\n"
+	       ".previous"
+	       : "=&c"(size), "=&D" (d0), "=&S" (d1)
+	       :  "1"(to), "2"(from), "0"(size)
+	       : "eax", "edx", "memory");
+	return size;
+}
+
+static unsigned long __copy_user_intel_nocache(void *to,
+				const void __user *from, unsigned long size)
+{
+        int d0, d1;
+
+	__asm__ __volatile__(
+	       "        .align 2,0x90\n"
+	       "0:      movl 32(%4), %%eax\n"
+	       "        cmpl $67, %0\n"
+	       "        jbe 2f\n"
+	       "1:      movl 64(%4), %%eax\n"
+	       "        .align 2,0x90\n"
+	       "2:      movl 0(%4), %%eax\n"
+	       "21:     movl 4(%4), %%edx\n"
+	       "        movnti %%eax, 0(%3)\n"
+	       "        movnti %%edx, 4(%3)\n"
+	       "3:      movl 8(%4), %%eax\n"
+	       "31:     movl 12(%4),%%edx\n"
+	       "        movnti %%eax, 8(%3)\n"
+	       "        movnti %%edx, 12(%3)\n"
+	       "4:      movl 16(%4), %%eax\n"
+	       "41:     movl 20(%4), %%edx\n"
+	       "        movnti %%eax, 16(%3)\n"
+	       "        movnti %%edx, 20(%3)\n"
+	       "10:     movl 24(%4), %%eax\n"
+	       "51:     movl 28(%4), %%edx\n"
+	       "        movnti %%eax, 24(%3)\n"
+	       "        movnti %%edx, 28(%3)\n"
+	       "11:     movl 32(%4), %%eax\n"
+	       "61:     movl 36(%4), %%edx\n"
+	       "        movnti %%eax, 32(%3)\n"
+	       "        movnti %%edx, 36(%3)\n"
+	       "12:     movl 40(%4), %%eax\n"
+	       "71:     movl 44(%4), %%edx\n"
+	       "        movnti %%eax, 40(%3)\n"
+	       "        movnti %%edx, 44(%3)\n"
+	       "13:     movl 48(%4), %%eax\n"
+	       "81:     movl 52(%4), %%edx\n"
+	       "        movnti %%eax, 48(%3)\n"
+	       "        movnti %%edx, 52(%3)\n"
+	       "14:     movl 56(%4), %%eax\n"
+	       "91:     movl 60(%4), %%edx\n"
+	       "        movnti %%eax, 56(%3)\n"
+	       "        movnti %%edx, 60(%3)\n"
+	       "        addl $-64, %0\n"
+	       "        addl $64, %4\n"
+	       "        addl $64, %3\n"
+	       "        cmpl $63, %0\n"
+	       "        ja  0b\n"
+	       "        sfence \n"
+	       "5:      movl  %0, %%eax\n"
+	       "        shrl  $2, %0\n"
+	       "        andl $3, %%eax\n"
+	       "        cld\n"
+	       "6:      rep; movsl\n"
+	       "        movl %%eax,%0\n"
+	       "7:      rep; movsb\n"
+	       "8:\n"
+	       ".section .fixup,\"ax\"\n"
+	       "9:      lea 0(%%eax,%0,4),%0\n"
+	       "16:     jmp 8b\n"
+	       ".previous\n"
+	       ".section __ex_table,\"a\"\n"
+	       "	.align 4\n"
+	       "	.long 0b,16b\n"
+	       "	.long 1b,16b\n"
+	       "	.long 2b,16b\n"
+	       "	.long 21b,16b\n"
+	       "	.long 3b,16b\n"
+	       "	.long 31b,16b\n"
+	       "	.long 4b,16b\n"
+	       "	.long 41b,16b\n"
+	       "	.long 10b,16b\n"
+	       "	.long 51b,16b\n"
+	       "	.long 11b,16b\n"
+	       "	.long 61b,16b\n"
+	       "	.long 12b,16b\n"
+	       "	.long 71b,16b\n"
+	       "	.long 13b,16b\n"
+	       "	.long 81b,16b\n"
+	       "	.long 14b,16b\n"
+	       "	.long 91b,16b\n"
+	       "	.long 6b,9b\n"
+	       "        .long 7b,16b\n"
+	       ".previous"
+	       : "=&c"(size), "=&D" (d0), "=&S" (d1)
+	       :  "1"(to), "2"(from), "0"(size)
+	       : "eax", "edx", "memory");
+	return size;
+}
+
+#else
+
+/*
+ * Leave these declared but undefined.  They should not be any references to
+ * them
+ */
+unsigned long __copy_user_zeroing_intel(void *to, const void __user *from,
+					unsigned long size);
+unsigned long __copy_user_intel(void __user *to, const void *from,
+					unsigned long size);
+unsigned long __copy_user_zeroing_intel_nocache(void *to,
+				const void __user *from, unsigned long size);
+#endif /* CONFIG_X86_INTEL_USERCOPY */
+
+/* Generic arbitrary sized copy.  */
+#define __copy_user(to,from,size)					\
+do {									\
+	int __d0, __d1, __d2;						\
+	__asm__ __volatile__(						\
+		"	cmp  $7,%0\n"					\
+		"	jbe  1f\n"					\
+		"	movl %1,%0\n"					\
+		"	negl %0\n"					\
+		"	andl $7,%0\n"					\
+		"	subl %0,%3\n"					\
+		"4:	rep; movsb\n"					\
+		"	movl %3,%0\n"					\
+		"	shrl $2,%0\n"					\
+		"	andl $3,%3\n"					\
+		"	.align 2,0x90\n"				\
+		"0:	rep; movsl\n"					\
+		"	movl %3,%0\n"					\
+		"1:	rep; movsb\n"					\
+		"2:\n"							\
+		".section .fixup,\"ax\"\n"				\
+		"5:	addl %3,%0\n"					\
+		"	jmp 2b\n"					\
+		"3:	lea 0(%3,%0,4),%0\n"				\
+		"	jmp 2b\n"					\
+		".previous\n"						\
+		".section __ex_table,\"a\"\n"				\
+		"	.align 4\n"					\
+		"	.long 4b,5b\n"					\
+		"	.long 0b,3b\n"					\
+		"	.long 1b,2b\n"					\
+		".previous"						\
+		: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2)	\
+		: "3"(size), "0"(size), "1"(to), "2"(from)		\
+		: "memory");						\
+} while (0)
+
+#define __copy_user_zeroing(to,from,size)				\
+do {									\
+	int __d0, __d1, __d2;						\
+	__asm__ __volatile__(						\
+		"	cmp  $7,%0\n"					\
+		"	jbe  1f\n"					\
+		"	movl %1,%0\n"					\
+		"	negl %0\n"					\
+		"	andl $7,%0\n"					\
+		"	subl %0,%3\n"					\
+		"4:	rep; movsb\n"					\
+		"	movl %3,%0\n"					\
+		"	shrl $2,%0\n"					\
+		"	andl $3,%3\n"					\
+		"	.align 2,0x90\n"				\
+		"0:	rep; movsl\n"					\
+		"	movl %3,%0\n"					\
+		"1:	rep; movsb\n"					\
+		"2:\n"							\
+		".section .fixup,\"ax\"\n"				\
+		"5:	addl %3,%0\n"					\
+		"	jmp 6f\n"					\
+		"3:	lea 0(%3,%0,4),%0\n"				\
+		"6:	pushl %0\n"					\
+		"	pushl %%eax\n"					\
+		"	xorl %%eax,%%eax\n"				\
+		"	rep; stosb\n"					\
+		"	popl %%eax\n"					\
+		"	popl %0\n"					\
+		"	jmp 2b\n"					\
+		".previous\n"						\
+		".section __ex_table,\"a\"\n"				\
+		"	.align 4\n"					\
+		"	.long 4b,5b\n"					\
+		"	.long 0b,3b\n"					\
+		"	.long 1b,6b\n"					\
+		".previous"						\
+		: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2)	\
+		: "3"(size), "0"(size), "1"(to), "2"(from)		\
+		: "memory");						\
+} while (0)
+
+unsigned long __copy_to_user_ll(void __user *to, const void *from,
+				unsigned long n)
+{
+#ifndef CONFIG_X86_WP_WORKS_OK
+	if (unlikely(boot_cpu_data.wp_works_ok == 0) &&
+			((unsigned long )to) < TASK_SIZE) {
+		/*
+		 * When we are in an atomic section (see
+		 * mm/filemap.c:file_read_actor), return the full
+		 * length to take the slow path.
+		 */
+		if (in_atomic())
+			return n;
+
+		/* 
+		 * CPU does not honor the WP bit when writing
+		 * from supervisory mode, and due to preemption or SMP,
+		 * the page tables can change at any time.
+		 * Do it manually.	Manfred <manfred@colorfullife.com>
+		 */
+		while (n) {
+		      	unsigned long offset = ((unsigned long)to)%PAGE_SIZE;
+			unsigned long len = PAGE_SIZE - offset;
+			int retval;
+			struct page *pg;
+			void *maddr;
+			
+			if (len > n)
+				len = n;
+
+survive:
+			down_read(&current->mm->mmap_sem);
+			retval = get_user_pages(current, current->mm,
+					(unsigned long )to, 1, 1, 0, &pg, NULL);
+
+			if (retval == -ENOMEM && is_init(current)) {
+				up_read(&current->mm->mmap_sem);
+				congestion_wait(WRITE, HZ/50);
+				goto survive;
+			}
+
+			if (retval != 1) {
+				up_read(&current->mm->mmap_sem);
+		       		break;
+		       	}
+
+			maddr = kmap_atomic(pg, KM_USER0);
+			memcpy(maddr + offset, from, len);
+			kunmap_atomic(maddr, KM_USER0);
+			set_page_dirty_lock(pg);
+			put_page(pg);
+			up_read(&current->mm->mmap_sem);
+
+			from += len;
+			to += len;
+			n -= len;
+		}
+		return n;
+	}
+#endif
+	if (movsl_is_ok(to, from, n))
+		__copy_user(to, from, n);
+	else
+		n = __copy_user_intel(to, from, n);
+	return n;
+}
+EXPORT_SYMBOL(__copy_to_user_ll);
+
+unsigned long __copy_from_user_ll(void *to, const void __user *from,
+					unsigned long n)
+{
+	if (movsl_is_ok(to, from, n))
+		__copy_user_zeroing(to, from, n);
+	else
+		n = __copy_user_zeroing_intel(to, from, n);
+	return n;
+}
+EXPORT_SYMBOL(__copy_from_user_ll);
+
+unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from,
+					 unsigned long n)
+{
+	if (movsl_is_ok(to, from, n))
+		__copy_user(to, from, n);
+	else
+		n = __copy_user_intel((void __user *)to,
+				      (const void *)from, n);
+	return n;
+}
+EXPORT_SYMBOL(__copy_from_user_ll_nozero);
+
+unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
+					unsigned long n)
+{
+#ifdef CONFIG_X86_INTEL_USERCOPY
+	if ( n > 64 && cpu_has_xmm2)
+                n = __copy_user_zeroing_intel_nocache(to, from, n);
+	else
+		__copy_user_zeroing(to, from, n);
+#else
+        __copy_user_zeroing(to, from, n);
+#endif
+	return n;
+}
+
+unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
+					unsigned long n)
+{
+#ifdef CONFIG_X86_INTEL_USERCOPY
+	if ( n > 64 && cpu_has_xmm2)
+                n = __copy_user_intel_nocache(to, from, n);
+	else
+		__copy_user(to, from, n);
+#else
+        __copy_user(to, from, n);
+#endif
+	return n;
+}
+
+/**
+ * copy_to_user: - Copy a block of data into user space.
+ * @to:   Destination address, in user space.
+ * @from: Source address, in kernel space.
+ * @n:    Number of bytes to copy.
+ *
+ * Context: User context only.  This function may sleep.
+ *
+ * Copy data from kernel space to user space.
+ *
+ * Returns number of bytes that could not be copied.
+ * On success, this will be zero.
+ */
+unsigned long
+copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+	if (access_ok(VERIFY_WRITE, to, n))
+		n = __copy_to_user(to, from, n);
+	return n;
+}
+EXPORT_SYMBOL(copy_to_user);
+
+/**
+ * copy_from_user: - Copy a block of data from user space.
+ * @to:   Destination address, in kernel space.
+ * @from: Source address, in user space.
+ * @n:    Number of bytes to copy.
+ *
+ * Context: User context only.  This function may sleep.
+ *
+ * Copy data from user space to kernel space.
+ *
+ * Returns number of bytes that could not be copied.
+ * On success, this will be zero.
+ *
+ * If some data could not be copied, this function will pad the copied
+ * data to the requested size using zero bytes.
+ */
+unsigned long
+copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+	if (access_ok(VERIFY_READ, from, n))
+		n = __copy_from_user(to, from, n);
+	else
+		memset(to, 0, n);
+	return n;
+}
+EXPORT_SYMBOL(copy_from_user);
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
new file mode 100644
index 000000000000..893d43f838cc
--- /dev/null
+++ b/arch/x86/lib/usercopy_64.c
@@ -0,0 +1,166 @@
+/* 
+ * User address space access functions.
+ *
+ * Copyright 1997 Andi Kleen <ak@muc.de>
+ * Copyright 1997 Linus Torvalds
+ * Copyright 2002 Andi Kleen <ak@suse.de>
+ */
+#include <linux/module.h>
+#include <asm/uaccess.h>
+
+/*
+ * Copy a null terminated string from userspace.
+ */
+
+#define __do_strncpy_from_user(dst,src,count,res)			   \
+do {									   \
+	long __d0, __d1, __d2;						   \
+	might_sleep();							   \
+	__asm__ __volatile__(						   \
+		"	testq %1,%1\n"					   \
+		"	jz 2f\n"					   \
+		"0:	lodsb\n"					   \
+		"	stosb\n"					   \
+		"	testb %%al,%%al\n"				   \
+		"	jz 1f\n"					   \
+		"	decq %1\n"					   \
+		"	jnz 0b\n"					   \
+		"1:	subq %1,%0\n"					   \
+		"2:\n"							   \
+		".section .fixup,\"ax\"\n"				   \
+		"3:	movq %5,%0\n"					   \
+		"	jmp 2b\n"					   \
+		".previous\n"						   \
+		".section __ex_table,\"a\"\n"				   \
+		"	.align 8\n"					   \
+		"	.quad 0b,3b\n"					   \
+		".previous"						   \
+		: "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1),	   \
+		  "=&D" (__d2)						   \
+		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
+		: "memory");						   \
+} while (0)
+
+long
+__strncpy_from_user(char *dst, const char __user *src, long count)
+{
+	long res;
+	__do_strncpy_from_user(dst, src, count, res);
+	return res;
+}
+EXPORT_SYMBOL(__strncpy_from_user);
+
+long
+strncpy_from_user(char *dst, const char __user *src, long count)
+{
+	long res = -EFAULT;
+	if (access_ok(VERIFY_READ, src, 1))
+		return __strncpy_from_user(dst, src, count);
+	return res;
+}
+EXPORT_SYMBOL(strncpy_from_user);
+
+/*
+ * Zero Userspace
+ */
+
+unsigned long __clear_user(void __user *addr, unsigned long size)
+{
+	long __d0;
+	might_sleep();
+	/* no memory constraint because it doesn't change any memory gcc knows
+	   about */
+	asm volatile(
+		"	testq  %[size8],%[size8]\n"
+		"	jz     4f\n"
+		"0:	movq %[zero],(%[dst])\n"
+		"	addq   %[eight],%[dst]\n"
+		"	decl %%ecx ; jnz   0b\n"
+		"4:	movq  %[size1],%%rcx\n"
+		"	testl %%ecx,%%ecx\n"
+		"	jz     2f\n"
+		"1:	movb   %b[zero],(%[dst])\n"
+		"	incq   %[dst]\n"
+		"	decl %%ecx ; jnz  1b\n"
+		"2:\n"
+		".section .fixup,\"ax\"\n"
+		"3:	lea 0(%[size1],%[size8],8),%[size8]\n"
+		"	jmp 2b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"       .align 8\n"
+		"	.quad 0b,3b\n"
+		"	.quad 1b,2b\n"
+		".previous"
+		: [size8] "=c"(size), [dst] "=&D" (__d0)
+		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
+		  [zero] "r" (0UL), [eight] "r" (8UL));
+	return size;
+}
+EXPORT_SYMBOL(__clear_user);
+
+unsigned long clear_user(void __user *to, unsigned long n)
+{
+	if (access_ok(VERIFY_WRITE, to, n))
+		return __clear_user(to, n);
+	return n;
+}
+EXPORT_SYMBOL(clear_user);
+
+/*
+ * Return the size of a string (including the ending 0)
+ *
+ * Return 0 on exception, a value greater than N if too long
+ */
+
+long __strnlen_user(const char __user *s, long n)
+{
+	long res = 0;
+	char c;
+
+	while (1) {
+		if (res>n)
+			return n+1;
+		if (__get_user(c, s))
+			return 0;
+		if (!c)
+			return res+1;
+		res++;
+		s++;
+	}
+}
+EXPORT_SYMBOL(__strnlen_user);
+
+long strnlen_user(const char __user *s, long n)
+{
+	if (!access_ok(VERIFY_READ, s, n))
+		return 0;
+	return __strnlen_user(s, n);
+}
+EXPORT_SYMBOL(strnlen_user);
+
+long strlen_user(const char __user *s)
+{
+	long res = 0;
+	char c;
+
+	for (;;) {
+		if (get_user(c, s))
+			return 0;
+		if (!c)
+			return res+1;
+		res++;
+		s++;
+	}
+}
+EXPORT_SYMBOL(strlen_user);
+
+unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
+{
+	if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { 
+		return copy_user_generic((__force void *)to, (__force void *)from, len);
+	} 
+	return len;		
+}
+EXPORT_SYMBOL(copy_in_user);
+