From 775b64d2b6ca37697de925f70799c710aab5849a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 12 Jan 2008 20:40:46 +0100 Subject: PM: Acquire device locks on suspend This patch reorganizes the way suspend and resume notifications are sent to drivers. The major changes are that now the PM core acquires every device semaphore before calling the methods, and calls to device_add() during suspends will fail, while calls to device_del() during suspends will block. It also provides a way to safely remove a suspended device with the help of the PM core, by using the device_pm_schedule_removal() callback introduced specifically for this purpose, and updates two drivers (msr and cpuid) that need to use it. Signed-off-by: Alan Stern Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpuid.c | 6 +++--- arch/x86/kernel/msr.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 05c9936a16cc..d387c770c518 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -157,15 +157,15 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: err = cpuid_device_create(cpu); break; case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: - case CPU_DEAD_FROZEN: cpuid_device_destroy(cpu); break; + case CPU_UP_CANCELED_FROZEN: + destroy_suspended_device(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + break; } return err ? NOTIFY_BAD : NOTIFY_OK; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index ee6eba4ecfea..21f6e3c0be18 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -155,15 +155,15 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: err = msr_device_create(cpu); break; case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: - case CPU_DEAD_FROZEN: msr_device_destroy(cpu); break; + case CPU_UP_CANCELED_FROZEN: + destroy_suspended_device(msr_class, MKDEV(MSR_MAJOR, cpu)); + break; } return err ? NOTIFY_BAD : NOTIFY_OK; } -- cgit v1.2.3 From 5b3f355d8fef95901505e924818b3031092453c2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 17 Dec 2007 15:54:39 -0400 Subject: Kobject: change arch/x86/kernel/cpu/intel_cacheinfo.c to use kobject_init_and_add Stop using kobject_register, as this way we can control the sending of the uevent properly, after everything is properly initialized. 
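For reference, the conversion pattern applied throughout this series looks roughly like the sketch below (illustrative only; "example_ktype" and "parent" stand in for the real objects). A single kobject_register() call, which used to fire the uevent internally, is split into kobject_init_and_add(), which emits no uevent, followed by an explicit KOBJ_ADD once the attributes and child objects are in place:

	retval = kobject_init_and_add(kobj, &example_ktype, parent, "%s", "cache");
	if (retval)
		goto fail;
	/* ... populate attributes and child objects ... */
	kobject_uevent(kobj, KOBJ_ADD);
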
Cc: Ashok Raj Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/intel_cacheinfo.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 9f530ff43c21..3509542eed87 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -733,10 +733,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) if (unlikely(retval < 0)) return retval; - cache_kobject[cpu]->parent = &sys_dev->kobj; - kobject_set_name(cache_kobject[cpu], "%s", "cache"); - cache_kobject[cpu]->ktype = &ktype_percpu_entry; - retval = kobject_register(cache_kobject[cpu]); + retval = kobject_init_and_add(cache_kobject[cpu], &ktype_percpu_entry, + &sys_dev->kobj, "%s", "cache"); if (retval < 0) { cpuid4_cache_sysfs_exit(cpu); return retval; @@ -746,10 +744,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_object = INDEX_KOBJECT_PTR(cpu,i); this_object->cpu = cpu; this_object->index = i; - this_object->kobj.parent = cache_kobject[cpu]; - kobject_set_name(&(this_object->kobj), "index%1lu", i); - this_object->kobj.ktype = &ktype_cache; - retval = kobject_register(&(this_object->kobj)); + retval = kobject_init_and_add(&(this_object->kobj), + &ktype_cache, cache_kobject[cpu], + "index%1lu", i); if (unlikely(retval)) { for (j = 0; j < i; j++) { kobject_unregister( @@ -759,10 +756,12 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) cpuid4_cache_sysfs_exit(cpu); break; } + kobject_uevent(&(this_object->kobj), KOBJ_ADD); } if (!retval) cpu_set(cpu, cache_dev_map); + kobject_uevent(cache_kobject[cpu], KOBJ_ADD); return retval; } -- cgit v1.2.3 From a521cf209c6e7042f85b2c5b16da3ffa8004fb7b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 19 Dec 2007 09:23:20 -0800 Subject: Kobject: change arch/x86/kernel/cpu/mcheck/mce_amd_64.c to use kobject_create_and_add Make this kobject dynamic and convert it to not use kobject_register, which is going away. 
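The shape of this conversion, sketched with invented names rather than copied from the patch: the kobject embedded in struct threshold_bank becomes a pointer, and kobject_create_and_add() allocates, initializes, registers and announces the object in one call; teardown is then just a matter of dropping the reference, as the kobject_put() conversion later in this series does:

	struct kobject *kobj;

	kobj = kobject_create_and_add("example", parent); /* sends KOBJ_ADD itself */
	if (!kobj)
		return -ENOMEM;
	/* ... */
	kobject_put(kobj); /* final put frees the dynamic name and the kobject */
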
Cc: Jacob Shin Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 752fb16a817d..2d65311d206e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -65,7 +65,7 @@ static struct threshold_block threshold_defaults = { }; struct threshold_bank { - struct kobject kobj; + struct kobject *kobj; struct threshold_block *blocks; cpumask_t cpus; }; @@ -433,7 +433,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, per_cpu(threshold_banks, cpu)[bank]->blocks = b; kobject_set_name(&b->kobj, "misc%i", block); - b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj; + b->kobj.parent = per_cpu(threshold_banks, cpu)[bank]->kobj; b->kobj.ktype = &threshold_ktype; err = kobject_register(&b->kobj); if (err) @@ -489,7 +489,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, - &b->kobj, name); + b->kobj, name); if (err) goto out; @@ -505,16 +505,15 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - kobject_set_name(&b->kobj, "threshold_bank%i", bank); - b->kobj.parent = &per_cpu(device_mce, cpu).kobj; + b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); + if (!b->kobj) + goto out_free; + #ifndef CONFIG_SMP b->cpus = CPU_MASK_ALL; #else b->cpus = per_cpu(cpu_core_map, cpu); #endif - err = kobject_register(&b->kobj); - if (err) - goto out_free; per_cpu(threshold_banks, cpu)[bank] = b; @@ -531,7 +530,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) continue; err = sysfs_create_link(&per_cpu(device_mce, i).kobj, - &b->kobj, name); + b->kobj, name); if (err) goto out; @@ -627,7 +626,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) deallocate_threshold_block(cpu, bank); free_out: - kobject_unregister(&b->kobj); + kobject_unregister(b->kobj); kfree(b); per_cpu(threshold_banks, cpu)[bank] = NULL; } -- cgit v1.2.3 From 542eb75a27616bdde95c8d3764e0ab703579f8b5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 19 Dec 2007 09:23:20 -0800 Subject: Kobject: change arch/x86/kernel/cpu/mcheck/mce_amd_64.c to use kobject_init_and_add Stop using kobject_register, as this way we can control the sending of the uevent properly, after everything is properly initialized. 
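A related detail for kobjects embedded in larger structures, shown as a sketch of the general kobject rules rather than of this exact hunk: once kobject_init_and_add() has run, even the failure path must be unwound with kobject_put(), so that the dynamically allocated name is freed and the ktype's release() runs, and the KOBJ_ADD uevent is sent only after the whole sub-hierarchy exists:

	err = kobject_init_and_add(&b->kobj, &threshold_ktype, parent,
				   "misc%i", block);
	if (err) {
		kobject_put(&b->kobj); /* undoes the init, frees the name */
		return err;
	}
	/* ... allocate and link any sub-blocks ... */
	kobject_uevent(&b->kobj, KOBJ_ADD);
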
Cc: Jacob Shin Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 2d65311d206e..ef15f35b10ed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -432,10 +432,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, else per_cpu(threshold_banks, cpu)[bank]->blocks = b; - kobject_set_name(&b->kobj, "misc%i", block); - b->kobj.parent = per_cpu(threshold_banks, cpu)[bank]->kobj; - b->kobj.ktype = &threshold_ktype; - err = kobject_register(&b->kobj); + err = kobject_init_and_add(&b->kobj, &threshold_ktype, + per_cpu(threshold_banks, cpu)[bank]->kobj, + "misc%i", block); if (err) goto out_free; recurse: @@ -451,6 +450,8 @@ recurse: if (err) goto out_free; + kobject_uevent(&b->kobj, KOBJ_ADD); + return err; out_free: -- cgit v1.2.3 From 38a382ae5dd4f4d04e3046816b0a41836094e538 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 20 Dec 2007 08:13:05 -0800 Subject: Kobject: convert arch/* from kobject_unregister() to kobject_put() There is no need for kobject_unregister() anymore, thanks to Kay's kobject cleanup changes, so replace all instances of it with kobject_put(). Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/ia64/kernel/topology.c | 9 ++++----- arch/s390/hypfs/inode.c | 4 ++-- arch/sh/kernel/cpu/sh4/sq.c | 2 +- arch/x86/kernel/cpu/intel_cacheinfo.c | 9 ++++----- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 6 +++--- 5 files changed, 14 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index c4311e3adf55..a2484fc1a06c 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -366,10 +366,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) "index%1lu", i); if (unlikely(retval)) { for (j = 0; j < i; j++) { - kobject_unregister( - &(LEAF_KOBJECT_PTR(cpu,j)->kobj)); + kobject_put(&(LEAF_KOBJECT_PTR(cpu,j)->kobj)); } - kobject_unregister(&all_cpu_cache_info[cpu].kobj); + kobject_put(&all_cpu_cache_info[cpu].kobj); cpu_cache_sysfs_exit(cpu); break; } @@ -386,10 +385,10 @@ static int __cpuinit cache_remove_dev(struct sys_device * sys_dev) unsigned long i; for (i = 0; i < all_cpu_cache_info[cpu].num_cache_leaves; i++) - kobject_unregister(&(LEAF_KOBJECT_PTR(cpu,i)->kobj)); + kobject_put(&(LEAF_KOBJECT_PTR(cpu,i)->kobj)); if (all_cpu_cache_info[cpu].kobj.parent) { - kobject_unregister(&all_cpu_cache_info[cpu].kobj); + kobject_put(&all_cpu_cache_info[cpu].kobj); memset(&all_cpu_cache_info[cpu].kobj, 0, sizeof(struct kobject)); diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 631a6109f642..4b010ff814c9 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -517,7 +517,7 @@ static int __init hypfs_init(void) return 0; fail_filesystem: - kobject_unregister(s390_kobj); + kobject_put(s390_kobj); fail_sysfs: if (!MACHINE_IS_VM) hypfs_diag_exit(); @@ -531,7 +531,7 @@ static void __exit hypfs_exit(void) if (!MACHINE_IS_VM) hypfs_diag_exit(); unregister_filesystem(&hypfs_type); - kobject_unregister(s390_kobj); + kobject_put(s390_kobj); } module_init(hypfs_init) diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c index 97fd9b9a4820..3008c00eea6b 100644 --- a/arch/sh/kernel/cpu/sh4/sq.c +++ b/arch/sh/kernel/cpu/sh4/sq.c @@ -360,7 
+360,7 @@ static int __devexit sq_sysdev_remove(struct sys_device *sysdev) unsigned int cpu = sysdev->id; struct kobject *kobj = sq_kobject[cpu]; - kobject_unregister(kobj); + kobject_put(kobj); return 0; } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 3509542eed87..8b4507b8469b 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -749,10 +749,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) "index%1lu", i); if (unlikely(retval)) { for (j = 0; j < i; j++) { - kobject_unregister( - &(INDEX_KOBJECT_PTR(cpu,j)->kobj)); + kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj)); } - kobject_unregister(cache_kobject[cpu]); + kobject_put(cache_kobject[cpu]); cpuid4_cache_sysfs_exit(cpu); break; } @@ -777,8 +776,8 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) cpu_clear(cpu, cache_dev_map); for (i = 0; i < num_cache_leaves; i++) - kobject_unregister(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); - kobject_unregister(cache_kobject[cpu]); + kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); + kobject_put(cache_kobject[cpu]); cpuid4_cache_sysfs_exit(cpu); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index ef15f35b10ed..753588755fee 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -456,7 +456,7 @@ recurse: out_free: if (b) { - kobject_unregister(&b->kobj); + kobject_put(&b->kobj); kfree(b); } return err; @@ -581,7 +581,7 @@ static void deallocate_threshold_block(unsigned int cpu, return; list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { - kobject_unregister(&pos->kobj); + kobject_put(&pos->kobj); list_del(&pos->miscj); kfree(pos); } @@ -627,7 +627,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) deallocate_threshold_block(cpu, bank); free_out: - kobject_unregister(b->kobj); + kobject_put(b->kobj); kfree(b); per_cpu(threshold_banks, cpu)[bank] = NULL; } -- cgit v1.2.3 From af5ca3f4ec5cc4432a42a73b050dd8898ce8fd00 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 20 Dec 2007 02:09:39 +0100 Subject: Driver core: change sysdev classes to use dynamic kobject names All kobjects require a dynamically allocated name now. We no longer need to keep track if the name is statically assigned, we can just unconditionally free() all kobject names on cleanup. 
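The resulting pattern, taking one of the hunks below as a template: the set_kset_name() initializer disappears in favour of a plain .name string, and sysdev_class_register() turns that string into a dynamically allocated kobject name via kobject_set_name():

	/* before: static kobject name embedded in the kset */
	static struct sysdev_class timer_sysclass = {
		set_kset_name("timer"),
		.suspend = timer_suspend,
		.resume  = timer_resume,
	};

	/* after: plain string, made into a dynamic name at register time */
	static struct sysdev_class timer_sysclass = {
		.name    = "timer",
		.suspend = timer_suspend,
		.resume  = timer_resume,
	};
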
Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/arm/kernel/time.c | 4 ++-- arch/arm/mach-integrator/integrator_ap.c | 2 +- arch/arm/mach-pxa/cm-x270.c | 2 +- arch/arm/mach-pxa/lpd270.c | 2 +- arch/arm/mach-pxa/lubbock.c | 2 +- arch/arm/mach-pxa/mainstone.c | 2 +- arch/arm/mach-s3c2410/s3c2410.c | 2 +- arch/arm/mach-s3c2412/s3c2412.c | 2 +- arch/arm/mach-s3c2440/mach-osiris.c | 2 +- arch/arm/mach-s3c2443/s3c2443.c | 2 +- arch/arm/mach-sa1100/irq.c | 2 +- arch/arm/oprofile/common.c | 2 +- arch/arm/plat-omap/gpio.c | 2 +- arch/arm/plat-s3c24xx/dma.c | 2 +- arch/arm/plat-s3c24xx/s3c244x.c | 4 ++-- arch/avr32/kernel/time.c | 2 +- arch/mips/kernel/i8259.c | 2 +- arch/powerpc/platforms/cell/spu_base.c | 2 +- arch/powerpc/platforms/powermac/pic.c | 2 +- arch/powerpc/sysdev/ipic.c | 2 +- arch/powerpc/sysdev/mpic.c | 2 +- arch/powerpc/sysdev/qe_lib/qe_ic.c | 2 +- arch/ppc/syslib/ipic.c | 2 +- arch/ppc/syslib/open_pic.c | 2 +- arch/ppc/syslib/open_pic2.c | 2 +- arch/s390/kernel/time.c | 2 +- arch/sh/drivers/dma/dma-sysfs.c | 2 +- arch/sh/kernel/time.c | 2 +- arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/apic_64.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_64.c | 2 +- arch/x86/kernel/i8237.c | 2 +- arch/x86/kernel/i8259_32.c | 2 +- arch/x86/kernel/i8259_64.c | 2 +- arch/x86/kernel/io_apic_32.c | 2 +- arch/x86/kernel/io_apic_64.c | 2 +- arch/x86/kernel/nmi_32.c | 2 +- arch/x86/kernel/nmi_64.c | 2 +- arch/x86/oprofile/nmi_int.c | 2 +- drivers/acpi/pci_link.c | 2 +- drivers/base/class.c | 2 +- drivers/base/cpu.c | 2 +- drivers/base/memory.c | 2 +- drivers/base/node.c | 2 +- drivers/base/sys.c | 1 + drivers/edac/edac_module.c | 2 +- drivers/kvm/kvm_main.c | 2 +- drivers/macintosh/via-pmu.c | 2 +- drivers/scsi/libsas/sas_scsi_host.c | 2 +- include/linux/kobject.h | 13 ++----------- include/linux/sysdev.h | 1 + kernel/rtmutex-tester.c | 2 +- kernel/time/clocksource.c | 2 +- kernel/time/timekeeping.c | 2 +- lib/kobject.c | 14 +++++--------- 55 files changed, 62 insertions(+), 73 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 1533d3ecd7a0..f6f3689a86ee 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -195,7 +195,7 @@ static int leds_shutdown(struct sys_device *dev) } static struct sysdev_class leds_sysclass = { - set_kset_name("leds"), + .name = "leds", .shutdown = leds_shutdown, .suspend = leds_suspend, .resume = leds_resume, @@ -369,7 +369,7 @@ static int timer_resume(struct sys_device *dev) #endif static struct sysdev_class timer_sysclass = { - set_kset_name("timer"), + .name = "timer", .suspend = timer_suspend, .resume = timer_resume, }; diff --git a/arch/arm/mach-integrator/integrator_ap.c b/arch/arm/mach-integrator/integrator_ap.c index 72280754354d..df37e93c6fc9 100644 --- a/arch/arm/mach-integrator/integrator_ap.c +++ b/arch/arm/mach-integrator/integrator_ap.c @@ -214,7 +214,7 @@ static int irq_resume(struct sys_device *dev) #endif static struct sysdev_class irq_class = { - set_kset_name("irq"), + .name = "irq", .suspend = irq_suspend, .resume = irq_resume, }; diff --git a/arch/arm/mach-pxa/cm-x270.c b/arch/arm/mach-pxa/cm-x270.c index 177664ccb2e2..a16349272f54 100644 --- a/arch/arm/mach-pxa/cm-x270.c +++ b/arch/arm/mach-pxa/cm-x270.c @@ -566,7 +566,7 @@ static int cmx270_resume(struct sys_device *dev) } static struct sysdev_class cmx270_pm_sysclass = { - set_kset_name("pm"), + .name = "pm", .resume = cmx270_resume, .suspend = cmx270_suspend, }; diff --git a/arch/arm/mach-pxa/lpd270.c 
b/arch/arm/mach-pxa/lpd270.c index 26116440a7c9..78ebad063cba 100644 --- a/arch/arm/mach-pxa/lpd270.c +++ b/arch/arm/mach-pxa/lpd270.c @@ -122,7 +122,7 @@ static int lpd270_irq_resume(struct sys_device *dev) } static struct sysdev_class lpd270_irq_sysclass = { - set_kset_name("cpld_irq"), + .name = "cpld_irq", .resume = lpd270_irq_resume, }; diff --git a/arch/arm/mach-pxa/lubbock.c b/arch/arm/mach-pxa/lubbock.c index 011a1a72b61c..1d3112dc629e 100644 --- a/arch/arm/mach-pxa/lubbock.c +++ b/arch/arm/mach-pxa/lubbock.c @@ -126,7 +126,7 @@ static int lubbock_irq_resume(struct sys_device *dev) } static struct sysdev_class lubbock_irq_sysclass = { - set_kset_name("cpld_irq"), + .name = "cpld_irq", .resume = lubbock_irq_resume, }; diff --git a/arch/arm/mach-pxa/mainstone.c b/arch/arm/mach-pxa/mainstone.c index a4bc3483cbb3..41d8c6cea62b 100644 --- a/arch/arm/mach-pxa/mainstone.c +++ b/arch/arm/mach-pxa/mainstone.c @@ -120,7 +120,7 @@ static int mainstone_irq_resume(struct sys_device *dev) } static struct sysdev_class mainstone_irq_sysclass = { - set_kset_name("cpld_irq"), + .name = "cpld_irq", .resume = mainstone_irq_resume, }; diff --git a/arch/arm/mach-s3c2410/s3c2410.c b/arch/arm/mach-s3c2410/s3c2410.c index e580303cb0ab..0e7991940f81 100644 --- a/arch/arm/mach-s3c2410/s3c2410.c +++ b/arch/arm/mach-s3c2410/s3c2410.c @@ -100,7 +100,7 @@ void __init s3c2410_init_clocks(int xtal) } struct sysdev_class s3c2410_sysclass = { - set_kset_name("s3c2410-core"), + .name = "s3c2410-core", }; static struct sys_device s3c2410_sysdev = { diff --git a/arch/arm/mach-s3c2412/s3c2412.c b/arch/arm/mach-s3c2412/s3c2412.c index 4f92a1562d77..265cd3f567a3 100644 --- a/arch/arm/mach-s3c2412/s3c2412.c +++ b/arch/arm/mach-s3c2412/s3c2412.c @@ -196,7 +196,7 @@ void __init s3c2412_init_clocks(int xtal) */ struct sysdev_class s3c2412_sysclass = { - set_kset_name("s3c2412-core"), + .name = "s3c2412-core", }; static int __init s3c2412_core_init(void) diff --git a/arch/arm/mach-s3c2440/mach-osiris.c b/arch/arm/mach-s3c2440/mach-osiris.c index c326983f4a8f..78af7664988b 100644 --- a/arch/arm/mach-s3c2440/mach-osiris.c +++ b/arch/arm/mach-s3c2440/mach-osiris.c @@ -312,7 +312,7 @@ static int osiris_pm_resume(struct sys_device *sd) #endif static struct sysdev_class osiris_pm_sysclass = { - set_kset_name("mach-osiris"), + .name = "mach-osiris", .suspend = osiris_pm_suspend, .resume = osiris_pm_resume, }; diff --git a/arch/arm/mach-s3c2443/s3c2443.c b/arch/arm/mach-s3c2443/s3c2443.c index 8d8117158d23..9ce490560af9 100644 --- a/arch/arm/mach-s3c2443/s3c2443.c +++ b/arch/arm/mach-s3c2443/s3c2443.c @@ -43,7 +43,7 @@ static struct map_desc s3c2443_iodesc[] __initdata = { }; struct sysdev_class s3c2443_sysclass = { - set_kset_name("s3c2443-core"), + .name = "s3c2443-core", }; static struct sys_device s3c2443_sysdev = { diff --git a/arch/arm/mach-sa1100/irq.c b/arch/arm/mach-sa1100/irq.c index edf3347d9c5b..3dc17d7bf38e 100644 --- a/arch/arm/mach-sa1100/irq.c +++ b/arch/arm/mach-sa1100/irq.c @@ -283,7 +283,7 @@ static int sa1100irq_resume(struct sys_device *dev) } static struct sysdev_class sa1100irq_sysclass = { - set_kset_name("sa11x0-irq"), + .name = "sa11x0-irq", .suspend = sa1100irq_suspend, .resume = sa1100irq_resume, }; diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c index a9de727c9327..0a5cf3a6438b 100644 --- a/arch/arm/oprofile/common.c +++ b/arch/arm/oprofile/common.c @@ -96,7 +96,7 @@ static int op_arm_resume(struct sys_device *dev) } static struct sysdev_class oprofile_sysclass = { - 
set_kset_name("oprofile"), + .name = "oprofile", .resume = op_arm_resume, .suspend = op_arm_suspend, }; diff --git a/arch/arm/plat-omap/gpio.c b/arch/arm/plat-omap/gpio.c index 6097753394ad..b2a87b8ef673 100644 --- a/arch/arm/plat-omap/gpio.c +++ b/arch/arm/plat-omap/gpio.c @@ -1455,7 +1455,7 @@ static int omap_gpio_resume(struct sys_device *dev) } static struct sysdev_class omap_gpio_sysclass = { - set_kset_name("gpio"), + .name = "gpio", .suspend = omap_gpio_suspend, .resume = omap_gpio_resume, }; diff --git a/arch/arm/plat-s3c24xx/dma.c b/arch/arm/plat-s3c24xx/dma.c index 29696e46ed65..aae1b9cbaf44 100644 --- a/arch/arm/plat-s3c24xx/dma.c +++ b/arch/arm/plat-s3c24xx/dma.c @@ -1265,7 +1265,7 @@ static int s3c2410_dma_resume(struct sys_device *dev) #endif /* CONFIG_PM */ struct sysdev_class dma_sysclass = { - set_kset_name("s3c24xx-dma"), + .name = "s3c24xx-dma", .suspend = s3c2410_dma_suspend, .resume = s3c2410_dma_resume, }; diff --git a/arch/arm/plat-s3c24xx/s3c244x.c b/arch/arm/plat-s3c24xx/s3c244x.c index 3444b13afac5..f197bb3a2366 100644 --- a/arch/arm/plat-s3c24xx/s3c244x.c +++ b/arch/arm/plat-s3c24xx/s3c244x.c @@ -151,13 +151,13 @@ static int s3c244x_resume(struct sys_device *dev) /* Since the S3C2442 and S3C2440 share items, put both sysclasses here */ struct sysdev_class s3c2440_sysclass = { - set_kset_name("s3c2440-core"), + .name = "s3c2440-core", .suspend = s3c244x_suspend, .resume = s3c244x_resume }; struct sysdev_class s3c2442_sysclass = { - set_kset_name("s3c2442-core"), + .name = "s3c2442-core", .suspend = s3c244x_suspend, .resume = s3c244x_resume }; diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c index 7014a3571ec0..36a46c3ae308 100644 --- a/arch/avr32/kernel/time.c +++ b/arch/avr32/kernel/time.c @@ -214,7 +214,7 @@ void __init time_init(void) } static struct sysdev_class timer_class = { - set_kset_name("timer"), + .name = "timer", }; static struct sys_device timer_device = { diff --git a/arch/mips/kernel/i8259.c b/arch/mips/kernel/i8259.c index 471013577108..197d7977de35 100644 --- a/arch/mips/kernel/i8259.c +++ b/arch/mips/kernel/i8259.c @@ -238,7 +238,7 @@ static int i8259A_shutdown(struct sys_device *dev) } static struct sysdev_class i8259_sysdev_class = { - set_kset_name("i8259"), + .name = "i8259", .resume = i8259A_resume, .shutdown = i8259A_shutdown, }; diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index c83c3e3f5178..a08862203643 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -459,7 +459,7 @@ static int spu_shutdown(struct sys_device *sysdev) } static struct sysdev_class spu_sysdev_class = { - set_kset_name("spu"), + .name = "spu", .shutdown = spu_shutdown, }; diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index 999f5e160897..84c0d4ef76a2 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -663,7 +663,7 @@ static int pmacpic_resume(struct sys_device *sysdev) #endif /* CONFIG_PM && CONFIG_PPC32 */ static struct sysdev_class pmacpic_sysclass = { - set_kset_name("pmac_pic"), + .name = "pmac_pic", }; static struct sys_device device_pmacpic = { diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 05a56e55804c..e898ff4d2b97 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -725,7 +725,7 @@ unsigned int ipic_get_irq(void) } static struct sysdev_class ipic_sysclass = { - set_kset_name("ipic"), + .name = "ipic", }; static 
struct sys_device device_ipic = { diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index e47938899a92..212a94f5d34b 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -1584,7 +1584,7 @@ static struct sysdev_class mpic_sysclass = { .resume = mpic_resume, .suspend = mpic_suspend, #endif - set_kset_name("mpic"), + .name = "mpic", }; static int mpic_init_sys(void) diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.c b/arch/powerpc/sysdev/qe_lib/qe_ic.c index e1c0fd6dbc1a..f59444d3be75 100644 --- a/arch/powerpc/sysdev/qe_lib/qe_ic.c +++ b/arch/powerpc/sysdev/qe_lib/qe_ic.c @@ -483,7 +483,7 @@ int qe_ic_set_high_priority(unsigned int virq, unsigned int priority, int high) } static struct sysdev_class qe_ic_sysclass = { - set_kset_name("qe_ic"), + .name = "qe_ic", }; static struct sys_device device_qe_ic = { diff --git a/arch/ppc/syslib/ipic.c b/arch/ppc/syslib/ipic.c index 9192777d0f78..4f163e20939e 100644 --- a/arch/ppc/syslib/ipic.c +++ b/arch/ppc/syslib/ipic.c @@ -614,7 +614,7 @@ int ipic_get_irq(void) } static struct sysdev_class ipic_sysclass = { - set_kset_name("ipic"), + .name = "ipic", }; static struct sys_device device_ipic = { diff --git a/arch/ppc/syslib/open_pic.c b/arch/ppc/syslib/open_pic.c index 18ec94733293..da36522d327a 100644 --- a/arch/ppc/syslib/open_pic.c +++ b/arch/ppc/syslib/open_pic.c @@ -1043,7 +1043,7 @@ int openpic_resume(struct sys_device *sysdev) #endif /* CONFIG_PM */ static struct sysdev_class openpic_sysclass = { - set_kset_name("openpic"), + .name = "openpic", }; static struct sys_device device_openpic = { diff --git a/arch/ppc/syslib/open_pic2.c b/arch/ppc/syslib/open_pic2.c index d585207f9f77..449075a04798 100644 --- a/arch/ppc/syslib/open_pic2.c +++ b/arch/ppc/syslib/open_pic2.c @@ -666,7 +666,7 @@ int openpic2_resume(struct sys_device *sysdev) /* HACK ALERT */ static struct sysdev_class openpic2_sysclass = { - set_kset_name("openpic2"), + .name = "openpic2", }; static struct sys_device device_openpic2 = { diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 22b800ce2126..3bbac1293be4 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -1145,7 +1145,7 @@ static void etr_work_fn(struct work_struct *work) * Sysfs interface functions */ static struct sysdev_class etr_sysclass = { - set_kset_name("etr") + .name = "etr", }; static struct sys_device etr_port0_dev = { diff --git a/arch/sh/drivers/dma/dma-sysfs.c b/arch/sh/drivers/dma/dma-sysfs.c index eebcd4768bbf..51b57c0d1a3c 100644 --- a/arch/sh/drivers/dma/dma-sysfs.c +++ b/arch/sh/drivers/dma/dma-sysfs.c @@ -19,7 +19,7 @@ #include static struct sysdev_class dma_sysclass = { - set_kset_name("dma"), + .name = "dma", }; EXPORT_SYMBOL(dma_sysclass); diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index a3a67d151e52..2bc04bfee738 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -174,7 +174,7 @@ int timer_resume(struct sys_device *dev) #endif static struct sysdev_class timer_sysclass = { - set_kset_name("timer"), + .name = "timer", .suspend = timer_suspend, .resume = timer_resume, }; diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index edb5108e5d0e..a56c782653be 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1530,7 +1530,7 @@ static int lapic_resume(struct sys_device *dev) */ static struct sysdev_class lapic_sysclass = { - set_kset_name("lapic"), + .name = "lapic", .resume = lapic_resume, .suspend = lapic_suspend, }; diff --git a/arch/x86/kernel/apic_64.c 
b/arch/x86/kernel/apic_64.c index f28ccb588fba..fa6cdee6d303 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -639,7 +639,7 @@ static int lapic_resume(struct sys_device *dev) } static struct sysdev_class lapic_sysclass = { - set_kset_name("lapic"), + .name = "lapic", .resume = lapic_resume, .suspend = lapic_suspend, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 4b21d29fb5aa..242e8668dbeb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -745,7 +745,7 @@ static void mce_restart(void) static struct sysdev_class mce_sysclass = { .resume = mce_resume, - set_kset_name("machinecheck"), + .name = "machinecheck", }; DEFINE_PER_CPU(struct sys_device, device_mce); diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c index 29313832df0c..dbd6c1d1b638 100644 --- a/arch/x86/kernel/i8237.c +++ b/arch/x86/kernel/i8237.c @@ -51,7 +51,7 @@ static int i8237A_suspend(struct sys_device *dev, pm_message_t state) } static struct sysdev_class i8237_sysdev_class = { - set_kset_name("i8237"), + .name = "i8237", .suspend = i8237A_suspend, .resume = i8237A_resume, }; diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index f634fc715c99..5f3496d01984 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -258,7 +258,7 @@ static int i8259A_shutdown(struct sys_device *dev) } static struct sysdev_class i8259_sysdev_class = { - set_kset_name("i8259"), + .name = "i8259", .suspend = i8259A_suspend, .resume = i8259A_resume, .shutdown = i8259A_shutdown, diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index 3f27ea0b9816..ba6d57286f56 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -370,7 +370,7 @@ static int i8259A_shutdown(struct sys_device *dev) } static struct sysdev_class i8259_sysdev_class = { - set_kset_name("i8259"), + .name = "i8259", .suspend = i8259A_suspend, .resume = i8259A_resume, .shutdown = i8259A_shutdown, diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a6b1490e00c4..ab77f1905469 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2401,7 +2401,7 @@ static int ioapic_resume(struct sys_device *dev) } static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), + .name = "ioapic", .suspend = ioapic_suspend, .resume = ioapic_resume, }; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index cbac1670c7c3..23a3ac06a23e 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1850,7 +1850,7 @@ static int ioapic_resume(struct sys_device *dev) } static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), + .name = "ioapic", .suspend = ioapic_suspend, .resume = ioapic_resume, }; diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 852db2906921..4f4bfd3a88b6 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -176,7 +176,7 @@ static int lapic_nmi_resume(struct sys_device *dev) static struct sysdev_class nmi_sysclass = { - set_kset_name("lapic_nmi"), + .name = "lapic_nmi", .resume = lapic_nmi_resume, .suspend = lapic_nmi_suspend, }; diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 4253c4e8849c..c3d1476b6a11 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -211,7 +211,7 @@ static int lapic_nmi_resume(struct sys_device *dev) } static struct sysdev_class nmi_sysclass = { - 
set_kset_name("lapic_nmi"), + .name = "lapic_nmi", .resume = lapic_nmi_resume, .suspend = lapic_nmi_suspend, }; diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 944bbcdd2b8d..c8ab79ef4276 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -51,7 +51,7 @@ static int nmi_resume(struct sys_device *dev) static struct sysdev_class oprofile_sysclass = { - set_kset_name("oprofile"), + .name = "oprofile", .resume = nmi_resume, .suspend = nmi_suspend, }; diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index c9f526e55392..5400ea173f6f 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -911,7 +911,7 @@ __setup("acpi_irq_balance", acpi_irq_balance_set); /* FIXME: we will remove this interface after all drivers call pci_disable_device */ static struct sysdev_class irqrouter_sysdev_class = { - set_kset_name("irqrouter"), + .name = "irqrouter", .resume = irqrouter_resume, }; diff --git a/drivers/base/class.c b/drivers/base/class.c index 61fd26cc9f0e..b962a76875d2 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -466,7 +466,6 @@ static struct kset_uevent_ops class_uevent_ops = { * entirely soon. */ static struct kset class_obj_subsys = { - .kobj = { .k_name = "class_obj", }, .uevent_ops = &class_uevent_ops, }; @@ -872,6 +871,7 @@ int __init classes_init(void) /* ick, this is ugly, the things we go through to keep from showing up * in sysfs... */ kset_init(&class_obj_subsys); + kobject_set_name(&class_obj_subsys.kobj, "class_obj"); if (!class_obj_subsys.kobj.parent) class_obj_subsys.kobj.parent = &class_obj_subsys.kobj; return 0; diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 40545071e3c9..c5885f5ce0ac 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -14,7 +14,7 @@ #include "base.h" struct sysdev_class cpu_sysdev_class = { - set_kset_name("cpu"), + .name = "cpu", }; EXPORT_SYMBOL(cpu_sysdev_class); diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 7868707c7eda..7ae413fdd5fc 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -26,7 +26,7 @@ #define MEMORY_CLASS_NAME "memory" static struct sysdev_class memory_sysdev_class = { - set_kset_name(MEMORY_CLASS_NAME), + .name = MEMORY_CLASS_NAME, }; static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj) diff --git a/drivers/base/node.c b/drivers/base/node.c index 88eeed72b5d6..e59861f18ce5 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -15,7 +15,7 @@ #include static struct sysdev_class node_class = { - set_kset_name("node"), + .name = "node", }; diff --git a/drivers/base/sys.c b/drivers/base/sys.c index e666441dd76b..2f79c55acdcc 100644 --- a/drivers/base/sys.c +++ b/drivers/base/sys.c @@ -136,6 +136,7 @@ int sysdev_class_register(struct sysdev_class * cls) cls->kset.kobj.parent = &system_kset->kobj; cls->kset.kobj.ktype = &ktype_sysdev_class; cls->kset.kobj.kset = system_kset; + kobject_set_name(&cls->kset.kobj, cls->name); return kset_register(&cls->kset); } diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c index e0c4a4086055..7e1374afd967 100644 --- a/drivers/edac/edac_module.c +++ b/drivers/edac/edac_module.c @@ -31,7 +31,7 @@ struct workqueue_struct *edac_workqueue; * need to export to other files in this modules */ static struct sysdev_class edac_class = { - set_kset_name("edac"), + .name = "edac", }; static int edac_class_valid; diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index 47c10b8f89b3..c0f372f1d761 100644 --- 
a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -3451,7 +3451,7 @@ static int kvm_resume(struct sys_device *dev) } static struct sysdev_class kvm_sysdev_class = { - set_kset_name("kvm"), + .name = "kvm", .suspend = kvm_suspend, .resume = kvm_resume, }; diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 6123c70153d3..ac420b17e16f 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -2796,7 +2796,7 @@ static int pmu_sys_resume(struct sys_device *sysdev) #endif /* CONFIG_PM_SLEEP && CONFIG_PPC32 */ static struct sysdev_class pmu_sysclass = { - set_kset_name("pmu"), + .name = "pmu", }; static struct sys_device device_pmu = { diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 7663841eb4cf..a3fdc57e2673 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -464,7 +464,7 @@ int sas_eh_bus_reset_handler(struct scsi_cmnd *cmd) res = sas_phy_reset(phy, 1); if (res) SAS_DPRINTK("Bus reset of %s failed 0x%x\n", - phy->dev.kobj.k_name, + kobject_name(&phy->dev.kobj), res); if (res == TMF_RESP_FUNC_SUCC || res == TMF_RESP_FUNC_COMPLETE) return SUCCESS; diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 504ac0eb4412..4adbe1d83081 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -61,7 +61,7 @@ enum kobject_action { }; struct kobject { - const char * k_name; + const char *name; struct kref kref; struct list_head entry; struct kobject * parent; @@ -69,7 +69,6 @@ struct kobject { struct kobj_type * ktype; struct sysfs_dirent * sd; unsigned int state_initialized:1; - unsigned int state_name_set:1; unsigned int state_in_sysfs:1; unsigned int state_add_uevent_sent:1; unsigned int state_remove_uevent_sent:1; @@ -80,7 +79,7 @@ extern int kobject_set_name(struct kobject *, const char *, ...) static inline const char * kobject_name(const struct kobject * kobj) { - return kobj->k_name; + return kobj->name; } extern void kobject_init(struct kobject *kobj, struct kobj_type *ktype); @@ -189,14 +188,6 @@ static inline struct kobj_type *get_ktype(struct kobject *kobj) extern struct kobject * kset_find_obj(struct kset *, const char *); - -/* - * Use this when initializing an embedded kset with no other - * fields to initialize. 
- */ -#define set_kset_name(str) .kset = { .kobj = { .k_name = str } } - - /* The global /sys/kernel/ kobject for people to chain off of */ extern struct kobject *kernel_kobj; /* The global /sys/hypervisor/ kobject for people to chain off of */ diff --git a/include/linux/sysdev.h b/include/linux/sysdev.h index e285746588d6..f752e73bf977 100644 --- a/include/linux/sysdev.h +++ b/include/linux/sysdev.h @@ -29,6 +29,7 @@ struct sys_device; struct sysdev_class { + const char *name; struct list_head drivers; /* Default operations for these types of devices */ diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index e3055ba69159..092e4c620af9 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -394,7 +394,7 @@ static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); static struct sysdev_class rttest_sysclass = { - set_kset_name("rttest"), + .name = "rttest", }; static int init_test_thread(int id) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c8a9d13874df..8d6125ad2cf0 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -441,7 +441,7 @@ static SYSDEV_ATTR(available_clocksource, 0600, sysfs_show_available_clocksources, NULL); static struct sysdev_class clocksource_sysclass = { - set_kset_name("clocksource"), + .name = "clocksource", }; static struct sys_device device_clocksource = { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e5e466b27598..ab46ae8c062b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -335,9 +335,9 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) /* sysfs resume/suspend bits for timekeeping */ static struct sysdev_class timekeeping_sysclass = { + .name = "timekeeping", .resume = timekeeping_resume, .suspend = timekeeping_suspend, - set_kset_name("timekeeping"), }; static struct sys_device device_timer = { diff --git a/lib/kobject.c b/lib/kobject.c index a0773734545c..8dc32454661d 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -165,7 +165,7 @@ static int kobject_add_internal(struct kobject *kobj) if (!kobj) return -ENOENT; - if (!kobj->k_name || !kobj->k_name[0]) { + if (!kobj->name || !kobj->name[0]) { pr_debug("kobject: (%p): attempted to be registered with empty " "name!\n", kobj); WARN_ON(1); @@ -228,13 +228,11 @@ static int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, if (!name) return -ENOMEM; - /* Free the old name, if necessary. 
*/ - kfree(kobj->k_name); + kfree(kobj->name); /* Now, set the new name */ - kobj->k_name = name; - kobj->state_name_set = 1; + kobj->name = name; return 0; } @@ -295,7 +293,6 @@ void kobject_init(struct kobject *kobj, struct kobj_type *ktype) kref_init(&kobj->kref); INIT_LIST_HEAD(&kobj->entry); kobj->ktype = ktype; - kobj->state_name_set = 0; kobj->state_in_sysfs = 0; kobj->state_add_uevent_sent = 0; kobj->state_remove_uevent_sent = 0; @@ -551,8 +548,7 @@ struct kobject * kobject_get(struct kobject * kobj) static void kobject_cleanup(struct kobject *kobj) { struct kobj_type *t = get_ktype(kobj); - const char *name = kobj->k_name; - int name_set = kobj->state_name_set; + const char *name = kobj->name; pr_debug("kobject: '%s' (%p): %s\n", kobject_name(kobj), kobj, __FUNCTION__); @@ -583,7 +579,7 @@ static void kobject_cleanup(struct kobject *kobj) } /* free name if we allocated it */ - if (name_set && name) { + if (name) { pr_debug("kobject: '%s': free name\n", name); kfree(name); } -- cgit v1.2.3 From 86ef5c9a8edd78e6bf92879f32329d89b2d55b5a Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Fri, 25 Jan 2008 21:08:02 +0100 Subject: cpu-hotplug: replace lock_cpu_hotplug() with get_online_cpus() Replace all lock_cpu_hotplug/unlock_cpu_hotplug from the kernel and use get_online_cpus and put_online_cpus instead as it highlights the refcount semantics in these operations. The new API guarantees protection against the cpu-hotplug operation, but it doesn't guarantee serialized access to any of the local data structures. Hence the changes needs to be reviewed. In case of pseries_add_processor/pseries_remove_processor, use cpu_maps_update_begin()/cpu_maps_update_done() as we're modifying the cpu_present_map there. Signed-off-by: Gautham R Shenoy Signed-off-by: Ingo Molnar --- Documentation/cpu-hotplug.txt | 11 ++++++----- arch/mips/kernel/mips-mt-fpaff.c | 10 +++++----- arch/powerpc/platforms/pseries/hotplug-cpu.c | 8 ++++---- arch/powerpc/platforms/pseries/rtasd.c | 8 ++++---- arch/x86/kernel/cpu/mtrr/main.c | 8 ++++---- arch/x86/kernel/microcode.c | 16 ++++++++-------- drivers/lguest/x86/core.c | 8 ++++---- drivers/s390/char/sclp_config.c | 4 ++-- include/linux/cpu.h | 8 ++++---- kernel/cpu.c | 10 +++++----- kernel/cpuset.c | 14 +++++++------- kernel/rcutorture.c | 6 +++--- kernel/sched.c | 4 ++-- kernel/stop_machine.c | 4 ++-- net/core/flow.c | 4 ++-- 15 files changed, 62 insertions(+), 61 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index a741f658a3c9..fb94f5a71b68 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt @@ -109,12 +109,13 @@ Never use anything other than cpumask_t to represent bitmap of CPUs. for_each_cpu_mask(x,mask) - Iterate over some random collection of cpu mask. #include - lock_cpu_hotplug() and unlock_cpu_hotplug(): + get_online_cpus() and put_online_cpus(): -The above calls are used to inhibit cpu hotplug operations. While holding the -cpucontrol mutex, cpu_online_map will not change. If you merely need to avoid -cpus going away, you could also use preempt_disable() and preempt_enable() -for those sections. Just remember the critical section cannot call any +The above calls are used to inhibit cpu hotplug operations. While the +cpu_hotplug.refcount is non zero, the cpu_online_map will not change. +If you merely need to avoid cpus going away, you could also use +preempt_disable() and preempt_enable() for those sections. 
+Just remember the critical section cannot call any function that can sleep or schedule this process away. The preempt_disable() will work as long as stop_machine_run() is used to take a cpu down. diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 892665bb12b1..bb4f00c0cbe9 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -58,13 +58,13 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) return -EFAULT; - lock_cpu_hotplug(); + get_online_cpus(); read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (!p) { read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + put_online_cpus(); return -ESRCH; } @@ -106,7 +106,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, out_unlock: put_task_struct(p); - unlock_cpu_hotplug(); + put_online_cpus(); return retval; } @@ -125,7 +125,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, if (len < real_len) return -EINVAL; - lock_cpu_hotplug(); + get_online_cpus(); read_lock(&tasklist_lock); retval = -ESRCH; @@ -140,7 +140,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, out_unlock: read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + put_online_cpus(); if (retval) return retval; if (copy_to_user(user_mask_ptr, &mask, real_len)) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 412e6b42986f..c4ad54e0f288 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -153,7 +153,7 @@ static int pseries_add_processor(struct device_node *np) for (i = 0; i < nthreads; i++) cpu_set(i, tmp); - lock_cpu_hotplug(); + cpu_maps_update_begin(); BUG_ON(!cpus_subset(cpu_present_map, cpu_possible_map)); @@ -190,7 +190,7 @@ static int pseries_add_processor(struct device_node *np) } err = 0; out_unlock: - unlock_cpu_hotplug(); + cpu_maps_update_done(); return err; } @@ -211,7 +211,7 @@ static void pseries_remove_processor(struct device_node *np) nthreads = len / sizeof(u32); - lock_cpu_hotplug(); + cpu_maps_update_begin(); for (i = 0; i < nthreads; i++) { for_each_present_cpu(cpu) { if (get_hard_smp_processor_id(cpu) != intserv[i]) @@ -225,7 +225,7 @@ static void pseries_remove_processor(struct device_node *np) printk(KERN_WARNING "Could not find cpu to remove " "with physical id 0x%x\n", intserv[i]); } - unlock_cpu_hotplug(); + cpu_maps_update_done(); } static int pseries_smp_notifier(struct notifier_block *nb, diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c index 73401c820110..e3078ce41518 100644 --- a/arch/powerpc/platforms/pseries/rtasd.c +++ b/arch/powerpc/platforms/pseries/rtasd.c @@ -382,7 +382,7 @@ static void do_event_scan_all_cpus(long delay) { int cpu; - lock_cpu_hotplug(); + get_online_cpus(); cpu = first_cpu(cpu_online_map); for (;;) { set_cpus_allowed(current, cpumask_of_cpu(cpu)); @@ -390,15 +390,15 @@ static void do_event_scan_all_cpus(long delay) set_cpus_allowed(current, CPU_MASK_ALL); /* Drop hotplug lock, and sleep for the specified delay */ - unlock_cpu_hotplug(); + put_online_cpus(); msleep_interruptible(delay); - lock_cpu_hotplug(); + get_online_cpus(); cpu = next_cpu(cpu, cpu_online_map); if (cpu == NR_CPUS) break; } - unlock_cpu_hotplug(); + put_online_cpus(); } static int rtasd(void *unused) diff --git a/arch/x86/kernel/cpu/mtrr/main.c 
b/arch/x86/kernel/cpu/mtrr/main.c index 3b20613325dc..beb45c9c0835 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -349,7 +349,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, replace = -1; /* No CPU hotplug when we change MTRR entries */ - lock_cpu_hotplug(); + get_online_cpus(); /* Search for existing MTRR */ mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { @@ -405,7 +405,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = i; out: mutex_unlock(&mtrr_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); return error; } @@ -495,7 +495,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) max = num_var_ranges; /* No CPU hotplug when we change MTRR entries */ - lock_cpu_hotplug(); + get_online_cpus(); mutex_lock(&mtrr_mutex); if (reg < 0) { /* Search for existing MTRR */ @@ -536,7 +536,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) error = reg; out: mutex_unlock(&mtrr_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); return error; } /** diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 09c315214a5e..40cfd5488719 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -436,7 +436,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ return -EINVAL; } - lock_cpu_hotplug(); + get_online_cpus(); mutex_lock(µcode_mutex); user_buffer = (void __user *) buf; @@ -447,7 +447,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ ret = (ssize_t)len; mutex_unlock(µcode_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); return ret; } @@ -658,14 +658,14 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) old = current->cpus_allowed; - lock_cpu_hotplug(); + get_online_cpus(); set_cpus_allowed(current, cpumask_of_cpu(cpu)); mutex_lock(µcode_mutex); if (uci->valid) err = cpu_request_microcode(cpu); mutex_unlock(µcode_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); set_cpus_allowed(current, old); } if (err) @@ -817,9 +817,9 @@ static int __init microcode_init (void) return PTR_ERR(microcode_pdev); } - lock_cpu_hotplug(); + get_online_cpus(); error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); - unlock_cpu_hotplug(); + put_online_cpus(); if (error) { microcode_dev_exit(); platform_device_unregister(microcode_pdev); @@ -839,9 +839,9 @@ static void __exit microcode_exit (void) unregister_hotcpu_notifier(&mc_cpu_notifier); - lock_cpu_hotplug(); + get_online_cpus(); sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); - unlock_cpu_hotplug(); + put_online_cpus(); platform_device_unregister(microcode_pdev); } diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 482aec2a9631..96d0fd07c57d 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -459,7 +459,7 @@ void __init lguest_arch_host_init(void) /* We don't need the complexity of CPUs coming and going while we're * doing this. */ - lock_cpu_hotplug(); + get_online_cpus(); if (cpu_has_pge) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). */ cpu_had_pge = 1; @@ -469,20 +469,20 @@ void __init lguest_arch_host_init(void) /* Turn off the feature in the global feature set. 
*/ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); } - unlock_cpu_hotplug(); + put_online_cpus(); }; /*:*/ void __exit lguest_arch_host_fini(void) { /* If we had PGE before we started, turn it back on now. */ - lock_cpu_hotplug(); + get_online_cpus(); if (cpu_had_pge) { set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); /* adjust_pge's argument "1" means set PGE. */ on_each_cpu(adjust_pge, (void *)1, 0, 1); } - unlock_cpu_hotplug(); + put_online_cpus(); } diff --git a/drivers/s390/char/sclp_config.c b/drivers/s390/char/sclp_config.c index 5322e5e54a98..9dc77f14fa52 100644 --- a/drivers/s390/char/sclp_config.c +++ b/drivers/s390/char/sclp_config.c @@ -29,12 +29,12 @@ static void sclp_cpu_capability_notify(struct work_struct *work) struct sys_device *sysdev; printk(KERN_WARNING TAG "cpu capability changed.\n"); - lock_cpu_hotplug(); + get_online_cpus(); for_each_online_cpu(cpu) { sysdev = get_cpu_sysdev(cpu); kobject_uevent(&sysdev->kobj, KOBJ_CHANGE); } - unlock_cpu_hotplug(); + put_online_cpus(); } static void sclp_conf_receiver_fn(struct evbuf_header *evbuf) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index a40247e4d462..3a3ff1c5cbef 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -100,8 +100,8 @@ static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) mutex_unlock(cpu_hp_mutex); } -extern void lock_cpu_hotplug(void); -extern void unlock_cpu_hotplug(void); +extern void get_online_cpus(void); +extern void put_online_cpus(void); #define hotcpu_notifier(fn, pri) { \ static struct notifier_block fn##_nb = \ { .notifier_call = fn, .priority = pri }; \ @@ -118,8 +118,8 @@ static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex) static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) { } -#define lock_cpu_hotplug() do { } while (0) -#define unlock_cpu_hotplug() do { } while (0) +#define get_online_cpus() do { } while (0) +#define put_online_cpus() do { } while (0) #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) /* These aren't inline functions due to a GCC bug. */ #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) diff --git a/kernel/cpu.c b/kernel/cpu.c index 656dc3fcbbae..b0c4152995f8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -48,7 +48,7 @@ void __init cpu_hotplug_init(void) #ifdef CONFIG_HOTPLUG_CPU -void lock_cpu_hotplug(void) +void get_online_cpus(void) { might_sleep(); if (cpu_hotplug.active_writer == current) @@ -58,9 +58,9 @@ void lock_cpu_hotplug(void) mutex_unlock(&cpu_hotplug.lock); } -EXPORT_SYMBOL_GPL(lock_cpu_hotplug); +EXPORT_SYMBOL_GPL(get_online_cpus); -void unlock_cpu_hotplug(void) +void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; @@ -73,7 +73,7 @@ void unlock_cpu_hotplug(void) mutex_unlock(&cpu_hotplug.lock); } -EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); +EXPORT_SYMBOL_GPL(put_online_cpus); #endif /* CONFIG_HOTPLUG_CPU */ @@ -110,7 +110,7 @@ void cpu_maps_update_done(void) * non zero and goes to sleep again. * * However, this is very difficult to achieve in practice since - * lock_cpu_hotplug() not an api which is called all that often. + * get_online_cpus() not an api which is called all that often. * */ static void cpu_hotplug_begin(void) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 50f5dc463688..cfaf6419d817 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) * * Call with cgroup_mutex held. 
May take callback_mutex during * call due to the kfifo_alloc() and kmalloc() calls. May nest - * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. + * a call to the get_online_cpus()/put_online_cpus() pair. * Must not be called holding callback_mutex, because we must not - * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere - * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. + * call get_online_cpus() while holding callback_mutex. Elsewhere + * the kernel nests callback_mutex inside get_online_cpus() calls. * So the reverse nesting would risk an ABBA deadlock. * * The three key local variables below are: @@ -691,9 +691,9 @@ restart: rebuild: /* Have scheduler rebuild sched domains */ - lock_cpu_hotplug(); + get_online_cpus(); partition_sched_domains(ndoms, doms); - unlock_cpu_hotplug(); + put_online_cpus(); done: if (q && !IS_ERR(q)) @@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create( * * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains(). The lock_cpu_hotplug() + * will call rebuild_sched_domains(). The get_online_cpus() * call in rebuild_sched_domains() must not be made while holding * callback_mutex. Elsewhere the kernel nests callback_mutex inside - * lock_cpu_hotplug() calls. So the reverse nesting would risk an + * get_online_cpus() calls. So the reverse nesting would risk an * ABBA deadlock. */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c3e165c2318f..fd599829e72a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void) cpumask_t tmp_mask = CPU_MASK_ALL; int i; - lock_cpu_hotplug(); + get_online_cpus(); /* No point in shuffling if there is only one online CPU (ex: UP) */ if (num_online_cpus() == 1) { - unlock_cpu_hotplug(); + put_online_cpus(); return; } @@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void) else rcu_idle_cpu--; - unlock_cpu_hotplug(); + put_online_cpus(); } /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the diff --git a/kernel/sched.c b/kernel/sched.c index 86e55a9c2de6..672aa68bfeac 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7152,7 +7152,7 @@ static int load_balance_monitor(void *unused) int i, cpu, balanced = 1; /* Prevent cpus going down or coming up */ - lock_cpu_hotplug(); + get_online_cpus(); /* lockout changes to doms_cur[] array */ lock_doms_cur(); /* @@ -7186,7 +7186,7 @@ static int load_balance_monitor(void *unused) rcu_read_unlock(); unlock_doms_cur(); - unlock_cpu_hotplug(); + put_online_cpus(); if (!balanced) timeout = sysctl_sched_min_bal_int_shares; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 319821ef78af..51b5ee53571a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) int ret; /* No CPUs can come up or down during this. */ - lock_cpu_hotplug(); + get_online_cpus(); p = __stop_machine_run(fn, data, cpu); if (!IS_ERR(p)) ret = kthread_stop(p); else ret = PTR_ERR(p); - unlock_cpu_hotplug(); + put_online_cpus(); return ret; } diff --git a/net/core/flow.c b/net/core/flow.c index 3ed2b4b1d6d4..6489f4e24ecf 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -293,7 +293,7 @@ void flow_cache_flush(void) static DEFINE_MUTEX(flow_flush_sem); /* Don't want cpus going down or up during this. 
*/ - lock_cpu_hotplug(); + get_online_cpus(); mutex_lock(&flow_flush_sem); atomic_set(&info.cpuleft, num_online_cpus()); init_completion(&info.completion); @@ -305,7 +305,7 @@ void flow_cache_flush(void) wait_for_completion(&info.completion); mutex_unlock(&flow_flush_sem); - unlock_cpu_hotplug(); + put_online_cpus(); } static void __devinit flow_cache_cpu_prepare(int cpu) -- cgit v1.2.3 From 8f4d37ec073c17e2d4aa8851df5837d798606d6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Jan 2008 21:08:29 +0100 Subject: sched: high-res preemption tick Use HR-timers (when available) to deliver an accurate preemption tick. The regular scheduler tick that runs at 1/HZ can be too coarse when nice level are used. The fairness system will still keep the cpu utilisation 'fair' by then delaying the task that got an excessive amount of CPU time but try to minimize this by delivering preemption points spot-on. The average frequency of this extra interrupt is sched_latency / nr_latency. Which need not be higher than 1/HZ, its just that the distribution within the sched_latency period is important. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 6 +- arch/x86/kernel/signal_32.c | 3 + arch/x86/kernel/signal_64.c | 3 + include/asm-x86/thread_info_32.h | 2 + include/asm-x86/thread_info_64.h | 5 + include/linux/hrtimer.h | 9 ++ include/linux/sched.h | 3 +- kernel/Kconfig.hz | 2 + kernel/sched.c | 210 +++++++++++++++++++++++++++++++++++++-- kernel/sched_fair.c | 69 ++++++++++++- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 2 +- 12 files changed, 295 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3a058bb16409..e70f3881d7e4 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -283,7 +283,7 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON sti - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f /* Really a signal */ @@ -377,7 +377,7 @@ int_very_careful: jmp int_restore_rest int_signal: - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 @@ -603,7 +603,7 @@ retint_careful: jmp retint_check retint_signal: - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $_TIF_DO_NOTIFY_MASK,%edx jz retint_swapgs TRACE_IRQS_ON sti diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 9bdd83022f5f..20f29e4c1d33 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -658,6 +658,9 @@ void do_notify_resume(struct pt_regs *regs, void *_unused, /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) do_signal(regs); + + if (thread_info_flags & _TIF_HRTICK_RESCHED) + hrtick_resched(); clear_thread_flag(TIF_IRET); } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ab086b0357fc..38d806467c0f 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -480,6 +480,9 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) do_signal(regs); + + if (thread_info_flags & _TIF_HRTICK_RESCHED) + hrtick_resched(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff --git a/include/asm-x86/thread_info_32.h 
b/include/asm-x86/thread_info_32.h index 22a8cbcd35e2..ef58fd2a6eb0 100644 --- a/include/asm-x86/thread_info_32.h +++ b/include/asm-x86/thread_info_32.h @@ -132,6 +132,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ #define TIF_SECCOMP 7 /* secure computing */ #define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */ +#define TIF_HRTICK_RESCHED 9 /* reprogram hrtick timer */ #define TIF_MEMDIE 16 #define TIF_DEBUG 17 /* uses debug registers */ #define TIF_IO_BITMAP 18 /* uses I/O bitmap */ @@ -147,6 +148,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_AUDIT (1<base->get_time(); } +static inline int hrtimer_is_hres_active(struct hrtimer *timer) +{ + return timer->base->cpu_base->hres_active; +} + /* * The resolution of the clocks. The resolution value is returned in * the clock_getres() system call to give application programmers an @@ -248,6 +253,10 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) return timer->base->softirq_time; } +static inline int hrtimer_is_hres_active(struct hrtimer *timer) +{ + return 0; +} #endif extern ktime_t ktime_get(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index 7907845c2348..43e0339d65fc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -257,6 +257,7 @@ extern void trap_init(void); extern void account_process_tick(struct task_struct *task, int user); extern void update_process_times(int user); extern void scheduler_tick(void); +extern void hrtick_resched(void); extern void sched_show_task(struct task_struct *p); @@ -849,7 +850,7 @@ struct sched_class { #endif void (*set_curr_task) (struct rq *rq); - void (*task_tick) (struct rq *rq, struct task_struct *p); + void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); void (*task_new) (struct rq *rq, struct task_struct *p); void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 4af15802ccd4..526128a2e622 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -54,3 +54,5 @@ config HZ default 300 if HZ_300 default 1000 if HZ_1000 +config SCHED_HRTICK + def_bool HIGH_RES_TIMERS && X86 diff --git a/kernel/sched.c b/kernel/sched.c index 6ee37602a6d8..17f93d3eda91 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -451,6 +452,12 @@ struct rq { struct list_head migration_queue; #endif +#ifdef CONFIG_SCHED_HRTICK + unsigned long hrtick_flags; + ktime_t hrtick_expire; + struct hrtimer hrtick_timer; +#endif + #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; @@ -572,6 +579,8 @@ enum { SCHED_FEAT_START_DEBIT = 4, SCHED_FEAT_TREE_AVG = 8, SCHED_FEAT_APPROX_AVG = 16, + SCHED_FEAT_HRTICK = 32, + SCHED_FEAT_DOUBLE_TICK = 64, }; const_debug unsigned int sysctl_sched_features = @@ -579,7 +588,9 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_WAKEUP_PREEMPT * 1 | SCHED_FEAT_START_DEBIT * 1 | SCHED_FEAT_TREE_AVG * 0 | - SCHED_FEAT_APPROX_AVG * 0; + SCHED_FEAT_APPROX_AVG * 0 | + SCHED_FEAT_HRTICK * 1 | + SCHED_FEAT_DOUBLE_TICK * 0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) @@ -796,6 +807,173 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +static void __resched_task(struct task_struct *p, int tif_bit); + +static inline void resched_task(struct task_struct *p) +{ + 
__resched_task(p, TIF_NEED_RESCHED); +} + +#ifdef CONFIG_SCHED_HRTICK +/* + * Use HR-timers to deliver accurate preemption points. + * + * It's all a bit involved since we cannot program an hrt while holding the + * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a + * reschedule event. + * + * When we get rescheduled we reprogram the hrtick_timer outside of the + * rq->lock. + */ +static inline void resched_hrt(struct task_struct *p) +{ + __resched_task(p, TIF_HRTICK_RESCHED); +} + +static inline void resched_rq(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + resched_task(rq->curr); + spin_unlock_irqrestore(&rq->lock, flags); +} + +enum { + HRTICK_SET, /* re-program hrtick_timer */ + HRTICK_RESET, /* not a new slice */ +}; + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay, int reset) +{ + assert_spin_locked(&rq->lock); + + /* + * preempt at: now + delay + */ + rq->hrtick_expire = + ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); + /* + * indicate we need to program the timer + */ + __set_bit(HRTICK_SET, &rq->hrtick_flags); + if (reset) + __set_bit(HRTICK_RESET, &rq->hrtick_flags); + + /* + * New slices are called from the schedule path and don't need a + * forced reschedule. + */ + if (reset) + resched_hrt(rq->curr); +} + +static void hrtick_clear(struct rq *rq) +{ + if (hrtimer_active(&rq->hrtick_timer)) + hrtimer_cancel(&rq->hrtick_timer); +} + +/* + * Update the timer from the possible pending state. + */ +static void hrtick_set(struct rq *rq) +{ + ktime_t time; + int set, reset; + unsigned long flags; + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + spin_lock_irqsave(&rq->lock, flags); + set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); + reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); + time = rq->hrtick_expire; + clear_thread_flag(TIF_HRTICK_RESCHED); + spin_unlock_irqrestore(&rq->lock, flags); + + if (set) { + hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); + if (reset && !hrtimer_active(&rq->hrtick_timer)) + resched_rq(rq); + } else + hrtick_clear(rq); +} + +/* + * High-resolution timer tick. + * Runs from hardirq context with interrupts disabled.
+ */ +static enum hrtimer_restart hrtick(struct hrtimer *timer) +{ + struct rq *rq = container_of(timer, struct rq, hrtick_timer); + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + spin_lock(&rq->lock); + __update_rq_clock(rq); + rq->curr->sched_class->task_tick(rq, rq->curr, 1); + spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +static inline void init_rq_hrtick(struct rq *rq) +{ + rq->hrtick_flags = 0; + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} + +void hrtick_resched(void) +{ + struct rq *rq; + unsigned long flags; + + if (!test_thread_flag(TIF_HRTICK_RESCHED)) + return; + + local_irq_save(flags); + rq = cpu_rq(smp_processor_id()); + hrtick_set(rq); + local_irq_restore(flags); +} +#else +static inline void hrtick_clear(struct rq *rq) +{ +} + +static inline void hrtick_set(struct rq *rq) +{ +} + +static inline void init_rq_hrtick(struct rq *rq) +{ +} + +void hrtick_resched(void) +{ +} +#endif + /* * resched_task - mark a task 'to be rescheduled now'. * @@ -809,16 +987,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void resched_task(struct task_struct *p) +static void __resched_task(struct task_struct *p, int tif_bit) { int cpu; assert_spin_locked(&task_rq(p)->lock); - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + if (unlikely(test_tsk_thread_flag(p, tif_bit))) return; - set_tsk_thread_flag(p, TIF_NEED_RESCHED); + set_tsk_thread_flag(p, tif_bit); cpu = task_cpu(p); if (cpu == smp_processor_id()) @@ -841,10 +1019,10 @@ static void resched_cpu(int cpu) spin_unlock_irqrestore(&rq->lock, flags); } #else -static inline void resched_task(struct task_struct *p) +static void __resched_task(struct task_struct *p, int tif_bit) { assert_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); + set_tsk_thread_flag(p, tif_bit); } #endif @@ -3497,7 +3675,7 @@ void scheduler_tick(void) rq->tick_timestamp = rq->clock; update_cpu_load(rq); if (curr != rq->idle) /* FIXME: needed? */ - curr->sched_class->task_tick(rq, curr); + curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -3643,6 +3821,8 @@ need_resched_nonpreemptible: schedule_debug(prev); + hrtick_clear(rq); + /* * Do the rq-clock update outside the rq lock: */ @@ -3680,14 +3860,20 @@ need_resched_nonpreemptible: ++*switch_count; context_switch(rq, prev, next); /* unlocks the rq */ + /* + * the context switch might have flipped the stack from under + * us, hence refresh the local variables. 
+ */ + cpu = smp_processor_id(); + rq = cpu_rq(cpu); } else spin_unlock_irq(&rq->lock); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + hrtick_set(rq); + + if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; - } + preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; @@ -6913,6 +7099,8 @@ void __init sched_init(void) rq->rt.overloaded = 0; rq_attach_root(rq, &def_root_domain); #endif + init_rq_hrtick(rq); + atomic_set(&rq->nr_iowait, 0); array = &rq->rt.active; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index dfa18d55561d..3dab1ff83c4f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -642,13 +642,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) cfs_rq->curr = NULL; } -static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +static void +entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) { /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); +#ifdef CONFIG_SCHED_HRTICK + /* + * queued ticks are scheduled to match the slice, so don't bother + * validating it and just reschedule. + */ + if (queued) + return resched_task(rq_of(cfs_rq)->curr); + /* + * don't let the period tick interfere with the hrtick preemption + */ + if (!sched_feat(DOUBLE_TICK) && + hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) + return; +#endif + if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) check_preempt_tick(cfs_rq, curr); } @@ -754,6 +770,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_SCHED_HRTICK +static void hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ + int requeue = rq->curr == p; + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + WARN_ON(task_rq(p) != rq); + + if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { + u64 slice = sched_slice(cfs_rq, se); + u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; + s64 delta = slice - ran; + + if (delta < 0) { + if (rq->curr == p) + resched_task(p); + return; + } + + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense. Rely on vruntime for fairness. + */ + if (!requeue) + delta = max(10000LL, delta); + + hrtick_start(rq, delta, requeue); + } +} +#else +static inline void +hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ +} +#endif + /* * The enqueue_task method is called before nr_running is * increased. 
Here we update the fair scheduling stats and @@ -782,6 +835,8 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) */ if (incload) inc_cpu_load(rq, topse->load.weight); + + hrtick_start_fair(rq, rq->curr); } /* @@ -814,6 +869,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) */ if (decload) dec_cpu_load(rq, topse->load.weight); + + hrtick_start_fair(rq, rq->curr); } /* @@ -1049,6 +1106,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) static struct task_struct *pick_next_task_fair(struct rq *rq) { + struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -1060,7 +1118,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) cfs_rq = group_cfs_rq(se); } while (cfs_rq); - return task_of(se); + p = task_of(se); + hrtick_start_fair(rq, p); + + return p; } /* @@ -1235,14 +1296,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, /* * scheduler tick hitting a task of our scheduling class: */ -static void task_tick_fair(struct rq *rq, struct task_struct *curr) +static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - entity_tick(cfs_rq, se); + entity_tick(cfs_rq, se, queued); } } diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index ef7a2661fa10..2bcafa375633 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -61,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, } #endif -static void task_tick_idle(struct rq *rq, struct task_struct *curr) +static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f350f7b15158..83fbbcb8019e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -863,7 +863,7 @@ static void watchdog(struct rq *rq, struct task_struct *p) } } -static void task_tick_rt(struct rq *rq, struct task_struct *p) +static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { update_curr_rt(rq); -- cgit v1.2.3 From 9745512ce79de686df354dc70a8d1a74d801892d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 25 Jan 2008 21:08:34 +0100 Subject: sched: latencytop support LatencyTOP kernel infrastructure; it measures latencies in the scheduler and tracks them system-wide and per process.
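As a usage reference, a minimal userspace sketch (hypothetical, not part of this patch) that drives the interfaces added below; it assumes CONFIG_LATENCYTOP=y and uses the /proc/sys/kernel/latencytop sysctl and the /proc/latency_stats file this patch introduces:

/* lstats_dump.c - hypothetical helper, for illustration only */
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f;

	/* enable collection via the sysctl added in kernel/sysctl.c */
	f = fopen("/proc/sys/kernel/latencytop", "w");
	if (!f) {
		perror("latencytop sysctl");
		return 1;
	}
	fputs("1\n", f);
	fclose(f);

	/* dump the system-wide records served by lstats_show() */
	f = fopen("/proc/latency_stats", "r");
	if (!f) {
		perror("/proc/latency_stats");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}

Per-task records are exposed in a similar format via /proc/<pid>/latency, and writing to either file clears the corresponding records.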
Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/stacktrace.c | 27 +++++ fs/proc/base.c | 78 ++++++++++++++ include/linux/latencytop.h | 44 ++++++++ include/linux/sched.h | 5 + include/linux/stacktrace.h | 3 + kernel/Makefile | 1 + kernel/fork.c | 1 + kernel/latencytop.c | 239 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched_fair.c | 8 +- kernel/sysctl.c | 10 ++ lib/Kconfig.debug | 14 +++ 11 files changed, 429 insertions(+), 1 deletion(-) create mode 100644 include/linux/latencytop.h create mode 100644 kernel/latencytop.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 6fa6cf036c70..55771fd7e545 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -33,6 +33,19 @@ static void save_stack_address(void *data, unsigned long addr) trace->entries[trace->nr_entries++] = addr; } +static void save_stack_address_nosched(void *data, unsigned long addr) +{ + struct stack_trace *trace = (struct stack_trace *)data; + if (in_sched_functions(addr)) + return; + if (trace->skip > 0) { + trace->skip--; + return; + } + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = addr; +} + static const struct stacktrace_ops save_stack_ops = { .warning = save_stack_warning, .warning_symbol = save_stack_warning_symbol, @@ -40,6 +53,13 @@ static const struct stacktrace_ops save_stack_ops = { .address = save_stack_address, }; +static const struct stacktrace_ops save_stack_ops_nosched = { + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address_nosched, +}; + /* * Save stack-backtrace addresses into a stack_trace buffer. */ @@ -50,3 +70,10 @@ void save_stack_trace(struct stack_trace *trace) trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL(save_stack_trace); + +void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} diff --git a/fs/proc/base.c b/fs/proc/base.c index 7411bfb0b7cc..91fa8e6ce8ad 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -310,6 +310,77 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer) } #endif +#ifdef CONFIG_LATENCYTOP +static int lstats_show_proc(struct seq_file *m, void *v) +{ + int i; + struct task_struct *task = m->private; + seq_puts(m, "Latency Top version : v0.1\n"); + + for (i = 0; i < 32; i++) { + if (task->latency_record[i].backtrace[0]) { + int q; + seq_printf(m, "%i %li %li ", + task->latency_record[i].count, + task->latency_record[i].time, + task->latency_record[i].max); + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + char sym[KSYM_NAME_LEN]; + char *c; + if (!task->latency_record[i].backtrace[q]) + break; + if (task->latency_record[i].backtrace[q] == ULONG_MAX) + break; + sprint_symbol(sym, task->latency_record[i].backtrace[q]); + c = strchr(sym, '+'); + if (c) + *c = 0; + seq_printf(m, "%s ", sym); + } + seq_printf(m, "\n"); + } + + } + return 0; +} + +static int lstats_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_file *m; + struct task_struct *task = get_proc_task(inode); + + ret = single_open(file, lstats_show_proc, NULL); + if (!ret) { + m = file->private_data; + m->private = task; + } + return ret; +} + +static ssize_t lstats_write(struct file *file, const char __user *buf, + size_t count, loff_t 
*offs) +{ + struct seq_file *m; + struct task_struct *task; + + m = file->private_data; + task = m->private; + clear_all_latency_tracing(task); + + return count; +} + +static const struct file_operations proc_lstats_operations = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + /* The badness from the OOM killer */ unsigned long badness(struct task_struct *p, unsigned long uptime); static int proc_oom_score(struct task_struct *task, char *buffer) @@ -1020,6 +1091,7 @@ static const struct file_operations proc_fault_inject_operations = { }; #endif + #ifdef CONFIG_SCHED_DEBUG /* * Print out various scheduling related per-task fields: @@ -2230,6 +2302,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, pid_schedstat), #endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, lstats), +#endif #ifdef CONFIG_PROC_PID_CPUSET REG("cpuset", S_IRUGO, cpuset), #endif @@ -2555,6 +2630,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, pid_schedstat), #endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, lstats), +#endif #ifdef CONFIG_PROC_PID_CPUSET REG("cpuset", S_IRUGO, cpuset), #endif diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h new file mode 100644 index 000000000000..901c2d6377a8 --- /dev/null +++ b/include/linux/latencytop.h @@ -0,0 +1,44 @@ +/* + * latencytop.h: Infrastructure for displaying latency + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven + * + */ + +#ifndef _INCLUDE_GUARD_LATENCYTOP_H_ +#define _INCLUDE_GUARD_LATENCYTOP_H_ + +#ifdef CONFIG_LATENCYTOP + +#define LT_SAVECOUNT 32 +#define LT_BACKTRACEDEPTH 12 + +struct latency_record { + unsigned long backtrace[LT_BACKTRACEDEPTH]; + unsigned int count; + unsigned long time; + unsigned long max; +}; + + +struct task_struct; + +void account_scheduler_latency(struct task_struct *task, int usecs, int inter); + +void clear_all_latency_tracing(struct task_struct *p); + +#else + +static inline void +account_scheduler_latency(struct task_struct *task, int usecs, int inter) +{ +} + +static inline void clear_all_latency_tracing(struct task_struct *p) +{ +} + +#endif + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index acadcab89ef9..dfc76e172f3f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -88,6 +88,7 @@ struct sched_param { #include #include #include +#include #include @@ -1220,6 +1221,10 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; +#ifdef CONFIG_LATENCYTOP + int latency_record_count; + struct latency_record latency_record[LT_SAVECOUNT]; +#endif }; /* diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index e7fa657d0c49..5da9794b2d78 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -9,10 +9,13 @@ struct stack_trace { }; extern void save_stack_trace(struct stack_trace *trace); +extern void save_stack_trace_tsk(struct task_struct *tsk, + struct stack_trace *trace); extern void print_stack_trace(struct stack_trace *trace, int spaces); #else # define save_stack_trace(trace) do { } while (0) +# define save_stack_trace_tsk(tsk, trace) do { } while (0) # define print_stack_trace(trace, spaces) do { } while (0) #endif diff --git a/kernel/Makefile b/kernel/Makefile index 68755cd9a7e4..390d42146267 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -62,6 +62,7 @@ 
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o +obj-$(CONFIG_LATENCYTOP) += latencytop.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/fork.c b/kernel/fork.c index 0c969f4fade0..39d22b3357de 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1205,6 +1205,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif + clear_all_latency_tracing(p); /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ diff --git a/kernel/latencytop.c b/kernel/latencytop.c new file mode 100644 index 000000000000..b4e3c85abe74 --- /dev/null +++ b/kernel/latencytop.c @@ -0,0 +1,239 @@ +/* + * latencytop.c: Latency display infrastructure + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(latency_lock); + +#define MAXLR 128 +static struct latency_record latency_record[MAXLR]; + +int latencytop_enabled; + +void clear_all_latency_tracing(struct task_struct *p) +{ + unsigned long flags; + + if (!latencytop_enabled) + return; + + spin_lock_irqsave(&latency_lock, flags); + memset(&p->latency_record, 0, sizeof(p->latency_record)); + p->latency_record_count = 0; + spin_unlock_irqrestore(&latency_lock, flags); +} + +static void clear_global_latency_tracing(void) +{ + unsigned long flags; + + spin_lock_irqsave(&latency_lock, flags); + memset(&latency_record, 0, sizeof(latency_record)); + spin_unlock_irqrestore(&latency_lock, flags); +} + +static void __sched +account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) +{ + int firstnonnull = MAXLR + 1; + int i; + + if (!latencytop_enabled) + return; + + /* skip kernel threads for now */ + if (!tsk->mm) + return; + + for (i = 0; i < MAXLR; i++) { + int q; + int same = 1; + /* Nothing stored: */ + if (!latency_record[i].backtrace[0]) { + if (firstnonnull > i) + firstnonnull = i; + continue; + } + for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { + if (latency_record[i].backtrace[q] != + lat->backtrace[q]) + same = 0; + if (same && lat->backtrace[q] == 0) + break; + if (same && lat->backtrace[q] == ULONG_MAX) + break; + } + if (same) { + latency_record[i].count++; + latency_record[i].time += lat->time; + if (lat->time > latency_record[i].max) + latency_record[i].max = lat->time; + return; + } + } + + i = firstnonnull; + if (i >= MAXLR - 1) + return; + + /* Allocated a new one: */ + memcpy(&latency_record[i], lat, sizeof(struct latency_record)); +} + +static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) +{ + struct stack_trace trace; + + memset(&trace, 0, sizeof(trace)); + trace.max_entries = LT_BACKTRACEDEPTH; + trace.entries = &lat->backtrace[0]; + trace.skip = 0; + save_stack_trace_tsk(tsk, &trace); +} + +void __sched +account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) +{ + unsigned long flags; + int i, q; + struct latency_record lat; + + if (!latencytop_enabled) + return; + + /* Long
interruptible waits are generally user requested... */ + if (inter && usecs > 5000) + return; + + memset(&lat, 0, sizeof(lat)); + lat.count = 1; + lat.time = usecs; + lat.max = usecs; + store_stacktrace(tsk, &lat); + + spin_lock_irqsave(&latency_lock, flags); + + account_global_scheduler_latency(tsk, &lat); + + /* + * short term hack; if we're > 32 we stop; future we recycle: + */ + tsk->latency_record_count++; + if (tsk->latency_record_count >= LT_SAVECOUNT) + goto out_unlock; + + for (i = 0; i < LT_SAVECOUNT ; i++) { + struct latency_record *mylat; + int same = 1; + mylat = &tsk->latency_record[i]; + for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { + if (mylat->backtrace[q] != + lat.backtrace[q]) + same = 0; + if (same && lat.backtrace[q] == 0) + break; + if (same && lat.backtrace[q] == ULONG_MAX) + break; + } + if (same) { + mylat->count++; + mylat->time += lat.time; + if (lat.time > mylat->max) + mylat->max = lat.time; + goto out_unlock; + } + } + + /* Allocated a new one: */ + i = tsk->latency_record_count; + memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); + +out_unlock: + spin_unlock_irqrestore(&latency_lock, flags); +} + +static int lstats_show(struct seq_file *m, void *v) +{ + int i; + + seq_puts(m, "Latency Top version : v0.1\n"); + + for (i = 0; i < MAXLR; i++) { + if (latency_record[i].backtrace[0]) { + int q; + seq_printf(m, "%i %li %li ", + latency_record[i].count, + latency_record[i].time, + latency_record[i].max); + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + char sym[KSYM_NAME_LEN]; + char *c; + if (!latency_record[i].backtrace[q]) + break; + if (latency_record[i].backtrace[q] == ULONG_MAX) + break; + sprint_symbol(sym, latency_record[i].backtrace[q]); + c = strchr(sym, '+'); + if (c) + *c = 0; + seq_printf(m, "%s ", sym); + } + seq_printf(m, "\n"); + } + } + return 0; +} + +static ssize_t +lstats_write(struct file *file, const char __user *buf, size_t count, + loff_t *offs) +{ + clear_global_latency_tracing(); + + return count; +} + +static int lstats_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, lstats_show, NULL); +} + +static struct file_operations lstats_fops = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init init_lstats_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = create_proc_entry("latency_stats", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &lstats_fops; + + return 0; +} +__initcall(init_lstats_procfs); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3dab1ff83c4f..1b3b40ad7c54 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -20,6 +20,8 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ +#include + /* * Targeted preemption latency for CPU-bound tasks: * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) @@ -434,6 +436,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_SCHEDSTATS if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; + struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -443,9 +446,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sleep_start = 0; se->sum_sleep_runtime += delta; + + account_scheduler_latency(tsk, delta >> 10, 1); } if (se->block_start) { u64 delta = rq_of(cfs_rq)->clock - se->block_start; + struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -462,11 +468,11 @@ static void 
enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) * time that the task spent sleeping: */ if (unlikely(prof_on == SLEEP_PROFILING)) { - struct task_struct *tsk = task_of(se); profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), delta >> 20); } + account_scheduler_latency(tsk, delta >> 10, 0); } #endif } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3afbd25f43eb..5418ef61e16e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -81,6 +81,7 @@ extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; extern int audit_argv_kb; +extern int latencytop_enabled; /* Constants used for minimum and maximum */ #ifdef CONFIG_DETECT_SOFTLOCKUP @@ -416,6 +417,15 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec_taint, }, #endif +#ifdef CONFIG_LATENCYTOP + { + .procname = "latencytop", + .data = &latencytop_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_SECURITY_CAPABILITIES { .procname = "cap-bound", diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a60109307d32..14fb355e3caa 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -517,4 +517,18 @@ config FAULT_INJECTION_STACKTRACE_FILTER help Provide stacktrace filter for fault-injection capabilities +config LATENCYTOP + bool "Latency measuring infrastructure" + select FRAME_POINTER if !MIPS + select KALLSYMS + select KALLSYMS_ALL + select STACKTRACE + select SCHEDSTATS + select SCHED_DEBUG + depends on X86 || X86_64 + help + Enable this option if you want to use the LatencyTOP tool + to find out which userspace is blocking on what kernel operations. + + source "samples/Kconfig" -- cgit v1.2.3 From 01ba2bdc6b639764745ff678caf3fb9e5bcd745a Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 20 Jan 2008 14:15:03 +0100 Subject: all archs: consolidate init and exit sections in vmlinux.lds.h This patch consolidates all .init.text, .init.data and .exit.text, .exit.data section definitions in the generic vmlinux.lds.h. This is a preparatory patch - alone it does not buy us much.
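For context, a minimal sketch (a hypothetical module, for illustration only) of the C-side convention these macros collect: __init/__exit annotations place code into the .init.text/.exit.text sections, which the new INIT_TEXT/EXIT_TEXT macros then gather, or discard, in each architecture's linker script:

/* example_mod.c - hypothetical, for illustration only */
#include <linux/init.h>
#include <linux/module.h>

/* lands in .init.text, collected by INIT_TEXT and freed after boot */
static int __init example_init(void)
{
	return 0;
}

/* lands in .exit.text; when built in, many linker scripts discard it
 * via EXIT_TEXT in the /DISCARD/ output section */
static void __exit example_exit(void)
{
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");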
Signed-off-by: Sam Ravnborg --- arch/alpha/kernel/vmlinux.lds.S | 8 ++++---- arch/arm/kernel/vmlinux.lds.S | 10 +++++----- arch/avr32/kernel/vmlinux.lds.S | 8 ++++---- arch/blackfin/kernel/vmlinux.lds.S | 8 ++++---- arch/cris/arch-v10/vmlinux.lds.S | 8 ++++---- arch/cris/arch-v32/vmlinux.lds.S | 8 ++++---- arch/frv/kernel/vmlinux.lds.S | 14 +++++++------- arch/h8300/kernel/vmlinux.lds.S | 8 ++++---- arch/ia64/kernel/vmlinux.lds.S | 8 ++++---- arch/m32r/kernel/vmlinux.lds.S | 12 ++++++------ arch/m68k/kernel/vmlinux-std.lds | 8 ++++---- arch/m68k/kernel/vmlinux-sun3.lds | 8 ++++---- arch/m68knommu/kernel/vmlinux.lds.S | 8 ++++---- arch/mips/kernel/vmlinux.lds.S | 8 ++++---- arch/parisc/kernel/vmlinux.lds.S | 8 ++++---- arch/powerpc/kernel/vmlinux.lds.S | 10 ++++++---- arch/ppc/kernel/vmlinux.lds.S | 8 ++++---- arch/s390/kernel/vmlinux.lds.S | 8 ++++---- arch/sh/kernel/vmlinux_32.lds.S | 8 ++++---- arch/sh/kernel/vmlinux_64.lds.S | 8 ++++---- arch/sparc/kernel/vmlinux.lds.S | 8 ++++---- arch/sparc64/kernel/vmlinux.lds.S | 8 ++++---- arch/um/kernel/dyn.lds.S | 4 ++-- arch/um/kernel/uml.lds.S | 4 ++-- arch/v850/kernel/vmlinux.lds.S | 10 +++++----- arch/x86/kernel/vmlinux_32.lds.S | 14 ++++++++++---- arch/x86/kernel/vmlinux_64.lds.S | 19 +++++++++++++------ arch/xtensa/kernel/vmlinux.lds.S | 9 +++++---- include/asm-generic/vmlinux.lds.h | 7 +++++++ 29 files changed, 140 insertions(+), 117 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 55c05b511f4c..f13249be17c5 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -46,11 +46,11 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } .init.data : { - *(.init.data) + INIT_DATA } . = ALIGN(16); @@ -136,8 +136,8 @@ SECTIONS /* Sections to be discarded */ /DISCARD/ : { - *(.exit.text) - *(.exit.data) + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) } diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 30f732c7fdb5..4898bdcfe7dd 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -30,7 +30,7 @@ SECTIONS } .init : { /* Init code and data */ - *(.init.text) + INIT_TEXT _einittext = .; __proc_info_begin = .; *(.proc.info.init) @@ -70,15 +70,15 @@ SECTIONS __per_cpu_end = .; #ifndef CONFIG_XIP_KERNEL __init_begin = _stext; - *(.init.data) + INIT_DATA . = ALIGN(4096); __init_end = .; #endif } /DISCARD/ : { /* Exit code and data */ - *(.exit.text) - *(.exit.data) + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) #ifndef CONFIG_MMU *(.fixup) @@ -130,7 +130,7 @@ SECTIONS #ifdef CONFIG_XIP_KERNEL . = ALIGN(4096); __init_begin = .; - *(.init.data) + INIT_DATA . = ALIGN(4096); __init_end = .; #endif diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S index 11f08e35a2eb..481cfd40c053 100644 --- a/arch/avr32/kernel/vmlinux.lds.S +++ b/arch/avr32/kernel/vmlinux.lds.S @@ -27,19 +27,19 @@ SECTIONS __init_begin = .; _sinittext = .; *(.text.reset) - *(.init.text) + INIT_TEXT /* * .exit.text is discarded at runtime, not * link time, to deal with references from * __bug_table */ - *(.exit.text) + EXIT_TEXT _einittext = .; . = ALIGN(4); __tagtable_begin = .; *(.taglist.init) __tagtable_end = .; - *(.init.data) + INIT_DATA . = ALIGN(16); __setup_start = .; *(.init.setup) @@ -135,7 +135,7 @@ SECTIONS * thrown away, as cleanup code is never called unless it's a module. 
*/ /DISCARD/ : { - *(.exit.data) + EXIT_DATA *(.exitcall.exit) } diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index 9b75bc83c71f..858722421b40 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -91,13 +91,13 @@ SECTIONS { . = ALIGN(PAGE_SIZE); __sinittext = .; - *(.init.text) + INIT_TEXT __einittext = .; } .init.data : { . = ALIGN(16); - *(.init.data) + INIT_DATA } .init.setup : { @@ -198,8 +198,8 @@ SECTIONS /DISCARD/ : { - *(.exit.text) - *(.exit.data) + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) } } diff --git a/arch/cris/arch-v10/vmlinux.lds.S b/arch/cris/arch-v10/vmlinux.lds.S index 97a7876ed681..93c9f0ea286b 100644 --- a/arch/cris/arch-v10/vmlinux.lds.S +++ b/arch/cris/arch-v10/vmlinux.lds.S @@ -57,10 +57,10 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } - .init.data : { *(.init.data) } + .init.data : { INIT_DATA } . = ALIGN(16); __setup_start = .; .init.setup : { *(.init.setup) } @@ -109,8 +109,8 @@ SECTIONS /* Sections to be discarded */ /DISCARD/ : { - *(.text.exit) - *(.data.exit) + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) } diff --git a/arch/cris/arch-v32/vmlinux.lds.S b/arch/cris/arch-v32/vmlinux.lds.S index b076c134c0bb..fead8c59ea63 100644 --- a/arch/cris/arch-v32/vmlinux.lds.S +++ b/arch/cris/arch-v32/vmlinux.lds.S @@ -61,10 +61,10 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } - .init.data : { *(.init.data) } + .init.data : { INIT_DATA } . = ALIGN(16); __setup_start = .; .init.setup : { *(.init.setup) } @@ -124,8 +124,8 @@ SECTIONS /* Sections to be discarded */ /DISCARD/ : { - *(.text.exit) - *(.data.exit) + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) } diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index a17a81d58bf6..f42b328b1dd0 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -28,14 +28,14 @@ SECTIONS .init.text : { *(.text.head) #ifndef CONFIG_DEBUG_INFO - *(.init.text) - *(.exit.text) - *(.exit.data) + INIT_TEXT + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) #endif } _einittext = .; - .init.data : { *(.init.data) } + .init.data : { INIT_DATA } . = ALIGN(8); __setup_start = .; @@ -106,8 +106,8 @@ SECTIONS LOCK_TEXT #ifdef CONFIG_DEBUG_INFO *( - .init.text - .exit.text + INIT_TEXT + EXIT_TEXT .exitcall.exit ) #endif @@ -138,7 +138,7 @@ SECTIONS .data : { /* Data */ DATA_DATA *(.data.*) - *(.exit.data) + EXIT_DATA CONSTRUCTORS } diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index a2e72d495551..43a87b9085b6 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -110,9 +110,9 @@ SECTIONS . = ALIGN(0x4) ; ___init_begin = .; __sinittext = .; - *(.init.text) + INIT_TEXT __einittext = .; - *(.init.data) + INIT_DATA . = ALIGN(0x4) ; ___setup_start = .; *(.init.setup) @@ -124,8 +124,8 @@ SECTIONS ___con_initcall_start = .; *(.con_initcall.init) ___con_initcall_end = .; - *(.exit.text) - *(.exit.data) + EXIT_TEXT + EXIT_DATA #if defined(CONFIG_BLK_DEV_INITRD) . 
= ALIGN(4); ___initramfs_start = .; diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 757e419ebcf8..80622acc95de 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -27,8 +27,8 @@ SECTIONS { /* Sections to be discarded */ /DISCARD/ : { - *(.exit.text) - *(.exit.data) + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) *(.IA_64.unwind.exit.text) *(.IA_64.unwind_info.exit.text) @@ -119,12 +119,12 @@ SECTIONS .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) - { *(.init.data) } + { INIT_DATA } #ifdef CONFIG_BLK_DEV_INITRD .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 942a8c7a4417..41b07854fcc6 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -76,10 +76,10 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } - .init.data : { *(.init.data) } + .init.data : { INIT_DATA } . = ALIGN(16); __setup_start = .; .init.setup : { *(.init.setup) } @@ -100,8 +100,8 @@ SECTIONS .altinstr_replacement : { *(.altinstr_replacement) } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : { *(.exit.text) } - .exit.data : { *(.exit.data) } + .exit.text : { EXIT_TEXT } + .exit.data : { EXIT_DATA } #ifdef CONFIG_BLK_DEV_INITRD . = ALIGN(4096); @@ -124,8 +124,8 @@ SECTIONS /* Sections to be discarded */ /DISCARD/ : { - *(.exit.text) - *(.exit.data) + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) } diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index 59fe285865ec..7537cc5e6159 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -45,10 +45,10 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } - .init.data : { *(.init.data) } + .init.data : { INIT_DATA } . = ALIGN(16); __setup_start = .; .init.setup : { *(.init.setup) } @@ -82,8 +82,8 @@ SECTIONS /* Sections to be discarded */ /DISCARD/ : { - *(.exit.text) - *(.exit.data) + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) } diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index 4adffefb5c48..cdc313e7c299 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -38,10 +38,10 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } - .init.data : { *(.init.data) } + .init.data : { INIT_DATA } . = ALIGN(16); __setup_start = .; .init.setup : { *(.init.setup) } @@ -77,8 +77,8 @@ __init_begin = .; /* Sections to be discarded */ /DISCARD/ : { - *(.exit.text) - *(.exit.data) + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) } diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S index 07a0055602f4..b44edb08e212 100644 --- a/arch/m68knommu/kernel/vmlinux.lds.S +++ b/arch/m68knommu/kernel/vmlinux.lds.S @@ -143,9 +143,9 @@ SECTIONS { . = ALIGN(4096); __init_begin = .; _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; - *(.init.data) + INIT_DATA . 
= ALIGN(16); __setup_start = .; *(.init.setup) @@ -170,8 +170,8 @@ SECTIONS { } > INIT /DISCARD/ : { - *(.exit.text) - *(.exit.data) + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) } diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 5fc2398bdb76..b5470ceb418b 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -114,11 +114,11 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } .init.data : { - *(.init.data) + INIT_DATA } . = ALIGN(16); .init.setup : { @@ -144,10 +144,10 @@ SECTIONS * references from .rodata */ .exit.text : { - *(.exit.text) + EXIT_TEXT } .exit.data : { - *(.exit.data) + EXIT_DATA } #if defined(CONFIG_BLK_DEV_INITRD) . = ALIGN(_PAGE_SIZE); diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 40d0ff9b81ab..50b4a3a25d0a 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -172,11 +172,11 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } .init.data : { - *(.init.data) + INIT_DATA } . = ALIGN(16); .init.setup : { @@ -215,10 +215,10 @@ SECTIONS * from .altinstructions and .eh_frame */ .exit.text : { - *(.exit.text) + EXIT_TEXT } .exit.data : { - *(.exit.data) + EXIT_DATA } #ifdef CONFIG_BLK_DEV_INITRD . = ALIGN(PAGE_SIZE); diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index f66fa5d966b0..0afb9e31d2a0 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -23,7 +23,7 @@ SECTIONS /* Sections to be discarded. */ /DISCARD/ : { *(.exitcall.exit) - *(.exit.data) + EXIT_DATA } . = KERNELBASE; @@ -76,17 +76,19 @@ SECTIONS .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } /* .exit.text is discarded at runtime, not link time, * to deal with references from __bug_table */ - .exit.text : { *(.exit.text) } + .exit.text : { + EXIT_TEXT + } .init.data : { - *(.init.data); + INIT_DATA __vtop_table_begin = .; *(.vtop_fixup); __vtop_table_end = .; diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S index 98c1212674f6..52b64fcbdfc5 100644 --- a/arch/ppc/kernel/vmlinux.lds.S +++ b/arch/ppc/kernel/vmlinux.lds.S @@ -97,14 +97,14 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } /* .exit.text is discarded at runtime, not link time, to deal with references from __bug_table */ - .exit.text : { *(.exit.text) } + .exit.text : { EXIT_TEXT } .init.data : { - *(.init.data); + INIT_DATA __vtop_table_begin = .; *(.vtop_fixup); __vtop_table_end = .; @@ -164,6 +164,6 @@ SECTIONS /* Sections to be discarded. */ /DISCARD/ : { *(.exitcall.exit) - *(.exit.data) + EXIT_DATA } } diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 936159199346..7d43c3cd3ef3 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -97,7 +97,7 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } /* @@ -105,11 +105,11 @@ SECTIONS * to deal with references from __bug_table */ .exit.text : { - *(.exit.text) + EXIT_TEXT } .init.data : { - *(.init.data) + INIT_DATA } . 
= ALIGN(0x100); .init.setup : { @@ -156,7 +156,7 @@ SECTIONS /* Sections to be discarded */ /DISCARD/ : { - *(.exit.data) + EXIT_DATA *(.exitcall.exit) } diff --git a/arch/sh/kernel/vmlinux_32.lds.S b/arch/sh/kernel/vmlinux_32.lds.S index d549fac6d3e7..c7113786ecd4 100644 --- a/arch/sh/kernel/vmlinux_32.lds.S +++ b/arch/sh/kernel/vmlinux_32.lds.S @@ -84,9 +84,9 @@ SECTIONS . = ALIGN(PAGE_SIZE); /* Init code and data */ __init_begin = .; _sinittext = .; - .init.text : { *(.init.text) } + .init.text : { INIT_TEXT } _einittext = .; - .init.data : { *(.init.data) } + .init.data : { INIT_DATA } . = ALIGN(16); __setup_start = .; @@ -122,8 +122,8 @@ SECTIONS * .exit.text is discarded at runtime, not link time, to deal with * references from __bug_table */ - .exit.text : { *(.exit.text) } - .exit.data : { *(.exit.data) } + .exit.text : { EXIT_TEXT } + .exit.data : { EXIT_DATA } . = ALIGN(PAGE_SIZE); .bss : { diff --git a/arch/sh/kernel/vmlinux_64.lds.S b/arch/sh/kernel/vmlinux_64.lds.S index 2fd0f7401484..3f1bd6392bb3 100644 --- a/arch/sh/kernel/vmlinux_64.lds.S +++ b/arch/sh/kernel/vmlinux_64.lds.S @@ -96,9 +96,9 @@ SECTIONS . = ALIGN(PAGE_SIZE); /* Init code and data */ __init_begin = .; _sinittext = .; - .init.text : C_PHYS(.init.text) { *(.init.text) } + .init.text : C_PHYS(.init.text) { INIT_TEXT } _einittext = .; - .init.data : C_PHYS(.init.data) { *(.init.data) } + .init.data : C_PHYS(.init.data) { INIT_DATA } . = ALIGN(L1_CACHE_BYTES); /* Better if Cache Line aligned */ __setup_start = .; .init.setup : C_PHYS(.init.setup) { *(.init.setup) } @@ -134,8 +134,8 @@ SECTIONS * .exit.text is discarded at runtime, not link time, to deal with * references from __bug_table */ - .exit.text : C_PHYS(.exit.text) { *(.exit.text) } - .exit.data : C_PHYS(.exit.data) { *(.exit.data) } + .exit.text : C_PHYS(.exit.text) { EXIT_TEXT } + .exit.data : C_PHYS(.exit.data) { EXIT_DATA } . = ALIGN(PAGE_SIZE); .bss : C_PHYS(.bss) { diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index a8b4200f9cc3..216147d6e61f 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -48,12 +48,12 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } __init_text_end = .; .init.data : { - *(.init.data) + INIT_DATA } . = ALIGN(16); .init.setup : { @@ -102,8 +102,8 @@ SECTIONS _end = . ; PROVIDE (end = .); /DISCARD/ : { - *(.exit.text) - *(.exit.data) + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) } diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S index 9fcd503bc04a..01f809617e5e 100644 --- a/arch/sparc64/kernel/vmlinux.lds.S +++ b/arch/sparc64/kernel/vmlinux.lds.S @@ -56,11 +56,11 @@ SECTIONS .init.text : { __init_begin = .; _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } .init.data : { - *(.init.data) + INIT_DATA } . = ALIGN(16); .init.setup : { @@ -137,8 +137,8 @@ SECTIONS PROVIDE (end = .); /DISCARD/ : { - *(.exit.text) - *(.exit.data) + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) } diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 3866f4960f04..26090b7f323e 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -17,7 +17,7 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } @@ -84,7 +84,7 @@ SECTIONS #include "asm/common.lds.S" - init.data : { *(.init.data) } + init.data : { INIT_DATA } /* Ensure the __preinit_array_start label is properly aligned. 
We could instead move the label definition inside the section, but diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 13df191e2b41..5828c1d54505 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -23,7 +23,7 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } . = ALIGN(4096); @@ -48,7 +48,7 @@ SECTIONS #include "asm/common.lds.S" - init.data : { *(init.data) } + init.data : { INIT_DATA } .data : { . = ALIGN(KERNEL_STACK_SIZE); /* init_task */ diff --git a/arch/v850/kernel/vmlinux.lds.S b/arch/v850/kernel/vmlinux.lds.S index 6172599b4ce2..d08cd1d27f27 100644 --- a/arch/v850/kernel/vmlinux.lds.S +++ b/arch/v850/kernel/vmlinux.lds.S @@ -114,7 +114,7 @@ #define DATA_CONTENTS \ __sdata = . ; \ DATA_DATA \ - *(.exit.data) /* 2.5 convention */ \ + EXIT_DATA /* 2.5 convention */ \ *(.data.exit) /* 2.4 convention */ \ . = ALIGN (16) ; \ *(.data.cacheline_aligned) \ @@ -157,9 +157,9 @@ . = ALIGN (4096) ; \ __init_start = . ; \ __sinittext = .; \ - *(.init.text) /* 2.5 convention */ \ + INIT_TEXT /* 2.5 convention */ \ __einittext = .; \ - *(.init.data) \ + INIT_DATA \ *(.text.init) /* 2.4 convention */ \ *(.data.init) \ INITCALL_CONTENTS \ @@ -170,7 +170,7 @@ #define ROMK_INIT_RAM_CONTENTS \ . = ALIGN (4096) ; \ __init_start = . ; \ - *(.init.data) /* 2.5 convention */ \ + INIT_DATA /* 2.5 convention */ \ *(.data.init) /* 2.4 convention */ \ __init_end = . ; \ . = ALIGN (4096) ; @@ -179,7 +179,7 @@ should go into ROM. */ #define ROMK_INIT_ROM_CONTENTS \ _sinittext = .; \ - *(.init.text) /* 2.5 convention */ \ + INIT_TEXT /* 2.5 convention */ \ _einittext = .; \ *(.text.init) /* 2.4 convention */ \ INITCALL_CONTENTS \ diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 7d72cce00529..84c913f38f98 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -131,10 +131,12 @@ SECTIONS .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { __init_begin = .; _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + } . = ALIGN(16); .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { __setup_start = .; @@ -169,8 +171,12 @@ SECTIONS } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } #if defined(CONFIG_BLK_DEV_INITRD) . = ALIGN(4096); .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index ba8ea97abd21..ea5386944e67 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -155,12 +155,15 @@ SECTIONS __init_begin = .; .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } - __initdata_begin = .; - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } - __initdata_end = .; + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + __initdata_begin = .; + INIT_DATA + __initdata_end = .; + } + . 
= ALIGN(16); __setup_start = .; .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } @@ -187,8 +190,12 @@ SECTIONS } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } /* vdso blob that is mapped into user space */ vdso_start = . ; diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index ac4ed52034db..7d0f55a4982d 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -136,13 +136,13 @@ SECTIONS __init_begin = .; .init.text : { _sinittext = .; - *(.init.literal) *(.init.text) + *(.init.literal) INIT_TEXT _einittext = .; } .init.data : { - *(.init.data) + INIT_DATA . = ALIGN(0x4); __tagtable_begin = .; *(.taglist) @@ -278,8 +278,9 @@ SECTIONS /* Sections to be discarded */ /DISCARD/ : { - *(.exit.literal .exit.text) - *(.exit.data) + *(.exit.literal) + EXIT_TEXT + EXIT_DATA *(.exitcall.exit) } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 9f584cc5c5fb..ae0166e83490 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -183,6 +183,13 @@ *(.kprobes.text) \ VMLINUX_SYMBOL(__kprobes_text_end) = .; +/* init and exit section handling */ +#define INIT_TEXT *(.init.text) +#define INIT_DATA *(.init.data) +#define EXIT_TEXT *(.exit.text) +#define EXIT_DATA *(.exit.data) + + /* DWARF debug sections. Symbols in the DWARF debugging sections are relative to the beginning of the section so we begin them at 0. */ -- cgit v1.2.3 From 213eca7f4888e9817e8076cdab6b9f7295c181f6 Mon Sep 17 00:00:00 2001 From: Greg KH Date: Wed, 30 Jan 2008 13:29:58 +0100 Subject: kobj: fix threshold_init_device/kobject_uevent_env oops The logic in this function is just crazy. It's recursive, but we can circumvent the creation of the kobject and the whole creation of the threshold_block if some conditions are met. That's why we see the allocate_threshold_blocks so many times in the callstack, yet only a few kobjects created. Then we blow up in kobject_uevent_env() on the first debug printk, which means that we are just passing in garbage. Man, this is one time that comments in code would have been very nice to have, and why forward gotos into major code blocks are just evil... Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 753588755fee..073afa7dd89a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -450,7 +450,8 @@ recurse: if (err) goto out_free; - kobject_uevent(&b->kobj, KOBJ_ADD); + if (b) + kobject_uevent(&b->kobj, KOBJ_ADD); return err; -- cgit v1.2.3 From b10db7f0d2b589a7f88dc3026e150756cb437a28 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 30 Jan 2008 13:30:00 +0100 Subject: time: more timer related cleanups I was confused by the FSEC = 10^15 NSEC statement, plus small whitespace fixes. When there's copyright, there should be GPL.
Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 3 ++- arch/x86/kernel/process_64.c | 2 +- kernel/softirq.c | 4 +++- kernel/time/tick-sched.c | 2 +- kernel/time/timer_stats.c | 2 +- 5 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 2f99ee206b95..9ec2ab793042 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -16,7 +16,8 @@ #define HPET_MASK CLOCKSOURCE_MASK(32) #define HPET_SHIFT 22 -/* FSEC = 10^-15 NSEC = 10^-9 */ +/* FSEC = 10^-15 + NSEC = 10^-9 */ #define FSEC_PER_NSEC 1000000 /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ab79e1dfa023..c2db7ef93565 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -207,7 +207,7 @@ static inline void play_dead(void) * low exit latency (ie sit in a loop waiting for * somebody to say that they'd like to reschedule) */ -void cpu_idle (void) +void cpu_idle(void) { current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ diff --git a/kernel/softirq.c b/kernel/softirq.c index bd89bc4eb0b9..8fe1ff40102d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -3,7 +3,9 @@ * * Copyright (C) 1992 Linus Torvalds * - * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * Distribute under GPLv2. + * + * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) */ #include diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d36ee2fd1a3b..49e12f6a4bab 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -9,7 +9,7 @@ * * Started by: Thomas Gleixner and Ingo Molnar * - * For licencing details see kernel-base/COPYING + * Distribute under GPLv2. */ #include #include diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index c36bb7ed0301..417da8c5bc72 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -26,7 +26,7 @@ * the pid and cmdline from the owner process if applicable. * * Start/stop data collection: - * # echo 1[0] >/proc/timer_stats + * # echo [1|0] >/proc/timer_stats * * Display the information collected so far: * # cat /proc/timer_stats -- cgit v1.2.3 From 316da3b3fc8efa9a5d2c99e0d449f01ff38c6aba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:02 +0100 Subject: x86: restrict PIT clocksource usage PIT clocksource is registered unconditionally even when HPET is enabled or when PIT is replaced by the local APIC timer. In both cases PIT can not be used as it is stopped and the readout would be stale. Prevent registering PIT in those cases. patch depends on: x86: offer is_hpet_enabled() on !CONFIG_HPET_TIMER too Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/i8253.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index a42c80745325..0f8f35458a8f 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -197,7 +197,15 @@ static struct clocksource clocksource_pit = { static int __init init_pit_clocksource(void) { - if (num_possible_cpus() > 1) /* PIT does not scale! 
*/ + /* + * Several reasons not to register PIT as a clocksource: + * + * - On SMP PIT does not scale due to i8253_lock + * - when HPET is enabled + * - when local APIC timer is active (PIT is switched off) + */ + if (num_possible_cpus() > 1 || is_hpet_enabled() || + pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) return 0; clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); -- cgit v1.2.3 From 4713e22ce81eb8b3353e16435362eb3d0ec95640 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:02 +0100 Subject: clocksource: add unregister function to disable unusable clocksources On x86 the PIT might become an unusable clocksource. Add an unregister function to provide a possibilty to remove the PIT from the list of available clock sources. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/i8253.c | 1 + include/linux/clocksource.h | 1 + kernel/time/clocksource.c | 15 +++++++++++++++ 3 files changed, 17 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 0f8f35458a8f..decc5d294d76 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -13,6 +13,7 @@ #include #include #include +#include DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 07b42153de24..85778a4b1209 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -215,6 +215,7 @@ static inline void clocksource_calculate_interval(struct clocksource *c, /* used to install a new clocksource */ extern int clocksource_register(struct clocksource*); +extern void clocksource_unregister(struct clocksource*); extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_resume(void); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index edd5ef8e1765..6e9259a5d501 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -337,6 +337,21 @@ void clocksource_change_rating(struct clocksource *cs, int rating) spin_unlock_irqrestore(&clocksource_lock, flags); } +/** + * clocksource_unregister - remove a registered clocksource + */ +void clocksource_unregister(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + list_del(&cs->list); + if (clocksource_override == cs) + clocksource_override = NULL; + next_clocksource = select_clocksource(); + spin_unlock_irqrestore(&clocksource_lock, flags); +} + #ifdef CONFIG_SYSFS /** * sysfs_show_current_clocksources - sysfs interface for current clocksource -- cgit v1.2.3 From 1a0c009ac53de4a7664a1239936f0bc258133156 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:03 +0100 Subject: x86: unregister PIT clocksource when PIT is disabled The following scenario might leave PIT as a disfunctional clock source: PIT is registered as clocksource PM_TIMER is registered as clocksource and enables highres/dyntick mode PIT is switched to oneshot mode -> now the readout of PIT is bogus, but the user might select PIT via the sysfs override, which would break the box as the time readout is unusable. Unregister the PIT clocksource when the PIT clock event device is switched into shutdown / oneshot mode. 
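The mechanics of the unregister path are simply: take the clocksource off the list, forget any override that pointed at it, and re-run best-rating selection. A self-contained sketch of that idea (names and ratings are illustrative; the kernel version additionally holds clocksource_lock and clears the sysfs override):

#include <stdio.h>
#include <stddef.h>

struct cs {
	const char *name;
	int rating;
	struct cs *next;
};

static struct cs *head;

static void cs_register(struct cs *c)
{
	c->next = head;
	head = c;
}

static void cs_unregister(struct cs *c)
{
	struct cs **p;

	for (p = &head; *p; p = &(*p)->next) {
		if (*p == c) {
			*p = c->next;		/* list_del() equivalent */
			break;
		}
	}
}

static struct cs *select_best(void)
{
	struct cs *c, *best = NULL;

	for (c = head; c; c = c->next)
		if (!best || c->rating > best->rating)
			best = c;
	return best;
}

int main(void)
{
	struct cs pit = { "pit", 110, NULL };
	struct cs hpet = { "hpet", 250, NULL };

	cs_register(&pit);
	cs_register(&hpet);
	printf("selected: %s\n", select_best()->name);	/* hpet */

	cs_unregister(&hpet);		/* drop a source that became unusable */
	printf("selected: %s\n", select_best()->name);	/* back to pit */
	return 0;
}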
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/i8253.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index decc5d294d76..377c3f8411f8 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -18,6 +18,12 @@ DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); +#ifdef CONFIG_X86_32 +static void pit_disable_clocksource(void); +#else +static inline void pit_disable_clocksource(void) { } +#endif + /* * HPET replaces the PIT, when enabled. So we need to know, which of * the two timers is used @@ -52,11 +58,13 @@ static void init_pit_timer(enum clock_event_mode mode, outb_p(0, PIT_CH0); outb_p(0, PIT_CH0); } + pit_disable_clocksource(); break; case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ outb_p(0x38, PIT_MODE); + pit_disable_clocksource(); break; case CLOCK_EVT_MODE_RESUME: @@ -196,6 +204,17 @@ static struct clocksource clocksource_pit = { .shift = 20, }; +static void pit_disable_clocksource(void) +{ + /* + * Use mult to check whether it is registered or not + */ + if (clocksource_pit.mult) { + clocksource_unregister(&clocksource_pit); + clocksource_pit.mult = 0; + } +} + static int __init init_pit_clocksource(void) { /* -- cgit v1.2.3 From e3f37a54f690d3e64995ea7ecea08c5ab3070faf Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Wed, 30 Jan 2008 13:30:03 +0100 Subject: x86: assign IRQs to HPET timers The userspace API for the HPET (see Documentation/hpet.txt) did not work. The HPET_IE_ON ioctl was failing as there was no IRQ assigned to the timer device. This patch fixes it by allocating IRQs to timer blocks in the HPET. arch/x86/kernel/hpet.c | 13 +++++-------- drivers/char/hpet.c | 45 ++++++++++++++++++++++++++++++++++++++------- include/linux/hpet.h | 2 +- 3 files changed, 44 insertions(+), 16 deletions(-) Signed-off-by: Balaji Rao Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 13 +++++-------- drivers/char/hpet.c | 45 ++++++++++++++++++++++++++++++++++++++------- include/linux/hpet.h | 2 +- 3 files changed, 44 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 9ec2ab793042..786aa227afdf 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -117,8 +117,7 @@ int is_hpet_enabled(void) static void hpet_reserve_platform_timers(unsigned long id) { struct hpet __iomem *hpet = hpet_virt_address; - struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; - unsigned int nrtimers, i; + unsigned int nrtimers; struct hpet_data hd; nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; @@ -133,16 +132,14 @@ static void hpet_reserve_platform_timers(unsigned long id) #ifdef CONFIG_HPET_EMULATE_RTC hpet_reserve_timer(&hd, 1); #endif - hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; - for (i = 2; i < nrtimers; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - + /* + * IRQs for the other timers are assigned dynamically + * in hpet_alloc + */ hpet_alloc(&hd); - } #else static void hpet_reserve_platform_timers(unsigned long id) { } diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 4c16778e3f84..593b32cfbc33 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -806,14 +806,14 @@ static unsigned long hpet_calibrate(struct hpets *hpetp) int hpet_alloc(struct hpet_data *hdp) { - u64 cap, mcfg; + u64 cap, mcfg, hpet_config; struct 
hpet_dev *devp; - u32 i, ntimer; + u32 i, ntimer, irq; struct hpets *hpetp; size_t siz; struct hpet __iomem *hpet; static struct hpets *last = NULL; - unsigned long period; + unsigned long period, irq_bitmap; unsigned long long temp; /* @@ -840,11 +840,41 @@ int hpet_alloc(struct hpet_data *hdp) hpetp->hp_hpet_phys = hdp->hd_phys_address; hpetp->hp_ntimer = hdp->hd_nirqs; + hpet = hpetp->hp_hpet; - for (i = 0; i < hdp->hd_nirqs; i++) - hpetp->hp_dev[i].hd_hdwirq = hdp->hd_irq[i]; + /* Assign IRQs statically for legacy devices */ + hpetp->hp_dev[0].hd_hdwirq = hdp->hd_irq[0]; + hpetp->hp_dev[1].hd_hdwirq = hdp->hd_irq[1]; - hpet = hpetp->hp_hpet; + /* Assign IRQs dynamically for the others */ + for (i = 2, devp = &hpetp->hp_dev[2]; i < hdp->hd_nirqs; i++, devp++) { + struct hpet_timer __iomem *timer; + + timer = &hpet->hpet_timers[devp - hpetp->hp_dev]; + + hpet_config = readq(&timer->hpet_config); + irq_bitmap = (hpet_config & Tn_INT_ROUTE_CAP_MASK) + >> Tn_INT_ROUTE_CAP_SHIFT; + if (!irq_bitmap) + irq = 0; /* No valid IRQ Assignable */ + else { + irq = find_first_bit(&irq_bitmap, 32); + do { + hpet_config |= irq << Tn_INT_ROUTE_CNF_SHIFT; + writeq(hpet_config, &timer->hpet_config); + + /* + * Verify whether we have written a valid + * IRQ number by reading it back again + */ + hpet_config = readq(&timer->hpet_config); + if (irq == (hpet_config & Tn_INT_ROUTE_CNF_MASK) + >> Tn_INT_ROUTE_CNF_SHIFT) + break; /* Success */ + } while ((irq = (find_next_bit(&irq_bitmap, 32, irq)))); + } + hpetp->hp_dev[i].hd_hdwirq = irq; + } cap = readq(&hpet->hpet_cap); @@ -875,7 +905,8 @@ int hpet_alloc(struct hpet_data *hdp) hpetp->hp_which, hdp->hd_phys_address, hpetp->hp_ntimer > 1 ? "s" : ""); for (i = 0; i < hpetp->hp_ntimer; i++) - printk("%s %d", i > 0 ? "," : "", hdp->hd_irq[i]); + printk("%s %d", i > 0 ? "," : "", + hpetp->hp_dev[i].hd_hdwirq); printk("\n"); printk(KERN_INFO "hpet%u: %u %d-bit timers, %Lu Hz\n", diff --git a/include/linux/hpet.h b/include/linux/hpet.h index 707f7cb9e795..e3c0b2aa944c 100644 --- a/include/linux/hpet.h +++ b/include/linux/hpet.h @@ -64,7 +64,7 @@ struct hpet { */ #define Tn_INT_ROUTE_CAP_MASK (0xffffffff00000000ULL) -#define Tn_INI_ROUTE_CAP_SHIFT (32UL) +#define Tn_INT_ROUTE_CAP_SHIFT (32UL) #define Tn_FSB_INT_DELCAP_MASK (0x8000UL) #define Tn_FSB_INT_DELCAP_SHIFT (15) #define Tn_FSB_EN_CNF_MASK (0x4000UL) -- cgit v1.2.3 From 37a47db8d7f0f38dac5acf5a13abbc8f401707fa Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Wed, 30 Jan 2008 13:30:03 +0100 Subject: x86: assign IRQs to HPET timers, fix Looks like IRQ 31 is assigned to timer 3, even without the patch! I wonder who wrote the number 31. But the manual says that it is zero by default. I think we should check whether the timer has been allocated an IRQ before proceeding to assign one to it. Here is a patch that does this. 
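The essential trick in the hpet_alloc() change is the write-then-read-back verification: the idea is that Tn_INT_ROUTE_CNF will not hold a routing the timer does not support, so a readback mismatch means "try the next capability bit". A userspace simulation of that loop, where the "hardware" is just a variable that accepts a made-up subset of the capability bits:

#include <stdio.h>
#include <stdint.h>

static uint32_t hw_accepts = (1u << 20) | (1u << 21);	/* hypothetical */
static uint32_t route_cnf;		/* models Tn_INT_ROUTE_CNF */

static void write_cnf(uint32_t irq)
{
	/* real hardware keeps the value only for supported routings */
	if (hw_accepts & (1u << irq))
		route_cnf = irq;
}

int main(void)
{
	uint32_t route_cap = (1u << 19) | (1u << 20) | (1u << 21);
	uint32_t irq;

	/* IRQ 0 means "unassigned", so start at 1 */
	for (irq = 1; irq < 32; irq++) {
		if (!(route_cap & (1u << irq)))
			continue;
		write_cnf(irq);
		if (route_cnf == irq) {		/* read back and verify */
			printf("assigned IRQ %u\n", irq);
			return 0;
		}
	}
	printf("no valid IRQ assignable\n");
	return 1;
}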
Signed-off-by: Balaji Rao Tested-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 10 +++++----- drivers/char/hpet.c | 6 ++++++ 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 786aa227afdf..a3c56c9b8a02 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -117,7 +117,8 @@ int is_hpet_enabled(void) static void hpet_reserve_platform_timers(unsigned long id) { struct hpet __iomem *hpet = hpet_virt_address; - unsigned int nrtimers; + struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; + unsigned int nrtimers, i; struct hpet_data hd; nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; @@ -135,10 +136,9 @@ static void hpet_reserve_platform_timers(unsigned long id) hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; - /* - * IRQs for the other timers are assigned dynamically - * in hpet_alloc - */ + for (i = 2; i < nrtimers; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; hpet_alloc(&hd); } #else diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 593b32cfbc33..22f5fd02ea87 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -852,6 +852,12 @@ int hpet_alloc(struct hpet_data *hdp) timer = &hpet->hpet_timers[devp - hpetp->hp_dev]; + /* Check if there's already an IRQ assigned to the timer */ + if (hdp->hd_irq[i]) { + hpetp->hp_dev[i].hd_hdwirq = hdp->hd_irq[i]; + continue; + } + hpet_config = readq(&timer->hpet_config); irq_bitmap = (hpet_config & Tn_INT_ROUTE_CAP_MASK) >> Tn_INT_ROUTE_CAP_SHIFT; -- cgit v1.2.3 From 5c9c9bec0589be696c70c5efb448b17d5ab720e2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 30 Jan 2008 13:30:04 +0100 Subject: x86: hibernation: document __save_processor_state() on x86 Document the fact that __save_processor_state() has to save all CPU registers referred to by the kernel in case a different kernel is used to load and restore a hibernation image containing it. Sigend-off-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/suspend_64.c | 20 ++++++++++++++++++++ include/asm-x86/suspend_64.h | 9 ++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c index 2e5efaaf8800..569f1b540e36 100644 --- a/arch/x86/kernel/suspend_64.c +++ b/arch/x86/kernel/suspend_64.c @@ -19,6 +19,21 @@ extern const void __nosave_begin, __nosave_end; struct saved_context saved_context; +/** + * __save_processor_state - save CPU registers before creating a + * hibernation image and before restoring the memory state from it + * @ctxt - structure to store the registers contents in + * + * NOTE: If there is a CPU register the modification of which by the + * boot kernel (ie. the kernel used for loading the hibernation image) + * might affect the operations of the restored target kernel (ie. the one + * saved in the hibernation image), then its contents must be saved by this + * function. In other words, if kernel A is hibernated and different + * kernel B is used for loading the hibernation image into memory, the + * kernel A's __save_processor_state() function must save all registers + * needed by kernel A, so that it can operate correctly after the resume + * regardless of what kernel B does in the meantime. 
+ */ void __save_processor_state(struct saved_context *ctxt) { kernel_fpu_begin(); @@ -69,6 +84,11 @@ static void do_fpu_end(void) kernel_fpu_end(); } +/** + * __restore_processor_state - restore the contents of CPU registers saved + * by __save_processor_state() + * @ctxt - structure to load the registers contents from + */ void __restore_processor_state(struct saved_context *ctxt) { /* diff --git a/include/asm-x86/suspend_64.h b/include/asm-x86/suspend_64.h index c505a76bcf6e..4404668f9aa4 100644 --- a/include/asm-x86/suspend_64.h +++ b/include/asm-x86/suspend_64.h @@ -15,7 +15,14 @@ arch_prepare_suspend(void) return 0; } -/* Image of the saved processor state. If you touch this, fix acpi/wakeup.S. */ +/* + * Image of the saved processor state, used by the low level ACPI suspend to + * RAM code and by the low level hibernation code. + * + * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that + * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c, + * still work as required. + */ struct saved_context { struct pt_regs regs; u16 ds, es, fs, gs, ss; -- cgit v1.2.3 From 4c6b8b4d62fb4cb843c32db71e0a8301039908f3 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 30 Jan 2008 13:30:04 +0100 Subject: x86: fix: s2ram + P4 + tsc = annoyance s2ram recently became useful here, except for the kernel's annoying habit of disabling my P4's perfectly good TSC. [ 107.894470] CPU 1 is now offline [ 107.894474] SMP alternatives: switching to UP code [ 107.895832] CPU0 attaching sched-domain: [ 107.895836] domain 0: span 1 [ 107.895838] groups: 1 [ 107.896097] CPU1 is down [ 3.726156] Intel machine check architecture supported. [ 3.726165] Intel machine check reporting enabled on CPU#0. [ 3.726167] CPU0: Intel P4/Xeon Extended MCE MSRs (12) available [ 3.726170] CPU0: Thermal monitoring enabled [ 3.726175] Back to C! [ 3.726708] Force enabled HPET at resume [ 3.726775] Enabling non-boot CPUs ... [ 3.727049] CPU0 attaching NULL sched-domain. [ 3.727165] SMP alternatives: switching to SMP code [ 3.727858] Booting processor 1/1 eip 3000 [ 3.727862] CPU 1 irqstacks, hard=b042f000 soft=b042d000 [ 3.738173] Initializing CPU#1 [ 3.798912] Calibrating delay using timer specific routine.. 5986.12 BogoMIPS (lpj=2993061) [ 3.798920] CPU: After generic identify, caps: bfebfbff 00000000 00000000 00000000 00004400 00000000 00000000 00000000 [ 3.798931] CPU: Trace cache: 12K uops, L1 D cache: 8K [ 3.798934] CPU: L2 cache: 512K [ 3.798936] CPU: Physical Processor ID: 0 [ 3.798938] CPU: After all inits, caps: bfebfbff 00000000 00000000 0000b080 00004400 00000000 00000000 00000000 [ 3.798946] Intel machine check architecture supported. [ 3.798952] Intel machine check reporting enabled on CPU#1. [ 3.798955] CPU1: Intel P4/Xeon Extended MCE MSRs (12) available [ 3.798959] CPU1: Thermal monitoring enabled [ 3.799161] CPU1: Intel(R) Pentium(R) 4 CPU 3.00GHz stepping 09 [ 3.799187] checking TSC synchronization [CPU#0 -> CPU#1]: [ 3.819181] Measured 63588552840 cycles TSC warp between CPUs, turning off TSC clock. [ 3.819184] Marking TSC unstable due to: check_tsc_sync_source failed. If check_tsc_warp() is called after initial boot, and the TSC has in the meantime been set (BIOS, user, silicon, elves) to a value lower than the last stored/stale value, we blame the TSC. Reset to pristine condition after every test. 
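The fix is the classic "reset all static test state after every run, pass or fail" rule. A compact userspace model of the failure mode (sample values are invented; in the kernel they would be TSC reads):

#include <stdio.h>

static long long last_tsc;
static int nr_warps;

/* feed a batch of samples through the warp check, one run per "boot" */
static void check_warp(const long long *samples, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (last_tsc && samples[i] < last_tsc)
			nr_warps++;
		last_tsc = samples[i];
	}
}

static int run_test(const long long *samples, int n)
{
	int failed;

	check_warp(samples, n);
	failed = nr_warps != 0;

	/* reset to pristine condition after every test, pass or fail */
	nr_warps = 0;
	last_tsc = 0;
	return failed;
}

int main(void)
{
	long long boot[]   = { 100, 200, 300 };
	long long resume[] = { 10, 20, 30 };	/* TSC restarted lower */

	printf("boot:   %s\n", run_test(boot, 3)   ? "warp" : "ok");
	/*
	 * Without the reset above, last_tsc would still be 300 here and
	 * the perfectly monotonic resume samples would be blamed.
	 */
	printf("resume: %s\n", run_test(resume, 3) ? "warp" : "ok");
	return 0;
}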
Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_sync.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9125efe66a06..05d8f25de6ae 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -129,23 +129,23 @@ void __cpuinit check_tsc_sync_source(int cpu) while (atomic_read(&stop_count) != cpus-1) cpu_relax(); - /* - * Reset it - just in case we boot another CPU later: - */ - atomic_set(&start_count, 0); - if (nr_warps) { printk("\n"); printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," " turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); - nr_warps = 0; - max_warp = 0; - last_tsc = 0; } else { printk(" passed.\n"); } + /* + * Reset it - just in case we boot another CPU later: + */ + atomic_set(&start_count, 0); + nr_warps = 0; + max_warp = 0; + last_tsc = 0; + /* * Let the target continue with the bootup: */ -- cgit v1.2.3 From b02aae9cf52956dfe1bec73f77f81a3d05d3902b Mon Sep 17 00:00:00 2001 From: Rene Herman Date: Wed, 30 Jan 2008 13:30:05 +0100 Subject: x86: provide a DMI based port 0x80 I/O delay override. x86: provide a DMI based port 0x80 I/O delay override. Certain (HP) laptops experience trouble from our port 0x80 I/O delay writes. This patch provides for a DMI based switch to the "alternate diagnostic port" 0xed (as used by some BIOSes as well) for these. David P. Reed confirmed that port 0xed works for him and provides a proper delay. The symptoms of _not_ working are a hanging machine, with "hwclock" use being a direct trigger. Earlier versions of this attempted to simply use udelay(2), with the 2 being a value tested to be a nicely conservative upper-bound with help from many on the linux-kernel mailinglist but that approach has two problems. First, pre-loops_per_jiffy calibration (which is post PIT init while some implementations of the PIT are actually one of the historically problematic devices that need the delay) udelay() isn't particularly well-defined. We could initialise loops_per_jiffy conservatively (and based on CPU family so as to not unduly delay old machines) which would sort of work, but... Second, delaying isn't the only effect that a write to port 0x80 has. It's also a PCI posting barrier which some devices may be explicitly or implicitly relying on. Alan Cox did a survey and found evidence that additionally some drivers may be racy on SMP without the bus locking outb. Switching to an inb() makes the timing too unpredictable and as such, this DMI based switch should be the safest approach for now. Any more invasive changes should get more rigid testing first. It's moreover only very few machines with the problem and a DMI based hack seems to fit that situation. This also introduces a command-line parameter "io_delay" to override the DMI based choice again: io_delay= where "standard" means using the standard port 0x80 and "alternate" port 0xed. This retains the udelay method as a config (CONFIG_UDELAY_IO_DELAY) and command-line ("io_delay=udelay") choice for testing purposes as well. This does not change the io_delay() in the boot code which is using the same port 0x80 I/O delay but those do not appear to be a problem as David P. Reed reported the problem was already gone after using the udelay version. 
He moreover reported that booting with "acpi=off" also fixed things and seeing as how ACPI isn't touched until after this DMI based I/O port switch I believe it's safe to leave the ones in the boot code be. The DMI strings from David's HP Pavilion dv9000z are in there already and we need to get/verify the DMI info from other machines with the problem, notably the HP Pavilion dv6000z. This patch is partly based on earlier patches from Pavel Machek and David P. Reed. Signed-off-by: Rene Herman Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- Documentation/kernel-parameters.txt | 8 +++ arch/x86/Kconfig.debug | 9 +++ arch/x86/boot/compressed/misc_32.c | 8 +-- arch/x86/boot/compressed/misc_64.c | 8 +-- arch/x86/kernel/Makefile_32 | 2 +- arch/x86/kernel/Makefile_64 | 2 +- arch/x86/kernel/io_delay.c | 106 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup_32.c | 2 + arch/x86/kernel/setup_64.c | 2 + include/asm-x86/io_32.h | 8 ++- include/asm-x86/io_64.h | 33 +++++++---- 11 files changed, 165 insertions(+), 23 deletions(-) create mode 100644 arch/x86/kernel/io_delay.c (limited to 'arch/x86/kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 880f882160e2..9e6056058425 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -794,6 +794,14 @@ and is between 256 and 4096 characters. It is defined in the file for translation below 32 bit and if not available then look in the higher range. + io_delay= [X86-32,X86-64] I/O delay method + standard + Standard port 0x80 delay + alternate + Alternate port 0xed delay + udelay + Simple two microsecond delay + io7= [HW] IO7 for Marvel based alpha systems See comment before marvel_specify_io7 in arch/alpha/kernel/core_marvel.c. diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 761ca7b5f120..40aba670fb37 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -112,4 +112,13 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. +config UDELAY_IO_DELAY + bool "Delay I/O through udelay instead of outb" + depends on DEBUG_KERNEL + help + Make inb_p/outb_p use udelay() based delays by default. Please note + that udelay() does not have the same bus-level side-effects that + the normal outb based delay does meaning this could cause drivers + to change behaviour and/or bugs to surface. 
+ endmenu diff --git a/arch/x86/boot/compressed/misc_32.c b/arch/x86/boot/compressed/misc_32.c index b74d60d1b2fa..288e16283ef9 100644 --- a/arch/x86/boot/compressed/misc_32.c +++ b/arch/x86/boot/compressed/misc_32.c @@ -276,10 +276,10 @@ static void putstr(const char *s) RM_SCREEN_INFO.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ - outb_p(14, vidport); - outb_p(0xff & (pos >> 9), vidport+1); - outb_p(15, vidport); - outb_p(0xff & (pos >> 1), vidport+1); + outb(14, vidport); + outb(0xff & (pos >> 9), vidport+1); + outb(15, vidport); + outb(0xff & (pos >> 1), vidport+1); } static void* memset(void* s, int c, unsigned n) diff --git a/arch/x86/boot/compressed/misc_64.c b/arch/x86/boot/compressed/misc_64.c index 6ea015aa65e4..43e5fcc37be9 100644 --- a/arch/x86/boot/compressed/misc_64.c +++ b/arch/x86/boot/compressed/misc_64.c @@ -269,10 +269,10 @@ static void putstr(const char *s) RM_SCREEN_INFO.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ - outb_p(14, vidport); - outb_p(0xff & (pos >> 9), vidport+1); - outb_p(15, vidport); - outb_p(0xff & (pos >> 1), vidport+1); + outb(14, vidport); + outb(0xff & (pos >> 9), vidport+1); + outb(15, vidport); + outb(0xff & (pos >> 1), vidport+1); } static void* memset(void* s, int c, unsigned n) diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32 index a7bc93c27662..0cc1981d1e38 100644 --- a/arch/x86/kernel/Makefile_32 +++ b/arch/x86/kernel/Makefile_32 @@ -8,7 +8,7 @@ CPPFLAGS_vmlinux.lds += -Ui386 obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \ ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \ pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\ - quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o + quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o io_delay.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64 index 5a88890d8ee9..08a68f0d8fda 100644 --- a/arch/x86/kernel/Makefile_64 +++ b/arch/x86/kernel/Makefile_64 @@ -11,7 +11,7 @@ obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \ x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \ setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \ pci-dma_64.o pci-nommu_64.o alternative.o hpet.o tsc_64.o bugs_64.o \ - i8253.o + i8253.o io_delay.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c new file mode 100644 index 000000000000..4d955e74b974 --- /dev/null +++ b/arch/x86/kernel/io_delay.c @@ -0,0 +1,106 @@ +/* + * I/O delay strategies for inb_p/outb_p + */ +#include +#include +#include +#include +#include +#include + +/* + * Allow for a DMI based override of port 0x80 needed for certain HP laptops + */ +#define IO_DELAY_PORT_STD 0x80 +#define IO_DELAY_PORT_ALT 0xed + +static void standard_io_delay(void) +{ + asm volatile ("outb %%al, %0" : : "N" (IO_DELAY_PORT_STD)); +} + +static void alternate_io_delay(void) +{ + asm volatile ("outb %%al, %0" : : "N" (IO_DELAY_PORT_ALT)); +} + +/* + * 2 usecs is an upper-bound for the outb delay but note that udelay doesn't + * have the bus-level side-effects that outb does + */ +#define IO_DELAY_USECS 2 + +/* + * High on a hill was a lonely goatherd + */ +static void udelay_io_delay(void) +{ + udelay(IO_DELAY_USECS); +} + +#ifndef CONFIG_UDELAY_IO_DELAY +static void (*io_delay)(void) = standard_io_delay; +#else +static void (*io_delay)(void) = udelay_io_delay; +#endif + 
+/* + * Paravirt wants native_io_delay to be a constant. + */ +void native_io_delay(void) +{ + io_delay(); +} +EXPORT_SYMBOL(native_io_delay); + +#ifndef CONFIG_UDELAY_IO_DELAY +static int __init dmi_alternate_io_delay_port(const struct dmi_system_id *id) +{ + printk(KERN_NOTICE "%s: using alternate I/O delay port\n", id->ident); + io_delay = alternate_io_delay; + return 0; +} + +static struct dmi_system_id __initdata alternate_io_delay_port_dmi_table[] = { + { + .callback = dmi_alternate_io_delay_port, + .ident = "HP Pavilion dv9000z", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B9") + } + }, + { + } +}; + +static int __initdata io_delay_override; + +void __init io_delay_init(void) +{ + if (!io_delay_override) + dmi_check_system(alternate_io_delay_port_dmi_table); +} +#endif + +static int __init io_delay_param(char *s) +{ + if (!s) + return -EINVAL; + + if (!strcmp(s, "standard")) + io_delay = standard_io_delay; + else if (!strcmp(s, "alternate")) + io_delay = alternate_io_delay; + else if (!strcmp(s, "udelay")) + io_delay = udelay_io_delay; + else + return -EINVAL; + +#ifndef CONFIG_UDELAY_IO_DELAY + io_delay_override = 1; +#endif + return 0; +} + +early_param("io_delay", io_delay_param); diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 9c24b45b513c..51bdc0b1b72e 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -648,6 +648,8 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); + io_delay_init();; + #ifdef CONFIG_X86_GENERICARCH generic_apic_probe(); #endif diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 30d94d1d5f5f..ec976edf0399 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -311,6 +311,8 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); + io_delay_init(); + #ifdef CONFIG_SMP /* setup to use the static apicid table during kernel startup */ x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init; diff --git a/include/asm-x86/io_32.h b/include/asm-x86/io_32.h index fe881cd1e6f4..a8d25c38b91c 100644 --- a/include/asm-x86/io_32.h +++ b/include/asm-x86/io_32.h @@ -250,10 +250,14 @@ static inline void flush_write_buffers(void) #endif /* __KERNEL__ */ -static inline void native_io_delay(void) +#ifndef CONFIG_UDELAY_IO_DELAY +extern void io_delay_init(void); +#else +static inline void io_delay_init(void) { - asm volatile("outb %%al,$0x80" : : : "memory"); } +#endif +extern void native_io_delay(void); #if defined(CONFIG_PARAVIRT) #include diff --git a/include/asm-x86/io_64.h b/include/asm-x86/io_64.h index a037b0794332..5bebaf961692 100644 --- a/include/asm-x86/io_64.h +++ b/include/asm-x86/io_64.h @@ -35,13 +35,24 @@ * - Arnaldo Carvalho de Melo */ -#define __SLOW_DOWN_IO "\noutb %%al,$0x80" +#ifndef CONFIG_UDELAY_IO_DELAY +extern void io_delay_init(void); +#else +static inline void io_delay_init(void) +{ +} +#endif +extern void native_io_delay(void); +static inline void slow_down_io(void) +{ + native_io_delay(); #ifdef REALLY_SLOW_IO -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO -#else -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO + native_io_delay(); + native_io_delay(); + native_io_delay(); #endif +} /* * Talk about misusing macros.. 
@@ -50,21 +61,21 @@ static inline void out##s(unsigned x value, unsigned short port) { #define __OUT2(s,s1,s2) \ -__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1" +__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1" : : "a" (value), "Nd" (port)) #define __OUT(s,s1,x) \ -__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \ -__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \ +__OUT1(s,x) __OUT2(s,s1,"w"); } \ +__OUT1(s##_p,x) __OUT2(s,s1,"w"); slow_down_io(); } #define __IN1(s) \ static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v; #define __IN2(s,s1,s2) \ -__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0" +__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0" : "=a" (_v) : "Nd" (port)) -#define __IN(s,s1,i...) \ -__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ -__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ +#define __IN(s,s1) \ +__IN1(s) __IN2(s,s1,"w"); return _v; } \ +__IN1(s##_p) __IN2(s,s1,"w"); slow_down_io(); return _v; } #define __INS(s) \ static inline void ins##s(unsigned short port, void * addr, unsigned long count) \ -- cgit v1.2.3 From 6e7c402590b75b6b45138792445ee0f0315a8473 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jan 2008 13:30:05 +0100 Subject: x86: various changes and cleanups to in_p/out_p delay details various changes to the in_p/out_p delay details: - add the io_delay=none method - make each method selectable from the kernel config - simplify the delay code a bit by getting rid of an indirect function call - add the /proc/sys/kernel/io_delay_type sysctl - change 'io_delay=standard|alternate' to io_delay=0x80 and io_delay=0xed - make the io delay config not depend on CONFIG_DEBUG_KERNEL Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Tested-by: "David P. Reed" --- Documentation/kernel-parameters.txt | 12 ++-- arch/x86/Kconfig.debug | 79 ++++++++++++++++++++++++--- arch/x86/kernel/io_delay.c | 106 +++++++++++++++++------------------- include/asm-x86/io_32.h | 10 +--- include/asm-x86/io_64.h | 10 +--- kernel/sysctl.c | 9 +++ 6 files changed, 143 insertions(+), 83 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9e6056058425..b427b7c0e5d0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -795,12 +795,14 @@ and is between 256 and 4096 characters. It is defined in the file then look in the higher range. io_delay= [X86-32,X86-64] I/O delay method - standard - Standard port 0x80 delay - alternate - Alternate port 0xed delay + 0x80 + Standard port 0x80 based delay + 0xed + Alternate port 0xed based delay (needed on some systems) udelay - Simple two microsecond delay + Simple two microseconds delay + none + No delay io7= [HW] IO7 for Marvel based alpha systems See comment before marvel_specify_io7 in diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 40aba670fb37..77eda46f97b8 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -112,13 +112,78 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. 
-config UDELAY_IO_DELAY - bool "Delay I/O through udelay instead of outb" - depends on DEBUG_KERNEL +# +# IO delay types: +# + +config IO_DELAY_TYPE_0X80 + int + default "0" + +config IO_DELAY_TYPE_0XED + int + default "1" + +config IO_DELAY_TYPE_UDELAY + int + default "2" + +config IO_DELAY_TYPE_NONE + int + default "3" + +choice + prompt "IO delay type" + default IO_DELAY_0X80 + +config IO_DELAY_0X80 + bool "port 0x80 based port-IO delay [recommended]" + help + This is the traditional Linux IO delay used for in/out_p. + It is the most tested hence safest selection here. + +config IO_DELAY_0XED + bool "port 0xed based port-IO delay" + help + Use port 0xed as the IO delay. This frees up port 0x80 which is + often used as a hardware-debug port. + +config IO_DELAY_UDELAY + bool "udelay based port-IO delay" + help + Use udelay(2) as the IO delay method. This provides the delay + while not having any side-effect on the IO port space. + +config IO_DELAY_NONE + bool "no port-IO delay" help - Make inb_p/outb_p use udelay() based delays by default. Please note - that udelay() does not have the same bus-level side-effects that - the normal outb based delay does meaning this could cause drivers - to change behaviour and/or bugs to surface. + No port-IO delay. Will break on old boxes that require port-IO + delay for certain operations. Should work on most new machines. + +endchoice + +if IO_DELAY_0X80 +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_0X80 +endif + +if IO_DELAY_0XED +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_0XED +endif + +if IO_DELAY_UDELAY +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_UDELAY +endif + +if IO_DELAY_NONE +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_NONE +endif endmenu diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 4d955e74b974..f052e34dc94c 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -1,5 +1,9 @@ /* * I/O delay strategies for inb_p/outb_p + * + * Allow for a DMI based override of port 0x80, needed for certain HP laptops + * and possibly other systems. Also allow for the gradual elimination of + * outb_p/inb_p API uses. */ #include #include @@ -8,98 +12,86 @@ #include #include -/* - * Allow for a DMI based override of port 0x80 needed for certain HP laptops - */ -#define IO_DELAY_PORT_STD 0x80 -#define IO_DELAY_PORT_ALT 0xed - -static void standard_io_delay(void) -{ - asm volatile ("outb %%al, %0" : : "N" (IO_DELAY_PORT_STD)); -} - -static void alternate_io_delay(void) -{ - asm volatile ("outb %%al, %0" : : "N" (IO_DELAY_PORT_ALT)); -} - -/* - * 2 usecs is an upper-bound for the outb delay but note that udelay doesn't - * have the bus-level side-effects that outb does - */ -#define IO_DELAY_USECS 2 - -/* - * High on a hill was a lonely goatherd - */ -static void udelay_io_delay(void) -{ - udelay(IO_DELAY_USECS); -} +int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; +EXPORT_SYMBOL_GPL(io_delay_type); -#ifndef CONFIG_UDELAY_IO_DELAY -static void (*io_delay)(void) = standard_io_delay; -#else -static void (*io_delay)(void) = udelay_io_delay; -#endif +static int __initdata io_delay_override; /* * Paravirt wants native_io_delay to be a constant. 
*/ void native_io_delay(void) { - io_delay(); + switch (io_delay_type) { + default: + case CONFIG_IO_DELAY_TYPE_0X80: + asm volatile ("outb %al, $0x80"); + break; + case CONFIG_IO_DELAY_TYPE_0XED: + asm volatile ("outb %al, $0xed"); + break; + case CONFIG_IO_DELAY_TYPE_UDELAY: + /* + * 2 usecs is an upper-bound for the outb delay but + * note that udelay doesn't have the bus-level + * side-effects that outb does, nor does udelay() have + * precise timings during very early bootup (the delays + * are shorter until calibrated): + */ + udelay(2); + case CONFIG_IO_DELAY_TYPE_NONE: + break; + } } EXPORT_SYMBOL(native_io_delay); -#ifndef CONFIG_UDELAY_IO_DELAY -static int __init dmi_alternate_io_delay_port(const struct dmi_system_id *id) +static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) { - printk(KERN_NOTICE "%s: using alternate I/O delay port\n", id->ident); - io_delay = alternate_io_delay; + if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { + printk(KERN_NOTICE "%s: using 0xed I/O delay port\n", + id->ident); + io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + } + return 0; } -static struct dmi_system_id __initdata alternate_io_delay_port_dmi_table[] = { +/* + * Quirk table for systems that misbehave (lock up, etc.) if port + * 0x80 is used: + */ +static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { { - .callback = dmi_alternate_io_delay_port, + .callback = dmi_io_delay_0xed_port, .ident = "HP Pavilion dv9000z", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), DMI_MATCH(DMI_BOARD_NAME, "30B9") } }, - { - } + { } }; -static int __initdata io_delay_override; - void __init io_delay_init(void) { if (!io_delay_override) - dmi_check_system(alternate_io_delay_port_dmi_table); + dmi_check_system(io_delay_0xed_port_dmi_table); } -#endif static int __init io_delay_param(char *s) { - if (!s) - return -EINVAL; - - if (!strcmp(s, "standard")) - io_delay = standard_io_delay; - else if (!strcmp(s, "alternate")) - io_delay = alternate_io_delay; + if (!strcmp(s, "0x80")) + io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; + else if (!strcmp(s, "0xed")) + io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; else if (!strcmp(s, "udelay")) - io_delay = udelay_io_delay; + io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY; + else if (!strcmp(s, "none")) + io_delay_type = CONFIG_IO_DELAY_TYPE_NONE; else return -EINVAL; -#ifndef CONFIG_UDELAY_IO_DELAY io_delay_override = 1; -#endif return 0; } diff --git a/include/asm-x86/io_32.h b/include/asm-x86/io_32.h index a8d25c38b91c..2a04bd17eac5 100644 --- a/include/asm-x86/io_32.h +++ b/include/asm-x86/io_32.h @@ -250,15 +250,11 @@ static inline void flush_write_buffers(void) #endif /* __KERNEL__ */ -#ifndef CONFIG_UDELAY_IO_DELAY -extern void io_delay_init(void); -#else -static inline void io_delay_init(void) -{ -} -#endif extern void native_io_delay(void); +extern int io_delay_type; +extern void io_delay_init(void); + #if defined(CONFIG_PARAVIRT) #include #else diff --git a/include/asm-x86/io_64.h b/include/asm-x86/io_64.h index 5bebaf961692..dbcc03aa1c6a 100644 --- a/include/asm-x86/io_64.h +++ b/include/asm-x86/io_64.h @@ -35,15 +35,11 @@ * - Arnaldo Carvalho de Melo */ -#ifndef CONFIG_UDELAY_IO_DELAY -extern void io_delay_init(void); -#else -static inline void io_delay_init(void) -{ -} -#endif extern void native_io_delay(void); +extern int io_delay_type; +extern void io_delay_init(void); + static inline void slow_down_io(void) { native_io_delay(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4bc8e48434a7..357b68ba23ec 100644 --- 
a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -53,6 +53,7 @@ #ifdef CONFIG_X86 #include #include +#include #endif static int deprecated_sysctl_warning(struct __sysctl_args *args); @@ -727,6 +728,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "io_delay_type", + .data = &io_delay_type, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif #if defined(CONFIG_MMU) { -- cgit v1.2.3 From f9fc58910ebc448b0b7d37af1bf57a896a78e9c4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jan 2008 13:30:05 +0100 Subject: x86: add DMI quirk for io-delay hangs on Compaq Presario V6000 laptops add the DMI strings provided by Islam Amer , for the Compaq Presario V6000 (Quanta/30B7). Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/io_delay.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index f052e34dc94c..bd49321034db 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -61,6 +61,14 @@ static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) * 0x80 is used: */ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { + { + .callback = dmi_io_delay_0xed_port, + .ident = "Compaq Presario V6000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B7") + } + }, { .callback = dmi_io_delay_0xed_port, .ident = "HP Pavilion dv9000z", @@ -69,6 +77,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "30B9") } }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion tx1000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30BF") + } + }, { } }; -- cgit v1.2.3 From 83bd01024b1fdfc41d9b758e5669e80fca72df66 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 30 Jan 2008 13:30:06 +0100 Subject: x86: protect against sigaltstack wraparound cf http://lkml.org/lkml/2007/10/3/41 To summarize: on Linux, SA_ONSTACK decides whether you are already on the signal stack based on the value of the SP at the time of a signal. If you are not already inside the range, you are not "on the signal stack" and so the new signal handler frame starts over at the base of the signal stack. sigaltstack (and sigstack before it) was invented in BSD. There, the SA_ONSTACK behavior has always been different. It uses a kernel state flag to decide, rather than the SP value. When you first take an SA_ONSTACK signal and switch to the alternate signal stack, it sets the SS_ONSTACK flag in the thread's sigaltstack state in the kernel. Thereafter you are "on the signal stack" and don't switch SP before pushing a handler frame no matter what the SP value is. Only when you sigreturn from the original handler context do you clear the SS_ONSTACK flag so that a new handler frame will start over at the base of the alternate signal stack. The undesireable effect of the Linux behavior is that an overflow of the alternate signal stack can not only go undetected, but lead to a ring buffer effect of clobbering the original handler frame at the base of the signal stack for each successive signal that comes just after the overflow. This is what Shi Weihua's test case demonstrates. Normally this does not come up because of the signal mask, but the test case uses SA_NODEFER for its SIGSEGV handler. 
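The guard shown in the patch below is tiny; modeled in userspace with invented stack bounds (ss_sp/ss_size standing in for the thread's sigaltstack), the logic is:

#include <stdio.h>
#include <stdint.h>

static uintptr_t ss_sp   = 0x10000000;	/* hypothetical alt-stack base */
static size_t    ss_size = 8192;

static int on_sig_stack(uintptr_t sp)
{
	return sp > ss_sp && sp - ss_sp <= ss_size;
}

static void *get_sigframe(uintptr_t sp, size_t frame_size)
{
	/*
	 * If we are on the alternate stack and the new frame would run
	 * off its low end, return an always-bogus address: the setup
	 * then faults and the task dies with SIGSEGV instead of
	 * wrapping around over the frames at the base of the stack.
	 */
	if (on_sig_stack(sp) && !on_sig_stack(sp - frame_size))
		return (void *)-1L;
	return (void *)((sp - frame_size) & ~(uintptr_t)15);
}

int main(void)
{
	uintptr_t sp = ss_sp + 512;		/* 512 bytes of room left */

	printf("%p\n", get_sigframe(sp, 4096));	/* overflow: bogus address */
	printf("%p\n", get_sigframe(sp, 128));	/* fits: normal frame */
	return 0;
}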
The other subtle part of the existing Linux semantics is that a simple longjmp out of a signal handler serves to take you off the signal stack in a safe and reliable fashion without having used sigreturn (nor having just returned from the handler normally, which means the same). After the longjmp (or even informal stack switching not via any proper libc or kernel interface), the alternate signal stack stands ready to be used again. A paranoid program would allocate a PROT_NONE red zone around its alternate signal stack. Then a small overflow would trigger a SIGSEGV in handler setup, and be fatal (core dump) whether or not SIGSEGV is blocked. As with thread stack red zones, that cannot catch all overflows (or underflows). e.g., a local array as large as page size allocated in a function called from a handler, but not actually touched before more calls push more stack, could cause an overflow that silently pushes into some unrelated allocated pages. The BSD behavior does not do anything in particular about overflow. But it does at least avoid the wraparound or "ring buffer effect", so you'll just get a straightforward all-out overflow down your address space past the low end of the alternate signal stack. I don't know what the BSD behavior is for longjmp out of an SA_ONSTACK handler. The POSIX wording relating to sigaltstack is pretty minimal. I don't think it speaks to this issue one way or another. (The program that overflows its stack is clearly in undefined behavior territory of one sort or another anyhow.) Given the longjmp issue and the potential for highly subtle complications in existing programs relying on this in arcane ways deep in their code, I am very dubious about changing the behavior to the BSD style persistent flag. I think Shi Weihua's patches have a similar effect by tracking the SP used in the last handler setup. I think it would be sensible for the signal handler setup code to detect when it would itself be causing a stack overflow. Maybe something like the following patch (untested). This issue exists in the same way on all machines, so ideally they would all do a similar check. When it's the handler function itself or its callees that cause the overflow, rather than the signal handler frame setup alone crossing the boundary, this still won't help. But I don't see any way to distinguish that from the valid longjmp case. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/signal_32.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 20f29e4c1d33..5c6170c44b00 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -295,6 +295,13 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) /* Default to using normal stack */ esp = regs->esp; + /* + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (on_sig_stack(esp) && !likely(on_sig_stack(esp - frame_size))) + return (void __user *) -1L; + /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { if (sas_ss_flags(esp) == 0) -- cgit v1.2.3 From 53d517cdbaac704352b3d0c10fecb99e0b54572e Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Wed, 30 Jan 2008 13:30:06 +0100 Subject: x86: scale cyc_2_nsec according to CPU frequency scale the sched_clock() cyc_2_nsec scaling factor according to CPU frequency changes. 
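The underlying identity, unchanged by the patch, is ns = cyc * (10^6 << SC) / cpu_khz >> SC with SC = 10; what becomes per-CPU is the cached middle factor. A quick userspace check of the arithmetic, with illustrative frequencies:

#include <stdio.h>
#include <stdint.h>

#define CYC2NS_SCALE_FACTOR 10		/* 2^10, as in asm-x86/timer.h */

static uint64_t cycles_2_ns(uint64_t cyc, uint64_t scale)
{
	return (cyc * scale) >> CYC2NS_SCALE_FACTOR;
}

int main(void)
{
	uint64_t cpu_khz = 2000000;	/* 2 GHz, illustrative */
	uint64_t scale = (1000000ULL << CYC2NS_SCALE_FACTOR) / cpu_khz;

	/* 2e9 cycles at 2 GHz is one second, i.e. ~1e9 ns */
	printf("scale=%llu ns=%llu\n", (unsigned long long)scale,
	       (unsigned long long)cycles_2_ns(2000000000ULL, scale));

	/* after a cpufreq transition to 1 GHz the factor doubles */
	cpu_khz = 1000000;
	scale = (1000000ULL << CYC2NS_SCALE_FACTOR) / cpu_khz;
	printf("scale=%llu ns=%llu\n", (unsigned long long)scale,
	       (unsigned long long)cycles_2_ns(1000000000ULL, scale));
	return 0;
}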
[ mingo@elte.hu: simplified it and fixed it for SMP. ] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_32.c | 43 +++++++++++++++++++++++++++++++----- arch/x86/kernel/tsc_64.c | 57 +++++++++++++++++++++++++++++++++++++++--------- include/asm-x86/timer.h | 23 ++++++++++++++----- 3 files changed, 102 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 9ebc0dab66b4..00bb4c1c0593 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -80,13 +81,31 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ +DEFINE_PER_CPU(unsigned long, cyc2ns); -static inline void set_cyc2ns_scale(unsigned long cpu_khz) +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; + unsigned long flags, prev_scale, *scale; + unsigned long long tsc_now, ns_now; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + + rdtscll(tsc_now); + ns_now = __cycles_2_ns(tsc_now); + + prev_scale = *scale; + if (cpu_khz) + *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + + /* + * Start smoothly with the new frequency: + */ + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); } /* @@ -239,7 +258,9 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { tsc_khz = cpu_khz; - set_cyc2ns_scale(cpu_khz); + preempt_disable(); + set_cyc2ns_scale(cpu_khz, smp_processor_id()); + preempt_enable(); /* * TSC based sched_clock turns * to junk w/ cpufreq @@ -367,6 +388,8 @@ static inline void check_geode_tsc_reliable(void) { } void __init tsc_init(void) { + int cpu; + if (!cpu_has_tsc || tsc_disable) goto out_no_tsc; @@ -380,7 +403,15 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); - set_cyc2ns_scale(cpu_khz); + /* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. (cpufreq notifiers will fix this + * up if their speed diverges) + */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(cpu_khz, cpu); + use_tsc_delay(); /* Check and install the TSC clocksource */ diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 9c70af45b42b..32edd2c50e94 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -10,6 +10,7 @@ #include #include +#include static int notsc __initdata = 0; @@ -18,16 +19,48 @@ EXPORT_SYMBOL(cpu_khz); unsigned int tsc_khz; EXPORT_SYMBOL(tsc_khz); -static unsigned int cyc2ns_scale __read_mostly; +/* Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. 
+ * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +DEFINE_PER_CPU(unsigned long, cyc2ns); -static inline void set_cyc2ns_scale(unsigned long khz) +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; -} + unsigned long flags, prev_scale, *scale; + unsigned long long tsc_now, ns_now; -static unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> NS_SCALE; + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + + rdtscll(tsc_now); + ns_now = __cycles_2_ns(tsc_now); + + prev_scale = *scale; + if (cpu_khz) + *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); } unsigned long long sched_clock(void) @@ -100,7 +133,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, mark_tsc_unstable("cpufreq changes"); } - set_cyc2ns_scale(tsc_khz_ref); + preempt_disable(); + set_cyc2ns_scale(tsc_khz_ref, smp_processor_id()); + preempt_enable(); return 0; } @@ -151,7 +186,7 @@ static unsigned long __init tsc_read_refs(unsigned long *pm, void __init tsc_calibrate(void) { unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2; - int hpet = is_hpet_enabled(); + int hpet = is_hpet_enabled(), cpu; local_irq_save(flags); @@ -206,7 +241,9 @@ void __init tsc_calibrate(void) } tsc_khz = tsc2 / tsc1; - set_cyc2ns_scale(tsc_khz); + + for_each_possible_cpu(cpu) + set_cyc2ns_scale(tsc_khz, cpu); } /* diff --git a/include/asm-x86/timer.h b/include/asm-x86/timer.h index 0db7e994fb8b..4f6fcb050c11 100644 --- a/include/asm-x86/timer.h +++ b/include/asm-x86/timer.h @@ -2,6 +2,7 @@ #define _ASMi386_TIMER_H #include #include +#include #define TICK_SIZE (tick_nsec / 1000) @@ -16,7 +17,7 @@ extern int recalibrate_cpu_khz(void); #define calculate_cpu_khz() native_calculate_cpu_khz() #endif -/* Accellerators for sched_clock() +/* Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: * ns = cycles / (freq / ns_per_sec) @@ -31,20 +32,32 @@ extern int recalibrate_cpu_khz(void); * And since SC is a constant power of two, we can convert the div * into a shift. * - * We can use khz divisor instead of mhz to keep a better percision, since + * We can use khz divisor instead of mhz to keep a better precision, since * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
*/ -extern unsigned long cyc2ns_scale __read_mostly; + +DECLARE_PER_CPU(unsigned long, cyc2ns); #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ -static inline unsigned long long cycles_2_ns(unsigned long long cyc) +static inline unsigned long long __cycles_2_ns(unsigned long long cyc) { - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; + return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR; } +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + unsigned long long ns; + unsigned long flags; + + local_irq_save(flags); + ns = __cycles_2_ns(cyc); + local_irq_restore(flags); + + return ns; +} #endif -- cgit v1.2.3 From 5ee613b6751cd91db4b6bd7c1dc9d2f9cf65cde2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jan 2008 13:30:06 +0100 Subject: x86: idle wakeup event in the HLT loop do a proper idle-wakeup event on HLT as well - some CPUs stop the TSC in HLT too, not just when going through the ACPI methods. (the ACPI idle code already does this.) [ update the 64-bit side too, as noticed by Jiri Slaby. ] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_32.c | 15 ++++++++++++--- arch/x86/kernel/process_64.c | 13 ++++++++++--- 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 46d391d49de8..a63d2d2556ee 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -113,10 +113,19 @@ void default_idle(void) smp_mb(); local_irq_disable(); - if (!need_resched()) + if (!need_resched()) { + ktime_t t0, t1; + u64 t0n, t1n; + + t0 = ktime_get(); + t0n = ktime_to_ns(t0); safe_halt(); /* enables interrupts racelessly */ - else - local_irq_enable(); + local_irq_disable(); + t1 = ktime_get(); + t1n = ktime_to_ns(t1); + sched_clock_idle_wakeup_event(t1n - t0n); + } + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } else { /* loop is done by the caller */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c2db7ef93565..40fed477f3e5 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -116,9 +116,16 @@ static void default_idle(void) smp_mb(); local_irq_disable(); if (!need_resched()) { - /* Enables interrupts one instruction before HLT. - x86 special cases this so there is no race. */ - safe_halt(); + ktime_t t0, t1; + u64 t0n, t1n; + + t0 = ktime_get(); + t0n = ktime_to_ns(t0); + safe_halt(); /* enables interrupts racelessly */ + local_irq_disable(); + t1 = ktime_get(); + t1n = ktime_to_ns(t1); + sched_clock_idle_wakeup_event(t1n - t0n); } else local_irq_enable(); current_thread_info()->status |= TS_POLLING; -- cgit v1.2.3 From 39d44a51474a52bec6d72d30ebc76f5159101d90 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 30 Jan 2008 13:30:06 +0100 Subject: x86: enable irq in default_idle on 64-bit local_irq_enable() is missing after sched_clock_idle_wakeup_event(). 
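Both changes follow one pattern: bracket the halt with timestamps, report the slept interval to the scheduler clock, and only then re-enable interrupts and fall out of the loop. A userspace stand-in, where pause() plays safe_halt() and a printf plays sched_clock_idle_wakeup_event():

#include <stdio.h>
#include <time.h>
#include <signal.h>
#include <unistd.h>

static void on_alarm(int sig) { (void)sig; }

static unsigned long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	unsigned long long t0, t1;

	signal(SIGALRM, on_alarm);
	alarm(1);			/* the "interrupt" that ends HLT */

	t0 = now_ns();
	pause();			/* stand-in for safe_halt() */
	t1 = now_ns();

	/* stand-in for sched_clock_idle_wakeup_event(t1 - t0); interrupts
	 * would be re-enabled after this point, as the fix above ensures */
	printf("idle wakeup event: slept %llu ns\n", t1 - t0);
	return 0;
}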
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 40fed477f3e5..2c9e59448f4c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -126,8 +126,8 @@ static void default_idle(void) t1 = ktime_get(); t1n = ktime_to_ns(t1); sched_clock_idle_wakeup_event(t1n - t0n); - } else - local_irq_enable(); + } + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } -- cgit v1.2.3 From c140df973c07ac328aafd19d4f4c413f2f8902df Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jan 2008 13:30:09 +0100 Subject: x86: clean up arch/x86/kernel/aperture_64.c whitespace cleanup. No code changed: text data bss dec hex filename 2080 76 4 2160 870 aperture_64.o.before 2080 76 4 2160 870 aperture_64.o.after errors lines of code errors/KLOC arch/x86/kernel/aperture_64.c 114 299 381.2 arch/x86/kernel/aperture_64.c 0 315 0 Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/aperture_64.c | 244 ++++++++++++++++++++++-------------------- 1 file changed, 130 insertions(+), 114 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 5b6992799c9d..250db0527f5d 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -1,12 +1,12 @@ -/* +/* * Firmware replacement code. - * + * * Work around broken BIOSes that don't set an aperture or only set the - * aperture in the AGP bridge. - * If all fails map the aperture over some low memory. This is cheaper than - * doing bounce buffering. The memory is lost. This is done at early boot - * because only the bootmem allocator can allocate 32+MB. - * + * aperture in the AGP bridge. + * If all fails map the aperture over some low memory. This is cheaper than + * doing bounce buffering. The memory is lost. This is done at early boot + * because only the bootmem allocator can allocate 32+MB. + * * Copyright 2002 Andi Kleen, SuSE Labs. */ #include @@ -30,7 +30,7 @@ int gart_iommu_aperture_disabled __initdata = 0; int gart_iommu_aperture_allowed __initdata = 0; int fallback_aper_order __initdata = 1; /* 64MB */ -int fallback_aper_force __initdata = 0; +int fallback_aper_force __initdata = 0; int fix_aperture __initdata = 1; @@ -49,20 +49,20 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ -static u32 __init allocate_aperture(void) +static u32 __init allocate_aperture(void) { u32 aper_size; - void *p; + void *p; - if (fallback_aper_order > 7) - fallback_aper_order = 7; - aper_size = (32 * 1024 * 1024) << fallback_aper_order; + if (fallback_aper_order > 7) + fallback_aper_order = 7; + aper_size = (32 * 1024 * 1024) << fallback_aper_order; - /* - * Aperture has to be naturally aligned. This means an 2GB aperture won't - * have much chance of finding a place in the lower 4GB of memory. - * Unfortunately we cannot move it up because that would make the - * IOMMU useless. + /* + * Aperture has to be naturally aligned. This means a 2GB aperture + * won't have much chance of finding a place in the lower 4GB of + * memory. Unfortunately we cannot move it up because that would + * make the IOMMU useless. 
*/ p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); if (!p || __pa(p)+aper_size > 0xffffffff) { @@ -73,54 +73,60 @@ static u32 __init allocate_aperture(void) return 0; } printk("Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, __pa(p)); + aper_size >> 10, __pa(p)); insert_aperture_resource((u32)__pa(p), aper_size); - return (u32)__pa(p); + + return (u32)__pa(p); } static int __init aperture_valid(u64 aper_base, u32 aper_size) -{ - if (!aper_base) +{ + if (!aper_base) return 0; - if (aper_size < 64*1024*1024) { + if (aper_size < 64*1024*1024) { printk("Aperture too small (%d MB)\n", aper_size>>20); return 0; } if (aper_base + aper_size > 0x100000000UL) { printk("Aperture beyond 4GB. Ignoring.\n"); - return 0; + return 0; } if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { printk("Aperture pointing to e820 RAM. Ignoring.\n"); - return 0; - } + return 0; + } return 1; -} +} /* Find a PCI capability */ -static __u32 __init find_cap(int num, int slot, int func, int cap) -{ - u8 pos; +static __u32 __init find_cap(int num, int slot, int func, int cap) +{ int bytes; - if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) + u8 pos; + + if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) return 0; - pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + + pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { u8 id; - pos &= ~3; - id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); + + pos &= ~3; + id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); if (id == 0xff) break; - if (id == cap) - return pos; - pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); - } + if (id == cap) + return pos; + pos = read_pci_config_byte(num, slot, func, + pos+PCI_CAP_LIST_NEXT); + } return 0; -} +} /* Read a standard AGPv3 bridge header */ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) -{ +{ u32 apsize; u32 apsizereg; int nbits; @@ -128,7 +134,7 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) u64 aper; printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); - apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); + apsizereg = read_pci_config_16(num, slot, func, cap + 0x14); if (apsizereg == 0xffffffff) { printk("APSIZE in AGP bridge unreadable\n"); return 0; @@ -136,80 +142,84 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) apsize = apsizereg & 0xfff; /* Some BIOS use weird encodings not in the AGPv3 table. */ - if (apsize & 0xff) - apsize |= 0xf00; + if (apsize & 0xff) + apsize |= 0xf00; nbits = hweight16(apsize); *order = 7 - nbits; if ((int)*order < 0) /* < 32MB */ *order = 0; - - aper_low = read_pci_config(num,slot,func, 0x10); - aper_hi = read_pci_config(num,slot,func,0x14); + + aper_low = read_pci_config(num, slot, func, 0x10); + aper_hi = read_pci_config(num, slot, func, 0x14); aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); - printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", + printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", aper, 32 << *order, apsizereg); if (!aperture_valid(aper, (32*1024*1024) << *order)) - return 0; - return (u32)aper; -} - -/* Look for an AGP bridge. Windows only expects the aperture in the - AGP bridge and some BIOS forget to initialize the Northbridge too. - Work around this here. 
- - Do an PCI bus scan by hand because we're running before the PCI - subsystem. + return 0; + return (u32)aper; +} - All K8 AGP bridges are AGPv3 compliant, so we can do this scan - generically. It's probably overkill to always scan all slots because - the AGP bridges should be always an own bus on the HT hierarchy, - but do it here for future safety. */ +/* + * Look for an AGP bridge. Windows only expects the aperture in the + * AGP bridge and some BIOS forget to initialize the Northbridge too. + * Work around this here. + * + * Do an PCI bus scan by hand because we're running before the PCI + * subsystem. + * + * All K8 AGP bridges are AGPv3 compliant, so we can do this scan + * generically. It's probably overkill to always scan all slots because + * the AGP bridges should be always an own bus on the HT hierarchy, + * but do it here for future safety. + */ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) { int num, slot, func; /* Poor man's PCI discovery */ - for (num = 0; num < 256; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { + for (num = 0; num < 256; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { u32 class, cap; u8 type; - class = read_pci_config(num,slot,func, + class = read_pci_config(num, slot, func, PCI_CLASS_REVISION); if (class == 0xffffffff) - break; - - switch (class >> 16) { + break; + + switch (class >> 16) { case PCI_CLASS_BRIDGE_HOST: case PCI_CLASS_BRIDGE_OTHER: /* needed? */ /* AGP bridge? */ - cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); + cap = find_cap(num, slot, func, + PCI_CAP_ID_AGP); if (!cap) break; - *valid_agp = 1; - return read_agp(num,slot,func,cap,order); - } - + *valid_agp = 1; + return read_agp(num, slot, func, cap, + order); + } + /* No multi-function device? 
*/ - type = read_pci_config_byte(num,slot,func, + type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); if (!(type & 0x80)) break; - } - } + } + } } - printk("No AGP bridge found\n"); + printk("No AGP bridge found\n"); + return 0; } void __init gart_iommu_hole_init(void) -{ - int fix, num; +{ u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; u64 aper_base, last_aper_base = 0; - int valid_agp = 0; + int fix, num, valid_agp = 0; if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) @@ -218,24 +228,24 @@ void __init gart_iommu_hole_init(void) printk(KERN_INFO "Checking aperture...\n"); fix = 0; - for (num = 24; num < 32; num++) { + for (num = 24; num < 32; num++) { if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) continue; iommu_detected = 1; gart_iommu_aperture = 1; - aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; - aper_size = (32 * 1024 * 1024) << aper_order; + aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; - aper_base <<= 25; + aper_base <<= 25; - printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, + printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, aper_base, aper_size>>20); - + if (!aperture_valid(aper_base, aper_size)) { - fix = 1; - break; + fix = 1; + break; } if ((last_aper_order && aper_order != last_aper_order) || @@ -245,27 +255,28 @@ void __init gart_iommu_hole_init(void) } last_aper_order = aper_order; last_aper_base = aper_base; - } + } if (!fix && !fallback_aper_force) { if (last_aper_base) { unsigned long n = (32 * 1024 * 1024) << last_aper_order; + insert_aperture_resource((u32)last_aper_base, n); } - return; + return; } if (!fallback_aper_force) - aper_alloc = search_agp_bridge(&aper_order, &valid_agp); - - if (aper_alloc) { + aper_alloc = search_agp_bridge(&aper_order, &valid_agp); + + if (aper_alloc) { /* Got the aperture from the AGP bridge */ } else if (swiotlb && !valid_agp) { /* Do nothing */ } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || force_iommu || valid_agp || - fallback_aper_force) { + fallback_aper_force) { printk("Your BIOS doesn't leave a aperture memory hole\n"); printk("Please enable the IOMMU option in the BIOS setup\n"); printk("This costs you %d MB of RAM\n", @@ -273,27 +284,32 @@ void __init gart_iommu_hole_init(void) aper_order = fallback_aper_order; aper_alloc = allocate_aperture(); - if (!aper_alloc) { - /* Could disable AGP and IOMMU here, but it's probably - not worth it. But the later users cannot deal with - bad apertures and turning on the aperture over memory - causes very strange problems, so it's better to - panic early. */ + if (!aper_alloc) { + /* + * Could disable AGP and IOMMU here, but it's + * probably not worth it. But the later users + * cannot deal with bad apertures and turning + * on the aperture over memory causes very + * strange problems, so it's better to panic + * early. + */ panic("Not enough memory for aperture"); } - } else { - return; - } + } else { + return; + } /* Fix up the north bridges */ - for (num = 24; num < 32; num++) { + for (num = 24; num < 32; num++) { if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; - - /* Don't enable translation yet. That is done later. - Assume this BIOS didn't initialise the GART so - just overwrite all previous bits */ - write_pci_config(0, num, 3, 0x90, aper_order<<1); - write_pci_config(0, num, 3, 0x94, aper_alloc>>25); - } -} + continue; + + /* + * Don't enable translation yet. 
That is done later. + * Assume this BIOS didn't initialise the GART so + * just overwrite all previous bits + */ + write_pci_config(0, num, 3, 0x90, aper_order<<1); + write_pci_config(0, num, 3, 0x94, aper_alloc>>25); + } +} -- cgit v1.2.3 From 31183ba8fd05b6ddc67ab4d726167cbc52e1b346 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jan 2008 13:30:10 +0100 Subject: x86: clean up arch/x86/kernel/aperture_64.c printk()s clean up arch/x86/kernel/aperture_64.c printk()s. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/aperture_64.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 250db0527f5d..52d2beac4556 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -66,14 +66,15 @@ static u32 __init allocate_aperture(void) */ p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); if (!p || __pa(p)+aper_size > 0xffffffff) { - printk("Cannot allocate aperture memory hole (%p,%uK)\n", - p, aper_size>>10); + printk(KERN_ERR + "Cannot allocate aperture memory hole (%p,%uK)\n", + p, aper_size>>10); if (p) free_bootmem(__pa(p), aper_size); return 0; } - printk("Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, __pa(p)); + printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", + aper_size >> 10, __pa(p)); insert_aperture_resource((u32)__pa(p), aper_size); return (u32)__pa(p); @@ -83,18 +84,20 @@ static int __init aperture_valid(u64 aper_base, u32 aper_size) { if (!aper_base) return 0; + if (aper_size < 64*1024*1024) { - printk("Aperture too small (%d MB)\n", aper_size>>20); + printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20); return 0; } if (aper_base + aper_size > 0x100000000UL) { - printk("Aperture beyond 4GB. Ignoring.\n"); + printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n"); return 0; } if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { - printk("Aperture pointing to e820 RAM. Ignoring.\n"); + printk(KERN_ERR "Aperture pointing to e820 RAM. 
Ignoring.\n"); return 0; } + return 1; } @@ -133,10 +136,10 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) u32 aper_low, aper_hi; u64 aper; - printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); + printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func); apsizereg = read_pci_config_16(num, slot, func, cap + 0x14); if (apsizereg == 0xffffffff) { - printk("APSIZE in AGP bridge unreadable\n"); + printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); return 0; } @@ -153,8 +156,8 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) aper_hi = read_pci_config(num, slot, func, 0x14); aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); - printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", - aper, 32 << *order, apsizereg); + printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", + aper, 32 << *order, apsizereg); if (!aperture_valid(aper, (32*1024*1024) << *order)) return 0; @@ -210,7 +213,7 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) } } } - printk("No AGP bridge found\n"); + printk(KERN_INFO "No AGP bridge found\n"); return 0; } @@ -240,8 +243,8 @@ void __init gart_iommu_hole_init(void) aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; aper_base <<= 25; - printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, - aper_base, aper_size>>20); + printk(KERN_INFO "CPU %d: aperture @ %Lx size %u MB\n", + num-24, aper_base, aper_size>>20); if (!aperture_valid(aper_base, aper_size)) { fix = 1; @@ -277,10 +280,13 @@ void __init gart_iommu_hole_init(void) force_iommu || valid_agp || fallback_aper_force) { - printk("Your BIOS doesn't leave a aperture memory hole\n"); - printk("Please enable the IOMMU option in the BIOS setup\n"); - printk("This costs you %d MB of RAM\n", - 32 << fallback_aper_order); + printk(KERN_ERR + "Your BIOS doesn't leave a aperture memory hole\n"); + printk(KERN_ERR + "Please enable the IOMMU option in the BIOS setup\n"); + printk(KERN_ERR + "This costs you %d MB of RAM\n", + 32 << fallback_aper_order); aper_order = fallback_aper_order; aper_alloc = allocate_aperture(); -- cgit v1.2.3 From 05fccb0e3840248324a96b320562210410be73dc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jan 2008 13:30:12 +0100 Subject: x86: code cleanups in arch/x86/kernel/pci-gart_64.c code cleanups: errors lines of code errors/KLOC arch/x86/kernel/pci-gart_64.c 183 748 244.6 arch/x86/kernel/pci-gart_64.c 0 790 0 Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/pci-gart_64.c | 508 +++++++++++++++++++++++------------------- 1 file changed, 276 insertions(+), 232 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 06bcba536045..d2b46b489412 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -1,12 +1,12 @@ /* * Dynamic DMA mapping support for AMD Hammer. - * + * * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. * This allows to use PCI devices that only support 32bit addresses on systems - * with more than 4GB. + * with more than 4GB. * * See Documentation/DMA-mapping.txt for the interface specification. - * + * * Copyright 2002 Andi Kleen, SuSE Labs. * Subject to the GNU General Public License v2 only. 
*/ @@ -37,23 +37,26 @@ #include static unsigned long iommu_bus_base; /* GART remapping area (physical) */ -static unsigned long iommu_size; /* size of remapping area bytes */ +static unsigned long iommu_size; /* size of remapping area bytes */ static unsigned long iommu_pages; /* .. and in pages */ -static u32 *iommu_gatt_base; /* Remapping table */ +static u32 *iommu_gatt_base; /* Remapping table */ -/* If this is disabled the IOMMU will use an optimized flushing strategy - of only flushing when an mapping is reused. With it true the GART is flushed - for every mapping. Problem is that doing the lazy flush seems to trigger - bugs with some popular PCI cards, in particular 3ware (but has been also - also seen with Qlogic at least). */ +/* + * If this is disabled the IOMMU will use an optimized flushing strategy + * of only flushing when an mapping is reused. With it true the GART is + * flushed for every mapping. Problem is that doing the lazy flush seems + * to trigger bugs with some popular PCI cards, in particular 3ware (but + * has been also also seen with Qlogic at least). + */ int iommu_fullflush = 1; -/* Allocation bitmap for the remapping area */ +/* Allocation bitmap for the remapping area: */ static DEFINE_SPINLOCK(iommu_bitmap_lock); -static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ +/* Guarded by iommu_bitmap_lock: */ +static unsigned long *iommu_gart_bitmap; -static u32 gart_unmapped_entry; +static u32 gart_unmapped_entry; #define GPTE_VALID 1 #define GPTE_COHERENT 2 @@ -61,10 +64,10 @@ static u32 gart_unmapped_entry; (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) -#define to_pages(addr,size) \ +#define to_pages(addr, size) \ (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) -#define EMERGENCY_PAGES 32 /* = 128KB */ +#define EMERGENCY_PAGES 32 /* = 128KB */ #ifdef CONFIG_AGP #define AGPEXTERN extern @@ -77,130 +80,152 @@ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; static unsigned long next_bit; /* protected by iommu_bitmap_lock */ -static int need_flush; /* global flush state. set for each gart wrap */ +static int need_flush; /* global flush state. 
set for each gart wrap */ -static unsigned long alloc_iommu(int size) -{ +static unsigned long alloc_iommu(int size) +{ unsigned long offset, flags; - spin_lock_irqsave(&iommu_bitmap_lock, flags); - offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); + spin_lock_irqsave(&iommu_bitmap_lock, flags); + offset = find_next_zero_string(iommu_gart_bitmap, next_bit, + iommu_pages, size); if (offset == -1) { need_flush = 1; - offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); + offset = find_next_zero_string(iommu_gart_bitmap, 0, + iommu_pages, size); } - if (offset != -1) { - set_bit_string(iommu_gart_bitmap, offset, size); - next_bit = offset+size; - if (next_bit >= iommu_pages) { + if (offset != -1) { + set_bit_string(iommu_gart_bitmap, offset, size); + next_bit = offset+size; + if (next_bit >= iommu_pages) { next_bit = 0; need_flush = 1; - } - } + } + } if (iommu_fullflush) need_flush = 1; - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + return offset; -} +} static void free_iommu(unsigned long offset, int size) -{ +{ unsigned long flags; + spin_lock_irqsave(&iommu_bitmap_lock, flags); __clear_bit_string(iommu_gart_bitmap, offset, size); spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} +} -/* +/* * Use global flush state to avoid races with multiple flushers. */ static void flush_gart(void) -{ +{ unsigned long flags; + spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { k8_flush_garts(); need_flush = 0; - } + } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} +} #ifdef CONFIG_IOMMU_LEAK -#define SET_LEAK(x) if (iommu_leak_tab) \ - iommu_leak_tab[x] = __builtin_return_address(0); -#define CLEAR_LEAK(x) if (iommu_leak_tab) \ - iommu_leak_tab[x] = NULL; +#define SET_LEAK(x) \ + do { \ + if (iommu_leak_tab) \ + iommu_leak_tab[x] = __builtin_return_address(0);\ + } while (0) + +#define CLEAR_LEAK(x) \ + do { \ + if (iommu_leak_tab) \ + iommu_leak_tab[x] = NULL; \ + } while (0) /* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab; +static void **iommu_leak_tab; static int leak_trace; static int iommu_leak_pages = 20; + static void dump_leak(void) { int i; - static int dump; - if (dump || !iommu_leak_tab) return; + static int dump; + + if (dump || !iommu_leak_tab) + return; dump = 1; - show_stack(NULL,NULL); - /* Very crude. dump some from the end of the table too */ - printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); - for (i = 0; i < iommu_leak_pages; i+=2) { - printk("%lu: ", iommu_pages-i); + show_stack(NULL, NULL); + + /* Very crude. dump some from the end of the table too */ + printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", + iommu_leak_pages); + for (i = 0; i < iommu_leak_pages; i += 2) { + printk(KERN_DEBUG "%lu: ", iommu_pages-i); printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); - printk("%c", (i+1)%2 == 0 ? '\n' : ' '); - } - printk("\n"); + printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); + } + printk(KERN_DEBUG "\n"); } #else -#define SET_LEAK(x) -#define CLEAR_LEAK(x) +# define SET_LEAK(x) +# define CLEAR_LEAK(x) #endif static void iommu_full(struct device *dev, size_t size, int dir) { - /* + /* * Ran out of IOMMU space for this operation. This is very bad. * Unfortunately the drivers cannot handle this operation properly. 
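 * (On overflow dma_map_area() returns bad_dma_address, i.e. the start
 * of the EMERGENCY_PAGES region reserved at the bottom of the aperture.)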
- * Return some non mapped prereserved space in the aperture and + * Return some non mapped prereserved space in the aperture and * let the Northbridge deal with it. This will result in garbage * in the IO operation. When the size exceeds the prereserved space - * memory corruption will occur or random memory will be DMAed + * memory corruption will occur or random memory will be DMAed * out. Hopefully no network devices use single mappings that big. - */ - - printk(KERN_ERR - "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", - size, dev->bus_id); + */ + + printk(KERN_ERR + "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", + size, dev->bus_id); if (size > PAGE_SIZE*EMERGENCY_PAGES) { if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) panic("PCI-DMA: Memory would be corrupted\n"); - if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n"); - } - + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic(KERN_ERR + "PCI-DMA: Random memory would be DMAed\n"); + } #ifdef CONFIG_IOMMU_LEAK - dump_leak(); + dump_leak(); #endif -} +} -static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) -{ +static inline int +need_iommu(struct device *dev, unsigned long addr, size_t size) +{ u64 mask = *dev->dma_mask; int high = addr + size > mask; int mmu = high; - if (force_iommu) - mmu = 1; - return mmu; + + if (force_iommu) + mmu = 1; + + return mmu; } -static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) -{ +static inline int +nonforced_iommu(struct device *dev, unsigned long addr, size_t size) +{ u64 mask = *dev->dma_mask; int high = addr + size > mask; int mmu = high; - return mmu; + + return mmu; } /* Map a single continuous physical area into the IOMMU. @@ -208,13 +233,14 @@ static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t */ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir) -{ +{ unsigned long npages = to_pages(phys_mem, size); unsigned long iommu_page = alloc_iommu(npages); int i; + if (iommu_page == -1) { if (!nonforced_iommu(dev, phys_mem, size)) - return phys_mem; + return phys_mem; if (panic_on_overflow) panic("dma_map_area overflow %lu bytes\n", size); iommu_full(dev, size, dir); @@ -229,35 +255,39 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); } -static dma_addr_t gart_map_simple(struct device *dev, char *buf, - size_t size, int dir) +static dma_addr_t +gart_map_simple(struct device *dev, char *buf, size_t size, int dir) { dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); + flush_gart(); + return map; } /* Map a single area into the IOMMU */ -static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) +static dma_addr_t +gart_map_single(struct device *dev, void *addr, size_t size, int dir) { unsigned long phys_mem, bus; if (!dev) dev = &fallback_dev; - phys_mem = virt_to_phys(addr); + phys_mem = virt_to_phys(addr); if (!need_iommu(dev, phys_mem, size)) - return phys_mem; + return phys_mem; bus = gart_map_simple(dev, addr, size, dir); - return bus; + + return bus; } /* * Free a DMA mapping. 
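 * (The bus address is converted back to a GART page index, every covered
 * GATT entry is pointed at the scratch page again, and the bitmap range
 * is handed back via free_iommu().)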
*/ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) + size_t size, int direction) { unsigned long iommu_page; int npages; @@ -266,6 +296,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || dma_addr >= iommu_bus_base + iommu_size) return; + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; npages = to_pages(dma_addr, size); for (i = 0; i < npages; i++) { @@ -278,7 +309,8 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, /* * Wrapper for pci_unmap_single working with scatterlists. */ -static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +static void +gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { struct scatterlist *s; int i; @@ -303,12 +335,13 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, for_each_sg(sg, s, nents, i) { unsigned long addr = sg_phys(s); - if (nonforced_iommu(dev, addr, s->length)) { + + if (nonforced_iommu(dev, addr, s->length)) { addr = dma_map_area(dev, addr, s->length, dir); - if (addr == bad_dma_address) { - if (i > 0) + if (addr == bad_dma_address) { + if (i > 0) gart_unmap_sg(dev, sg, i, dir); - nents = 0; + nents = 0; sg[0].dma_length = 0; break; } @@ -317,15 +350,16 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, s->dma_length = s->length; } flush_gart(); + return nents; } /* Map multiple scatterlist entries continuous into the first. */ static int __dma_map_cont(struct scatterlist *start, int nelems, - struct scatterlist *sout, unsigned long pages) + struct scatterlist *sout, unsigned long pages) { unsigned long iommu_start = alloc_iommu(pages); - unsigned long iommu_page = iommu_start; + unsigned long iommu_page = iommu_start; struct scatterlist *s; int i; @@ -335,32 +369,33 @@ static int __dma_map_cont(struct scatterlist *start, int nelems, for_each_sg(start, s, nelems, i) { unsigned long pages, addr; unsigned long phys_addr = s->dma_address; - + BUG_ON(s != start && s->offset); if (s == start) { sout->dma_address = iommu_bus_base; sout->dma_address += iommu_page*PAGE_SIZE + s->offset; sout->dma_length = s->length; - } else { - sout->dma_length += s->length; + } else { + sout->dma_length += s->length; } addr = phys_addr; - pages = to_pages(s->offset, s->length); - while (pages--) { - iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + pages = to_pages(s->offset, s->length); + while (pages--) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); addr += PAGE_SIZE; iommu_page++; } - } - BUG_ON(iommu_page - iommu_start != pages); + } + BUG_ON(iommu_page - iommu_start != pages); + return 0; } -static inline int dma_map_cont(struct scatterlist *start, int nelems, - struct scatterlist *sout, - unsigned long pages, int need) +static inline int +dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout, + unsigned long pages, int need) { if (!need) { BUG_ON(nelems != 1); @@ -370,22 +405,19 @@ static inline int dma_map_cont(struct scatterlist *start, int nelems, } return __dma_map_cont(start, nelems, sout, pages); } - + /* * DMA map all entries in a scatterlist. - * Merge chunks that have page aligned sizes into a continuous mapping. + * Merge chunks that have page aligned sizes into a continuous mapping. 
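 * (A run of entries is merged only while each chunk ends on a page
 * boundary and the next one has no offset; once that breaks, or when
 * iommu_merge is off, the accumulated run is flushed through
 * dma_map_cont().)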
*/ -static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, - int dir) +static int +gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { - int i; - int out; - int start; - unsigned long pages = 0; - int need = 0, nextneed; struct scatterlist *s, *ps, *start_sg, *sgmap; + int need = 0, nextneed, i, out, start; + unsigned long pages = 0; - if (nents == 0) + if (nents == 0) return 0; if (!dev) @@ -397,15 +429,19 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, ps = NULL; /* shut up gcc */ for_each_sg(sg, s, nents, i) { dma_addr_t addr = sg_phys(s); + s->dma_address = addr; - BUG_ON(s->length == 0); + BUG_ON(s->length == 0); - nextneed = need_iommu(dev, addr, s->length); + nextneed = need_iommu(dev, addr, s->length); /* Handle the previous not yet processed entries */ if (i > start) { - /* Can only merge when the last chunk ends on a page - boundary and the new one doesn't have an offset. */ + /* + * Can only merge when the last chunk ends on a + * page boundary and the new one doesn't have an + * offset. + */ if (!iommu_merge || !nextneed || !need || s->offset || (ps->offset + ps->length) % PAGE_SIZE) { if (dma_map_cont(start_sg, i - start, sgmap, @@ -436,6 +472,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, error: flush_gart(); gart_unmap_sg(dev, sg, out, dir); + /* When it was forced or merged try again in a dumb way */ if (force_iommu || iommu_merge) { out = dma_map_sg_nonforce(dev, sg, nents, dir); @@ -444,64 +481,68 @@ error: } if (panic_on_overflow) panic("dma_map_sg: overflow on %lu pages\n", pages); + iommu_full(dev, pages << PAGE_SHIFT, dir); for_each_sg(sg, s, nents, i) s->dma_address = bad_dma_address; return 0; -} +} static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) -{ - unsigned long a; - if (!iommu_size) { - iommu_size = aper_size; - if (!no_agp) - iommu_size /= 2; - } - - a = aper + iommu_size; +{ + unsigned long a; + + if (!iommu_size) { + iommu_size = aper_size; + if (!no_agp) + iommu_size /= 2; + } + + a = aper + iommu_size; iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; - if (iommu_size < 64*1024*1024) + if (iommu_size < 64*1024*1024) { printk(KERN_WARNING - "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); - + "PCI-DMA: Warning: Small IOMMU %luMB." + " Consider increasing the AGP aperture in BIOS\n", + iommu_size >> 20); + } + return iommu_size; -} +} -static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) -{ - unsigned aper_size = 0, aper_base_32; +static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) +{ + unsigned aper_size = 0, aper_base_32, aper_order; u64 aper_base; - unsigned aper_order; - pci_read_config_dword(dev, 0x94, &aper_base_32); + pci_read_config_dword(dev, 0x94, &aper_base_32); pci_read_config_dword(dev, 0x90, &aper_order); - aper_order = (aper_order >> 1) & 7; + aper_order = (aper_order >> 1) & 7; - aper_base = aper_base_32 & 0x7fff; + aper_base = aper_base_32 & 0x7fff; aper_base <<= 25; - aper_size = (32 * 1024 * 1024) << aper_order; - if (aper_base + aper_size > 0x100000000UL || !aper_size) + aper_size = (32 * 1024 * 1024) << aper_order; + if (aper_base + aper_size > 0x100000000UL || !aper_size) aper_base = 0; *size = aper_size; return aper_base; -} +} -/* +/* * Private Northbridge GATT initialization in case we cannot use the - * AGP driver for some reason. + * AGP driver for some reason. 
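 * (The GATT gets one u32 per aperture page, is remapped with
 * PAGE_KERNEL_NOCACHE, and its physical address is programmed into
 * config register 0x98 of every K8 northbridge.)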
*/ static __init int init_k8_gatt(struct agp_kern_info *info) -{ +{ + unsigned aper_size, gatt_size, new_aper_size; + unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - unsigned aper_base, new_aper_base; - unsigned aper_size, gatt_size, new_aper_size; int i; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); @@ -509,75 +550,77 @@ static __init int init_k8_gatt(struct agp_kern_info *info) dev = NULL; for (i = 0; i < num_k8_northbridges; i++) { dev = k8_northbridges[i]; - new_aper_base = read_aperture(dev, &new_aper_size); - if (!new_aper_base) - goto nommu; - - if (!aper_base) { + new_aper_base = read_aperture(dev, &new_aper_size); + if (!new_aper_base) + goto nommu; + + if (!aper_base) { aper_size = new_aper_size; aper_base = new_aper_base; - } - if (aper_size != new_aper_size || aper_base != new_aper_base) + } + if (aper_size != new_aper_size || aper_base != new_aper_base) goto nommu; } if (!aper_base) - goto nommu; + goto nommu; info->aper_base = aper_base; - info->aper_size = aper_size>>20; + info->aper_size = aper_size >> 20; - gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); - gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); - if (!gatt) + gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); + gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); + if (!gatt) panic("Cannot allocate GATT table"); - if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE)) + if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, + PAGE_KERNEL_NOCACHE)) panic("Could not set GART PTEs to uncacheable pages"); global_flush_tlb(); - memset(gatt, 0, gatt_size); + memset(gatt, 0, gatt_size); agp_gatt_table = gatt; for (i = 0; i < num_k8_northbridges; i++) { - u32 ctl; - u32 gatt_reg; + u32 gatt_reg; + u32 ctl; dev = k8_northbridges[i]; - gatt_reg = __pa(gatt) >> 12; - gatt_reg <<= 4; + gatt_reg = __pa(gatt) >> 12; + gatt_reg <<= 4; pci_write_config_dword(dev, 0x98, gatt_reg); - pci_read_config_dword(dev, 0x90, &ctl); + pci_read_config_dword(dev, 0x90, &ctl); ctl |= 1; ctl &= ~((1<<4) | (1<<5)); - pci_write_config_dword(dev, 0x90, ctl); + pci_write_config_dword(dev, 0x90, ctl); } flush_gart(); - - printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); + + printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", + aper_base, aper_size>>10); return 0; nommu: - /* Should not happen anymore */ + /* Should not happen anymore */ printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); - return -1; -} + return -1; +} extern int agp_amd64_init(void); static const struct dma_mapping_ops gart_dma_ops = { - .mapping_error = NULL, - .map_single = gart_map_single, - .map_simple = gart_map_simple, - .unmap_single = gart_unmap_single, - .sync_single_for_cpu = NULL, - .sync_single_for_device = NULL, - .sync_single_range_for_cpu = NULL, - .sync_single_range_for_device = NULL, - .sync_sg_for_cpu = NULL, - .sync_sg_for_device = NULL, - .map_sg = gart_map_sg, - .unmap_sg = gart_unmap_sg, + .mapping_error = NULL, + .map_single = gart_map_single, + .map_simple = gart_map_simple, + .unmap_single = gart_unmap_single, + .sync_single_for_cpu = NULL, + .sync_single_for_device = NULL, + .sync_single_range_for_cpu = NULL, + .sync_single_range_for_device = NULL, + .sync_sg_for_cpu = NULL, + .sync_sg_for_device = NULL, + .map_sg = gart_map_sg, + .unmap_sg = gart_unmap_sg, }; void gart_iommu_shutdown(void) @@ -588,23 +631,23 @@ void 
gart_iommu_shutdown(void) if (no_agp && (dma_ops != &gart_dma_ops)) return; - for (i = 0; i < num_k8_northbridges; i++) { - u32 ctl; + for (i = 0; i < num_k8_northbridges; i++) { + u32 ctl; - dev = k8_northbridges[i]; - pci_read_config_dword(dev, 0x90, &ctl); + dev = k8_northbridges[i]; + pci_read_config_dword(dev, 0x90, &ctl); - ctl &= ~1; + ctl &= ~1; - pci_write_config_dword(dev, 0x90, ctl); - } + pci_write_config_dword(dev, 0x90, ctl); + } } void __init gart_iommu_init(void) -{ +{ struct agp_kern_info info; - unsigned long aper_size; unsigned long iommu_start; + unsigned long aper_size; unsigned long scratch; long i; @@ -614,14 +657,14 @@ void __init gart_iommu_init(void) } #ifndef CONFIG_AGP_AMD64 - no_agp = 1; + no_agp = 1; #else /* Makefile puts PCI initialization via subsys_initcall first. */ /* Add other K8 AGP bridge drivers here */ - no_agp = no_agp || - (agp_amd64_init() < 0) || + no_agp = no_agp || + (agp_amd64_init() < 0) || (agp_copy_info(agp_bridge, &info) < 0); -#endif +#endif if (swiotlb) return; @@ -643,77 +686,78 @@ void __init gart_iommu_init(void) } printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); - aper_size = info.aper_size * 1024 * 1024; - iommu_size = check_iommu_size(info.aper_base, aper_size); - iommu_pages = iommu_size >> PAGE_SHIFT; - - iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, - get_order(iommu_pages/8)); - if (!iommu_gart_bitmap) - panic("Cannot allocate iommu bitmap\n"); + aper_size = info.aper_size * 1024 * 1024; + iommu_size = check_iommu_size(info.aper_base, aper_size); + iommu_pages = iommu_size >> PAGE_SHIFT; + + iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, + get_order(iommu_pages/8)); + if (!iommu_gart_bitmap) + panic("Cannot allocate iommu bitmap\n"); memset(iommu_gart_bitmap, 0, iommu_pages/8); #ifdef CONFIG_IOMMU_LEAK - if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, + if (leak_trace) { + iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, get_order(iommu_pages*sizeof(void *))); - if (iommu_leak_tab) - memset(iommu_leak_tab, 0, iommu_pages * 8); + if (iommu_leak_tab) + memset(iommu_leak_tab, 0, iommu_pages * 8); else - printk("PCI-DMA: Cannot allocate leak trace area\n"); - } + printk(KERN_DEBUG + "PCI-DMA: Cannot allocate leak trace area\n"); + } #endif - /* + /* * Out of IOMMU space handling. - * Reserve some invalid pages at the beginning of the GART. - */ - set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + * Reserve some invalid pages at the beginning of the GART. + */ + set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - agp_memory_reserved = iommu_size; + agp_memory_reserved = iommu_size; printk(KERN_INFO "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", - iommu_size>>20); + iommu_size >> 20); - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; bad_dma_address = iommu_bus_base; iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); - /* + /* * Unmap the IOMMU part of the GART. The alias of the page is * always mapped with cache enabled and there is no full cache * coherency across the GART remapping. The unmapping avoids * automatic prefetches from the CPU allocating cache lines in * there. All CPU accesses are done via the direct mapping to * the backing memory. The GART address is only used by PCI - * devices. + * devices. 
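 * (This is what the clear_kernel_mapping() call below does: it drops
 * the direct-mapping alias of the IOMMU window.)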
*/ clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); - /* - * Try to workaround a bug (thanks to BenH) - * Set unmapped entries to a scratch page instead of 0. + /* + * Try to workaround a bug (thanks to BenH) + * Set unmapped entries to a scratch page instead of 0. * Any prefetches that hit unmapped entries won't get an bus abort * then. */ - scratch = get_zeroed_page(GFP_KERNEL); - if (!scratch) + scratch = get_zeroed_page(GFP_KERNEL); + if (!scratch) panic("Cannot allocate iommu scratch page"); gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); - for (i = EMERGENCY_PAGES; i < iommu_pages; i++) + for (i = EMERGENCY_PAGES; i < iommu_pages; i++) iommu_gatt_base[i] = gart_unmapped_entry; flush_gart(); dma_ops = &gart_dma_ops; -} +} void __init gart_parse_options(char *p) { int arg; #ifdef CONFIG_IOMMU_LEAK - if (!strncmp(p,"leak",4)) { + if (!strncmp(p, "leak", 4)) { leak_trace = 1; p += 4; if (*p == '=') ++p; @@ -723,18 +767,18 @@ void __init gart_parse_options(char *p) #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; - if (!strncmp(p, "fullflush",8)) + if (!strncmp(p, "fullflush", 8)) iommu_fullflush = 1; - if (!strncmp(p, "nofullflush",11)) + if (!strncmp(p, "nofullflush", 11)) iommu_fullflush = 0; - if (!strncmp(p,"noagp",5)) + if (!strncmp(p, "noagp", 5)) no_agp = 1; - if (!strncmp(p, "noaperture",10)) + if (!strncmp(p, "noaperture", 10)) fix_aperture = 0; /* duplicated from pci-dma.c */ - if (!strncmp(p,"force",5)) + if (!strncmp(p, "force", 5)) gart_iommu_aperture_allowed = 1; - if (!strncmp(p,"allowed",7)) + if (!strncmp(p, "allowed", 7)) gart_iommu_aperture_allowed = 1; if (!strncmp(p, "memaper", 7)) { fallback_aper_force = 1; -- cgit v1.2.3 From 2f36fa13ce49ffd000249feaedfcbefbcc83a72f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:12 +0100 Subject: x86: clean up arch/x86/kernel/e820_64.c White space and coding style cleanup. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820_64.c | 305 +++++++++++++++++++++++++++------------------- 1 file changed, 177 insertions(+), 128 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 04698e0b056c..d41cd2f01733 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -1,4 +1,4 @@ -/* +/* * Handle the memory map. * The functions here do the job until bootmem takes over. * @@ -29,44 +29,44 @@ struct e820map e820; -/* +/* * PFN of last memory page. */ -unsigned long end_pfn; +unsigned long end_pfn; EXPORT_SYMBOL(end_pfn); -/* +/* * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. * The direct mapping extends to end_pfn_map, so that we can directly access * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long end_pfn_map; + */ +unsigned long end_pfn_map; -/* +/* * Last pfn which the user wants to use. 
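 * (Defaults to MAXMEM>>PAGE_SHIFT and is lowered by the "mem=" early
 * parameter; e820_end_of_ram() clips end_pfn against it.)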
*/ static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; extern struct resource code_resource, data_resource, bss_resource; -/* Check for some hardcoded bad areas that early boot is not allowed to touch */ +/* Check for some hardcoded bad areas that early boot is not allowed to touch */ static inline int bad_addr(unsigned long *addrp, unsigned long size) -{ - unsigned long addr = *addrp, last = addr + size; +{ + unsigned long addr = *addrp, last = addr + size; /* various gunk below that needed for SMP startup */ - if (addr < 0x8000) { + if (addr < 0x8000) { *addrp = PAGE_ALIGN(0x8000); - return 1; + return 1; } /* direct mapping tables of the kernel */ - if (last >= table_start<= table_start<= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { @@ -97,9 +97,9 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) return 1; } #endif - /* XXX ramdisk image here? */ + /* XXX ramdisk image here? */ return 0; -} +} /* * This function checks if any part of the range is mapped @@ -107,16 +107,18 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) */ int e820_any_mapped(unsigned long start, unsigned long end, unsigned type) -{ +{ int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (type && ei->type != type) continue; if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } + continue; + return 1; + } return 0; } EXPORT_SYMBOL_GPL(e820_any_mapped); @@ -127,11 +129,14 @@ EXPORT_SYMBOL_GPL(e820_any_mapped); * Note: this function only works correct if the e820 table is sorted and * not-overlapping, which is the case */ -int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) +int __init e820_all_mapped(unsigned long start, unsigned long end, + unsigned type) { int i; + for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) continue; /* is the region (part) in overlap with the current region ?*/ @@ -143,59 +148,66 @@ int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type */ if (ei->addr <= start) start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full coverage */ + /* + * if start is now at or beyond end, we're done, full + * coverage + */ if (start >= end) - return 1; /* we're done */ + return 1; } return 0; } -/* - * Find a free area in a specific range. - */ -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long addr = ei->addr, last; - if (ei->type != E820_RAM) - continue; - if (addr < start) +/* + * Find a free area in a specific range. 
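 * (Walks the e820 map, considers only E820_RAM entries, steps the
 * candidate address past bad_addr() ranges, and returns the first fit
 * below 'end', or -1UL when nothing matches.)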
+ */ +unsigned long __init find_e820_area(unsigned long start, unsigned long end, + unsigned size) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long addr = ei->addr, last; + + if (ei->type != E820_RAM) + continue; + if (addr < start) addr = start; - if (addr > ei->addr + ei->size) - continue; + if (addr > ei->addr + ei->size) + continue; while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) ; last = PAGE_ALIGN(addr) + size; if (last > ei->addr + ei->size) continue; - if (last > end) + if (last > end) continue; - return addr; - } - return -1UL; -} + return addr; + } + return -1UL; +} /* * Find the highest page frame number we have available */ unsigned long __init e820_end_of_ram(void) { - unsigned long end_pfn = 0; + unsigned long end_pfn; + end_pfn = find_max_pfn_with_active_regions(); - - if (end_pfn > end_pfn_map) + + if (end_pfn > end_pfn_map) end_pfn_map = end_pfn; if (end_pfn_map > MAXMEM>>PAGE_SHIFT) end_pfn_map = MAXMEM>>PAGE_SHIFT; if (end_pfn > end_user_pfn) end_pfn = end_user_pfn; - if (end_pfn > end_pfn_map) - end_pfn = end_pfn_map; + if (end_pfn > end_pfn_map) + end_pfn = end_pfn_map; - printk("end_pfn_map = %lu\n", end_pfn_map); - return end_pfn; + printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map); + return end_pfn; } /* @@ -219,9 +231,9 @@ void __init e820_reserve_resources(void) request_resource(&iomem_resource, res); if (e820.map[i].type == E820_RAM) { /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. */ request_resource(res, &code_resource); request_resource(res, &data_resource); @@ -322,9 +334,9 @@ e820_register_active_regions(int nid, unsigned long start_pfn, add_active_range(nid, ei_startpfn, ei_endpfn); } -/* +/* * Add a memory region to the kernel e820 map. - */ + */ void __init add_memory_region(unsigned long start, unsigned long size, int type) { int x = e820.nr_map; @@ -349,9 +361,7 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long end_pfn = end >> PAGE_SHIFT; - unsigned long ei_startpfn; - unsigned long ei_endpfn; - unsigned long ram = 0; + unsigned long ei_startpfn, ei_endpfn, ram = 0; int i; for (i = 0; i < e820.nr_map; i++) { @@ -369,22 +379,25 @@ void __init e820_print_map(char *who) for (i = 0; i < e820.nr_map; i++) { printk(KERN_INFO " %s: %016Lx - %016Lx ", who, - (unsigned long long) e820.map[i].addr, - (unsigned long long) (e820.map[i].addr + e820.map[i].size)); + (unsigned long long) e820.map[i].addr, + (unsigned long long) + (e820.map[i].addr + e820.map[i].size)); switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; + case E820_RAM: + printk(KERN_CONT "(usable)\n"); + break; case E820_RESERVED: - printk("(reserved)\n"); - break; + printk(KERN_CONT "(reserved)\n"); + break; case E820_ACPI: - printk("(ACPI data)\n"); - break; + printk(KERN_CONT "(ACPI data)\n"); + break; case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %u\n", e820.map[i].type); - break; + printk(KERN_CONT "(ACPI NVS)\n"); + break; + default: + printk(KERN_CONT "type %u\n", e820.map[i].type); + break; } } } @@ -392,11 +405,11 @@ void __init e820_print_map(char *who) /* * Sanitize the BIOS e820 map. * - * Some e820 responses include overlapping entries. 
The following + * Some e820 responses include overlapping entries. The following * replaces the original e820 map with a new one, removing overlaps. * */ -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) { struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -416,7 +429,8 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) int i; /* - Visually we're performing the following (1,2,3,4 = memory types)... + Visually we're performing the following + (1,2,3,4 = memory types)... Sample memory map (w/overlaps): ____22__________________ @@ -458,22 +472,23 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) old_nr = *pnr_map; /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; iaddr = biosmap[i].addr; change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx]->addr = biosmap[i].addr + + biosmap[i].size; change_point[chgidx++]->pbios = &biosmap[i]; } } @@ -483,75 +498,106 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) still_changing = 1; while (still_changing) { still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if > , swap */ - /* or, if current= & last=, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { + for (i = 1; i < chg_nr; i++) { + unsigned long long curaddr, lastaddr; + unsigned long long curpbaddr, lastpbaddr; + + curaddr = change_point[i]->addr; + lastaddr = change_point[i - 1]->addr; + curpbaddr = change_point[i]->pbios->addr; + lastpbaddr = change_point[i - 1]->pbios->addr; + + /* + * swap entries, when: + * + * curaddr > lastaddr or + * curaddr == lastaddr and curaddr == curpbaddr and + * lastaddr != lastpbaddr + */ + if (curaddr < lastaddr || + (curaddr == lastaddr && curaddr == curpbaddr && + lastaddr != lastpbaddr)) { change_tmp = change_point[i]; change_point[i] = change_point[i-1]; change_point[i-1] = change_tmp; - still_changing=1; + still_changing = 1; } } } /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ + overlap_entries = 0; /* number of entries in the overlap table */ + new_bios_entry = 0; /* index for creating new bios map entries */ last_type = 0; /* start with undefined memory type */ last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { + for (chgidx = 0; chgidx < chg_nr; chgidx++) { /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; ipbios) - overlap_list[i] = overlap_list[overlap_entries-1]; + if (change_point[chgidx]->addr == + change_point[chgidx]->pbios->addr) { + /* + * add map entry to overlap list (> 1 entry + * implies an overlap) + */ + 
overlap_list[overlap_entries++] = + change_point[chgidx]->pbios; + } else { + /* + * remove entry from list (order independent, + * so swap with last) + */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == + change_point[chgidx]->pbios) + overlap_list[i] = + overlap_list[overlap_entries-1]; } overlap_entries--; } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + /* + * if there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ current_type = 0; - for (i=0; itype > current_type) current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ + /* + * continue building up new bios map based on this + * information + */ if (current_type != last_type) { if (last_type != 0) { new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ + /* + * move forward only if the new size + * was non-zero + */ if (new_bios[new_bios_entry].size != 0) + /* + * no more space left for new + * bios entries ? + */ if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ + break; } if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].addr = + change_point[chgidx]->addr; new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; + last_addr = change_point[chgidx]->addr; } last_type = current_type; } } - new_nr = new_bios_entry; /* retain count for new bios entries */ + /* retain count for new bios entries */ + new_nr = new_bios_entry; /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); *pnr_map = new_nr; return 0; @@ -566,7 +612,7 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. */ -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +static int __init copy_e820_map(struct e820entry *biosmap, int nr_map) { /* Only one memory region (or negative)? Ignore it */ if (nr_map < 2) @@ -583,7 +629,7 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) return -1; add_memory_region(start, size, type); - } while (biosmap++,--nr_map); + } while (biosmap++, --nr_map); return 0; } @@ -613,9 +659,9 @@ static int __init parse_memopt(char *p) if (!p) return -EINVAL; end_user_pfn = memparse(p, &p); - end_user_pfn >>= PAGE_SHIFT; + end_user_pfn >>= PAGE_SHIFT; return 0; -} +} early_param("mem", parse_memopt); static int userdef __initdata; @@ -627,9 +673,9 @@ static int __init parse_memmap_opt(char *p) if (!strcmp(p, "exactmap")) { #ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is + /* + * If we are doing a crash dump, we still need to know + * the real mem size before original memory map is * reset. 
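 * (Hence the e820_register_active_regions(0, 0, -1UL) call right below,
 * which records the current RAM ranges before the map is thrown away.)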
*/ e820_register_active_regions(0, 0, -1UL); @@ -713,8 +759,10 @@ __init void e820_setup_gap(void) if (!found) { gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " + "address range\n" + KERN_ERR "PCI: Unassigned devices with 32bit resource " + "registers may break!\n"); } /* @@ -727,8 +775,9 @@ __init void e820_setup_gap(void) /* Fun with two's complement */ pci_mem_start = (gapstart + round) & -round; - printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); + printk(KERN_INFO + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); } int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) -- cgit v1.2.3 From 78aa1f66f77da078357bd263fcac95fbf6bca15b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:13 +0100 Subject: x86: clean up arch/x86/kernel/ldt_32/64.c White space and coding style clenaup. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/ldt_32.c | 60 ++++++++++++++++++++++------------------- arch/x86/kernel/ldt_64.c | 69 +++++++++++++++++++++++++----------------------- 2 files changed, 69 insertions(+), 60 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt_32.c index 9ff90a27c45f..e366c5fd0d19 100644 --- a/arch/x86/kernel/ldt_32.c +++ b/arch/x86/kernel/ldt_32.c @@ -17,7 +17,7 @@ #include #include -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ +#ifdef CONFIG_SMP static void flush_ldt(void *null) { if (current->active_mm) @@ -34,19 +34,20 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (mincount <= pc->size) return 0; oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + mincount = (mincount + 511) & (~511); + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount * LDT_ENTRY_SIZE); else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + newldt = kmalloc(mincount * LDT_ENTRY_SIZE, GFP_KERNEL); if (!newldt) return -ENOMEM; if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, + (mincount - oldsize) * LDT_ENTRY_SIZE); pc->ldt = newldt; wmb(); pc->size = mincount; @@ -55,6 +56,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (reload) { #ifdef CONFIG_SMP cpumask_t mask; + preempt_disable(); load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); @@ -66,7 +68,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #endif } if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(oldldt); else kfree(oldldt); @@ -77,9 +79,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { int err = alloc_ldt(new, old->size, 0); + if (err < 0) return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); return 0; } @@ -89,7 +92,7 @@ static inline int copy_ldt(mm_context_t *new, 
mm_context_t *old) */ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { - struct mm_struct * old_mm; + struct mm_struct *old_mm; int retval = 0; mutex_init(&mm->context.lock); @@ -111,7 +114,7 @@ void destroy_context(struct mm_struct *mm) if (mm->context.size) { if (mm == current->active_mm) clear_LDT(); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(mm->context.ldt); else kfree(mm->context.ldt); @@ -119,19 +122,19 @@ void destroy_context(struct mm_struct *mm) } } -static int read_ldt(void __user * ptr, unsigned long bytecount) +static int read_ldt(void __user *ptr, unsigned long bytecount) { int err; unsigned long size; - struct mm_struct * mm = current->mm; + struct mm_struct *mm = current->mm; if (!mm->context.size) return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; mutex_lock(&mm->context.lock); - size = mm->context.size*LDT_ENTRY_SIZE; + size = mm->context.size * LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -143,7 +146,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) goto error_return; if (size != bytecount) { /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { + if (clear_user(ptr + size, bytecount - size) != 0) { err = -EFAULT; goto error_return; } @@ -153,13 +156,13 @@ error_return: return err; } -static int read_default_ldt(void __user * ptr, unsigned long bytecount) +static int read_default_ldt(void __user *ptr, unsigned long bytecount) { int err; unsigned long size; err = 0; - size = 5*sizeof(struct desc_struct); + size = 5 * sizeof(struct desc_struct); if (size > bytecount) size = bytecount; @@ -170,9 +173,9 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount) return err; } -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { - struct mm_struct * mm = current->mm; + struct mm_struct *mm = current->mm; __u32 entry_1, entry_2; int error; struct user_desc ldt_info; @@ -180,7 +183,7 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) error = -EINVAL; if (bytecount != sizeof(ldt_info)) goto out; - error = -EFAULT; + error = -EFAULT; if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) goto out; @@ -196,13 +199,14 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) mutex_lock(&mm->context.lock); if (ldt_info.entry_number >= mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); + error = alloc_ldt(¤t->mm->context, + ldt_info.entry_number + 1, 1); if (error < 0) goto out_unlock; } - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { if (oldmode || LDT_empty(&ldt_info)) { entry_1 = 0; entry_2 = 0; @@ -217,7 +221,8 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) /* Install the new entry ... 
*/ install: - write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2); + write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, + entry_2); error = 0; out_unlock: @@ -226,7 +231,8 @@ out: return error; } -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +asmlinkage int sys_modify_ldt(int func, void __user *ptr, + unsigned long bytecount) { int ret = -ENOSYS; diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c index 60e57abb8e90..6c41db367de8 100644 --- a/arch/x86/kernel/ldt_64.c +++ b/arch/x86/kernel/ldt_64.c @@ -2,7 +2,7 @@ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds * Copyright (C) 1999 Ingo Molnar * Copyright (C) 2002 Andi Kleen - * + * * This handles calls from both 32bit and 64bit mode. */ @@ -20,11 +20,11 @@ #include #include -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ +#ifdef CONFIG_SMP static void flush_ldt(void *null) { if (current->active_mm) - load_LDT(¤t->active_mm->context); + load_LDT(¤t->active_mm->context); } #endif @@ -37,19 +37,20 @@ static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) if (mincount <= (unsigned)pc->size) return 0; oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + mincount = (mincount + 511) & (~511); + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount * LDT_ENTRY_SIZE); else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + newldt = kmalloc(mincount * LDT_ENTRY_SIZE, GFP_KERNEL); if (!newldt) return -ENOMEM; if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, + (mincount - oldsize) * LDT_ENTRY_SIZE); wmb(); pc->ldt = newldt; wmb(); @@ -70,7 +71,7 @@ static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) #endif } if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(oldldt); else kfree(oldldt); @@ -81,9 +82,10 @@ static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { int err = alloc_ldt(new, old->size, 0); + if (err < 0) return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); return 0; } @@ -93,7 +95,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old) */ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { - struct mm_struct * old_mm; + struct mm_struct *old_mm; int retval = 0; mutex_init(&mm->context.lock); @@ -108,13 +110,12 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) } /* - * * Don't touch the LDT register - we're already in the next thread. 
*/ void destroy_context(struct mm_struct *mm) { if (mm->context.size) { - if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) + if ((unsigned)mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(mm->context.ldt); else kfree(mm->context.ldt); @@ -122,19 +123,19 @@ void destroy_context(struct mm_struct *mm) } } -static int read_ldt(void __user * ptr, unsigned long bytecount) +static int read_ldt(void __user *ptr, unsigned long bytecount) { int err; unsigned long size; - struct mm_struct * mm = current->mm; + struct mm_struct *mm = current->mm; if (!mm->context.size) return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; mutex_lock(&mm->context.lock); - size = mm->context.size*LDT_ENTRY_SIZE; + size = mm->context.size * LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -146,7 +147,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) goto error_return; if (size != bytecount) { /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { + if (clear_user(ptr + size, bytecount - size) != 0) { err = -EFAULT; goto error_return; } @@ -156,21 +157,21 @@ error_return: return err; } -static int read_default_ldt(void __user * ptr, unsigned long bytecount) +static int read_default_ldt(void __user *ptr, unsigned long bytecount) { - /* Arbitrary number */ + /* Arbitrary number */ /* x86-64 default LDT is all zeros */ - if (bytecount > 128) - bytecount = 128; + if (bytecount > 128) + bytecount = 128; if (clear_user(ptr, bytecount)) return -EFAULT; - return bytecount; + return bytecount; } -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct task_struct *me = current; - struct mm_struct * mm = me->mm; + struct mm_struct *mm = me->mm; __u32 entry_1, entry_2, *lp; int error; struct user_desc ldt_info; @@ -179,7 +180,7 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) if (bytecount != sizeof(ldt_info)) goto out; - error = -EFAULT; + error = -EFAULT; if (copy_from_user(&ldt_info, ptr, bytecount)) goto out; @@ -195,15 +196,16 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) mutex_lock(&mm->context.lock); if (ldt_info.entry_number >= (unsigned)mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); + error = alloc_ldt(¤t->mm->context, + ldt_info.entry_number + 1, 1); if (error < 0) goto out_unlock; } - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + lp = (__u32 *)((ldt_info.entry_number << 3) + (char *)mm->context.ldt); - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { if (oldmode || LDT_empty(&ldt_info)) { entry_1 = 0; entry_2 = 0; @@ -228,7 +230,8 @@ out: return error; } -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +asmlinkage int sys_modify_ldt(int func, void __user *ptr, + unsigned long bytecount) { int ret = -ENOSYS; -- cgit v1.2.3 From fc2d625c4fac18e672a3b7c61af5de22d7ab7d87 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:13 +0100 Subject: x86: introduce ldt_write accessor Create a ldt write accessor like the 32 bit one. 
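As a rough sketch of what the accessor does, expanded here for illustration from the diff below rather than quoted from kernel headers: an LDT entry is 8 bytes, i.e. two 32-bit words, so entry N sits at byte offset N << 3 from the table base and the accessor simply stores the two halves.

static inline void write_ldt_entry(struct desc_struct *ldt, int entry,
				   u32 entry_low, u32 entry_high)
{
	/* each descriptor is two consecutive 32-bit words */
	u32 *lp = (u32 *)((entry << 3) + (char *)ldt);

	lp[0] = entry_low;	/* limit 15:0 and base 15:0 */
	lp[1] = entry_high;	/* base 23:16/31:24, attributes, limit 19:16 */
}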
Preparatory patch for merging ldt.c and anyway necessary for 64bit paravirt ops. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/ldt_64.c | 8 +++----- include/asm-x86/desc_64.h | 9 +++++++++ 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c index 6c41db367de8..d72dc7a0636f 100644 --- a/arch/x86/kernel/ldt_64.c +++ b/arch/x86/kernel/ldt_64.c @@ -172,7 +172,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct task_struct *me = current; struct mm_struct *mm = me->mm; - __u32 entry_1, entry_2, *lp; + __u32 entry_1, entry_2; int error; struct user_desc ldt_info; @@ -202,8 +202,6 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) goto out_unlock; } - lp = (__u32 *)((ldt_info.entry_number << 3) + (char *)mm->context.ldt); - /* Allow LDTs to be cleared by the user. */ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { if (oldmode || LDT_empty(&ldt_info)) { @@ -220,8 +218,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) /* Install the new entry ... */ install: - *lp = entry_1; - *(lp+1) = entry_2; + write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, + entry_2); error = 0; out_unlock: diff --git a/include/asm-x86/desc_64.h b/include/asm-x86/desc_64.h index bb2009ecbbca..7d48df72aef2 100644 --- a/include/asm-x86/desc_64.h +++ b/include/asm-x86/desc_64.h @@ -38,6 +38,15 @@ extern struct desc_struct default_ldt[]; extern struct gate_struct idt_table[]; extern struct desc_ptr cpu_gdt_descr[]; +static inline void write_ldt_entry(struct desc_struct *ldt, + int entry, u32 entry_low, u32 entry_high) +{ + __u32 *lp = (__u32 *)((entry << 3) + (char *)ldt); + + lp[0] = entry_low; + lp[1] = entry_high; +} + /* the cpu gdt accessor */ #define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address) -- cgit v1.2.3 From 70f5088dd5e9fbd3a71b3a5b01395c676158194b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:13 +0100 Subject: x86: prepare arch/x86/kernel/ldt_32/64.c for merging White space and coding style cleanups. Change unsigned to int. There is no win when we compare mincount against pc->size, which is an int as well. Casting pc->size to unsigned just might hide real problems. 
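The signed/unsigned pitfall described above is easy to demonstrate; a minimal userspace sketch (illustrative only, not kernel code):

#include <stdio.h>

int main(void)
{
	int size = -1;			/* a bogus, negative table size */
	unsigned int mincount = 16;

	/*
	 * size is converted to unsigned for the comparison, so -1
	 * becomes UINT_MAX and this branch is taken even though
	 * "16 <= -1" is nonsense.
	 */
	if (mincount <= size)
		printf("unsigned comparison hides the bad size\n");

	/* with both operands signed, the comparison behaves as intended */
	if ((int)mincount <= size)
		printf("never reached\n");

	return 0;
}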
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/ldt_32.c | 3 +-- arch/x86/kernel/ldt_64.c | 24 +++++++++++------------- 2 files changed, 12 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt_32.c index e366c5fd0d19..bb15753abaf2 100644 --- a/arch/x86/kernel/ldt_32.c +++ b/arch/x86/kernel/ldt_32.c @@ -27,8 +27,7 @@ static void flush_ldt(void *null) static int alloc_ldt(mm_context_t *pc, int mincount, int reload) { - void *oldldt; - void *newldt; + void *oldldt, *newldt; int oldsize; if (mincount <= pc->size) diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c index d72dc7a0636f..95903938e7ad 100644 --- a/arch/x86/kernel/ldt_64.c +++ b/arch/x86/kernel/ldt_64.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #ifdef CONFIG_SMP static void flush_ldt(void *null) @@ -28,13 +28,12 @@ static void flush_ldt(void *null) } #endif -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) +static int alloc_ldt(mm_context_t *pc, int mincount, int reload) { - void *oldldt; - void *newldt; - unsigned oldsize; + void *oldldt, *newldt; + int oldsize; - if (mincount <= (unsigned)pc->size) + if (mincount <= pc->size) return 0; oldsize = pc->size; mincount = (mincount + 511) & (~511); @@ -56,13 +55,14 @@ static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) wmb(); pc->size = mincount; wmb(); + if (reload) { #ifdef CONFIG_SMP cpumask_t mask; preempt_disable(); - mask = cpumask_of_cpu(smp_processor_id()); load_LDT(pc); + mask = cpumask_of_cpu(smp_processor_id()); if (!cpus_equal(current->mm->cpu_vm_mask, mask)) smp_call_function(flush_ldt, NULL, 1, 1); preempt_enable(); @@ -115,7 +115,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) void destroy_context(struct mm_struct *mm) { if (mm->context.size) { - if ((unsigned)mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(mm->context.ldt); else kfree(mm->context.ldt); @@ -170,18 +170,16 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount) static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { - struct task_struct *me = current; - struct mm_struct *mm = me->mm; + struct mm_struct *mm = current->mm; __u32 entry_1, entry_2; int error; struct user_desc ldt_info; error = -EINVAL; - if (bytecount != sizeof(ldt_info)) goto out; error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, bytecount)) + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) goto out; error = -EINVAL; @@ -195,7 +193,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) } mutex_lock(&mm->context.lock); - if (ldt_info.entry_number >= (unsigned)mm->context.size) { + if (ldt_info.entry_number >= mm->context.size) { error = alloc_ldt(¤t->mm->context, ldt_info.entry_number + 1, 1); if (error < 0) -- cgit v1.2.3 From 77e463d1040d6310211ac5162729f5d4afc4dd8c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:14 +0100 Subject: x86: merge arch/x86/kernel/ldt_32/64.c Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile_32 | 2 +- arch/x86/kernel/Makefile_64 | 2 +- arch/x86/kernel/ldt.c | 264 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ldt_32.c | 253 ------------------------------------------ arch/x86/kernel/ldt_64.c | 249 ----------------------------------------- 5 files changed, 266 insertions(+), 504 deletions(-) create mode 100644 
arch/x86/kernel/ldt.c delete mode 100644 arch/x86/kernel/ldt_32.c delete mode 100644 arch/x86/kernel/ldt_64.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32 index 0cc1981d1e38..31ff982bc26b 100644 --- a/arch/x86/kernel/Makefile_32 +++ b/arch/x86/kernel/Makefile_32 @@ -6,7 +6,7 @@ extra-y := head_32.o init_task.o vmlinux.lds CPPFLAGS_vmlinux.lds += -Ui386 obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \ - ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \ + ptrace_32.o time_32.o ioport_32.o ldt.o setup_32.o i8259_32.o sys_i386_32.o \ pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\ quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o io_delay.o diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64 index 08a68f0d8fda..9cb3df27c413 100644 --- a/arch/x86/kernel/Makefile_64 +++ b/arch/x86/kernel/Makefile_64 @@ -7,7 +7,7 @@ CPPFLAGS_vmlinux.lds += -Ux86_64 EXTRA_AFLAGS := -traditional obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \ - ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \ + ptrace_64.o time_64.o ioport_64.o ldt.o setup_64.o i8259_64.o sys_x86_64.o \ x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \ setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \ pci-dma_64.o pci-nommu_64.o alternative.o hpet.o tsc_64.o bugs_64.o \ diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c new file mode 100644 index 000000000000..a8cdca3615bf --- /dev/null +++ b/arch/x86/kernel/ldt.c @@ -0,0 +1,264 @@ +/* + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_SMP +static void flush_ldt(void *null) +{ + if (current->active_mm) + load_LDT(¤t->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, int mincount, int reload) +{ + void *oldldt, *newldt; + int oldsize; + + if (mincount <= pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount + 511) & (~511); + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount * LDT_ENTRY_SIZE); + else + newldt = kmalloc(mincount * LDT_ENTRY_SIZE, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, + (mincount - oldsize) * LDT_ENTRY_SIZE); + +#ifdef CONFIG_X86_64 + /* CHECKME: Do we really need this ? */ + wmb(); +#endif + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + + if (reload) { +#ifdef CONFIG_SMP + cpumask_t mask; + + preempt_disable(); + load_LDT(pc); + mask = cpumask_of_cpu(smp_processor_id()); + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + smp_call_function(flush_ldt, NULL, 1, 1); + preempt_enable(); +#else + load_LDT(pc); +#endif + } + if (oldsize) { + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + kfree(oldldt); + } + return 0; +} + +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + + if (err < 0) + return err; + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. 
+ */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct *old_mm; + int retval = 0; + + mutex_init(&mm->context.lock); + mm->context.size = 0; + old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + mutex_lock(&old_mm->context.lock); + retval = copy_ldt(&mm->context, &old_mm->context); + mutex_unlock(&old_mm->context.lock); + } + return retval; +} + +/* + * No need to lock the MM as we are the last user + * + * 64bit: Don't touch the LDT register - we're already in the next thread. + */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { +#ifdef CONFIG_X86_32 + /* CHECKME: Can this ever happen ? */ + if (mm == current->active_mm) + clear_LDT(); +#endif + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + kfree(mm->context.ldt); + mm->context.size = 0; + } +} + +static int read_ldt(void __user *ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct *mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; + + mutex_lock(&mm->context.lock); + size = mm->context.size * LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + mutex_unlock(&mm->context.lock); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr + size, bytecount - size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user *ptr, unsigned long bytecount) +{ + /* CHECKME: Can we use _one_ random number ? */ +#ifdef CONFIG_X86_32 + unsigned long size = 5 * sizeof(struct desc_struct); +#else + unsigned long size = 128; +#endif + if (bytecount > size) + bytecount = size; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; +} + +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) +{ + struct mm_struct *mm = current->mm; + __u32 entry_1, entry_2; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + mutex_lock(&mm->context.lock); + if (ldt_info.entry_number >= mm->context.size) { + error = alloc_ldt(¤t->mm->context, + ldt_info.entry_number + 1, 1); + if (error < 0) + goto out_unlock; + } + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + entry_1 = 0; + entry_2 = 0; + goto install; + } + } + + entry_1 = LDT_entry_a(&ldt_info); + entry_2 = LDT_entry_b(&ldt_info); + if (oldmode) + entry_2 &= ~(1 << 20); + + /* Install the new entry ... 
*/ +install: + write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, + entry_2); + error = 0; + +out_unlock: + mutex_unlock(&mm->context.lock); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, + unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt_32.c deleted file mode 100644 index bb15753abaf2..000000000000 --- a/arch/x86/kernel/ldt_32.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#ifdef CONFIG_SMP -static void flush_ldt(void *null) -{ - if (current->active_mm) - load_LDT(¤t->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, int mincount, int reload) -{ - void *oldldt, *newldt; - int oldsize; - - if (mincount <= pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount + 511) & (~511); - if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount * LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount * LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, - (mincount - oldsize) * LDT_ENTRY_SIZE); - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - - if (reload) { -#ifdef CONFIG_SMP - cpumask_t mask; - - preempt_disable(); - load_LDT(pc); - mask = cpumask_of_cpu(smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); - preempt_enable(); -#else - load_LDT(pc); -#endif - } - if (oldsize) { - if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - - if (err < 0) - return err; - memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); - return 0; -} - -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. 
- */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct *old_mm; - int retval = 0; - - mutex_init(&mm->context.lock); - mm->context.size = 0; - old_mm = current->mm; - if (old_mm && old_mm->context.size > 0) { - mutex_lock(&old_mm->context.lock); - retval = copy_ldt(&mm->context, &old_mm->context); - mutex_unlock(&old_mm->context.lock); - } - return retval; -} - -/* - * No need to lock the MM as we are the last user - */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - if (mm == current->active_mm) - clear_LDT(); - if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } -} - -static int read_ldt(void __user *ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct *mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; - - mutex_lock(&mm->context.lock); - size = mm->context.size * LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - mutex_unlock(&mm->context.lock); - if (err < 0) - goto error_return; - if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr + size, bytecount - size) != 0) { - err = -EFAULT; - goto error_return; - } - } - return bytecount; -error_return: - return err; -} - -static int read_default_ldt(void __user *ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - - err = 0; - size = 5 * sizeof(struct desc_struct); - if (size > bytecount) - size = bytecount; - - err = size; - if (clear_user(ptr, size)) - err = -EFAULT; - - return err; -} - -static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) -{ - struct mm_struct *mm = current->mm; - __u32 entry_1, entry_2; - int error; - struct user_desc ldt_info; - - error = -EINVAL; - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - mutex_lock(&mm->context.lock); - if (ldt_info.entry_number >= mm->context.size) { - error = alloc_ldt(¤t->mm->context, - ldt_info.entry_number + 1, 1); - if (error < 0) - goto out_unlock; - } - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); - if (oldmode) - entry_2 &= ~(1 << 20); - - /* Install the new entry ... 
*/ -install: - write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, - entry_2); - error = 0; - -out_unlock: - mutex_unlock(&mm->context.lock); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, - unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c deleted file mode 100644 index 95903938e7ad..000000000000 --- a/arch/x86/kernel/ldt_64.c +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar - * Copyright (C) 2002 Andi Kleen - * - * This handles calls from both 32bit and 64bit mode. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#ifdef CONFIG_SMP -static void flush_ldt(void *null) -{ - if (current->active_mm) - load_LDT(¤t->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, int mincount, int reload) -{ - void *oldldt, *newldt; - int oldsize; - - if (mincount <= pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount + 511) & (~511); - if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount * LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount * LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, - (mincount - oldsize) * LDT_ENTRY_SIZE); - wmb(); - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - - if (reload) { -#ifdef CONFIG_SMP - cpumask_t mask; - - preempt_disable(); - load_LDT(pc); - mask = cpumask_of_cpu(smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); - preempt_enable(); -#else - load_LDT(pc); -#endif - } - if (oldsize) { - if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - - if (err < 0) - return err; - memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); - return 0; -} - -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct *old_mm; - int retval = 0; - - mutex_init(&mm->context.lock); - mm->context.size = 0; - old_mm = current->mm; - if (old_mm && old_mm->context.size > 0) { - mutex_lock(&old_mm->context.lock); - retval = copy_ldt(&mm->context, &old_mm->context); - mutex_unlock(&old_mm->context.lock); - } - return retval; -} - -/* - * Don't touch the LDT register - we're already in the next thread. 
- */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } -} - -static int read_ldt(void __user *ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct *mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; - - mutex_lock(&mm->context.lock); - size = mm->context.size * LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - mutex_unlock(&mm->context.lock); - if (err < 0) - goto error_return; - if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr + size, bytecount - size) != 0) { - err = -EFAULT; - goto error_return; - } - } - return bytecount; -error_return: - return err; -} - -static int read_default_ldt(void __user *ptr, unsigned long bytecount) -{ - /* Arbitrary number */ - /* x86-64 default LDT is all zeros */ - if (bytecount > 128) - bytecount = 128; - if (clear_user(ptr, bytecount)) - return -EFAULT; - return bytecount; -} - -static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) -{ - struct mm_struct *mm = current->mm; - __u32 entry_1, entry_2; - int error; - struct user_desc ldt_info; - - error = -EINVAL; - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - mutex_lock(&mm->context.lock); - if (ldt_info.entry_number >= mm->context.size) { - error = alloc_ldt(¤t->mm->context, - ldt_info.entry_number + 1, 1); - if (error < 0) - goto out_unlock; - } - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); - if (oldmode) - entry_2 &= ~(1 << 20); - - /* Install the new entry ... */ -install: - write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, - entry_2); - error = 0; - -out_unlock: - mutex_unlock(&mm->context.lock); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, - unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} -- cgit v1.2.3 From 37e650c7c8a27de533d409b53c29f4135dcc7af6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:14 +0100 Subject: x86: rename get_maxlvt to lapic_get_maxlvt Use the same name for the 32 and 64 bit variant. 
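For reference, a sketch of what the renamed helper computes, reconstructed under the usual local APIC register layout (the changed call sites are in the diff below): bits 23:16 of the APIC version register hold the index of the highest local vector table (LVT) entry.

int lapic_get_maxlvt(void)
{
	unsigned int v = apic_read(APIC_LVR);	/* local APIC version register */

	/* GET_APIC_MAXLVT(): bits 23:16 give the highest LVT index */
	return (v >> 16) & 0xff;
}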
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 12 ++++++------ arch/x86/kernel/io_apic_64.c | 2 +- arch/x86/kernel/smpboot_64.c | 2 +- include/asm-x86/apic_64.h | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index fa6cdee6d303..dfeda91fa80c 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -160,7 +160,7 @@ void enable_NMI_through_LVT0 (void * dummy) apic_write(APIC_LVT0, v); } -int get_maxlvt(void) +int lapic_get_maxlvt(void) { unsigned int v, maxlvt; @@ -194,7 +194,7 @@ void clear_local_APIC(void) int maxlvt; unsigned int v; - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); /* * Masking an LVT entry can trigger a local APIC error @@ -333,7 +333,7 @@ int __init verify_local_APIC(void) reg1 = GET_APIC_VERSION(reg0); if (reg1 == 0x00 || reg1 == 0xff) return 0; - reg1 = get_maxlvt(); + reg1 = lapic_get_maxlvt(); if (reg1 < 0x02 || reg1 == 0xff) return 0; @@ -519,7 +519,7 @@ void __cpuinit setup_local_APIC (void) { unsigned oldvalue; - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); oldvalue = apic_read(APIC_ESR); value = ERROR_APIC_VECTOR; // enables sending errors apic_write(APIC_LVTERR, value); @@ -571,7 +571,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) if (!apic_pm_state.active) return 0; - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); apic_pm_state.apic_id = apic_read(APIC_ID); apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); @@ -605,7 +605,7 @@ static int lapic_resume(struct sys_device *dev) if (!apic_pm_state.active) return 0; - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); local_irq_save(flags); rdmsr(MSR_IA32_APICBASE, l, h); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 23a3ac06a23e..d4f5286101a9 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1069,7 +1069,7 @@ void __apicdebuginit print_local_APIC(void * dummy) v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c index aaf4e1291217..8147b7d4db63 100644 --- a/arch/x86/kernel/smpboot_64.c +++ b/arch/x86/kernel/smpboot_64.c @@ -466,7 +466,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta */ Dprintk("#startup loops: %d.\n", num_starts); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); for (j = 1; j <= num_starts; j++) { Dprintk("Sending STARTUP #%d.\n",j); diff --git a/include/asm-x86/apic_64.h b/include/asm-x86/apic_64.h index 9a0ec02a49a1..b5f850f25114 100644 --- a/include/asm-x86/apic_64.h +++ b/include/asm-x86/apic_64.h @@ -64,7 +64,7 @@ static inline void ack_APIC_irq(void) apic_write(APIC_EOI, 0); } -extern int get_maxlvt(void); +extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void connect_bsp_APIC(void); extern void disconnect_bsp_APIC(int virt_wire_setup); -- cgit v1.2.3 From 3c6bb07ac1b4174318606a26f0de8ceb9f6d8133 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:15 +0100 Subject: x86: use u32 for safe_apic_wait_icr_idle() Preparatory patch for merging apic headers.
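A sketch of the loop behind safe_apic_wait_icr_idle(), reconstructed for illustration rather than quoted from the tree: it polls the delivery-status (busy) bit of the ICR a bounded number of times and returns the final busy state, a 32-bit register field, which is why u32 is the natural return type on both 32-bit and 64-bit.

u32 safe_apic_wait_icr_idle(void)
{
	u32 send_status;
	int timeout = 0;

	do {
		/* poll the delivery-status (busy) bit in the ICR */
		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
		if (!send_status)
			break;
		udelay(100);
	} while (timeout++ < 1000);

	return send_status;	/* 0 if idle, busy bit if we timed out */
}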
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 4 ++-- arch/x86/kernel/smpboot_64.c | 11 ++++++----- include/asm-x86/apic_64.h | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index dfeda91fa80c..3de3764a862c 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -135,9 +135,9 @@ void apic_wait_icr_idle(void) cpu_relax(); } -unsigned int safe_apic_wait_icr_idle(void) +u32 safe_apic_wait_icr_idle(void) { - unsigned int send_status; + u32 send_status; int timeout; timeout = 0; diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c index 8147b7d4db63..b36d32ff0b39 100644 --- a/arch/x86/kernel/smpboot_64.c +++ b/arch/x86/kernel/smpboot_64.c @@ -384,19 +384,20 @@ static void inquire_remote_apic(int apicid) unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; int timeout; - unsigned int status; + u32 status; printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); for (i = 0; i < ARRAY_SIZE(regs); i++) { - printk("... APIC #%d %s: ", apicid, names[i]); + printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]); /* * Wait for idle. */ status = safe_apic_wait_icr_idle(); if (status) - printk("a previous APIC delivery may have failed\n"); + printk(KERN_CONT + "a previous APIC delivery may have failed\n"); apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); @@ -410,10 +411,10 @@ static void inquire_remote_apic(int apicid) switch (status) { case APIC_ICR_RR_VALID: status = apic_read(APIC_RRR); - printk("%08x\n", status); + printk(KERN_CONT "%08x\n", status); break; default: - printk("failed\n"); + printk(KERN_CONT "failed\n"); } } } diff --git a/include/asm-x86/apic_64.h b/include/asm-x86/apic_64.h index b5f850f25114..7bfad0224178 100644 --- a/include/asm-x86/apic_64.h +++ b/include/asm-x86/apic_64.h @@ -49,7 +49,7 @@ static __inline unsigned int apic_read(unsigned long reg) } extern void apic_wait_icr_idle(void); -extern unsigned int safe_apic_wait_icr_idle(void); +extern u32 safe_apic_wait_icr_idle(void); static inline void ack_APIC_irq(void) { -- cgit v1.2.3 From 42e0a9aa5d467188687c6b705412578e53c14af6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:15 +0100 Subject: x86: use u32 for some lapic functions Use u32 so 32 and 64bit have the same interface. 
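The underlying accessors are plain 32-bit MMIO loads and stores, so a fixed-width u32 fits both architectures while unsigned long changes width between them; a minimal sketch mirroring the native_apic_* helpers in the diff below:

static inline void native_apic_write(unsigned long reg, u32 v)
{
	*((volatile u32 *)(APIC_BASE + reg)) = v;	/* 32-bit MMIO store */
}

static inline u32 native_apic_read(unsigned long reg)
{
	return *((volatile u32 *)(APIC_BASE + reg));	/* 32-bit MMIO load */
}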
Andrew Morton: xen, lguest build fixes Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 4 ++-- arch/x86/lguest/boot.c | 4 ++-- arch/x86/xen/enlighten.c | 4 ++-- include/asm-x86/apic_32.h | 16 +++++++--------- include/asm-x86/apic_64.h | 6 +++--- include/asm-x86/paravirt.h | 12 ++++++------ 6 files changed, 22 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index a56c782653be..3a069acb270c 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -135,9 +135,9 @@ void apic_wait_icr_idle(void) cpu_relax(); } -unsigned long safe_apic_wait_icr_idle(void) +u32 safe_apic_wait_icr_idle(void) { - unsigned long send_status; + u32 send_status; int timeout; timeout = 0; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 92c56117eae5..df04bf884dd4 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -788,11 +788,11 @@ static void lguest_wbinvd(void) * code qualifies for Advanced. It will also never interrupt anything. It * does, however, allow us to get through the Linux boot code. */ #ifdef CONFIG_X86_LOCAL_APIC -static void lguest_apic_write(unsigned long reg, unsigned long v) +static void lguest_apic_write(unsigned long reg, u32 v) { } -static unsigned long lguest_apic_read(unsigned long reg) +static u32 lguest_apic_read(unsigned long reg) { return 0; } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 79ad15252150..00829401389e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -521,12 +521,12 @@ static void xen_io_delay(void) } #ifdef CONFIG_X86_LOCAL_APIC -static unsigned long xen_apic_read(unsigned long reg) +static u32 xen_apic_read(unsigned long reg) { return 0; } -static void xen_apic_write(unsigned long reg, unsigned long val) +static void xen_apic_write(unsigned long reg, u32 val) { /* Warn to see if there's any stray references */ WARN_ON(1); diff --git a/include/asm-x86/apic_32.h b/include/asm-x86/apic_32.h index f909e2daf226..649e9a6f6683 100644 --- a/include/asm-x86/apic_32.h +++ b/include/asm-x86/apic_32.h @@ -51,25 +51,23 @@ extern int local_apic_timer_disabled; #define setup_secondary_clock setup_secondary_APIC_clock #endif -static __inline fastcall void native_apic_write(unsigned long reg, - unsigned long v) +static __inline fastcall void native_apic_write(unsigned long reg, u32 v) { - *((volatile unsigned long *)(APIC_BASE+reg)) = v; + *((volatile u32 *)(APIC_BASE + reg)) = v; } -static __inline fastcall void native_apic_write_atomic(unsigned long reg, - unsigned long v) +static __inline fastcall void native_apic_write_atomic(unsigned long reg, u32 v) { - xchg((volatile unsigned long *)(APIC_BASE+reg), v); + (void) xchg((u32 *)(APIC_BASE + reg), v); } -static __inline fastcall unsigned long native_apic_read(unsigned long reg) +static __inline fastcall u32 native_apic_read(unsigned long reg) { - return *((volatile unsigned long *)(APIC_BASE+reg)); + return *((volatile u32 *)(APIC_BASE + reg)); } extern void apic_wait_icr_idle(void); -extern unsigned long safe_apic_wait_icr_idle(void); +extern u32 safe_apic_wait_icr_idle(void); extern int get_physical_broadcast(void); #ifdef CONFIG_X86_GOOD_APIC diff --git a/include/asm-x86/apic_64.h b/include/asm-x86/apic_64.h index 7bfad0224178..9d0c06c4df91 100644 --- a/include/asm-x86/apic_64.h +++ b/include/asm-x86/apic_64.h @@ -38,14 +38,14 @@ struct pt_regs; * Basic functions accessing APICs. 
*/ -static __inline void apic_write(unsigned long reg, unsigned int v) +static __inline void apic_write(unsigned long reg, u32 v) { *((volatile unsigned int *)(APIC_BASE+reg)) = v; } -static __inline unsigned int apic_read(unsigned long reg) +static __inline u32 apic_read(unsigned long reg) { - return *((volatile unsigned int *)(APIC_BASE+reg)); + return *((volatile u32 *)(APIC_BASE+reg)); } extern void apic_wait_icr_idle(void); diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index f59d370c5df4..19fd3e67b08c 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -150,9 +150,9 @@ struct pv_apic_ops { * Direct APIC operations, principally for VMI. Ideally * these shouldn't be in this interface. */ - void (*apic_write)(unsigned long reg, unsigned long v); - void (*apic_write_atomic)(unsigned long reg, unsigned long v); - unsigned long (*apic_read)(unsigned long reg); + void (*apic_write)(unsigned long reg, u32 v); + void (*apic_write_atomic)(unsigned long reg, u32 v); + u32 (*apic_read)(unsigned long reg); void (*setup_boot_clock)(void); void (*setup_secondary_clock)(void); @@ -690,17 +690,17 @@ static inline void slow_down_io(void) { /* * Basic functions accessing APICs. */ -static inline void apic_write(unsigned long reg, unsigned long v) +static inline void apic_write(unsigned long reg, u32 v) { PVOP_VCALL2(pv_apic_ops.apic_write, reg, v); } -static inline void apic_write_atomic(unsigned long reg, unsigned long v) +static inline void apic_write_atomic(unsigned long reg, u32 v) { PVOP_VCALL2(pv_apic_ops.apic_write_atomic, reg, v); } -static inline unsigned long apic_read(unsigned long reg) +static inline u32 apic_read(unsigned long reg) { return PVOP_CALL1(unsigned long, pv_apic_ops.apic_read, reg); } -- cgit v1.2.3 From 376ff0352c24a5fa47f1250dd60937b5a9077672 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jan 2008 13:30:16 +0100 Subject: x86: move acpi and pci declarations Move acpi/pci related declarations to the correct headers and remove the duplicate. Build fix from: Andrew Morton Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_32.c | 3 +-- arch/x86/kernel/setup_64.c | 2 -- include/asm-x86/acpi_32.h | 2 -- include/asm-x86/pci.h | 4 +++- include/asm-x86/proto.h | 3 --- 5 files changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 51bdc0b1b72e..236d30b264d8 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -44,6 +44,7 @@ #include #include #include +#include #include